VeuReu committed on
Commit 6397e15 · verified · 1 Parent(s): 68bc808

Upload 35 files

page_modules/analyze_audiodescriptions.py CHANGED
@@ -47,6 +47,25 @@ def _load_labels_from_config() -> Dict[str, str]:
     }
 
 
+def _find_best_file_for_version(vid_dir: Path, version: str, filename: str) -> Optional[Path]:
+    """Look up a file under temp/media/<sha1>/<version>/<subtype>, honouring priority.
+
+    Subtype search order: "HITL OK" -> "HITL Test" -> "Original" -> root of <version>.
+    """
+
+    preferred_subtypes = ["HITL OK", "HITL Test", "Original"]
+    for subtype in preferred_subtypes:
+        candidate = vid_dir / version / subtype / filename
+        if candidate.exists():
+            return candidate
+
+    legacy = vid_dir / version / filename
+    if legacy.exists():
+        return legacy
+
+    return None
+
+
 def load_eval_values(vid_dir: Path, version: str, eval_content: Optional[str] = None) -> Optional[Dict[str, int]]:
     """Load the evaluation values from eval (DB or CSV) if present.
 
@@ -587,18 +606,35 @@ def render_analyze_audiodescriptions_page(api, permissions: Dict[str, bool]) ->
     # Determine the version and read UNE/free for the detailed insertion
     version = subcarpeta_seleccio or "MoE"
     video_dir = base_media_dir / selected_sha1
-    une_path = video_dir / version / "une_ad.srt"
-    free_path = video_dir / version / "free_ad.txt"
+
+    une_path = _find_best_file_for_version(video_dir, version, "une_ad.srt")
+    free_path = _find_best_file_for_version(video_dir, version, "free_ad.txt")
 
     try:
-        une_ad_text = une_path.read_text(encoding="utf-8") if une_path.exists() else ""
+        une_ad_text = (
+            une_path.read_text(encoding="utf-8")
+            if une_path is not None and une_path.exists()
+            else ""
+        )
     except Exception:
-        une_ad_text = une_path.read_text(errors="ignore") if une_path.exists() else ""
+        une_ad_text = (
+            une_path.read_text(errors="ignore")
+            if une_path is not None and une_path.exists()
+            else ""
+        )
 
     try:
-        free_ad_text = free_path.read_text(encoding="utf-8") if free_path.exists() else ""
+        free_ad_text = (
+            free_path.read_text(encoding="utf-8")
+            if free_path is not None and free_path.exists()
+            else ""
+        )
     except Exception:
-        free_ad_text = free_path.read_text(errors="ignore") if free_path.exists() else ""
+        free_ad_text = (
+            free_path.read_text(errors="ignore")
+            if free_path is not None and free_path.exists()
+            else ""
+        )
 
     user_name = (
         st.session_state.user.get("username")
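
The new `_find_best_file_for_version` helper prefers human-reviewed copies over raw model output. Below is a minimal, self-contained sketch of the same priority logic with a hypothetical directory layout (the committed helper is the one in the diff above):

from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional

def find_best(vid_dir: Path, version: str, filename: str) -> Optional[Path]:
    # Same search order as the committed helper:
    # "HITL OK" -> "HITL Test" -> "Original" -> <version> root
    for subtype in ["HITL OK", "HITL Test", "Original"]:
        candidate = vid_dir / version / subtype / filename
        if candidate.exists():
            return candidate
    legacy = vid_dir / version / filename
    return legacy if legacy.exists() else None

with TemporaryDirectory() as tmp:
    vid = Path(tmp) / "abc123"  # hypothetical sha1 folder
    (vid / "MoE" / "Original").mkdir(parents=True)
    (vid / "MoE" / "Original" / "une_ad.srt").write_text("raw model output")
    (vid / "MoE" / "HITL OK").mkdir(parents=True)
    (vid / "MoE" / "HITL OK" / "une_ad.srt").write_text("reviewed output")
    # "HITL OK" wins over "Original"
    print(find_best(vid, "MoE", "une_ad.srt"))  # .../abc123/MoE/HITL OK/une_ad.srt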
persistent_data_gate.py CHANGED
@@ -2,6 +2,7 @@ import os
 import shutil
 import zipfile
 import io
+import runpy
 from pathlib import Path
 from typing import Optional
 
@@ -142,10 +143,12 @@ def ensure_temp_databases(base_dir: Path, api_client) -> None:
 
     # Optional verification: list the state of demo/data/db and demo/temp/db in the log
     try:
-        from scripts.verify_temp_dbs import run_verification as _run_db_verification
-
-        print("[ensure_temp_databases] Executant verificador de BDs (demo/scripts/verify_temp_dbs.py)...")
-        _run_db_verification()
+        script_path = base_dir / "scripts" / "verify_temp_dbs.py"
+        if script_path.exists():
+            print("[ensure_temp_databases] Executant verificador de BDs (scripts/verify_temp_dbs.py)...")
+            runpy.run_path(str(script_path), run_name="__main__")
+        else:
+            print(f"[ensure_temp_databases] verify_temp_dbs.py no trobat a {script_path}")
     except Exception as _e_ver:
         print(f"[ensure_temp_databases] Error executant verificador de BDs: {_e_ver}")
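
The switch from a package import to `runpy.run_path` means the verifier no longer needs `scripts/` to be an importable package. A quick sketch of what `runpy.run_path` does (hypothetical hello.py, not part of the commit):

import runpy
from pathlib import Path

script = Path("hello.py")
script.write_text('print("hello from", __name__)')
# run_name="__main__" makes `if __name__ == "__main__":` guards fire,
# exactly as if the script had been launched with `python hello.py`.
runpy.run_path(str(script), run_name="__main__")  # prints: hello from __main__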
 
scripts/build_audiodescriptions_db.py ADDED
@@ -0,0 +1,170 @@
+from __future__ import annotations
+
+from pathlib import Path
+import sqlite3
+import csv
+import json
+from typing import Optional
+
+from engine.finetuning.video_analysis import analyze_srt, embed_srt_sentences
+
+BASE_DEMO = Path(__file__).resolve().parent.parent
+MEDIA_ROOT = BASE_DEMO / "data" / "media"
+DB_PATH = BASE_DEMO / "temp" / "audiodescriptions.db"
+
+VALID_VERSIONS = ["MoE", "Salamandra", "HITL"]
+
+
+def read_text_file(path: Path) -> Optional[str]:
+    if not path.exists():
+        return None
+    try:
+        return path.read_text(encoding="utf-8")
+    except Exception:
+        try:
+            return path.read_text(errors="ignore")
+        except Exception:
+            return None
+
+
+def read_eval_csv(path: Path) -> Optional[str]:
+    if not path.exists():
+        return None
+    try:
+        # Store the whole CSV as text so it can be reused as-is
+        return path.read_text(encoding="utf-8")
+    except Exception:
+        try:
+            return path.read_text(errors="ignore")
+        except Exception:
+            return None
+
+
+def summarize_free_ad(text: Optional[str], max_chars: int = 280) -> str:
+    if not text:
+        return ""
+    s = " ".join(text.split())  # normalize whitespace and line breaks
+    if len(s) <= max_chars:
+        return s
+    return s[: max_chars - 3] + "..."
+
+
+def ensure_schema(conn: sqlite3.Connection) -> None:
+    cur = conn.cursor()
+    cur.execute(
+        """
+        CREATE TABLE IF NOT EXISTS audiodescriptions (
+            sha1sum TEXT NOT NULL,
+            version TEXT NOT NULL,
+            une_ad TEXT,
+            free_ad TEXT,
+            eval TEXT,
+            srt_duration REAL,
+            ad_ratio REAL,
+            words_pm REAL,
+            speakers_pm REAL,
+            blocks_pm REAL,
+            description TEXT,
+            embedding TEXT,
+            PRIMARY KEY (sha1sum, version)
+        );
+        """
+    )
+    conn.commit()
+
+
+def main() -> None:
+    print(f"MEDIA_ROOT: {MEDIA_ROOT} (exists={MEDIA_ROOT.exists()})")
+    if not MEDIA_ROOT.exists():
+        raise SystemExit("❌ No s'ha trobat demo/data/media")
+
+    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
+    conn = sqlite3.connect(str(DB_PATH))
+    conn.row_factory = sqlite3.Row
+    ensure_schema(conn)
+
+    cur = conn.cursor()
+
+    total_rows = 0
+    for video_dir in sorted(MEDIA_ROOT.iterdir()):
+        if not video_dir.is_dir():
+            continue
+        sha1sum = video_dir.name
+
+        for version in VALID_VERSIONS:
+            version_dir = video_dir / version
+            if not version_dir.exists() or not version_dir.is_dir():
+                continue
+
+            une_path = version_dir / "une_ad.srt"
+            free_path = version_dir / "free_ad.txt"
+            eval_path = version_dir / "eval.csv"
+
+            une_ad = read_text_file(une_path)
+            free_ad = read_text_file(free_path)
+            eval_csv = read_eval_csv(eval_path)
+
+            if une_ad is None and free_ad is None and eval_csv is None:
+                # Nothing to index for this version
+                continue
+
+            # Analyze the SRT if it exists
+            srt_duration = ad_ratio = words_pm = speakers_pm = blocks_pm = None
+            if une_ad:
+                try:
+                    metrics = analyze_srt(une_ad)
+                    srt_duration = float(metrics.get("duration_sec", 0.0))
+                    ad_ratio = float(metrics.get("ad_time_ratio", 0.0))
+                    words_pm = float(metrics.get("words_per_min", 0.0))
+                    speakers_pm = float(metrics.get("speakers_blocks_per_min", 0.0))
+                    blocks_pm = float(metrics.get("blocks_per_min", 0.0))
+                except Exception as e:
+                    print(f"[WARN] Error analitzant SRT per {sha1sum}/{version}: {e}")
+
+            # Summary of the free_ad
+            description = summarize_free_ad(free_ad)
+
+            # SRT embedding (may fail if the dependencies are not installed)
+            embedding_json = None
+            if une_ad:
+                try:
+                    emb_info = embed_srt_sentences(une_ad)
+                    embeddings = emb_info.get("embeddings") or []
+                    # Stored as JSON; it can get very large, but it is enough for a prototype
+                    embedding_json = json.dumps(embeddings)
+                except Exception as e:
+                    print(f"[WARN] Error generant embeddings per {sha1sum}/{version}: {e}")
+
+            cur.execute(
+                """
+                INSERT OR REPLACE INTO audiodescriptions (
+                    sha1sum, version, une_ad, free_ad, eval,
+                    srt_duration, ad_ratio, words_pm, speakers_pm, blocks_pm,
+                    description, embedding
+                ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?);
+                """,
+                (
+                    sha1sum,
+                    version,
+                    une_ad,
+                    free_ad,
+                    eval_csv,
+                    srt_duration,
+                    ad_ratio,
+                    words_pm,
+                    speakers_pm,
+                    blocks_pm,
+                    description,
+                    embedding_json,
+                ),
+            )
+            total_rows += 1
+
+    conn.commit()
+    conn.close()
+
+    print(f"✅ audiodescriptions.db generat a {DB_PATH} amb {total_rows} files.")
+
+
+if __name__ == "__main__":
+    main()
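
For reference, a minimal sketch (not part of the commit) of reading back the rows this script inserts, assuming demo/temp/audiodescriptions.db has been generated:

import sqlite3
from pathlib import Path

db = Path("demo") / "temp" / "audiodescriptions.db"  # adjust to your checkout
conn = sqlite3.connect(str(db))
conn.row_factory = sqlite3.Row
for row in conn.execute(
    "SELECT sha1sum, version, srt_duration, words_pm FROM audiodescriptions"
):
    # One row per (sha1sum, version) thanks to the composite primary key
    print(row["sha1sum"], row["version"], row["srt_duration"], row["words_pm"])
conn.close()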
scripts/debug_audiodescriptions_db.py ADDED
@@ -0,0 +1,34 @@
+from pathlib import Path
+import sqlite3
+
+BASE = Path(__file__).resolve().parent.parent
+ADB = BASE / "temp" / "audiodescriptions.db"
+
+print(f"AUDIODESCRIPTIONS_DB: {ADB} (exists={ADB.exists()})")
+
+if not ADB.exists():
+    raise SystemExit("❌ audiodescriptions.db no existeix")
+
+conn = sqlite3.connect(str(ADB))
+conn.row_factory = sqlite3.Row
+cur = conn.cursor()
+
+print("\n[SCHEMA] PRAGMA table_info(audiodescriptions):")
+try:
+    cur.execute("PRAGMA table_info(audiodescriptions)")
+    for row in cur.fetchall():
+        # row: (cid, name, type, notnull, dflt_value, pk)
+        print(f"  - cid={row['cid']}, name={row['name']}, type={row['type']}, notnull={row['notnull']}, pk={row['pk']}")
+except Exception as e:
+    print("Error llegint esquema:", e)
+
+print("\n[DATA] Primeres 10 files de audiodescriptions:")
+try:
+    cur.execute("SELECT * FROM audiodescriptions LIMIT 10")
+    rows = cur.fetchall()
+    for r in rows:
+        print("  -", dict(r))
+except Exception as e:
+    print("Error llegint dades:", e)
+
+conn.close()
scripts/debug_videos_and_media.py ADDED
@@ -0,0 +1,34 @@
+from pathlib import Path
+import sqlite3
+
+BASE = Path(__file__).resolve().parent.parent
+VIDEOS_DB = BASE / "temp" / "videos.db"
+MEDIA_DIR = BASE / "temp" / "media"
+
+print(f"VIDEOS_DB: {VIDEOS_DB} (exists={VIDEOS_DB.exists()})")
+if VIDEOS_DB.exists():
+    conn = sqlite3.connect(str(VIDEOS_DB))
+    conn.row_factory = sqlite3.Row
+    cur = conn.cursor()
+    try:
+        # List only the columns we care about right now
+        cur.execute("SELECT video_name, sha1sum FROM videos")
+        rows = cur.fetchall()
+        print("\n[VIDEOS.DB] Registres (video_name, sha1sum):")
+        for r in rows:
+            print(f"  - video_name={r['video_name']!r}, sha1sum={r['sha1sum']!r}")
+    except Exception as e:
+        print("Error llegint videos.db:", e)
+    finally:
+        conn.close()
+else:
+    print("videos.db no existeix")
+
+print(f"\nMEDIA_DIR: {MEDIA_DIR} (exists={MEDIA_DIR.exists()})")
+if MEDIA_DIR.exists():
+    subdirs = [p.name for p in sorted(MEDIA_DIR.iterdir()) if p.is_dir()]
+    print("[MEDIA] Subcarpetes a demo/temp/media:")
+    for name in subdirs:
+        print("  -", name)
+else:
+    print("Directori media no existeix")
scripts/explore_data.py ADDED
@@ -0,0 +1,88 @@
+import os
+import sqlite3
+from typing import Dict, Any
+
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+TEMP_DIR = os.path.join(BASE_DIR, "temp")
+
+
+def analyze_db(db_path: str) -> Dict[str, Any]:
+    """Return a dict with information about every table in a SQLite file."""
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    cur = conn.cursor()
+
+    cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
+    tables = [r[0] for r in cur.fetchall()]
+
+    db_info: Dict[str, Any] = {}
+
+    for table in tables:
+        # Columns of the table
+        cur.execute(f"PRAGMA table_info(\"{table}\")")
+        cols = [r[1] for r in cur.fetchall()]
+
+        table_info: Dict[str, Any] = {"n_rows": 0, "columns": {}}
+
+        # Number of rows
+        cur.execute(f"SELECT COUNT(*) AS n FROM \"{table}\"")
+        n_rows = cur.fetchone()[0]
+        table_info["n_rows"] = n_rows
+
+        for col in cols:
+            # Number of NULLs
+            cur.execute(
+                f"SELECT COUNT(*) AS n_null FROM \"{table}\" WHERE \"{col}\" IS NULL"
+            )
+            n_null = cur.fetchone()[0]
+            null_pct = (n_null / n_rows * 100.0) if n_rows > 0 else 0.0
+
+            # Number of distinct values
+            cur.execute(
+                f"SELECT COUNT(DISTINCT \"{col}\") AS n_distinct FROM \"{table}\""
+            )
+            n_distinct = cur.fetchone()[0]
+
+            table_info["columns"][col] = {
+                "null_pct": null_pct,
+                "n_distinct": n_distinct,
+            }
+
+        db_info[table] = table_info
+
+    conn.close()
+    return db_info
+
+
+def main() -> None:
+    print(f"Analizando ficheros .db en: {TEMP_DIR}")
+
+    for fname in sorted(os.listdir(TEMP_DIR)):
+        if not fname.endswith(".db"):
+            continue
+
+        db_path = os.path.join(TEMP_DIR, fname)
+        print("\n" + "=" * 80)
+        print(f"Base de datos: {fname}")
+        print("=" * 80)
+
+        db_info = analyze_db(db_path)
+
+        if not db_info:
+            print("  (Sin tablas de usuario)")
+            continue
+
+        for table_name, tinfo in db_info.items():
+            print(f"\nTabla: {table_name}")
+            print(f"  Nº registros: {tinfo['n_rows']}")
+            print("  Campos:")
+            for col, cinfo in tinfo["columns"].items():
+                print(
+                    f"    - {col}: "
+                    f"{cinfo['n_distinct']} valores distintos, "
+                    f"{cinfo['null_pct']:.2f}% nulos"
+                )
+
+
+if __name__ == "__main__":
+    main()
scripts/generate_media_sha1sums.py ADDED
@@ -0,0 +1,72 @@
+import hashlib
+import csv
+from pathlib import Path
+
+
+BASE = Path(__file__).resolve().parent.parent / "temp" / "media"
+
+
+def sha1_of_file(path: Path, buf_size: int = 65536) -> str:
+    """Compute the SHA1 of a binary file."""
+
+    h = hashlib.sha1()
+    with path.open("rb") as f:
+        while True:
+            chunk = f.read(buf_size)
+            if not chunk:
+                break
+            h.update(chunk)
+    return h.hexdigest()
+
+
+def main() -> None:
+    if not BASE.exists():
+        raise SystemExit(f"No existe la carpeta {BASE}")
+
+    rows: list[tuple[str, str]] = []
+    subdirs = sorted(p for p in BASE.iterdir() if p.is_dir())
+
+    # 1) Compute the SHA1s and collect the rows for the CSV
+    for d in subdirs:
+        video_path = d / "video.mp4"
+        if not video_path.exists():
+            print(f"[WARN] No se encontró video.mp4 en {d}, se omite esta carpeta.")
+            continue
+
+        digest = sha1_of_file(video_path)
+        print(f"{d.name} -> {digest}")
+        rows.append((d.name, digest))
+
+    if not rows:
+        print("[INFO] No se han encontrado carpetas con video.mp4, nada que hacer.")
+        return
+
+    # 2) Write a CSV mapping the original folder -> sha1sum
+    csv_path = BASE / "sha1sums.csv"
+    with csv_path.open("w", newline="", encoding="utf-8") as f:
+        writer = csv.writer(f)
+        writer.writerow(["folder_name", "sha1sum"])
+        writer.writerows(rows)
+
+    print(f"[INFO] CSV generado en {csv_path}")
+
+    # 3) Rename each folder to its sha1sum.
+    #    If two folders share the same sha1, the second one is left unrenamed.
+    for old_name, digest in rows:
+        old_path = BASE / old_name
+        new_path = BASE / digest
+        if not old_path.exists():
+            print(f"[INFO] Carpeta {old_path} ya no existe, se omite.")
+            continue
+        if new_path.exists():
+            print(
+                f"[ERROR] Ya existe {new_path}, posible colisión de SHA1 o renombrado previo. "
+                f"No se renombra {old_path}."
+            )
+            continue
+        print(f"Renombrando {old_path} -> {new_path}")
+        old_path.rename(new_path)
+
+
+if __name__ == "__main__":
+    main()
scripts/init_casting_scenarios.py CHANGED
@@ -6,7 +6,7 @@ from typing import Iterable, Tuple
 
 BASE_DIR = Path(__file__).resolve().parent.parent
 DATA_DIR = BASE_DIR / "data"
-VIDEOS_DIR = DATA_DIR / "videos"
+VIDEOS_DIR = DATA_DIR / "media"
 
 CASTING_DB_PATH = DATA_DIR / "casting.db"
 SCENARIOS_DB_PATH = DATA_DIR / "scenarios.db"
@@ -84,7 +84,7 @@ def populate_single_table(
 ) -> int:
     """Fill a table (casting or scenarios) from the per-video CSVs.
 
-    Walks demo/data/videos/<video_name>, looks for <csv_filename> and an .mp4,
+    Walks demo/data/media/<video_name>, looks for <csv_filename> and an .mp4,
     computes the video's sha1sum and generates one record per CSV row.
     """
 
scripts/init_feedback_demo.py CHANGED
@@ -6,7 +6,7 @@ from typing import Dict
 
 BASE_DIR = Path(__file__).resolve().parent.parent
 DATA_DIR = BASE_DIR / "data"
-VIDEOS_DIR = DATA_DIR / "videos"
+VIDEOS_DIR = DATA_DIR / "media"
 DB_PATH = DATA_DIR / "feedback.db"
 
 SLIDER_CAPTIONS = [
@@ -99,7 +99,7 @@ def parse_eval_csv(csv_path: Path) -> Dict[str, int]:
 
 def migrate() -> None:
     if not VIDEOS_DIR.exists():
-        print(f"[INFO] No existe demo/data/videos, nada que hacer.")
+        print(f"[INFO] No existe demo/data/media, nada que hacer.")
         return
 
     conn = ensure_db()
scripts/inspect_audiodescriptions.py ADDED
@@ -0,0 +1,88 @@
+"""Inspect the contents of demo/temp/audiodescriptions.db.
+
+Shows:
+- Whether the file exists
+- The total number of rows
+- The (sha1sum, version) pairs
+- Results of parameterized vs. literal queries for some sample sha1sums/versions
+"""
+
+from __future__ import annotations
+
+import json
+import sqlite3
+from pathlib import Path
+
+DB_PATH = Path(__file__).resolve().parent.parent / "temp" / "audiodescriptions.db"
+
+SAMPLE_SHA1S = (
+    "8ff4b2aaccfeee31ecc59b96e1ae90273de78864",
+    "3df04d2b7df70210fcceb7b9d9a35731bb43a39c",
+    "150f0d2abfe26602e38dc3cc1a0030d16c8ed0a2",
+)
+SAMPLE_VERSIONS = ("Salamandra", "MoE")
+
+
+def main() -> None:
+    print(f"DB path: {DB_PATH}")
+    if not DB_PATH.exists():
+        print("❌ DB file does not exist")
+        return
+
+    conn = sqlite3.connect(DB_PATH)
+    conn.row_factory = sqlite3.Row
+    cur = conn.cursor()
+
+    cur.execute("SELECT COUNT(*) FROM audiodescriptions")
+    total = cur.fetchone()[0]
+    print(f"Total rows: {total}")
+
+    # Inspect the table schema
+    cur.execute("PRAGMA table_info(audiodescriptions)")
+    columns = cur.fetchall()
+    print("Columns:", [col[1] for col in columns])
+
+    cur.execute(
+        "SELECT sha1sum, version FROM audiodescriptions ORDER BY sha1sum, version"
+    )
+    pairs = [dict(row) for row in cur.fetchall()]
+    print("Pairs:")
+    print(json.dumps(pairs, ensure_ascii=False, indent=2))
+
+    for sha1 in SAMPLE_SHA1S:
+        for version in SAMPLE_VERSIONS:
+            cur.execute(
+                "SELECT COUNT(*) FROM audiodescriptions WHERE sha1sum=? AND version=?",
+                (sha1, version),
+            )
+            count_param = cur.fetchone()[0]
+
+            cur.execute(
+                "SELECT COUNT(*) FROM audiodescriptions WHERE sha1sum=:sha AND version=:ver",
+                {"sha": sha1, "ver": version},
+            )
+            count_named = cur.fetchone()[0]
+
+            cur.execute(
+                f"SELECT COUNT(*) FROM audiodescriptions WHERE sha1sum='{sha1}' AND version='{version}'"
+            )
+            count_literal = cur.fetchone()[0]
+
+            # Test with LOWER() - same as the lookup function (without updated_at/created_at)
+            cur.execute(
+                "SELECT * FROM audiodescriptions WHERE sha1sum = ? AND LOWER(version) = LOWER(?) ORDER BY rowid DESC LIMIT 1",
+                (sha1, version),
+            )
+            rows_lower = cur.fetchall()
+            count_lower = len(rows_lower)
+
+            print(
+                f"sha1={sha1} version={version} -> param={count_param} "
+                f"named={count_named} literal={count_literal} SELECT*_LOWER={count_lower}"
+            )
+
+    conn.close()
+
+
+if __name__ == "__main__":
+    main()
scripts/migrate_audiodescriptions.py ADDED
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+import sqlite3
+from pathlib import Path
+from typing import Set, Dict, List
+
+# This script lives at demo/scripts/migrate_audiodescriptions.py,
+# so BASE_DIR points at the demo/ folder
+BASE_DIR = Path(__file__).resolve().parent.parent
+
+DB_REL_PATHS = [
+    Path("temp") / "audiodescriptions.db",
+    Path("data") / "audiodescriptions.db",
+]
+
+
+def get_existing_columns(conn, table_name: str) -> Set[str]:
+    cur = conn.execute(f"PRAGMA table_info({table_name})")
+    cols = {row[1] for row in cur.fetchall()}  # row[1] = column name
+    return cols
+
+
+def ensure_columns(conn, table_name: str, columns_sql: Dict[str, str]) -> None:
+    """Make sure the given columns exist (if not, run ALTER TABLE).
+
+    columns_sql: column_name -> ALTER TABLE ADD COLUMN ... statement
+    """
+    existing = get_existing_columns(conn, table_name)
+    for col_name, alter_sql in columns_sql.items():
+        if col_name not in existing:
+            print(f"  - Añadiendo columna {col_name}...")
+            conn.execute(alter_sql)
+        else:
+            print(f"  - Columna {col_name} ya existe, se omite.")
+
+
+def copy_free_ad_into_new_columns(conn, table_name: str, target_columns: List[str]) -> None:
+    """Update each target column with the current value of free_ad."""
+    set_clause = ", ".join(f"{col} = free_ad" for col in target_columns)
+    sql = f"UPDATE {table_name} SET {set_clause}"
+    print(f"  - Ejecutando: {sql}")
+    conn.execute(sql)
+
+
+def migrate_db(db_path: Path) -> None:
+    full_path = BASE_DIR / db_path
+    if not full_path.exists():
+        print(f"[AVISO] DB no encontrada, se omite: {full_path}")
+        return
+
+    print(f"\n=== Migrando BD: {full_path} ===")
+    conn = sqlite3.connect(full_path)
+    try:
+        conn.isolation_level = None  # manual transaction handling
+        conn.execute("BEGIN")
+
+        table_name = "audiodescriptions"
+
+        # 1) Make sure the new columns exist under their final names
+        columns_sql: Dict[str, str] = {
+            "ok_une_ad": f"ALTER TABLE {table_name} ADD COLUMN ok_une_ad TEXT",
+            "test_une_ad": f"ALTER TABLE {table_name} ADD COLUMN test_une_ad TEXT",
+            "ok_free_ad": f"ALTER TABLE {table_name} ADD COLUMN ok_free_ad TEXT",
+            "test_free_ad": f"ALTER TABLE {table_name} ADD COLUMN test_free_ad TEXT",
+        }
+        ensure_columns(conn, table_name, columns_sql)
+
+        # 2) Copy free_ad into the four new columns
+        target_cols: List[str] = ["ok_une_ad", "test_une_ad", "ok_free_ad", "test_free_ad"]
+        copy_free_ad_into_new_columns(conn, table_name, target_cols)
+
+        conn.execute("COMMIT")
+        print(f"✔ Migración completada en: {full_path}")
+    except Exception as e:
+        print(f"❌ Error en {full_path}: {e}")
+        try:
+            conn.execute("ROLLBACK")
+        except Exception:
+            pass
+    finally:
+        conn.close()
+
+
+def main() -> None:
+    print("Script de migración de audiodescriptions.db")
+    print("Añade columnas ok_une_ad, test_une_ad, ok_free_ad, test_free_ad")
+    print("y copia free_ad en todas ellas.\n")
+
+    for rel in DB_REL_PATHS:
+        migrate_db(rel)
+
+
+if __name__ == "__main__":
+    main()
scripts/migrate_audiodescriptions_info_ad.py ADDED
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+import sqlite3
+from pathlib import Path
+from typing import List
+
+# This script must be saved as:
+#   demo/scripts/migrate_audiodescriptions_info_ad.py
+# BASE_DIR points at the demo/ folder
+BASE_DIR = Path(__file__).resolve().parent.parent
+
+DB_REL_PATHS: List[Path] = [
+    Path("temp") / "audiodescriptions.db",
+    Path("data") / "audiodescriptions.db",
+]
+
+
+def add_info_ad_column(db_path: Path) -> None:
+    full_path = BASE_DIR / db_path
+    if not full_path.exists():
+        print(f"[AVISO] DB no encontrada, se omite: {full_path}")
+        return
+
+    print(f"\n=== Migrando BD: {full_path} ===")
+    conn = sqlite3.connect(full_path)
+    try:
+        conn.isolation_level = None
+        conn.execute("BEGIN")
+
+        # Check the existing columns of the audiodescriptions table
+        cur = conn.execute("PRAGMA table_info(audiodescriptions)")
+        cols = {row[1] for row in cur.fetchall()}  # row[1] = column name
+
+        if "info_ad" in cols:
+            print("  - Columna info_ad ya existe, no se hace nada.")
+        else:
+            print("  - Añadiendo columna info_ad...")
+            conn.execute("ALTER TABLE audiodescriptions ADD COLUMN info_ad TEXT")
+
+        conn.execute("COMMIT")
+        print(f"✔ Migración completada en: {full_path}")
+    except Exception as e:
+        print(f"❌ Error en {full_path}: {e}")
+        try:
+            conn.execute("ROLLBACK")
+        except Exception:
+            pass
+    finally:
+        conn.close()
+
+
+def main() -> None:
+    print("Script de migración: añadir columna info_ad a audiodescriptions.db\n")
+    for rel in DB_REL_PATHS:
+        add_info_ad_column(rel)
+
+
+if __name__ == "__main__":
+    main()
scripts/publish_monthly_digest.py CHANGED
@@ -1,7 +1,7 @@
 import argparse
 from datetime import datetime, timezone
 
-from aws_qldb import qldb_manager
+from compliance_client import compliance_client
 
 
 def _current_period_utc() -> str:
@@ -14,7 +14,7 @@ def main() -> None:
     parser = argparse.ArgumentParser(
         description=(
             "Publica el digest mensual de autorizaciones en Polygon "
-            "usando aws_qldb.qldb_manager (modo simulado por ahora)."
+            "usando el microservicio 'compliance' (ComplianceClient)."
         )
     )
     parser.add_argument(
@@ -26,8 +26,8 @@ def main() -> None:
     args = parser.parse_args()
 
     period = args.period or _current_period_utc()
-    print(f"[DIGEST] Publicando digest para el período {period}...")
-    tx_hash = qldb_manager.publish_monthly_digest_to_polygon(period)
+    print(f"[DIGEST] Publicando digest para el período {period} via compliance...")
+    tx_hash = compliance_client.publish_monthly_digest(period)
 
     if tx_hash:
         print(f"[DIGEST] Digest publicado correctamente. Tx hash: {tx_hash}")
scripts/test_full_refinement_via_api.py ADDED
@@ -0,0 +1,119 @@
+from __future__ import annotations
+
+import argparse
+import difflib
+import os
+import sqlite3
+from pathlib import Path
+
+import requests
+
+
+BASE_DEMO = Path(__file__).resolve().parent.parent
+DB_PATH = BASE_DEMO / "temp" / "audiodescriptions.db"
+DEFAULT_API_URL = "http://localhost:8000/apply_refinement"
+
+
+def load_une_ad(sha1sum: str, version: str) -> str:
+    if not DB_PATH.exists():
+        raise SystemExit(f"❌ No s'ha trobat la base de dades: {DB_PATH}")
+
+    conn = sqlite3.connect(str(DB_PATH))
+    conn.row_factory = sqlite3.Row
+    try:
+        cur = conn.cursor()
+        row = cur.execute(
+            "SELECT une_ad FROM audiodescriptions WHERE sha1sum=? AND version=?",
+            (sha1sum, version),
+        ).fetchone()
+        if not row or not row["une_ad"]:
+            raise SystemExit(
+                f"❌ No s'ha trobat une_ad per sha1sum={sha1sum}, version={version} a audiodescriptions.db"
+            )
+        return row["une_ad"]
+    finally:
+        conn.close()
+
+
+def show_diff(initial_srt: str, refined_srt: str) -> None:
+    initial_lines = initial_srt.splitlines()
+    refined_lines = refined_srt.splitlines()
+
+    diff = difflib.unified_diff(
+        initial_lines,
+        refined_lines,
+        fromfile="initial_une_ad.srt",
+        tofile="refined_une_ad.srt",
+        lineterm="",
+    )
+    for line in diff:
+        print(line)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Prova de la pipeline completa de refinement (reflection + reflexion + introspection) "
+            "via l'endpoint /apply_refinement."
+        )
+    )
+    parser.add_argument("sha1sum", type=str, help="SHA1 del vídeo")
+    parser.add_argument("version", type=str, help="Versió de la AD (p.ex. Salamandra, MoE, HITL)")
+    parser.add_argument(
+        "--api-url",
+        type=str,
+        default=DEFAULT_API_URL,
+        help=f"URL de l'endpoint apply_refinement (per defecte: {DEFAULT_API_URL})",
+    )
+    parser.add_argument(
+        "--no-reflection",
+        action="store_true",
+        help="Desactiva el pas de reflection per a aquesta prova",
+    )
+    parser.add_argument(
+        "--no-reflexion",
+        action="store_true",
+        help="Desactiva el pas de reflexion per a aquesta prova",
+    )
+    parser.add_argument(
+        "--no-introspection",
+        action="store_true",
+        help="Desactiva el pas d'introspection per a aquesta prova",
+    )
+    args = parser.parse_args()
+
+    token = os.getenv("API_SHARED_TOKEN")
+    if not token:
+        print("⚠️ Variable d'entorn API_SHARED_TOKEN no definida; es farà la crida sense token.")
+
+    initial_srt = load_une_ad(args.sha1sum, args.version)
+
+    payload = {
+        "token": token,
+        "sha1sum": args.sha1sum,
+        "version": args.version,
+        "reflection_enabled": not args.no_reflection,
+        "reflexion_enabled": not args.no_reflexion,
+        "introspection_enabled": not args.no_introspection,
+    }
+
+    print(f"Cridant {args.api_url} amb payload: { {k: v for k, v in payload.items() if k != 'token'} }")
+
+    resp = requests.post(args.api_url, json=payload)
+    if resp.status_code != 200:
+        print(f"❌ Error {resp.status_code} des de l'API: {resp.text}")
+        raise SystemExit(1)
+
+    data = resp.json()
+    refined_srt = data.get("refined_srt", "")
+    if not refined_srt:
+        print("⚠️ Resposta sense camp 'refined_srt'. JSON complet:")
+        print(data)
+        raise SystemExit(1)
+
+    print("\n===== DIFF entre SRT inicial i SRT refinat via API =====")
+    show_diff(initial_srt, refined_srt)
+
+
+if __name__ == "__main__":
+    main()
scripts/test_introspection_only_on_db_srt.py ADDED
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import argparse
+import difflib
+import sqlite3
+from pathlib import Path
+
+import yaml
+
+from engine.refinement.multiagent_refinement import execute_refinement
+
+
+BASE_DEMO = Path(__file__).resolve().parent.parent
+DB_PATH = BASE_DEMO / "temp" / "audiodescriptions.db"
+CONFIG_PATH = BASE_DEMO / "temp" / "introspection_only.yaml"
+
+
+def ensure_introspection_only_config() -> Path:
+    """Create (or overwrite) a minimal config.yaml with only introspection enabled."""
+
+    cfg = {
+        "refinement": {
+            "reflection_enabled": False,
+            "reflexion_enabled": False,
+            "introspection_enabled": True,
+        }
+    }
+    CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
+    with CONFIG_PATH.open("w", encoding="utf-8") as f:
+        yaml.safe_dump(cfg, f, allow_unicode=True)
+    return CONFIG_PATH
+
+
+def load_srt_from_db(sha1sum: str | None = None, version: str | None = None) -> tuple[str, str, str]:
+    """Load a UNE SRT from audiodescriptions.db.
+
+    If sha1sum/version are not given, take the first row with a non-null une_ad.
+    Returns (sha1sum, version, une_ad).
+    """
+
+    if not DB_PATH.exists():
+        raise SystemExit(f"❌ No s'ha trobat la base de dades: {DB_PATH}")
+
+    conn = sqlite3.connect(str(DB_PATH))
+    conn.row_factory = sqlite3.Row
+    try:
+        cur = conn.cursor()
+        if sha1sum and version:
+            row = cur.execute(
+                "SELECT sha1sum, version, une_ad FROM audiodescriptions WHERE sha1sum=? AND version=?",
+                (sha1sum, version),
+            ).fetchone()
+        else:
+            row = cur.execute(
+                "SELECT sha1sum, version, une_ad FROM audiodescriptions WHERE une_ad IS NOT NULL LIMIT 1",
+            ).fetchone()
+
+        if not row or not row["une_ad"]:
+            raise SystemExit("❌ No s'ha trobat cap registre amb une_ad a audiodescriptions.db")
+
+        return row["sha1sum"], row["version"], row["une_ad"]
+    finally:
+        conn.close()
+
+
+def show_diff(initial_srt: str, refined_srt: str) -> None:
+    initial_lines = initial_srt.splitlines()
+    refined_lines = refined_srt.splitlines()
+
+    diff = difflib.unified_diff(
+        initial_lines,
+        refined_lines,
+        fromfile="initial_une_ad.srt",
+        tofile="introspected_une_ad.srt",
+        lineterm="",
+    )
+    for line in diff:
+        print(line)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Prova del pas d'introspection sobre un SRT de audiodescriptions.db (sense reflection/reflexion).",
+    )
+    parser.add_argument("--sha1sum", type=str, default=None, help="SHA1 del vídeo (opcional)")
+    parser.add_argument("--version", type=str, default=None, help="Versió de la AD (opcional)")
+    args = parser.parse_args()
+
+    cfg_path = ensure_introspection_only_config()
+
+    sha1sum, version, une_ad = load_srt_from_db(args.sha1sum, args.version)
+    print(f"Usant registre sha1sum={sha1sum}, version={version} de {DB_PATH}")
+
+    refined = execute_refinement(une_ad, config_path=cfg_path)
+
+    print("\n===== DIFF entre SRT inicial i SRT després d'introspection =====")
+    show_diff(une_ad, refined)
+
+
+if __name__ == "__main__":
+    main()
scripts/test_reflection_only_on_db_srt.py ADDED
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import argparse
+import difflib
+import sqlite3
+from pathlib import Path
+
+import yaml
+
+from engine.refinement.multiagent_refinement import execute_refinement
+
+
+BASE_DEMO = Path(__file__).resolve().parent.parent
+DB_PATH = BASE_DEMO / "temp" / "audiodescriptions.db"
+CONFIG_PATH = BASE_DEMO / "temp" / "reflection_only.yaml"
+
+
+def ensure_reflection_only_config() -> Path:
+    """Create (or overwrite) a minimal config.yaml with only reflection enabled."""
+
+    cfg = {
+        "refinement": {
+            "reflection_enabled": True,
+            "reflexion_enabled": False,
+            "introspection_enabled": False,
+        }
+    }
+    CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
+    with CONFIG_PATH.open("w", encoding="utf-8") as f:
+        yaml.safe_dump(cfg, f, allow_unicode=True)
+    return CONFIG_PATH
+
+
+def load_srt_from_db(sha1sum: str | None = None, version: str | None = None) -> tuple[str, str, str]:
+    """Load a UNE SRT from audiodescriptions.db.
+
+    If sha1sum/version are not given, take the first row with a non-null une_ad.
+    Returns (sha1sum, version, une_ad).
+    """
+
+    if not DB_PATH.exists():
+        raise SystemExit(f"❌ No s'ha trobat la base de dades: {DB_PATH}")
+
+    conn = sqlite3.connect(str(DB_PATH))
+    conn.row_factory = sqlite3.Row
+    try:
+        cur = conn.cursor()
+        if sha1sum and version:
+            row = cur.execute(
+                "SELECT sha1sum, version, une_ad FROM audiodescriptions WHERE sha1sum=? AND version=?",
+                (sha1sum, version),
+            ).fetchone()
+        else:
+            row = cur.execute(
+                "SELECT sha1sum, version, une_ad FROM audiodescriptions WHERE une_ad IS NOT NULL LIMIT 1",
+            ).fetchone()
+
+        if not row or not row["une_ad"]:
+            raise SystemExit("❌ No s'ha trobat cap registre amb une_ad a audiodescriptions.db")
+
+        return row["sha1sum"], row["version"], row["une_ad"]
+    finally:
+        conn.close()
+
+
+def show_diff(initial_srt: str, refined_srt: str) -> None:
+    initial_lines = initial_srt.splitlines()
+    refined_lines = refined_srt.splitlines()
+
+    diff = difflib.unified_diff(
+        initial_lines,
+        refined_lines,
+        fromfile="initial_une_ad.srt",
+        tofile="reflected_une_ad.srt",
+        lineterm="",
+    )
+    for line in diff:
+        print(line)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Prova del pas de reflection sobre un SRT de audiodescriptions.db (sense reflexion/introspection).",
+    )
+    parser.add_argument("--sha1sum", type=str, default=None, help="SHA1 del vídeo (opcional)")
+    parser.add_argument("--version", type=str, default=None, help="Versió de la AD (opcional)")
+    args = parser.parse_args()
+
+    cfg_path = ensure_reflection_only_config()
+
+    sha1sum, version, une_ad = load_srt_from_db(args.sha1sum, args.version)
+    print(f"Usant registre sha1sum={sha1sum}, version={version} de {DB_PATH}")
+
+    refined = execute_refinement(une_ad, config_path=cfg_path)
+
+    print("\n===== DIFF entre SRT inicial i SRT després de reflection =====")
+    show_diff(une_ad, refined)
+
+
+if __name__ == "__main__":
+    main()
scripts/test_reflexion_only_on_db_srt.py ADDED
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import argparse
+import difflib
+import sqlite3
+from pathlib import Path
+
+import yaml
+
+from engine.refinement.multiagent_refinement import execute_refinement
+
+
+BASE_DEMO = Path(__file__).resolve().parent.parent
+DB_PATH = BASE_DEMO / "temp" / "audiodescriptions.db"
+CONFIG_PATH = BASE_DEMO / "temp" / "reflexion_only.yaml"
+
+
+def ensure_reflexion_only_config() -> Path:
+    """Create (or overwrite) a minimal config.yaml with only reflexion enabled."""
+
+    cfg = {
+        "refinement": {
+            "reflection_enabled": False,
+            "reflexion_enabled": True,
+            "introspection_enabled": False,
+        }
+    }
+    CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
+    with CONFIG_PATH.open("w", encoding="utf-8") as f:
+        yaml.safe_dump(cfg, f, allow_unicode=True)
+    return CONFIG_PATH
+
+
+def load_srt_from_db(sha1sum: str | None = None, version: str | None = None) -> tuple[str, str, str]:
+    """Load a UNE SRT from audiodescriptions.db.
+
+    If sha1sum/version are not given, take the first row with a non-null une_ad.
+    Returns (sha1sum, version, une_ad).
+    """
+
+    if not DB_PATH.exists():
+        raise SystemExit(f"❌ No s'ha trobat la base de dades: {DB_PATH}")
+
+    conn = sqlite3.connect(str(DB_PATH))
+    conn.row_factory = sqlite3.Row
+    try:
+        cur = conn.cursor()
+        if sha1sum and version:
+            row = cur.execute(
+                "SELECT sha1sum, version, une_ad FROM audiodescriptions WHERE sha1sum=? AND version=?",
+                (sha1sum, version),
+            ).fetchone()
+        else:
+            row = cur.execute(
+                "SELECT sha1sum, version, une_ad FROM audiodescriptions WHERE une_ad IS NOT NULL LIMIT 1",
+            ).fetchone()
+
+        if not row or not row["une_ad"]:
+            raise SystemExit("❌ No s'ha trobat cap registre amb une_ad a audiodescriptions.db")
+
+        return row["sha1sum"], row["version"], row["une_ad"]
+    finally:
+        conn.close()
+
+
+def show_diff(initial_srt: str, refined_srt: str) -> None:
+    initial_lines = initial_srt.splitlines()
+    refined_lines = refined_srt.splitlines()
+
+    diff = difflib.unified_diff(
+        initial_lines,
+        refined_lines,
+        fromfile="initial_une_ad.srt",
+        tofile="reflexioned_une_ad.srt",
+        lineterm="",
+    )
+    for line in diff:
+        print(line)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Prova del pas de 'reflexion' sobre un SRT de audiodescriptions.db (sense reflection/introspection).",
+    )
+    parser.add_argument("--sha1sum", type=str, default=None, help="SHA1 del vídeo (opcional)")
+    parser.add_argument("--version", type=str, default=None, help="Versió de la AD (opcional)")
+    args = parser.parse_args()
+
+    cfg_path = ensure_reflexion_only_config()
+
+    sha1sum, version, une_ad = load_srt_from_db(args.sha1sum, args.version)
+    print(f"Usant registre sha1sum={sha1sum}, version={version} de {DB_PATH}")
+
+    refined = execute_refinement(une_ad, config_path=cfg_path)
+
+    print("\n===== DIFF entre SRT inicial i SRT després de reflexion =====")
+    show_diff(une_ad, refined)
+
+
+if __name__ == "__main__":
+    main()
scripts/train_introspection.py ADDED
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from engine.refinement.introspection import (
+    FEW_SHOT_PATH,
+    RULES_PATH,
+    train_introspection_rules,
+)
+
+
+def _count_nonempty_blocks(path: Path) -> int:
+    """Roughly count how many example blocks a file contains.
+
+    For ``few_shot_examples.txt`` we count lines starting with ``# sha1sum=``.
+    For ``rules.txt`` we count non-empty lines.
+    """
+
+    if not path.exists():
+        return 0
+    try:
+        text = path.read_text(encoding="utf-8")
+    except Exception:
+        return 0
+
+    if path.name == "few_shot_examples.txt":
+        return sum(1 for line in text.splitlines() if line.lstrip().startswith("# sha1sum="))
+    return sum(1 for line in text.splitlines() if line.strip())
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Entrena les regles d'introspecció a partir de les correccions HITL "
+            "emmagatzemades a demo/temp/audiodescriptions.db."
+        )
+    )
+    parser.add_argument(
+        "--max-examples",
+        type=int,
+        default=None,
+        help=(
+            "Nombre màxim de parelles (MoE/Salamandra, HITL) a processar. "
+            "Per defecte es processen totes."
+        ),
+    )
+    args = parser.parse_args()
+
+    train_introspection_rules(max_examples=args.max_examples)
+
+    n_examples = _count_nonempty_blocks(FEW_SHOT_PATH)
+    n_rules = _count_nonempty_blocks(RULES_PATH)
+
+    print(
+        f"✅ Entrenament d'introspection completat. "
+        f"Few-shot examples: {n_examples}, regles: {n_rules}."
+    )
+
+
+if __name__ == "__main__":
+    main()
scripts/train_reflexion.py ADDED
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from engine.refinement.reflexion import (
+    REFLEXION_CSV_PATH,
+    REFLEXION_MODEL_PATH,
+    train_reflexion_model,
+)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Entrena el model KNN de 'reflexion' a partir de les parelles "
+            "(MoE/Salamandra, HITL) a demo/temp/audiodescriptions.db."
+        )
+    )
+    parser.add_argument(
+        "--max-examples",
+        type=int,
+        default=None,
+        help=(
+            "Nombre màxim de mostres d'entrenament a processar. "
+            "Per defecte es processen totes."
+        ),
+    )
+    args = parser.parse_args()
+
+    train_reflexion_model(max_examples=args.max_examples)
+
+    n_rows = 0
+    if REFLEXION_CSV_PATH.exists():
+        try:
+            text = REFLEXION_CSV_PATH.read_text(encoding="utf-8")
+            # Discount the header line
+            n_rows = max(0, len([l for l in text.splitlines() if l.strip()]) - 1)
+        except Exception:
+            n_rows = 0
+
+    model_str = "creat" if REFLEXION_MODEL_PATH.exists() else "no creat"
+
+    print(
+        f"✅ Entrenament de reflexion completat. "
+        f"Mostres al CSV: {n_rows}, fitxer de model: {model_str} ({REFLEXION_MODEL_PATH})."
+    )
+
+
+if __name__ == "__main__":
+    main()
scripts/update_audiodescriptions_json_ad.py ADDED
@@ -0,0 +1,114 @@
+from __future__ import annotations
+
+from pathlib import Path
+import sqlite3
+from typing import Iterable, Optional
+
+import json
+
+
+BASE_DEMO = Path(__file__).resolve().parent.parent
+MEDIA_ROOT = BASE_DEMO / "data" / "media"
+DB_PATHS = [
+    BASE_DEMO / "data" / "audiodescriptions.db",
+    BASE_DEMO / "temp" / "audiodescriptions.db",
+]
+
+
+def read_preprocess_json(sha1sum: str) -> Optional[str]:
+    """Read the preprocess.json file for a given sha1sum.
+
+    Returns its content as plain text (UTF-8), or None if it does not exist
+    or cannot be read.
+    """
+
+    video_dir = MEDIA_ROOT / sha1sum
+    path = video_dir / "preprocess.json"
+    if not path.exists():
+        return None
+
+    try:
+        # Read the JSON verbatim so it is stored as text
+        return path.read_text(encoding="utf-8")
+    except Exception:
+        try:
+            return path.read_text(errors="ignore")
+        except Exception:
+            return None
+
+
+def ensure_json_ad_column(conn: sqlite3.Connection) -> None:
+    """Add the info_ad column to audiodescriptions if it does not exist yet.
+
+    (Old name json_ad, kept for compatibility of the function name.)
+    """
+
+    cur = conn.cursor()
+    cur.execute("PRAGMA table_info(audiodescriptions);")
+    cols = [row[1] for row in cur.fetchall()]
+    target_col = "info_ad"
+    if target_col not in cols:
+        cur.execute(f"ALTER TABLE audiodescriptions ADD COLUMN {target_col} TEXT;")
+        conn.commit()
+
+
+def update_db(path: Path) -> None:
+    """Update an audiodescriptions.db database, filling in info_ad.
+
+    - Makes sure the info_ad column exists.
+    - For each sha1sum present in the table, tries to read
+      demo/data/media/<sha1sum>/preprocess.json and stores its content in info_ad.
+    """
+
+    if not path.exists():
+        print(f"[INFO] {path} no existe, se omite.")
+        return
+
+    print(f"[INFO] Actualitzant {path} ...")
+
+    conn = sqlite3.connect(str(path))
+    conn.row_factory = sqlite3.Row
+    try:
+        ensure_json_ad_column(conn)
+        cur = conn.cursor()
+
+        # Collect every distinct sha1sum present in the table
+        cur.execute("SELECT DISTINCT sha1sum FROM audiodescriptions;")
+        rows = cur.fetchall()
+        total = len(rows)
+        print(f"  - {total} sha1sum diferents trobats.")
+
+        updated_rows = 0
+        for idx, row in enumerate(rows, start=1):
+            sha1sum = row["sha1sum"]
+            json_text = read_preprocess_json(sha1sum)
+
+            if json_text is None:
+                # No preprocess.json for this sha1sum; leave info_ad as NULL
+                print(f"  [{idx}/{total}] {sha1sum}: preprocess.json no trobat, s'omet.")
+                continue
+
+            cur.execute(
+                "UPDATE audiodescriptions SET info_ad = ? WHERE sha1sum = ?;",
+                (json_text, sha1sum),
+            )
+            updated_rows += cur.rowcount
+            print(f"  [{idx}/{total}] {sha1sum}: json_ad actualitzat per {cur.rowcount} files.")
+
+        conn.commit()
+        print(f"[OK] {path}: {updated_rows} files actualitzades amb info_ad.")
+    finally:
+        conn.close()
+
+
+def main() -> None:
+    print(f"MEDIA_ROOT: {MEDIA_ROOT} (exists={MEDIA_ROOT.exists()})")
+    if not MEDIA_ROOT.exists():
+        raise SystemExit("❌ No s'ha trobat demo/data/media")
+
+    for db_path in DB_PATHS:
+        update_db(db_path)
+
+
+if __name__ == "__main__":
+    main()
scripts/verify_temp_dbs.py ADDED
@@ -0,0 +1,87 @@
+"""Demo DB verifier (meant to be used from the demo Space).
+
+Shows which .db files exist in:
+- demo/data/db
+- demo/temp/db
+
+It can be run as a standalone script or invoked from the code
+to leave traces in the log.
+"""
+
+from pathlib import Path
+import yaml
+
+# This file lives in demo/scripts, so the demo root is the parent
+DEMO_ROOT = Path(__file__).resolve().parent.parent
+
+
+def read_data_origin() -> str:
+    cfg_path = DEMO_ROOT / "config.yaml"
+    if not cfg_path.exists():
+        return "(config.yaml no encontrado)"
+    try:
+        with cfg_path.open("r", encoding="utf-8") as f:
+            cfg = yaml.safe_load(f) or {}
+        app_cfg = cfg.get("app", {}) or {}
+        return str(app_cfg.get("data_origin", "internal")).lower()
+    except Exception as e:
+        return f"(error leyendo config.yaml: {e})"
+
+
+def list_dbs(path: Path) -> list[str]:
+    if not path.exists():
+        return []
+    return sorted(str(p.name) for p in path.glob("*.db"))
+
+
+def run_verification() -> None:
+    """Run the verification and write to stdout.
+
+    Intended to be called from ensure_temp_databases to leave
+    traces in the demo Space log.
+    """
+    data_origin = read_data_origin()
+    data_db_dir = DEMO_ROOT / "data" / "db"
+    temp_db_dir = DEMO_ROOT / "temp" / "db"
+
+    print("=== Verificación de BDs demo (invocada desde Space demo) ===")
+    print(f"Raíz demo: {DEMO_ROOT}")
+    print(f"data_origin: {data_origin}")
+    print(f"data/db dir: {data_db_dir}")
+    print(f"temp/db dir: {temp_db_dir}")
+    print()
+
+    data_dbs = list_dbs(data_db_dir)
+    temp_dbs = list_dbs(temp_db_dir)
+
+    print("-- demo/data/db --")
+    if data_dbs:
+        for name in data_dbs:
+            print(f"  - {name}")
+    else:
+        print("  (sin .db)")
+    print()
+
+    print("-- demo/temp/db --")
+    if temp_dbs:
+        for name in temp_dbs:
+            print(f"  - {name}")
+    else:
+        print("  (sin .db)")
+    print()
+
+    missing_in_temp = [n for n in data_dbs if n not in temp_dbs]
+    if missing_in_temp:
+        print("Ficheros presentes en data/db pero NO en temp/db:")
+        for n in missing_in_temp:
+            print(f"  - {n}")
+    else:
+        print("Todos los .db de data/db están también en temp/db (o no hay .db)")
+
+
+def main() -> None:
+    run_verification()
+
+
+if __name__ == "__main__":
+    main()
scripts/video_analysis.py ADDED
@@ -0,0 +1,189 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from datetime import timedelta
+from typing import List, Optional, Dict, Any
+
+
+TIME_RE = re.compile(
+    r"(?P<start>\d{2}:\d{2}:\d{2}[,\.]\d{3})\s*-->\s*(?P<end>\d{2}:\d{2}:\d{2}[,\.]\d{3})"
+)
+
+
+@dataclass
+class SRTBlock:
+    index: int
+    start: float  # seconds
+    end: float  # seconds
+    text: str
+
+
+def _parse_timestamp(ts: str) -> float:
+    """Convert 'HH:MM:SS,mmm' or 'HH:MM:SS.mmm' to seconds (float)."""
+    ts = ts.replace(",", ".")
+    h, m, s = ts.split(":")
+    seconds, millis = (s.split(".") + ["0"])[:2]
+    td = timedelta(
+        hours=int(h),
+        minutes=int(m),
+        seconds=int(seconds),
+        milliseconds=int(millis.ljust(3, "0")),
+    )
+    return td.total_seconds()
+
+
+def _parse_srt(srt_text: str) -> List[SRTBlock]:
+    """Parse SRT text into a list of SRTBlock items."""
+    srt_text = srt_text.replace("\r\n", "\n").replace("\r", "\n")
+    chunks = [c.strip() for c in re.split(r"\n\s*\n", srt_text) if c.strip()]
+    blocks: List[SRTBlock] = []
+
+    for chunk in chunks:
+        lines = chunk.split("\n")
+        idx_line = 0
+        index = None
+
+        if lines and lines[0].strip().isdigit():
+            index = int(lines[0].strip())
+            idx_line = 1
+
+        time_match = None
+        time_line_idx = None
+        for i in range(idx_line, min(idx_line + 3, len(lines))):
+            m = TIME_RE.search(lines[i])
+            if m:
+                time_match = m
+                time_line_idx = i
+                break
+
+        if not time_match or time_line_idx is None:
+            continue
+
+        start = _parse_timestamp(time_match.group("start"))
+        end = _parse_timestamp(time_match.group("end"))
+        if index is None:
+            index = len(blocks) + 1
+
+        text = "\n".join(lines[time_line_idx + 1 :]).strip()
+        blocks.append(SRTBlock(index=index, start=start, end=end, text=text))
+
+    return blocks
+
+
+def analyze_srt(
+    srt_text: str,
+    *,
+    ad_markers: Optional[List[str]] = None,
+) -> Dict[str, Any]:
+    """Analyze an SRT and return basic metrics.
+
+    Returned metrics:
+    - duration_sec: estimated total duration of the video (seconds)
+    - words_per_min: number of words per minute
+    - speakers_blocks_per_min: number of dialogue blocks per minute
+    - ad_time_ratio: fraction (0..1) of total time covered by blocks marked as AD
+    - blocks_per_min: total number of blocks per minute
+
+    Heuristics:
+    - The video duration is assumed to be the end of the last block.
+    - An "AD block" is one whose first line contains any of the markers
+      given in `ad_markers` (for example: "[AD]", "AD:", "(AD)").
+    """
+
+    blocks = _parse_srt(srt_text)
+    if not blocks:
+        return {
+            "duration_sec": 0.0,
+            "words_per_min": 0.0,
+            "speakers_blocks_per_min": 0.0,
+            "ad_time_ratio": 0.0,
+            "blocks_per_min": 0.0,
+        }
+
+    duration_sec = max(b.end for b in blocks)
+    duration_min = max(duration_sec / 60.0, 1e-6)
+
+    # Total words
+    total_words = 0
+    for b in blocks:
+        total_words += len(b.text.split())
+
+    # Blocks considered "speaker" blocks (not AD)
+    if ad_markers is None:
+        ad_markers = ["[AD]", "AD:", "(AD)"]
+
+    def is_ad_block(block: SRTBlock) -> bool:
+        first_line = (block.text.splitlines() or [""])[0].strip().upper()
+        for mk in ad_markers:
+            if mk.upper() in first_line:
+                return True
+        return False
+
+    ad_time = 0.0
+    speech_blocks = 0
+    for b in blocks:
+        if is_ad_block(b):
+            ad_time += max(0.0, b.end - b.start)
+        else:
+            speech_blocks += 1
+
+    words_per_min = total_words / duration_min
+    speakers_blocks_per_min = speech_blocks / duration_min
+    blocks_per_min = len(blocks) / duration_min
+    ad_time_ratio = ad_time / duration_sec if duration_sec > 0 else 0.0
+
+    return {
+        "duration_sec": float(duration_sec),
+        "words_per_min": float(words_per_min),
+        "speakers_blocks_per_min": float(speakers_blocks_per_min),
+        "ad_time_ratio": float(ad_time_ratio),
+        "blocks_per_min": float(blocks_per_min),
+    }
+
+
+def embed_srt_sentences(
+    srt_text: str,
+    *,
+    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+) -> Dict[str, Any]:
+    """Return embeddings for the sentences of an SRT.
+
+    Args:
+        srt_text: Full content of the SRT file as a string.
+        model_name: Name of the sentence-transformers model to use.
+
+    Returns:
+        A dictionary with:
+        - "model_name": name of the model used
+        - "sentences": list of strings (one per block)
+        - "embeddings": list of lists of floats with the embeddings
+
+    NOTE: Requires `sentence-transformers` and a compatible PyTorch backend
+    to be installed. If they are missing, an ImportError is raised.
+    """
+
+    blocks = _parse_srt(srt_text)
+    sentences = [b.text.replace("\n", " ").strip() for b in blocks if b.text.strip()]
+
+    if not sentences:
+        return {"model_name": model_name, "sentences": [], "embeddings": []}
+
+    try:
+        from sentence_transformers import SentenceTransformer
+    except ImportError as exc:
+        raise ImportError(
+            "sentence-transformers no está instalado. "
+            "Instala la dependencia para poder generar embeddings."
+        ) from exc
+
+    model = SentenceTransformer(model_name)
+    embs = model.encode(sentences, convert_to_numpy=False)
+
+    embeddings = [list(map(float, vec)) for vec in embs]
+
+    return {
+        "model_name": model_name,
+        "sentences": sentences,
+        "embeddings": embeddings,
+    }
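
A quick sketch of `analyze_srt` on a two-block SRT (marker conventions as in the docstring; assumes scripts/ is on sys.path):

from video_analysis import analyze_srt

srt = """\
1
00:00:00,000 --> 00:00:04,000
[AD] A man walks into the room.

2
00:00:05,000 --> 00:00:10,000
Hello, how are you?
"""
metrics = analyze_srt(srt)
# duration_sec = 10.0 (end of the last block); the [AD] block covers 4 s,
# so ad_time_ratio = 0.4; the single dialogue block gives
# speakers_blocks_per_min = 1 / (10 / 60) = 6.0
print(metrics)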