Upload 35 files
- page_modules/analyze_audiodescriptions.py +42 -6
- persistent_data_gate.py +7 -4
- scripts/build_audiodescriptions_db.py +170 -0
- scripts/debug_audiodescriptions_db.py +34 -0
- scripts/debug_videos_and_media.py +34 -0
- scripts/explore_data.py +88 -0
- scripts/generate_media_sha1sums.py +72 -0
- scripts/init_casting_scenarios.py +2 -2
- scripts/init_feedback_demo.py +2 -2
- scripts/inspect_audiodescriptions.py +88 -0
- scripts/migrate_audiodescriptions.py +93 -0
- scripts/migrate_audiodescriptions_info_ad.py +58 -0
- scripts/publish_monthly_digest.py +4 -4
- scripts/test_full_refinement_via_api.py +119 -0
- scripts/test_introspection_only_on_db_srt.py +101 -0
- scripts/test_reflection_only_on_db_srt.py +101 -0
- scripts/test_reflexion_only_on_db_srt.py +101 -0
- scripts/train_introspection.py +62 -0
- scripts/train_reflexion.py +51 -0
- scripts/update_audiodescriptions_json_ad.py +114 -0
- scripts/verify_temp_dbs.py +87 -0
- scripts/video_analysis.py +189 -0
page_modules/analyze_audiodescriptions.py
CHANGED

@@ -47,6 +47,25 @@ def _load_labels_from_config() -> Dict[str, str]:
     }
 
 
+def _find_best_file_for_version(vid_dir: Path, version: str, filename: str) -> Optional[Path]:
+    """Looks up a file under temp/media/<sha1>/<version>/<subtype> with priority.
+
+    Subtype search order: "HITL OK" -> "HITL Test" -> "Original" -> root of <version>.
+    """
+
+    preferred_subtypes = ["HITL OK", "HITL Test", "Original"]
+    for subtype in preferred_subtypes:
+        candidate = vid_dir / version / subtype / filename
+        if candidate.exists():
+            return candidate
+
+    legacy = vid_dir / version / filename
+    if legacy.exists():
+        return legacy
+
+    return None
+
+
 def load_eval_values(vid_dir: Path, version: str, eval_content: Optional[str] = None) -> Optional[Dict[str, int]]:
     """Loads the evaluation values from eval (DB or CSV) if present.
 

@@ -587,18 +606,35 @@ def render_analyze_audiodescriptions_page(api, permissions: Dict[str, bool]) ->
     # Determine the version and read UNE/free for the detailed insert
     version = subcarpeta_seleccio or "MoE"
     video_dir = base_media_dir / selected_sha1
-
-
+
+    une_path = _find_best_file_for_version(video_dir, version, "une_ad.srt")
+    free_path = _find_best_file_for_version(video_dir, version, "free_ad.txt")
 
     try:
-        une_ad_text =
+        une_ad_text = (
+            une_path.read_text(encoding="utf-8")
+            if une_path is not None and une_path.exists()
+            else ""
+        )
     except Exception:
-        une_ad_text =
+        une_ad_text = (
+            une_path.read_text(errors="ignore")
+            if une_path is not None and une_path.exists()
+            else ""
+        )
 
     try:
-        free_ad_text =
+        free_ad_text = (
+            free_path.read_text(encoding="utf-8")
+            if free_path is not None and free_path.exists()
+            else ""
+        )
     except Exception:
-        free_ad_text =
+        free_ad_text = (
+            free_path.read_text(errors="ignore")
+            if free_path is not None and free_path.exists()
+            else ""
+        )
 
     user_name = (
         st.session_state.user.get("username")
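For a quick sanity check of the subtype priority outside the Streamlit page, here is a minimal, self-contained sketch; it duplicates the helper from the hunk above and fabricates a throwaway media layout (the folder name "deadbeef" stands in for a real sha1):

import tempfile
from pathlib import Path
from typing import Optional


def _find_best_file_for_version(vid_dir: Path, version: str, filename: str) -> Optional[Path]:
    # Copy of the helper from the diff above.
    for subtype in ["HITL OK", "HITL Test", "Original"]:
        candidate = vid_dir / version / subtype / filename
        if candidate.exists():
            return candidate
    legacy = vid_dir / version / filename
    return legacy if legacy.exists() else None


with tempfile.TemporaryDirectory() as tmp:
    vid_dir = Path(tmp) / "deadbeef"  # hypothetical sha1 folder
    # Create both a legacy file at the version root and a "HITL OK" file.
    (vid_dir / "MoE" / "HITL OK").mkdir(parents=True)
    (vid_dir / "MoE" / "HITL OK" / "une_ad.srt").write_text("hitl ok", encoding="utf-8")
    (vid_dir / "MoE" / "une_ad.srt").write_text("legacy", encoding="utf-8")

    best = _find_best_file_for_version(vid_dir, "MoE", "une_ad.srt")
    # "HITL OK" wins over the legacy file at the version root.
    assert best is not None and best.read_text(encoding="utf-8") == "hitl ok"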
persistent_data_gate.py
CHANGED

@@ -2,6 +2,7 @@ import os
 import shutil
 import zipfile
 import io
+import runpy
 from pathlib import Path
 from typing import Optional
 

@@ -142,10 +143,12 @@ def ensure_temp_databases(base_dir: Path, api_client) -> None:
 
     # Optional check: log the state of demo/data/db and demo/temp/db
     try:
-
-
-
-
+        script_path = base_dir / "scripts" / "verify_temp_dbs.py"
+        if script_path.exists():
+            print("[ensure_temp_databases] Executant verificador de BDs (scripts/verify_temp_dbs.py)...")
+            runpy.run_path(str(script_path), run_name="__main__")
+        else:
+            print(f"[ensure_temp_databases] verify_temp_dbs.py no trobat a {script_path}")
     except Exception as _e_ver:
         print(f"[ensure_temp_databases] Error executant verificador de BDs: {_e_ver}")
 
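The verifier is executed in-process rather than as a subprocess. runpy.run_path(..., run_name="__main__") runs the target file as if it had been launched from the command line, so its if __name__ == "__main__": guard fires. A minimal sketch of that behaviour with a throwaway script:

import runpy
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    script = Path(tmp) / "verify_stub.py"
    script.write_text(
        "if __name__ == '__main__':\n"
        "    print('[verify_stub] running as __main__')\n",
        encoding="utf-8",
    )
    # run_name="__main__" makes the guard inside the script fire,
    # just like persistent_data_gate.py does for verify_temp_dbs.py.
    runpy.run_path(str(script), run_name="__main__")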
scripts/build_audiodescriptions_db.py
ADDED
@@ -0,0 +1,170 @@

from __future__ import annotations

from pathlib import Path
import sqlite3
import csv
import json
from typing import Optional

from engine.finetuning.video_analysis import analyze_srt, embed_srt_sentences

BASE_DEMO = Path(__file__).resolve().parent.parent
MEDIA_ROOT = BASE_DEMO / "data" / "media"
DB_PATH = BASE_DEMO / "temp" / "audiodescriptions.db"

VALID_VERSIONS = ["MoE", "Salamandra", "HITL"]


def read_text_file(path: Path) -> Optional[str]:
    if not path.exists():
        return None
    try:
        return path.read_text(encoding="utf-8")
    except Exception:
        try:
            return path.read_text(errors="ignore")
        except Exception:
            return None


def read_eval_csv(path: Path) -> Optional[str]:
    if not path.exists():
        return None
    try:
        # Store the whole CSV as text so it can be reused as-is
        return path.read_text(encoding="utf-8")
    except Exception:
        try:
            return path.read_text(errors="ignore")
        except Exception:
            return None


def summarize_free_ad(text: Optional[str], max_chars: int = 280) -> str:
    if not text:
        return ""
    s = " ".join(text.split())  # normalize spaces and line breaks
    if len(s) <= max_chars:
        return s
    return s[: max_chars - 3] + "..."


def ensure_schema(conn: sqlite3.Connection) -> None:
    cur = conn.cursor()
    cur.execute(
        """
        CREATE TABLE IF NOT EXISTS audiodescriptions (
            sha1sum TEXT NOT NULL,
            version TEXT NOT NULL,
            une_ad TEXT,
            free_ad TEXT,
            eval TEXT,
            srt_duration REAL,
            ad_ratio REAL,
            words_pm REAL,
            speakers_pm REAL,
            blocks_pm REAL,
            description TEXT,
            embedding TEXT,
            PRIMARY KEY (sha1sum, version)
        );
        """
    )
    conn.commit()


def main() -> None:
    print(f"MEDIA_ROOT: {MEDIA_ROOT} (exists={MEDIA_ROOT.exists()})")
    if not MEDIA_ROOT.exists():
        raise SystemExit("❌ No s'ha trobat demo/data/media")

    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(DB_PATH))
    conn.row_factory = sqlite3.Row
    ensure_schema(conn)

    cur = conn.cursor()

    total_rows = 0
    for video_dir in sorted(MEDIA_ROOT.iterdir()):
        if not video_dir.is_dir():
            continue
        sha1sum = video_dir.name

        for version in VALID_VERSIONS:
            version_dir = video_dir / version
            if not version_dir.exists() or not version_dir.is_dir():
                continue

            une_path = version_dir / "une_ad.srt"
            free_path = version_dir / "free_ad.txt"
            eval_path = version_dir / "eval.csv"

            une_ad = read_text_file(une_path)
            free_ad = read_text_file(free_path)
            eval_csv = read_eval_csv(eval_path)

            if une_ad is None and free_ad is None and eval_csv is None:
                # Nothing to index for this version
                continue

            # Analyze the SRT if it exists
            srt_duration = ad_ratio = words_pm = speakers_pm = blocks_pm = None
            if une_ad:
                try:
                    metrics = analyze_srt(une_ad)
                    srt_duration = float(metrics.get("duration_sec", 0.0))
                    ad_ratio = float(metrics.get("ad_time_ratio", 0.0))
                    words_pm = float(metrics.get("words_per_min", 0.0))
                    speakers_pm = float(metrics.get("speakers_blocks_per_min", 0.0))
                    blocks_pm = float(metrics.get("blocks_per_min", 0.0))
                except Exception as e:
                    print(f"[WARN] Error analitzant SRT per {sha1sum}/{version}: {e}")

            # Summary of the free_ad
            description = summarize_free_ad(free_ad)

            # SRT embedding (may fail if the deps are not installed)
            embedding_json = None
            if une_ad:
                try:
                    emb_info = embed_srt_sentences(une_ad)
                    embeddings = emb_info.get("embeddings") or []
                    # Stored as JSON; can be very large but good enough for a prototype
                    embedding_json = json.dumps(embeddings)
                except Exception as e:
                    print(f"[WARN] Error generant embeddings per {sha1sum}/{version}: {e}")

            cur.execute(
                """
                INSERT OR REPLACE INTO audiodescriptions (
                    sha1sum, version, une_ad, free_ad, eval,
                    srt_duration, ad_ratio, words_pm, speakers_pm, blocks_pm,
                    description, embedding
                ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?);
                """,
                (
                    sha1sum,
                    version,
                    une_ad,
                    free_ad,
                    eval_csv,
                    srt_duration,
                    ad_ratio,
                    words_pm,
                    speakers_pm,
                    blocks_pm,
                    description,
                    embedding_json,
                ),
            )
            total_rows += 1

    conn.commit()
    conn.close()

    print(f"✅ audiodescriptions.db generat a {DB_PATH} amb {total_rows} files.")


if __name__ == "__main__":
    main()
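Once the builder has run, the table can be inspected with plain sqlite3; a small sketch, assuming the database was generated at demo/temp/audiodescriptions.db with the schema above (adjust the path to your checkout):

import sqlite3
from pathlib import Path

db_path = Path("demo/temp/audiodescriptions.db")  # assumed location, see DB_PATH above
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row

# One row per (sha1sum, version) thanks to the composite primary key.
for row in conn.execute(
    "SELECT sha1sum, version, srt_duration, words_pm FROM audiodescriptions "
    "ORDER BY sha1sum, version"
):
    print(row["sha1sum"], row["version"], row["srt_duration"], row["words_pm"])

conn.close()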
scripts/debug_audiodescriptions_db.py
ADDED
@@ -0,0 +1,34 @@

from pathlib import Path
import sqlite3

BASE = Path(__file__).resolve().parent.parent
ADB = BASE / "temp" / "audiodescriptions.db"

print(f"AUDIODESCRIPTIONS_DB: {ADB} (exists={ADB.exists()})")

if not ADB.exists():
    raise SystemExit("❌ audiodescriptions.db no existeix")

conn = sqlite3.connect(str(ADB))
conn.row_factory = sqlite3.Row
cur = conn.cursor()

print("\n[SCHEMA] PRAGMA table_info(audiodescriptions):")
try:
    cur.execute("PRAGMA table_info(audiodescriptions)")
    for row in cur.fetchall():
        # row: (cid, name, type, notnull, dflt_value, pk)
        print(f" - cid={row['cid']}, name={row['name']}, type={row['type']}, notnull={row['notnull']}, pk={row['pk']}")
except Exception as e:
    print("Error llegint esquema:", e)

print("\n[DATA] Primeres 10 files de audiodescriptions:")
try:
    cur.execute("SELECT * FROM audiodescriptions LIMIT 10")
    rows = cur.fetchall()
    for r in rows:
        print(" -", dict(r))
except Exception as e:
    print("Error llegint dades:", e)

conn.close()
scripts/debug_videos_and_media.py
ADDED
@@ -0,0 +1,34 @@

from pathlib import Path
import sqlite3

BASE = Path(__file__).resolve().parent.parent
VIDEOS_DB = BASE / "temp" / "videos.db"
MEDIA_DIR = BASE / "temp" / "media"

print(f"VIDEOS_DB: {VIDEOS_DB} (exists={VIDEOS_DB.exists()})")
if VIDEOS_DB.exists():
    conn = sqlite3.connect(str(VIDEOS_DB))
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()
    try:
        # List only the columns we care about right now
        cur.execute("SELECT video_name, sha1sum FROM videos")
        rows = cur.fetchall()
        print("\n[VIDEOS.DB] Registres (video_name, sha1sum):")
        for r in rows:
            print(f" - video_name={r['video_name']!r}, sha1sum={r['sha1sum']!r}")
    except Exception as e:
        print("Error llegint videos.db:", e)
    finally:
        conn.close()
else:
    print("videos.db no existeix")

print(f"\nMEDIA_DIR: {MEDIA_DIR} (exists={MEDIA_DIR.exists()})")
if MEDIA_DIR.exists():
    subdirs = [p.name for p in sorted(MEDIA_DIR.iterdir()) if p.is_dir()]
    print("[MEDIA] Subcarpetes a demo/temp/media:")
    for name in subdirs:
        print(" -", name)
else:
    print("Directori media no existeix")
scripts/explore_data.py
ADDED
@@ -0,0 +1,88 @@

import os
import sqlite3
from typing import Dict, Any

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
TEMP_DIR = os.path.join(BASE_DIR, "temp")


def analyze_db(db_path: str) -> Dict[str, Any]:
    """Returns a dict with information about every table of a SQLite file."""
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()

    cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
    tables = [r[0] for r in cur.fetchall()]

    db_info: Dict[str, Any] = {}

    for table in tables:
        # columns of the table
        cur.execute(f"PRAGMA table_info(\"{table}\")")
        cols = [r[1] for r in cur.fetchall()]

        table_info: Dict[str, Any] = {"n_rows": 0, "columns": {}}

        # number of rows
        cur.execute(f"SELECT COUNT(*) AS n FROM \"{table}\"")
        n_rows = cur.fetchone()[0]
        table_info["n_rows"] = n_rows

        for col in cols:
            # number of nulls
            cur.execute(
                f"SELECT COUNT(*) AS n_null FROM \"{table}\" WHERE \"{col}\" IS NULL"
            )
            n_null = cur.fetchone()[0]
            null_pct = (n_null / n_rows * 100.0) if n_rows > 0 else 0.0

            # number of distinct values
            cur.execute(
                f"SELECT COUNT(DISTINCT \"{col}\") AS n_distinct FROM \"{table}\""
            )
            n_distinct = cur.fetchone()[0]

            table_info["columns"][col] = {
                "null_pct": null_pct,
                "n_distinct": n_distinct,
            }

        db_info[table] = table_info

    conn.close()
    return db_info


def main() -> None:
    print(f"Analizando ficheros .db en: {TEMP_DIR}")

    for fname in sorted(os.listdir(TEMP_DIR)):
        if not fname.endswith(".db"):
            continue

        db_path = os.path.join(TEMP_DIR, fname)
        print("\n" + "=" * 80)
        print(f"Base de datos: {fname}")
        print("=" * 80)

        db_info = analyze_db(db_path)

        if not db_info:
            print(" (Sin tablas de usuario)")
            continue

        for table_name, tinfo in db_info.items():
            print(f"\nTabla: {table_name}")
            print(f" Nº registros: {tinfo['n_rows']}")
            print(" Campos:")
            for col, cinfo in tinfo["columns"].items():
                print(
                    f" - {col}: "
                    f"{cinfo['n_distinct']} valores distintos, "
                    f"{cinfo['null_pct']:.2f}% nulos"
                )


if __name__ == "__main__":
    main()
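The per-column statistics reduce to three queries. A self-contained sketch of the same logic on a scratch table (rows invented for the demo); note that COUNT(DISTINCT col) ignores NULLs, which is also how the script counts distinct values:

import sqlite3
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    db = Path(tmp) / "demo.db"
    conn = sqlite3.connect(str(db))
    conn.execute("CREATE TABLE t (a TEXT)")
    conn.executemany("INSERT INTO t VALUES (?)", [("x",), ("x",), (None,), ("y",)])

    n_rows = conn.execute('SELECT COUNT(*) FROM "t"').fetchone()[0]
    n_null = conn.execute('SELECT COUNT(*) FROM "t" WHERE "a" IS NULL').fetchone()[0]
    n_distinct = conn.execute('SELECT COUNT(DISTINCT "a") FROM "t"').fetchone()[0]

    # 4 rows, 25.0% nulls, 2 distinct non-null values.
    print(n_rows, n_null / n_rows * 100.0, n_distinct)
    conn.close()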
scripts/generate_media_sha1sums.py
ADDED
@@ -0,0 +1,72 @@

import hashlib
import csv
from pathlib import Path


BASE = Path(__file__).resolve().parent.parent / "temp" / "media"


def sha1_of_file(path: Path, buf_size: int = 65536) -> str:
    """Computes the SHA1 of a binary file."""

    h = hashlib.sha1()
    with path.open("rb") as f:
        while True:
            chunk = f.read(buf_size)
            if not chunk:
                break
            h.update(chunk)
    return h.hexdigest()


def main() -> None:
    if not BASE.exists():
        raise SystemExit(f"No existe la carpeta {BASE}")

    rows: list[tuple[str, str]] = []
    subdirs = sorted(p for p in BASE.iterdir() if p.is_dir())

    # 1) Compute SHA1s and collect rows for the CSV
    for d in subdirs:
        video_path = d / "video.mp4"
        if not video_path.exists():
            print(f"[WARN] No se encontró video.mp4 en {d}, se omite esta carpeta.")
            continue

        digest = sha1_of_file(video_path)
        print(f"{d.name} -> {digest}")
        rows.append((d.name, digest))

    if not rows:
        print("[INFO] No se han encontrado carpetas con video.mp4, nada que hacer.")
        return

    # 2) Write a CSV mapping the original folder name -> sha1sum
    csv_path = BASE / "sha1sums.csv"
    with csv_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["folder_name", "sha1sum"])
        writer.writerows(rows)

    print(f"[INFO] CSV generado en {csv_path}")

    # 3) Rename each folder to its sha1sum
    # If two folders share the same sha1, the second one is left unrenamed.
    for old_name, digest in rows:
        old_path = BASE / old_name
        new_path = BASE / digest
        if not old_path.exists():
            print(f"[INFO] Carpeta {old_path} ya no existe, se omite.")
            continue
        if new_path.exists():
            print(
                f"[ERROR] Ya existe {new_path}, posible colisión de SHA1 o renombrado previo. "
                f"No se renombra {old_path}."
            )
            continue
        print(f"Renombrando {old_path} -> {new_path}")
        old_path.rename(new_path)


if __name__ == "__main__":
    main()
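The chunked loop in sha1_of_file only bounds memory use; it yields the same digest as hashing the whole file at once. A quick self-contained check on a temporary file:

import hashlib
import tempfile
from pathlib import Path


def sha1_of_file(path: Path, buf_size: int = 65536) -> str:
    # Same chunked loop as in the script above.
    h = hashlib.sha1()
    with path.open("rb") as f:
        while True:
            chunk = f.read(buf_size)
            if not chunk:
                break
            h.update(chunk)
    return h.hexdigest()


with tempfile.NamedTemporaryFile(delete=False) as f:
    f.write(b"abc" * 100_000)  # ~300 KB, spans several 64 KiB buffers
    p = Path(f.name)

# The chunked digest matches the one-shot digest over the full file bytes.
assert sha1_of_file(p) == hashlib.sha1(p.read_bytes()).hexdigest()
print("digests match:", sha1_of_file(p))
p.unlink()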
scripts/init_casting_scenarios.py
CHANGED

@@ -6,7 +6,7 @@ from typing import Iterable, Tuple
 
 BASE_DIR = Path(__file__).resolve().parent.parent
 DATA_DIR = BASE_DIR / "data"
-VIDEOS_DIR = DATA_DIR / "
+VIDEOS_DIR = DATA_DIR / "media"
 
 CASTING_DB_PATH = DATA_DIR / "casting.db"
 SCENARIOS_DB_PATH = DATA_DIR / "scenarios.db"

@@ -84,7 +84,7 @@ def populate_single_table(
 ) -> int:
     """Fills a table (casting or scenarios) from the per-video CSVs.
 
-    Recorre demo/data/
+    Walks demo/data/media/<video_name>, looks for <csv_filename> and an .mp4,
     computes the video's sha1sum and generates one record per CSV row.
     """
 
scripts/init_feedback_demo.py
CHANGED

@@ -6,7 +6,7 @@ from typing import Dict
 
 BASE_DIR = Path(__file__).resolve().parent.parent
 DATA_DIR = BASE_DIR / "data"
-VIDEOS_DIR = DATA_DIR / "
+VIDEOS_DIR = DATA_DIR / "media"
 DB_PATH = DATA_DIR / "feedback.db"
 
 SLIDER_CAPTIONS = [

@@ -99,7 +99,7 @@ def parse_eval_csv(csv_path: Path) -> Dict[str, int]:
 
 def migrate() -> None:
     if not VIDEOS_DIR.exists():
-        print(f"[INFO] No existe demo/data/
+        print(f"[INFO] No existe demo/data/media, nada que hacer.")
         return
 
     conn = ensure_db()
scripts/inspect_audiodescriptions.py
ADDED
@@ -0,0 +1,88 @@

"""Inspects the contents of demo/temp/audiodescriptions.db.

Shows:
- whether the file exists
- the total number of rows
- the (sha1sum, version) pairs
- the results of parameterized vs literal queries for some sample sha1sum/version values
"""

from __future__ import annotations

import json
import sqlite3
from pathlib import Path

DB_PATH = Path(__file__).resolve().parent.parent / "temp" / "audiodescriptions.db"

SAMPLE_SHA1S = (
    "8ff4b2aaccfeee31ecc59b96e1ae90273de78864",
    "3df04d2b7df70210fcceb7b9d9a35731bb43a39c",
    "150f0d2abfe26602e38dc3cc1a0030d16c8ed0a2",
)
SAMPLE_VERSIONS = ("Salamandra", "MoE")


def main() -> None:
    print(f"DB path: {DB_PATH}")
    if not DB_PATH.exists():
        print("❌ DB file does not exist")
        return

    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()

    cur.execute("SELECT COUNT(*) FROM audiodescriptions")
    total = cur.fetchone()[0]
    print(f"Total rows: {total}")

    # Show the table schema
    cur.execute("PRAGMA table_info(audiodescriptions)")
    columns = cur.fetchall()
    print("Columns:", [col[1] for col in columns])

    cur.execute(
        "SELECT sha1sum, version FROM audiodescriptions ORDER BY sha1sum, version"
    )
    pairs = [dict(row) for row in cur.fetchall()]
    print("Pairs:")
    print(json.dumps(pairs, ensure_ascii=False, indent=2))

    for sha1 in SAMPLE_SHA1S:
        for version in SAMPLE_VERSIONS:
            cur.execute(
                "SELECT COUNT(*) FROM audiodescriptions WHERE sha1sum=? AND version=?",
                (sha1, version),
            )
            count_param = cur.fetchone()[0]

            cur.execute(
                "SELECT COUNT(*) FROM audiodescriptions WHERE sha1sum=:sha AND version=:ver",
                {"sha": sha1, "ver": version},
            )
            count_named = cur.fetchone()[0]

            cur.execute(
                f"SELECT COUNT(*) FROM audiodescriptions WHERE sha1sum='{sha1}' AND version='{version}'"
            )
            count_literal = cur.fetchone()[0]

            # Test with LOWER() - same as the app function (without updated_at/created_at)
            cur.execute(
                "SELECT * FROM audiodescriptions WHERE sha1sum = ? AND LOWER(version) = LOWER(?) ORDER BY rowid DESC LIMIT 1",
                (sha1, version),
            )
            rows_lower = cur.fetchall()
            count_lower = len(rows_lower)

            print(
                f"sha1={sha1} version={version} -> param={count_param} "
                f"named={count_named} literal={count_literal} SELECT*_LOWER={count_lower}"
            )

    conn.close()


if __name__ == "__main__":
    main()
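The LOWER() variant at the end is what makes version matching case-insensitive; a self-contained illustration on an in-memory table (the row is invented for the demo):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE audiodescriptions (sha1sum TEXT, version TEXT, une_ad TEXT)")
conn.execute(
    "INSERT INTO audiodescriptions VALUES ('abc123', 'Salamandra', 'dummy srt')"
)

# Exact comparison misses when the caller passes a differently-cased version.
exact = conn.execute(
    "SELECT COUNT(*) FROM audiodescriptions WHERE sha1sum=? AND version=?",
    ("abc123", "salamandra"),
).fetchone()[0]

# Case-folded comparison, as in the SELECT*_LOWER query above, still matches.
folded = conn.execute(
    "SELECT COUNT(*) FROM audiodescriptions WHERE sha1sum=? AND LOWER(version)=LOWER(?)",
    ("abc123", "salamandra"),
).fetchone()[0]

print(exact, folded)  # 0 1
conn.close()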
scripts/migrate_audiodescriptions.py
ADDED
@@ -0,0 +1,93 @@

#!/usr/bin/env python3
import sqlite3
from pathlib import Path
from typing import Set, Dict, List

# This script lives at demo/scripts/migrate_audiodescriptions.py
# BASE_DIR points at the demo/ folder
BASE_DIR = Path(__file__).resolve().parent.parent

DB_REL_PATHS = [
    Path("temp") / "audiodescriptions.db",
    Path("data") / "audiodescriptions.db",
]


def get_existing_columns(conn, table_name: str) -> Set[str]:
    cur = conn.execute(f"PRAGMA table_info({table_name})")
    cols = {row[1] for row in cur.fetchall()}  # row[1] = column name
    return cols


def ensure_columns(conn, table_name: str, columns_sql: Dict[str, str]) -> None:
    """Ensures the given columns exist (if not, runs ALTER TABLE).

    columns_sql: column name -> ALTER TABLE ADD COLUMN ... statement
    """
    existing = get_existing_columns(conn, table_name)
    for col_name, alter_sql in columns_sql.items():
        if col_name not in existing:
            print(f" - Añadiendo columna {col_name}...")
            conn.execute(alter_sql)
        else:
            print(f" - Columna {col_name} ya existe, se omite.")


def copy_free_ad_into_new_columns(conn, table_name: str, target_columns: List[str]) -> None:
    """Updates each target column with the current value of free_ad."""
    set_clause = ", ".join(f"{col} = free_ad" for col in target_columns)
    sql = f"UPDATE {table_name} SET {set_clause}"
    print(f" - Ejecutando: {sql}")
    conn.execute(sql)


def migrate_db(db_path: Path) -> None:
    full_path = BASE_DIR / db_path
    if not full_path.exists():
        print(f"[AVISO] DB no encontrada, se omite: {full_path}")
        return

    print(f"\n=== Migrando BD: {full_path} ===")
    conn = sqlite3.connect(full_path)
    try:
        conn.isolation_level = None  # manual transaction handling
        conn.execute("BEGIN")

        table_name = "audiodescriptions"

        # 1) Ensure the new columns exist under their final names
        columns_sql: Dict[str, str] = {
            "ok_une_ad": f"ALTER TABLE {table_name} ADD COLUMN ok_une_ad TEXT",
            "test_une_ad": f"ALTER TABLE {table_name} ADD COLUMN test_une_ad TEXT",
            "ok_free_ad": f"ALTER TABLE {table_name} ADD COLUMN ok_free_ad TEXT",
            "test_free_ad": f"ALTER TABLE {table_name} ADD COLUMN test_free_ad TEXT",
        }
        ensure_columns(conn, table_name, columns_sql)

        # 2) Copy free_ad into the four new columns
        target_cols: List[str] = ["ok_une_ad", "test_une_ad", "ok_free_ad", "test_free_ad"]
        copy_free_ad_into_new_columns(conn, table_name, target_cols)

        conn.execute("COMMIT")
        print(f"✔ Migración completada en: {full_path}")
    except Exception as e:
        print(f"❌ Error en {full_path}: {e}")
        try:
            conn.execute("ROLLBACK")
        except Exception:
            pass
    finally:
        conn.close()


def main() -> None:
    print("Script de migración de audiodescriptions.db")
    print("Añade columnas ok_une_ad, test_une_ad, ok_free_ad, test_free_ad")
    print("y copia free_ad en todas ellas.\n")

    for rel in DB_REL_PATHS:
        migrate_db(rel)


if __name__ == "__main__":
    main()
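The PRAGMA-based guard is what makes this migration safe to re-run. A minimal sketch of the same add-column-if-missing pattern on a scratch database:

import sqlite3


def ensure_column(conn: sqlite3.Connection, table: str, col: str, decl: str) -> None:
    # Same idea as ensure_columns() above: check PRAGMA, ALTER only if absent.
    existing = {row[1] for row in conn.execute(f"PRAGMA table_info({table})")}
    if col not in existing:
        conn.execute(f"ALTER TABLE {table} ADD COLUMN {col} {decl}")


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE audiodescriptions (sha1sum TEXT, free_ad TEXT)")

ensure_column(conn, "audiodescriptions", "ok_une_ad", "TEXT")
ensure_column(conn, "audiodescriptions", "ok_une_ad", "TEXT")  # second call is a no-op

cols = [row[1] for row in conn.execute("PRAGMA table_info(audiodescriptions)")]
print(cols)  # ['sha1sum', 'free_ad', 'ok_une_ad'] - no duplicate column, no error
conn.close()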
scripts/migrate_audiodescriptions_info_ad.py
ADDED
@@ -0,0 +1,58 @@

#!/usr/bin/env python3
import sqlite3
from pathlib import Path
from typing import List

# This script lives at:
# demo/scripts/migrate_audiodescriptions_info_ad.py
# BASE_DIR points at the demo/ folder
BASE_DIR = Path(__file__).resolve().parent.parent

DB_REL_PATHS: List[Path] = [
    Path("temp") / "audiodescriptions.db",
    Path("data") / "audiodescriptions.db",
]


def add_info_ad_column(db_path: Path) -> None:
    full_path = BASE_DIR / db_path
    if not full_path.exists():
        print(f"[AVISO] DB no encontrada, se omite: {full_path}")
        return

    print(f"\n=== Migrando BD: {full_path} ===")
    conn = sqlite3.connect(full_path)
    try:
        conn.isolation_level = None
        conn.execute("BEGIN")

        # Check which columns already exist in the audiodescriptions table
        cur = conn.execute("PRAGMA table_info(audiodescriptions)")
        cols = {row[1] for row in cur.fetchall()}  # row[1] = column name

        if "info_ad" in cols:
            print(" - Columna info_ad ya existe, no se hace nada.")
        else:
            print(" - Añadiendo columna info_ad...")
            conn.execute("ALTER TABLE audiodescriptions ADD COLUMN info_ad TEXT")

        conn.execute("COMMIT")
        print(f"✔ Migración completada en: {full_path}")
    except Exception as e:
        print(f"❌ Error en {full_path}: {e}")
        try:
            conn.execute("ROLLBACK")
        except Exception:
            pass
    finally:
        conn.close()


def main() -> None:
    print("Script de migración: añadir columna info_ad a audiodescriptions.db\n")
    for rel in DB_REL_PATHS:
        add_info_ad_column(rel)


if __name__ == "__main__":
    main()
scripts/publish_monthly_digest.py
CHANGED

@@ -1,7 +1,7 @@
 import argparse
 from datetime import datetime, timezone
 
-from
+from compliance_client import compliance_client
 
 
 def _current_period_utc() -> str:

@@ -14,7 +14,7 @@ def main() -> None:
     parser = argparse.ArgumentParser(
         description=(
             "Publica el digest mensual de autorizaciones en Polygon "
-            "usando
+            "usando el microservicio 'compliance' (ComplianceClient)."
         )
     )
     parser.add_argument(

@@ -26,8 +26,8 @@ def main() -> None:
     args = parser.parse_args()
 
     period = args.period or _current_period_utc()
-    print(f"[DIGEST] Publicando digest para el período {period}...")
-    tx_hash =
+    print(f"[DIGEST] Publicando digest para el período {period} via compliance...")
+    tx_hash = compliance_client.publish_monthly_digest(period)
 
     if tx_hash:
         print(f"[DIGEST] Digest publicado correctamente. Tx hash: {tx_hash}")
scripts/test_full_refinement_via_api.py
ADDED
@@ -0,0 +1,119 @@

from __future__ import annotations

import argparse
import difflib
import os
import sqlite3
from pathlib import Path

import requests


BASE_DEMO = Path(__file__).resolve().parent.parent
DB_PATH = BASE_DEMO / "temp" / "audiodescriptions.db"
DEFAULT_API_URL = "http://localhost:8000/apply_refinement"


def load_une_ad(sha1sum: str, version: str) -> str:
    if not DB_PATH.exists():
        raise SystemExit(f"❌ No s'ha trobat la base de dades: {DB_PATH}")

    conn = sqlite3.connect(str(DB_PATH))
    conn.row_factory = sqlite3.Row
    try:
        cur = conn.cursor()
        row = cur.execute(
            "SELECT une_ad FROM audiodescriptions WHERE sha1sum=? AND version=?",
            (sha1sum, version),
        ).fetchone()
        if not row or not row["une_ad"]:
            raise SystemExit(
                f"❌ No s'ha trobat une_ad per sha1sum={sha1sum}, version={version} a audiodescriptions.db"
            )
        return row["une_ad"]
    finally:
        conn.close()


def show_diff(initial_srt: str, refined_srt: str) -> None:
    initial_lines = initial_srt.splitlines()
    refined_lines = refined_srt.splitlines()

    diff = difflib.unified_diff(
        initial_lines,
        refined_lines,
        fromfile="initial_une_ad.srt",
        tofile="refined_une_ad.srt",
        lineterm="",
    )
    for line in diff:
        print(line)


def main() -> None:
    parser = argparse.ArgumentParser(
        description=(
            "Prova de la pipeline completa de refinement (reflection + reflexion + introspection) "
            "via l'endpoint /apply_refinement."
        )
    )
    parser.add_argument("sha1sum", type=str, help="SHA1 del vídeo")
    parser.add_argument("version", type=str, help="Versió de la AD (p.ex. Salamandra, MoE, HITL)")
    parser.add_argument(
        "--api-url",
        type=str,
        default=DEFAULT_API_URL,
        help=f"URL de l'endpoint apply_refinement (per defecte: {DEFAULT_API_URL})",
    )
    parser.add_argument(
        "--no-reflection",
        action="store_true",
        help="Desactiva el pas de reflection per a aquesta prova",
    )
    parser.add_argument(
        "--no-reflexion",
        action="store_true",
        help="Desactiva el pas de reflexion per a aquesta prova",
    )
    parser.add_argument(
        "--no-introspection",
        action="store_true",
        help="Desactiva el pas d'introspection per a aquesta prova",
    )
    args = parser.parse_args()

    token = os.getenv("API_SHARED_TOKEN")
    if not token:
        print("⚠️ Variable d'entorn API_SHARED_TOKEN no definida; es farà la crida sense token.")

    initial_srt = load_une_ad(args.sha1sum, args.version)

    payload = {
        "token": token,
        "sha1sum": args.sha1sum,
        "version": args.version,
        "reflection_enabled": not args.no_reflection,
        "reflexion_enabled": not args.no_reflexion,
        "introspection_enabled": not args.no_introspection,
    }

    print(f"Cridant {args.api_url} amb payload: { {k: v for k, v in payload.items() if k != 'token'} }")

    resp = requests.post(args.api_url, json=payload)
    if resp.status_code != 200:
        print(f"❌ Error {resp.status_code} des de l'API: {resp.text}")
        raise SystemExit(1)

    data = resp.json()
    refined_srt = data.get("refined_srt", "")
    if not refined_srt:
        print("⚠️ Resposta sense camp 'refined_srt'. JSON complet:")
        print(data)
        raise SystemExit(1)

    print("\n===== DIFF entre SRT inicial i SRT refinat via API =====")
    show_diff(initial_srt, refined_srt)


if __name__ == "__main__":
    main()
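To exercise this script without the real refinement backend, a throwaway stand-in for the endpoint can be run locally. This stub is hypothetical (the actual /apply_refinement server is not part of this commit); it just echoes a trivially "refined" SRT under the key the script expects:

import json
from http.server import BaseHTTPRequestHandler, HTTPServer


class StubRefinement(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path != "/apply_refinement":
            self.send_error(404)
            return
        length = int(self.headers.get("Content-Length", 0))
        payload = json.loads(self.rfile.read(length) or b"{}")
        # Echo back something under the "refined_srt" key the test script reads.
        body = json.dumps(
            {"refined_srt": f"1\n00:00:00,000 --> 00:00:01,000\nstub for {payload.get('sha1sum')}\n"}
        ).encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)


if __name__ == "__main__":
    # Listens on the script's DEFAULT_API_URL host/port.
    HTTPServer(("localhost", 8000), StubRefinement).serve_forever()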
scripts/test_introspection_only_on_db_srt.py
ADDED
@@ -0,0 +1,101 @@

from __future__ import annotations

import argparse
import difflib
import sqlite3
from pathlib import Path

import yaml

from engine.refinement.multiagent_refinement import execute_refinement


BASE_DEMO = Path(__file__).resolve().parent.parent
DB_PATH = BASE_DEMO / "temp" / "audiodescriptions.db"
CONFIG_PATH = BASE_DEMO / "temp" / "introspection_only.yaml"


def ensure_introspection_only_config() -> Path:
    """Creates (or overwrites) a minimal config.yaml with only introspection enabled."""

    cfg = {
        "refinement": {
            "reflection_enabled": False,
            "reflexion_enabled": False,
            "introspection_enabled": True,
        }
    }
    CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
    with CONFIG_PATH.open("w", encoding="utf-8") as f:
        yaml.safe_dump(cfg, f, allow_unicode=True)
    return CONFIG_PATH


def load_srt_from_db(sha1sum: str | None = None, version: str | None = None) -> tuple[str, str, str]:
    """Loads a UNE SRT from audiodescriptions.db.

    If sha1sum/version are not given, takes the first row with a non-null une_ad.
    Returns (sha1sum, version, une_ad).
    """

    if not DB_PATH.exists():
        raise SystemExit(f"❌ No s'ha trobat la base de dades: {DB_PATH}")

    conn = sqlite3.connect(str(DB_PATH))
    conn.row_factory = sqlite3.Row
    try:
        cur = conn.cursor()
        if sha1sum and version:
            row = cur.execute(
                "SELECT sha1sum, version, une_ad FROM audiodescriptions WHERE sha1sum=? AND version=?",
                (sha1sum, version),
            ).fetchone()
        else:
            row = cur.execute(
                "SELECT sha1sum, version, une_ad FROM audiodescriptions WHERE une_ad IS NOT NULL LIMIT 1",
            ).fetchone()

        if not row or not row["une_ad"]:
            raise SystemExit("❌ No s'ha trobat cap registre amb une_ad a audiodescriptions.db")

        return row["sha1sum"], row["version"], row["une_ad"]
    finally:
        conn.close()


def show_diff(initial_srt: str, refined_srt: str) -> None:
    initial_lines = initial_srt.splitlines()
    refined_lines = refined_srt.splitlines()

    diff = difflib.unified_diff(
        initial_lines,
        refined_lines,
        fromfile="initial_une_ad.srt",
        tofile="introspected_une_ad.srt",
        lineterm="",
    )
    for line in diff:
        print(line)


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Prova del pas d'introspection sobre un SRT de audiodescriptions.db (sense reflection/reflexion).",
    )
    parser.add_argument("--sha1sum", type=str, default=None, help="SHA1 del vídeo (opcional)")
    parser.add_argument("--version", type=str, default=None, help="Versió de la AD (opcional)")
    args = parser.parse_args()

    cfg_path = ensure_introspection_only_config()

    sha1sum, version, une_ad = load_srt_from_db(args.sha1sum, args.version)
    print(f"Usant registre sha1sum={sha1sum}, version={version} de {DB_PATH}")

    refined = execute_refinement(une_ad, config_path=cfg_path)

    print("\n===== DIFF entre SRT inicial i SRT després d'introspection =====")
    show_diff(une_ad, refined)


if __name__ == "__main__":
    main()
scripts/test_reflection_only_on_db_srt.py
ADDED
@@ -0,0 +1,101 @@

from __future__ import annotations

import argparse
import difflib
import sqlite3
from pathlib import Path

import yaml

from engine.refinement.multiagent_refinement import execute_refinement


BASE_DEMO = Path(__file__).resolve().parent.parent
DB_PATH = BASE_DEMO / "temp" / "audiodescriptions.db"
CONFIG_PATH = BASE_DEMO / "temp" / "reflection_only.yaml"


def ensure_reflection_only_config() -> Path:
    """Creates (or overwrites) a minimal config.yaml with only reflection enabled."""

    cfg = {
        "refinement": {
            "reflection_enabled": True,
            "reflexion_enabled": False,
            "introspection_enabled": False,
        }
    }
    CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
    with CONFIG_PATH.open("w", encoding="utf-8") as f:
        yaml.safe_dump(cfg, f, allow_unicode=True)
    return CONFIG_PATH


def load_srt_from_db(sha1sum: str | None = None, version: str | None = None) -> tuple[str, str, str]:
    """Loads a UNE SRT from audiodescriptions.db.

    If sha1sum/version are not given, takes the first row with a non-null une_ad.
    Returns (sha1sum, version, une_ad).
    """

    if not DB_PATH.exists():
        raise SystemExit(f"❌ No s'ha trobat la base de dades: {DB_PATH}")

    conn = sqlite3.connect(str(DB_PATH))
    conn.row_factory = sqlite3.Row
    try:
        cur = conn.cursor()
        if sha1sum and version:
            row = cur.execute(
                "SELECT sha1sum, version, une_ad FROM audiodescriptions WHERE sha1sum=? AND version=?",
                (sha1sum, version),
            ).fetchone()
        else:
            row = cur.execute(
                "SELECT sha1sum, version, une_ad FROM audiodescriptions WHERE une_ad IS NOT NULL LIMIT 1",
            ).fetchone()

        if not row or not row["une_ad"]:
            raise SystemExit("❌ No s'ha trobat cap registre amb une_ad a audiodescriptions.db")

        return row["sha1sum"], row["version"], row["une_ad"]
    finally:
        conn.close()


def show_diff(initial_srt: str, refined_srt: str) -> None:
    initial_lines = initial_srt.splitlines()
    refined_lines = refined_srt.splitlines()

    diff = difflib.unified_diff(
        initial_lines,
        refined_lines,
        fromfile="initial_une_ad.srt",
        tofile="reflected_une_ad.srt",
        lineterm="",
    )
    for line in diff:
        print(line)


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Prova del pas de reflection sobre un SRT de audiodescriptions.db (sense reflexion/introspection).",
    )
    parser.add_argument("--sha1sum", type=str, default=None, help="SHA1 del vídeo (opcional)")
    parser.add_argument("--version", type=str, default=None, help="Versió de la AD (opcional)")
    args = parser.parse_args()

    cfg_path = ensure_reflection_only_config()

    sha1sum, version, une_ad = load_srt_from_db(args.sha1sum, args.version)
    print(f"Usant registre sha1sum={sha1sum}, version={version} de {DB_PATH}")

    refined = execute_refinement(une_ad, config_path=cfg_path)

    print("\n===== DIFF entre SRT inicial i SRT després de reflection =====")
    show_diff(une_ad, refined)


if __name__ == "__main__":
    main()
scripts/test_reflexion_only_on_db_srt.py
ADDED
@@ -0,0 +1,101 @@

from __future__ import annotations

import argparse
import difflib
import sqlite3
from pathlib import Path

import yaml

from engine.refinement.multiagent_refinement import execute_refinement


BASE_DEMO = Path(__file__).resolve().parent.parent
DB_PATH = BASE_DEMO / "temp" / "audiodescriptions.db"
CONFIG_PATH = BASE_DEMO / "temp" / "reflexion_only.yaml"


def ensure_reflexion_only_config() -> Path:
    """Creates (or overwrites) a minimal config.yaml with only reflexion enabled."""

    cfg = {
        "refinement": {
            "reflection_enabled": False,
            "reflexion_enabled": True,
            "introspection_enabled": False,
        }
    }
    CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
    with CONFIG_PATH.open("w", encoding="utf-8") as f:
        yaml.safe_dump(cfg, f, allow_unicode=True)
    return CONFIG_PATH


def load_srt_from_db(sha1sum: str | None = None, version: str | None = None) -> tuple[str, str, str]:
    """Loads a UNE SRT from audiodescriptions.db.

    If sha1sum/version are not given, takes the first row with a non-null une_ad.
    Returns (sha1sum, version, une_ad).
    """

    if not DB_PATH.exists():
        raise SystemExit(f"❌ No s'ha trobat la base de dades: {DB_PATH}")

    conn = sqlite3.connect(str(DB_PATH))
    conn.row_factory = sqlite3.Row
    try:
        cur = conn.cursor()
        if sha1sum and version:
            row = cur.execute(
                "SELECT sha1sum, version, une_ad FROM audiodescriptions WHERE sha1sum=? AND version=?",
                (sha1sum, version),
            ).fetchone()
        else:
            row = cur.execute(
                "SELECT sha1sum, version, une_ad FROM audiodescriptions WHERE une_ad IS NOT NULL LIMIT 1",
            ).fetchone()

        if not row or not row["une_ad"]:
            raise SystemExit("❌ No s'ha trobat cap registre amb une_ad a audiodescriptions.db")

        return row["sha1sum"], row["version"], row["une_ad"]
    finally:
        conn.close()


def show_diff(initial_srt: str, refined_srt: str) -> None:
    initial_lines = initial_srt.splitlines()
    refined_lines = refined_srt.splitlines()

    diff = difflib.unified_diff(
        initial_lines,
        refined_lines,
        fromfile="initial_une_ad.srt",
        tofile="reflexioned_une_ad.srt",
        lineterm="",
    )
    for line in diff:
        print(line)


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Prova del pas de 'reflexion' sobre un SRT de audiodescriptions.db (sense reflection/introspection).",
    )
    parser.add_argument("--sha1sum", type=str, default=None, help="SHA1 del vídeo (opcional)")
    parser.add_argument("--version", type=str, default=None, help="Versió de la AD (opcional)")
    args = parser.parse_args()

    cfg_path = ensure_reflexion_only_config()

    sha1sum, version, une_ad = load_srt_from_db(args.sha1sum, args.version)
    print(f"Usant registre sha1sum={sha1sum}, version={version} de {DB_PATH}")

    refined = execute_refinement(une_ad, config_path=cfg_path)

    print("\n===== DIFF entre SRT inicial i SRT després de reflexion =====")
    show_diff(une_ad, refined)


if __name__ == "__main__":
    main()
scripts/train_introspection.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from engine.refinement.introspection import (
+    FEW_SHOT_PATH,
+    RULES_PATH,
+    train_introspection_rules,
+)
+
+
+def _count_nonempty_blocks(path: Path) -> int:
+    """Roughly count how many example blocks a file contains.
+
+    For ``few_shot_examples.txt`` we count lines starting with ``# sha1sum=``.
+    For ``rules.txt`` we count non-empty lines.
+    """
+
+    if not path.exists():
+        return 0
+    try:
+        text = path.read_text(encoding="utf-8")
+    except Exception:
+        return 0
+
+    if path.name == "few_shot_examples.txt":
+        return sum(1 for line in text.splitlines() if line.lstrip().startswith("# sha1sum="))
+    return sum(1 for line in text.splitlines() if line.strip())
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Train the introspection rules from the HITL corrections "
+            "stored in demo/temp/audiodescriptions.db."
+        )
+    )
+    parser.add_argument(
+        "--max-examples",
+        type=int,
+        default=None,
+        help=(
+            "Maximum number of (MoE/Salamandra, HITL) pairs to process. "
+            "By default all of them are processed."
+        ),
+    )
+    args = parser.parse_args()
+
+    train_introspection_rules(max_examples=args.max_examples)
+
+    n_examples = _count_nonempty_blocks(FEW_SHOT_PATH)
+    n_rules = _count_nonempty_blocks(RULES_PATH)
+
+    print(
+        f"✅ Introspection training completed. "
+        f"Few-shot examples: {n_examples}, rules: {n_rules}."
+    )
+
+
+if __name__ == "__main__":
+    main()
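_count_nonempty_blocks treats the two artifacts differently: few-shot files are counted by their "# sha1sum=" header lines, rule files by non-empty lines. The same counting logic on in-memory strings (the sample content below is invented):

# Invented sample contents for the two training artifacts.
few_shot_text = "# sha1sum=abc123\nexample body\n\n# sha1sum=def456\nanother body\n"
rules_text = "Keep AD inside silent gaps.\n\nAvoid redundancy with dialogue.\n"

n_examples = sum(
    1 for line in few_shot_text.splitlines() if line.lstrip().startswith("# sha1sum=")
)
n_rules = sum(1 for line in rules_text.splitlines() if line.strip())
print(n_examples, n_rules)  # -> 2 2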
scripts/train_reflexion.py
ADDED
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from engine.refinement.reflexion import (
+    REFLEXION_CSV_PATH,
+    REFLEXION_MODEL_PATH,
+    train_reflexion_model,
+)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Train the 'reflexion' KNN model from the (MoE/Salamandra, HITL) "
+            "pairs in demo/temp/audiodescriptions.db."
+        )
+    )
+    parser.add_argument(
+        "--max-examples",
+        type=int,
+        default=None,
+        help=(
+            "Maximum number of training samples to process. "
+            "By default all of them are processed."
+        ),
+    )
+    args = parser.parse_args()
+
+    train_reflexion_model(max_examples=args.max_examples)
+
+    n_rows = 0
+    if REFLEXION_CSV_PATH.exists():
+        try:
+            text = REFLEXION_CSV_PATH.read_text(encoding="utf-8")
+            # Subtract the header row.
+            n_rows = max(0, len([l for l in text.splitlines() if l.strip()]) - 1)
+        except Exception:
+            n_rows = 0
+
+    model_str = "created" if REFLEXION_MODEL_PATH.exists() else "not created"
+
+    print(
+        f"✅ Reflexion training completed. "
+        f"Samples in the CSV: {n_rows}, model file: {model_str} ({REFLEXION_MODEL_PATH})."
+    )
+
+
+if __name__ == "__main__":
+    main()
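The reported sample count is simply "non-empty CSV lines minus the header". A self-contained sketch of that bookkeeping (the CSV content is invented):

# Invented CSV content: a header row plus two samples and a trailing blank line.
csv_text = "source,target\nMoE draft 1,HITL fix 1\nMoE draft 2,HITL fix 2\n\n"

# Non-empty lines minus the header row, floored at zero.
n_rows = max(0, len([line for line in csv_text.splitlines() if line.strip()]) - 1)
print(n_rows)  # -> 2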
scripts/update_audiodescriptions_json_ad.py
ADDED
@@ -0,0 +1,114 @@
+from __future__ import annotations
+
+from pathlib import Path
+import sqlite3
+from typing import Iterable, Optional
+
+import json
+
+
+BASE_DEMO = Path(__file__).resolve().parent.parent
+MEDIA_ROOT = BASE_DEMO / "data" / "media"
+DB_PATHS = [
+    BASE_DEMO / "data" / "audiodescriptions.db",
+    BASE_DEMO / "temp" / "audiodescriptions.db",
+]
+
+
+def read_preprocess_json(sha1sum: str) -> Optional[str]:
+    """Read the preprocess.json file for a given sha1sum.
+
+    Returns its content as plain text (UTF-8), or None if the file does not
+    exist or cannot be read.
+    """
+
+    video_dir = MEDIA_ROOT / sha1sum
+    path = video_dir / "preprocess.json"
+    if not path.exists():
+        return None
+
+    try:
+        # Read the JSON verbatim so it is stored as text
+        return path.read_text(encoding="utf-8")
+    except Exception:
+        try:
+            return path.read_text(errors="ignore")
+        except Exception:
+            return None
+
+
+def ensure_json_ad_column(conn: sqlite3.Connection) -> None:
+    """Add the info_ad column to audiodescriptions if it does not exist yet.
+
+    (The old name was json_ad; the function name is kept for compatibility.)
+    """
+
+    cur = conn.cursor()
+    cur.execute("PRAGMA table_info(audiodescriptions);")
+    cols = [row[1] for row in cur.fetchall()]
+    target_col = "info_ad"
+    if target_col not in cols:
+        cur.execute(f"ALTER TABLE audiodescriptions ADD COLUMN {target_col} TEXT;")
+        conn.commit()
+
+
+def update_db(path: Path) -> None:
+    """Update an audiodescriptions.db database, filling in info_ad.
+
+    - Makes sure the info_ad column exists.
+    - For each sha1sum present in the table, tries to read
+      demo/data/media/<sha1sum>/preprocess.json and stores its content in info_ad.
+    """
+
+    if not path.exists():
+        print(f"[INFO] {path} does not exist, skipping.")
+        return
+
+    print(f"[INFO] Updating {path} ...")
+
+    conn = sqlite3.connect(str(path))
+    conn.row_factory = sqlite3.Row
+    try:
+        ensure_json_ad_column(conn)
+        cur = conn.cursor()
+
+        # Fetch all distinct sha1sums present in the table
+        cur.execute("SELECT DISTINCT sha1sum FROM audiodescriptions;")
+        rows = cur.fetchall()
+        total = len(rows)
+        print(f" - {total} distinct sha1sums found.")
+
+        updated_rows = 0
+        for idx, row in enumerate(rows, start=1):
+            sha1sum = row["sha1sum"]
+            json_text = read_preprocess_json(sha1sum)
+
+            if json_text is None:
+                # No preprocess.json for this sha1sum; leave info_ad as NULL
+                print(f" [{idx}/{total}] {sha1sum}: preprocess.json not found, skipping.")
+                continue
+
+            cur.execute(
+                "UPDATE audiodescriptions SET info_ad = ? WHERE sha1sum = ?;",
+                (json_text, sha1sum),
+            )
+            updated_rows += cur.rowcount
+            print(f" [{idx}/{total}] {sha1sum}: info_ad updated for {cur.rowcount} rows.")
+
+        conn.commit()
+        print(f"[OK] {path}: {updated_rows} rows updated with info_ad.")
+    finally:
+        conn.close()
+
+
+def main() -> None:
+    print(f"MEDIA_ROOT: {MEDIA_ROOT} (exists={MEDIA_ROOT.exists()})")
+    if not MEDIA_ROOT.exists():
+        raise SystemExit("❌ demo/data/media not found")
+
+    for db_path in DB_PATHS:
+        update_db(db_path)
+
+
+if __name__ == "__main__":
+    main()
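The PRAGMA table_info check makes the ALTER TABLE idempotent, so the migration can be re-run safely. A self-contained sketch of that pattern against an in-memory database (table and column names mirror the script):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE audiodescriptions (sha1sum TEXT, version TEXT)")

def ensure_column(conn: sqlite3.Connection, table: str, column: str) -> None:
    # PRAGMA table_info yields one row per column; index 1 is the column name.
    cols = [row[1] for row in conn.execute(f"PRAGMA table_info({table});")]
    if column not in cols:
        conn.execute(f"ALTER TABLE {table} ADD COLUMN {column} TEXT;")
        conn.commit()

ensure_column(conn, "audiodescriptions", "info_ad")
ensure_column(conn, "audiodescriptions", "info_ad")  # second call is a no-op
print([row[1] for row in conn.execute("PRAGMA table_info(audiodescriptions);")])
# -> ['sha1sum', 'version', 'info_ad']
conn.close()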
scripts/verify_temp_dbs.py
ADDED
@@ -0,0 +1,87 @@
+"""Verifier for the demo DBs (meant to be run from the demo Space).
+
+Shows which .db files exist in:
+- demo/data/db
+- demo/temp/db
+
+It can be run as a standalone script or invoked from the code to leave
+traces in the log.
+"""
+
+from pathlib import Path
+import yaml
+
+# This file lives in demo/scripts, so the demo root is the parent
+DEMO_ROOT = Path(__file__).resolve().parent.parent
+
+
+def read_data_origin() -> str:
+    cfg_path = DEMO_ROOT / "config.yaml"
+    if not cfg_path.exists():
+        return "(config.yaml not found)"
+    try:
+        with cfg_path.open("r", encoding="utf-8") as f:
+            cfg = yaml.safe_load(f) or {}
+        app_cfg = cfg.get("app", {}) or {}
+        return str(app_cfg.get("data_origin", "internal")).lower()
+    except Exception as e:
+        return f"(error reading config.yaml: {e})"
+
+
+def list_dbs(path: Path) -> list[str]:
+    if not path.exists():
+        return []
+    return sorted(str(p.name) for p in path.glob("*.db"))
+
+
+def run_verification() -> None:
+    """Run the verification and write to stdout.
+
+    Meant to be called from ensure_temp_databases to leave traces in the
+    demo Space log.
+    """
+    data_origin = read_data_origin()
+    data_db_dir = DEMO_ROOT / "data" / "db"
+    temp_db_dir = DEMO_ROOT / "temp" / "db"
+
+    print("=== Demo DB verification (invoked from the demo Space) ===")
+    print(f"Demo root: {DEMO_ROOT}")
+    print(f"data_origin: {data_origin}")
+    print(f"data/db dir: {data_db_dir}")
+    print(f"temp/db dir: {temp_db_dir}")
+    print()
+
+    data_dbs = list_dbs(data_db_dir)
+    temp_dbs = list_dbs(temp_db_dir)
+
+    print("-- demo/data/db --")
+    if data_dbs:
+        for name in data_dbs:
+            print(f" - {name}")
+    else:
+        print(" (no .db files)")
+    print()
+
+    print("-- demo/temp/db --")
+    if temp_dbs:
+        for name in temp_dbs:
+            print(f" - {name}")
+    else:
+        print(" (no .db files)")
+    print()
+
+    missing_in_temp = [n for n in data_dbs if n not in temp_dbs]
+    if missing_in_temp:
+        print("Files present in data/db but NOT in temp/db:")
+        for n in missing_in_temp:
+            print(f" - {n}")
+    else:
+        print("All .db files in data/db are also in temp/db (or there are no .db files)")
+
+
+def main() -> None:
+    run_verification()
+
+
+if __name__ == "__main__":
+    main()
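The final check is a one-way comparison: every .db name found in data/db must also appear in temp/db. The same logic on plain lists (file names are invented):

# Invented file listings.
data_dbs = ["audiodescriptions.db", "casting.db", "feedback.db"]
temp_dbs = ["audiodescriptions.db", "feedback.db"]

# Names present in data/db that were never copied to temp/db.
missing_in_temp = [name for name in data_dbs if name not in temp_dbs]
print(missing_in_temp)  # -> ['casting.db']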
scripts/video_analysis.py
ADDED
@@ -0,0 +1,189 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from datetime import timedelta
+from typing import List, Optional, Dict, Any
+
+
+TIME_RE = re.compile(
+    r"(?P<start>\d{2}:\d{2}:\d{2}[,\.]\d{3})\s*-->\s*(?P<end>\d{2}:\d{2}:\d{2}[,\.]\d{3})"
+)
+
+
+@dataclass
+class SRTBlock:
+    index: int
+    start: float  # seconds
+    end: float  # seconds
+    text: str
+
+
+def _parse_timestamp(ts: str) -> float:
+    """Convert 'HH:MM:SS,mmm' or 'HH:MM:SS.mmm' to seconds (float)."""
+    ts = ts.replace(",", ".")
+    h, m, s = ts.split(":")
+    seconds, millis = (s.split(".") + ["0"])[:2]
+    td = timedelta(
+        hours=int(h),
+        minutes=int(m),
+        seconds=int(seconds),
+        milliseconds=int(millis.ljust(3, "0")),
+    )
+    return td.total_seconds()
+
+
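# A minimal sanity check for the conversion above (a sketch; it assumes
# _parse_timestamp from this file is in scope):
assert _parse_timestamp("00:01:02,500") == 62.5
assert _parse_timestamp("01:00:00.250") == 3600.25
assert _parse_timestamp("00:00:05,5") == 5.5  # ".5" is right-padded to "500" ms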
+def _parse_srt(srt_text: str) -> List[SRTBlock]:
+    """Parse SRT text into a list of SRTBlock items."""
+    srt_text = srt_text.replace("\r\n", "\n").replace("\r", "\n")
+    chunks = [c.strip() for c in re.split(r"\n\s*\n", srt_text) if c.strip()]
+    blocks: List[SRTBlock] = []
+
+    for chunk in chunks:
+        lines = chunk.split("\n")
+        idx_line = 0
+        index = None
+
+        if lines and lines[0].strip().isdigit():
+            index = int(lines[0].strip())
+            idx_line = 1
+
+        time_match = None
+        time_line_idx = None
+        for i in range(idx_line, min(idx_line + 3, len(lines))):
+            m = TIME_RE.search(lines[i])
+            if m:
+                time_match = m
+                time_line_idx = i
+                break
+
+        if not time_match or time_line_idx is None:
+            continue
+
+        start = _parse_timestamp(time_match.group("start"))
+        end = _parse_timestamp(time_match.group("end"))
+        if index is None:
+            index = len(blocks) + 1
+
+        text = "\n".join(lines[time_line_idx + 1 :]).strip()
+        blocks.append(SRTBlock(index=index, start=start, end=end, text=text))
+
+    return blocks
+
+
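# A minimal check of _parse_srt on a toy two-block SRT (a sketch; it assumes
# the definitions above are in scope). One dialogue block, one AD-marked block:
_sample = (
    "1\n00:00:01,000 --> 00:00:02,000\nHello.\n\n"
    "2\n00:00:03,000 --> 00:00:04,500\n[AD] A door opens.\n"
)
_blocks = _parse_srt(_sample)
assert len(_blocks) == 2
assert (_blocks[1].start, _blocks[1].end) == (3.0, 4.5)
assert _blocks[1].text == "[AD] A door opens."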
+def analyze_srt(
+    srt_text: str,
+    *,
+    ad_markers: Optional[List[str]] = None,
+) -> Dict[str, Any]:
+    """Analyze an SRT and return basic metrics.
+
+    Returned metrics:
+    - duration_sec: estimated total duration of the video (seconds)
+    - words_per_min: number of words per minute
+    - speakers_blocks_per_min: number of dialogue blocks per minute
+    - ad_time_ratio: fraction (0..1) of the total time covered by AD-marked blocks
+    - blocks_per_min: total number of blocks per minute
+
+    Heuristics:
+    - The video duration is assumed to be the end of the last block.
+    - An "AD block" is one whose first line contains any of the markers
+      given in `ad_markers` (for example: "[AD]", "AD:", "(AD)").
+    """
+
+    blocks = _parse_srt(srt_text)
+    if not blocks:
+        return {
+            "duration_sec": 0.0,
+            "words_per_min": 0.0,
+            "speakers_blocks_per_min": 0.0,
+            "ad_time_ratio": 0.0,
+            "blocks_per_min": 0.0,
+        }
+
+    duration_sec = max(b.end for b in blocks)
+    duration_min = max(duration_sec / 60.0, 1e-6)
+
+    # Total word count
+    total_words = 0
+    for b in blocks:
+        total_words += len(b.text.split())
+
+    # Blocks considered "speaker" blocks (not AD)
+    if ad_markers is None:
+        ad_markers = ["[AD]", "AD:", "(AD)"]
+
+    def is_ad_block(block: SRTBlock) -> bool:
+        first_line = (block.text.splitlines() or [""])[0].strip().upper()
+        for mk in ad_markers:
+            if mk.upper() in first_line:
+                return True
+        return False
+
+    ad_time = 0.0
+    speech_blocks = 0
+    for b in blocks:
+        if is_ad_block(b):
+            ad_time += max(0.0, b.end - b.start)
+        else:
+            speech_blocks += 1
+
+    words_per_min = total_words / duration_min
+    speakers_blocks_per_min = speech_blocks / duration_min
+    blocks_per_min = len(blocks) / duration_min
+    ad_time_ratio = ad_time / duration_sec if duration_sec > 0 else 0.0
+
+    return {
+        "duration_sec": float(duration_sec),
+        "words_per_min": float(words_per_min),
+        "speakers_blocks_per_min": float(speakers_blocks_per_min),
+        "ad_time_ratio": float(ad_time_ratio),
+        "blocks_per_min": float(blocks_per_min),
+    }
+
+
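# Expected metrics for the toy _sample SRT in the sketch above: the duration
# is the end of the last block (4.5 s), the AD block covers 1.5 s of it, and
# there is a single dialogue ("speaker") block.
_metrics = analyze_srt(_sample)
assert _metrics["duration_sec"] == 4.5
assert round(_metrics["ad_time_ratio"], 2) == 0.33   # 1.5 s / 4.5 s
assert round(_metrics["blocks_per_min"], 1) == 26.7  # 2 blocks / 0.075 min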
+def embed_srt_sentences(
+    srt_text: str,
+    *,
+    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+) -> Dict[str, Any]:
+    """Return embeddings for the sentences of an SRT.
+
+    Args:
+        srt_text: Full content of the SRT file as a string.
+        model_name: Name of the sentence-transformers model to use.
+
+    Returns:
+        Dictionary with:
+        - "model_name": name of the model used
+        - "sentences": list of strings (one per block)
+        - "embeddings": list of lists of floats with the embeddings
+
+    NOTE: Requires `sentence-transformers` and a compatible PyTorch backend
+    to be installed. If they are not, an ImportError is raised.
+    """
+
+    blocks = _parse_srt(srt_text)
+    sentences = [b.text.replace("\n", " ").strip() for b in blocks if b.text.strip()]
+
+    if not sentences:
+        return {"model_name": model_name, "sentences": [], "embeddings": []}
+
+    try:
+        from sentence_transformers import SentenceTransformer
+    except ImportError as exc:
+        raise ImportError(
+            "sentence-transformers is not installed. "
+            "Install the dependency to be able to generate embeddings."
+        ) from exc
+
+    model = SentenceTransformer(model_name)
+    embs = model.encode(sentences, convert_to_numpy=False)
+
+    embeddings = [list(map(float, vec)) for vec in embs]
+
+    return {
+        "model_name": model_name,
+        "sentences": sentences,
+        "embeddings": embeddings,
+    }
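A short end-to-end usage sketch tying these helpers together. It assumes the repository root is on sys.path so the script is importable as scripts.video_analysis, and the SRT content is invented:

from scripts.video_analysis import analyze_srt, embed_srt_sentences

sample = (
    "1\n00:00:01,000 --> 00:00:02,000\nHello.\n\n"
    "2\n00:00:03,000 --> 00:00:04,500\n[AD] A door opens.\n"
)

print(analyze_srt(sample))

# Embeddings only work when sentence-transformers (and a torch backend) is installed.
try:
    result = embed_srt_sentences(sample)
    print(result["model_name"], len(result["embeddings"]))
except ImportError as exc:
    print(f"Skipping embeddings: {exc}")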