Spaces:

VeuReu
/

demo

Sleeping

App Files Files Community

VeuReu committed on Nov 10, 2025

Commit

f026f25

1 Parent(s): 8404f78

Upload 6 files

Browse files

Files changed (2) hide show

app.py +1 -1
page_modules/process_video.py +1044 -232

app.py CHANGED Viewed

@@ -119,7 +119,7 @@ if page == "Processar vídeo nou":
         st.error("No tens permisos per processar nous vídeos. Verifica el teu mòbil per obtenir accés complet.")
         st.stop()
-    render_process_video_page()
 elif page == "Analitzar video-transcripcions":
     require_login(render_login_form)

         st.error("No tens permisos per processar nous vídeos. Verifica el teu mòbil per obtenir accés complet.")
         st.stop()
+    render_process_video_page(api, BACKEND_BASE_URL)
 elif page == "Analitzar video-transcripcions":
     require_login(render_login_form)

page_modules/process_video.py CHANGED Viewed

@@ -1,232 +1,1044 @@
-"""UI logic for the "Processar vídeo nou" page."""
-from __future__ import annotations
-import re
-import shutil
-import subprocess
-from pathlib import Path
-import streamlit as st
-from PIL import Image, ImageDraw
-def _get_video_duration(path: str) -> float:
-    """Return video duration in seconds using ffprobe, ffmpeg or OpenCV as fallback."""
-    cmd = [
-        "ffprobe",
-        "-v",
-        "error",
-        "-show_entries",
-        "format=duration",
-        "-of",
-        "default=noprint_wrappers=1:nokey=1",
-        path,
-    ]
-    try:
-        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
-        return float(result.stdout.strip())
-    except (subprocess.CalledProcessError, ValueError, FileNotFoundError):
-        pass
-    if shutil.which("ffmpeg"):
-        try:
-            ffmpeg_cmd = ["ffmpeg", "-i", path]
-            result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True, check=False)
-            output = result.stderr or result.stdout or ""
-            match = re.search(r"Duration:\s*(\d+):(\d+):(\d+\.\d+)", output)
-            if match:
-                hours, minutes, seconds = match.groups()
-                total_seconds = (int(hours) * 3600) + (int(minutes) * 60) + float(seconds)
-                return float(total_seconds)
-        except FileNotFoundError:
-            pass
-    # Últim recurs: intentar amb OpenCV si està disponible
-    try:
-        import cv2
-        cap = cv2.VideoCapture(path)
-        if cap.isOpened():
-            fps = cap.get(cv2.CAP_PROP_FPS) or 0
-            frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0
-            cap.release()
-            if fps > 0 and frame_count > 0:
-                return float(frame_count / fps)
-        else:
-            cap.release()
-    except Exception:
-        pass
-    return 0.0
-def _transcode_video(input_path: str, output_path: str, max_duration: int | None = None) -> None:
-    cmd = ["ffmpeg", "-y", "-i", input_path]
-    if max_duration is not None:
-        cmd += ["-t", str(max_duration)]
-    cmd += [
-        "-c:v",
-        "libx264",
-        "-preset",
-        "veryfast",
-        "-crf",
-        "23",
-        "-c:a",
-        "aac",
-        "-movflags",
-        "+faststart",
-        output_path,
-    ]
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    if result.returncode != 0:
-        raise RuntimeError(result.stderr.strip() or "ffmpeg failed")
-def render_process_video_page() -> None:
-    st.header("Processar un nou clip de vídeo")
-    # Inicializar el estado de la página si no existe
-    if "video_uploaded" not in st.session_state:
-        st.session_state.video_uploaded = None
-    if "characters_detected" not in st.session_state:
-        st.session_state.characters_detected = None
-    if "characters_saved" not in st.session_state:
-        st.session_state.characters_saved = False
-    # --- 1. Subida del vídeo ---
-    MAX_SIZE_MB = 20
-    MAX_DURATION_S = 240  # 4 minutos
-    uploaded_file = st.file_uploader(
-        "Puja un clip de vídeo (MP4, < 20MB, < 4 minuts)",
-        type=["mp4"],
-        key="video_uploader",
-    )
-    if uploaded_file is not None:
-        # Resetear el estado si se sube un nuevo archivo
-        if st.session_state.video_uploaded is None or uploaded_file.name != st.session_state.video_uploaded.get(
-            "original_name"
-        ):
-            st.session_state.video_uploaded = {"original_name": uploaded_file.name, "status": "validating"}
-            st.session_state.characters_detected = None
-            st.session_state.characters_saved = False
-        if st.session_state.video_uploaded["status"] == "validating":
-            is_valid = True
-            if uploaded_file.size > MAX_SIZE_MB * 1024 * 1024:
-                st.error(f"El vídeo supera el límit de {MAX_SIZE_MB}MB.")
-                is_valid = False
-            if is_valid:
-                with st.spinner("Processant el vídeo..."):
-                    temp_path = Path("temp_video.mp4")
-                    with temp_path.open("wb") as f:
-                        f.write(uploaded_file.getbuffer())
-                    was_truncated = False
-                    final_video_path = None
-                    try:
-                        duration = _get_video_duration(str(temp_path))
-                        duration_unknown = False
-                        if not duration:
-                            st.warning(
-                                "No s'ha pogut obtenir la durada del vídeo. Es continuarà assumint un màxim de 4 minuts."
-                            )
-                            duration = float(MAX_DURATION_S)
-                            duration_unknown = True
-                        if is_valid:
-                            if duration > MAX_DURATION_S:
-                                was_truncated = True
-                            video_name = Path(uploaded_file.name).stem
-                            video_dir = Path("/tmp/data/videos") / video_name
-                            video_dir.mkdir(parents=True, exist_ok=True)
-                            final_video_path = video_dir / f"{video_name}.mp4"
-                            try:
-                                _transcode_video(
-                                    str(temp_path),
-                                    str(final_video_path),
-                                    MAX_DURATION_S if (was_truncated or duration_unknown) else None,
-                                )
-                            except RuntimeError as exc:
-                                st.error(f"No s'ha pogut processar el vídeo: {exc}")
-                                is_valid = False
-                        if is_valid and final_video_path is not None:
-                            st.session_state.video_uploaded.update(
-                                {
-                                    "status": "processed",
-                                    "path": str(final_video_path),
-                                    "was_truncated": was_truncated or duration_unknown,
-                                    "duration_unknown": duration_unknown,
-                                }
-                            )
-                            st.rerun()
-                    finally:
-                        if temp_path.exists():
-                            temp_path.unlink()
-    if st.session_state.video_uploaded and st.session_state.video_uploaded["status"] == "processed":
-        st.success(f"Vídeo '{st.session_state.video_uploaded['original_name']}' pujat i processat correctament.")
-        if st.session_state.video_uploaded["was_truncated"]:
-            st.warning("El vídeo s'ha truncat a 4 minuts.")
-    st.markdown("---")
-    col1, col2 = st.columns([1, 3])
-    with col1:
-        detect_button_disabled = st.session_state.video_uploaded is None
-        if st.button("Detectar Personatges", disabled=detect_button_disabled):
-            with st.spinner("Detectant personatges..."):
-                st.session_state.characters_detected = [
-                    {
-                        "id": "char1",
-                        "image_path": "init_data/placeholder.png",
-                        "description": "Dona amb cabell ros i ulleres",
-                    },
-                    {
-                        "id": "char2",
-                        "image_path": "init_data/placeholder.png",
-                        "description": "Home amb barba i barret",
-                    },
-                ]
-                st.session_state.characters_saved = False
-    def _load_or_placeholder(path: str, size: tuple[int, int] = (150, 150)):
-        p = Path(path)
-        if p.exists():
-            return str(p)
-        img = Image.new("RGB", size, color=(230, 230, 230))
-        d = ImageDraw.Draw(img)
-        text = "No image"
-        tw, th = d.textlength(text), 12
-        d.text(((size[0]-tw)/2, (size[1]-th)/2), text, fill=(120, 120, 120))
-        return img
-    if st.session_state.characters_detected:
-        st.subheader("Personatges detectats")
-        for char in st.session_state.characters_detected:
-            with st.form(key=f"form_{char['id']}"):
-                col1, col2 = st.columns(2)
-                with col1:
-                    st.image(_load_or_placeholder(char["image_path"]), width=150)
-                with col2:
-                    st.caption(char["description"])
-                    st.text_input("Nom del personatge", key=f"name_{char['id']}")
-                    st.form_submit_button("Cercar")
-        st.markdown("---_**")
-        col1, col2, col3 = st.columns([1, 1, 2])
-        with col1:
-            if st.button("Desar", type="primary"):
-                st.session_state.characters_saved = True
-                st.success("Personatges desats correctament.")
-        with col2:
-            if st.session_state.characters_saved:
-                st.button("Generar Audiodescripció")

+"""UI logic for the "Processar vídeo nou" page - Recovered from backup with full functionality."""
+from __future__ import annotations
+import re
+import shutil
+import subprocess
+import os
+import time
+import tempfile
+from pathlib import Path
+import streamlit as st
+from PIL import Image, ImageDraw
+def get_all_catalan_names():
+    """Retorna tots els noms catalans disponibles."""
+    noms_home = ["Jordi", "Marc", "Pau", "Pere", "Joan", "Josep", "David", "Àlex", "Guillem", "Albert",
+                 "Arnau", "Martí", "Bernat", "Oriol", "Roger", "Pol", "Lluís", "Sergi", "Carles", "Xavier"]
+    noms_dona = ["Maria", "Anna", "Laura", "Marta", "Cristina", "Núria", "Montserrat", "Júlia", "Sara", "Carla",
+                 "Alba", "Elisabet", "Rosa", "Gemma", "Sílvia", "Teresa", "Irene", "Laia", "Marina", "Bet"]
+    return noms_home, noms_dona
+def get_catalan_name_for_speaker(speaker_label: int, used_names_home: list = None, used_names_dona: list = None) -> str:
+    """Genera un nom català per a un speaker, reutilitzant noms de caras si estan disponibles."""
+    noms_home, noms_dona = get_all_catalan_names()
+    if used_names_home is None:
+        used_names_home = []
+    if used_names_dona is None:
+        used_names_dona = []
+    is_male = (speaker_label % 2 == 0)
+    if is_male:
+        if used_names_home:
+            idx = speaker_label // 2
+            return used_names_home[idx % len(used_names_home)]
+        else:
+            hash_val = hash(f"speaker_{speaker_label}")
+            return noms_home[abs(hash_val) % len(noms_home)]
+    else:
+        if used_names_dona:
+            idx = speaker_label // 2
+            return used_names_dona[idx % len(used_names_dona)]
+        else:
+            hash_val = hash(f"speaker_{speaker_label}")
+            return noms_dona[abs(hash_val) % len(noms_dona)]
+def _get_video_duration(path: str) -> float:
+    """Return video duration in seconds using ffprobe, ffmpeg or OpenCV as fallback."""
+    cmd = [
+        "ffprobe",
+        "-v",
+        "error",
+        "-show_entries",
+        "format=duration",
+        "-of",
+        "default=noprint_wrappers=1:nokey=1",
+        path,
+    ]
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+        return float(result.stdout.strip())
+    except (subprocess.CalledProcessError, ValueError, FileNotFoundError):
+        pass
+    if shutil.which("ffmpeg"):
+        try:
+            ffmpeg_cmd = ["ffmpeg", "-i", path]
+            result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True, check=False)
+            output = result.stderr or result.stdout or ""
+            match = re.search(r"Duration:\s*(\d+):(\d+):(\d+\.\d+)", output)
+            if match:
+                hours, minutes, seconds = match.groups()
+                total_seconds = (int(hours) * 3600) + (int(minutes) * 60) + float(seconds)
+                return float(total_seconds)
+        except FileNotFoundError:
+            pass
+    # Últim recurs: intentar amb OpenCV si està disponible
+    try:
+        import cv2
+        cap = cv2.VideoCapture(path)
+        if cap.isOpened():
+            fps = cap.get(cv2.CAP_PROP_FPS) or 0
+            frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0
+            cap.release()
+            if fps > 0 and frame_count > 0:
+                return float(frame_count / fps)
+        else:
+            cap.release()
+    except Exception:
+        pass
+    return 0.0
+def _transcode_video(input_path: str, output_path: str, max_duration: int | None = None) -> None:
+    cmd = ["ffmpeg", "-y", "-i", input_path]
+    if max_duration is not None:
+        cmd += ["-t", str(max_duration)]
+    cmd += [
+        "-c:v",
+        "libx264",
+        "-preset",
+        "veryfast",
+        "-crf",
+        "23",
+        "-c:a",
+        "aac",
+        "-movflags",
+        "+faststart",
+        output_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(result.stderr.strip() or "ffmpeg failed")
+def render_process_video_page(api, backend_base_url: str) -> None:
+    st.header("Processar un nou clip de vídeo")
+    # Inicializar el estado de la página si no existe
+    if "video_uploaded" not in st.session_state:
+        st.session_state.video_uploaded = None
+    if "characters_detected" not in st.session_state:
+        st.session_state.characters_detected = None
+    if "audio_segments" not in st.session_state:
+        st.session_state.audio_segments = None
+    if "voice_labels" not in st.session_state:
+        st.session_state.voice_labels = None
+    if "face_labels" not in st.session_state:
+        st.session_state.face_labels = None
+    if "scene_clusters" not in st.session_state:
+        st.session_state.scene_clusters = None
+    if "scene_detection_done" not in st.session_state:
+        st.session_state.scene_detection_done = False
+    if "detect_done" not in st.session_state:
+        st.session_state.detect_done = False
+    if "casting_finalized" not in st.session_state:
+        st.session_state.casting_finalized = False
+    if "video_name_from_engine" not in st.session_state:
+        st.session_state.video_name_from_engine = None
+    if "diarization_info" not in st.session_state:
+        st.session_state.diarization_info = {}
+    if "characters_saved" not in st.session_state:
+        st.session_state.characters_saved = False
+    # --- 1. Subida del vídeo ---
+    MAX_SIZE_MB = 20
+    MAX_DURATION_S = 240  # 4 minutos
+    uploaded_file = st.file_uploader(
+        "Puja un clip de vídeo (MP4, < 20MB, < 4 minuts)",
+        type=["mp4"],
+        key="video_uploader",
+    )
+    if uploaded_file is not None:
+        # Resetear el estado si se sube un nuevo archivo
+        if st.session_state.video_uploaded is None or uploaded_file.name != st.session_state.video_uploaded.get(
+            "original_name"
+        ):
+            st.session_state.video_uploaded = {"original_name": uploaded_file.name, "status": "validating"}
+            st.session_state.characters_detected = None
+            st.session_state.characters_saved = False
+        if st.session_state.video_uploaded["status"] == "validating":
+            is_valid = True
+            if uploaded_file.size > MAX_SIZE_MB * 1024 * 1024:
+                st.error(f"El vídeo supera el límit de {MAX_SIZE_MB}MB.")
+                is_valid = False
+            if is_valid:
+                with st.spinner("Processant el vídeo..."):
+                    temp_path = Path("temp_video.mp4")
+                    with temp_path.open("wb") as f:
+                        f.write(uploaded_file.getbuffer())
+                    was_truncated = False
+                    final_video_path = None
+                    try:
+                        duration = _get_video_duration(str(temp_path))
+                        duration_unknown = False
+                        if not duration:
+                            st.warning(
+                                "No s'ha pogut obtenir la durada del vídeo. Es continuarà assumint un màxim de 4 minuts."
+                            )
+                            duration = float(MAX_DURATION_S)
+                            duration_unknown = True
+                        if is_valid:
+                            if duration > MAX_DURATION_S:
+                                was_truncated = True
+                            video_name = Path(uploaded_file.name).stem
+                            video_dir = Path("/tmp/data/videos") / video_name
+                            video_dir.mkdir(parents=True, exist_ok=True)
+                            final_video_path = video_dir / f"{video_name}.mp4"
+                            try:
+                                _transcode_video(
+                                    str(temp_path),
+                                    str(final_video_path),
+                                    MAX_DURATION_S if (was_truncated or duration_unknown) else None,
+                                )
+                            except RuntimeError as exc:
+                                st.error(f"No s'ha pogut processar el vídeo: {exc}")
+                                is_valid = False
+                        if is_valid and final_video_path is not None:
+                            st.session_state.video_uploaded.update(
+                                {
+                                    "status": "processed",
+                                    "path": str(final_video_path),
+                                    "was_truncated": was_truncated or duration_unknown,
+                                    "duration_unknown": duration_unknown,
+                                    "bytes": uploaded_file.getvalue(),
+                                    "name": uploaded_file.name,
+                                }
+                            )
+                            st.rerun()
+                    finally:
+                        if temp_path.exists():
+                            temp_path.unlink()
+    if st.session_state.video_uploaded and st.session_state.video_uploaded["status"] == "processed":
+        st.success(f"Vídeo '{st.session_state.video_uploaded['original_name']}' pujat i processat correctament.")
+        if st.session_state.video_uploaded["was_truncated"]:
+            st.warning("El vídeo s'ha truncat a 4 minuts.")
+    # --- 2. Form de detecció amb sliders ---
+    st.markdown("---")
+    with st.form("detect_form"):
+        col_btn, col_face, col_voice, col_scene = st.columns([1, 1, 1, 1])
+        with col_face:
+            st.markdown("**Cares**")
+            face_max_groups = st.slider("Límit de grups (cares)", 1, 10, 5, 1, key="face_max_groups")
+            face_min_cluster = st.slider("Mida mínima (cares)", 1, 5, 3, 1, key="face_min_cluster")
+            face_sensitivity = st.slider("Sensibilitat (cares)", 0.0, 1.0, 0.5, 0.05, key="face_sensitivity",
+                                      help="0.0 = menys clusters (més agressiu), 0.5 = balancejat, 1.0 = més clusters (més permissiu)")
+        with col_voice:
+            st.markdown("**Veus**")
+            voice_max_groups = st.slider("Límit de grups (veus)", 1, 10, 5, 1, key="voice_max_groups")
+            voice_min_cluster = st.slider("Mida mínima (veus)", 1, 5, 3, 1, key="voice_min_cluster")
+            voice_sensitivity = st.slider("Sensibilitat (veus)", 0.0, 1.0, 0.5, 0.05, key="voice_sensitivity",
+                                        help="0.0 = menys clusters (més agressiu), 0.5 = balancejat, 1.0 = més clusters (més permissiu)")
+        with col_scene:
+            st.markdown("**Escenes**")
+            scene_max_groups = st.slider("Límit de grups (escenes)", 1, 10, 3, 1, key="scene_max_groups")
+            scene_min_cluster = st.slider("Mida mínima (escenes)", 5, 20, 12, 1, key="scene_min_cluster")
+            scene_sensitivity = st.slider("Sensibilitat (escenes)", 0.0, 1.0, 0.5, 0.05, key="scene_sensitivity",
+                                        help="0.0 = menys clusters (més agressiu), 0.5 = balancejat, 1.0 = més clusters (més permissiu)")
+        with col_btn:
+            max_frames = st.number_input("Nombre de frames a processar", min_value=10, max_value=500, value=100, step=10,
+                                        help="Nombre de fotogrames equiespaciats a extreure del vídeo per detectar cares")
+            can_detect = st.session_state.video_uploaded is not None
+            submit_detect = st.form_submit_button("Detectar Personatges", disabled=not can_detect)
+            if not can_detect:
+                st.caption("📹 Necessites pujar un vídeo primer")
+        if submit_detect:
+            try:
+                v = st.session_state.video_uploaded
+                # Reset estat abans de començar
+                st.session_state.scene_clusters = None
+                st.session_state.scene_detection_done = False
+                st.session_state.detect_done = False
+                st.session_state.casting_finalized = False
+                resp = api.create_initial_casting(
+                    video_bytes=v["bytes"],
+                    video_name=v["name"],
+                    face_max_groups=face_max_groups,
+                    face_min_cluster_size=face_min_cluster,
+                    face_sensitivity=face_sensitivity,
+                    voice_max_groups=voice_max_groups,
+                    voice_min_cluster_size=voice_min_cluster,
+                    voice_sensitivity=voice_sensitivity,
+                    max_frames=max_frames,
+                )
+                if not isinstance(resp, dict) or not resp.get("job_id"):
+                    st.error("No s'ha pogut crear el job al servidor.")
+                else:
+                    job_id = resp["job_id"]
+                    with st.spinner("Processant al servidor…"):
+                        time.sleep(3)
+                        attempt, max_attempts = 0, 120
+                        progress_placeholder = st.empty()
+                        while attempt < max_attempts:
+                            stt = api.get_job(job_id)
+                            status = stt.get("status")
+                            if status in ("queued", "processing"):
+                                if attempt % 10 == 0:
+                                    elapsed_min = (attempt * 5) // 60
+                                    progress_placeholder.info(f"⏳ Processant al servidor... (~{elapsed_min} min)")
+                                time.sleep(5)
+                                attempt += 1
+                                continue
+                            if status == "failed":
+                                progress_placeholder.empty()
+                                st.error("El processament ha fallat al servidor.")
+                                break
+                            # Success
+                            res = stt.get("results", {})
+                            chars = res.get("characters", [])
+                            fl = res.get("face_labels", [])
+                            segs = res.get("audio_segments", [])
+                            vl = res.get("voice_labels", [])
+                            base_dir = res.get("base_dir")
+                            vname = os.path.basename(base_dir) if base_dir else None
+                            diar_info = res.get("diarization_info", {})
+                            st.session_state.characters_detected = chars or []
+                            st.session_state.face_labels = fl or []
+                            st.session_state.audio_segments = segs or []
+                            st.session_state.voice_labels = vl or []
+                            st.session_state.video_name_from_engine = vname
+                            st.session_state.engine_base_dir = base_dir
+                            st.session_state.diarization_info = diar_info or {}
+                            progress_placeholder.empty()
+                            if chars:
+                                st.success(f"✓ Detecció completada! Trobades {len(chars)} cares.")
+                                st.info("💡 Usa els botons '🎨 Generar descripció' a sota de cada personatge per obtenir descripcions automàtiques amb Salamandra Vision.")
+                            else:
+                                st.info("No s'han detectat cares en aquest vídeo.")
+                            # Detect scenes
+                            try:
+                                scene_out = api.detect_scenes(
+                                    video_bytes=v["bytes"],
+                                    video_name=v["name"],
+                                    max_groups=scene_max_groups,
+                                    min_cluster_size=scene_min_cluster,
+                                    scene_sensitivity=scene_sensitivity,
+                                    frame_interval_sec=0.5,
+                                )
+                                scs = scene_out.get("scene_clusters") if isinstance(scene_out, dict) else None
+                                if isinstance(scs, list):
+                                    st.session_state.scene_clusters = scs
+                                else:
+                                    st.session_state.scene_clusters = []
+                            except Exception:
+                                st.session_state.scene_clusters = []
+                            finally:
+                                st.session_state.scene_detection_done = True
+                            st.session_state.detect_done = True
+                            st.success("✅ Processament completat!")
+                            break
+                        else:
+                            progress_placeholder.empty()
+                            st.warning(f"⏱️ El servidor no ha completat el job en {max_attempts * 5 // 60} minuts.")
+            except Exception as e:
+                st.error(f"Error inesperat: {e}")
+    # --- 3. Carruseles de cares ---
+    if st.session_state.get("characters_detected") is not None:
+        st.markdown("---")
+        n_face_clusters = len(st.session_state.get("characters_detected") or [])
+        st.subheader(f"🖼️ Cares — clústers: {n_face_clusters}")
+        if n_face_clusters == 0:
+            st.info("No s'han detectat clústers de cara en aquest clip.")
+        for idx, ch in enumerate(st.session_state.characters_detected or []):
+            try:
+                folder_name = Path(ch.get("folder") or "").name
+            except Exception:
+                folder_name = ""
+            char_id = ch.get("id") or folder_name or f"char{idx+1}"
+            def _safe_key(s: str) -> str:
+                k = re.sub(r"[^0-9a-zA-Z_]+", "_", s or "")
+                return k or f"cluster_{idx+1}"
+            key_prefix = _safe_key(f"char_{idx+1}_{char_id}")
+            if f"{key_prefix}_idx" not in st.session_state:
+                st.session_state[f"{key_prefix}_idx"] = 0
+            if f"{key_prefix}_discard" not in st.session_state:
+                st.session_state[f"{key_prefix}_discard"] = set()
+            faces_all = ch.get("face_files") or ([ch.get("image_url")] if ch.get("image_url") else [])
+            faces_all = [f for f in faces_all if f]
+            discard_set = st.session_state[f"{key_prefix}_discard"]
+            faces = [f for f in faces_all if f not in discard_set]
+            if not faces:
+                st.write(f"- {idx+1}. {ch.get('name','(sense nom)')} — sense imatges seleccionades")
+                continue
+            cur = st.session_state[f"{key_prefix}_idx"]
+            if cur >= len(faces):
+                cur = 0
+            st.session_state[f"{key_prefix}_idx"] = cur
+            fname = faces[cur]
+            if fname.startswith("/files/"):
+                img_url = f"{backend_base_url}{fname}"
+            else:
+                base = ch.get("image_url") or ""
+                base_dir = "/".join((base or "/").split("/")[:-1])
+                img_url = f"{backend_base_url}{base_dir}/{fname}" if base_dir else f"{backend_base_url}{fname}"
+            st.markdown(f"**{idx+1}. {ch.get('name','(sense nom)')} — {ch.get('num_faces', 0)} cares**")
+            c1, c2 = st.columns([1, 3])
+            with c1:
+                st.image(img_url, width=150)
+                st.caption(f"Imatge {cur+1}/{len(faces)}")
+                bcol1, bcol2, bcol3 = st.columns(3)
+                with bcol1:
+                    if st.button("⬅️", key=f"prev_{key_prefix}", help="Anterior"):
+                        st.session_state[f"{key_prefix}_idx"] = (cur - 1) % len(faces)
+                        st.rerun()
+                with bcol2:
+                    if st.button("🗑️", key=f"del_{key_prefix}", help="Eliminar aquesta imatge del clúster"):
+                        st.session_state[f"{key_prefix}_discard"].add(fname)
+                        new_list = [f for f in faces if f != fname]
+                        new_idx = cur if cur < len(new_list) else 0
+                        st.session_state[f"{key_prefix}_idx"] = new_idx
+                        st.rerun()
+                with bcol3:
+                    if st.button("➡️", key=f"next_{key_prefix}", help="Següent"):
+                        st.session_state[f"{key_prefix}_idx"] = (cur + 1) % len(faces)
+                        st.rerun()
+            with c2:
+                name_key = f"{key_prefix}_name"
+                desc_key = f"{key_prefix}_desc"
+                default_name = ch.get("name", "")
+                default_desc = ch.get("description", "")
+                if default_name and (name_key not in st.session_state or not st.session_state.get(name_key)):
+                    st.session_state[name_key] = default_name
+                elif name_key not in st.session_state:
+                    st.session_state[name_key] = default_name or ""
+                if default_desc and (desc_key not in st.session_state or not st.session_state.get(desc_key)):
+                    st.session_state[desc_key] = default_desc
+                elif desc_key not in st.session_state:
+                    st.session_state[desc_key] = default_desc or ""
+                pending_desc_key = f"{key_prefix}_pending_desc"
+                pending_name_key = f"{key_prefix}_pending_name"
+                if pending_desc_key in st.session_state:
+                    if desc_key not in st.session_state:
+                        st.session_state[desc_key] = ""
+                    st.session_state[desc_key] = st.session_state[pending_desc_key]
+                    del st.session_state[pending_desc_key]
+                if pending_name_key in st.session_state:
+                    if name_key not in st.session_state:
+                        st.session_state[name_key] = ""
+                    if not st.session_state.get(name_key):
+                        st.session_state[name_key] = st.session_state[pending_name_key]
+                    del st.session_state[pending_name_key]
+                st.text_input("Nom del clúster", key=name_key)
+                st.text_area("Descripció", key=desc_key, height=80)
+                if st.button("🎨 Generar descripció amb Salamandra Vision", key=f"svision_{key_prefix}"):
+                    with st.spinner("Generant descripció..."):
+                        from api_client import describe_image_with_svision
+                        import requests as _req
+                        try:
+                            resp = _req.get(img_url, timeout=10)
+                            if resp.status_code == 200:
+                                with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
+                                    tmp.write(resp.content)
+                                    tmp_path = tmp.name
+                                desc, name = describe_image_with_svision(tmp_path, is_face=True)
+                                if desc:
+                                    st.session_state[pending_desc_key] = desc
+                                    st.success("✅ Descripció generada!")
+                                else:
+                                    st.warning("⚠️ No s'ha pogut generar una descripció.")
+                                if name and not st.session_state.get(name_key):
+                                    st.session_state[pending_name_key] = name
+                                os.unlink(tmp_path)
+                                st.rerun()
+                            else:
+                                st.error(f"No s'ha pogut descarregar la imatge (status: {resp.status_code})")
+                        except Exception as e:
+                            st.error(f"Error generant descripció: {e}")
+    # --- 4. Voice carousels ---
+    if st.session_state.get("audio_segments") is not None:
+        st.markdown("---")
+        used_names_home = []
+        used_names_dona = []
+        noms_home_all, noms_dona_all = get_all_catalan_names()
+        for ch in (st.session_state.characters_detected or []):
+            ch_name = ch.get("name", "")
+            if ch_name in noms_home_all:
+                used_names_home.append(ch_name)
+            elif ch_name in noms_dona_all:
+                used_names_dona.append(ch_name)
+        segs = st.session_state.audio_segments or []
+        vlabels = st.session_state.voice_labels or []
+        valid_indices = [i for i, l in enumerate(vlabels) if isinstance(l, int) and l >= 0]
+        clusters = {}
+        for i in valid_indices:
+            lbl = int(vlabels[i])
+            clusters.setdefault(lbl, []).append(i)
+        n_vclusters = len(clusters)
+        st.subheader(f"🎙️ Empremtes de veu — clústers: {n_vclusters}")
+        di = st.session_state.get("diarization_info") or {}
+        if isinstance(di, dict) and not di.get("diarization_ok", True):
+            st.warning("No s'ha pogut fer la diarització amb pyannote (s'ha aplicat un sol segment de reserva).")
+        if not segs:
+            st.info("No s'han detectat mostres de veu.")
+        elif n_vclusters == 0:
+            st.info("No s'han format clústers de veu.")
+        else:
+            vname = st.session_state.video_name_from_engine
+            for lbl, idxs in sorted(clusters.items(), key=lambda x: x[0]):
+                key_prefix = f"voice_{lbl:02d}"
+                if f"{key_prefix}_idx" not in st.session_state:
+                    st.session_state[f"{key_prefix}_idx"] = 0
+                if f"{key_prefix}_discard" not in st.session_state:
+                    st.session_state[f"{key_prefix}_discard"] = set()
+                discard_set = st.session_state[f"{key_prefix}_discard"]
+                files = []
+                for i in idxs:
+                    clip_local = (segs[i] or {}).get("clip_path")
+                    fname = os.path.basename(clip_local) if clip_local else None
+                    if fname:
+                        files.append(fname)
+                files = [f for f in files if f and f not in discard_set]
+                if not files:
+                    st.write(f"- SPEAKER_{lbl:02d} — sense clips seleccionats")
+                    continue
+                cur = st.session_state[f"{key_prefix}_idx"]
+                if cur >= len(files):
+                    cur = 0
+                st.session_state[f"{key_prefix}_idx"] = cur
+                fname = files[cur]
+                audio_url = f"{backend_base_url}/audio/{vname}/{fname}" if (vname and fname) else None
+                st.markdown(f"**SPEAKER_{lbl:02d} — {len(files)} clips**")
+                c1, c2 = st.columns([1, 2])
+                with c1:
+                    if audio_url:
+                        st.audio(audio_url, format="audio/wav")
+                    st.caption(f"Clip {cur+1}/{len(files)}")
+                    bcol1, bcol2, bcol3 = st.columns(3)
+                    with bcol1:
+                        if st.button("⬅️", key=f"prev_{key_prefix}", help="Anterior"):
+                            st.session_state[f"{key_prefix}_idx"] = (cur - 1) % len(files)
+                            st.rerun()
+                    with bcol2:
+                        if st.button("🗑️", key=f"del_{key_prefix}", help="Eliminar aquest clip del clúster"):
+                            st.session_state[f"{key_prefix}_discard"].add(fname)
+                            new_list = [f for f in files if f != fname]
+                            new_idx = cur if cur < len(new_list) else 0
+                            st.session_state[f"{key_prefix}_idx"] = new_idx
+                            st.rerun()
+                    with bcol3:
+                        if st.button("➡️", key=f"next_{key_prefix}", help="Següent"):
+                            st.session_state[f"{key_prefix}_idx"] = (cur + 1) % len(files)
+                            st.rerun()
+                with c2:
+                    name_key = f"{key_prefix}_name"
+                    desc_key = f"{key_prefix}_desc"
+                    default_name = get_catalan_name_for_speaker(lbl, used_names_home, used_names_dona)
+                    st.text_input("Nom del clúster", value=st.session_state.get(name_key, default_name), key=name_key)
+                    st.text_area("Descripció", value=st.session_state.get(desc_key, ""), key=desc_key, height=80)
+    # --- 5. Scene carousels ---
+    if st.session_state.get("scene_detection_done"):
+        st.markdown("---")
+        scene_clusters = st.session_state.get("scene_clusters")
+        n_scenes = len(scene_clusters or [])
+        st.subheader(f"📍 Escenes — clústers: {n_scenes}")
+        if not scene_clusters:
+            st.info("No s'han detectat clústers d'escenes en aquest clip.")
+        else:
+            for sidx, sc in enumerate(scene_clusters):
+                try:
+                    folder_name = Path(sc.get("folder") or "").name
+                except Exception:
+                    folder_name = ""
+                scene_id = sc.get("id") or folder_name or f"scene{sidx+1}"
+                key_prefix = re.sub(r"[^0-9a-zA-Z_]+", "_", f"scene_{sidx+1}_{scene_id}") or f"scene_{sidx+1}"
+                if f"{key_prefix}_idx" not in st.session_state:
+                    st.session_state[f"{key_prefix}_idx"] = 0
+                if f"{key_prefix}_discard" not in st.session_state:
+                    st.session_state[f"{key_prefix}_discard"] = set()
+                frames_all = sc.get("frame_files") or ([sc.get("image_url")] if sc.get("image_url") else [])
+                frames_all = [f for f in frames_all if f]
+                discard_set = st.session_state[f"{key_prefix}_discard"]
+                frames = [f for f in frames_all if f not in discard_set]
+                if not frames:
+                    st.write(f"- {sidx+1}. (sense imatges de l'escena)")
+                    continue
+                cur = st.session_state[f"{key_prefix}_idx"]
+                if cur >= len(frames):
+                    cur = 0
+                st.session_state[f"{key_prefix}_idx"] = cur
+                fname = frames[cur]
+                if str(fname).startswith("/files/"):
+                    img_url = f"{backend_base_url}{fname}"
+                else:
+                    base = sc.get("image_url") or ""
+                    base_dir = "/".join((base or "/").split("/")[:-1])
+                    img_url = f"{backend_base_url}{base_dir}/{fname}" if base_dir else f"{backend_base_url}{fname}"
+                st.markdown(f"**{sidx+1}. Escena — {sc.get('num_frames', 0)} frames**")
+                c1, c2 = st.columns([1, 2])
+                with c1:
+                    st.image(img_url, use_container_width=True)
+                    st.caption(f"Imatge {cur+1}/{len(frames)}")
+                    bcol1, bcol2, bcol3 = st.columns(3)
+                    with bcol1:
+                        if st.button("⬅️", key=f"prev_{key_prefix}", help="Anterior"):
+                            st.session_state[f"{key_prefix}_idx"] = (cur - 1) % len(frames)
+                            st.rerun()
+                    with bcol2:
+                        if st.button("🗑️", key=f"del_{key_prefix}", help="Eliminar aquesta imatge del clúster"):
+                            st.session_state[f"{key_prefix}_discard"].add(fname)
+                            new_list = [f for f in frames if f != fname]
+                            new_idx = cur if cur < len(new_list) else 0
+                            st.session_state[f"{key_prefix}_idx"] = new_idx
+                            st.rerun()
+                    with bcol3:
+                        if st.button("➡️", key=f"next_{key_prefix}", help="Següent"):
+                            st.session_state[f"{key_prefix}_idx"] = (cur + 1) % len(frames)
+                            st.rerun()
+                with c2:
+                    name_key = f"{key_prefix}_name"
+                    desc_key = f"{key_prefix}_desc"
+                    default_scene_name = sc.get("name", "")
+                    default_scene_desc = sc.get("description", "")
+                    if default_scene_name and (name_key not in st.session_state or not st.session_state.get(name_key)):
+                        st.session_state[name_key] = default_scene_name
+                    elif name_key not in st.session_state:
+                        st.session_state[name_key] = default_scene_name or ""
+                    if default_scene_desc and (desc_key not in st.session_state or not st.session_state.get(desc_key)):
+                        st.session_state[desc_key] = default_scene_desc
+                    elif desc_key not in st.session_state:
+                        st.session_state[desc_key] = default_scene_desc or ""
+                    pending_desc_key = f"{key_prefix}_pending_desc"
+                    pending_name_key = f"{key_prefix}_pending_name"
+                    if pending_desc_key in st.session_state:
+                        if desc_key not in st.session_state:
+                            st.session_state[desc_key] = ""
+                        st.session_state[desc_key] = st.session_state[pending_desc_key]
+                        del st.session_state[pending_desc_key]
+                    if pending_name_key in st.session_state:
+                        if name_key not in st.session_state:
+                            st.session_state[name_key] = ""
+                        if not st.session_state.get(name_key):
+                            st.session_state[name_key] = st.session_state[pending_name_key]
+                        del st.session_state[pending_name_key]
+                    st.text_input("Nom del clúster", key=name_key)
+                    st.text_area("Descripció", key=desc_key, height=80)
+                    if st.button("🎨 Generar descripció amb Salamandra Vision", key=f"svision_{key_prefix}"):
+                        with st.spinner("Generant descripció..."):
+                            from api_client import describe_image_with_svision, generate_short_scene_name
+                            import requests as _req
+                            try:
+                                resp = _req.get(img_url, timeout=10)
+                                if resp.status_code == 200:
+                                    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
+                                        tmp.write(resp.content)
+                                        tmp_path = tmp.name
+                                    desc, name = describe_image_with_svision(tmp_path, is_face=False)
+                                    if desc:
+                                        st.session_state[pending_desc_key] = desc
+                                        try:
+                                            short_name = generate_short_scene_name(desc)
+                                            if short_name:
+                                                st.session_state[pending_name_key] = short_name
+                                            elif name:
+                                                st.session_state[pending_name_key] = name
+                                        except Exception:
+                                            if name:
+                                                st.session_state[pending_name_key] = name
+                                        st.success("✅ Descripció i nom generats!")
+                                    else:
+                                        st.warning("⚠️ No s'ha pogut generar una descripció.")
+                                    os.unlink(tmp_path)
+                                    st.rerun()
+                                else:
+                                    st.error(f"No s'ha pogut descarregar la imatge (status: {resp.status_code})")
+                            except Exception as e:
+                                st.error(f"Error generant descripció: {e}")
+    # --- 6. Casting confirmation and combined characters ---
+    if st.session_state.get("detect_done"):
+        st.markdown("---")
+        colc1, colc2 = st.columns([1,1])
+        with colc1:
+            if st.button("Confirmar càsting definitiu", type="primary"):
+                chars_payload = []
+                for idx, ch in enumerate(st.session_state.characters_detected or []):
+                    try:
+                        folder_name = Path(ch.get("folder") or "").name
+                    except Exception:
+                        folder_name = ""
+                    char_id = ch.get("id") or folder_name or f"char{idx+1}"
+                    def _safe_key(s: str) -> str:  # Build a session-state key prefix from `s` using only [0-9a-zA-Z_].
+                        k = re.sub(r"[^0-9a-zA-Z_]+", "_", s or "")  # None/empty becomes ""; each run of other characters becomes one "_".
+                        return k or f"cluster_{idx+1}"  # Positional fallback when `k` is empty; `idx` is the enclosing loop index. NOTE(review): presumably must match the prefix used where the face carousel is rendered - confirm.
+                    key_prefix = _safe_key(f"char_{idx+1}_{char_id}")
+                    name = st.session_state.get(f"{key_prefix}_name") or ch.get("name") or f"Personatge {idx+1}"
+                    desc = st.session_state.get(f"{key_prefix}_desc", "")
+                    faces_all = ch.get("face_files") or []
+                    discard = st.session_state.get(f"{key_prefix}_discard", set())
+                    kept = [f for f in faces_all if f and f not in discard]
+                    chars_payload.append({
+                        "id": char_id,
+                        "name": name,
+                        "description": desc,
+                        "folder": ch.get("folder"),
+                        "kept_files": kept,
+                    })
+                used_names_home_fin = []
+                used_names_dona_fin = []
+                noms_home_all, noms_dona_all = get_all_catalan_names()
+                for cp in chars_payload:
+                    face_name = cp.get("name", "")
+                    if face_name in noms_home_all:
+                        used_names_home_fin.append(face_name)
+                    elif face_name in noms_dona_all:
+                        used_names_dona_fin.append(face_name)
+                segs = st.session_state.audio_segments or []
+                vlabels = st.session_state.voice_labels or []
+                vname = st.session_state.video_name_from_engine
+                voice_clusters = {}
+                for i, seg in enumerate(segs):
+                    lbl = vlabels[i] if i < len(vlabels) else -1
+                    clip_local = seg.get("clip_path")
+                    fname = os.path.basename(clip_local) if clip_local else None
+                    if fname:
+                        default_voice_name = get_catalan_name_for_speaker(int(lbl), used_names_home_fin, used_names_dona_fin) if isinstance(lbl, int) and lbl >= 0 else "UNKNOWN"
+                        voice_clusters.setdefault(lbl, {"label": lbl, "name": default_voice_name, "description": "", "clips": []})
+                        if isinstance(lbl, int) and lbl >= 0:
+                            vpref = f"voice_{int(lbl):02d}"
+                            vname_custom = st.session_state.get(f"{vpref}_name")
+                            vdesc_custom = st.session_state.get(f"{vpref}_desc")
+                            if vname_custom:
+                                voice_clusters[lbl]["name"] = vname_custom
+                            if vdesc_custom is not None:
+                                voice_clusters[lbl]["description"] = vdesc_custom
+                        voice_clusters[lbl]["clips"].append(fname)
+                payload = {
+                    "video_name": vname,
+                    "base_dir": st.session_state.get("engine_base_dir"),
+                    "characters": chars_payload,
+                    "voice_clusters": list(voice_clusters.values()),
+                }
+                if not payload["video_name"] or not payload["base_dir"]:
+                    st.error("Falten dades del vídeo per confirmar el càsting (video_name/base_dir). Torna a processar el vídeo.")
+                else:
+                    with st.spinner("Consolidant càsting al servidor…"):
+                        res_fc = api.finalize_casting(payload)
+                    if isinstance(res_fc, dict) and res_fc.get("ok"):
+                        st.success(f"Càsting consolidat. Identities: {len(res_fc.get('face_identities', []))} cares, {len(res_fc.get('voice_identities', []))} veus.")
+                        st.session_state.casting_finalized = True
+                        f_id = res_fc.get('face_identities', []) or []
+                        v_id = res_fc.get('voice_identities', []) or []
+                        c3, c4 = st.columns(2)
+                        with c3:
+                            st.markdown("**Identitats de cara**")
+                            for n in f_id:
+                                st.write(f"- {n}")
+                        with c4:
+                            st.markdown("**Identitats de veu**")
+                            for n in v_id:
+                                st.write(f"- {n}")
+                        faces_dir = res_fc.get('faces_dir')
+                        voices_dir = res_fc.get('voices_dir')
+                        db_dir = res_fc.get('db_dir')
+                        with st.spinner("Carregant índexs al cercador (Chroma)…"):
+                            load_res = api.load_casting(faces_dir=faces_dir, voices_dir=voices_dir, db_dir=db_dir, drop_collections=True)
+                        if isinstance(load_res, dict) and load_res.get('ok'):
+                            st.success(f"Índexs carregats: {load_res.get('faces', 0)} cares, {load_res.get('voices', 0)} veus.")
+                        else:
+                            st.error(f"Error carregant índexs: {load_res}")
+                    else:
+                        st.error(f"No s'ha pogut consolidar el càsting: {res_fc}")
+        # --- Combined characters (faces + voices) ---
+        if st.session_state.get("casting_finalized"):
+            st.markdown("---")
+            st.subheader("👥 Personatges")
+            def normalize_name(name: str) -> str:  # Return `name` uppercased with nonspacing marks removed; used below to group face and voice clusters by name.
+                import unicodedata
+                name_upper = name.upper()
+                name_normalized = ''.join(
+                    c for c in unicodedata.normalize('NFD', name_upper)  # NFD decomposes accented letters into base letter + combining mark.
+                    if unicodedata.category(c) != 'Mn'  # Drop characters of category 'Mn' (nonspacing marks), i.e. the decomposed accents.
+                )
+                return name_normalized
+            chars_payload = []
+            for idx, ch in enumerate(st.session_state.characters_detected or []):
+                try:
+                    folder_name = Path(ch.get("folder") or "").name
+                except Exception:
+                    folder_name = ""
+                char_id = ch.get("id") or folder_name or f"char{idx+1}"
+                def _safe_key(s: str) -> str:  # Build a session-state key prefix from `s` using only [0-9a-zA-Z_].
+                    k = re.sub(r"[^0-9a-zA-Z_]+", "_", s or "")  # None/empty becomes ""; each run of other characters becomes one "_".
+                    return k or f"cluster_{idx+1}"  # Positional fallback when `k` is empty; `idx` is the enclosing loop index.
+                key_prefix = _safe_key(f"char_{idx+1}_{char_id}")
+                name = st.session_state.get(f"{key_prefix}_name") or ch.get("name") or f"Personatge {idx+1}"
+                name_normalized = normalize_name(name)
+                desc = st.session_state.get(f"{key_prefix}_desc", "").strip()
+                chars_payload.append({
+                    "name": name,
+                    "name_normalized": name_normalized,
+                    "face_key_prefix": key_prefix,
+                    "face_files": ch.get("face_files") or [],
+                    "char_data": ch,
+                    "description": desc,
+                })
+            used_names_home_pers = []
+            used_names_dona_pers = []
+            noms_home_all, noms_dona_all = get_all_catalan_names()
+            for cp in chars_payload:
+                face_name = cp.get("name", "")
+                if face_name in noms_home_all:
+                    used_names_home_pers.append(face_name)
+                elif face_name in noms_dona_all:
+                    used_names_dona_pers.append(face_name)
+            segs = st.session_state.audio_segments or []
+            vlabels = st.session_state.voice_labels or []
+            vname = st.session_state.video_name_from_engine
+            voice_clusters_by_name = {}
+            for i, seg in enumerate(segs):
+                lbl = vlabels[i] if i < len(vlabels) else -1
+                if not (isinstance(lbl, int) and lbl >= 0):
+                    continue
+                vpref = f"voice_{int(lbl):02d}"
+                default_voice_name = get_catalan_name_for_speaker(int(lbl), used_names_home_pers, used_names_dona_pers) if isinstance(lbl, int) and lbl >= 0 else f"SPEAKER_{int(lbl):02d}"
+                vname_custom = st.session_state.get(f"{vpref}_name") or default_voice_name
+                vname_normalized = normalize_name(vname_custom)
+                vdesc = st.session_state.get(f"{vpref}_desc", "").strip()
+                clip_local = seg.get("clip_path")
+                fname = os.path.basename(clip_local) if clip_local else None
+                if fname:
+                    voice_clusters_by_name.setdefault(vname_normalized, {
+                        "voice_key_prefix": vpref,
+                        "clips": [],
+                        "label": lbl,
+                        "original_name": vname_custom,
+                        "description": vdesc,
+                    })
+                    voice_clusters_by_name[vname_normalized]["clips"].append(fname)
+            all_normalized_names = set([c["name_normalized"] for c in chars_payload] + list(voice_clusters_by_name.keys()))
+            for pidx, norm_name in enumerate(sorted(all_normalized_names)):
+                face_items = [c for c in chars_payload if c["name_normalized"] == norm_name]
+                voice_data = voice_clusters_by_name.get(norm_name)
+                display_name = face_items[0]["name"] if face_items else (voice_data["original_name"] if voice_data else norm_name)
+                descriptions = []
+                for face_item in face_items:
+                    if face_item["description"]:
+                        descriptions.append(face_item["description"])
+                if voice_data and voice_data.get("description"):
+                    descriptions.append(voice_data["description"])
+                combined_description = "\n".join(descriptions) if descriptions else ""
+                st.markdown(f"**{pidx+1}. {display_name}**")
+                all_faces = []
+                for face_item in face_items:
+                    all_faces.extend(face_item["face_files"])
+                face_data = face_items[0] if face_items else None
+                col_faces, col_voices, col_text = st.columns([1, 1, 1.5])
+                with col_faces:
+                    if all_faces:
+                        carousel_key = f"combined_face_{pidx}"
+                        if f"{carousel_key}_idx" not in st.session_state:
+                            st.session_state[f"{carousel_key}_idx"] = 0
+                        cur = st.session_state[f"{carousel_key}_idx"]
+                        if cur >= len(all_faces):
+                            cur = 0
+                        st.session_state[f"{carousel_key}_idx"] = cur
+                        fname = all_faces[cur]
+                        ch = face_data["char_data"] if face_data else {}
+                        if fname.startswith("/files/"):
+                            img_url = f"{backend_base_url}{fname}"
+                        else:
+                            base = ch.get("image_url") or ""
+                            base_dir = "/".join((base or "/").split("/")[:-1])
+                            img_url = f"{backend_base_url}{base_dir}/{fname}" if base_dir else f"{backend_base_url}{fname}"
+                        st.image(img_url, width=150)
+                        st.caption(f"Cara {cur+1}/{len(all_faces)}")
+                        bcol1, bcol2 = st.columns(2)
+                        with bcol1:
+                            if st.button("⬅️", key=f"combined_face_prev_{pidx}"):
+                                st.session_state[f"{carousel_key}_idx"] = (cur - 1) % len(all_faces)
+                                st.rerun()
+                        with bcol2:
+                            if st.button("➡️", key=f"combined_face_next_{pidx}"):
+                                st.session_state[f"{carousel_key}_idx"] = (cur + 1) % len(all_faces)
+                                st.rerun()
+                    else:
+                        st.info("Sense imatges")
+                with col_voices:
+                    if voice_data:
+                        clips = voice_data["clips"]
+                        if clips:
+                            carousel_key = f"combined_voice_{pidx}"
+                            if f"{carousel_key}_idx" not in st.session_state:
+                                st.session_state[f"{carousel_key}_idx"] = 0
+                            cur = st.session_state[f"{carousel_key}_idx"]
+                            if cur >= len(clips):
+                                cur = 0
+                            st.session_state[f"{carousel_key}_idx"] = cur
+                            fname = clips[cur]
+                            audio_url = f"{backend_base_url}/audio/{vname}/{fname}" if (vname and fname) else None
+                            if audio_url:
+                                st.audio(audio_url, format="audio/wav")
+                            st.caption(f"Veu {cur+1}/{len(clips)}")
+                            bcol1, bcol2 = st.columns(2)
+                            with bcol1:
+                                if st.button("⬅️", key=f"combined_voice_prev_{pidx}"):
+                                    st.session_state[f"{carousel_key}_idx"] = (cur - 1) % len(clips)
+                                    st.rerun()
+                            with bcol2:
+                                if st.button("➡️", key=f"combined_voice_next_{pidx}"):
+                                    st.session_state[f"{carousel_key}_idx"] = (cur + 1) % len(clips)
+                                    st.rerun()
+                        else:
+                            st.info("Sense clips de veu")
+                    else:
+                        st.info("Sense dades de veu")
+                with col_text:
+                    combined_name_key = f"combined_char_{pidx}_name"
+                    combined_desc_key = f"combined_char_{pidx}_desc"
+                    if combined_name_key not in st.session_state:
+                        st.session_state[combined_name_key] = norm_name
+                    if combined_desc_key not in st.session_state:
+                        st.session_state[combined_desc_key] = combined_description
+                    st.text_input("Nom del personatge", key=combined_name_key, label_visibility="collapsed", placeholder="Nom del personatge")
+                    st.text_area("Descripció", key=combined_desc_key, height=120, label_visibility="collapsed", placeholder="Descripció del personatge")
+            # --- 7. Generate audio description ---
+            st.markdown("---")
+            if st.button("🎬 Generar audiodescripció", type="primary", use_container_width=True):
+                v = st.session_state.get("video_uploaded")
+                if not v:
+                    st.error("No hi ha cap vídeo carregat.")
+                else:
+                    progress_placeholder = st.empty()
+                    result_placeholder = st.empty()
+                    with st.spinner("Generant audiodescripció... Aquest procés pot trigar diversos minuts."):
+                        progress_placeholder.info("⏳ Processant vídeo i generant audiodescripció UNE-153010...")
+                        try:
+                            out = api.generate_audiodescription(v["bytes"], v["name"])
+                            if isinstance(out, dict) and out.get("status") == "done":
+                                progress_placeholder.success("✅ Audiodescripció generada correctament!")
+                                res = out.get("results", {})
+                                with result_placeholder.container():
+                                    st.success("🎉 Audiodescripció completada!")
+                                    c1, c2 = st.columns([1,1])
+                                    with c1:
+                                        st.markdown("**📄 UNE-153010 SRT**")
+                                        une_srt_content = res.get("une_srt", "")
+                                        st.code(une_srt_content, language="text")
+                                        if une_srt_content:
+                                            st.download_button(
+                                                "⬇️ Descarregar UNE SRT",
+                                                data=une_srt_content,
+                                                file_name=f"{v['name']}_une.srt",
+                                                mime="text/plain"
+                                            )
+                                    with c2:
+                                        st.markdown("**📝 Narració lliure**")
+                                        free_text_content = res.get("free_text", "")
+                                        st.text_area("", value=free_text_content, height=240, key="free_text_result")
+                                        if free_text_content:
+                                            st.download_button(
+                                                "⬇️ Descarregar text lliure",
+                                                data=free_text_content,
+                                                file_name=f"{v['name']}_free.txt",
+                                                mime="text/plain"
+                                            )
+                            else:
+                                progress_placeholder.empty()
+                                error_msg = str(out.get("error", out)) if isinstance(out, dict) else str(out)
+                                result_placeholder.error(f"❌ Error generant l'audiodescripció: {error_msg}")
+                        except Exception as e:
+                            progress_placeholder.empty()
+                            result_placeholder.error(f"❌ Excepció durant la generació: {e}")