VeuReu committed on
Commit
f026f25
·
1 Parent(s): 8404f78

Upload 6 files

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. page_modules/process_video.py +1044 -232
app.py CHANGED
@@ -119,7 +119,7 @@ if page == "Processar vídeo nou":
119
  st.error("No tens permisos per processar nous vídeos. Verifica el teu mòbil per obtenir accés complet.")
120
  st.stop()
121
 
122
- render_process_video_page()
123
 
124
  elif page == "Analitzar video-transcripcions":
125
  require_login(render_login_form)
 
119
  st.error("No tens permisos per processar nous vídeos. Verifica el teu mòbil per obtenir accés complet.")
120
  st.stop()
121
 
122
+ render_process_video_page(api, BACKEND_BASE_URL)
123
 
124
  elif page == "Analitzar video-transcripcions":
125
  require_login(render_login_form)
page_modules/process_video.py CHANGED
@@ -1,232 +1,1044 @@
1
- """UI logic for the "Processar vídeo nou" page."""
2
-
3
- from __future__ import annotations
4
-
5
- import re
6
- import shutil
7
- import subprocess
8
- from pathlib import Path
9
-
10
- import streamlit as st
11
- from PIL import Image, ImageDraw
12
-
13
-
14
- def _get_video_duration(path: str) -> float:
15
- """Return video duration in seconds using ffprobe, ffmpeg or OpenCV as fallback."""
16
- cmd = [
17
- "ffprobe",
18
- "-v",
19
- "error",
20
- "-show_entries",
21
- "format=duration",
22
- "-of",
23
- "default=noprint_wrappers=1:nokey=1",
24
- path,
25
- ]
26
- try:
27
- result = subprocess.run(cmd, capture_output=True, text=True, check=True)
28
- return float(result.stdout.strip())
29
- except (subprocess.CalledProcessError, ValueError, FileNotFoundError):
30
- pass
31
-
32
- if shutil.which("ffmpeg"):
33
- try:
34
- ffmpeg_cmd = ["ffmpeg", "-i", path]
35
- result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True, check=False)
36
- output = result.stderr or result.stdout or ""
37
- match = re.search(r"Duration:\s*(\d+):(\d+):(\d+\.\d+)", output)
38
- if match:
39
- hours, minutes, seconds = match.groups()
40
- total_seconds = (int(hours) * 3600) + (int(minutes) * 60) + float(seconds)
41
- return float(total_seconds)
42
- except FileNotFoundError:
43
- pass
44
-
45
- # Últim recurs: intentar amb OpenCV si està disponible
46
- try:
47
- import cv2
48
-
49
- cap = cv2.VideoCapture(path)
50
- if cap.isOpened():
51
- fps = cap.get(cv2.CAP_PROP_FPS) or 0
52
- frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0
53
- cap.release()
54
-
55
- if fps > 0 and frame_count > 0:
56
- return float(frame_count / fps)
57
- else:
58
- cap.release()
59
- except Exception:
60
- pass
61
-
62
- return 0.0
63
-
64
-
65
- def _transcode_video(input_path: str, output_path: str, max_duration: int | None = None) -> None:
66
- cmd = ["ffmpeg", "-y", "-i", input_path]
67
- if max_duration is not None:
68
- cmd += ["-t", str(max_duration)]
69
- cmd += [
70
- "-c:v",
71
- "libx264",
72
- "-preset",
73
- "veryfast",
74
- "-crf",
75
- "23",
76
- "-c:a",
77
- "aac",
78
- "-movflags",
79
- "+faststart",
80
- output_path,
81
- ]
82
- result = subprocess.run(cmd, capture_output=True, text=True)
83
- if result.returncode != 0:
84
- raise RuntimeError(result.stderr.strip() or "ffmpeg failed")
85
-
86
-
87
- def render_process_video_page() -> None:
88
- st.header("Processar un nou clip de vídeo")
89
-
90
- # Inicializar el estado de la página si no existe
91
- if "video_uploaded" not in st.session_state:
92
- st.session_state.video_uploaded = None
93
- if "characters_detected" not in st.session_state:
94
- st.session_state.characters_detected = None
95
- if "characters_saved" not in st.session_state:
96
- st.session_state.characters_saved = False
97
-
98
- # --- 1. Subida del vídeo ---
99
- MAX_SIZE_MB = 20
100
- MAX_DURATION_S = 240 # 4 minutos
101
-
102
- uploaded_file = st.file_uploader(
103
- "Puja un clip de vídeo (MP4, < 20MB, < 4 minuts)",
104
- type=["mp4"],
105
- key="video_uploader",
106
- )
107
-
108
- if uploaded_file is not None:
109
- # Resetear el estado si se sube un nuevo archivo
110
- if st.session_state.video_uploaded is None or uploaded_file.name != st.session_state.video_uploaded.get(
111
- "original_name"
112
- ):
113
- st.session_state.video_uploaded = {"original_name": uploaded_file.name, "status": "validating"}
114
- st.session_state.characters_detected = None
115
- st.session_state.characters_saved = False
116
-
117
- if st.session_state.video_uploaded["status"] == "validating":
118
- is_valid = True
119
- if uploaded_file.size > MAX_SIZE_MB * 1024 * 1024:
120
- st.error(f"El vídeo supera el límit de {MAX_SIZE_MB}MB.")
121
- is_valid = False
122
-
123
- if is_valid:
124
- with st.spinner("Processant el vídeo..."):
125
- temp_path = Path("temp_video.mp4")
126
- with temp_path.open("wb") as f:
127
- f.write(uploaded_file.getbuffer())
128
-
129
- was_truncated = False
130
- final_video_path = None
131
- try:
132
- duration = _get_video_duration(str(temp_path))
133
- duration_unknown = False
134
- if not duration:
135
- st.warning(
136
- "No s'ha pogut obtenir la durada del vídeo. Es continuarà assumint un màxim de 4 minuts."
137
- )
138
- duration = float(MAX_DURATION_S)
139
- duration_unknown = True
140
-
141
- if is_valid:
142
- if duration > MAX_DURATION_S:
143
- was_truncated = True
144
-
145
- video_name = Path(uploaded_file.name).stem
146
- video_dir = Path("/tmp/data/videos") / video_name
147
- video_dir.mkdir(parents=True, exist_ok=True)
148
- final_video_path = video_dir / f"{video_name}.mp4"
149
-
150
- try:
151
- _transcode_video(
152
- str(temp_path),
153
- str(final_video_path),
154
- MAX_DURATION_S if (was_truncated or duration_unknown) else None,
155
- )
156
- except RuntimeError as exc:
157
- st.error(f"No s'ha pogut processar el vídeo: {exc}")
158
- is_valid = False
159
-
160
- if is_valid and final_video_path is not None:
161
- st.session_state.video_uploaded.update(
162
- {
163
- "status": "processed",
164
- "path": str(final_video_path),
165
- "was_truncated": was_truncated or duration_unknown,
166
- "duration_unknown": duration_unknown,
167
- }
168
- )
169
- st.rerun()
170
- finally:
171
- if temp_path.exists():
172
- temp_path.unlink()
173
-
174
- if st.session_state.video_uploaded and st.session_state.video_uploaded["status"] == "processed":
175
- st.success(f"Vídeo '{st.session_state.video_uploaded['original_name']}' pujat i processat correctament.")
176
- if st.session_state.video_uploaded["was_truncated"]:
177
- st.warning("El vídeo s'ha truncat a 4 minuts.")
178
-
179
- st.markdown("---")
180
- col1, col2 = st.columns([1, 3])
181
- with col1:
182
- detect_button_disabled = st.session_state.video_uploaded is None
183
- if st.button("Detectar Personatges", disabled=detect_button_disabled):
184
- with st.spinner("Detectant personatges..."):
185
- st.session_state.characters_detected = [
186
- {
187
- "id": "char1",
188
- "image_path": "init_data/placeholder.png",
189
- "description": "Dona amb cabell ros i ulleres",
190
- },
191
- {
192
- "id": "char2",
193
- "image_path": "init_data/placeholder.png",
194
- "description": "Home amb barba i barret",
195
- },
196
- ]
197
- st.session_state.characters_saved = False
198
-
199
- def _load_or_placeholder(path: str, size: tuple[int, int] = (150, 150)):
200
- p = Path(path)
201
- if p.exists():
202
- return str(p)
203
- img = Image.new("RGB", size, color=(230, 230, 230))
204
- d = ImageDraw.Draw(img)
205
- text = "No image"
206
- tw, th = d.textlength(text), 12
207
- d.text(((size[0]-tw)/2, (size[1]-th)/2), text, fill=(120, 120, 120))
208
- return img
209
-
210
- if st.session_state.characters_detected:
211
- st.subheader("Personatges detectats")
212
- for char in st.session_state.characters_detected:
213
- with st.form(key=f"form_{char['id']}"):
214
- col1, col2 = st.columns(2)
215
- with col1:
216
- st.image(_load_or_placeholder(char["image_path"]), width=150)
217
- with col2:
218
- st.caption(char["description"])
219
- st.text_input("Nom del personatge", key=f"name_{char['id']}")
220
- st.form_submit_button("Cercar")
221
-
222
- st.markdown("---_**")
223
-
224
- col1, col2, col3 = st.columns([1, 1, 2])
225
- with col1:
226
- if st.button("Desar", type="primary"):
227
- st.session_state.characters_saved = True
228
- st.success("Personatges desats correctament.")
229
-
230
- with col2:
231
- if st.session_state.characters_saved:
232
- st.button("Generar Audiodescripció")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """UI logic for the "Processar vídeo nou" page - Recovered from backup with full functionality."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import shutil
7
+ import subprocess
8
+ import os
9
+ import time
10
+ import tempfile
11
+ from pathlib import Path
12
+
13
+ import streamlit as st
14
+ from PIL import Image, ImageDraw
15
+
16
+
17
def get_all_catalan_names():
    """Return the pools of Catalan first names used as default cluster names.

    Returns:
        A ``(noms_home, noms_dona)`` tuple of two lists with 20 names each:
        the male pool first, the female pool second.
    """
    noms_home = ["Jordi", "Marc", "Pau", "Pere", "Joan", "Josep", "David", "Àlex", "Guillem", "Albert",
                 "Arnau", "Martí", "Bernat", "Oriol", "Roger", "Pol", "Lluís", "Sergi", "Carles", "Xavier"]
    noms_dona = ["Maria", "Anna", "Laura", "Marta", "Cristina", "Núria", "Montserrat", "Júlia", "Sara", "Carla",
                 "Alba", "Elisabet", "Rosa", "Gemma", "Sílvia", "Teresa", "Irene", "Laia", "Marina", "Bet"]
    return noms_home, noms_dona


def get_catalan_name_for_speaker(
    speaker_label: int,
    used_names_home: list | None = None,
    used_names_dona: list | None = None,
) -> str:
    """Pick a default Catalan name for a voice cluster.

    The parity of ``speaker_label`` alone decides the pool: even labels draw
    from the male names, odd labels from the female names. When names already
    assigned to face clusters are supplied for that pool they are reused;
    otherwise the full built-in pool is used.

    Args:
        speaker_label: Integer cluster label of the speaker.
        used_names_home: Male names already in use (e.g. taken from detected
            faces). ``None`` or an empty list means "fall back to the full
            male pool".
        used_names_dona: Same as ``used_names_home`` for female names.

    Returns:
        The chosen name. The result depends only on the arguments, so the same
        label always maps to the same name across runs and processes.
    """
    noms_home, noms_dona = get_all_catalan_names()

    is_male = speaker_label % 2 == 0
    if is_male:
        pool = used_names_home or noms_home
    else:
        pool = used_names_dona or noms_dona

    # Index by the label's rank within its parity class instead of hash():
    # str hashes are salted per interpreter process, which made the fallback
    # name change between restarts and allowed two speakers to collide.
    return pool[(speaker_label // 2) % len(pool)]
51
+
52
+
53
+ def _get_video_duration(path: str) -> float:
54
+ """Return video duration in seconds using ffprobe, ffmpeg or OpenCV as fallback."""
55
+ cmd = [
56
+ "ffprobe",
57
+ "-v",
58
+ "error",
59
+ "-show_entries",
60
+ "format=duration",
61
+ "-of",
62
+ "default=noprint_wrappers=1:nokey=1",
63
+ path,
64
+ ]
65
+ try:
66
+ result = subprocess.run(cmd, capture_output=True, text=True, check=True)
67
+ return float(result.stdout.strip())
68
+ except (subprocess.CalledProcessError, ValueError, FileNotFoundError):
69
+ pass
70
+
71
+ if shutil.which("ffmpeg"):
72
+ try:
73
+ ffmpeg_cmd = ["ffmpeg", "-i", path]
74
+ result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True, check=False)
75
+ output = result.stderr or result.stdout or ""
76
+ match = re.search(r"Duration:\s*(\d+):(\d+):(\d+\.\d+)", output)
77
+ if match:
78
+ hours, minutes, seconds = match.groups()
79
+ total_seconds = (int(hours) * 3600) + (int(minutes) * 60) + float(seconds)
80
+ return float(total_seconds)
81
+ except FileNotFoundError:
82
+ pass
83
+
84
+ # Últim recurs: intentar amb OpenCV si està disponible
85
+ try:
86
+ import cv2
87
+
88
+ cap = cv2.VideoCapture(path)
89
+ if cap.isOpened():
90
+ fps = cap.get(cv2.CAP_PROP_FPS) or 0
91
+ frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0
92
+ cap.release()
93
+
94
+ if fps > 0 and frame_count > 0:
95
+ return float(frame_count / fps)
96
+ else:
97
+ cap.release()
98
+ except Exception:
99
+ pass
100
+
101
+ return 0.0
102
+
103
+
104
+ def _transcode_video(input_path: str, output_path: str, max_duration: int | None = None) -> None:
105
+ cmd = ["ffmpeg", "-y", "-i", input_path]
106
+ if max_duration is not None:
107
+ cmd += ["-t", str(max_duration)]
108
+ cmd += [
109
+ "-c:v",
110
+ "libx264",
111
+ "-preset",
112
+ "veryfast",
113
+ "-crf",
114
+ "23",
115
+ "-c:a",
116
+ "aac",
117
+ "-movflags",
118
+ "+faststart",
119
+ output_path,
120
+ ]
121
+ result = subprocess.run(cmd, capture_output=True, text=True)
122
+ if result.returncode != 0:
123
+ raise RuntimeError(result.stderr.strip() or "ffmpeg failed")
124
+
125
+
126
+ def render_process_video_page(api, backend_base_url: str) -> None:
127
+ st.header("Processar un nou clip de vídeo")
128
+
129
+ # Inicializar el estado de la página si no existe
130
+ if "video_uploaded" not in st.session_state:
131
+ st.session_state.video_uploaded = None
132
+ if "characters_detected" not in st.session_state:
133
+ st.session_state.characters_detected = None
134
+ if "audio_segments" not in st.session_state:
135
+ st.session_state.audio_segments = None
136
+ if "voice_labels" not in st.session_state:
137
+ st.session_state.voice_labels = None
138
+ if "face_labels" not in st.session_state:
139
+ st.session_state.face_labels = None
140
+ if "scene_clusters" not in st.session_state:
141
+ st.session_state.scene_clusters = None
142
+ if "scene_detection_done" not in st.session_state:
143
+ st.session_state.scene_detection_done = False
144
+ if "detect_done" not in st.session_state:
145
+ st.session_state.detect_done = False
146
+ if "casting_finalized" not in st.session_state:
147
+ st.session_state.casting_finalized = False
148
+ if "video_name_from_engine" not in st.session_state:
149
+ st.session_state.video_name_from_engine = None
150
+ if "diarization_info" not in st.session_state:
151
+ st.session_state.diarization_info = {}
152
+ if "characters_saved" not in st.session_state:
153
+ st.session_state.characters_saved = False
154
+
155
+ # --- 1. Subida del vídeo ---
156
+ MAX_SIZE_MB = 20
157
+ MAX_DURATION_S = 240 # 4 minutos
158
+
159
+ uploaded_file = st.file_uploader(
160
+ "Puja un clip de vídeo (MP4, < 20MB, < 4 minuts)",
161
+ type=["mp4"],
162
+ key="video_uploader",
163
+ )
164
+
165
+ if uploaded_file is not None:
166
+ # Resetear el estado si se sube un nuevo archivo
167
+ if st.session_state.video_uploaded is None or uploaded_file.name != st.session_state.video_uploaded.get(
168
+ "original_name"
169
+ ):
170
+ st.session_state.video_uploaded = {"original_name": uploaded_file.name, "status": "validating"}
171
+ st.session_state.characters_detected = None
172
+ st.session_state.characters_saved = False
173
+
174
+ if st.session_state.video_uploaded["status"] == "validating":
175
+ is_valid = True
176
+ if uploaded_file.size > MAX_SIZE_MB * 1024 * 1024:
177
+ st.error(f"El vídeo supera el límit de {MAX_SIZE_MB}MB.")
178
+ is_valid = False
179
+
180
+ if is_valid:
181
+ with st.spinner("Processant el vídeo..."):
182
+ temp_path = Path("temp_video.mp4")
183
+ with temp_path.open("wb") as f:
184
+ f.write(uploaded_file.getbuffer())
185
+
186
+ was_truncated = False
187
+ final_video_path = None
188
+ try:
189
+ duration = _get_video_duration(str(temp_path))
190
+ duration_unknown = False
191
+ if not duration:
192
+ st.warning(
193
+ "No s'ha pogut obtenir la durada del vídeo. Es continuarà assumint un màxim de 4 minuts."
194
+ )
195
+ duration = float(MAX_DURATION_S)
196
+ duration_unknown = True
197
+
198
+ if is_valid:
199
+ if duration > MAX_DURATION_S:
200
+ was_truncated = True
201
+
202
+ video_name = Path(uploaded_file.name).stem
203
+ video_dir = Path("/tmp/data/videos") / video_name
204
+ video_dir.mkdir(parents=True, exist_ok=True)
205
+ final_video_path = video_dir / f"{video_name}.mp4"
206
+
207
+ try:
208
+ _transcode_video(
209
+ str(temp_path),
210
+ str(final_video_path),
211
+ MAX_DURATION_S if (was_truncated or duration_unknown) else None,
212
+ )
213
+ except RuntimeError as exc:
214
+ st.error(f"No s'ha pogut processar el vídeo: {exc}")
215
+ is_valid = False
216
+
217
+ if is_valid and final_video_path is not None:
218
+ st.session_state.video_uploaded.update(
219
+ {
220
+ "status": "processed",
221
+ "path": str(final_video_path),
222
+ "was_truncated": was_truncated or duration_unknown,
223
+ "duration_unknown": duration_unknown,
224
+ "bytes": uploaded_file.getvalue(),
225
+ "name": uploaded_file.name,
226
+ }
227
+ )
228
+ st.rerun()
229
+ finally:
230
+ if temp_path.exists():
231
+ temp_path.unlink()
232
+
233
+ if st.session_state.video_uploaded and st.session_state.video_uploaded["status"] == "processed":
234
+ st.success(f"Vídeo '{st.session_state.video_uploaded['original_name']}' pujat i processat correctament.")
235
+ if st.session_state.video_uploaded["was_truncated"]:
236
+ st.warning("El vídeo s'ha truncat a 4 minuts.")
237
+
238
+ # --- 2. Form de detecció amb sliders ---
239
+ st.markdown("---")
240
+
241
+ with st.form("detect_form"):
242
+ col_btn, col_face, col_voice, col_scene = st.columns([1, 1, 1, 1])
243
+ with col_face:
244
+ st.markdown("**Cares**")
245
+ face_max_groups = st.slider("Límit de grups (cares)", 1, 10, 5, 1, key="face_max_groups")
246
+ face_min_cluster = st.slider("Mida mínima (cares)", 1, 5, 3, 1, key="face_min_cluster")
247
+ face_sensitivity = st.slider("Sensibilitat (cares)", 0.0, 1.0, 0.5, 0.05, key="face_sensitivity",
248
+ help="0.0 = menys clusters (més agressiu), 0.5 = balancejat, 1.0 = més clusters (més permissiu)")
249
+ with col_voice:
250
+ st.markdown("**Veus**")
251
+ voice_max_groups = st.slider("Límit de grups (veus)", 1, 10, 5, 1, key="voice_max_groups")
252
+ voice_min_cluster = st.slider("Mida mínima (veus)", 1, 5, 3, 1, key="voice_min_cluster")
253
+ voice_sensitivity = st.slider("Sensibilitat (veus)", 0.0, 1.0, 0.5, 0.05, key="voice_sensitivity",
254
+ help="0.0 = menys clusters (més agressiu), 0.5 = balancejat, 1.0 = més clusters (més permissiu)")
255
+ with col_scene:
256
+ st.markdown("**Escenes**")
257
+ scene_max_groups = st.slider("Límit de grups (escenes)", 1, 10, 3, 1, key="scene_max_groups")
258
+ scene_min_cluster = st.slider("Mida mínima (escenes)", 5, 20, 12, 1, key="scene_min_cluster")
259
+ scene_sensitivity = st.slider("Sensibilitat (escenes)", 0.0, 1.0, 0.5, 0.05, key="scene_sensitivity",
260
+ help="0.0 = menys clusters (més agressiu), 0.5 = balancejat, 1.0 = més clusters (més permissiu)")
261
+ with col_btn:
262
+ max_frames = st.number_input("Nombre de frames a processar", min_value=10, max_value=500, value=100, step=10,
263
+ help="Nombre de fotogrames equiespaciats a extreure del vídeo per detectar cares")
264
+ can_detect = st.session_state.video_uploaded is not None
265
+ submit_detect = st.form_submit_button("Detectar Personatges", disabled=not can_detect)
266
+
267
+ if not can_detect:
268
+ st.caption("📹 Necessites pujar un vídeo primer")
269
+
270
+ if submit_detect:
271
+ try:
272
+ v = st.session_state.video_uploaded
273
+ # Reset estat abans de començar
274
+ st.session_state.scene_clusters = None
275
+ st.session_state.scene_detection_done = False
276
+ st.session_state.detect_done = False
277
+ st.session_state.casting_finalized = False
278
+
279
+ resp = api.create_initial_casting(
280
+ video_bytes=v["bytes"],
281
+ video_name=v["name"],
282
+ face_max_groups=face_max_groups,
283
+ face_min_cluster_size=face_min_cluster,
284
+ face_sensitivity=face_sensitivity,
285
+ voice_max_groups=voice_max_groups,
286
+ voice_min_cluster_size=voice_min_cluster,
287
+ voice_sensitivity=voice_sensitivity,
288
+ max_frames=max_frames,
289
+ )
290
+
291
+ if not isinstance(resp, dict) or not resp.get("job_id"):
292
+ st.error("No s'ha pogut crear el job al servidor.")
293
+ else:
294
+ job_id = resp["job_id"]
295
+ with st.spinner("Processant al servidor…"):
296
+ time.sleep(3)
297
+ attempt, max_attempts = 0, 120
298
+ progress_placeholder = st.empty()
299
+ while attempt < max_attempts:
300
+ stt = api.get_job(job_id)
301
+ status = stt.get("status")
302
+ if status in ("queued", "processing"):
303
+ if attempt % 10 == 0:
304
+ elapsed_min = (attempt * 5) // 60
305
+ progress_placeholder.info(f"⏳ Processant al servidor... (~{elapsed_min} min)")
306
+ time.sleep(5)
307
+ attempt += 1
308
+ continue
309
+ if status == "failed":
310
+ progress_placeholder.empty()
311
+ st.error("El processament ha fallat al servidor.")
312
+ break
313
+
314
+ # Success
315
+ res = stt.get("results", {})
316
+ chars = res.get("characters", [])
317
+ fl = res.get("face_labels", [])
318
+ segs = res.get("audio_segments", [])
319
+ vl = res.get("voice_labels", [])
320
+ base_dir = res.get("base_dir")
321
+ vname = os.path.basename(base_dir) if base_dir else None
322
+ diar_info = res.get("diarization_info", {})
323
+
324
+ st.session_state.characters_detected = chars or []
325
+ st.session_state.face_labels = fl or []
326
+ st.session_state.audio_segments = segs or []
327
+ st.session_state.voice_labels = vl or []
328
+ st.session_state.video_name_from_engine = vname
329
+ st.session_state.engine_base_dir = base_dir
330
+ st.session_state.diarization_info = diar_info or {}
331
+
332
+ progress_placeholder.empty()
333
+
334
+ if chars:
335
+ st.success(f"✓ Detecció completada! Trobades {len(chars)} cares.")
336
+ st.info("💡 Usa els botons '🎨 Generar descripció' a sota de cada personatge per obtenir descripcions automàtiques amb Salamandra Vision.")
337
+ else:
338
+ st.info("No s'han detectat cares en aquest vídeo.")
339
+
340
+ # Detect scenes
341
+ try:
342
+ scene_out = api.detect_scenes(
343
+ video_bytes=v["bytes"],
344
+ video_name=v["name"],
345
+ max_groups=scene_max_groups,
346
+ min_cluster_size=scene_min_cluster,
347
+ scene_sensitivity=scene_sensitivity,
348
+ frame_interval_sec=0.5,
349
+ )
350
+ scs = scene_out.get("scene_clusters") if isinstance(scene_out, dict) else None
351
+ if isinstance(scs, list):
352
+ st.session_state.scene_clusters = scs
353
+ else:
354
+ st.session_state.scene_clusters = []
355
+ except Exception:
356
+ st.session_state.scene_clusters = []
357
+ finally:
358
+ st.session_state.scene_detection_done = True
359
+
360
+ st.session_state.detect_done = True
361
+ st.success("✅ Processament completat!")
362
+ break
363
+ else:
364
+ progress_placeholder.empty()
365
+ st.warning(f"⏱️ El servidor no ha completat el job en {max_attempts * 5 // 60} minuts.")
366
+ except Exception as e:
367
+ st.error(f"Error inesperat: {e}")
368
+
369
+ # --- 3. Carruseles de cares ---
370
+ if st.session_state.get("characters_detected") is not None:
371
+ st.markdown("---")
372
+ n_face_clusters = len(st.session_state.get("characters_detected") or [])
373
+ st.subheader(f"🖼️ Cares — clústers: {n_face_clusters}")
374
+
375
+ if n_face_clusters == 0:
376
+ st.info("No s'han detectat clústers de cara en aquest clip.")
377
+
378
+ for idx, ch in enumerate(st.session_state.characters_detected or []):
379
+ try:
380
+ folder_name = Path(ch.get("folder") or "").name
381
+ except Exception:
382
+ folder_name = ""
383
+ char_id = ch.get("id") or folder_name or f"char{idx+1}"
384
+
385
+ def _safe_key(s: str) -> str:
386
+ k = re.sub(r"[^0-9a-zA-Z_]+", "_", s or "")
387
+ return k or f"cluster_{idx+1}"
388
+
389
+ key_prefix = _safe_key(f"char_{idx+1}_{char_id}")
390
+ if f"{key_prefix}_idx" not in st.session_state:
391
+ st.session_state[f"{key_prefix}_idx"] = 0
392
+ if f"{key_prefix}_discard" not in st.session_state:
393
+ st.session_state[f"{key_prefix}_discard"] = set()
394
+
395
+ faces_all = ch.get("face_files") or ([ch.get("image_url")] if ch.get("image_url") else [])
396
+ faces_all = [f for f in faces_all if f]
397
+ discard_set = st.session_state[f"{key_prefix}_discard"]
398
+ faces = [f for f in faces_all if f not in discard_set]
399
+
400
+ if not faces:
401
+ st.write(f"- {idx+1}. {ch.get('name','(sense nom)')} — sense imatges seleccionades")
402
+ continue
403
+
404
+ cur = st.session_state[f"{key_prefix}_idx"]
405
+ if cur >= len(faces):
406
+ cur = 0
407
+ st.session_state[f"{key_prefix}_idx"] = cur
408
+ fname = faces[cur]
409
+
410
+ if fname.startswith("/files/"):
411
+ img_url = f"{backend_base_url}{fname}"
412
+ else:
413
+ base = ch.get("image_url") or ""
414
+ base_dir = "/".join((base or "/").split("/")[:-1])
415
+ img_url = f"{backend_base_url}{base_dir}/{fname}" if base_dir else f"{backend_base_url}{fname}"
416
+
417
+ st.markdown(f"**{idx+1}. {ch.get('name','(sense nom)')} — {ch.get('num_faces', 0)} cares**")
418
+ c1, c2 = st.columns([1, 3])
419
+ with c1:
420
+ st.image(img_url, width=150)
421
+ st.caption(f"Imatge {cur+1}/{len(faces)}")
422
+ bcol1, bcol2, bcol3 = st.columns(3)
423
+ with bcol1:
424
+ if st.button("⬅️", key=f"prev_{key_prefix}", help="Anterior"):
425
+ st.session_state[f"{key_prefix}_idx"] = (cur - 1) % len(faces)
426
+ st.rerun()
427
+ with bcol2:
428
+ if st.button("🗑️", key=f"del_{key_prefix}", help="Eliminar aquesta imatge del clúster"):
429
+ st.session_state[f"{key_prefix}_discard"].add(fname)
430
+ new_list = [f for f in faces if f != fname]
431
+ new_idx = cur if cur < len(new_list) else 0
432
+ st.session_state[f"{key_prefix}_idx"] = new_idx
433
+ st.rerun()
434
+ with bcol3:
435
+ if st.button("➡️", key=f"next_{key_prefix}", help="Següent"):
436
+ st.session_state[f"{key_prefix}_idx"] = (cur + 1) % len(faces)
437
+ st.rerun()
438
+ with c2:
439
+ name_key = f"{key_prefix}_name"
440
+ desc_key = f"{key_prefix}_desc"
441
+ default_name = ch.get("name", "")
442
+ default_desc = ch.get("description", "")
443
+
444
+ if default_name and (name_key not in st.session_state or not st.session_state.get(name_key)):
445
+ st.session_state[name_key] = default_name
446
+ elif name_key not in st.session_state:
447
+ st.session_state[name_key] = default_name or ""
448
+
449
+ if default_desc and (desc_key not in st.session_state or not st.session_state.get(desc_key)):
450
+ st.session_state[desc_key] = default_desc
451
+ elif desc_key not in st.session_state:
452
+ st.session_state[desc_key] = default_desc or ""
453
+
454
+ pending_desc_key = f"{key_prefix}_pending_desc"
455
+ pending_name_key = f"{key_prefix}_pending_name"
456
+ if pending_desc_key in st.session_state:
457
+ if desc_key not in st.session_state:
458
+ st.session_state[desc_key] = ""
459
+ st.session_state[desc_key] = st.session_state[pending_desc_key]
460
+ del st.session_state[pending_desc_key]
461
+
462
+ if pending_name_key in st.session_state:
463
+ if name_key not in st.session_state:
464
+ st.session_state[name_key] = ""
465
+ if not st.session_state.get(name_key):
466
+ st.session_state[name_key] = st.session_state[pending_name_key]
467
+ del st.session_state[pending_name_key]
468
+
469
+ st.text_input("Nom del clúster", key=name_key)
470
+ st.text_area("Descripció", key=desc_key, height=80)
471
+
472
+ if st.button("🎨 Generar descripció amb Salamandra Vision", key=f"svision_{key_prefix}"):
473
+ with st.spinner("Generant descripció..."):
474
+ from api_client import describe_image_with_svision
475
+ import requests as _req
476
+
477
+ try:
478
+ resp = _req.get(img_url, timeout=10)
479
+ if resp.status_code == 200:
480
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
481
+ tmp.write(resp.content)
482
+ tmp_path = tmp.name
483
+
484
+ desc, name = describe_image_with_svision(tmp_path, is_face=True)
485
+
486
+ if desc:
487
+ st.session_state[pending_desc_key] = desc
488
+ st.success("✅ Descripció generada!")
489
+ else:
490
+ st.warning("⚠️ No s'ha pogut generar una descripció.")
491
+
492
+ if name and not st.session_state.get(name_key):
493
+ st.session_state[pending_name_key] = name
494
+
495
+ os.unlink(tmp_path)
496
+ st.rerun()
497
+ else:
498
+ st.error(f"No s'ha pogut descarregar la imatge (status: {resp.status_code})")
499
+ except Exception as e:
500
+ st.error(f"Error generant descripció: {e}")
501
+
502
+ # --- 4. Carruseles de veus ---
503
+ if st.session_state.get("audio_segments") is not None:
504
+ st.markdown("---")
505
+
506
+ used_names_home = []
507
+ used_names_dona = []
508
+ noms_home_all, noms_dona_all = get_all_catalan_names()
509
+
510
+ for ch in (st.session_state.characters_detected or []):
511
+ ch_name = ch.get("name", "")
512
+ if ch_name in noms_home_all:
513
+ used_names_home.append(ch_name)
514
+ elif ch_name in noms_dona_all:
515
+ used_names_dona.append(ch_name)
516
+
517
+ segs = st.session_state.audio_segments or []
518
+ vlabels = st.session_state.voice_labels or []
519
+ valid_indices = [i for i, l in enumerate(vlabels) if isinstance(l, int) and l >= 0]
520
+ clusters = {}
521
+ for i in valid_indices:
522
+ lbl = int(vlabels[i])
523
+ clusters.setdefault(lbl, []).append(i)
524
+ n_vclusters = len(clusters)
525
+ st.subheader(f"🎙️ Empremtes de veu — clústers: {n_vclusters}")
526
+ di = st.session_state.get("diarization_info") or {}
527
+ if isinstance(di, dict) and not di.get("diarization_ok", True):
528
+ st.warning("No s'ha pogut fer la diarització amb pyannote (s'ha aplicat un sol segment de reserva).")
529
+ if not segs:
530
+ st.info("No s'han detectat mostres de veu.")
531
+ elif n_vclusters == 0:
532
+ st.info("No s'han format clústers de veu.")
533
+ else:
534
+ vname = st.session_state.video_name_from_engine
535
+ for lbl, idxs in sorted(clusters.items(), key=lambda x: x[0]):
536
+ key_prefix = f"voice_{lbl:02d}"
537
+ if f"{key_prefix}_idx" not in st.session_state:
538
+ st.session_state[f"{key_prefix}_idx"] = 0
539
+ if f"{key_prefix}_discard" not in st.session_state:
540
+ st.session_state[f"{key_prefix}_discard"] = set()
541
+ discard_set = st.session_state[f"{key_prefix}_discard"]
542
+ files = []
543
+ for i in idxs:
544
+ clip_local = (segs[i] or {}).get("clip_path")
545
+ fname = os.path.basename(clip_local) if clip_local else None
546
+ if fname:
547
+ files.append(fname)
548
+ files = [f for f in files if f and f not in discard_set]
549
+ if not files:
550
+ st.write(f"- SPEAKER_{lbl:02d} — sense clips seleccionats")
551
+ continue
552
+ cur = st.session_state[f"{key_prefix}_idx"]
553
+ if cur >= len(files):
554
+ cur = 0
555
+ st.session_state[f"{key_prefix}_idx"] = cur
556
+ fname = files[cur]
557
+ audio_url = f"{backend_base_url}/audio/{vname}/{fname}" if (vname and fname) else None
558
+ st.markdown(f"**SPEAKER_{lbl:02d} — {len(files)} clips**")
559
+ c1, c2 = st.columns([1, 2])
560
+ with c1:
561
+ if audio_url:
562
+ st.audio(audio_url, format="audio/wav")
563
+ st.caption(f"Clip {cur+1}/{len(files)}")
564
+ bcol1, bcol2, bcol3 = st.columns(3)
565
+ with bcol1:
566
+ if st.button("⬅️", key=f"prev_{key_prefix}", help="Anterior"):
567
+ st.session_state[f"{key_prefix}_idx"] = (cur - 1) % len(files)
568
+ st.rerun()
569
+ with bcol2:
570
+ if st.button("🗑️", key=f"del_{key_prefix}", help="Eliminar aquest clip del clúster"):
571
+ st.session_state[f"{key_prefix}_discard"].add(fname)
572
+ new_list = [f for f in files if f != fname]
573
+ new_idx = cur if cur < len(new_list) else 0
574
+ st.session_state[f"{key_prefix}_idx"] = new_idx
575
+ st.rerun()
576
+ with bcol3:
577
+ if st.button("➡️", key=f"next_{key_prefix}", help="Següent"):
578
+ st.session_state[f"{key_prefix}_idx"] = (cur + 1) % len(files)
579
+ st.rerun()
580
+ with c2:
581
+ name_key = f"{key_prefix}_name"
582
+ desc_key = f"{key_prefix}_desc"
583
+ default_name = get_catalan_name_for_speaker(lbl, used_names_home, used_names_dona)
584
+ st.text_input("Nom del clúster", value=st.session_state.get(name_key, default_name), key=name_key)
585
+ st.text_area("Descripció", value=st.session_state.get(desc_key, ""), key=desc_key, height=80)
586
+
587
+
588
+ # --- 5. Carruseles de escenas ---
589
+ if st.session_state.get("scene_detection_done"):
590
+ st.markdown("---")
591
+ scene_clusters = st.session_state.get("scene_clusters")
592
+ n_scenes = len(scene_clusters or [])
593
+ st.subheader(f"📍 Escenes — clústers: {n_scenes}")
594
+ if not scene_clusters:
595
+ st.info("No s'han detectat clústers d'escenes en aquest clip.")
596
+ else:
597
+ for sidx, sc in enumerate(scene_clusters):
598
+ try:
599
+ folder_name = Path(sc.get("folder") or "").name
600
+ except Exception:
601
+ folder_name = ""
602
+ scene_id = sc.get("id") or folder_name or f"scene{sidx+1}"
603
+ key_prefix = re.sub(r"[^0-9a-zA-Z_]+", "_", f"scene_{sidx+1}_{scene_id}") or f"scene_{sidx+1}"
604
+ if f"{key_prefix}_idx" not in st.session_state:
605
+ st.session_state[f"{key_prefix}_idx"] = 0
606
+ if f"{key_prefix}_discard" not in st.session_state:
607
+ st.session_state[f"{key_prefix}_discard"] = set()
608
+ frames_all = sc.get("frame_files") or ([sc.get("image_url")] if sc.get("image_url") else [])
609
+ frames_all = [f for f in frames_all if f]
610
+ discard_set = st.session_state[f"{key_prefix}_discard"]
611
+ frames = [f for f in frames_all if f not in discard_set]
612
+ if not frames:
613
+ st.write(f"- {sidx+1}. (sense imatges de l'escena)")
614
+ continue
615
+ cur = st.session_state[f"{key_prefix}_idx"]
616
+ if cur >= len(frames):
617
+ cur = 0
618
+ st.session_state[f"{key_prefix}_idx"] = cur
619
+ fname = frames[cur]
620
+ if str(fname).startswith("/files/"):
621
+ img_url = f"{backend_base_url}{fname}"
622
+ else:
623
+ base = sc.get("image_url") or ""
624
+ base_dir = "/".join((base or "/").split("/")[:-1])
625
+ img_url = f"{backend_base_url}{base_dir}/{fname}" if base_dir else f"{backend_base_url}{fname}"
626
+ st.markdown(f"**{sidx+1}. Escena — {sc.get('num_frames', 0)} frames**")
627
+ c1, c2 = st.columns([1, 2])
628
+ with c1:
629
+ st.image(img_url, use_container_width=True)
630
+ st.caption(f"Imatge {cur+1}/{len(frames)}")
631
+ bcol1, bcol2, bcol3 = st.columns(3)
632
+ with bcol1:
633
+ if st.button("⬅️", key=f"prev_{key_prefix}", help="Anterior"):
634
+ st.session_state[f"{key_prefix}_idx"] = (cur - 1) % len(frames)
635
+ st.rerun()
636
+ with bcol2:
637
+ if st.button("🗑️", key=f"del_{key_prefix}", help="Eliminar aquesta imatge del clúster"):
638
+ st.session_state[f"{key_prefix}_discard"].add(fname)
639
+ new_list = [f for f in frames if f != fname]
640
+ new_idx = cur if cur < len(new_list) else 0
641
+ st.session_state[f"{key_prefix}_idx"] = new_idx
642
+ st.rerun()
643
+ with bcol3:
644
+ if st.button("➡️", key=f"next_{key_prefix}", help="Següent"):
645
+ st.session_state[f"{key_prefix}_idx"] = (cur + 1) % len(frames)
646
+ st.rerun()
647
+ with c2:
648
+ name_key = f"{key_prefix}_name"
649
+ desc_key = f"{key_prefix}_desc"
650
+ default_scene_name = sc.get("name", "")
651
+ default_scene_desc = sc.get("description", "")
652
+
653
+ if default_scene_name and (name_key not in st.session_state or not st.session_state.get(name_key)):
654
+ st.session_state[name_key] = default_scene_name
655
+ elif name_key not in st.session_state:
656
+ st.session_state[name_key] = default_scene_name or ""
657
+
658
+ if default_scene_desc and (desc_key not in st.session_state or not st.session_state.get(desc_key)):
659
+ st.session_state[desc_key] = default_scene_desc
660
+ elif desc_key not in st.session_state:
661
+ st.session_state[desc_key] = default_scene_desc or ""
662
+
663
+ pending_desc_key = f"{key_prefix}_pending_desc"
664
+ pending_name_key = f"{key_prefix}_pending_name"
665
+ if pending_desc_key in st.session_state:
666
+ if desc_key not in st.session_state:
667
+ st.session_state[desc_key] = ""
668
+ st.session_state[desc_key] = st.session_state[pending_desc_key]
669
+ del st.session_state[pending_desc_key]
670
+
671
+ if pending_name_key in st.session_state:
672
+ if name_key not in st.session_state:
673
+ st.session_state[name_key] = ""
674
+ if not st.session_state.get(name_key):
675
+ st.session_state[name_key] = st.session_state[pending_name_key]
676
+ del st.session_state[pending_name_key]
677
+
678
+ st.text_input("Nom del clúster", key=name_key)
679
+ st.text_area("Descripció", key=desc_key, height=80)
680
+
681
if st.button("🎨 Generar descripció amb Salamandra Vision", key=f"svision_{key_prefix}"):
    # Download the frame currently shown in the carousel, ask Salamandra
    # Vision for a description, and stage the results under the "pending"
    # session keys before triggering a rerun.
    with st.spinner("Generant descripció..."):
        from api_client import describe_image_with_svision, generate_short_scene_name
        import requests as _req

        try:
            resp = _req.get(img_url, timeout=10)
            if resp.status_code == 200:
                tmp_path = None
                try:
                    # delete=False: the image is handed to the vision helper
                    # by path, so it must outlive the `with` block.
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
                        tmp_path = tmp.name
                        tmp.write(resp.content)

                    desc, name = describe_image_with_svision(tmp_path, is_face=False)
                finally:
                    # Remove the temporary image even when writing it or
                    # describing it fails; otherwise each failed attempt
                    # leaves a stray file behind.
                    if tmp_path is not None:
                        os.unlink(tmp_path)

                if desc:
                    st.session_state[pending_desc_key] = desc

                    # Prefer a short generated name; fall back to the name
                    # returned together with the description.
                    try:
                        short_name = generate_short_scene_name(desc)
                        if short_name:
                            st.session_state[pending_name_key] = short_name
                        elif name:
                            st.session_state[pending_name_key] = name
                    except Exception:
                        if name:
                            st.session_state[pending_name_key] = name

                    st.success("✅ Descripció i nom generats!")
                else:
                    st.warning("⚠️ No s'ha pogut generar una descripció.")

                st.rerun()
            else:
                st.error(f"No s'ha pogut descarregar la imatge (status: {resp.status_code})")
        except Exception as e:
            st.error(f"Error generant descripció: {e}")
718
+
719
+ # --- 6. Confirmación de casting y personajes combinados ---
720
+ if st.session_state.get("detect_done"):
721
+ st.markdown("---")
722
+ colc1, colc2 = st.columns([1,1])
723
+ with colc1:
724
+ if st.button("Confirmar càsting definitiu", type="primary"):
725
+ chars_payload = []
726
+ for idx, ch in enumerate(st.session_state.characters_detected or []):
727
+ try:
728
+ folder_name = Path(ch.get("folder") or "").name
729
+ except Exception:
730
+ folder_name = ""
731
+ char_id = ch.get("id") or folder_name or f"char{idx+1}"
732
+ def _safe_key(s: str) -> str:
733
+ k = re.sub(r"[^0-9a-zA-Z_]+", "_", s or "")
734
+ return k or f"cluster_{idx+1}"
735
+ key_prefix = _safe_key(f"char_{idx+1}_{char_id}")
736
+ name = st.session_state.get(f"{key_prefix}_name") or ch.get("name") or f"Personatge {idx+1}"
737
+ desc = st.session_state.get(f"{key_prefix}_desc", "")
738
+ faces_all = ch.get("face_files") or []
739
+ discard = st.session_state.get(f"{key_prefix}_discard", set())
740
+ kept = [f for f in faces_all if f and f not in discard]
741
+ chars_payload.append({
742
+ "id": char_id,
743
+ "name": name,
744
+ "description": desc,
745
+ "folder": ch.get("folder"),
746
+ "kept_files": kept,
747
+ })
748
+
749
+ used_names_home_fin = []
750
+ used_names_dona_fin = []
751
+ noms_home_all, noms_dona_all = get_all_catalan_names()
752
+ for cp in chars_payload:
753
+ face_name = cp.get("name", "")
754
+ if face_name in noms_home_all:
755
+ used_names_home_fin.append(face_name)
756
+ elif face_name in noms_dona_all:
757
+ used_names_dona_fin.append(face_name)
758
+
759
+ segs = st.session_state.audio_segments or []
760
+ vlabels = st.session_state.voice_labels or []
761
+ vname = st.session_state.video_name_from_engine
762
+ voice_clusters = {}
763
+ for i, seg in enumerate(segs):
764
+ lbl = vlabels[i] if i < len(vlabels) else -1
765
+ clip_local = seg.get("clip_path")
766
+ fname = os.path.basename(clip_local) if clip_local else None
767
+ if fname:
768
+ default_voice_name = get_catalan_name_for_speaker(int(lbl), used_names_home_fin, used_names_dona_fin) if isinstance(lbl, int) and lbl >= 0 else "UNKNOWN"
769
+ voice_clusters.setdefault(lbl, {"label": lbl, "name": default_voice_name, "description": "", "clips": []})
770
+ if isinstance(lbl, int) and lbl >= 0:
771
+ vpref = f"voice_{int(lbl):02d}"
772
+ vname_custom = st.session_state.get(f"{vpref}_name")
773
+ vdesc_custom = st.session_state.get(f"{vpref}_desc")
774
+ if vname_custom:
775
+ voice_clusters[lbl]["name"] = vname_custom
776
+ if vdesc_custom is not None:
777
+ voice_clusters[lbl]["description"] = vdesc_custom
778
+ voice_clusters[lbl]["clips"].append(fname)
779
+
780
+ payload = {
781
+ "video_name": vname,
782
+ "base_dir": st.session_state.get("engine_base_dir"),
783
+ "characters": chars_payload,
784
+ "voice_clusters": list(voice_clusters.values()),
785
+ }
786
+
787
+ if not payload["video_name"] or not payload["base_dir"]:
788
+ st.error("Falten dades del vídeo per confirmar el càsting (video_name/base_dir). Torna a processar el vídeo.")
789
+ else:
790
+ with st.spinner("Consolidant càsting al servidor…"):
791
+ res_fc = api.finalize_casting(payload)
792
+ if isinstance(res_fc, dict) and res_fc.get("ok"):
793
+ st.success(f"Càsting consolidat. Identities: {len(res_fc.get('face_identities', []))} cares, {len(res_fc.get('voice_identities', []))} veus.")
794
+ st.session_state.casting_finalized = True
795
+
796
+ f_id = res_fc.get('face_identities', []) or []
797
+ v_id = res_fc.get('voice_identities', []) or []
798
+ c3, c4 = st.columns(2)
799
+ with c3:
800
+ st.markdown("**Identitats de cara**")
801
+ for n in f_id:
802
+ st.write(f"- {n}")
803
+ with c4:
804
+ st.markdown("**Identitats de veu**")
805
+ for n in v_id:
806
+ st.write(f"- {n}")
807
+
808
+ faces_dir = res_fc.get('faces_dir')
809
+ voices_dir = res_fc.get('voices_dir')
810
+ db_dir = res_fc.get('db_dir')
811
+ with st.spinner("Carregant índexs al cercador (Chroma)…"):
812
+ load_res = api.load_casting(faces_dir=faces_dir, voices_dir=voices_dir, db_dir=db_dir, drop_collections=True)
813
+ if isinstance(load_res, dict) and load_res.get('ok'):
814
+ st.success(f"Índexs carregats: {load_res.get('faces', 0)} cares, {load_res.get('voices', 0)} veus.")
815
+ else:
816
+ st.error(f"Error carregant índexs: {load_res}")
817
+ else:
818
+ st.error(f"No s'ha pogut consolidar el càsting: {res_fc}")
819
+
820
+ # --- Personatges combinats (cares + veus) ---
821
+ if st.session_state.get("casting_finalized"):
822
+ st.markdown("---")
823
+ st.subheader("👥 Personatges")
824
+
825
def normalize_name(name: str) -> str:
    """Return *name* upper-cased with combining marks (accents) removed.

    The text is decomposed with NFD so that accents become separate
    combining code points (category ``Mn``), which are then dropped.
    """
    import unicodedata

    decomposed = unicodedata.normalize('NFD', name.upper())
    base_chars = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(base_chars)
833
+
834
+ chars_payload = []
835
+ for idx, ch in enumerate(st.session_state.characters_detected or []):
836
+ try:
837
+ folder_name = Path(ch.get("folder") or "").name
838
+ except Exception:
839
+ folder_name = ""
840
+ char_id = ch.get("id") or folder_name or f"char{idx+1}"
841
+ def _safe_key(s: str) -> str:
842
+ k = re.sub(r"[^0-9a-zA-Z_]+", "_", s or "")
843
+ return k or f"cluster_{idx+1}"
844
+ key_prefix = _safe_key(f"char_{idx+1}_{char_id}")
845
+ name = st.session_state.get(f"{key_prefix}_name") or ch.get("name") or f"Personatge {idx+1}"
846
+ name_normalized = normalize_name(name)
847
+ desc = st.session_state.get(f"{key_prefix}_desc", "").strip()
848
+ chars_payload.append({
849
+ "name": name,
850
+ "name_normalized": name_normalized,
851
+ "face_key_prefix": key_prefix,
852
+ "face_files": ch.get("face_files") or [],
853
+ "char_data": ch,
854
+ "description": desc,
855
+ })
856
+
857
+ used_names_home_pers = []
858
+ used_names_dona_pers = []
859
+ noms_home_all, noms_dona_all = get_all_catalan_names()
860
+ for cp in chars_payload:
861
+ face_name = cp.get("name", "")
862
+ if face_name in noms_home_all:
863
+ used_names_home_pers.append(face_name)
864
+ elif face_name in noms_dona_all:
865
+ used_names_dona_pers.append(face_name)
866
+
867
+ segs = st.session_state.audio_segments or []
868
+ vlabels = st.session_state.voice_labels or []
869
+ vname = st.session_state.video_name_from_engine
870
+ voice_clusters_by_name = {}
871
+ for i, seg in enumerate(segs):
872
+ lbl = vlabels[i] if i < len(vlabels) else -1
873
+ if not (isinstance(lbl, int) and lbl >= 0):
874
+ continue
875
+ vpref = f"voice_{int(lbl):02d}"
876
+ default_voice_name = get_catalan_name_for_speaker(int(lbl), used_names_home_pers, used_names_dona_pers) if isinstance(lbl, int) and lbl >= 0 else f"SPEAKER_{int(lbl):02d}"
877
+ vname_custom = st.session_state.get(f"{vpref}_name") or default_voice_name
878
+ vname_normalized = normalize_name(vname_custom)
879
+ vdesc = st.session_state.get(f"{vpref}_desc", "").strip()
880
+ clip_local = seg.get("clip_path")
881
+ fname = os.path.basename(clip_local) if clip_local else None
882
+ if fname:
883
+ voice_clusters_by_name.setdefault(vname_normalized, {
884
+ "voice_key_prefix": vpref,
885
+ "clips": [],
886
+ "label": lbl,
887
+ "original_name": vname_custom,
888
+ "description": vdesc,
889
+ })
890
+ voice_clusters_by_name[vname_normalized]["clips"].append(fname)
891
+
892
+ all_normalized_names = set([c["name_normalized"] for c in chars_payload] + list(voice_clusters_by_name.keys()))
893
+
894
+ for pidx, norm_name in enumerate(sorted(all_normalized_names)):
895
+ face_items = [c for c in chars_payload if c["name_normalized"] == norm_name]
896
+ voice_data = voice_clusters_by_name.get(norm_name)
897
+
898
+ display_name = face_items[0]["name"] if face_items else (voice_data["original_name"] if voice_data else norm_name)
899
+
900
+ descriptions = []
901
+ for face_item in face_items:
902
+ if face_item["description"]:
903
+ descriptions.append(face_item["description"])
904
+ if voice_data and voice_data.get("description"):
905
+ descriptions.append(voice_data["description"])
906
+
907
+ combined_description = "\n".join(descriptions) if descriptions else ""
908
+
909
+ st.markdown(f"**{pidx+1}. {display_name}**")
910
+
911
+ all_faces = []
912
+ for face_item in face_items:
913
+ all_faces.extend(face_item["face_files"])
914
+
915
+ face_data = face_items[0] if face_items else None
916
+
917
+ col_faces, col_voices, col_text = st.columns([1, 1, 1.5])
918
+
919
+ with col_faces:
920
+ if all_faces:
921
+ carousel_key = f"combined_face_{pidx}"
922
+ if f"{carousel_key}_idx" not in st.session_state:
923
+ st.session_state[f"{carousel_key}_idx"] = 0
924
+ cur = st.session_state[f"{carousel_key}_idx"]
925
+ if cur >= len(all_faces):
926
+ cur = 0
927
+ st.session_state[f"{carousel_key}_idx"] = cur
928
+ fname = all_faces[cur]
929
+ ch = face_data["char_data"] if face_data else {}
930
+ if fname.startswith("/files/"):
931
+ img_url = f"{backend_base_url}{fname}"
932
+ else:
933
+ base = ch.get("image_url") or ""
934
+ base_dir = "/".join((base or "/").split("/")[:-1])
935
+ img_url = f"{backend_base_url}{base_dir}/{fname}" if base_dir else f"{backend_base_url}{fname}"
936
+ st.image(img_url, width=150)
937
+ st.caption(f"Cara {cur+1}/{len(all_faces)}")
938
+ bcol1, bcol2 = st.columns(2)
939
+ with bcol1:
940
+ if st.button("⬅️", key=f"combined_face_prev_{pidx}"):
941
+ st.session_state[f"{carousel_key}_idx"] = (cur - 1) % len(all_faces)
942
+ st.rerun()
943
+ with bcol2:
944
+ if st.button("➡️", key=f"combined_face_next_{pidx}"):
945
+ st.session_state[f"{carousel_key}_idx"] = (cur + 1) % len(all_faces)
946
+ st.rerun()
947
+ else:
948
+ st.info("Sense imatges")
949
+
950
+ with col_voices:
951
+ if voice_data:
952
+ clips = voice_data["clips"]
953
+ if clips:
954
+ carousel_key = f"combined_voice_{pidx}"
955
+ if f"{carousel_key}_idx" not in st.session_state:
956
+ st.session_state[f"{carousel_key}_idx"] = 0
957
+ cur = st.session_state[f"{carousel_key}_idx"]
958
+ if cur >= len(clips):
959
+ cur = 0
960
+ st.session_state[f"{carousel_key}_idx"] = cur
961
+ fname = clips[cur]
962
+ audio_url = f"{backend_base_url}/audio/{vname}/{fname}" if (vname and fname) else None
963
+ if audio_url:
964
+ st.audio(audio_url, format="audio/wav")
965
+ st.caption(f"Veu {cur+1}/{len(clips)}")
966
+ bcol1, bcol2 = st.columns(2)
967
+ with bcol1:
968
+ if st.button("⬅️", key=f"combined_voice_prev_{pidx}"):
969
+ st.session_state[f"{carousel_key}_idx"] = (cur - 1) % len(clips)
970
+ st.rerun()
971
+ with bcol2:
972
+ if st.button("➡️", key=f"combined_voice_next_{pidx}"):
973
+ st.session_state[f"{carousel_key}_idx"] = (cur + 1) % len(clips)
974
+ st.rerun()
975
+ else:
976
+ st.info("Sense clips de veu")
977
+ else:
978
+ st.info("Sense dades de veu")
979
+
980
+ with col_text:
981
+ combined_name_key = f"combined_char_{pidx}_name"
982
+ combined_desc_key = f"combined_char_{pidx}_desc"
983
+
984
+ if combined_name_key not in st.session_state:
985
+ st.session_state[combined_name_key] = norm_name
986
+ if combined_desc_key not in st.session_state:
987
+ st.session_state[combined_desc_key] = combined_description
988
+
989
+ st.text_input("Nom del personatge", key=combined_name_key, label_visibility="collapsed", placeholder="Nom del personatge")
990
+ st.text_area("Descripció", key=combined_desc_key, height=120, label_visibility="collapsed", placeholder="Descripció del personatge")
991
+
992
+ # --- 7. Generar audiodescripció ---
993
+ st.markdown("---")
994
if st.button("🎬 Generar audiodescripció", type="primary", use_container_width=True):
    # Send the uploaded video to the backend and render the two outputs it
    # returns (UNE-153010 SRT and free narration), each with a download
    # button.
    v = st.session_state.get("video_uploaded")
    if not v:
        st.error("No hi ha cap vídeo carregat.")
    else:
        # Separate placeholders so the progress message and the final result
        # can be replaced independently of each other.
        progress_placeholder = st.empty()
        result_placeholder = st.empty()

        with st.spinner("Generant audiodescripció... Aquest procés pot trigar diversos minuts."):
            progress_placeholder.info("⏳ Processant vídeo i generant audiodescripció UNE-153010...")

            try:
                out = api.generate_audiodescription(v["bytes"], v["name"])

                if isinstance(out, dict) and out.get("status") == "done":
                    progress_placeholder.success("✅ Audiodescripció generada correctament!")
                    # `or {}` also covers a "results" key that is present but
                    # null, which a plain .get default would not.
                    res = out.get("results") or {}

                    with result_placeholder.container():
                        st.success("🎉 Audiodescripció completada!")
                        c1, c2 = st.columns([1,1])
                        with c1:
                            st.markdown("**📄 UNE-153010 SRT**")
                            une_srt_content = res.get("une_srt", "")
                            st.code(une_srt_content, language="text")
                            if une_srt_content:
                                st.download_button(
                                    "⬇️ Descarregar UNE SRT",
                                    data=une_srt_content,
                                    file_name=f"{v['name']}_une.srt",
                                    mime="text/plain"
                                )
                        with c2:
                            st.markdown("**📝 Narració lliure**")
                            free_text_content = res.get("free_text", "")
                            # Streamlit discourages empty widget labels; give
                            # a real label and hide it, since the markdown
                            # heading above already titles the column.
                            st.text_area(
                                "Narració lliure",
                                value=free_text_content,
                                height=240,
                                key="free_text_result",
                                label_visibility="collapsed",
                            )
                            if free_text_content:
                                st.download_button(
                                    "⬇️ Descarregar text lliure",
                                    data=free_text_content,
                                    file_name=f"{v['name']}_free.txt",
                                    mime="text/plain"
                                )
                else:
                    progress_placeholder.empty()
                    error_msg = str(out.get("error", out)) if isinstance(out, dict) else str(out)
                    result_placeholder.error(f"❌ Error generant l'audiodescripció: {error_msg}")

            except Exception as e:
                progress_placeholder.empty()
                result_placeholder.error(f"❌ Excepció durant la generació: {e}")