VeuReu committed on
Commit
09e41fd
·
1 Parent(s): f026f25

Upload 7 files

Browse files
Files changed (2) hide show
  1. api_client.py +27 -14
  2. page_modules/process_video.py +512 -447
api_client.py CHANGED
@@ -466,23 +466,36 @@ def describe_image_with_svision(image_path: str, is_face: bool = True) -> Tuple[
466
  print(f"[svision] Enviant petició (pot trigar si ZeroGPU està en cold start)...")
467
  print(f"[svision] Image path: {image_path}")
468
 
469
- # Llamar al endpoint /describe con timeout aumentado para ZeroGPU cold start
470
- # El primer request puede tardar 30-60 segundos en ZeroGPU
471
  import time
472
  start_time = time.time()
473
-
474
- # IMPORTANTE: usar file() de gradio_client para enviar el archivo correctamente
475
- # Ajustar max_new_tokens según el tipo (escenas más breves)
476
  max_tokens = 256 if is_face else 128
477
-
478
- result = client.predict(
479
- handle_file(image_path), # Enviar el archivo usando el helper de gradio
480
- prompt, # texto
481
- max_tokens, # max_new_tokens (128 para escenas, 256 para caras)
482
- 0.7, # temperature
483
- api_name="/describe"
484
- )
485
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
  elapsed = time.time() - start_time
487
  print(f"[svision] Resposta rebuda en {elapsed:.1f}s")
488
 
 
466
  print(f"[svision] Enviant petició (pot trigar si ZeroGPU està en cold start)...")
467
  print(f"[svision] Image path: {image_path}")
468
 
 
 
469
  import time
470
  start_time = time.time()
 
 
 
471
  max_tokens = 256 if is_face else 128
472
+ max_attempts = int(os.getenv("SVISION_MAX_ATTEMPTS", "5"))
473
+ wait_seconds = int(os.getenv("SVISION_RETRY_WAIT", "5"))
474
+ result = None
475
+ last_error: Exception | None = None
476
+
477
+ for attempt in range(1, max_attempts + 1):
478
+ try:
479
+ print(f"[svision] Attempt {attempt}/{max_attempts} (wait={wait_seconds}s)")
480
+ result = client.predict(
481
+ handle_file(image_path),
482
+ prompt,
483
+ max_tokens,
484
+ 0.7,
485
+ api_name="/describe",
486
+ timeout=int(os.getenv("SVISION_TIMEOUT", "180")),
487
+ )
488
+ if result and isinstance(result, str) and result.strip():
489
+ break
490
+ raise RuntimeError("Resposta buida de svision")
491
+ except Exception as exc:
492
+ last_error = exc
493
+ print(f"[svision] Error attempt {attempt}/{max_attempts}: {exc}")
494
+ if attempt == max_attempts:
495
+ raise
496
+ time.sleep(wait_seconds)
497
+ wait_seconds = min(wait_seconds * 2, 40)
498
+
499
  elapsed = time.time() - start_time
500
  print(f"[svision] Resposta rebuda en {elapsed:.1f}s")
501
 
page_modules/process_video.py CHANGED
@@ -118,453 +118,518 @@ def _transcode_video(input_path: str, output_path: str, max_duration: int | None
118
  "+faststart",
119
  output_path,
120
  ]
121
- result = subprocess.run(cmd, capture_output=True, text=True)
122
- if result.returncode != 0:
123
- raise RuntimeError(result.stderr.strip() or "ffmpeg failed")
124
-
125
-
126
- def render_process_video_page(api, backend_base_url: str) -> None:
127
- st.header("Processar un nou clip de vídeo")
128
-
129
- # Inicializar el estado de la página si no existe
130
- if "video_uploaded" not in st.session_state:
131
- st.session_state.video_uploaded = None
132
- if "characters_detected" not in st.session_state:
133
- st.session_state.characters_detected = None
134
- if "audio_segments" not in st.session_state:
135
- st.session_state.audio_segments = None
136
- if "voice_labels" not in st.session_state:
137
- st.session_state.voice_labels = None
138
- if "face_labels" not in st.session_state:
139
- st.session_state.face_labels = None
140
- if "scene_clusters" not in st.session_state:
141
- st.session_state.scene_clusters = None
142
- if "scene_detection_done" not in st.session_state:
143
- st.session_state.scene_detection_done = False
144
- if "detect_done" not in st.session_state:
145
- st.session_state.detect_done = False
146
- if "casting_finalized" not in st.session_state:
147
- st.session_state.casting_finalized = False
148
- if "video_name_from_engine" not in st.session_state:
149
- st.session_state.video_name_from_engine = None
150
- if "diarization_info" not in st.session_state:
151
- st.session_state.diarization_info = {}
152
- if "characters_saved" not in st.session_state:
153
- st.session_state.characters_saved = False
154
-
155
- # --- 1. Subida del vídeo ---
156
- MAX_SIZE_MB = 20
157
- MAX_DURATION_S = 240 # 4 minutos
158
-
159
- uploaded_file = st.file_uploader(
160
- "Puja un clip de vídeo (MP4, < 20MB, < 4 minuts)",
161
- type=["mp4"],
162
- key="video_uploader",
163
- )
164
-
165
- if uploaded_file is not None:
166
- # Resetear el estado si se sube un nuevo archivo
167
- if st.session_state.video_uploaded is None or uploaded_file.name != st.session_state.video_uploaded.get(
168
- "original_name"
169
- ):
170
- st.session_state.video_uploaded = {"original_name": uploaded_file.name, "status": "validating"}
171
- st.session_state.characters_detected = None
172
- st.session_state.characters_saved = False
173
-
174
- if st.session_state.video_uploaded["status"] == "validating":
175
- is_valid = True
176
- if uploaded_file.size > MAX_SIZE_MB * 1024 * 1024:
177
- st.error(f"El vídeo supera el límit de {MAX_SIZE_MB}MB.")
178
- is_valid = False
179
-
180
- if is_valid:
181
- with st.spinner("Processant el vídeo..."):
182
- temp_path = Path("temp_video.mp4")
183
- with temp_path.open("wb") as f:
184
- f.write(uploaded_file.getbuffer())
185
-
186
- was_truncated = False
187
- final_video_path = None
188
- try:
189
- duration = _get_video_duration(str(temp_path))
190
- duration_unknown = False
191
- if not duration:
192
- st.warning(
193
- "No s'ha pogut obtenir la durada del vídeo. Es continuarà assumint un màxim de 4 minuts."
194
- )
195
- duration = float(MAX_DURATION_S)
196
- duration_unknown = True
197
-
198
- if is_valid:
199
- if duration > MAX_DURATION_S:
200
- was_truncated = True
201
-
202
- video_name = Path(uploaded_file.name).stem
203
- video_dir = Path("/tmp/data/videos") / video_name
204
- video_dir.mkdir(parents=True, exist_ok=True)
205
- final_video_path = video_dir / f"{video_name}.mp4"
206
-
207
- try:
208
- _transcode_video(
209
- str(temp_path),
210
- str(final_video_path),
211
- MAX_DURATION_S if (was_truncated or duration_unknown) else None,
212
- )
213
- except RuntimeError as exc:
214
- st.error(f"No s'ha pogut processar el vídeo: {exc}")
215
- is_valid = False
216
-
217
- if is_valid and final_video_path is not None:
218
- st.session_state.video_uploaded.update(
219
- {
220
- "status": "processed",
221
- "path": str(final_video_path),
222
- "was_truncated": was_truncated or duration_unknown,
223
- "duration_unknown": duration_unknown,
224
- "bytes": uploaded_file.getvalue(),
225
- "name": uploaded_file.name,
226
- }
227
- )
228
- st.rerun()
229
- finally:
230
- if temp_path.exists():
231
- temp_path.unlink()
232
-
233
- if st.session_state.video_uploaded and st.session_state.video_uploaded["status"] == "processed":
234
- st.success(f"Vídeo '{st.session_state.video_uploaded['original_name']}' pujat i processat correctament.")
235
- if st.session_state.video_uploaded["was_truncated"]:
236
- st.warning("El vídeo s'ha truncat a 4 minuts.")
237
-
238
- # --- 2. Form de detecció amb sliders ---
239
- st.markdown("---")
240
-
241
- with st.form("detect_form"):
242
- col_btn, col_face, col_voice, col_scene = st.columns([1, 1, 1, 1])
243
- with col_face:
244
- st.markdown("**Cares**")
245
- face_max_groups = st.slider("Límit de grups (cares)", 1, 10, 5, 1, key="face_max_groups")
246
- face_min_cluster = st.slider("Mida mínima (cares)", 1, 5, 3, 1, key="face_min_cluster")
247
- face_sensitivity = st.slider("Sensibilitat (cares)", 0.0, 1.0, 0.5, 0.05, key="face_sensitivity",
248
- help="0.0 = menys clusters (més agressiu), 0.5 = balancejat, 1.0 = més clusters (més permissiu)")
249
- with col_voice:
250
- st.markdown("**Veus**")
251
- voice_max_groups = st.slider("Límit de grups (veus)", 1, 10, 5, 1, key="voice_max_groups")
252
- voice_min_cluster = st.slider("Mida mínima (veus)", 1, 5, 3, 1, key="voice_min_cluster")
253
- voice_sensitivity = st.slider("Sensibilitat (veus)", 0.0, 1.0, 0.5, 0.05, key="voice_sensitivity",
254
- help="0.0 = menys clusters (més agressiu), 0.5 = balancejat, 1.0 = més clusters (més permissiu)")
255
- with col_scene:
256
- st.markdown("**Escenes**")
257
- scene_max_groups = st.slider("Límit de grups (escenes)", 1, 10, 3, 1, key="scene_max_groups")
258
- scene_min_cluster = st.slider("Mida mínima (escenes)", 5, 20, 12, 1, key="scene_min_cluster")
259
- scene_sensitivity = st.slider("Sensibilitat (escenes)", 0.0, 1.0, 0.5, 0.05, key="scene_sensitivity",
260
- help="0.0 = menys clusters (més agressiu), 0.5 = balancejat, 1.0 = més clusters (més permissiu)")
261
- with col_btn:
262
- max_frames = st.number_input("Nombre de frames a processar", min_value=10, max_value=500, value=100, step=10,
263
- help="Nombre de fotogrames equiespaciats a extreure del vídeo per detectar cares")
264
- can_detect = st.session_state.video_uploaded is not None
265
- submit_detect = st.form_submit_button("Detectar Personatges", disabled=not can_detect)
266
-
267
- if not can_detect:
268
- st.caption("📹 Necessites pujar un vídeo primer")
269
-
270
- if submit_detect:
271
- try:
272
- v = st.session_state.video_uploaded
273
- # Reset estat abans de començar
274
- st.session_state.scene_clusters = None
275
- st.session_state.scene_detection_done = False
276
- st.session_state.detect_done = False
277
- st.session_state.casting_finalized = False
278
-
279
- resp = api.create_initial_casting(
280
- video_bytes=v["bytes"],
281
- video_name=v["name"],
282
- face_max_groups=face_max_groups,
283
- face_min_cluster_size=face_min_cluster,
284
- face_sensitivity=face_sensitivity,
285
- voice_max_groups=voice_max_groups,
286
- voice_min_cluster_size=voice_min_cluster,
287
- voice_sensitivity=voice_sensitivity,
288
- max_frames=max_frames,
289
- )
290
-
291
- if not isinstance(resp, dict) or not resp.get("job_id"):
292
- st.error("No s'ha pogut crear el job al servidor.")
293
- else:
294
- job_id = resp["job_id"]
295
- with st.spinner("Processant al servidor…"):
296
- time.sleep(3)
297
- attempt, max_attempts = 0, 120
298
- progress_placeholder = st.empty()
299
- while attempt < max_attempts:
300
- stt = api.get_job(job_id)
301
- status = stt.get("status")
302
- if status in ("queued", "processing"):
303
- if attempt % 10 == 0:
304
- elapsed_min = (attempt * 5) // 60
305
- progress_placeholder.info(f"⏳ Processant al servidor... (~{elapsed_min} min)")
306
- time.sleep(5)
307
- attempt += 1
308
- continue
309
- if status == "failed":
310
- progress_placeholder.empty()
311
- st.error("El processament ha fallat al servidor.")
312
- break
313
-
314
- # Success
315
- res = stt.get("results", {})
316
- chars = res.get("characters", [])
317
- fl = res.get("face_labels", [])
318
- segs = res.get("audio_segments", [])
319
- vl = res.get("voice_labels", [])
320
- base_dir = res.get("base_dir")
321
- vname = os.path.basename(base_dir) if base_dir else None
322
- diar_info = res.get("diarization_info", {})
323
-
324
- st.session_state.characters_detected = chars or []
325
- st.session_state.face_labels = fl or []
326
- st.session_state.audio_segments = segs or []
327
- st.session_state.voice_labels = vl or []
328
- st.session_state.video_name_from_engine = vname
329
- st.session_state.engine_base_dir = base_dir
330
- st.session_state.diarization_info = diar_info or {}
331
-
332
- progress_placeholder.empty()
333
-
334
- if chars:
335
- st.success(f"✓ Detecció completada! Trobades {len(chars)} cares.")
336
- st.info("💡 Usa els botons '🎨 Generar descripció' a sota de cada personatge per obtenir descripcions automàtiques amb Salamandra Vision.")
337
- else:
338
- st.info("No s'han detectat cares en aquest vídeo.")
339
-
340
- # Detect scenes
341
- try:
342
- scene_out = api.detect_scenes(
343
- video_bytes=v["bytes"],
344
- video_name=v["name"],
345
- max_groups=scene_max_groups,
346
- min_cluster_size=scene_min_cluster,
347
- scene_sensitivity=scene_sensitivity,
348
- frame_interval_sec=0.5,
349
- )
350
- scs = scene_out.get("scene_clusters") if isinstance(scene_out, dict) else None
351
- if isinstance(scs, list):
352
- st.session_state.scene_clusters = scs
353
- else:
354
- st.session_state.scene_clusters = []
355
- except Exception:
356
- st.session_state.scene_clusters = []
357
- finally:
358
- st.session_state.scene_detection_done = True
359
-
360
- st.session_state.detect_done = True
361
- st.success("✅ Processament completat!")
362
- break
363
- else:
364
- progress_placeholder.empty()
365
- st.warning(f"⏱️ El servidor no ha completat el job en {max_attempts * 5 // 60} minuts.")
366
- except Exception as e:
367
- st.error(f"Error inesperat: {e}")
368
-
369
- # --- 3. Carruseles de cares ---
370
- if st.session_state.get("characters_detected") is not None:
371
- st.markdown("---")
372
- n_face_clusters = len(st.session_state.get("characters_detected") or [])
373
- st.subheader(f"🖼️ Cares — clústers: {n_face_clusters}")
374
-
375
- if n_face_clusters == 0:
376
- st.info("No s'han detectat clústers de cara en aquest clip.")
377
-
378
- for idx, ch in enumerate(st.session_state.characters_detected or []):
379
- try:
380
- folder_name = Path(ch.get("folder") or "").name
381
- except Exception:
382
- folder_name = ""
383
- char_id = ch.get("id") or folder_name or f"char{idx+1}"
384
-
385
- def _safe_key(s: str) -> str:
386
- k = re.sub(r"[^0-9a-zA-Z_]+", "_", s or "")
387
- return k or f"cluster_{idx+1}"
388
-
389
- key_prefix = _safe_key(f"char_{idx+1}_{char_id}")
390
- if f"{key_prefix}_idx" not in st.session_state:
391
- st.session_state[f"{key_prefix}_idx"] = 0
392
- if f"{key_prefix}_discard" not in st.session_state:
393
- st.session_state[f"{key_prefix}_discard"] = set()
394
-
395
- faces_all = ch.get("face_files") or ([ch.get("image_url")] if ch.get("image_url") else [])
396
- faces_all = [f for f in faces_all if f]
397
- discard_set = st.session_state[f"{key_prefix}_discard"]
398
- faces = [f for f in faces_all if f not in discard_set]
399
-
400
- if not faces:
401
- st.write(f"- {idx+1}. {ch.get('name','(sense nom)')} — sense imatges seleccionades")
402
- continue
403
-
404
- cur = st.session_state[f"{key_prefix}_idx"]
405
- if cur >= len(faces):
406
- cur = 0
407
- st.session_state[f"{key_prefix}_idx"] = cur
408
- fname = faces[cur]
409
-
410
- if fname.startswith("/files/"):
411
- img_url = f"{backend_base_url}{fname}"
412
- else:
413
- base = ch.get("image_url") or ""
414
- base_dir = "/".join((base or "/").split("/")[:-1])
415
- img_url = f"{backend_base_url}{base_dir}/{fname}" if base_dir else f"{backend_base_url}{fname}"
416
-
417
- st.markdown(f"**{idx+1}. {ch.get('name','(sense nom)')} — {ch.get('num_faces', 0)} cares**")
418
- c1, c2 = st.columns([1, 3])
419
- with c1:
420
- st.image(img_url, width=150)
421
- st.caption(f"Imatge {cur+1}/{len(faces)}")
422
- bcol1, bcol2, bcol3 = st.columns(3)
423
- with bcol1:
424
- if st.button("⬅️", key=f"prev_{key_prefix}", help="Anterior"):
425
- st.session_state[f"{key_prefix}_idx"] = (cur - 1) % len(faces)
426
- st.rerun()
427
- with bcol2:
428
- if st.button("🗑️", key=f"del_{key_prefix}", help="Eliminar aquesta imatge del clúster"):
429
- st.session_state[f"{key_prefix}_discard"].add(fname)
430
- new_list = [f for f in faces if f != fname]
431
- new_idx = cur if cur < len(new_list) else 0
432
- st.session_state[f"{key_prefix}_idx"] = new_idx
433
- st.rerun()
434
- with bcol3:
435
- if st.button("➡️", key=f"next_{key_prefix}", help="Següent"):
436
- st.session_state[f"{key_prefix}_idx"] = (cur + 1) % len(faces)
437
- st.rerun()
438
- with c2:
439
- name_key = f"{key_prefix}_name"
440
- desc_key = f"{key_prefix}_desc"
441
- default_name = ch.get("name", "")
442
- default_desc = ch.get("description", "")
443
-
444
- if default_name and (name_key not in st.session_state or not st.session_state.get(name_key)):
445
- st.session_state[name_key] = default_name
446
- elif name_key not in st.session_state:
447
- st.session_state[name_key] = default_name or ""
448
-
449
- if default_desc and (desc_key not in st.session_state or not st.session_state.get(desc_key)):
450
- st.session_state[desc_key] = default_desc
451
- elif desc_key not in st.session_state:
452
- st.session_state[desc_key] = default_desc or ""
453
-
454
- pending_desc_key = f"{key_prefix}_pending_desc"
455
- pending_name_key = f"{key_prefix}_pending_name"
456
- if pending_desc_key in st.session_state:
457
- if desc_key not in st.session_state:
458
- st.session_state[desc_key] = ""
459
- st.session_state[desc_key] = st.session_state[pending_desc_key]
460
- del st.session_state[pending_desc_key]
461
-
462
- if pending_name_key in st.session_state:
463
- if name_key not in st.session_state:
464
- st.session_state[name_key] = ""
465
- if not st.session_state.get(name_key):
466
- st.session_state[name_key] = st.session_state[pending_name_key]
467
- del st.session_state[pending_name_key]
468
-
469
- st.text_input("Nom del clúster", key=name_key)
470
- st.text_area("Descripció", key=desc_key, height=80)
471
-
472
- if st.button("🎨 Generar descripció amb Salamandra Vision", key=f"svision_{key_prefix}"):
473
- with st.spinner("Generant descripció..."):
474
- from api_client import describe_image_with_svision
475
- import requests as _req
476
-
477
- try:
478
- resp = _req.get(img_url, timeout=10)
479
- if resp.status_code == 200:
480
- with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
481
- tmp.write(resp.content)
482
- tmp_path = tmp.name
483
-
484
- desc, name = describe_image_with_svision(tmp_path, is_face=True)
485
-
486
- if desc:
487
- st.session_state[pending_desc_key] = desc
488
- st.success("✅ Descripció generada!")
489
- else:
490
- st.warning("⚠️ No s'ha pogut generar una descripció.")
491
-
492
- if name and not st.session_state.get(name_key):
493
- st.session_state[pending_name_key] = name
494
-
495
- os.unlink(tmp_path)
496
- st.rerun()
497
- else:
498
- st.error(f"No s'ha pogut descarregar la imatge (status: {resp.status_code})")
499
- except Exception as e:
500
- st.error(f"Error generant descripció: {e}")
501
-
502
- # --- 4. Carruseles de veus ---
503
- if st.session_state.get("audio_segments") is not None:
504
- st.markdown("---")
505
-
506
- used_names_home = []
507
- used_names_dona = []
508
- noms_home_all, noms_dona_all = get_all_catalan_names()
509
-
510
- for ch in (st.session_state.characters_detected or []):
511
- ch_name = ch.get("name", "")
512
- if ch_name in noms_home_all:
513
- used_names_home.append(ch_name)
514
- elif ch_name in noms_dona_all:
515
- used_names_dona.append(ch_name)
516
-
517
- segs = st.session_state.audio_segments or []
518
- vlabels = st.session_state.voice_labels or []
519
- valid_indices = [i for i, l in enumerate(vlabels) if isinstance(l, int) and l >= 0]
520
- clusters = {}
521
- for i in valid_indices:
522
- lbl = int(vlabels[i])
523
- clusters.setdefault(lbl, []).append(i)
524
- n_vclusters = len(clusters)
525
- st.subheader(f"🎙️ Empremtes de veu — clústers: {n_vclusters}")
526
- di = st.session_state.get("diarization_info") or {}
527
- if isinstance(di, dict) and not di.get("diarization_ok", True):
528
- st.warning("No s'ha pogut fer la diarització amb pyannote (s'ha aplicat un sol segment de reserva).")
529
- if not segs:
530
- st.info("No s'han detectat mostres de veu.")
531
- elif n_vclusters == 0:
532
- st.info("No s'han format clústers de veu.")
533
- else:
534
- vname = st.session_state.video_name_from_engine
535
- for lbl, idxs in sorted(clusters.items(), key=lambda x: x[0]):
536
- key_prefix = f"voice_{lbl:02d}"
537
- if f"{key_prefix}_idx" not in st.session_state:
538
- st.session_state[f"{key_prefix}_idx"] = 0
539
- if f"{key_prefix}_discard" not in st.session_state:
540
- st.session_state[f"{key_prefix}_discard"] = set()
541
- discard_set = st.session_state[f"{key_prefix}_discard"]
542
- files = []
543
- for i in idxs:
544
- clip_local = (segs[i] or {}).get("clip_path")
545
- fname = os.path.basename(clip_local) if clip_local else None
546
- if fname:
547
- files.append(fname)
548
- files = [f for f in files if f and f not in discard_set]
549
- if not files:
550
- st.write(f"- SPEAKER_{lbl:02d} — sense clips seleccionats")
551
- continue
552
- cur = st.session_state[f"{key_prefix}_idx"]
553
- if cur >= len(files):
554
- cur = 0
555
- st.session_state[f"{key_prefix}_idx"] = cur
556
- fname = files[cur]
557
- audio_url = f"{backend_base_url}/audio/{vname}/{fname}" if (vname and fname) else None
558
- st.markdown(f"**SPEAKER_{lbl:02d} {len(files)} clips**")
559
- c1, c2 = st.columns([1, 2])
560
- with c1:
561
- if audio_url:
562
- st.audio(audio_url, format="audio/wav")
563
- st.caption(f"Clip {cur+1}/{len(files)}")
564
- bcol1, bcol2, bcol3 = st.columns(3)
565
- with bcol1:
566
- if st.button("⬅️", key=f"prev_{key_prefix}", help="Anterior"):
567
- st.session_state[f"{key_prefix}_idx"] = (cur - 1) % len(files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568
  st.rerun()
569
  with bcol2:
570
  if st.button("🗑️", key=f"del_{key_prefix}", help="Eliminar aquest clip del clúster"):
 
118
  "+faststart",
119
  output_path,
120
  ]
121
+ result = subprocess.run(cmd, capture_output=True, text=True)
122
+ if result.returncode != 0:
123
+ raise RuntimeError(result.stderr.strip() or "ffmpeg failed")
124
+
125
+
126
+ def render_process_video_page(api, backend_base_url: str) -> None:
127
+ st.header("Processar un nou clip de vídeo")
128
+
129
+ if not st.session_state.get("process_video_css_applied"):
130
+ st.markdown(
131
+ """
132
+ <style>
133
+ /* Estabilitzar carrusels per evitar vibracions de layout */
134
+ .stImage {
135
+ min-height: 200px;
136
+ max-height: 250px;
137
+ display: flex;
138
+ align-items: center;
139
+ justify-content: center;
140
+ overflow: hidden;
141
+ }
142
+
143
+ .stImage > img {
144
+ max-width: 100%;
145
+ height: auto;
146
+ object-fit: contain;
147
+ }
148
+
149
+ .stAudio {
150
+ min-height: 54px;
151
+ max-height: 80px;
152
+ }
153
+
154
+ .stCaption {
155
+ min-height: 20px;
156
+ }
157
+
158
+ .stTextInput > div,
159
+ .stTextArea > div {
160
+ transition: none !important;
161
+ }
162
+
163
+ .stButton button {
164
+ transition: background-color 0.2s, color 0.2s;
165
+ min-height: 38px;
166
+ white-space: nowrap;
167
+ }
168
+
169
+ div[data-testid="column"] > div {
170
+ contain: layout style;
171
+ }
172
+
173
+ [data-testid="stVerticalBlock"] > div {
174
+ will-change: auto;
175
+ }
176
+ </style>
177
+ """,
178
+ unsafe_allow_html=True,
179
+ )
180
+ st.session_state.process_video_css_applied = True
181
+
182
+ msg_detect = st.empty()
183
+ msg_finalize = st.empty()
184
+ msg_ad = st.empty()
185
+
186
+ # Inicializar el estado de la página si no existe
187
+ if "video_uploaded" not in st.session_state:
188
+ st.session_state.video_uploaded = None
189
+ if "characters_detected" not in st.session_state:
190
+ st.session_state.characters_detected = None
191
+ if "audio_segments" not in st.session_state:
192
+ st.session_state.audio_segments = None
193
+ if "voice_labels" not in st.session_state:
194
+ st.session_state.voice_labels = None
195
+ if "face_labels" not in st.session_state:
196
+ st.session_state.face_labels = None
197
+ if "scene_clusters" not in st.session_state:
198
+ st.session_state.scene_clusters = None
199
+ if "scene_detection_done" not in st.session_state:
200
+ st.session_state.scene_detection_done = False
201
+ if "detect_done" not in st.session_state:
202
+ st.session_state.detect_done = False
203
+ if "casting_finalized" not in st.session_state:
204
+ st.session_state.casting_finalized = False
205
+ if "video_name_from_engine" not in st.session_state:
206
+ st.session_state.video_name_from_engine = None
207
+ if "diarization_info" not in st.session_state:
208
+ st.session_state.diarization_info = {}
209
+ if "characters_saved" not in st.session_state:
210
+ st.session_state.characters_saved = False
211
+
212
+ # --- 1. Subida del vídeo ---
213
+ MAX_SIZE_MB = 20
214
+ MAX_DURATION_S = 240 # 4 minutos
215
+
216
+ uploaded_file = st.file_uploader(
217
+ "Puja un clip de vídeo (MP4, < 20MB, < 4 minuts)",
218
+ type=["mp4"],
219
+ key="video_uploader",
220
+ )
221
+
222
+ if uploaded_file is not None:
223
+ # Resetear el estado si se sube un nuevo archivo
224
+ if st.session_state.video_uploaded is None or uploaded_file.name != st.session_state.video_uploaded.get(
225
+ "original_name"
226
+ ):
227
+ st.session_state.video_uploaded = {"original_name": uploaded_file.name, "status": "validating"}
228
+ st.session_state.characters_detected = None
229
+ st.session_state.characters_saved = False
230
+
231
+ if st.session_state.video_uploaded["status"] == "validating":
232
+ is_valid = True
233
+ if uploaded_file.size > MAX_SIZE_MB * 1024 * 1024:
234
+ st.error(f"El vídeo supera el límit de {MAX_SIZE_MB}MB.")
235
+ is_valid = False
236
+
237
+ if is_valid:
238
+ with st.spinner("Processant el vídeo..."):
239
+ temp_path = Path("temp_video.mp4")
240
+ with temp_path.open("wb") as f:
241
+ f.write(uploaded_file.getbuffer())
242
+
243
+ was_truncated = False
244
+ final_video_path = None
245
+ try:
246
+ duration = _get_video_duration(str(temp_path))
247
+ duration_unknown = False
248
+ if not duration:
249
+ st.warning(
250
+ "No s'ha pogut obtenir la durada del vídeo. Es continuarà assumint un màxim de 4 minuts."
251
+ )
252
+ duration = float(MAX_DURATION_S)
253
+ duration_unknown = True
254
+
255
+ if is_valid:
256
+ if duration > MAX_DURATION_S:
257
+ was_truncated = True
258
+
259
+ video_name = Path(uploaded_file.name).stem
260
+ video_dir = Path("/tmp/data/videos") / video_name
261
+ video_dir.mkdir(parents=True, exist_ok=True)
262
+ final_video_path = video_dir / f"{video_name}.mp4"
263
+
264
+ try:
265
+ _transcode_video(
266
+ str(temp_path),
267
+ str(final_video_path),
268
+ MAX_DURATION_S if (was_truncated or duration_unknown) else None,
269
+ )
270
+ except RuntimeError as exc:
271
+ st.error(f"No s'ha pogut processar el vídeo: {exc}")
272
+ is_valid = False
273
+
274
+ if is_valid and final_video_path is not None:
275
+ st.session_state.video_uploaded.update(
276
+ {
277
+ "status": "processed",
278
+ "path": str(final_video_path),
279
+ "was_truncated": was_truncated or duration_unknown,
280
+ "duration_unknown": duration_unknown,
281
+ "bytes": uploaded_file.getvalue(),
282
+ "name": uploaded_file.name,
283
+ }
284
+ )
285
+ st.rerun()
286
+ finally:
287
+ if temp_path.exists():
288
+ temp_path.unlink()
289
+
290
+ if st.session_state.video_uploaded and st.session_state.video_uploaded["status"] == "processed":
291
+ st.success(f"Vídeo '{st.session_state.video_uploaded['original_name']}' pujat i processat correctament.")
292
+ if st.session_state.video_uploaded["was_truncated"]:
293
+ st.warning("El vídeo s'ha truncat a 4 minuts.")
294
+
295
+ # --- 2. Form de detecció amb sliders ---
296
+ st.markdown("---")
297
+
298
+ with st.form("detect_form"):
299
+ col_btn, col_face, col_voice, col_scene = st.columns([1, 1, 1, 1])
300
+ with col_face:
301
+ st.markdown("**Cares**")
302
+ face_max_groups = st.slider("Límit de grups (cares)", 1, 10, 5, 1, key="face_max_groups")
303
+ face_min_cluster = st.slider("Mida mínima (cares)", 1, 5, 3, 1, key="face_min_cluster")
304
+ face_sensitivity = st.slider("Sensibilitat (cares)", 0.0, 1.0, 0.5, 0.05, key="face_sensitivity",
305
+ help="0.0 = menys clusters (més agressiu), 0.5 = balancejat, 1.0 = més clusters (més permissiu)")
306
+ with col_voice:
307
+ st.markdown("**Veus**")
308
+ voice_max_groups = st.slider("Límit de grups (veus)", 1, 10, 5, 1, key="voice_max_groups")
309
+ voice_min_cluster = st.slider("Mida mínima (veus)", 1, 5, 3, 1, key="voice_min_cluster")
310
+ voice_sensitivity = st.slider("Sensibilitat (veus)", 0.0, 1.0, 0.5, 0.05, key="voice_sensitivity",
311
+ help="0.0 = menys clusters (més agressiu), 0.5 = balancejat, 1.0 = més clusters (més permissiu)")
312
+ with col_scene:
313
+ st.markdown("**Escenes**")
314
+ scene_max_groups = st.slider("Límit de grups (escenes)", 1, 10, 3, 1, key="scene_max_groups")
315
+ scene_min_cluster = st.slider("Mida mínima (escenes)", 5, 20, 12, 1, key="scene_min_cluster")
316
+ scene_sensitivity = st.slider("Sensibilitat (escenes)", 0.0, 1.0, 0.5, 0.05, key="scene_sensitivity",
317
+ help="0.0 = menys clusters (més agressiu), 0.5 = balancejat, 1.0 = més clusters (més permissiu)")
318
+ with col_btn:
319
+ max_frames = st.number_input("Nombre de frames a processar", min_value=10, max_value=500, value=100, step=10,
320
+ help="Nombre de fotogrames equiespaciats a extreure del vídeo per detectar cares")
321
+ can_detect = st.session_state.video_uploaded is not None
322
+ submit_detect = st.form_submit_button("Detectar Personatges", disabled=not can_detect)
323
+
324
+ if not can_detect:
325
+ st.caption("📹 Necessites pujar un vídeo primer")
326
+
327
+ if submit_detect:
328
+ import time as _t
329
+ import os as _os
330
+ msg_detect.empty()
331
+ msg_finalize.empty()
332
+ msg_ad.empty()
333
+ try:
334
+ v = st.session_state.video_uploaded
335
+ # Reset estat abans de començar
336
+ st.session_state.scene_clusters = None
337
+ st.session_state.scene_detection_done = False
338
+ st.session_state.detect_done = False
339
+ st.session_state.casting_finalized = False
340
+
341
+ resp = api.create_initial_casting(
342
+ video_bytes=v["bytes"],
343
+ video_name=v["name"],
344
+ face_max_groups=face_max_groups,
345
+ face_min_cluster_size=face_min_cluster,
346
+ face_sensitivity=face_sensitivity,
347
+ voice_max_groups=voice_max_groups,
348
+ voice_min_cluster_size=voice_min_cluster,
349
+ voice_sensitivity=voice_sensitivity,
350
+ max_frames=max_frames,
351
+ )
352
+
353
+ if not isinstance(resp, dict) or not resp.get("job_id"):
354
+ msg_detect.error("No s'ha pogut crear el job al servidor. Torna-ho a intentar.")
355
+ else:
356
+ job_id = resp["job_id"]
357
+ msg_detect.info(f"Job creat: {job_id}. Iniciant polling en 3s…")
358
+ with st.spinner("Processant al servidor…"):
359
+ _t.sleep(3)
360
+ attempt, max_attempts = 0, 120
361
+ progress_placeholder = st.empty()
362
+ while attempt < max_attempts:
363
+ stt = api.get_job(job_id)
364
+ status = stt.get("status")
365
+ if status in ("queued", "processing"):
366
+ if attempt % 10 == 0:
367
+ elapsed_min = (attempt * 5) // 60
368
+ progress_placeholder.info(f"⏳ Processant al servidor... (~{elapsed_min} min)")
369
+ _t.sleep(5)
370
+ attempt += 1
371
+ continue
372
+ if status == "failed":
373
+ progress_placeholder.empty()
374
+ msg_detect.error("El processament ha fallat al servidor.")
375
+ break
376
+
377
+ # Success
378
+ res = stt.get("results", {})
379
+ chars = res.get("characters", [])
380
+ fl = res.get("face_labels", [])
381
+ segs = res.get("audio_segments", [])
382
+ vl = res.get("voice_labels", [])
383
+ base_dir = res.get("base_dir")
384
+ vname = _os.path.basename(base_dir) if base_dir else None
385
+ diar_info = res.get("diarization_info", {})
386
+
387
+ st.session_state.characters_detected = chars or []
388
+ st.session_state.face_labels = fl or []
389
+ st.session_state.audio_segments = segs or []
390
+ st.session_state.voice_labels = vl or []
391
+ st.session_state.video_name_from_engine = vname
392
+ st.session_state.engine_base_dir = base_dir
393
+ st.session_state.diarization_info = diar_info or {}
394
+
395
+ progress_placeholder.empty()
396
+
397
+ if chars:
398
+ msg_detect.success(
399
+ f"✓ Detecció completada! Trobades {len(chars)} cares.\n\n"
400
+ "💡 Usa els botons '🎨 Generar descripció' a sota de cada personatge per obtenir descripcions automàtiques amb Salamandra Vision."
401
+ )
402
+ else:
403
+ msg_detect.info("No s'han detectat cares en aquest vídeo.")
404
+
405
+ # Detect scenes
406
+ try:
407
+ scene_out = api.detect_scenes(
408
+ video_bytes=v["bytes"],
409
+ video_name=v["name"],
410
+ max_groups=scene_max_groups,
411
+ min_cluster_size=scene_min_cluster,
412
+ scene_sensitivity=scene_sensitivity,
413
+ frame_interval_sec=0.5,
414
+ )
415
+ scs = scene_out.get("scene_clusters") if isinstance(scene_out, dict) else None
416
+ if isinstance(scs, list):
417
+ st.session_state.scene_clusters = scs
418
+ else:
419
+ st.session_state.scene_clusters = []
420
+ except Exception:
421
+ st.session_state.scene_clusters = []
422
+ finally:
423
+ st.session_state.scene_detection_done = True
424
+
425
+ st.session_state.detect_done = True
426
+ msg_detect.success("✅ Processament completat!")
427
+ break
428
+ else:
429
+ progress_placeholder.empty()
430
+ msg_detect.warning(f"⏱️ El servidor no ha completat el job en {max_attempts * 5 // 60} minuts.")
431
+ except Exception as e:
432
+ msg_detect.error(f"Error inesperat: {e}")
433
+
434
+ # --- 3. Carruseles de cares ---
435
+ if st.session_state.get("characters_detected") is not None:
436
+ st.markdown("---")
437
+ n_face_clusters = len(st.session_state.get("characters_detected") or [])
438
+ st.subheader(f"🖼️ Cares — clústers: {n_face_clusters}")
439
+
440
+ if n_face_clusters == 0:
441
+ st.info("No s'han detectat clústers de cara en aquest clip.")
442
+
443
+ for idx, ch in enumerate(st.session_state.characters_detected or []):
444
+ try:
445
+ folder_name = Path(ch.get("folder") or "").name
446
+ except Exception:
447
+ folder_name = ""
448
+ char_id = ch.get("id") or folder_name or f"char{idx+1}"
449
+
450
+ def _safe_key(s: str) -> str:
451
+ k = re.sub(r"[^0-9a-zA-Z_]+", "_", s or "")
452
+ return k or f"cluster_{idx+1}"
453
+
454
+ key_prefix = _safe_key(f"char_{idx+1}_{char_id}")
455
+ if f"{key_prefix}_idx" not in st.session_state:
456
+ st.session_state[f"{key_prefix}_idx"] = 0
457
+ if f"{key_prefix}_discard" not in st.session_state:
458
+ st.session_state[f"{key_prefix}_discard"] = set()
459
+
460
+ faces_all = ch.get("face_files") or ([ch.get("image_url")] if ch.get("image_url") else [])
461
+ faces_all = [f for f in faces_all if f]
462
+ discard_set = st.session_state[f"{key_prefix}_discard"]
463
+ faces = [f for f in faces_all if f not in discard_set]
464
+
465
+ if not faces:
466
+ st.write(f"- {idx+1}. {ch.get('name','(sense nom)')} — sense imatges seleccionades")
467
+ continue
468
+
469
+ cur = st.session_state[f"{key_prefix}_idx"]
470
+ if cur >= len(faces):
471
+ cur = 0
472
+ st.session_state[f"{key_prefix}_idx"] = cur
473
+ fname = faces[cur]
474
+
475
+ if fname.startswith("/files/"):
476
+ img_url = f"{backend_base_url}{fname}"
477
+ else:
478
+ base = ch.get("image_url") or ""
479
+ base_dir = "/".join((base or "/").split("/")[:-1])
480
+ img_url = f"{backend_base_url}{base_dir}/{fname}" if base_dir else f"{backend_base_url}{fname}"
481
+
482
+ st.markdown(f"**{idx+1}. {ch.get('name','(sense nom)')} — {ch.get('num_faces', 0)} cares**")
483
+ c1, c2 = st.columns([1, 3])
484
+ with c1:
485
+ st.image(img_url, width=150)
486
+ st.caption(f"Imatge {cur+1}/{len(faces)}")
487
+ bcol1, bcol2, bcol3 = st.columns(3)
488
+ with bcol1:
489
+ if st.button("⬅️", key=f"prev_{key_prefix}", help="Anterior"):
490
+ st.session_state[f"{key_prefix}_idx"] = (cur - 1) % len(faces)
491
+ st.rerun()
492
+ with bcol2:
493
+ if st.button("🗑️", key=f"del_{key_prefix}", help="Eliminar aquesta imatge del clúster"):
494
+ st.session_state[f"{key_prefix}_discard"].add(fname)
495
+ new_list = [f for f in faces if f != fname]
496
+ new_idx = cur if cur < len(new_list) else 0
497
+ st.session_state[f"{key_prefix}_idx"] = new_idx
498
+ st.rerun()
499
+ with bcol3:
500
+ if st.button("➡️", key=f"next_{key_prefix}", help="Següent"):
501
+ st.session_state[f"{key_prefix}_idx"] = (cur + 1) % len(faces)
502
+ st.rerun()
503
+ with c2:
504
+ name_key = f"{key_prefix}_name"
505
+ desc_key = f"{key_prefix}_desc"
506
+ default_name = ch.get("name", "")
507
+ default_desc = ch.get("description", "")
508
+
509
+ if default_name and (name_key not in st.session_state or not st.session_state.get(name_key)):
510
+ st.session_state[name_key] = default_name
511
+ elif name_key not in st.session_state:
512
+ st.session_state[name_key] = default_name or ""
513
+
514
+ if default_desc and (desc_key not in st.session_state or not st.session_state.get(desc_key)):
515
+ st.session_state[desc_key] = default_desc
516
+ elif desc_key not in st.session_state:
517
+ st.session_state[desc_key] = default_desc or ""
518
+
519
+ pending_desc_key = f"{key_prefix}_pending_desc"
520
+ pending_name_key = f"{key_prefix}_pending_name"
521
+ if pending_desc_key in st.session_state:
522
+ if desc_key not in st.session_state:
523
+ st.session_state[desc_key] = ""
524
+ st.session_state[desc_key] = st.session_state[pending_desc_key]
525
+ del st.session_state[pending_desc_key]
526
+
527
+ if pending_name_key in st.session_state:
528
+ if name_key not in st.session_state:
529
+ st.session_state[name_key] = ""
530
+ if not st.session_state.get(name_key):
531
+ st.session_state[name_key] = st.session_state[pending_name_key]
532
+ del st.session_state[pending_name_key]
533
+
534
+ st.text_input("Nom del clúster", key=name_key)
535
+ st.text_area("Descripció", key=desc_key, height=80)
536
+
537
# Button handler: download the currently shown face image, ask Salamandra
# Vision for a description (and a suggested name), stash the results under the
# "pending" session-state keys and rerun so the text widgets pick them up.
if st.button("🎨 Generar descripció amb Salamandra Vision", key=f"svision_{key_prefix}"):
    with st.spinner("Generant descripció..."):
        from api_client import describe_image_with_svision
        import requests as _req

        try:
            resp = _req.get(img_url, timeout=10)
            if resp.status_code == 200:
                # The helper takes a file path, so the downloaded bytes are
                # spooled to a temp file. It is created with delete=False,
                # therefore it must be removed explicitly; doing that in
                # `finally` prevents a leaked file when the write or the
                # vision call raises.
                tmp_path = None
                try:
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
                        tmp_path = tmp.name
                        tmp.write(resp.content)

                    desc, name = describe_image_with_svision(tmp_path, is_face=True)
                finally:
                    if tmp_path is not None:
                        try:
                            os.unlink(tmp_path)
                        except OSError:
                            # Best-effort cleanup: a failed delete must not
                            # hide the description that was just produced.
                            pass

                if desc:
                    st.session_state[pending_desc_key] = desc
                    st.success("✅ Descripció generada!")
                else:
                    st.warning("⚠️ No s'ha pogut generar una descripció.")

                # Only propose the generated name when the user has not
                # already typed one for this cluster.
                if name and not st.session_state.get(name_key):
                    st.session_state[pending_name_key] = name

                st.rerun()
            else:
                st.error(f"No s'ha pogut descarregar la imatge (status: {resp.status_code})")
        except Exception as e:
            st.error(f"Error generant descripció: {e}")
566
+
567
+ # --- 4. Carruseles de veus ---
568
+ if st.session_state.get("audio_segments") is not None:
569
+ st.markdown("---")
570
+
571
+ used_names_home = []
572
+ used_names_dona = []
573
+ noms_home_all, noms_dona_all = get_all_catalan_names()
574
+
575
+ for ch in (st.session_state.characters_detected or []):
576
+ ch_name = ch.get("name", "")
577
+ if ch_name in noms_home_all:
578
+ used_names_home.append(ch_name)
579
+ elif ch_name in noms_dona_all:
580
+ used_names_dona.append(ch_name)
581
+
582
+ segs = st.session_state.audio_segments or []
583
+ vlabels = st.session_state.voice_labels or []
584
+ valid_indices = [i for i, l in enumerate(vlabels) if isinstance(l, int) and l >= 0]
585
+ clusters = {}
586
+ for i in valid_indices:
587
+ lbl = int(vlabels[i])
588
+ clusters.setdefault(lbl, []).append(i)
589
+ n_vclusters = len(clusters)
590
+ st.subheader(f"🎙️ Empremtes de veu — clústers: {n_vclusters}")
591
+ di = st.session_state.get("diarization_info") or {}
592
+ if isinstance(di, dict) and not di.get("diarization_ok", True):
593
+ st.warning("No s'ha pogut fer la diarització amb pyannote (s'ha aplicat un sol segment de reserva).")
594
+ if not segs:
595
+ st.info("No s'han detectat mostres de veu.")
596
+ elif n_vclusters == 0:
597
+ st.info("No s'han format clústers de veu.")
598
+ else:
599
+ vname = st.session_state.video_name_from_engine
600
+ for lbl, idxs in sorted(clusters.items(), key=lambda x: x[0]):
601
+ key_prefix = f"voice_{lbl:02d}"
602
+ if f"{key_prefix}_idx" not in st.session_state:
603
+ st.session_state[f"{key_prefix}_idx"] = 0
604
+ if f"{key_prefix}_discard" not in st.session_state:
605
+ st.session_state[f"{key_prefix}_discard"] = set()
606
+ discard_set = st.session_state[f"{key_prefix}_discard"]
607
+ files = []
608
+ for i in idxs:
609
+ clip_local = (segs[i] or {}).get("clip_path")
610
+ fname = os.path.basename(clip_local) if clip_local else None
611
+ if fname:
612
+ files.append(fname)
613
+ files = [f for f in files if f and f not in discard_set]
614
+ if not files:
615
+ st.write(f"- SPEAKER_{lbl:02d} — sense clips seleccionats")
616
+ continue
617
+ cur = st.session_state[f"{key_prefix}_idx"]
618
+ if cur >= len(files):
619
+ cur = 0
620
+ st.session_state[f"{key_prefix}_idx"] = cur
621
+ fname = files[cur]
622
+ audio_url = f"{backend_base_url}/audio/{vname}/{fname}" if (vname and fname) else None
623
+ st.markdown(f"**SPEAKER_{lbl:02d} — {len(files)} clips**")
624
+ c1, c2 = st.columns([1, 2])
625
+ with c1:
626
+ if audio_url:
627
+ st.audio(audio_url, format="audio/wav")
628
+ st.caption(f"Clip {cur+1}/{len(files)}")
629
+ bcol1, bcol2, bcol3 = st.columns(3)
630
+ with bcol1:
631
+ if st.button("⬅️", key=f"prev_{key_prefix}", help="Anterior"):
632
+ st.session_state[f"{key_prefix}_idx"] = (cur - 1) % len(files)
633
  st.rerun()
634
  with bcol2:
635
  if st.button("🗑️", key=f"del_{key_prefix}", help="Eliminar aquest clip del clúster"):