VeuReu committed (verified)
Commit 7b78506 · 1 Parent(s): fa2fa9c

Update api.py

Files changed (1)
  1. api.py +1282 -14
api.py CHANGED
@@ -1,18 +1,1286 @@
- import tempfile
- from svision_client import extract_scenes
- from PIL import Image
- from io import BytesIO
- import base64

- def pipeline_video_analysis(video_file, threshold=30.0, offset_frames=10, crop_ratio=0.1):
  """
- Full pipeline to process a video:
- 1. Extract scenes using the svision Space.
- 2. Return the scene images and the associated info.
  """

- # Call svision
- print("Calling svision to extract scenes...")
- images, scenes_info = extract_scenes(video_file, threshold, offset_frames, crop_ratio)
- print("Scenes extracted:", len(images))
- return images, scenes_info
1
+ from __future__ import annotations
2
+ from fastapi import FastAPI, UploadFile, File, Form, BackgroundTasks, HTTPException
3
+ from fastapi import Body
4
+ from fastapi.responses import JSONResponse, FileResponse
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ from pathlib import Path
7
+ import shutil
8
+ import uvicorn
9
+ import json
10
+ import uuid
11
+ from datetime import datetime
12
+ from typing import Dict
13
+ from enum import Enum
14
+ import os
15
+ import yaml
16
 
17
+ from video_processing import process_video_pipeline
18
+ from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments
19
+ from casting_loader import ensure_chroma, build_faces_index, build_voices_index
20
+ from narration_system import NarrationSystem
21
+ from llm_router import load_yaml, LLMRouter
22
+ from character_detection import detect_characters_from_video
23
+
24
+ from pipelines.audiodescription import generate as ad_generate
25
+
26
+ app = FastAPI(title="Veureu Engine API", version="0.2.0")
27
+ app.add_middleware(
28
+ CORSMiddleware,
29
+ allow_origins=["*"],
30
+ allow_credentials=True,
31
+ allow_methods=["*"],
32
+ allow_headers=["*"],
33
+ )
34
+
35
+ ROOT = Path("/tmp/veureu")
36
+ ROOT.mkdir(parents=True, exist_ok=True)
37
+ TEMP_ROOT = Path("/tmp/temp")
38
+ TEMP_ROOT.mkdir(parents=True, exist_ok=True)
39
+ VIDEOS_ROOT = Path("/tmp/data/videos")
40
+ VIDEOS_ROOT.mkdir(parents=True, exist_ok=True)
41
+ IDENTITIES_ROOT = Path("/tmp/characters")
42
+ IDENTITIES_ROOT.mkdir(parents=True, exist_ok=True)
43
+
44
+ # Asynchronous job system
45
+ class JobStatus(str, Enum):
46
+ QUEUED = "queued"
47
+ PROCESSING = "processing"
48
+ DONE = "done"
49
+ FAILED = "failed"
50
+
51
+ jobs: Dict[str, dict] = {}
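+ # Note: jobs live only in this process's memory; restarting the service clears all job state,
+ # so this store assumes a single-worker deployment.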
52
+
53
+ def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[str, str]:
54
+ """
55
+ Llama al space svision para describir una imagen (usado en generación de AD).
56
+
57
+ Args:
58
+ image_path: Ruta absoluta a la imagen
59
+ is_face: True si es una cara, False si es una escena
60
+
61
+ Returns:
62
+ tuple (descripción_completa, nombre_abreviado)
63
+ """
64
+ try:
65
+ from pathlib import Path as _P
66
+ import yaml
67
+ from llm_router import LLMRouter
68
+
69
+ # Load the configuration
70
+ config_path = _P(__file__).parent / "config.yaml"
71
+ if not config_path.exists():
72
+ print(f"[svision] Config no encontrado: {config_path}")
73
+ return ("", "")
74
+
75
+ with open(config_path, 'r', encoding='utf-8') as f:
76
+ cfg = yaml.safe_load(f) or {}
77
+
78
+ router = LLMRouter(cfg)
79
+
80
+ # Different prompt context for faces vs. scenes
81
+ if is_face:
82
+ context = {
83
+ "task": "describe_person",
84
+ "instructions": "Descriu la persona en la imatge. Inclou: edat aproximada (jove/adult), gènere, característiques físiques notables (ulleres, barba, bigoti, etc.), expressió i vestimenta.",
85
+ "max_tokens": 256
86
+ }
87
+ else:
88
+ context = {
89
+ "task": "describe_scene",
90
+ "instructions": "Descriu aquesta escena breument en 2-3 frases: tipus de localització i elements principals.",
91
+ "max_tokens": 128
92
+ }
93
+
94
+ # Call svision
95
+ descriptions = router.vision_describe([str(image_path)], context=context, model="salamandra-vision")
96
+ full_description = descriptions[0] if descriptions else ""
97
+
98
+ if not full_description:
99
+ return ("", "")
100
+
101
+ print(f"[svision] Descripció generada: {full_description[:100]}...")
102
+
103
+ return (full_description, "")
104
+
105
+ except Exception as e:
106
+ print(f"[svision] Error al descriure imatge: {e}")
107
+ import traceback
108
+ traceback.print_exc()
109
+ return ("", "")
110
+
111
+ def normalize_face_lighting(image):
112
+ """
113
+ Normaliza el brillo de una imagen de cara usando técnicas combinadas:
114
+ 1. CLAHE para ecualización adaptativa
115
+ 2. Normalización de rango para homogeneizar brillo general
116
+
117
+ Esto reduce el impacto de diferentes condiciones de iluminación en los embeddings
118
+ y en la visualización de las imágenes.
119
+
120
+ Args:
121
+ image: Imagen BGR (OpenCV format)
122
+
123
+ Returns:
124
+ Imagen normalizada en el mismo formato
125
+ """
126
+ import cv2
127
+ import numpy as np
128
+
129
+ # Step 1: Convert to LAB color space (more robust to lighting changes)
130
+ lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
131
+ l, a, b = cv2.split(lab)
132
+
133
+ # Step 2: Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) to the L channel
+ # A higher clipLimit gives a more aggressive normalization
135
+ clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
136
+ l_clahe = clahe.apply(l)
137
+
138
+ # Step 3: Normalize the range of the L channel to ensure a uniform distribution
+ # This guarantees that all images end up with a similar brightness range
140
+ l_min, l_max = l_clahe.min(), l_clahe.max()
141
+ if l_max > l_min:
142
+ # Stretch the histogram to the full [0, 255] range
143
+ l_normalized = ((l_clahe - l_min) * 255.0 / (l_max - l_min)).astype(np.uint8)
144
+ else:
145
+ l_normalized = l_clahe
146
+
147
+ # Step 4: Apply a light blur to reduce noise introduced by the normalization
148
+ l_normalized = cv2.GaussianBlur(l_normalized, (3, 3), 0)
149
+
150
+ # Recombine the channels
151
+ lab_normalized = cv2.merge([l_normalized, a, b])
152
+
153
+ # Convert back to BGR
154
+ normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2BGR)
155
+ return normalized
156
+
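+ # Context note: process_video_job passes every sampled frame through normalize_face_lighting()
+ # before detection, so the crops written to faces_raw/ share a comparable brightness range.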
157
+ def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
158
+ """
159
+ Clustering jerárquico con silhouette score para encontrar automáticamente el mejor número de clusters.
160
+ Selecciona automáticamente el mejor número de clusters (hasta max_groups) usando silhouette score.
161
+ Filtra clusters con menos de min_cluster_size muestras (marcados como -1/ruido).
162
+
163
+ Args:
164
+ X: Array de embeddings (N, D)
165
+ max_groups: Número máximo de clusters a formar
166
+ min_cluster_size: Tamaño mínimo de cluster válido
167
+ sensitivity: Sensibilidad del clustering (0.0-1.0)
168
+ - 0.0 = muy agresivo (menos clusters)
169
+ - 0.5 = balanceado (recomendado)
170
+ - 1.0 = muy permisivo (más clusters)
171
+
172
+ Returns:
173
+ Array de labels (N,) donde -1 indica ruido
174
+ """
175
+ import numpy as np
176
+ from scipy.cluster.hierarchy import linkage, fcluster
177
+ from sklearn.metrics import silhouette_score
178
+ from collections import Counter
179
+
180
+ if len(X) == 0:
181
+ return np.array([])
182
+
183
+ if len(X) < min_cluster_size:
184
+ # With fewer samples than the minimum, everything is noise
185
+ return np.full(len(X), -1, dtype=int)
186
+
187
+ # Average linkage (more flexible than Ward and less sensitive to outliers)
+ # This helps group the same person across different angles/expressions
+ Z = linkage(X, method='average', metric='cosine') # cosine distance for embeddings
190
+
191
+ # Find the optimal number of clusters using the silhouette score
192
+ best_n_clusters = 2
193
+ best_score = -1
194
+
195
+ # Try different numbers of clusters (from 2 to max_groups)
+ max_to_try = min(max_groups, len(X) - 1) # There cannot be more clusters than samples
197
+
198
+ if max_to_try >= 2:
199
+ for n_clusters in range(2, max_to_try + 1):
200
+ trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
201
+
202
+ # Count how many valid clusters would remain after filtering
203
+ trial_counts = Counter(trial_labels)
204
+ valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
205
+
206
+ # Only evaluate if there are at least 2 valid clusters
207
+ if valid_clusters >= 2:
208
+ try:
209
+ score = silhouette_score(X, trial_labels, metric='cosine')
210
+ # Dynamic penalty based on sensitivity:
+ # - sensitivity=0.0 → penalty=0.14 (very aggressive, fewer clusters)
+ # - sensitivity=0.5 → penalty=0.075 (balanced, recommended)
+ # - sensitivity=1.0 → penalty=0.01 (permissive, more clusters)
214
+ penalty = 0.14 - (sensitivity * 0.13)
215
+ adjusted_score = score - (n_clusters * penalty)
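+ # e.g. with sensitivity=0.5 the penalty is 0.075 per cluster, so a 4-cluster split only wins
+ # over a 3-cluster split if its raw silhouette is at least 0.075 higher.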
216
+
217
+ if adjusted_score > best_score:
218
+ best_score = adjusted_score
219
+ best_n_clusters = n_clusters
220
+ except Exception:
+ pass # If the score computation fails, skip this configuration
222
+
223
+ # Use the optimal number of clusters found
224
+ penalty = 0.14 - (sensitivity * 0.13)
225
+ print(f"Clustering óptimo: {best_n_clusters} clusters (de máximo {max_groups}), sensitivity={sensitivity:.2f}, penalty={penalty:.3f}, silhouette={best_score:.3f}")
226
+ labels = fcluster(Z, t=best_n_clusters, criterion='maxclust')
227
+
228
+ # fcluster returns 1-indexed labels, convert to 0-indexed
229
+ labels = labels - 1
230
+
231
+ # Filter out small clusters
232
+ label_counts = Counter(labels)
233
+ filtered_labels = []
234
+ for lbl in labels:
235
+ if label_counts[lbl] >= min_cluster_size:
236
+ filtered_labels.append(lbl)
237
+ else:
238
+ filtered_labels.append(-1) # Noise
239
+
240
+ return np.array(filtered_labels, dtype=int)
241
+
242
+ @app.get("/")
243
+ def root():
244
+ return {"ok": True, "service": "veureu-engine"}
245
+
246
+ @app.post("/process_video")
247
+ async def process_video(
248
+ video_file: UploadFile = File(...),
249
+ config_path: str = Form("config.yaml"),
250
+ out_root: str = Form("results"),
251
+ db_dir: str = Form("chroma_db"),
252
+ ):
253
+ tmp_video = ROOT / video_file.filename
254
+ with tmp_video.open("wb") as f:
255
+ shutil.copyfileobj(video_file.file, f)
256
+ result = process_video_pipeline(str(tmp_video), config_path=config_path, out_root=out_root, db_dir=db_dir)
257
+ return JSONResponse(result)
258
+
259
+ @app.post("/create_initial_casting")
260
+ async def create_initial_casting(
261
+ background_tasks: BackgroundTasks,
262
+ video: UploadFile = File(...),
263
+ max_groups: int = Form(default=3),
264
+ min_cluster_size: int = Form(default=3),
265
+ face_sensitivity: float = Form(default=0.5),
266
+ voice_max_groups: int = Form(default=3),
267
+ voice_min_cluster_size: int = Form(default=3),
268
+ voice_sensitivity: float = Form(default=0.5),
269
+ max_frames: int = Form(default=100),
270
+ ):
271
+ """
272
+ Crea un job para procesar el vídeo de forma asíncrona usando clustering jerárquico.
273
+ Devuelve un job_id inmediatamente.
274
+ """
275
+ # Save the video into the data folder
276
+ video_name = Path(video.filename).stem
277
+ dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
278
+ with dst_video.open("wb") as f:
279
+ shutil.copyfileobj(video.file, f)
280
+
281
+ # Create a unique job_id
282
+ job_id = str(uuid.uuid4())
283
+
284
+ # Initialize the job
285
+ jobs[job_id] = {
286
+ "id": job_id,
287
+ "status": JobStatus.QUEUED,
288
+ "video_path": str(dst_video),
289
+ "video_name": video_name,
290
+ "max_groups": int(max_groups),
291
+ "min_cluster_size": int(min_cluster_size),
292
+ "face_sensitivity": float(face_sensitivity),
293
+ "voice_max_groups": int(voice_max_groups),
294
+ "voice_min_cluster_size": int(voice_min_cluster_size),
295
+ "voice_sensitivity": float(voice_sensitivity),
296
+ "max_frames": int(max_frames),
297
+ "created_at": datetime.now().isoformat(),
298
+ "results": None,
299
+ "error": None
300
+ }
301
+
302
+ print(f"[{job_id}] Job creado para vídeo: {video_name}")
303
+
304
+ # Start processing in the background
305
+ background_tasks.add_task(process_video_job, job_id)
306
+
307
+ # Return the job_id immediately
308
+ return {"job_id": job_id}
309
+
310
+ @app.get("/jobs/{job_id}/status")
311
+ def get_job_status(job_id: str):
312
+ """
313
+ Devuelve el estado actual de un job.
314
+ El UI hace polling de este endpoint cada 5 segundos.
315
+ """
316
+ if job_id not in jobs:
317
+ raise HTTPException(status_code=404, detail="Job not found")
318
+
319
+ job = jobs[job_id]
320
+
321
+ # Normalize the status to a string
322
+ status_value = job["status"].value if isinstance(job["status"], JobStatus) else str(job["status"])
323
+ response = {"status": status_value}
324
+
325
+ # Include results when present (avoids race conditions)
326
+ if job.get("results") is not None:
327
+ response["results"] = job["results"]
328
+
329
+ # Include the error if there is one
330
+ if job.get("error"):
331
+ response["error"] = job["error"]
332
+
333
+ return response
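+ # Example responses: {"status": "processing"} while the job runs,
+ # {"status": "done", "results": {...}} once process_video_job has stored its output.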
334
+
335
+ @app.get("/files/{video_name}/{char_id}/{filename}")
336
+ def serve_character_file(video_name: str, char_id: str, filename: str):
337
+ """
338
+ Sirve archivos estáticos de personajes (imágenes).
339
+ Ejemplo: /files/dif_catala_1/char1/representative.jpg
340
+ """
341
+ # Faces are stored under /tmp/temp/<video>/characters/<char_id>/<filename>
342
+ file_path = TEMP_ROOT / video_name / "characters" / char_id / filename
343
+
344
+ if not file_path.exists():
345
+ raise HTTPException(status_code=404, detail="File not found")
346
+
347
+ return FileResponse(file_path)
348
+
349
+ @app.get("/audio/{video_name}/{filename}")
350
+ def serve_audio_file(video_name: str, filename: str):
351
+ file_path = TEMP_ROOT / video_name / "clips" / filename
352
+ if not file_path.exists():
353
+ raise HTTPException(status_code=404, detail="File not found")
354
+ return FileResponse(file_path)
355
+
356
+ def process_video_job(job_id: str):
357
+ """
358
+ Procesa el vídeo de forma asíncrona.
359
+ Esta función se ejecuta en background.
360
+ """
361
+ try:
362
+ job = jobs[job_id]
363
+ print(f"[{job_id}] Iniciando procesamiento...")
364
+
365
+ # Switch the status to processing
366
+ job["status"] = JobStatus.PROCESSING
367
+
368
+ video_path = job["video_path"]
369
+ video_name = job["video_name"]
370
+ max_groups = int(job.get("max_groups", 5))
371
+ min_cluster_size = int(job.get("min_cluster_size", 3))
372
+ face_sensitivity = float(job.get("face_sensitivity", 0.5))
373
+ v_max_groups = int(job.get("voice_max_groups", 5))
374
+ v_min_cluster = int(job.get("voice_min_cluster_size", 3))
375
+ voice_sensitivity = float(job.get("voice_sensitivity", 0.5))
376
+
377
+ # Create the folder structure
378
+ base = TEMP_ROOT / video_name
379
+ base.mkdir(parents=True, exist_ok=True)
380
+
381
+ print(f"[{job_id}] Directorio base: {base}")
382
+
383
+ # Face detection and embeddings (CPU), aligned with 'originales'
384
+ try:
385
+ print(f"[{job_id}] Iniciando detección de personajes (CPU, originales)...")
386
+ print(f"[{job_id}] *** Normalización de brillo ACTIVADA ***")
387
+ print(f"[{job_id}] - CLAHE adaptativo (clipLimit=3.0)")
388
+ print(f"[{job_id}] - Estiramiento de histograma")
389
+ print(f"[{job_id}] - Suavizado Gaussiano")
390
+ print(f"[{job_id}] Esto homogeneizará el brillo de todas las caras detectadas")
391
+ import cv2
392
+ import numpy as np
393
+ try:
394
+ import face_recognition # CPU
395
+ _use_fr = True
396
+ print(f"[{job_id}] face_recognition disponible: CPU")
397
+ except Exception:
398
+ face_recognition = None # type: ignore
399
+ _use_fr = False
400
+ print(f"[{job_id}] face_recognition no disponible. Intentando DeepFace fallback.")
401
+ try:
402
+ from deepface import DeepFace # type: ignore
403
+ except Exception:
404
+ DeepFace = None # type: ignore
405
+
406
+ cap = cv2.VideoCapture(video_path)
407
+ if not cap.isOpened():
408
+ raise RuntimeError("No se pudo abrir el vídeo para extracción de caras")
409
+ fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
410
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
411
+ max_samples = job.get("max_frames", 100)
412
+ # Evenly spaced frame indices
413
+ if total_frames > 0:
414
+ frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=min(max_samples, max(1, total_frames)), dtype=int).tolist()))
415
+ else:
416
+ frame_indices = []
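+ # e.g. a 2500-frame video with max_samples=100 samples roughly every 25th frame (indices 0 ... 2499).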
417
+ print(f"[{job_id}] Total frames: {total_frames}, FPS: {fps:.2f}, Muestreando {len(frame_indices)} frames equiespaciados (máx {max_samples})")
418
+
419
+ # Outputs
420
+ faces_root = base / "faces_raw"
421
+ faces_root.mkdir(parents=True, exist_ok=True)
422
+ embeddings: list[list[float]] = []
423
+ crops_meta: list[dict] = []
424
+
425
+ saved_count = 0
426
+ frames_processed = 0
427
+ frames_with_faces = 0
428
+ for frame_idx in frame_indices:
429
+ cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
430
+ ret2, frame = cap.read()
431
+ if not ret2:
432
+ continue
433
+ frames_processed += 1
434
+ # Normalize the lighting before processing
435
+ frame_normalized = normalize_face_lighting(frame)
436
+ rgb = cv2.cvtColor(frame_normalized, cv2.COLOR_BGR2RGB)
437
+
438
+ if _use_fr and face_recognition is not None:
439
+ boxes = face_recognition.face_locations(rgb, model="hog") # CPU HOG
440
+ encs = face_recognition.face_encodings(rgb, boxes)
441
+ if boxes:
442
+ frames_with_faces += 1
443
+ print(f"[{job_id}] Frame {frame_idx}: {len(boxes)} cara(s) detectada(s) con face_recognition")
444
+ for (top, right, bottom, left), e in zip(boxes, encs):
445
+ crop = frame_normalized[top:bottom, left:right]
446
+ if crop.size == 0:
447
+ continue
448
+ fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
449
+ cv2.imwrite(str(faces_root / fn), crop)
450
+ # Normalize the embedding
451
+ e = np.array(e, dtype=float)
452
+ e = e / (np.linalg.norm(e) + 1e-9)
453
+ embeddings.append(e.astype(float).tolist())
454
+ crops_meta.append({
455
+ "file": fn,
456
+ "frame": frame_idx,
457
+ "box": [int(top), int(right), int(bottom), int(left)],
458
+ })
459
+ saved_count += 1
460
+ else:
461
+ # DeepFace fallback with bounding-box detection via Haar cascade (OpenCV)
462
+ if DeepFace is None:
463
+ pass
464
+ else:
465
+ try:
466
+ gray = cv2.cvtColor(frame_normalized, cv2.COLOR_BGR2GRAY)
467
+ try:
468
+ haar_path = getattr(cv2.data, 'haarcascades', None) or ''
469
+ face_cascade = cv2.CascadeClassifier(os.path.join(haar_path, 'haarcascade_frontalface_default.xml'))
470
+ except Exception:
471
+ face_cascade = None
472
+ boxes_haar = []
473
+ if face_cascade is not None and not face_cascade.empty():
474
+ # Stricter parameters to avoid false positives
475
+ faces_haar = face_cascade.detectMultiScale(gray, scaleFactor=1.08, minNeighbors=5, minSize=(50, 50))
476
+ for (x, y, w, h) in faces_haar:
477
+ top, left, bottom, right = max(0, y), max(0, x), min(frame.shape[0], y+h), min(frame.shape[1], x+w)
478
+ boxes_haar.append((top, right, bottom, left))
479
+
480
+ # If Haar detects nothing, try DeepFace directly
481
+ if not boxes_haar:
482
+ try:
483
+ tmp_detect = faces_root / f"detect_{frame_idx:06d}.jpg"
484
+ cv2.imwrite(str(tmp_detect), frame_normalized)
485
+ detect_result = DeepFace.extract_faces(img_path=str(tmp_detect), detector_backend='opencv', enforce_detection=False)
486
+ for det in detect_result:
487
+ facial_area = det.get('facial_area', {})
488
+ if facial_area:
489
+ x, y, w, h = facial_area.get('x', 0), facial_area.get('y', 0), facial_area.get('w', 0), facial_area.get('h', 0)
490
+ # Validate that this is a real bbox, not the whole frame
+ # If the bbox is practically the whole frame, discard it
492
+ is_full_frame = (x <= 5 and y <= 5 and w >= frame.shape[1] - 10 and h >= frame.shape[0] - 10)
493
+ # Minimum 50x50 bbox to filter out small false positives
494
+ if w > 50 and h > 50 and not is_full_frame:
495
+ top, left, bottom, right = max(0, y), max(0, x), min(frame.shape[0], y+h), min(frame.shape[1], x+w)
496
+ boxes_haar.append((top, right, bottom, left))
497
+ tmp_detect.unlink(missing_ok=True)
498
+ except Exception as _e_detect:
499
+ print(f"[{job_id}] Frame {frame_idx}: DeepFace extract_faces error: {_e_detect}")
500
+
501
+ if boxes_haar:
502
+ frames_with_faces += 1
503
+ print(f"[{job_id}] Frame {frame_idx}: {len(boxes_haar)} cara(s) detectada(s) con Haar/DeepFace")
504
+
505
+ for (top, right, bottom, left) in boxes_haar:
506
+ crop = frame_normalized[top:bottom, left:right]
507
+ if crop.size == 0:
508
+ continue
509
+ fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
510
+ crop_path = faces_root / fn
511
+ cv2.imwrite(str(crop_path), crop)
512
+ reps = DeepFace.represent(img_path=str(crop_path), model_name="Facenet512", enforce_detection=False)
513
+ for r in (reps or []):
514
+ emb = r.get("embedding") if isinstance(r, dict) else r
515
+ if emb is None:
516
+ continue
517
+ emb = np.array(emb, dtype=float)
518
+ emb = emb / (np.linalg.norm(emb) + 1e-9)
519
+ embeddings.append(emb.astype(float).tolist())
520
+ crops_meta.append({
521
+ "file": fn,
522
+ "frame": frame_idx,
523
+ "box": [int(top), int(right), int(bottom), int(left)],
524
+ })
525
+ saved_count += 1
526
+ except Exception as _e_df:
527
+ print(f"[{job_id}] DeepFace fallback error: {_e_df}")
528
+ cap.release()
529
+
530
+ print(f"[{job_id}] ✓ Frames procesados: {frames_processed}/{len(frame_indices)}")
531
+ print(f"[{job_id}] ✓ Frames con caras: {frames_with_faces}")
532
+ print(f"[{job_id}] ✓ Caras detectadas (embeddings): {len(embeddings)}")
533
+
534
+ # Hierarchical clustering of faces
535
+ if embeddings:
536
+ Xf = np.array(embeddings)
537
+ labels = hierarchical_cluster_with_min_size(Xf, max_groups, min_cluster_size, face_sensitivity).tolist()
538
+ print(f"[{job_id}] Clustering jerárquico de caras: {len(set([l for l in labels if l >= 0]))} clusters")
539
+ else:
540
+ labels = []
541
+
542
+ # Build per-cluster folders with DeepFace validation
543
+ from face_classifier import validate_and_classify_face, get_random_catalan_name_by_gender, FACE_CONFIDENCE_THRESHOLD
544
+
545
+ characters_validated = []
546
+ cluster_map: dict[int, list[int]] = {}
547
+ for i, lbl in enumerate(labels):
548
+ if isinstance(lbl, int) and lbl >= 0:
549
+ cluster_map.setdefault(lbl, []).append(i)
550
+
551
+ chars_dir = base / "characters"
552
+ chars_dir.mkdir(parents=True, exist_ok=True)
553
+ import shutil as _sh
554
+
555
+ original_cluster_count = len(cluster_map)
556
+ print(f"[{job_id}] Procesando {original_cluster_count} clusters detectados...")
557
+
558
+ for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
559
+ char_id = f"char_{ci:02d}"
560
+
561
+ # STEP 1: Sort faces by bounding-box area (better quality)
562
+ face_detections = []
563
+ for j in idxs:
564
+ meta = crops_meta[j]
565
+ box = meta.get("box", [0, 0, 0, 0])
566
+ if len(box) >= 4:
567
+ top, right, bottom, left = box
568
+ w = abs(right - left)
569
+ h = abs(bottom - top)
570
+ area_score = w * h
571
+ else:
572
+ area_score = 0
573
+
574
+ face_detections.append({
575
+ 'index': j,
576
+ 'score': area_score,
577
+ 'file': meta['file'],
578
+ 'box': box
579
+ })
580
+
581
+ # Sort by descending score
582
+ face_detections_sorted = sorted(
583
+ face_detections,
584
+ key=lambda x: x['score'],
585
+ reverse=True
586
+ )
587
+
588
+ if not face_detections_sorted:
589
+ print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: sense deteccions, eliminant")
590
+ continue
591
+
592
+ # STEP 2: Validate ONLY the best face of the cluster
593
+ best_face = face_detections_sorted[0]
594
+ best_face_path = faces_root / best_face['file']
595
+
596
+ print(f"[{job_id}] [VALIDATION] Cluster {char_id}: validant millor cara (bbox_area={best_face['score']:.0f}px²)")
597
+ print(f"[{job_id}] [VALIDATION] Cluster {char_id}: millor cara path={best_face_path}")
598
+ print(f"[{job_id}] [VALIDATION] ▶▶▶ CRIDANT validate_and_classify_face() ◀◀◀")
599
+
600
+ validation = validate_and_classify_face(str(best_face_path))
601
+
602
+ print(f"[{job_id}] [VALIDATION] ▶▶▶ validate_and_classify_face() RETORNAT ◀◀◀")
603
+
604
+ if not validation:
605
+ print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: error en validació DeepFace, eliminant cluster")
606
+ continue
607
+
608
+ # Show detailed DeepFace results
609
+ print(f"[{job_id}] [DEEPFACE RESULT] Cluster {char_id}:")
610
+ print(f"[{job_id}] - is_valid_face: {validation['is_valid_face']}")
611
+ print(f"[{job_id}] - face_confidence: {validation['face_confidence']:.3f}")
612
+ print(f"[{job_id}] - man_prob: {validation['man_prob']:.3f}")
613
+ print(f"[{job_id}] - woman_prob: {validation['woman_prob']:.3f}")
614
+ print(f"[{job_id}] - gender_diff: {abs(validation['man_prob'] - validation['woman_prob']):.3f}")
615
+ print(f"[{job_id}] - gender_assigned: {validation['gender']}")
616
+ print(f"[{job_id}] - gender_confidence: {validation['gender_confidence']:.3f}")
617
+
618
+ # STEP 3: Check whether it is a valid face
619
+ if not validation['is_valid_face'] or validation['face_confidence'] < FACE_CONFIDENCE_THRESHOLD:
620
+ print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: NO ES UNA CARA VÁLIDA (face_confidence={validation['face_confidence']:.3f} < threshold={FACE_CONFIDENCE_THRESHOLD}), eliminant tot el clúster")
621
+ continue
622
+
623
+ # STEP 4: It is a valid face! Create the folder
624
+ out_dir = chars_dir / char_id
625
+ out_dir.mkdir(parents=True, exist_ok=True)
626
+
627
+ # STEP 5: Limit the faces to show (first half + 1)
628
+ total_faces = len(face_detections_sorted)
629
+ max_faces_to_show = (total_faces // 2) + 1
630
+ face_detections_limited = face_detections_sorted[:max_faces_to_show]
631
+
632
+ # Copy only the limited set of faces
633
+ files = []
634
+ face_files_urls = []
635
+ for k, face_det in enumerate(face_detections_limited):
636
+ fname = face_det['file']
637
+ src = faces_root / fname
638
+ dst = out_dir / fname
639
+ try:
640
+ _sh.copy2(src, dst)
641
+ files.append(fname)
642
+ face_files_urls.append(f"/files/{video_name}/{char_id}/{fname}")
643
+ except Exception:
644
+ pass
645
+
646
+ # Representative image (the best one)
647
+ rep = files[0] if files else None
648
+ if rep:
649
+ rep_src = out_dir / rep
650
+ rep_dst = out_dir / "representative.jpg"
651
+ try:
652
+ _sh.copy2(rep_src, rep_dst)
653
+ except Exception:
654
+ pass
655
+
656
+ # STEP 6: Generate a name according to gender
657
+ gender = validation['gender']
658
+ character_name = get_random_catalan_name_by_gender(gender, char_id)
659
+
660
+ print(f"[{job_id}] [NAME GENERATION] Cluster {char_id}:")
661
+ print(f"[{job_id}] - Gender detectado: {gender}")
662
+ print(f"[{job_id}] - Nombre asignado: {character_name}")
663
+ print(f"[{job_id}] - Seed usado: {char_id}")
664
+
665
+ character_data = {
666
+ "id": char_id,
667
+ "name": character_name,
668
+ "gender": gender,
669
+ "gender_confidence": validation['gender_confidence'],
670
+ "face_confidence": validation['face_confidence'],
671
+ "man_prob": validation['man_prob'],
672
+ "woman_prob": validation['woman_prob'],
673
+ "folder": str(out_dir),
674
+ "num_faces": len(files),
675
+ "total_faces_detected": total_faces,
676
+ "image_url": f"/files/{video_name}/{char_id}/representative.jpg" if rep else "",
677
+ "face_files": face_files_urls,
678
+ }
679
+
680
+ characters_validated.append(character_data)
681
+
682
+ print(f"[{job_id}] [VALIDATION] ✓ Cluster {char_id}: CARA VÁLIDA!")
683
+ print(f"[{job_id}] Nombre: {character_name}")
684
+ print(f"[{job_id}] Género: {gender} (man={validation['man_prob']:.3f}, woman={validation['woman_prob']:.3f})")
685
+ print(f"[{job_id}] Confianza género: {validation['gender_confidence']:.3f}")
686
+ print(f"[{job_id}] Confianza cara: {validation['face_confidence']:.3f}")
687
+ print(f"[{job_id}] Caras mostradas: {len(files)}/{total_faces}")
688
+ print(f"[{job_id}] Imagen representativa: {best_face_path.name}")
689
+
690
+ # Final statistics
691
+ eliminated_count = original_cluster_count - len(characters_validated)
692
+ print(f"[{job_id}] [VALIDATION] Total: {len(characters_validated)} clústers vàlids "
693
+ f"(eliminats {eliminated_count} falsos positius)")
694
+
695
+ characters = characters_validated
696
+
697
+ # Write analysis.json compatible with 'originales'
698
+ analysis = {
699
+ "caras": [{"embeddings": e} for e in embeddings],
700
+ "voices": [],
701
+ "escenas": [],
702
+ }
703
+ analysis_path = str(base / "analysis.json")
704
+ with open(analysis_path, "w", encoding="utf-8") as f:
705
+ json.dump(analysis, f, ensure_ascii=False)
706
+
707
+ face_labels = labels
708
+ num_face_embeddings = len(embeddings)
709
+
710
+ print(f"[{job_id}] Personajes detectados: {len(characters)}")
711
+ for char in characters:
712
+ print(f"[{job_id}] - {char['name']}: {char['num_faces']} caras")
713
+
714
+ # Enrich the character info with the actual list of available images
715
+ try:
716
+ import glob, os
717
+ for ch in characters:
718
+ folder = ch.get("folder")
719
+ face_files = []
720
+ if folder and os.path.isdir(folder):
721
+ # support face_* patterns and jpg/png extensions
722
+ patterns = ["face_*.jpg", "face_*.png"]
723
+ files = []
724
+ for pat in patterns:
725
+ files.extend(glob.glob(os.path.join(folder, pat)))
726
+ # if there is no face_*, take any jpg/png so the list is not left empty
727
+ if not files:
728
+ files.extend(glob.glob(os.path.join(folder, "*.jpg")))
729
+ files.extend(glob.glob(os.path.join(folder, "*.png")))
730
+ # normalize to relative file names
731
+ face_files = sorted({os.path.basename(p) for p in files})
732
+ # Ensure representative.(jpg|png) comes first if it exists
733
+ for rep_name in ("representative.jpg", "representative.png"):
734
+ rep_path = os.path.join(folder, rep_name)
735
+ if os.path.exists(rep_path):
736
+ if rep_name in face_files:
737
+ face_files.remove(rep_name)
738
+ face_files.insert(0, rep_name)
739
+ ch["face_files"] = face_files
740
+ # Adjust num_faces if there is a discrepancy
741
+ if face_files:
742
+ ch["num_faces"] = len(face_files)
743
+ except Exception as _e:
744
+ print(f"[{job_id}] WARN - No se pudo enumerar face_files: {_e}")
745
+
746
+ # Audio processing: diarization, ASR and voice embeddings
747
+ try:
748
+ cfg = load_yaml("config.yaml")
749
+ audio_segments, srt_unmod, full_txt, diar_info, connection_logs = process_audio_for_video(video_path, base, cfg, voice_collection=None)
750
+ # Log the connection events to the engine console
751
+ try:
752
+ for ev in (connection_logs or []):
753
+ msg = ev.get("message") if isinstance(ev, dict) else None
754
+ if msg:
755
+ print(f"[{job_id}] {msg}")
756
+ except Exception:
757
+ pass
758
+ except Exception as e_audio:
759
+ import traceback
760
+ print(f"[{job_id}] WARN - Audio pipeline failed: {e_audio}\n{traceback.format_exc()}")
761
+ audio_segments, srt_unmod, full_txt = [], None, ""
762
+ diar_info = {"diarization_ok": False, "error": str(e_audio)}
763
+ connection_logs = []
764
+
765
+ # Fallback: if there are no audio segments, create a minimal one from the full audio
766
+ if not audio_segments:
767
+ try:
768
+ from pathlib import Path as _P
769
+ from pydub import AudioSegment as _AS
770
+ wav_out = extract_audio_ffmpeg(video_path, base / f"{_P(video_path).stem}.wav", sr=16000)
771
+ audio = _AS.from_wav(wav_out)
772
+ clips_dir = base / "clips"
773
+ clips_dir.mkdir(parents=True, exist_ok=True)
774
+ cp = clips_dir / "segment_000.wav"
775
+ audio.export(cp, format="wav")
776
+ emb_list = embed_voice_segments([str(cp)])
777
+ audio_segments = [{
778
+ "segment": 0,
779
+ "start": 0.0,
780
+ "end": float(len(audio) / 1000.0),
781
+ "speaker": "SPEAKER_00",
782
+ "text": "",
783
+ "voice_embedding": emb_list[0] if emb_list else [],
784
+ "clip_path": str(cp),
785
+ "lang": "ca",
786
+ "lang_prob": 1.0,
787
+ }]
788
+ except Exception as _efb:
789
+ print(f"[{job_id}] WARN - Audio minimal fallback failed: {_efb}")
790
+
791
+ # Hierarchical clustering of voices over the valid embeddings
792
+ import numpy as np
793
+ voice_embeddings = [seg.get("voice_embedding") for seg in audio_segments if seg.get("voice_embedding")]
794
+ if voice_embeddings:
795
+ try:
796
+ Xv = np.array(voice_embeddings)
797
+ v_labels = hierarchical_cluster_with_min_size(Xv, v_max_groups, v_min_cluster, voice_sensitivity).tolist()
798
+ print(f"[{job_id}] Clustering jerárquico de voz: {len(set([l for l in v_labels if l >= 0]))} clusters")
799
+ except Exception as _e:
800
+ print(f"[{job_id}] WARN - Voice clustering failed: {_e}")
801
+ v_labels = []
802
+ else:
803
+ v_labels = []
804
+
805
+ # Store the results first and only then mark the job as done (avoids races)
806
+ job["results"] = {
807
+ "characters": characters,
808
+ "num_characters": len(characters),
809
+ "analysis_path": analysis_path,
810
+ "base_dir": str(base),
811
+ "face_labels": face_labels,
812
+ "num_face_embeddings": num_face_embeddings,
813
+ "audio_segments": audio_segments,
814
+ "srt_unmodified": srt_unmod,
815
+ "full_transcription": full_txt,
816
+ "voice_labels": v_labels,
817
+ "num_voice_embeddings": len(voice_embeddings),
818
+ "diarization_info": diar_info,
819
+ }
820
+ job["status"] = JobStatus.DONE
821
+
822
+ # Summary log without embeddings
823
+ print(f"[{job_id}] ✓ Resultados guardados:")
824
+ print(f"[{job_id}] - Personatges: {len(characters)}")
825
+ print(f"[{job_id}] - Segments d'àudio: {len(audio_segments)}")
826
+ print(f"[{job_id}] - Face embeddings: {num_face_embeddings}")
827
+ print(f"[{job_id}] - Voice embeddings: {len(voice_embeddings)}")
828
+
829
+ except Exception as e_detect:
830
+ # If detection fails, fall back to a minimal mode
831
+ import traceback
832
+ print(f"[{job_id}] ✗ Error en detección: {e_detect}")
833
+ print(f"[{job_id}] Traceback: {traceback.format_exc()}")
834
+ print(f"[{job_id}] Usando modo fallback (carpetas vacías)")
835
+
836
+ # Create basic folders as a fallback
837
+ for sub in ("sources", "faces", "voices", "backgrounds"):
838
+ (base / sub).mkdir(parents=True, exist_ok=True)
839
+
840
+ # Store the fallback results and then mark the job as done
841
+ job["results"] = {
842
+ "characters": [],
843
+ "num_characters": 0,
844
+ "temp_dirs": {
845
+ "sources": str(base / "sources"),
846
+ "faces": str(base / "faces"),
847
+ "voices": str(base / "voices"),
848
+ "backgrounds": str(base / "backgrounds"),
849
+ },
850
+ "warning": f"Detección falló, usando modo fallback: {str(e_detect)}"
851
+ }
852
+ job["status"] = JobStatus.DONE
853
+
854
+ print(f"[{job_id}] ✓ Job completado exitosamente")
855
+
856
+ except Exception as e:
857
+ import traceback
858
+ print(f"[{job_id}] ✗ Error inesperado: {e}")
859
+ try:
860
+ job = jobs.get(job_id)
861
+ if job is not None:
862
+ job["status"] = JobStatus.FAILED
863
+ job["error"] = str(e)
864
+ except Exception:
865
+ pass
866
+ print(f"[{job_id}] Traceback: {traceback.format_exc()}")
867
+
868
+ @app.post("/generate_audiodescription")
869
+ async def generate_audiodescription(video: UploadFile = File(...)):
870
+ try:
871
+ import uuid
872
+ job_id = str(uuid.uuid4())
873
+ vid_name = video.filename or f"video_{job_id}.mp4"
874
+ base = TEMP_ROOT / Path(vid_name).stem
875
+
876
+ base.mkdir(parents=True, exist_ok=True)
877
+ # Save temp mp4
878
+ video_path = base / vid_name
879
+ with open(video_path, "wb") as f:
880
+ f.write(await video.read())
881
+
882
+ # Run MVP pipeline
883
+ result = ad_generate(str(video_path), base)
884
+
885
+ return {
886
+ "status": "done",
887
+ "results": {
888
+ "une_srt": result.get("une_srt", ""),
889
+ "free_text": result.get("free_text", ""),
890
+ "artifacts": result.get("artifacts", {}),
891
+ },
892
+ }
893
+ except Exception as e:
894
+ import traceback
895
+ print(f"/generate_audiodescription error: {e}\n{traceback.format_exc()}")
896
+ raise HTTPException(status_code=500, detail=str(e))
897
+
898
+ @app.post("/load_casting")
899
+ async def load_casting(
900
+ faces_dir: str = Form("identities/faces"),
901
+ voices_dir: str = Form("identities/voices"),
902
+ db_dir: str = Form("chroma_db"),
903
+ drop_collections: bool = Form(False),
904
+ ):
905
+ client = ensure_chroma(Path(db_dir))
906
+ n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
907
+ n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
908
+ return {"ok": True, "faces": n_faces, "voices": n_voices}
909
+
910
+ @app.post("/finalize_casting")
911
+ async def finalize_casting(
912
+ payload: dict = Body(...),
913
+ ):
914
+ """
915
+ Consolidate selected face and voice clusters into identities directories and build indices.
916
+ Expected payload:
917
+ {
918
+ "video_name": str,
919
+ "base_dir": str, # engine temp base for this video
920
+ "characters": [
921
+ {"id": "char1", "name": "Nom", "folder": "/tmp/temp/<video>/char1", "kept_files": ["representative.jpg", ...], "description": "..."}, ...
922
+ ],
923
+ "voice_clusters": [
924
+ {"label": 0, "name": "SPEAKER_00", "clips": ["segment_000.wav", ...]}, ...
925
+ ]
926
+ }
927
+ """
928
+ import os
929
+ import shutil
930
+ from pathlib import Path as _P
931
+
932
+ video_name = payload.get("video_name")
933
+ base_dir = payload.get("base_dir")
934
+ characters = payload.get("characters", []) or []
935
+ voice_clusters = payload.get("voice_clusters", []) or []
936
+
937
+ if not video_name or not base_dir:
938
+ raise HTTPException(status_code=400, detail="Missing video_name or base_dir")
939
+
940
+ faces_out = IDENTITIES_ROOT / video_name / "faces"
941
+ voices_out = IDENTITIES_ROOT / video_name / "voices"
942
+ faces_out.mkdir(parents=True, exist_ok=True)
943
+ voices_out.mkdir(parents=True, exist_ok=True)
944
+
945
+ # Consolidate faces per character name (merge same names)
946
+ for ch in characters:
947
+ ch_name = (ch.get("name") or "Unknown").strip() or "Unknown"
948
+ ch_folder = ch.get("folder")
949
+ kept = ch.get("kept_files") or []
950
+ if not ch_folder or not os.path.isdir(ch_folder):
951
+ continue
952
+ dst_dir = faces_out / ch_name
953
+ dst_dir.mkdir(parents=True, exist_ok=True)
954
+ for fname in kept:
955
+ src = _P(ch_folder) / fname
956
+ if src.exists() and src.is_file():
957
+ try:
958
+ shutil.copy2(src, dst_dir / fname)
959
+ except Exception:
960
+ pass
961
+
962
+ # Consolidate voices per cluster name
963
+ clips_dir = _P(base_dir) / "clips"
964
+ for vc in voice_clusters:
965
+ v_name = (vc.get("name") or f"SPEAKER_{int(vc.get('label',0)):02d}").strip()
966
+ dst_dir = voices_out / v_name
967
+ dst_dir.mkdir(parents=True, exist_ok=True)
968
+ for wav in (vc.get("clips") or []):
969
+ src = clips_dir / wav
970
+ if src.exists() and src.is_file():
971
+ try:
972
+ shutil.copy2(src, dst_dir / wav)
973
+ except Exception:
974
+ pass
975
+
976
+ # Build indices using casting_loader helpers
977
+ db_dir = IDENTITIES_ROOT / video_name / "chroma_db"
978
+ client = ensure_chroma(db_dir)
979
+ n_faces = build_faces_index(faces_out, client, collection_name="index_faces", deepface_model='Facenet512', drop=True)
980
+ n_voices = build_voices_index(voices_out, client, collection_name="index_voices", drop=True)
981
+
982
+ # Summary of identities
983
+ face_identities = sorted([p.name for p in faces_out.iterdir() if p.is_dir()]) if faces_out.exists() else []
984
+ voice_identities = sorted([p.name for p in voices_out.iterdir() if p.is_dir()]) if voices_out.exists() else []
985
+
986
+ return {
987
+ "ok": True,
988
+ "video_name": video_name,
989
+ "faces_dir": str(faces_out),
990
+ "voices_dir": str(voices_out),
991
+ "db_dir": str(db_dir),
992
+ "n_faces_embeddings": n_faces,
993
+ "n_voices_embeddings": n_voices,
994
+ "face_identities": face_identities,
995
+ "voice_identities": voice_identities,
996
+ }
997
+
998
+ @app.get("/files_scene/{video_name}/{scene_id}/{filename}")
999
+ def serve_scene_file(video_name: str, scene_id: str, filename: str):
1000
+ file_path = TEMP_ROOT / video_name / "scenes" / scene_id / filename
1001
+ if not file_path.exists():
1002
+ raise HTTPException(status_code=404, detail="File not found")
1003
+ return FileResponse(file_path)
1004
+
1005
+ @app.post("/detect_scenes")
1006
+ async def detect_scenes(
1007
+ video: UploadFile = File(...),
1008
+ max_groups: int = Form(default=3),
1009
+ min_cluster_size: int = Form(default=3),
1010
+ scene_sensitivity: float = Form(default=0.5),
1011
+ frame_interval_sec: float = Form(default=0.5),
1012
+ ):
1013
  """
1014
+ Detects scene clusters via hierarchical clustering of color histograms.
+ Returns a list of scene_clusters structured similarly to characters.
 
1016
  """
1017
+ import cv2
1018
+ import numpy as np
1019
+
1020
+ # Save the video temporarily
1021
+ video_name = Path(video.filename).stem
1022
+ dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
1023
+ with dst_video.open("wb") as f:
1024
+ shutil.copyfileobj(video.file, f)
1025
+
1026
+ cap = cv2.VideoCapture(str(dst_video))
1027
+ if not cap.isOpened():
1028
+ raise HTTPException(status_code=400, detail="Cannot open video")
1029
+
1030
+ fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
1031
+ step = max(1, int(frame_interval_sec * fps))
1032
+
1033
+ frames = []
1034
+ metas = []
1035
+ idx = 0
1036
+ while True:
1037
+ ret = cap.grab()
1038
+ if not ret:
1039
+ break
1040
+ if idx % step == 0:
1041
+ ret2, frame = cap.retrieve()
1042
+ if not ret2:
1043
+ break
1044
+ # Downscale for stability and fast computation
1045
+ small = cv2.resize(frame, (160, 90))
1046
+ hsv = cv2.cvtColor(small, cv2.COLOR_BGR2HSV)
1047
+ # Per-channel histogram
1048
+ h_hist = cv2.calcHist([hsv],[0],None,[32],[0,180]).flatten()
1049
+ s_hist = cv2.calcHist([hsv],[1],None,[32],[0,256]).flatten()
1050
+ v_hist = cv2.calcHist([hsv],[2],None,[32],[0,256]).flatten()
1051
+ hist = np.concatenate([h_hist, s_hist, v_hist])
1052
+ hist = hist / (np.linalg.norm(hist) + 1e-8)
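+ # Each sampled frame is thus summarized as a 96-dimensional, L2-normalized HSV histogram (32 bins per channel).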
1053
+ frames.append(hist)
1054
+ metas.append({"index": idx, "time_sec": idx/float(fps)})
1055
+ idx += 1
1056
+ cap.release()
1057
+
1058
+ if not frames:
1059
+ return {"scene_clusters": []}
1060
+
1061
+ X = np.array(frames)
1062
+ labels = hierarchical_cluster_with_min_size(X, max_groups, min_cluster_size, scene_sensitivity).tolist()
1063
+ initial_clusters = len(set([l for l in labels if l >= 0]))
1064
+ print(f"Scene clustering jeràrquic inicial: {initial_clusters} clusters")
1065
+
1066
+ # Group by label (>=0)
1067
+ clusters = {}
1068
+ for i, lbl in enumerate(labels):
1069
+ if lbl is None or lbl < 0:
1070
+ continue
1071
+ clusters.setdefault(int(lbl), []).append(i)
1072
+
1073
+ # IMPROVED VALIDATION: Merge very similar clusters more aggressively
+ # Compute centroids (average histogram of each cluster)
1075
+ centroids = {}
1076
+ for lbl, idxs in clusters.items():
1077
+ cluster_histograms = X[idxs]
1078
+ centroids[lbl] = np.mean(cluster_histograms, axis=0)
1079
+
1080
+ print(f"[SCENE VALIDATION] Validant similaritat entre {len(centroids)} clusters...")
1081
+
1082
+ # More aggressive thresholds for merging similar scenes
+ SIMILARITY_THRESHOLD = 0.25 # Raised from 0.15 to 0.25 (merges more)
+ CORRELATION_THRESHOLD = 0.85 # Minimum correlation to consider two clusters similar
1085
 
1086
+ # Compute the distance and correlation matrix between centroids
1087
+ cluster_labels = sorted(centroids.keys())
1088
+ similarities = {}
1089
+
1090
+ for i, lbl1 in enumerate(cluster_labels):
1091
+ for lbl2 in cluster_labels[i+1:]:
1092
+ # Euclidean distance (on normalized histograms)
1093
+ dist = np.linalg.norm(centroids[lbl1] - centroids[lbl2])
1094
+
1095
+ # Pearson correlation between histograms
1096
+ corr = np.corrcoef(centroids[lbl1], centroids[lbl2])[0, 1]
1097
+
1098
+ # They are similar if:
+ # - the distance is low (< threshold), OR
+ # - the correlation is high (> threshold)
1101
+ are_similar = (dist < SIMILARITY_THRESHOLD) or (corr > CORRELATION_THRESHOLD)
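+ # e.g. dist=0.18 merges the pair even if corr=0.70; corr=0.92 merges it even if dist=0.40.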
1102
+
1103
+ similarities[(lbl1, lbl2)] = {
1104
+ 'distance': dist,
1105
+ 'correlation': corr,
1106
+ 'similar': are_similar
1107
+ }
1108
+
1109
+ if are_similar:
1110
+ print(f"[SCENE VALIDATION] Clusters {lbl1} i {lbl2} són similars: "
1111
+ f"dist={dist:.3f} (threshold={SIMILARITY_THRESHOLD}), "
1112
+ f"corr={corr:.3f} (threshold={CORRELATION_THRESHOLD})")
1113
+
1114
+ # Union-Find to merge clusters transitively
+ # If A~B and B~C, then A~B~C (all in the same group)
1116
+ parent = {lbl: lbl for lbl in cluster_labels}
1117
+
1118
+ def find(x):
1119
+ if parent[x] != x:
1120
+ parent[x] = find(parent[x]) # Path compression
1121
+ return parent[x]
1122
+
1123
+ def union(x, y):
1124
+ root_x = find(x)
1125
+ root_y = find(y)
1126
+ if root_x != root_y:
1127
+ parent[root_y] = root_x
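+ # e.g. if clusters 0~1 and 1~2 were flagged as similar, find() resolves all three to a single root,
+ # so their frames end up in one merged scene cluster.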
1128
+
1129
+ # Merge all similar clusters
1130
+ fusion_count = 0
1131
+ for (lbl1, lbl2), sim in similarities.items():
1132
+ if sim['similar']:
1133
+ union(lbl1, lbl2)
1134
+ fusion_count += 1
1135
+
1136
+ # Apply the merges to the clusters
1137
+ new_clusters = {}
1138
+ for lbl, idxs in clusters.items():
1139
+ root = find(lbl)
1140
+ if root not in new_clusters:
1141
+ new_clusters[root] = []
1142
+ new_clusters[root].extend(idxs)
1143
+
1144
+ # Renumber labels so they are consecutive
1145
+ final_clusters_dict = {}
1146
+ for i, (root, idxs) in enumerate(sorted(new_clusters.items())):
1147
+ final_clusters_dict[i] = idxs
1148
+
1149
+ clusters = final_clusters_dict
1150
+ final_clusters = len(clusters)
1151
+ eliminated = initial_clusters - final_clusters
1152
+
1153
+ print(f"[SCENE VALIDATION] ===== RESULTADO =====")
1154
+ print(f"[SCENE VALIDATION] Clusters inicials: {initial_clusters}")
1155
+ print(f"[SCENE VALIDATION] Fusions realitzades: {fusion_count}")
1156
+ print(f"[SCENE VALIDATION] Clusters finals: {final_clusters}")
1157
+ print(f"[SCENE VALIDATION] Clusters eliminats (fusionats): {eliminated}")
1158
+ print(f"[SCENE VALIDATION] Reducció: {(eliminated/initial_clusters*100):.1f}%")
1159
+ print(f"[SCENE VALIDATION] =======================")
1160
+
1161
+ # Write representative images for each cluster
1162
+ base = TEMP_ROOT / video_name / "scenes"
1163
+ base.mkdir(parents=True, exist_ok=True)
1164
+ scene_list = []
1165
+ cap = cv2.VideoCapture(str(dst_video))
1166
+ for lbl, idxs in sorted(clusters.items(), key=lambda x: x[0]):
1167
+ scene_id = f"scene_{int(lbl):02d}"
1168
+ out_dir = base / scene_id
1169
+ out_dir.mkdir(parents=True, exist_ok=True)
1170
+ frame_files = []
1171
+ # Save up to 12 frames per cluster
1172
+ for k, fi in enumerate(idxs[:12]):
1173
+ frame_num = metas[fi]["index"]
1174
+ cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
1175
+ ret2, frame = cap.read()
1176
+ if not ret2:
1177
+ continue
1178
+ fn = f"frame_{k:03d}.jpg"
1179
+ cv2.imwrite(str(out_dir / fn), frame)
1180
+ frame_files.append(fn)
1181
+ # Representative
1182
+ rep = frame_files[0] if frame_files else None
1183
+ image_url = f"/files_scene/{video_name}/{scene_id}/{rep}" if rep else ""
1184
+
1185
+ # Call svision to describe the representative scene
1186
+ scene_description = ""
1187
+ scene_name = f"Escena {lbl+1}"
1188
+ if rep:
1189
+ rep_full_path = out_dir / rep
1190
+ if rep_full_path.exists():
1191
+ print(f"Llamando a svision para describir {scene_id}...")
1192
+ try:
1193
+ scene_description, scene_name = describe_image_with_svision(str(rep_full_path), is_face=False)
1194
+ if not scene_name:
1195
+ scene_name = f"Escena {lbl+1}"
1196
+
1197
+ # If we have a description, generate a short name with schat
1198
+ if scene_description:
1199
+ print(f"Llamando a schat para generar nombre corto de {scene_id}...")
1200
+ try:
1201
+ # Use LLMRouter to call schat
1202
+ config_path = os.getenv("CONFIG_YAML", "config.yaml")
1203
+ if os.path.exists(config_path):
1204
+ with open(config_path, 'r', encoding='utf-8') as f:
1205
+ cfg = yaml.safe_load(f) or {}
1206
+ router = LLMRouter(cfg)
1207
+
1208
+ prompt = f"Basant-te en aquesta descripció d'una escena, genera un nom curt de menys de 3 paraules que la resumeixi:\n\n{scene_description}\n\nNom de l'escena:"
1209
+
1210
+ short_name = router.instruct(
1211
+ prompt=prompt,
1212
+ system="Ets un assistent que genera noms curts i descriptius per a escenes. Respon NOMÉS amb el nom, sense explicacions.",
1213
+ model="salamandra-instruct"
1214
+ ).strip()
1215
+
1216
+ # Strip any stray quotes or punctuation
1217
+ short_name = short_name.strip('"\'.,!?').strip()
1218
+
1219
+ if short_name and len(short_name) > 0:
1220
+ scene_name = short_name
1221
+ print(f"[schat] Nom generat: {scene_name}")
1222
+ else:
1223
+ print(f"[schat] No s'ha generat nom, usant fallback")
1224
+ except Exception as e_schat:
1225
+ print(f"Error generando nombre con schat: {e_schat}")
1226
+ # Keep the svision name if schat fails
1227
+
1228
+ except Exception as e:
1229
+ print(f"Error describiendo {scene_id}: {e}")
1230
+
1231
+ scene_list.append({
1232
+ "id": scene_id,
1233
+ "name": scene_name,
1234
+ "description": scene_description,
1235
+ "folder": str(out_dir),
1236
+ "num_frames": len(frame_files),
1237
+ "image_url": image_url,
1238
+ "frame_files": frame_files,
1239
+ })
1240
+ cap.release()
1241
+
1242
+ return {"scene_clusters": scene_list, "base_dir": str(base)}
1243
+
1244
+ @app.post("/refine_narration")
1245
+ async def refine_narration(
1246
+ dialogues_srt: str = Form(...),
1247
+ frame_descriptions_json: str = Form("[]"),
1248
+ config_path: str = Form("config.yaml"),
1249
+ ):
1250
+ cfg = load_yaml(config_path)
1251
+ frames = json.loads(frame_descriptions_json)
1252
+ model_name = cfg.get("narration", {}).get("model", "salamandra-instruct")
1253
+ use_remote = model_name in (cfg.get("models", {}).get("routing", {}).get("use_remote_for", []))
1254
+
1255
+ if use_remote:
1256
+ router = LLMRouter(cfg)
1257
+ system_msg = (
1258
+ "Eres un sistema de audiodescripción que cumple UNE-153010. "
1259
+ "Fusiona diálogos del SRT con descripciones concisas en los huecos, evitando redundancias. "
1260
+ "Devuelve JSON con {narrative_text, srt_text}."
1261
+ )
1262
+ prompt = json.dumps({"dialogues_srt": dialogues_srt, "frames": frames, "rules": cfg.get("narration", {})}, ensure_ascii=False)
1263
+ try:
1264
+ txt = router.instruct(prompt=prompt, system=system_msg, model=model_name)
1265
+ out = {}
1266
+ try:
1267
+ out = json.loads(txt)
1268
+ except Exception:
1269
+ out = {"narrative_text": txt, "srt_text": ""}
1270
+ return {
1271
+ "narrative_text": out.get("narrative_text", ""),
1272
+ "srt_text": out.get("srt_text", ""),
1273
+ "approved": True,
1274
+ "critic_feedback": "",
1275
+ }
1276
+ except Exception:
1277
+ ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("narration_une_guidelines_path", "UNE_153010.txt"))
1278
+ res = ns.run(dialogues_srt, frames)
1279
+ return {"narrative_text": res.narrative_text, "srt_text": res.srt_text, "approved": res.approved, "critic_feedback": res.critic_feedback}
1280
+
1281
+ ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
1282
+ out = ns.run(dialogues_srt, frames)
1283
+ return {"narrative_text": out.narrative_text, "srt_text": out.srt_text, "approved": out.approved, "critic_feedback": out.critic_feedback}
1284
+
1285
+ if __name__ == "__main__":
1286
+ uvicorn.run(app, host="0.0.0.0", port=7860)
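+ # Local run sketch: "python api.py" serves the engine on 0.0.0.0:7860, the port typically exposed by Hugging Face Spaces.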