VeuReu commited on
Commit
2df0bca
verified
1 Parent(s): 104fa1a

Upload 5 files

Browse files
Files changed (2) hide show
  1. api.py +39 -2
  2. character_detection.py +30 -10
api.py CHANGED
@@ -13,6 +13,7 @@ from enum import Enum
13
  import os
14
 
15
  from video_processing import process_video_pipeline
 
16
  from casting_loader import ensure_chroma, build_faces_index, build_voices_index
17
  from narration_system import NarrationSystem
18
  from llm_router import load_yaml, LLMRouter
@@ -172,13 +173,17 @@ def process_video_job(job_id: str):
172
  output_base=str(base),
173
  epsilon=epsilon,
174
  min_cluster_size=min_cluster_size,
175
- video_name=video_name
 
 
176
  )
177
 
178
  print(f"[{job_id}] DEBUG - result completo: {result}")
179
 
180
  characters = result.get("characters", [])
181
  analysis_path = result.get("analysis_path", "")
 
 
182
 
183
  print(f"[{job_id}] Personajes detectados: {len(characters)}")
184
  for char in characters:
@@ -216,12 +221,44 @@ def process_video_job(job_id: str):
216
  except Exception as _e:
217
  print(f"[{job_id}] WARN - No se pudo enumerar face_files: {_e}")
218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  # Guardar resultados primero y luego marcar como completado (evita carreras)
220
  job["results"] = {
221
  "characters": characters,
222
  "num_characters": len(characters),
223
  "analysis_path": analysis_path,
224
- "base_dir": str(base)
 
 
 
 
 
 
 
225
  }
226
  job["status"] = JobStatus.DONE
227
 
 
13
  import os
14
 
15
  from video_processing import process_video_pipeline
16
+ from audio_tools import process_audio_for_video
17
  from casting_loader import ensure_chroma, build_faces_index, build_voices_index
18
  from narration_system import NarrationSystem
19
  from llm_router import load_yaml, LLMRouter
 
173
  output_base=str(base),
174
  epsilon=epsilon,
175
  min_cluster_size=min_cluster_size,
176
+ video_name=video_name,
177
+ start_offset_sec=5.0,
178
+ extract_every_sec=0.5
179
  )
180
 
181
  print(f"[{job_id}] DEBUG - result completo: {result}")
182
 
183
  characters = result.get("characters", [])
184
  analysis_path = result.get("analysis_path", "")
185
+ face_labels = result.get("face_labels", [])
186
+ num_face_embeddings = int(result.get("num_face_embeddings", 0))
187
 
188
  print(f"[{job_id}] Personajes detectados: {len(characters)}")
189
  for char in characters:
 
221
  except Exception as _e:
222
  print(f"[{job_id}] WARN - No se pudo enumerar face_files: {_e}")
223
 
224
+ # Procesamiento de audio: diarización, ASR y embeddings de voz
225
+ try:
226
+ cfg = load_yaml("config.yaml")
227
+ audio_segments, srt_unmod, full_txt = process_audio_for_video(video_path, base, cfg, voice_collection=None)
228
+ except Exception as e_audio:
229
+ import traceback
230
+ print(f"[{job_id}] WARN - Audio pipeline failed: {e_audio}\n{traceback.format_exc()}")
231
+ audio_segments, srt_unmod, full_txt = [], None, ""
232
+
233
+ # Clustering de voces (DBSCAN sobre embeddings válidos)
234
+ from sklearn.cluster import DBSCAN
235
+ import numpy as np
236
+ voice_embeddings = [seg.get("voice_embedding") for seg in audio_segments if seg.get("voice_embedding")]
237
+ if voice_embeddings:
238
+ try:
239
+ Xv = np.array(voice_embeddings)
240
+ v_eps = 1.3
241
+ v_min = 1
242
+ v_labels = DBSCAN(eps=v_eps, min_samples=v_min, metric='euclidean').fit(Xv).labels_.tolist()
243
+ except Exception as _e:
244
+ print(f"[{job_id}] WARN - Voice clustering failed: {_e}")
245
+ v_labels = []
246
+ else:
247
+ v_labels = []
248
+
249
  # Guardar resultados primero y luego marcar como completado (evita carreras)
250
  job["results"] = {
251
  "characters": characters,
252
  "num_characters": len(characters),
253
  "analysis_path": analysis_path,
254
+ "base_dir": str(base),
255
+ "face_labels": face_labels,
256
+ "num_face_embeddings": num_face_embeddings,
257
+ "audio_segments": audio_segments,
258
+ "srt_unmodified": srt_unmod,
259
+ "full_transcription": full_txt,
260
+ "voice_labels": v_labels,
261
+ "num_voice_embeddings": len(voice_embeddings),
262
  }
263
  job["status"] = JobStatus.DONE
264
 
character_detection.py CHANGED
@@ -54,7 +54,9 @@ class CharacterDetector:
54
  for d in [self.faces_dir, self.voices_dir, self.scenes_dir]:
55
  d.mkdir(parents=True, exist_ok=True)
56
 
57
- def extract_faces_embeddings(self) -> List[Dict[str, Any]]:
 
 
58
  """
59
  Extrae caras del vídeo y calcula sus embeddings usando DeepFace directamente.
60
 
@@ -67,13 +69,14 @@ class CharacterDetector:
67
 
68
  logger.info("Extrayendo caras del vídeo con DeepFace...")
69
 
70
- extract_every = 1.0 # segundos
71
  video = cv2.VideoCapture(self.video_path)
72
  fps = int(video.get(cv2.CAP_PROP_FPS))
73
  total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
74
  frame_interval = int(fps * extract_every)
75
  frame_count = 0
76
  saved_count = 0
 
77
 
78
  embeddings_caras = []
79
 
@@ -84,6 +87,10 @@ class CharacterDetector:
84
  if not ret:
85
  break
86
 
 
 
 
 
87
  if frame_count % frame_interval == 0:
88
  temp_path = self.faces_dir / "temp_frame.jpg"
89
  cv2.imwrite(str(temp_path), frame)
@@ -94,14 +101,21 @@ class CharacterDetector:
94
  face_objs = DeepFace.represent(
95
  img_path=str(temp_path),
96
  model_name='Facenet512',
97
- detector_backend='opencv',
98
- enforce_detection=False
99
  )
100
 
101
  if face_objs:
102
  for i, face_obj in enumerate(face_objs):
103
  embedding = face_obj['embedding']
104
  facial_area = face_obj.get('facial_area', {})
 
 
 
 
 
 
 
105
 
106
  # Guardar el frame completo
107
  save_path = self.faces_dir / f"frame_{saved_count:04d}.jpg"
@@ -270,7 +284,8 @@ class CharacterDetector:
270
 
271
  return analysis_path
272
 
273
- def detect_characters(self, epsilon: float = 0.5, min_cluster_size: int = 2) -> Tuple[List[Dict], Path]:
 
274
  """
275
  Pipeline completo de detección de personajes.
276
 
@@ -282,7 +297,7 @@ class CharacterDetector:
282
  Tuple de (lista de personajes, path al analysis.json)
283
  """
284
  # 1. Extraer caras y embeddings
285
- embeddings_caras = self.extract_faces_embeddings()
286
 
287
  # 2. Extraer voces y embeddings (opcional, por ahora)
288
  embeddings_voices = self.extract_voices_embeddings()
@@ -299,13 +314,14 @@ class CharacterDetector:
299
  # 6. Crear carpetas de personajes
300
  characters = self.create_character_folders(embeddings_caras, labels)
301
 
302
- return characters, analysis_path
303
 
304
 
305
  # Función de conveniencia para usar en el API
306
  def detect_characters_from_video(video_path: str, output_base: str,
307
  epsilon: float = 0.5, min_cluster_size: int = 2,
308
- video_name: str = None) -> Dict[str, Any]:
 
309
  """
310
  Funci贸n de alto nivel para detectar personajes en un v铆deo.
311
 
@@ -320,10 +336,14 @@ def detect_characters_from_video(video_path: str, output_base: str,
320
  Dict con resultados: {"characters": [...], "analysis_path": "..."}
321
  """
322
  detector = CharacterDetector(video_path, Path(output_base), video_name=video_name)
323
- characters, analysis_path = detector.detect_characters(epsilon, min_cluster_size)
 
 
324
 
325
  return {
326
  "characters": characters,
327
  "analysis_path": str(analysis_path),
328
- "num_characters": len(characters)
 
 
329
  }
 
54
  for d in [self.faces_dir, self.voices_dir, self.scenes_dir]:
55
  d.mkdir(parents=True, exist_ok=True)
56
 
57
+ def extract_faces_embeddings(self, *, start_offset_sec: float = 3.0, extract_every_sec: float = 0.5,
58
+ detector_backend: str = 'retinaface', min_face_area: int = 900,
59
+ enforce_detection: bool = True) -> List[Dict[str, Any]]:
60
  """
61
  Extrae caras del vídeo y calcula sus embeddings usando DeepFace directamente.
62
 
 
69
 
70
  logger.info("Extrayendo caras del vídeo con DeepFace...")
71
 
72
+ extract_every = float(extract_every_sec)
73
  video = cv2.VideoCapture(self.video_path)
74
  fps = int(video.get(cv2.CAP_PROP_FPS))
75
  total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
76
  frame_interval = int(fps * extract_every)
77
  frame_count = 0
78
  saved_count = 0
79
+ start_frame = int(max(0.0, start_offset_sec) * (fps if fps > 0 else 25))
80
 
81
  embeddings_caras = []
82
 
 
87
  if not ret:
88
  break
89
 
90
+ if frame_count < start_frame:
91
+ frame_count += 1
92
+ continue
93
+
94
  if frame_count % frame_interval == 0:
95
  temp_path = self.faces_dir / "temp_frame.jpg"
96
  cv2.imwrite(str(temp_path), frame)
 
101
  face_objs = DeepFace.represent(
102
  img_path=str(temp_path),
103
  model_name='Facenet512',
104
+ detector_backend=detector_backend,
105
+ enforce_detection=enforce_detection
106
  )
107
 
108
  if face_objs:
109
  for i, face_obj in enumerate(face_objs):
110
  embedding = face_obj['embedding']
111
  facial_area = face_obj.get('facial_area', {})
112
+ try:
113
+ w = int(facial_area.get('w', 0))
114
+ h = int(facial_area.get('h', 0))
115
+ if w * h < int(min_face_area):
116
+ continue
117
+ except Exception:
118
+ pass
119
 
120
  # Guardar el frame completo
121
  save_path = self.faces_dir / f"frame_{saved_count:04d}.jpg"
 
284
 
285
  return analysis_path
286
 
287
+ def detect_characters(self, epsilon: float = 0.5, min_cluster_size: int = 2,
288
+ *, start_offset_sec: float = 3.0, extract_every_sec: float = 0.5) -> Tuple[List[Dict], Path, np.ndarray, List[Dict[str, Any]]]:
289
  """
290
  Pipeline completo de detección de personajes.
291
 
 
297
  Tuple de (lista de personajes, path al analysis.json)
298
  """
299
  # 1. Extraer caras y embeddings
300
+ embeddings_caras = self.extract_faces_embeddings(start_offset_sec=start_offset_sec, extract_every_sec=extract_every_sec)
301
 
302
  # 2. Extraer voces y embeddings (opcional, por ahora)
303
  embeddings_voices = self.extract_voices_embeddings()
 
314
  # 6. Crear carpetas de personajes
315
  characters = self.create_character_folders(embeddings_caras, labels)
316
 
317
+ return characters, analysis_path, labels, embeddings_caras
318
 
319
 
320
  # Función de conveniencia para usar en el API
321
  def detect_characters_from_video(video_path: str, output_base: str,
322
  epsilon: float = 0.5, min_cluster_size: int = 2,
323
+ video_name: str = None,
324
+ *, start_offset_sec: float = 3.0, extract_every_sec: float = 0.5) -> Dict[str, Any]:
325
  """
326
  Función de alto nivel para detectar personajes en un vídeo.
327
 
 
336
  Dict con resultados: {"characters": [...], "analysis_path": "..."}
337
  """
338
  detector = CharacterDetector(video_path, Path(output_base), video_name=video_name)
339
+ characters, analysis_path, labels, embeddings_caras = detector.detect_characters(epsilon, min_cluster_size,
340
+ start_offset_sec=start_offset_sec,
341
+ extract_every_sec=extract_every_sec)
342
 
343
  return {
344
  "characters": characters,
345
  "analysis_path": str(analysis_path),
346
+ "num_characters": len(characters),
347
+ "face_labels": labels.tolist() if isinstance(labels, np.ndarray) else list(labels),
348
+ "num_face_embeddings": len(embeddings_caras)
349
  }