Upload 5 files
Browse files- api.py +39 -2
- character_detection.py +30 -10
api.py
CHANGED
|
@@ -13,6 +13,7 @@ from enum import Enum
|
|
| 13 |
import os
|
| 14 |
|
| 15 |
from video_processing import process_video_pipeline
|
|
|
|
| 16 |
from casting_loader import ensure_chroma, build_faces_index, build_voices_index
|
| 17 |
from narration_system import NarrationSystem
|
| 18 |
from llm_router import load_yaml, LLMRouter
|
|
@@ -172,13 +173,17 @@ def process_video_job(job_id: str):
|
|
| 172 |
output_base=str(base),
|
| 173 |
epsilon=epsilon,
|
| 174 |
min_cluster_size=min_cluster_size,
|
| 175 |
-
video_name=video_name
|
|
|
|
|
|
|
| 176 |
)
|
| 177 |
|
| 178 |
print(f"[{job_id}] DEBUG - result completo: {result}")
|
| 179 |
|
| 180 |
characters = result.get("characters", [])
|
| 181 |
analysis_path = result.get("analysis_path", "")
|
|
|
|
|
|
|
| 182 |
|
| 183 |
print(f"[{job_id}] Personajes detectados: {len(characters)}")
|
| 184 |
for char in characters:
|
|
@@ -216,12 +221,44 @@ def process_video_job(job_id: str):
|
|
| 216 |
except Exception as _e:
|
| 217 |
print(f"[{job_id}] WARN - No se pudo enumerar face_files: {_e}")
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
# Guardar resultados primero y luego marcar como completado (evita carreras)
|
| 220 |
job["results"] = {
|
| 221 |
"characters": characters,
|
| 222 |
"num_characters": len(characters),
|
| 223 |
"analysis_path": analysis_path,
|
| 224 |
-
"base_dir": str(base)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
}
|
| 226 |
job["status"] = JobStatus.DONE
|
| 227 |
|
|
|
|
| 13 |
import os
|
| 14 |
|
| 15 |
from video_processing import process_video_pipeline
|
| 16 |
+
from audio_tools import process_audio_for_video
|
| 17 |
from casting_loader import ensure_chroma, build_faces_index, build_voices_index
|
| 18 |
from narration_system import NarrationSystem
|
| 19 |
from llm_router import load_yaml, LLMRouter
|
|
|
|
| 173 |
output_base=str(base),
|
| 174 |
epsilon=epsilon,
|
| 175 |
min_cluster_size=min_cluster_size,
|
| 176 |
+
video_name=video_name,
|
| 177 |
+
start_offset_sec=5.0,
|
| 178 |
+
extract_every_sec=0.5
|
| 179 |
)
|
| 180 |
|
| 181 |
print(f"[{job_id}] DEBUG - result completo: {result}")
|
| 182 |
|
| 183 |
characters = result.get("characters", [])
|
| 184 |
analysis_path = result.get("analysis_path", "")
|
| 185 |
+
face_labels = result.get("face_labels", [])
|
| 186 |
+
num_face_embeddings = int(result.get("num_face_embeddings", 0))
|
| 187 |
|
| 188 |
print(f"[{job_id}] Personajes detectados: {len(characters)}")
|
| 189 |
for char in characters:
|
|
|
|
| 221 |
except Exception as _e:
|
| 222 |
print(f"[{job_id}] WARN - No se pudo enumerar face_files: {_e}")
|
| 223 |
|
| 224 |
+
# Procesamiento de audio: diarización, ASR y embeddings de voz
|
| 225 |
+
try:
|
| 226 |
+
cfg = load_yaml("config.yaml")
|
| 227 |
+
audio_segments, srt_unmod, full_txt = process_audio_for_video(video_path, base, cfg, voice_collection=None)
|
| 228 |
+
except Exception as e_audio:
|
| 229 |
+
import traceback
|
| 230 |
+
print(f"[{job_id}] WARN - Audio pipeline failed: {e_audio}\n{traceback.format_exc()}")
|
| 231 |
+
audio_segments, srt_unmod, full_txt = [], None, ""
|
| 232 |
+
|
| 233 |
+
# Clustering de voces (DBSCAN sobre embeddings válidos)
|
| 234 |
+
from sklearn.cluster import DBSCAN
|
| 235 |
+
import numpy as np
|
| 236 |
+
voice_embeddings = [seg.get("voice_embedding") for seg in audio_segments if seg.get("voice_embedding")]
|
| 237 |
+
if voice_embeddings:
|
| 238 |
+
try:
|
| 239 |
+
Xv = np.array(voice_embeddings)
|
| 240 |
+
v_eps = 1.3
|
| 241 |
+
v_min = 1
|
| 242 |
+
v_labels = DBSCAN(eps=v_eps, min_samples=v_min, metric='euclidean').fit(Xv).labels_.tolist()
|
| 243 |
+
except Exception as _e:
|
| 244 |
+
print(f"[{job_id}] WARN - Voice clustering failed: {_e}")
|
| 245 |
+
v_labels = []
|
| 246 |
+
else:
|
| 247 |
+
v_labels = []
|
| 248 |
+
|
| 249 |
# Guardar resultados primero y luego marcar como completado (evita carreras)
|
| 250 |
job["results"] = {
|
| 251 |
"characters": characters,
|
| 252 |
"num_characters": len(characters),
|
| 253 |
"analysis_path": analysis_path,
|
| 254 |
+
"base_dir": str(base),
|
| 255 |
+
"face_labels": face_labels,
|
| 256 |
+
"num_face_embeddings": num_face_embeddings,
|
| 257 |
+
"audio_segments": audio_segments,
|
| 258 |
+
"srt_unmodified": srt_unmod,
|
| 259 |
+
"full_transcription": full_txt,
|
| 260 |
+
"voice_labels": v_labels,
|
| 261 |
+
"num_voice_embeddings": len(voice_embeddings),
|
| 262 |
}
|
| 263 |
job["status"] = JobStatus.DONE
|
| 264 |
|
character_detection.py
CHANGED
|
@@ -54,7 +54,9 @@ class CharacterDetector:
|
|
| 54 |
for d in [self.faces_dir, self.voices_dir, self.scenes_dir]:
|
| 55 |
d.mkdir(parents=True, exist_ok=True)
|
| 56 |
|
| 57 |
-
def extract_faces_embeddings(self
|
|
|
|
|
|
|
| 58 |
"""
|
| 59 |
Extrae caras del vídeo y calcula sus embeddings usando DeepFace directamente.
|
| 60 |
|
|
@@ -67,13 +69,14 @@ class CharacterDetector:
|
|
| 67 |
|
| 68 |
logger.info("Extrayendo caras del v铆deo con DeepFace...")
|
| 69 |
|
| 70 |
-
extract_every =
|
| 71 |
video = cv2.VideoCapture(self.video_path)
|
| 72 |
fps = int(video.get(cv2.CAP_PROP_FPS))
|
| 73 |
total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 74 |
frame_interval = int(fps * extract_every)
|
| 75 |
frame_count = 0
|
| 76 |
saved_count = 0
|
|
|
|
| 77 |
|
| 78 |
embeddings_caras = []
|
| 79 |
|
|
@@ -84,6 +87,10 @@ class CharacterDetector:
|
|
| 84 |
if not ret:
|
| 85 |
break
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
if frame_count % frame_interval == 0:
|
| 88 |
temp_path = self.faces_dir / "temp_frame.jpg"
|
| 89 |
cv2.imwrite(str(temp_path), frame)
|
|
@@ -94,14 +101,21 @@ class CharacterDetector:
|
|
| 94 |
face_objs = DeepFace.represent(
|
| 95 |
img_path=str(temp_path),
|
| 96 |
model_name='Facenet512',
|
| 97 |
-
detector_backend=
|
| 98 |
-
enforce_detection=
|
| 99 |
)
|
| 100 |
|
| 101 |
if face_objs:
|
| 102 |
for i, face_obj in enumerate(face_objs):
|
| 103 |
embedding = face_obj['embedding']
|
| 104 |
facial_area = face_obj.get('facial_area', {})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
# Guardar el frame completo
|
| 107 |
save_path = self.faces_dir / f"frame_{saved_count:04d}.jpg"
|
|
@@ -270,7 +284,8 @@ class CharacterDetector:
|
|
| 270 |
|
| 271 |
return analysis_path
|
| 272 |
|
| 273 |
-
def detect_characters(self, epsilon: float = 0.5, min_cluster_size: int = 2
|
|
|
|
| 274 |
"""
|
| 275 |
Pipeline completo de detección de personajes.
|
| 276 |
|
|
@@ -282,7 +297,7 @@ class CharacterDetector:
|
|
| 282 |
Tuple de (lista de personajes, path al analysis.json)
|
| 283 |
"""
|
| 284 |
# 1. Extraer caras y embeddings
|
| 285 |
-
embeddings_caras = self.extract_faces_embeddings()
|
| 286 |
|
| 287 |
# 2. Extraer voces y embeddings (opcional, por ahora)
|
| 288 |
embeddings_voices = self.extract_voices_embeddings()
|
|
@@ -299,13 +314,14 @@ class CharacterDetector:
|
|
| 299 |
# 6. Crear carpetas de personajes
|
| 300 |
characters = self.create_character_folders(embeddings_caras, labels)
|
| 301 |
|
| 302 |
-
return characters, analysis_path
|
| 303 |
|
| 304 |
|
| 305 |
# Función de conveniencia para usar en el API
|
| 306 |
def detect_characters_from_video(video_path: str, output_base: str,
|
| 307 |
epsilon: float = 0.5, min_cluster_size: int = 2,
|
| 308 |
-
video_name: str = None
|
|
|
|
| 309 |
"""
|
| 310 |
Función de alto nivel para detectar personajes en un vídeo.
|
| 311 |
|
|
@@ -320,10 +336,14 @@ def detect_characters_from_video(video_path: str, output_base: str,
|
|
| 320 |
Dict con resultados: {"characters": [...], "analysis_path": "..."}
|
| 321 |
"""
|
| 322 |
detector = CharacterDetector(video_path, Path(output_base), video_name=video_name)
|
| 323 |
-
characters, analysis_path = detector.detect_characters(epsilon, min_cluster_size
|
|
|
|
|
|
|
| 324 |
|
| 325 |
return {
|
| 326 |
"characters": characters,
|
| 327 |
"analysis_path": str(analysis_path),
|
| 328 |
-
"num_characters": len(characters)
|
|
|
|
|
|
|
| 329 |
}
|
|
|
|
| 54 |
for d in [self.faces_dir, self.voices_dir, self.scenes_dir]:
|
| 55 |
d.mkdir(parents=True, exist_ok=True)
|
| 56 |
|
| 57 |
+
def extract_faces_embeddings(self, *, start_offset_sec: float = 3.0, extract_every_sec: float = 0.5,
|
| 58 |
+
detector_backend: str = 'retinaface', min_face_area: int = 900,
|
| 59 |
+
enforce_detection: bool = True) -> List[Dict[str, Any]]:
|
| 60 |
"""
|
| 61 |
Extrae caras del vídeo y calcula sus embeddings usando DeepFace directamente.
|
| 62 |
|
|
|
|
| 69 |
|
| 70 |
logger.info("Extrayendo caras del v铆deo con DeepFace...")
|
| 71 |
|
| 72 |
+
extract_every = float(extract_every_sec)
|
| 73 |
video = cv2.VideoCapture(self.video_path)
|
| 74 |
fps = int(video.get(cv2.CAP_PROP_FPS))
|
| 75 |
total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 76 |
frame_interval = int(fps * extract_every)
|
| 77 |
frame_count = 0
|
| 78 |
saved_count = 0
|
| 79 |
+
start_frame = int(max(0.0, start_offset_sec) * (fps if fps > 0 else 25))
|
| 80 |
|
| 81 |
embeddings_caras = []
|
| 82 |
|
|
|
|
| 87 |
if not ret:
|
| 88 |
break
|
| 89 |
|
| 90 |
+
if frame_count < start_frame:
|
| 91 |
+
frame_count += 1
|
| 92 |
+
continue
|
| 93 |
+
|
| 94 |
if frame_count % frame_interval == 0:
|
| 95 |
temp_path = self.faces_dir / "temp_frame.jpg"
|
| 96 |
cv2.imwrite(str(temp_path), frame)
|
|
|
|
| 101 |
face_objs = DeepFace.represent(
|
| 102 |
img_path=str(temp_path),
|
| 103 |
model_name='Facenet512',
|
| 104 |
+
detector_backend=detector_backend,
|
| 105 |
+
enforce_detection=enforce_detection
|
| 106 |
)
|
| 107 |
|
| 108 |
if face_objs:
|
| 109 |
for i, face_obj in enumerate(face_objs):
|
| 110 |
embedding = face_obj['embedding']
|
| 111 |
facial_area = face_obj.get('facial_area', {})
|
| 112 |
+
try:
|
| 113 |
+
w = int(facial_area.get('w', 0))
|
| 114 |
+
h = int(facial_area.get('h', 0))
|
| 115 |
+
if w * h < int(min_face_area):
|
| 116 |
+
continue
|
| 117 |
+
except Exception:
|
| 118 |
+
pass
|
| 119 |
|
| 120 |
# Guardar el frame completo
|
| 121 |
save_path = self.faces_dir / f"frame_{saved_count:04d}.jpg"
|
|
|
|
| 284 |
|
| 285 |
return analysis_path
|
| 286 |
|
| 287 |
+
def detect_characters(self, epsilon: float = 0.5, min_cluster_size: int = 2,
|
| 288 |
+
*, start_offset_sec: float = 3.0, extract_every_sec: float = 0.5) -> Tuple[List[Dict], Path, np.ndarray, List[Dict[str, Any]]]:
|
| 289 |
"""
|
| 290 |
Pipeline completo de detección de personajes.
|
| 291 |
|
|
|
|
| 297 |
Tuple de (lista de personajes, path al analysis.json)
|
| 298 |
"""
|
| 299 |
# 1. Extraer caras y embeddings
|
| 300 |
+
embeddings_caras = self.extract_faces_embeddings(start_offset_sec=start_offset_sec, extract_every_sec=extract_every_sec)
|
| 301 |
|
| 302 |
# 2. Extraer voces y embeddings (opcional, por ahora)
|
| 303 |
embeddings_voices = self.extract_voices_embeddings()
|
|
|
|
| 314 |
# 6. Crear carpetas de personajes
|
| 315 |
characters = self.create_character_folders(embeddings_caras, labels)
|
| 316 |
|
| 317 |
+
return characters, analysis_path, labels, embeddings_caras
|
| 318 |
|
| 319 |
|
| 320 |
# Función de conveniencia para usar en el API
|
| 321 |
def detect_characters_from_video(video_path: str, output_base: str,
|
| 322 |
epsilon: float = 0.5, min_cluster_size: int = 2,
|
| 323 |
+
video_name: str = None,
|
| 324 |
+
*, start_offset_sec: float = 3.0, extract_every_sec: float = 0.5) -> Dict[str, Any]:
|
| 325 |
"""
|
| 326 |
Función de alto nivel para detectar personajes en un vídeo.
|
| 327 |
|
|
|
|
| 336 |
Dict con resultados: {"characters": [...], "analysis_path": "..."}
|
| 337 |
"""
|
| 338 |
detector = CharacterDetector(video_path, Path(output_base), video_name=video_name)
|
| 339 |
+
characters, analysis_path, labels, embeddings_caras = detector.detect_characters(epsilon, min_cluster_size,
|
| 340 |
+
start_offset_sec=start_offset_sec,
|
| 341 |
+
extract_every_sec=extract_every_sec)
|
| 342 |
|
| 343 |
return {
|
| 344 |
"characters": characters,
|
| 345 |
"analysis_path": str(analysis_path),
|
| 346 |
+
"num_characters": len(characters),
|
| 347 |
+
"face_labels": labels.tolist() if isinstance(labels, np.ndarray) else list(labels),
|
| 348 |
+
"num_face_embeddings": len(embeddings_caras)
|
| 349 |
}
|