VeuReu committed · Commit 924dc7a · verified · 1 Parent(s): c8c329a

Upload 3 files

Files changed (3):
  1. asr_client.py +62 -0
  2. preprocessing_router.py +131 -162
  3. svision_client.py +67 -0
asr_client.py CHANGED
@@ -138,3 +138,65 @@ def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
         api_name="/identificar_veu"
     )
     return result
+
+
+def get_voice_embedding(audio_path: str) -> List[float]:
+    """
+    Call the /voice_embedding endpoint to get a voice embedding vector.
+
+    This replaces local SpeakerRecognition processing by delegating to the asr Space.
+
+    Parameters
+    ----------
+    audio_path : str
+        Path to the audio file (WAV format preferred).
+
+    Returns
+    -------
+    List[float]
+        Normalized embedding vector for the voice, or an empty list on error.
+    """
+    try:
+        result = _get_asr_client().predict(
+            wav_archivo=handle_file(audio_path),
+            api_name="/voice_embedding"
+        )
+        return result if result else []
+    except Exception as e:
+        print(f"[asr_client] get_voice_embedding error: {e}")
+        return []
+
+
+def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
+    """
+    Extract audio from video and perform diarization in one call.
+
+    Parameters
+    ----------
+    video_path : str
+        Path to the input video file.
+
+    Returns
+    -------
+    Dict[str, Any]
+        Dictionary with 'clips' (list of audio file paths) and 'segments' (diarization info).
+    """
+    try:
+        # First extract audio
+        audio_path = extract_audio_from_video(video_path)
+        if not audio_path:
+            return {"clips": [], "segments": [], "error": "Audio extraction failed"}
+
+        # Then diarize; result is a tuple: (clip_paths, segments)
+        result = diarize_audio(audio_path)
+        if result and len(result) >= 2:
+            return {
+                "clips": result[0] if result[0] else [],
+                "segments": result[1] if result[1] else [],
+                "audio_path": audio_path,
+            }
+        return {"clips": [], "segments": [], "audio_path": audio_path}
+    except Exception as e:
+        print(f"[asr_client] extract_audio_and_diarize error: {e}")
+        return {"clips": [], "segments": [], "error": str(e)}
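For reference, a minimal usage sketch of the two new asr_client helpers as a caller might consume them; the calling context and the video path are hypothetical, not part of this commit:

    # Hypothetical caller: diarize a video, then embed each speaker clip.
    import asr_client

    result = asr_client.extract_audio_and_diarize("/tmp/veureu/demo/video.mp4")  # hypothetical path
    if result.get("error"):
        print("diarization failed:", result["error"])
    else:
        # One embedding per diarized clip; failed clips come back as [].
        voice_embeddings = [asr_client.get_voice_embedding(clip) for clip in result["clips"]]
        print(f"{len(voice_embeddings)} embeddings from {len(result['segments'])} segments")
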
preprocessing_router.py CHANGED
@@ -5,21 +5,20 @@ from fastapi.responses import FileResponse
 from pathlib import Path
 from datetime import datetime
 from enum import Enum
-from typing import Dict, Any
+from typing import Dict, Any, List
 import shutil
 import os
 import uuid
 import numpy as np
 import cv2
+import tempfile
 
-from video_processing import process_video_pipeline
-from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments, VoiceEmbedder
 from casting_loader import ensure_chroma, build_faces_index, build_voices_index
-from narration_system import NarrationSystem
 from llm_router import load_yaml, LLMRouter
-from character_detection import detect_characters_from_video
-from vision_tools import FaceOfImageEmbedding
-from pipelines.audiodescription import generate as ad_generate
+
+# External space clients (no local GPU needed)
+import svision_client
+import asr_client
 
 
 ROOT = Path("/tmp/veureu")
@@ -43,26 +42,9 @@ jobs: Dict[str, dict] = {}
 
 
 # ---------------------------------------------------------------------------
-# Helper functions for face detection and clustering
+# Helper function for clustering (only math, no GPU)
 # ---------------------------------------------------------------------------
 
-def normalize_face_lighting(image):
-    """Normalize face brightness using CLAHE and range normalization."""
-    lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
-    l, a, b = cv2.split(lab)
-    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
-    l_clahe = clahe.apply(l)
-    l_min, l_max = l_clahe.min(), l_clahe.max()
-    if l_max > l_min:
-        l_normalized = ((l_clahe - l_min) * 255.0 / (l_max - l_min)).astype(np.uint8)
-    else:
-        l_normalized = l_clahe
-    l_normalized = cv2.GaussianBlur(l_normalized, (3, 3), 0)
-    lab_normalized = cv2.merge([l_normalized, a, b])
-    normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2BGR)
-    return normalized
-
-
 def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
     """Hierarchical clustering with silhouette score and minimum cluster size."""
     from scipy.cluster.hierarchy import linkage, fcluster
@@ -412,10 +394,18 @@ async def detect_scenes(
 
 
 def process_video_job(job_id: str):
-    """Process video job in background: detect faces, cluster, validate."""
+    """
+    Process video job in background using EXTERNAL spaces (svision, asr).
+
+    NO local GPU needed - all vision/audio processing is delegated to:
+    - svision: face detection + embeddings (MTCNN + FaceNet)
+    - asr: audio diarization + voice embeddings (pyannote + ECAPA)
+
+    Engine only does: frame extraction, clustering (math), file organization.
+    """
     try:
         job = jobs[job_id]
-        print(f"[{job_id}] Iniciando procesamiento...")
+        print(f"[{job_id}] Iniciando procesamiento (delegando a svision/asr)...")
 
         job["status"] = JobStatus.PROCESSING
 
@@ -430,23 +420,15 @@ def process_video_job(job_id: str):
         print(f"[{job_id}] Directorio base: {base}")
 
         try:
-            print(f"[{job_id}] Iniciando detección de personajes...")
-            try:
-                import face_recognition
-                _use_fr = True
-                print(f"[{job_id}] face_recognition disponible: CPU")
-            except Exception:
-                face_recognition = None
-                _use_fr = False
-                print(f"[{job_id}] face_recognition no disponible. Intentando DeepFace fallback.")
-            try:
-                from deepface import DeepFace
-            except Exception:
-                DeepFace = None
-
+            # ============================================================
+            # STEP 1: Extract frames from video (local, simple cv2)
+            # ============================================================
+            print(f"[{job_id}] Extrayendo frames del vídeo...")
+
             cap = cv2.VideoCapture(video_path)
             if not cap.isOpened():
-                raise RuntimeError("No se pudo abrir el vídeo para extracción de caras")
+                raise RuntimeError("No se pudo abrir el vídeo")
+
             fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
             total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
             max_samples = job.get("max_frames", 100)
@@ -455,100 +437,98 @@ def process_video_job(job_id: str):
                 frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=min(max_samples, max(1, total_frames)), dtype=int).tolist()))
             else:
                 frame_indices = []
+
             print(f"[{job_id}] Total frames: {total_frames}, FPS: {fps:.2f}, Muestreando {len(frame_indices)} frames")
 
+            # Save frames temporarily for svision processing
+            frames_dir = base / "frames_temp"
+            frames_dir.mkdir(parents=True, exist_ok=True)
             faces_root = base / "faces_raw"
            faces_root.mkdir(parents=True, exist_ok=True)
-            embeddings: list[list[float]] = []
-            crops_meta: list[dict] = []
-
-            saved_count = 0
-            frames_processed = 0
-            frames_with_faces = 0
 
+            frame_paths: List[str] = []
             for frame_idx in frame_indices:
                 cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
-                ret2, frame = cap.read()
-                if not ret2:
+                ret, frame = cap.read()
+                if not ret:
                     continue
-                frames_processed += 1
-                frame_normalized = normalize_face_lighting(frame)
-                rgb = cv2.cvtColor(frame_normalized, cv2.COLOR_BGR2RGB)
-
-                if _use_fr and face_recognition is not None:
-                    boxes = face_recognition.face_locations(rgb, model="hog")
-                    encs = face_recognition.face_encodings(rgb, boxes)
-                    if boxes:
-                        frames_with_faces += 1
-                    for (top, right, bottom, left), e in zip(boxes, encs):
-                        crop = frame_normalized[top:bottom, left:right]
-                        if crop.size == 0:
-                            continue
-                        fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
-                        cv2.imwrite(str(faces_root / fn), crop)
-                        e = np.array(e, dtype=float)
-                        e = e / (np.linalg.norm(e) + 1e-9)
-                        embeddings.append(e.astype(float).tolist())
-                        crops_meta.append({"file": fn, "frame": frame_idx, "box": [int(top), int(right), int(bottom), int(left)]})
-                        saved_count += 1
-                else:
-                    if DeepFace is not None:
-                        try:
-                            gray = cv2.cvtColor(frame_normalized, cv2.COLOR_BGR2GRAY)
-                            haar_path = getattr(cv2.data, 'haarcascades', None) or ''
-                            face_cascade = cv2.CascadeClassifier(os.path.join(haar_path, 'haarcascade_frontalface_default.xml'))
-                            boxes_haar = []
-                            if face_cascade is not None and not face_cascade.empty():
-                                faces_haar = face_cascade.detectMultiScale(gray, scaleFactor=1.08, minNeighbors=5, minSize=(50, 50))
-                                for (x, y, w, h) in faces_haar:
-                                    top, left, bottom, right = max(0, y), max(0, x), min(frame.shape[0], y+h), min(frame.shape[1], x+w)
-                                    boxes_haar.append((top, right, bottom, left))
-
-                            if boxes_haar:
-                                frames_with_faces += 1
-
-                            for (top, right, bottom, left) in boxes_haar:
-                                crop = frame_normalized[top:bottom, left:right]
-                                if crop.size == 0:
-                                    continue
-                                fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
-                                crop_path = faces_root / fn
-                                cv2.imwrite(str(crop_path), crop)
-                                reps = DeepFace.represent(img_path=str(crop_path), model_name="Facenet512", enforce_detection=False)
-                                for r in (reps or []):
-                                    emb = r.get("embedding") if isinstance(r, dict) else r
-                                    if emb is None:
-                                        continue
-                                    emb = np.array(emb, dtype=float)
-                                    emb = emb / (np.linalg.norm(emb) + 1e-9)
-                                    embeddings.append(emb.astype(float).tolist())
-                                    crops_meta.append({"file": fn, "frame": frame_idx, "box": [int(top), int(right), int(bottom), int(left)]})
-                                    saved_count += 1
-                        except Exception as _e_df:
-                            print(f"[{job_id}] DeepFace fallback error: {_e_df}")
+                frame_path = frames_dir / f"frame_{frame_idx:06d}.jpg"
+                cv2.imwrite(str(frame_path), frame)
+                frame_paths.append(str(frame_path))
             cap.release()
+
+            print(f"[{job_id}] ✓ {len(frame_paths)} frames extraídos")
+
+            # ============================================================
+            # STEP 2: Send frames to SVISION for face detection + embeddings
+            # ============================================================
+            print(f"[{job_id}] Enviando frames a svision para detección de caras...")
+
+            embeddings: List[List[float]] = []
+            crops_meta: List[dict] = []
+            saved_count = 0
+            frames_with_faces = 0
+
+            for i, frame_path in enumerate(frame_paths):
+                frame_idx = frame_indices[i] if i < len(frame_indices) else i
+                try:
+                    # Call svision to get faces + embeddings
+                    faces = svision_client.get_face_embeddings_from_image(frame_path)
+
+                    if faces:
+                        frames_with_faces += 1
+                        for face_data in faces:
+                            emb = face_data.get("embedding", [])
+                            if not emb:
+                                continue
+
+                            # Normalize embedding
+                            emb = np.array(emb, dtype=float)
+                            emb = emb / (np.linalg.norm(emb) + 1e-9)
+                            embeddings.append(emb.tolist())
+
+                            # Save face crop if provided by svision
+                            crop_path = face_data.get("face_crop_path")
+                            fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
+                            local_crop_path = faces_root / fn
+
+                            if crop_path and os.path.exists(crop_path):
+                                shutil.copy2(crop_path, local_crop_path)
+                            else:
+                                # If no crop from svision, use original frame
+                                shutil.copy2(frame_path, local_crop_path)
+
+                            crops_meta.append({
+                                "file": fn,
+                                "frame": frame_idx,
+                                "index": face_data.get("index", saved_count),
+                            })
+                            saved_count += 1
+
+                except Exception as e:
+                    print(f"[{job_id}] Error procesando frame {frame_idx}: {e}")
+                    continue
 
-            print(f"[{job_id}] ✓ Frames procesados: {frames_processed}/{len(frame_indices)}")
-            print(f"[{job_id}] ✓ Frames con caras: {frames_with_faces}")
+            print(f"[{job_id}] ✓ Frames con caras: {frames_with_faces}/{len(frame_paths)}")
             print(f"[{job_id}] ✓ Caras detectadas: {len(embeddings)}")
 
-            # Clustering
+            # ============================================================
+            # STEP 3: Clustering (local, only math - no GPU)
+            # ============================================================
             if embeddings:
+                print(f"[{job_id}] Clustering jerárquico...")
                 Xf = np.array(embeddings)
                 labels = hierarchical_cluster_with_min_size(Xf, max_groups, min_cluster_size, face_sensitivity).tolist()
-                print(f"[{job_id}] Clustering: {len(set([l for l in labels if l >= 0]))} clusters")
+                n_clusters = len(set([l for l in labels if l >= 0]))
+                print(f"[{job_id}] ✓ Clustering: {n_clusters} clusters")
             else:
                 labels = []
 
-            # Build character folders with validation
-            try:
-                from face_classifier import validate_and_classify_face, FACE_CONFIDENCE_THRESHOLD
-            except ImportError:
-                validate_and_classify_face = None
-                FACE_CONFIDENCE_THRESHOLD = 0.5
-
-            characters_validated: list[dict[str, Any]] = []
-            cluster_map: dict[int, list[int]] = {}
+            # ============================================================
+            # STEP 4: Organize faces into character folders
+            # ============================================================
+            characters: List[Dict[str, Any]] = []
+            cluster_map: Dict[int, List[int]] = {}
             for idx, lbl in enumerate(labels):
                 if isinstance(lbl, int) and lbl >= 0:
                     cluster_map.setdefault(lbl, []).append(idx)
@@ -558,55 +538,40 @@ def process_video_job(job_id: str):
 
             for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
                 char_id = f"char_{ci:02d}"
-                detections: list[dict[str, Any]] = []
-                for j in idxs:
-                    meta = crops_meta[j]
-                    file_name = meta.get("file")
-                    if not file_name:
-                        continue
-                    box = meta.get("box", [0, 0, 0, 0])
-                    area = abs(box[1] - box[3]) * abs(box[2] - box[0]) if len(box) >= 4 else 0
-                    detections.append({"index": j, "file": file_name, "score": area, "box": box})
-
-                if not detections:
+
+                if not idxs:
                     continue
 
-                detections.sort(key=lambda d: d["score"], reverse=True)
-                best_face = detections[0]
-                best_face_path = faces_root / best_face["file"]
-
-                # Validation (optional)
-                validation = None
-                if validate_and_classify_face is not None:
-                    try:
-                        validation = validate_and_classify_face(str(best_face_path))
-                    except Exception:
-                        validation = None
-
-                if validation and not validation.get("is_valid_face", True):
-                    if validation.get("face_confidence", 1.0) < FACE_CONFIDENCE_THRESHOLD:
-                        continue
-
                 out_dir = chars_dir / char_id
                 out_dir.mkdir(parents=True, exist_ok=True)
 
-                total_faces = len(detections)
+                # Select faces to show (half + 1)
+                total_faces = len(idxs)
                 max_faces_to_show = (total_faces // 2) + 1
-                selected = detections[:max_faces_to_show]
+                selected_idxs = idxs[:max_faces_to_show]
 
-                files: list[str] = []
-                file_urls: list[str] = []
-                for det in selected:
-                    fname = det["file"]
+                files: List[str] = []
+                file_urls: List[str] = []
+
+                for j in selected_idxs:
+                    if j >= len(crops_meta):
+                        continue
+                    meta = crops_meta[j]
+                    fname = meta.get("file")
+                    if not fname:
+                        continue
+
                     src = faces_root / fname
                     dst = out_dir / fname
                     try:
-                        shutil.copy2(src, dst)
-                        files.append(fname)
-                        file_urls.append(f"/files/{video_name}/{char_id}/{fname}")
+                        if src.exists():
+                            shutil.copy2(src, dst)
+                            files.append(fname)
+                            file_urls.append(f"/files/{video_name}/{char_id}/{fname}")
                     except Exception:
                         pass
 
+                # Create representative image
                 rep = files[0] if files else None
                 if rep:
                     try:
@@ -614,14 +579,12 @@ def process_video_job(job_id: str):
                     except Exception:
                         pass
 
-                cluster_number = int(char_id.split("_")[1]) + 1
+                cluster_number = ci + 1
                 character_name = f"Cluster {cluster_number}"
-                gender = validation.get("gender", "Neutral") if validation else "Neutral"
 
-                characters_validated.append({
+                characters.append({
                     "id": char_id,
                     "name": character_name,
-                    "gender": gender,
                     "folder": str(out_dir),
                     "num_faces": len(files),
                     "total_faces_detected": total_faces,
@@ -630,10 +593,16 @@ def process_video_job(job_id: str):
                 })
                 print(f"[{job_id}] ✓ Cluster {char_id}: {len(files)} caras")
 
-            print(f"[{job_id}] Total: {len(characters_validated)} personajes válidos")
+            # Cleanup temp frames
+            try:
+                shutil.rmtree(frames_dir)
+            except Exception:
+                pass
+
+            print(f"[{job_id}] ✓ Total: {len(characters)} personajes")
 
             job["results"] = {
-                "characters": characters_validated,
+                "characters": characters,
                 "face_labels": labels,
                 "video_name": video_name,
                 "base_dir": str(base),
@@ -641,8 +610,8 @@ def process_video_job(job_id: str):
             job["status"] = JobStatus.DONE
             print(f"[{job_id}] ✓ Procesamiento completado")
 
-        except Exception as face_error:
-            print(f"[{job_id}] Error en detección de caras: {face_error}")
+        except Exception as proc_error:
+            print(f"[{job_id}] Error en procesamiento: {proc_error}")
             import traceback
             traceback.print_exc()
             job["results"] = {"characters": [], "face_labels": [], "video_name": video_name, "base_dir": str(base)}
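The body of hierarchical_cluster_with_min_size appears in this diff only as context (signature, docstring, and scipy import). For orientation, a minimal sketch of the silhouette-guided selection with a minimum cluster size that the docstring describes; the average/cosine linkage, the -1 noise label, and the omission of the sensitivity parameter are assumptions here, not the committed implementation:

    # Sketch only: silhouette-guided agglomerative clustering with a
    # minimum cluster size. Details are assumptions, not the committed code.
    import numpy as np
    from scipy.cluster.hierarchy import linkage, fcluster
    from sklearn.metrics import silhouette_score

    def cluster_sketch(X: np.ndarray, max_groups: int, min_cluster_size: int) -> np.ndarray:
        Z = linkage(X, method="average", metric="cosine")
        best_labels = np.ones(len(X), dtype=int)  # fallback: a single cluster
        best_score = -1.0
        # Try each cluster count and keep the one with the best silhouette.
        for k in range(2, min(max_groups, len(X) - 1) + 1):
            labels = fcluster(Z, t=k, criterion="maxclust")
            if len(set(labels)) < 2:
                continue
            score = silhouette_score(X, labels, metric="cosine")
            if score > best_score:
                best_labels, best_score = labels, score
        # Mark clusters below the minimum size as noise (-1), matching the
        # router's convention of skipping labels < 0.
        out = best_labels.astype(int)
        for lbl, count in zip(*np.unique(out, return_counts=True)):
            if count < min_cluster_size:
                out[out == lbl] = -1
        return out
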
svision_client.py CHANGED
@@ -121,3 +121,70 @@ def extract_descripcion_escena(imagen_path: str) -> str:
         api_name="/describe_images"
     )
     return result
+
+
+def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
+    """
+    Call the /face_image_embedding_casting endpoint to detect faces and get embeddings.
+
+    This replaces local DeepFace/face_recognition processing by delegating to the svision Space.
+
+    Parameters
+    ----------
+    image_path : str
+        Path to the input image file (a video frame).
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        List of dicts with 'embedding' (list of floats) and 'face_crop_path' (image path).
+        Returns an empty list if no faces are detected or on error.
+    """
+    try:
+        # Endpoint returns a tuple: (face_crops: list of image paths, face_embeddings: list of dicts)
+        result = _get_svision_client().predict(
+            image=handle_file(image_path),
+            api_name="/face_image_embedding_casting"
+        )
+        if result and len(result) >= 2:
+            face_crops = result[0] if result[0] else []
+            face_embeddings = result[1] if result[1] else []
+            # Combine into a unified structure
+            faces = []
+            for i, emb_dict in enumerate(face_embeddings):
+                faces.append({
+                    "embedding": emb_dict.get("embedding", []),
+                    "face_crop_path": face_crops[i] if i < len(face_crops) else None,
+                    "index": emb_dict.get("index", i),
+                })
+            return faces
+        return []
+    except Exception as e:
+        print(f"[svision_client] get_face_embeddings_from_image error: {e}")
+        return []
+
+
+def get_face_embeddings_simple(image_path: str) -> List[List[float]]:
+    """
+    Call the /face_image_embedding endpoint to get face embeddings only.
+
+    Parameters
+    ----------
+    image_path : str
+        Path to the input image file.
+
+    Returns
+    -------
+    List[List[float]]
+        List of embedding vectors (one per detected face).
+    """
+    try:
+        result = _get_svision_client().predict(
+            image=handle_file(image_path),
+            api_name="/face_image_embedding"
+        )
+        return result if result else []
+    except Exception as e:
+        print(f"[svision_client] get_face_embeddings_simple error: {e}")
+        return []
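As with the asr helpers, a short sketch of how a caller might consume the new svision_client function; the frame path is hypothetical, and the normalization line mirrors what preprocessing_router applies in STEP 2:

    # Hypothetical caller: detect faces in one extracted frame and keep
    # the normalized embeddings.
    import numpy as np
    import svision_client

    faces = svision_client.get_face_embeddings_from_image("/tmp/veureu/frames/frame_000001.jpg")  # hypothetical path
    for face in faces:
        emb = np.asarray(face["embedding"], dtype=float)
        emb = emb / (np.linalg.norm(emb) + 1e-9)  # same normalization the router uses
        print(face["index"], face["face_crop_path"], emb.shape)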