Upload 3 files

- asr_client.py +62 -0
- preprocessing_router.py +131 -162
- svision_client.py +67 -0
asr_client.py CHANGED

@@ -138,3 +138,65 @@ def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
         api_name="/identificar_veu"
     )
     return result
+
+
+def get_voice_embedding(audio_path: str) -> List[float]:
+    """
+    Call the /voice_embedding endpoint to get a voice embedding vector.
+
+    This replaces local SpeakerRecognition processing by delegating to the asr Space.
+
+    Parameters
+    ----------
+    audio_path : str
+        Path to the audio file (WAV format preferred).
+
+    Returns
+    -------
+    List[float]
+        Normalized embedding vector for the voice, or an empty list on error.
+    """
+    try:
+        result = _get_asr_client().predict(
+            wav_archivo=handle_file(audio_path),
+            api_name="/voice_embedding"
+        )
+        return result if result else []
+    except Exception as e:
+        print(f"[asr_client] get_voice_embedding error: {e}")
+        return []
+
+
+def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
+    """
+    Extract audio from video and perform diarization in one call.
+
+    Parameters
+    ----------
+    video_path : str
+        Path to the input video file.
+
+    Returns
+    -------
+    Dict[str, Any]
+        Dictionary with 'clips' (list of audio file paths) and 'segments' (diarization info).
+    """
+    try:
+        # First extract audio
+        audio_path = extract_audio_from_video(video_path)
+        if not audio_path:
+            return {"clips": [], "segments": [], "error": "Audio extraction failed"}
+
+        # Then diarize
+        result = diarize_audio(audio_path)
+        # result is a tuple: (clip_paths, segments)
+        if result and len(result) >= 2:
+            return {
+                "clips": result[0] if result[0] else [],
+                "segments": result[1] if result[1] else [],
+                "audio_path": audio_path,
+            }
+        return {"clips": [], "segments": [], "audio_path": audio_path}
+    except Exception as e:
+        print(f"[asr_client] extract_audio_and_diarize error: {e}")
+        return {"clips": [], "segments": [], "error": str(e)}
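A minimal usage sketch for the two new helpers (the file paths are hypothetical, and the asr Space behind _get_asr_client() must be reachable):

import asr_client

# Voice embedding for one clip: a normalized vector, or [] on error.
emb = asr_client.get_voice_embedding("/tmp/veureu/clips/clip_000.wav")
print(f"embedding dims: {len(emb)}")

# Audio extraction + diarization in one call; always returns a dict,
# with an "error" key on failure.
res = asr_client.extract_audio_and_diarize("/tmp/veureu/input.mp4")
print(len(res.get("clips", [])), "clips,", len(res.get("segments", [])), "segments, error =", res.get("error"))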
preprocessing_router.py CHANGED

@@ -5,21 +5,20 @@ from fastapi.responses import FileResponse
 from pathlib import Path
 from datetime import datetime
 from enum import Enum
-from typing import Dict, Any
+from typing import Dict, Any, List
 import shutil
 import os
 import uuid
 import numpy as np
 import cv2
+import tempfile

-from video_processing import process_video_pipeline
-from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments, VoiceEmbedder
 from casting_loader import ensure_chroma, build_faces_index, build_voices_index
-from narration_system import NarrationSystem
 from llm_router import load_yaml, LLMRouter
-
-
-
+
+# External space clients (no local GPU needed)
+import svision_client
+import asr_client


 ROOT = Path("/tmp/veureu")

@@ -43,26 +42,9 @@ jobs: Dict[str, dict] = {}


 # ---------------------------------------------------------------------------
-# Helper
+# Helper function for clustering (only math, no GPU)
 # ---------------------------------------------------------------------------

-def normalize_face_lighting(image):
-    """Normalize face brightness using CLAHE and range normalization."""
-    lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
-    l, a, b = cv2.split(lab)
-    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
-    l_clahe = clahe.apply(l)
-    l_min, l_max = l_clahe.min(), l_clahe.max()
-    if l_max > l_min:
-        l_normalized = ((l_clahe - l_min) * 255.0 / (l_max - l_min)).astype(np.uint8)
-    else:
-        l_normalized = l_clahe
-    l_normalized = cv2.GaussianBlur(l_normalized, (3, 3), 0)
-    lab_normalized = cv2.merge([l_normalized, a, b])
-    normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2BGR)
-    return normalized
-
-
 def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
     """Hierarchical clustering with silhouette score and minimum cluster size."""
     from scipy.cluster.hierarchy import linkage, fcluster

@@ -412,10 +394,18 @@ async def detect_scenes(


 def process_video_job(job_id: str):
-    """
+    """
+    Process video job in background using EXTERNAL spaces (svision, asr).
+
+    NO local GPU needed - all vision/audio processing is delegated to:
+    - svision: face detection + embeddings (MTCNN + FaceNet)
+    - asr: audio diarization + voice embeddings (pyannote + ECAPA)
+
+    Engine only does: frame extraction, clustering (math), file organization.
+    """
     try:
         job = jobs[job_id]
-        print(f"[{job_id}] Iniciando procesamiento...")
+        print(f"[{job_id}] Iniciando procesamiento (delegando a svision/asr)...")

         job["status"] = JobStatus.PROCESSING

@@ -430,23 +420,15 @@ def process_video_job(job_id: str):
         print(f"[{job_id}] Directorio base: {base}")

         try:
-
-
-
-
-
-            except Exception:
-                face_recognition = None
-                _use_fr = False
-                print(f"[{job_id}] face_recognition no disponible. Intentando DeepFace fallback.")
-                try:
-                    from deepface import DeepFace
-                except Exception:
-                    DeepFace = None
-
+            # ============================================================
+            # STEP 1: Extract frames from video (local, simple cv2)
+            # ============================================================
+            print(f"[{job_id}] Extrayendo frames del vídeo...")
+
             cap = cv2.VideoCapture(video_path)
             if not cap.isOpened():
-                raise RuntimeError("No se pudo abrir el vídeo
+                raise RuntimeError("No se pudo abrir el vídeo")
+
             fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
             total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
             max_samples = job.get("max_frames", 100)

@@ -455,100 +437,98 @@ def process_video_job(job_id: str):
             frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=min(max_samples, max(1, total_frames)), dtype=int).tolist()))
         else:
             frame_indices = []
+
         print(f"[{job_id}] Total frames: {total_frames}, FPS: {fps:.2f}, Muestreando {len(frame_indices)} frames")

+        # Save frames temporarily for svision processing
+        frames_dir = base / "frames_temp"
+        frames_dir.mkdir(parents=True, exist_ok=True)
         faces_root = base / "faces_raw"
         faces_root.mkdir(parents=True, exist_ok=True)
-        embeddings: list[list[float]] = []
-        crops_meta: list[dict] = []
-
-        saved_count = 0
-        frames_processed = 0
-        frames_with_faces = 0

+        frame_paths: List[str] = []
         for frame_idx in frame_indices:
             cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
-
-            if not
+            ret, frame = cap.read()
+            if not ret:
                 continue
-
-
-
-
-            if _use_fr and face_recognition is not None:
-                boxes = face_recognition.face_locations(rgb, model="hog")
-                encs = face_recognition.face_encodings(rgb, boxes)
-                if boxes:
-                    frames_with_faces += 1
-                for (top, right, bottom, left), e in zip(boxes, encs):
-                    crop = frame_normalized[top:bottom, left:right]
-                    if crop.size == 0:
-                        continue
-                    fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
-                    cv2.imwrite(str(faces_root / fn), crop)
-                    e = np.array(e, dtype=float)
-                    e = e / (np.linalg.norm(e) + 1e-9)
-                    embeddings.append(e.astype(float).tolist())
-                    crops_meta.append({"file": fn, "frame": frame_idx, "box": [int(top), int(right), int(bottom), int(left)]})
-                    saved_count += 1
-            else:
-                if DeepFace is not None:
-                    try:
-                        gray = cv2.cvtColor(frame_normalized, cv2.COLOR_BGR2GRAY)
-                        haar_path = getattr(cv2.data, 'haarcascades', None) or ''
-                        face_cascade = cv2.CascadeClassifier(os.path.join(haar_path, 'haarcascade_frontalface_default.xml'))
-                        boxes_haar = []
-                        if face_cascade is not None and not face_cascade.empty():
-                            faces_haar = face_cascade.detectMultiScale(gray, scaleFactor=1.08, minNeighbors=5, minSize=(50, 50))
-                            for (x, y, w, h) in faces_haar:
-                                top, left, bottom, right = max(0, y), max(0, x), min(frame.shape[0], y+h), min(frame.shape[1], x+w)
-                                boxes_haar.append((top, right, bottom, left))
-
-                        if boxes_haar:
-                            frames_with_faces += 1
-
-                        for (top, right, bottom, left) in boxes_haar:
-                            crop = frame_normalized[top:bottom, left:right]
-                            if crop.size == 0:
-                                continue
-                            fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
-                            crop_path = faces_root / fn
-                            cv2.imwrite(str(crop_path), crop)
-                            reps = DeepFace.represent(img_path=str(crop_path), model_name="Facenet512", enforce_detection=False)
-                            for r in (reps or []):
-                                emb = r.get("embedding") if isinstance(r, dict) else r
-                                if emb is None:
-                                    continue
-                                emb = np.array(emb, dtype=float)
-                                emb = emb / (np.linalg.norm(emb) + 1e-9)
-                                embeddings.append(emb.astype(float).tolist())
-                                crops_meta.append({"file": fn, "frame": frame_idx, "box": [int(top), int(right), int(bottom), int(left)]})
-                                saved_count += 1
-                    except Exception as _e_df:
-                        print(f"[{job_id}] DeepFace fallback error: {_e_df}")
+            frame_path = frames_dir / f"frame_{frame_idx:06d}.jpg"
+            cv2.imwrite(str(frame_path), frame)
+            frame_paths.append(str(frame_path))
         cap.release()
+
+        print(f"[{job_id}] ✓ {len(frame_paths)} frames extraídos")
+
+        # ============================================================
+        # STEP 2: Send frames to SVISION for face detection + embeddings
+        # ============================================================
+        print(f"[{job_id}] Enviando frames a svision para detección de caras...")
+
+        embeddings: List[List[float]] = []
+        crops_meta: List[dict] = []
+        saved_count = 0
+        frames_with_faces = 0
+
+        for i, frame_path in enumerate(frame_paths):
+            frame_idx = frame_indices[i] if i < len(frame_indices) else i
+            try:
+                # Call svision to get faces + embeddings
+                faces = svision_client.get_face_embeddings_from_image(frame_path)
+
+                if faces:
+                    frames_with_faces += 1
+                    for face_data in faces:
+                        emb = face_data.get("embedding", [])
+                        if not emb:
+                            continue
+
+                        # Normalize embedding
+                        emb = np.array(emb, dtype=float)
+                        emb = emb / (np.linalg.norm(emb) + 1e-9)
+                        embeddings.append(emb.tolist())
+
+                        # Save face crop if provided by svision
+                        crop_path = face_data.get("face_crop_path")
+                        fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
+                        local_crop_path = faces_root / fn
+
+                        if crop_path and os.path.exists(crop_path):
+                            shutil.copy2(crop_path, local_crop_path)
+                        else:
+                            # If no crop from svision, use original frame
+                            shutil.copy2(frame_path, local_crop_path)
+
+                        crops_meta.append({
+                            "file": fn,
+                            "frame": frame_idx,
+                            "index": face_data.get("index", saved_count),
+                        })
+                        saved_count += 1
+
+            except Exception as e:
+                print(f"[{job_id}] Error procesando frame {frame_idx}: {e}")
+                continue

-        print(f"[{job_id}] ✓ Frames
-        print(f"[{job_id}] ✓ Frames con caras: {frames_with_faces}")
+        print(f"[{job_id}] ✓ Frames con caras: {frames_with_faces}/{len(frame_paths)}")
         print(f"[{job_id}] ✓ Caras detectadas: {len(embeddings)}")

-        #
+        # ============================================================
+        # STEP 3: Clustering (local, only math - no GPU)
+        # ============================================================
         if embeddings:
+            print(f"[{job_id}] Clustering jerárquico...")
             Xf = np.array(embeddings)
             labels = hierarchical_cluster_with_min_size(Xf, max_groups, min_cluster_size, face_sensitivity).tolist()
-
+            n_clusters = len(set([l for l in labels if l >= 0]))
+            print(f"[{job_id}] ✓ Clustering: {n_clusters} clusters")
         else:
             labels = []

-        #
-
-
-
-
-        FACE_CONFIDENCE_THRESHOLD = 0.5
-
-        characters_validated: list[dict[str, Any]] = []
-        cluster_map: dict[int, list[int]] = {}
+        # ============================================================
+        # STEP 4: Organize faces into character folders
+        # ============================================================
+        characters: List[Dict[str, Any]] = []
+        cluster_map: Dict[int, List[int]] = {}
         for idx, lbl in enumerate(labels):
             if isinstance(lbl, int) and lbl >= 0:
                 cluster_map.setdefault(lbl, []).append(idx)

@@ -558,55 +538,40 @@ def process_video_job(job_id: str):

         for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
             char_id = f"char_{ci:02d}"
-
-
-                meta = crops_meta[j]
-                file_name = meta.get("file")
-                if not file_name:
-                    continue
-                box = meta.get("box", [0, 0, 0, 0])
-                area = abs(box[1] - box[3]) * abs(box[2] - box[0]) if len(box) >= 4 else 0
-                detections.append({"index": j, "file": file_name, "score": area, "box": box})
-
-            if not detections:
+
+            if not idxs:
                 continue

-            detections.sort(key=lambda d: d["score"], reverse=True)
-            best_face = detections[0]
-            best_face_path = faces_root / best_face["file"]
-
-            # Validation (optional)
-            validation = None
-            if validate_and_classify_face is not None:
-                try:
-                    validation = validate_and_classify_face(str(best_face_path))
-                except Exception:
-                    validation = None
-
-            if validation and not validation.get("is_valid_face", True):
-                if validation.get("face_confidence", 1.0) < FACE_CONFIDENCE_THRESHOLD:
-                    continue
-
             out_dir = chars_dir / char_id
             out_dir.mkdir(parents=True, exist_ok=True)

-
+            # Select faces to show (half + 1)
+            total_faces = len(idxs)
             max_faces_to_show = (total_faces // 2) + 1
-
+            selected_idxs = idxs[:max_faces_to_show]

-            files:
-            file_urls:
-
-
+            files: List[str] = []
+            file_urls: List[str] = []
+
+            for j in selected_idxs:
+                if j >= len(crops_meta):
+                    continue
+                meta = crops_meta[j]
+                fname = meta.get("file")
+                if not fname:
+                    continue
+
                 src = faces_root / fname
                 dst = out_dir / fname
                 try:
-
-
-
+                    if src.exists():
+                        shutil.copy2(src, dst)
+                        files.append(fname)
+                        file_urls.append(f"/files/{video_name}/{char_id}/{fname}")
                 except Exception:
                     pass

+            # Create representative image
             rep = files[0] if files else None
             if rep:
                 try:

@@ -614,14 +579,12 @@ def process_video_job(job_id: str):
                 except Exception:
                     pass

-            cluster_number =
+            cluster_number = ci + 1
             character_name = f"Cluster {cluster_number}"
-            gender = validation.get("gender", "Neutral") if validation else "Neutral"

-
+            characters.append({
                 "id": char_id,
                 "name": character_name,
-                "gender": gender,
                 "folder": str(out_dir),
                 "num_faces": len(files),
                 "total_faces_detected": total_faces,

@@ -630,10 +593,16 @@ def process_video_job(job_id: str):
             })
             print(f"[{job_id}] ✓ Cluster {char_id}: {len(files)} caras")

-
+        # Cleanup temp frames
+        try:
+            shutil.rmtree(frames_dir)
+        except Exception:
+            pass
+
+        print(f"[{job_id}] ✓ Total: {len(characters)} personajes")

         job["results"] = {
-            "characters":
+            "characters": characters,
             "face_labels": labels,
             "video_name": video_name,
             "base_dir": str(base),

@@ -641,8 +610,8 @@ def process_video_job(job_id: str):
         job["status"] = JobStatus.DONE
         print(f"[{job_id}] ✓ Procesamiento completado")

-    except Exception as
-        print(f"[{job_id}] Error en
+    except Exception as proc_error:
+        print(f"[{job_id}] Error en procesamiento: {proc_error}")
         import traceback
         traceback.print_exc()
         job["results"] = {"characters": [], "face_labels": [], "video_name": video_name, "base_dir": str(base)}
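Downstream code reads the finished job straight out of the in-memory jobs registry. A minimal polling sketch (the wait_for_results helper and the job id are hypothetical; jobs, JobStatus, and the result keys come from this file):

import time

def wait_for_results(job_id: str, timeout_s: float = 600.0) -> dict:
    # Poll until process_video_job flips the status to DONE (or we time out).
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        job = jobs.get(job_id, {})
        if job.get("status") == JobStatus.DONE:
            return job.get("results", {})
        time.sleep(1.0)
    return {}

for ch in wait_for_results("some-job-id").get("characters", []):
    print(ch["id"], ch["name"], ch["num_faces"], ch["total_faces_detected"])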
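The diff shows only the signature and docstring of hierarchical_cluster_with_min_size, so here is a minimal sketch of the technique the docstring names (silhouette-guided agglomerative clustering with a minimum cluster size), assuming scipy and scikit-learn; the real body may differ, and sensitivity is left unused in this sketch:

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import silhouette_score

def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
    """Sketch: pick the cluster count by silhouette, then drop small clusters."""
    X = np.asarray(X, dtype=float)
    if len(X) < 2:
        return np.zeros(len(X), dtype=int)
    Z = linkage(X, method="average", metric="cosine")
    best_labels = np.zeros(len(X), dtype=int)
    best_score = -1.0
    # Try k = 2..max_groups and keep the labeling with the best silhouette
    for k in range(2, min(max_groups, len(X) - 1) + 1):
        labels = fcluster(Z, t=k, criterion="maxclust") - 1
        if len(np.unique(labels)) < 2:
            continue
        score = silhouette_score(X, labels, metric="cosine")
        if score > best_score:
            best_labels, best_score = labels, score
    # Mark clusters below the minimum size as noise (-1), matching the
    # lbl >= 0 filtering done in process_video_job
    out = best_labels.copy()
    for c in np.unique(out):
        if np.sum(out == c) < min_cluster_size:
            out[out == c] = -1
    return out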
svision_client.py CHANGED

@@ -121,3 +121,70 @@ def extract_descripcion_escena(imagen_path: str) -> str:
         api_name="/describe_images"
     )
     return result
+
+
+def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
+    """
+    Call the /face_image_embedding_casting endpoint to detect faces and get embeddings.
+
+    This replaces local DeepFace/face_recognition processing by delegating to the svision Space.
+
+    Parameters
+    ----------
+    image_path : str
+        Path to the input image file (a video frame).
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        List of dicts with 'embedding' (list of floats) and 'face_crop' (image path).
+        Returns an empty list if no faces are detected or on error.
+    """
+    try:
+        # Returns: (face_crops: list of images, face_embeddings: list of dicts)
+        result = _get_svision_client().predict(
+            image=handle_file(image_path),
+            api_name="/face_image_embedding_casting"
+        )
+        # result is a tuple: (list of image paths, list of embedding dicts)
+        if result and len(result) >= 2:
+            face_crops = result[0] if result[0] else []
+            face_embeddings = result[1] if result[1] else []
+            # Combine into unified structure
+            faces = []
+            for i, emb_dict in enumerate(face_embeddings):
+                faces.append({
+                    "embedding": emb_dict.get("embedding", []),
+                    "face_crop_path": face_crops[i] if i < len(face_crops) else None,
+                    "index": emb_dict.get("index", i),
+                })
+            return faces
+        return []
+    except Exception as e:
+        print(f"[svision_client] get_face_embeddings_from_image error: {e}")
+        return []
+
+
+def get_face_embeddings_simple(image_path: str) -> List[List[float]]:
+    """
+    Call the /face_image_embedding endpoint to get face embeddings only.
+
+    Parameters
+    ----------
+    image_path : str
+        Path to the input image file.
+
+    Returns
+    -------
+    List[List[float]]
+        List of embedding vectors (one per detected face).
+    """
+    try:
+        result = _get_svision_client().predict(
+            image=handle_file(image_path),
+            api_name="/face_image_embedding"
+        )
+        return result if result else []
+    except Exception as e:
+        print(f"[svision_client] get_face_embeddings_simple error: {e}")
+        return []
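A quick smoke test for the new svision_client helpers against a single saved frame (the frame path is hypothetical, and the svision Space behind _get_svision_client() must be reachable):

import numpy as np
import svision_client

faces = svision_client.get_face_embeddings_from_image("/tmp/veureu/frames_temp/frame_000000.jpg")
for face in faces:
    emb = np.array(face["embedding"], dtype=float)
    emb = emb / (np.linalg.norm(emb) + 1e-9)  # same normalization as process_video_job
    print(face["index"], face["face_crop_path"], emb.shape)

# Or embeddings only, without crops:
vectors = svision_client.get_face_embeddings_simple("/tmp/veureu/frames_temp/frame_000000.jpg")
print(f"{len(vectors)} faces detected")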