Upload 3 files

- asr_client.py +62 -0
- preprocessing_router.py +131 -162
- svision_client.py +67 -0
asr_client.py CHANGED

@@ -138,3 +138,65 @@ def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
         api_name="/identificar_veu"
     )
     return result
+
+
+def get_voice_embedding(audio_path: str) -> List[float]:
+    """
+    Call the /voice_embedding endpoint to get a voice embedding vector.
+
+    This replaces local SpeakerRecognition processing by delegating to the asr Space.
+
+    Parameters
+    ----------
+    audio_path : str
+        Path to the audio file (WAV format preferred).
+
+    Returns
+    -------
+    List[float]
+        Normalized embedding vector for the voice, or an empty list on error.
+    """
+    try:
+        result = _get_asr_client().predict(
+            wav_archivo=handle_file(audio_path),
+            api_name="/voice_embedding"
+        )
+        return result if result else []
+    except Exception as e:
+        print(f"[asr_client] get_voice_embedding error: {e}")
+        return []
+
+
+def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
+    """
+    Extract audio from video and perform diarization in one call.
+
+    Parameters
+    ----------
+    video_path : str
+        Path to the input video file.
+
+    Returns
+    -------
+    Dict[str, Any]
+        Dictionary with 'clips' (list of audio file paths) and 'segments' (diarization info).
+    """
+    try:
+        # First extract audio
+        audio_path = extract_audio_from_video(video_path)
+        if not audio_path:
+            return {"clips": [], "segments": [], "error": "Audio extraction failed"}
+
+        # Then diarize
+        result = diarize_audio(audio_path)
+        # result is a tuple: (clip_paths, segments)
+        if result and len(result) >= 2:
+            return {
+                "clips": result[0] if result[0] else [],
+                "segments": result[1] if result[1] else [],
+                "audio_path": audio_path,
+            }
+        return {"clips": [], "segments": [], "audio_path": audio_path}
+    except Exception as e:
+        print(f"[asr_client] extract_audio_and_diarize error: {e}")
+        return {"clips": [], "segments": [], "error": str(e)}
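A minimal usage sketch for the two new helpers (the file paths are hypothetical, and the asr Space behind _get_asr_client() must be reachable):

import asr_client

# Voice embedding for one clip: a normalized vector, or [] on error.
emb = asr_client.get_voice_embedding("/tmp/veureu/clips/clip_000.wav")
print(f"embedding dims: {len(emb)}")

# Audio extraction + diarization in one call; always returns a dict,
# with an "error" key on failure.
res = asr_client.extract_audio_and_diarize("/tmp/veureu/input.mp4")
print(len(res.get("clips", [])), "clips,", len(res.get("segments", [])), "segments, error =", res.get("error"))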
preprocessing_router.py CHANGED

@@ -5,21 +5,20 @@ from fastapi.responses import FileResponse
 from pathlib import Path
 from datetime import datetime
 from enum import Enum
-from typing import Dict, Any
+from typing import Dict, Any, List
 import shutil
 import os
 import uuid
 import numpy as np
 import cv2
+import tempfile

-from video_processing import process_video_pipeline
-from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments, VoiceEmbedder
 from casting_loader import ensure_chroma, build_faces_index, build_voices_index
-from narration_system import NarrationSystem
 from llm_router import load_yaml, LLMRouter
-
-
-
+
+# External space clients (no local GPU needed)
+import svision_client
+import asr_client


 ROOT = Path("/tmp/veureu")

@@ -43,26 +42,9 @@ jobs: Dict[str, dict] = {}


 # ---------------------------------------------------------------------------
-# Helper
+# Helper function for clustering (only math, no GPU)
 # ---------------------------------------------------------------------------

-def normalize_face_lighting(image):
-    """Normalize face brightness using CLAHE and range normalization."""
-    lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
-    l, a, b = cv2.split(lab)
-    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
-    l_clahe = clahe.apply(l)
-    l_min, l_max = l_clahe.min(), l_clahe.max()
-    if l_max > l_min:
-        l_normalized = ((l_clahe - l_min) * 255.0 / (l_max - l_min)).astype(np.uint8)
-    else:
-        l_normalized = l_clahe
-    l_normalized = cv2.GaussianBlur(l_normalized, (3, 3), 0)
-    lab_normalized = cv2.merge([l_normalized, a, b])
-    normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2BGR)
-    return normalized
-
-
 def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
     """Hierarchical clustering with silhouette score and minimum cluster size."""
     from scipy.cluster.hierarchy import linkage, fcluster

@@ -412,10 +394,18 @@ async def detect_scenes(


 def process_video_job(job_id: str):
-    """
+    """
+    Process video job in background using EXTERNAL spaces (svision, asr).
+
+    NO local GPU needed - all vision/audio processing is delegated to:
+    - svision: face detection + embeddings (MTCNN + FaceNet)
+    - asr: audio diarization + voice embeddings (pyannote + ECAPA)
+
+    Engine only does: frame extraction, clustering (math), file organization.
+    """
     try:
         job = jobs[job_id]
-        print(f"[{job_id}] Iniciando procesamiento...")
+        print(f"[{job_id}] Iniciando procesamiento (delegando a svision/asr)...")

         job["status"] = JobStatus.PROCESSING

@@ -430,23 +420,15 @@ def process_video_job(job_id: str):
         print(f"[{job_id}] Directorio base: {base}")

         try:
-
-
-
-
-
-            except Exception:
-                face_recognition = None
-                _use_fr = False
-                print(f"[{job_id}] face_recognition no disponible. Intentando DeepFace fallback.")
-                try:
-                    from deepface import DeepFace
-                except Exception:
-                    DeepFace = None
-
+            # ============================================================
+            # STEP 1: Extract frames from video (local, simple cv2)
+            # ============================================================
+            print(f"[{job_id}] Extrayendo frames del vídeo...")
+
             cap = cv2.VideoCapture(video_path)
             if not cap.isOpened():
-                raise RuntimeError("No se pudo abrir el vídeo
+                raise RuntimeError("No se pudo abrir el vídeo")
+
             fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
             total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
             max_samples = job.get("max_frames", 100)

@@ -455,100 +437,98 @@ def process_video_job(job_id: str):
             frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=min(max_samples, max(1, total_frames)), dtype=int).tolist()))
         else:
             frame_indices = []
+
         print(f"[{job_id}] Total frames: {total_frames}, FPS: {fps:.2f}, Muestreando {len(frame_indices)} frames")

+        # Save frames temporarily for svision processing
+        frames_dir = base / "frames_temp"
+        frames_dir.mkdir(parents=True, exist_ok=True)
         faces_root = base / "faces_raw"
         faces_root.mkdir(parents=True, exist_ok=True)
-        embeddings: list[list[float]] = []
-        crops_meta: list[dict] = []
-
-        saved_count = 0
-        frames_processed = 0
-        frames_with_faces = 0

+        frame_paths: List[str] = []
         for frame_idx in frame_indices:
             cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
-
-            if not
+            ret, frame = cap.read()
+            if not ret:
                 continue
-
-
-
-
-            if _use_fr and face_recognition is not None:
-                boxes = face_recognition.face_locations(rgb, model="hog")
-                encs = face_recognition.face_encodings(rgb, boxes)
-                if boxes:
-                    frames_with_faces += 1
-                for (top, right, bottom, left), e in zip(boxes, encs):
-                    crop = frame_normalized[top:bottom, left:right]
-                    if crop.size == 0:
-                        continue
-                    fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
-                    cv2.imwrite(str(faces_root / fn), crop)
-                    e = np.array(e, dtype=float)
-                    e = e / (np.linalg.norm(e) + 1e-9)
-                    embeddings.append(e.astype(float).tolist())
-                    crops_meta.append({"file": fn, "frame": frame_idx, "box": [int(top), int(right), int(bottom), int(left)]})
-                    saved_count += 1
-            else:
-                if DeepFace is not None:
-                    try:
-                        gray = cv2.cvtColor(frame_normalized, cv2.COLOR_BGR2GRAY)
-                        haar_path = getattr(cv2.data, 'haarcascades', None) or ''
-                        face_cascade = cv2.CascadeClassifier(os.path.join(haar_path, 'haarcascade_frontalface_default.xml'))
-                        boxes_haar = []
-                        if face_cascade is not None and not face_cascade.empty():
-                            faces_haar = face_cascade.detectMultiScale(gray, scaleFactor=1.08, minNeighbors=5, minSize=(50, 50))
-                            for (x, y, w, h) in faces_haar:
-                                top, left, bottom, right = max(0, y), max(0, x), min(frame.shape[0], y+h), min(frame.shape[1], x+w)
-                                boxes_haar.append((top, right, bottom, left))
-
-                        if boxes_haar:
-                            frames_with_faces += 1
-
-                        for (top, right, bottom, left) in boxes_haar:
-                            crop = frame_normalized[top:bottom, left:right]
-                            if crop.size == 0:
-                                continue
-                            fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
-                            crop_path = faces_root / fn
-                            cv2.imwrite(str(crop_path), crop)
-                            reps = DeepFace.represent(img_path=str(crop_path), model_name="Facenet512", enforce_detection=False)
-                            for r in (reps or []):
-                                emb = r.get("embedding") if isinstance(r, dict) else r
-                                if emb is None:
-                                    continue
-                                emb = np.array(emb, dtype=float)
-                                emb = emb / (np.linalg.norm(emb) + 1e-9)
-                                embeddings.append(emb.astype(float).tolist())
-                                crops_meta.append({"file": fn, "frame": frame_idx, "box": [int(top), int(right), int(bottom), int(left)]})
-                                saved_count += 1
-                    except Exception as _e_df:
-                        print(f"[{job_id}] DeepFace fallback error: {_e_df}")
+            frame_path = frames_dir / f"frame_{frame_idx:06d}.jpg"
+            cv2.imwrite(str(frame_path), frame)
+            frame_paths.append(str(frame_path))
         cap.release()
+
+        print(f"[{job_id}] ✓ {len(frame_paths)} frames extraídos")
+
+        # ============================================================
+        # STEP 2: Send frames to SVISION for face detection + embeddings
+        # ============================================================
+        print(f"[{job_id}] Enviando frames a svision para detección de caras...")
+
+        embeddings: List[List[float]] = []
+        crops_meta: List[dict] = []
+        saved_count = 0
+        frames_with_faces = 0
+
+        for i, frame_path in enumerate(frame_paths):
+            frame_idx = frame_indices[i] if i < len(frame_indices) else i
+            try:
+                # Call svision to get faces + embeddings
+                faces = svision_client.get_face_embeddings_from_image(frame_path)
+
+                if faces:
+                    frames_with_faces += 1
+                    for face_data in faces:
+                        emb = face_data.get("embedding", [])
+                        if not emb:
+                            continue
+
+                        # Normalize embedding
+                        emb = np.array(emb, dtype=float)
+                        emb = emb / (np.linalg.norm(emb) + 1e-9)
+                        embeddings.append(emb.tolist())
+
+                        # Save face crop if provided by svision
+                        crop_path = face_data.get("face_crop_path")
+                        fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
+                        local_crop_path = faces_root / fn
+
+                        if crop_path and os.path.exists(crop_path):
+                            shutil.copy2(crop_path, local_crop_path)
+                        else:
+                            # If no crop from svision, use original frame
+                            shutil.copy2(frame_path, local_crop_path)
+
+                        crops_meta.append({
+                            "file": fn,
+                            "frame": frame_idx,
+                            "index": face_data.get("index", saved_count),
+                        })
+                        saved_count += 1
+
+            except Exception as e:
+                print(f"[{job_id}] Error procesando frame {frame_idx}: {e}")
+                continue

-        print(f"[{job_id}] ✓ Frames
-        print(f"[{job_id}] ✓ Frames con caras: {frames_with_faces}")
+        print(f"[{job_id}] ✓ Frames con caras: {frames_with_faces}/{len(frame_paths)}")
         print(f"[{job_id}] ✓ Caras detectadas: {len(embeddings)}")

-        #
+        # ============================================================
+        # STEP 3: Clustering (local, only math - no GPU)
+        # ============================================================
         if embeddings:
+            print(f"[{job_id}] Clustering jerárquico...")
             Xf = np.array(embeddings)
             labels = hierarchical_cluster_with_min_size(Xf, max_groups, min_cluster_size, face_sensitivity).tolist()
-
+            n_clusters = len(set([l for l in labels if l >= 0]))
+            print(f"[{job_id}] ✓ Clustering: {n_clusters} clusters")
         else:
             labels = []

-        #
-
-
-
-
-        FACE_CONFIDENCE_THRESHOLD = 0.5
-
-        characters_validated: list[dict[str, Any]] = []
-        cluster_map: dict[int, list[int]] = {}
+        # ============================================================
+        # STEP 4: Organize faces into character folders
+        # ============================================================
+        characters: List[Dict[str, Any]] = []
+        cluster_map: Dict[int, List[int]] = {}
         for idx, lbl in enumerate(labels):
             if isinstance(lbl, int) and lbl >= 0:
                 cluster_map.setdefault(lbl, []).append(idx)

@@ -558,55 +538,40 @@ def process_video_job(job_id: str):

         for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
             char_id = f"char_{ci:02d}"
-
-
-                meta = crops_meta[j]
-                file_name = meta.get("file")
-                if not file_name:
-                    continue
-                box = meta.get("box", [0, 0, 0, 0])
-                area = abs(box[1] - box[3]) * abs(box[2] - box[0]) if len(box) >= 4 else 0
-                detections.append({"index": j, "file": file_name, "score": area, "box": box})
-
-            if not detections:
+
+            if not idxs:
                 continue

-            detections.sort(key=lambda d: d["score"], reverse=True)
-            best_face = detections[0]
-            best_face_path = faces_root / best_face["file"]
-
-            # Validation (optional)
-            validation = None
-            if validate_and_classify_face is not None:
-                try:
-                    validation = validate_and_classify_face(str(best_face_path))
-                except Exception:
-                    validation = None
-
-            if validation and not validation.get("is_valid_face", True):
-                if validation.get("face_confidence", 1.0) < FACE_CONFIDENCE_THRESHOLD:
-                    continue
-
             out_dir = chars_dir / char_id
             out_dir.mkdir(parents=True, exist_ok=True)

-
+            # Select faces to show (half + 1)
+            total_faces = len(idxs)
             max_faces_to_show = (total_faces // 2) + 1
-
+            selected_idxs = idxs[:max_faces_to_show]

-            files:
-            file_urls:
-
-
+            files: List[str] = []
+            file_urls: List[str] = []
+
+            for j in selected_idxs:
+                if j >= len(crops_meta):
+                    continue
+                meta = crops_meta[j]
+                fname = meta.get("file")
+                if not fname:
+                    continue
+
                 src = faces_root / fname
                 dst = out_dir / fname
                 try:
-
-
-
+                    if src.exists():
+                        shutil.copy2(src, dst)
+                        files.append(fname)
+                        file_urls.append(f"/files/{video_name}/{char_id}/{fname}")
                 except Exception:
                     pass

+            # Create representative image
             rep = files[0] if files else None
             if rep:
                 try:

@@ -614,14 +579,12 @@ def process_video_job(job_id: str):
                 except Exception:
                     pass

-            cluster_number =
+            cluster_number = ci + 1
             character_name = f"Cluster {cluster_number}"
-            gender = validation.get("gender", "Neutral") if validation else "Neutral"

-
+            characters.append({
                 "id": char_id,
                 "name": character_name,
-                "gender": gender,
                 "folder": str(out_dir),
                 "num_faces": len(files),
                 "total_faces_detected": total_faces,

@@ -630,10 +593,16 @@ def process_video_job(job_id: str):
             })
             print(f"[{job_id}] ✓ Cluster {char_id}: {len(files)} caras")

-
+        # Cleanup temp frames
+        try:
+            shutil.rmtree(frames_dir)
+        except Exception:
+            pass
+
+        print(f"[{job_id}] ✓ Total: {len(characters)} personajes")

         job["results"] = {
-            "characters":
+            "characters": characters,
             "face_labels": labels,
             "video_name": video_name,
             "base_dir": str(base),

@@ -641,8 +610,8 @@ def process_video_job(job_id: str):
         job["status"] = JobStatus.DONE
         print(f"[{job_id}] ✓ Procesamiento completado")

-    except Exception as
-        print(f"[{job_id}] Error en
+    except Exception as proc_error:
+        print(f"[{job_id}] Error en procesamiento: {proc_error}")
         import traceback
         traceback.print_exc()
         job["results"] = {"characters": [], "face_labels": [], "video_name": video_name, "base_dir": str(base)}
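Downstream code reads the finished job straight out of the in-memory jobs registry. A minimal polling sketch (the wait_for_results helper and the job id are hypothetical; jobs, JobStatus, and the result keys come from this file):

import time

def wait_for_results(job_id: str, timeout_s: float = 600.0) -> dict:
    # Poll until process_video_job flips the status to DONE (or we time out).
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        job = jobs.get(job_id, {})
        if job.get("status") == JobStatus.DONE:
            return job.get("results", {})
        time.sleep(1.0)
    return {}

for ch in wait_for_results("some-job-id").get("characters", []):
    print(ch["id"], ch["name"], ch["num_faces"], ch["total_faces_detected"])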
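The diff shows only the signature and docstring of hierarchical_cluster_with_min_size, so here is a minimal sketch of the technique the docstring names (silhouette-guided agglomerative clustering with a minimum cluster size), assuming scipy and scikit-learn; the real body may differ, and sensitivity is left unused in this sketch:

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import silhouette_score

def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
    """Sketch: pick the cluster count by silhouette, then drop small clusters."""
    X = np.asarray(X, dtype=float)
    if len(X) < 2:
        return np.zeros(len(X), dtype=int)
    Z = linkage(X, method="average", metric="cosine")
    best_labels = np.zeros(len(X), dtype=int)
    best_score = -1.0
    # Try k = 2..max_groups and keep the labeling with the best silhouette
    for k in range(2, min(max_groups, len(X) - 1) + 1):
        labels = fcluster(Z, t=k, criterion="maxclust") - 1
        if len(np.unique(labels)) < 2:
            continue
        score = silhouette_score(X, labels, metric="cosine")
        if score > best_score:
            best_labels, best_score = labels, score
    # Mark clusters below the minimum size as noise (-1), matching the
    # lbl >= 0 filtering done in process_video_job
    out = best_labels.copy()
    for c in np.unique(out):
        if np.sum(out == c) < min_cluster_size:
            out[out == c] = -1
    return out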
svision_client.py CHANGED

@@ -121,3 +121,70 @@ def extract_descripcion_escena(imagen_path: str) -> str:
         api_name="/describe_images"
     )
     return result
+
+
+def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
+    """
+    Call the /face_image_embedding_casting endpoint to detect faces and get embeddings.
+
+    This replaces local DeepFace/face_recognition processing by delegating to the svision Space.
+
+    Parameters
+    ----------
+    image_path : str
+        Path to the input image file (a video frame).
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        List of dicts with 'embedding' (list of floats) and 'face_crop' (image path).
+        Returns an empty list if no faces are detected or on error.
+    """
+    try:
+        # Returns: (face_crops: list of images, face_embeddings: list of dicts)
+        result = _get_svision_client().predict(
+            image=handle_file(image_path),
+            api_name="/face_image_embedding_casting"
+        )
+        # result is a tuple: (list of image paths, list of embedding dicts)
+        if result and len(result) >= 2:
+            face_crops = result[0] if result[0] else []
+            face_embeddings = result[1] if result[1] else []
+            # Combine into unified structure
+            faces = []
+            for i, emb_dict in enumerate(face_embeddings):
+                faces.append({
+                    "embedding": emb_dict.get("embedding", []),
+                    "face_crop_path": face_crops[i] if i < len(face_crops) else None,
+                    "index": emb_dict.get("index", i),
+                })
+            return faces
+        return []
+    except Exception as e:
+        print(f"[svision_client] get_face_embeddings_from_image error: {e}")
+        return []
+
+
+def get_face_embeddings_simple(image_path: str) -> List[List[float]]:
+    """
+    Call the /face_image_embedding endpoint to get face embeddings only.
+
+    Parameters
+    ----------
+    image_path : str
+        Path to the input image file.
+
+    Returns
+    -------
+    List[List[float]]
+        List of embedding vectors (one per detected face).
+    """
+    try:
+        result = _get_svision_client().predict(
+            image=handle_file(image_path),
+            api_name="/face_image_embedding"
+        )
+        return result if result else []
+    except Exception as e:
+        print(f"[svision_client] get_face_embeddings_simple error: {e}")
+        return []
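A quick smoke test for the new svision_client helpers against a single saved frame (the frame path is hypothetical, and the svision Space behind _get_svision_client() must be reachable):

import numpy as np
import svision_client

faces = svision_client.get_face_embeddings_from_image("/tmp/veureu/frames_temp/frame_000000.jpg")
for face in faces:
    emb = np.array(face["embedding"], dtype=float)
    emb = emb / (np.linalg.norm(emb) + 1e-9)  # same normalization as process_video_job
    print(face["index"], face["face_crop_path"], emb.shape)

# Or embeddings only, without crops:
vectors = svision_client.get_face_embeddings_simple("/tmp/veureu/frames_temp/frame_000000.jpg")
print(f"{len(vectors)} faces detected")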