Spaces:

VeuReu
/

engine

Running

App Files Community

VeuReu committed 23 days ago

Commit

19f6f25

verified ·

1 Parent(s): 061959a

Upload 2 files

Browse files

Files changed (2) hide show

preprocessing_router.py +159 -9
svision_client.py +3 -0

preprocessing_router.py CHANGED Viewed

@@ -378,19 +378,77 @@ async def detect_scenes(
     scene_sensitivity: float = Form(default=0.5),
     frame_interval_sec: float = Form(default=0.5),
 ):
-    import cv2
-    import numpy as np
     video_name = Path(video.filename).stem
     dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
     with dst_video.open("wb") as f:
         shutil.copyfileobj(video.file, f)
-    # Aquí reutilizarías tu lógica existente de detect_scenes desde api.py,
-    # pero la omitimos por brevedad dentro de este contexto de refactor.
-    # Mantén la implementación actual que ya tienes en engine/api.py.
-    return {"scene_clusters": []}
 def process_video_job(job_id: str):
@@ -631,9 +689,97 @@ def process_video_job(job_id: str):
             print(f"[{job_id}] ✓ Total: {len(characters)} personajes")
             job["results"] = {
                 "characters": characters,
                 "face_labels": labels,
                 "video_name": video_name,
                 "base_dir": str(base),
             }
@@ -644,7 +790,11 @@ def process_video_job(job_id: str):
             print(f"[{job_id}] Error en procesamiento: {proc_error}")
             import traceback
             traceback.print_exc()
-            job["results"] = {"characters": [], "face_labels": [], "video_name": video_name, "base_dir": str(base)}
             job["status"] = JobStatus.DONE
     except Exception as e:

     scene_sensitivity: float = Form(default=0.5),
     frame_interval_sec: float = Form(default=0.5),
 ):
+    """Extract scenes from video using svision Space."""
     video_name = Path(video.filename).stem
     dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
     with dst_video.open("wb") as f:
         shutil.copyfileobj(video.file, f)
+    try:
+        print(f"[detect_scenes] Extrayendo escenas de {video_name}...")
+        # Call svision to extract scenes
+        result = svision_client.extract_scenes(str(dst_video), threshold=scene_sensitivity)
+        # result contains scene keyframes
+        scenes_raw = result if isinstance(result, list) else []
+        print(f"[detect_scenes] svision devolvió {len(scenes_raw)} escenas")
+        # Create scene clusters directory
+        base = TEMP_ROOT / video_name
+        scenes_dir = base / "scenes"
+        scenes_dir.mkdir(parents=True, exist_ok=True)
+        scene_clusters = []
+        for i, scene_data in enumerate(scenes_raw):
+            scene_id = f"scene_{i:02d}"
+            scene_out_dir = scenes_dir / scene_id
+            scene_out_dir.mkdir(parents=True, exist_ok=True)
+            # Extract keyframe path from scene data
+            keyframe_path = None
+            if isinstance(scene_data, str):
+                keyframe_path = scene_data
+            elif isinstance(scene_data, dict):
+                keyframe_path = scene_data.get("path") or scene_data.get("keyframe") or scene_data.get("image")
+            # Download or copy keyframe
+            local_keyframe = scene_out_dir / "keyframe.jpg"
+            keyframe_saved = False
+            if keyframe_path:
+                try:
+                    if isinstance(keyframe_path, str) and keyframe_path.startswith("http"):
+                        import requests
+                        resp = requests.get(keyframe_path, timeout=30)
+                        if resp.status_code == 200:
+                            with open(local_keyframe, "wb") as f:
+                                f.write(resp.content)
+                            keyframe_saved = True
+                    elif isinstance(keyframe_path, str) and os.path.exists(keyframe_path):
+                        shutil.copy2(keyframe_path, local_keyframe)
+                        keyframe_saved = True
+                except Exception as dl_err:
+                    print(f"[detect_scenes] Error guardando keyframe {i}: {dl_err}")
+            if keyframe_saved:
+                scene_clusters.append({
+                    "id": scene_id,
+                    "name": f"Escena {i+1}",
+                    "folder": str(scene_out_dir),
+                    "image_url": f"/files_scene/{video_name}/{scene_id}/keyframe.jpg",
+                    "start_time": scene_data.get("start", 0) if isinstance(scene_data, dict) else 0,
+                    "end_time": scene_data.get("end", 0) if isinstance(scene_data, dict) else 0,
+                })
+        print(f"[detect_scenes] ✓ {len(scene_clusters)} escenas procesadas")
+        return {"scene_clusters": scene_clusters}
+    except Exception as e:
+        print(f"[detect_scenes] Error: {e}")
+        import traceback
+        traceback.print_exc()
+        return {"scene_clusters": [], "error": str(e)}
 def process_video_job(job_id: str):
             print(f"[{job_id}] ✓ Total: {len(characters)} personajes")
+            # ============================================================
+            # STEP 5: Audio diarization + voice embeddings using ASR space
+            # ============================================================
+            voice_max_groups = int(job.get("voice_max_groups", 3))
+            voice_min_cluster_size = int(job.get("voice_min_cluster_size", 3))
+            voice_sensitivity = float(job.get("voice_sensitivity", 0.5))
+            audio_segments: List[Dict[str, Any]] = []
+            voice_labels: List[int] = []
+            voice_embeddings: List[List[float]] = []
+            diarization_info: Dict[str, Any] = {}
+            print(f"[{job_id}] Procesando audio con ASR space...")
+            try:
+                # Extract audio and diarize
+                diar_result = asr_client.extract_audio_and_diarize(video_path)
+                clips = diar_result.get("clips", [])
+                segments = diar_result.get("segments", [])
+                print(f"[{job_id}] Diarización: {len(clips)} clips, {len(segments)} segmentos")
+                # Save clips locally
+                clips_dir = base / "clips"
+                clips_dir.mkdir(parents=True, exist_ok=True)
+                for i, clip_info in enumerate(clips if isinstance(clips, list) else []):
+                    clip_path = clip_info if isinstance(clip_info, str) else clip_info.get("path") if isinstance(clip_info, dict) else None
+                    if not clip_path:
+                        continue
+                    # Download or copy clip
+                    local_clip = clips_dir / f"segment_{i:03d}.wav"
+                    try:
+                        if isinstance(clip_path, str) and clip_path.startswith("http"):
+                            import requests
+                            resp = requests.get(clip_path, timeout=30)
+                            if resp.status_code == 200:
+                                with open(local_clip, "wb") as f:
+                                    f.write(resp.content)
+                        elif isinstance(clip_path, str) and os.path.exists(clip_path):
+                            shutil.copy2(clip_path, local_clip)
+                    except Exception as dl_err:
+                        print(f"[{job_id}] Error guardando clip {i}: {dl_err}")
+                        continue
+                    # Get segment info
+                    seg_info = segments[i] if i < len(segments) else {}
+                    speaker = seg_info.get("speaker", f"SPEAKER_{i:02d}")
+                    # Get voice embedding for this clip
+                    emb = asr_client.get_voice_embedding(str(local_clip))
+                    if emb:
+                        voice_embeddings.append(emb)
+                    audio_segments.append({
+                        "index": i,
+                        "clip_path": str(local_clip),
+                        "clip_url": f"/audio/{video_name}/segment_{i:03d}.wav",
+                        "speaker": speaker,
+                        "start": seg_info.get("start", 0),
+                        "end": seg_info.get("end", 0),
+                    })
+                print(f"[{job_id}] ✓ {len(audio_segments)} segmentos de audio procesados")
+                # Cluster voice embeddings
+                if voice_embeddings:
+                    print(f"[{job_id}] Clustering jerárquico de voz...")
+                    Xv = np.array(voice_embeddings)
+                    voice_labels = hierarchical_cluster_with_min_size(
+                        Xv, voice_max_groups, voice_min_cluster_size, voice_sensitivity
+                    ).tolist()
+                    n_voice_clusters = len(set([l for l in voice_labels if l >= 0]))
+                    print(f"[{job_id}] ✓ Clustering de voz: {n_voice_clusters} clusters")
+                diarization_info = {
+                    "num_segments": len(audio_segments),
+                    "num_voice_clusters": len(set([l for l in voice_labels if l >= 0])) if voice_labels else 0,
+                }
+            except Exception as audio_err:
+                print(f"[{job_id}] Error en procesamiento de audio: {audio_err}")
+                import traceback
+                traceback.print_exc()
             job["results"] = {
                 "characters": characters,
                 "face_labels": labels,
+                "audio_segments": audio_segments,
+                "voice_labels": voice_labels,
+                "diarization_info": diarization_info,
                 "video_name": video_name,
                 "base_dir": str(base),
             }
             print(f"[{job_id}] Error en procesamiento: {proc_error}")
             import traceback
             traceback.print_exc()
+            job["results"] = {
+                "characters": [], "face_labels": [],
+                "audio_segments": [], "voice_labels": [], "diarization_info": {},
+                "video_name": video_name, "base_dir": str(base)
+            }
             job["status"] = JobStatus.DONE
     except Exception as e:

svision_client.py CHANGED Viewed

@@ -184,6 +184,9 @@ def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
                 })
             print(f"[svision_client] Detected {len(faces)} faces from image")
             return faces
         return []
     except Exception as e:

                 })
             print(f"[svision_client] Detected {len(faces)} faces from image")
+            for i, f in enumerate(faces):
+                crop_path = f.get("face_crop_path")
+                print(f"[svision_client]   Face {i}: crop_path={crop_path[:80] if crop_path else 'None'}...")
             return faces
         return []
     except Exception as e: