VeuReu committed on
Commit
19f6f25
·
verified ·
1 Parent(s): 061959a

Upload 2 files

Browse files
Files changed (2) hide show
  1. preprocessing_router.py +159 -9
  2. svision_client.py +3 -0
preprocessing_router.py CHANGED
@@ -378,19 +378,77 @@ async def detect_scenes(
378
  scene_sensitivity: float = Form(default=0.5),
379
  frame_interval_sec: float = Form(default=0.5),
380
  ):
381
- import cv2
382
- import numpy as np
383
-
384
  video_name = Path(video.filename).stem
385
  dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
386
  with dst_video.open("wb") as f:
387
  shutil.copyfileobj(video.file, f)
388
 
389
- # Aquí reutilizarías tu lógica existente de detect_scenes desde api.py,
390
- # pero la omitimos por brevedad dentro de este contexto de refactor.
391
- # Mantén la implementación actual que ya tienes en engine/api.py.
392
-
393
- return {"scene_clusters": []}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
 
396
  def process_video_job(job_id: str):
@@ -631,9 +689,97 @@ def process_video_job(job_id: str):
631
 
632
  print(f"[{job_id}] ✓ Total: {len(characters)} personajes")
633
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634
  job["results"] = {
635
  "characters": characters,
636
  "face_labels": labels,
 
 
 
637
  "video_name": video_name,
638
  "base_dir": str(base),
639
  }
@@ -644,7 +790,11 @@ def process_video_job(job_id: str):
644
  print(f"[{job_id}] Error en procesamiento: {proc_error}")
645
  import traceback
646
  traceback.print_exc()
647
- job["results"] = {"characters": [], "face_labels": [], "video_name": video_name, "base_dir": str(base)}
 
 
 
 
648
  job["status"] = JobStatus.DONE
649
 
650
  except Exception as e:
 
378
  scene_sensitivity: float = Form(default=0.5),
379
  frame_interval_sec: float = Form(default=0.5),
380
  ):
381
+ """Extract scenes from video using svision Space."""
 
 
382
  video_name = Path(video.filename).stem
383
  dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
384
  with dst_video.open("wb") as f:
385
  shutil.copyfileobj(video.file, f)
386
 
387
+ try:
388
+ print(f"[detect_scenes] Extrayendo escenas de {video_name}...")
389
+
390
+ # Call svision to extract scenes
391
+ result = svision_client.extract_scenes(str(dst_video), threshold=scene_sensitivity)
392
+
393
+ # result contains scene keyframes
394
+ scenes_raw = result if isinstance(result, list) else []
395
+ print(f"[detect_scenes] svision devolvió {len(scenes_raw)} escenas")
396
+
397
+ # Create scene clusters directory
398
+ base = TEMP_ROOT / video_name
399
+ scenes_dir = base / "scenes"
400
+ scenes_dir.mkdir(parents=True, exist_ok=True)
401
+
402
+ scene_clusters = []
403
+ for i, scene_data in enumerate(scenes_raw):
404
+ scene_id = f"scene_{i:02d}"
405
+ scene_out_dir = scenes_dir / scene_id
406
+ scene_out_dir.mkdir(parents=True, exist_ok=True)
407
+
408
+ # Extract keyframe path from scene data
409
+ keyframe_path = None
410
+ if isinstance(scene_data, str):
411
+ keyframe_path = scene_data
412
+ elif isinstance(scene_data, dict):
413
+ keyframe_path = scene_data.get("path") or scene_data.get("keyframe") or scene_data.get("image")
414
+
415
+ # Download or copy keyframe
416
+ local_keyframe = scene_out_dir / "keyframe.jpg"
417
+ keyframe_saved = False
418
+
419
+ if keyframe_path:
420
+ try:
421
+ if isinstance(keyframe_path, str) and keyframe_path.startswith("http"):
422
+ import requests
423
+ resp = requests.get(keyframe_path, timeout=30)
424
+ if resp.status_code == 200:
425
+ with open(local_keyframe, "wb") as f:
426
+ f.write(resp.content)
427
+ keyframe_saved = True
428
+ elif isinstance(keyframe_path, str) and os.path.exists(keyframe_path):
429
+ shutil.copy2(keyframe_path, local_keyframe)
430
+ keyframe_saved = True
431
+ except Exception as dl_err:
432
+ print(f"[detect_scenes] Error guardando keyframe {i}: {dl_err}")
433
+
434
+ if keyframe_saved:
435
+ scene_clusters.append({
436
+ "id": scene_id,
437
+ "name": f"Escena {i+1}",
438
+ "folder": str(scene_out_dir),
439
+ "image_url": f"/files_scene/{video_name}/{scene_id}/keyframe.jpg",
440
+ "start_time": scene_data.get("start", 0) if isinstance(scene_data, dict) else 0,
441
+ "end_time": scene_data.get("end", 0) if isinstance(scene_data, dict) else 0,
442
+ })
443
+
444
+ print(f"[detect_scenes] ✓ {len(scene_clusters)} escenas procesadas")
445
+ return {"scene_clusters": scene_clusters}
446
+
447
+ except Exception as e:
448
+ print(f"[detect_scenes] Error: {e}")
449
+ import traceback
450
+ traceback.print_exc()
451
+ return {"scene_clusters": [], "error": str(e)}
452
 
453
 
454
  def process_video_job(job_id: str):
 
689
 
690
  print(f"[{job_id}] ✓ Total: {len(characters)} personajes")
691
 
692
+ # ============================================================
693
+ # STEP 5: Audio diarization + voice embeddings using ASR space
694
+ # ============================================================
695
+ voice_max_groups = int(job.get("voice_max_groups", 3))
696
+ voice_min_cluster_size = int(job.get("voice_min_cluster_size", 3))
697
+ voice_sensitivity = float(job.get("voice_sensitivity", 0.5))
698
+
699
+ audio_segments: List[Dict[str, Any]] = []
700
+ voice_labels: List[int] = []
701
+ voice_embeddings: List[List[float]] = []
702
+ diarization_info: Dict[str, Any] = {}
703
+
704
+ print(f"[{job_id}] Procesando audio con ASR space...")
705
+ try:
706
+ # Extract audio and diarize
707
+ diar_result = asr_client.extract_audio_and_diarize(video_path)
708
+ clips = diar_result.get("clips", [])
709
+ segments = diar_result.get("segments", [])
710
+
711
+ print(f"[{job_id}] Diarización: {len(clips)} clips, {len(segments)} segmentos")
712
+
713
+ # Save clips locally
714
+ clips_dir = base / "clips"
715
+ clips_dir.mkdir(parents=True, exist_ok=True)
716
+
717
+ for i, clip_info in enumerate(clips if isinstance(clips, list) else []):
718
+ clip_path = clip_info if isinstance(clip_info, str) else clip_info.get("path") if isinstance(clip_info, dict) else None
719
+ if not clip_path:
720
+ continue
721
+
722
+ # Download or copy clip
723
+ local_clip = clips_dir / f"segment_{i:03d}.wav"
724
+ try:
725
+ if isinstance(clip_path, str) and clip_path.startswith("http"):
726
+ import requests
727
+ resp = requests.get(clip_path, timeout=30)
728
+ if resp.status_code == 200:
729
+ with open(local_clip, "wb") as f:
730
+ f.write(resp.content)
731
+ elif isinstance(clip_path, str) and os.path.exists(clip_path):
732
+ shutil.copy2(clip_path, local_clip)
733
+ except Exception as dl_err:
734
+ print(f"[{job_id}] Error guardando clip {i}: {dl_err}")
735
+ continue
736
+
737
+ # Get segment info
738
+ seg_info = segments[i] if i < len(segments) else {}
739
+ speaker = seg_info.get("speaker", f"SPEAKER_{i:02d}")
740
+
741
+ # Get voice embedding for this clip
742
+ emb = asr_client.get_voice_embedding(str(local_clip))
743
+ if emb:
744
+ voice_embeddings.append(emb)
745
+
746
+ audio_segments.append({
747
+ "index": i,
748
+ "clip_path": str(local_clip),
749
+ "clip_url": f"/audio/{video_name}/segment_{i:03d}.wav",
750
+ "speaker": speaker,
751
+ "start": seg_info.get("start", 0),
752
+ "end": seg_info.get("end", 0),
753
+ })
754
+
755
+ print(f"[{job_id}] ✓ {len(audio_segments)} segmentos de audio procesados")
756
+
757
+ # Cluster voice embeddings
758
+ if voice_embeddings:
759
+ print(f"[{job_id}] Clustering jerárquico de voz...")
760
+ Xv = np.array(voice_embeddings)
761
+ voice_labels = hierarchical_cluster_with_min_size(
762
+ Xv, voice_max_groups, voice_min_cluster_size, voice_sensitivity
763
+ ).tolist()
764
+ n_voice_clusters = len(set([l for l in voice_labels if l >= 0]))
765
+ print(f"[{job_id}] ✓ Clustering de voz: {n_voice_clusters} clusters")
766
+
767
+ diarization_info = {
768
+ "num_segments": len(audio_segments),
769
+ "num_voice_clusters": len(set([l for l in voice_labels if l >= 0])) if voice_labels else 0,
770
+ }
771
+
772
+ except Exception as audio_err:
773
+ print(f"[{job_id}] Error en procesamiento de audio: {audio_err}")
774
+ import traceback
775
+ traceback.print_exc()
776
+
777
  job["results"] = {
778
  "characters": characters,
779
  "face_labels": labels,
780
+ "audio_segments": audio_segments,
781
+ "voice_labels": voice_labels,
782
+ "diarization_info": diarization_info,
783
  "video_name": video_name,
784
  "base_dir": str(base),
785
  }
 
790
  print(f"[{job_id}] Error en procesamiento: {proc_error}")
791
  import traceback
792
  traceback.print_exc()
793
+ job["results"] = {
794
+ "characters": [], "face_labels": [],
795
+ "audio_segments": [], "voice_labels": [], "diarization_info": {},
796
+ "video_name": video_name, "base_dir": str(base)
797
+ }
798
  job["status"] = JobStatus.DONE
799
 
800
  except Exception as e:
svision_client.py CHANGED
@@ -184,6 +184,9 @@ def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
184
  })
185
 
186
  print(f"[svision_client] Detected {len(faces)} faces from image")
 
 
 
187
  return faces
188
  return []
189
  except Exception as e:
 
184
  })
185
 
186
  print(f"[svision_client] Detected {len(faces)} faces from image")
187
+ for i, f in enumerate(faces):
188
+ crop_path = f.get("face_crop_path")
189
+ print(f"[svision_client] Face {i}: crop_path={crop_path[:80] if crop_path else 'None'}...")
190
  return faces
191
  return []
192
  except Exception as e: