Upload 2 files
Browse files
- preprocessing_router.py +159 -9
- svision_client.py +3 -0
preprocessing_router.py
CHANGED
|
@@ -378,19 +378,77 @@ async def detect_scenes(
|
|
| 378 |
scene_sensitivity: float = Form(default=0.5),
|
| 379 |
frame_interval_sec: float = Form(default=0.5),
|
| 380 |
):
|
| 381 |
-
|
| 382 |
-
import numpy as np
|
| 383 |
-
|
| 384 |
video_name = Path(video.filename).stem
|
| 385 |
dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
|
| 386 |
with dst_video.open("wb") as f:
|
| 387 |
shutil.copyfileobj(video.file, f)
|
| 388 |
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
|
| 395 |
|
| 396 |
def process_video_job(job_id: str):
|
|
@@ -631,9 +689,97 @@ def process_video_job(job_id: str):
|
|
| 631 |
|
| 632 |
print(f"[{job_id}] ✓ Total: {len(characters)} personajes")
|
| 633 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
job["results"] = {
|
| 635 |
"characters": characters,
|
| 636 |
"face_labels": labels,
|
|
|
|
|
|
|
|
|
|
| 637 |
"video_name": video_name,
|
| 638 |
"base_dir": str(base),
|
| 639 |
}
|
|
@@ -644,7 +790,11 @@ def process_video_job(job_id: str):
|
|
| 644 |
print(f"[{job_id}] Error en procesamiento: {proc_error}")
|
| 645 |
import traceback
|
| 646 |
traceback.print_exc()
|
| 647 |
-
job["results"] = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 648 |
job["status"] = JobStatus.DONE
|
| 649 |
|
| 650 |
except Exception as e:
|
|
|
|
| 378 |
scene_sensitivity: float = Form(default=0.5),
|
| 379 |
frame_interval_sec: float = Form(default=0.5),
|
| 380 |
):
|
| 381 |
+
"""Extract scenes from video using svision Space."""
|
|
|
|
|
|
|
| 382 |
video_name = Path(video.filename).stem
|
| 383 |
dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
|
| 384 |
with dst_video.open("wb") as f:
|
| 385 |
shutil.copyfileobj(video.file, f)
|
| 386 |
|
| 387 |
+
try:
|
| 388 |
+
print(f"[detect_scenes] Extrayendo escenas de {video_name}...")
|
| 389 |
+
|
| 390 |
+
# Call svision to extract scenes
|
| 391 |
+
result = svision_client.extract_scenes(str(dst_video), threshold=scene_sensitivity)
|
| 392 |
+
|
| 393 |
+
# result contains scene keyframes
|
| 394 |
+
scenes_raw = result if isinstance(result, list) else []
|
| 395 |
+
print(f"[detect_scenes] svision devolvió {len(scenes_raw)} escenas")
|
| 396 |
+
|
| 397 |
+
# Create scene clusters directory
|
| 398 |
+
base = TEMP_ROOT / video_name
|
| 399 |
+
scenes_dir = base / "scenes"
|
| 400 |
+
scenes_dir.mkdir(parents=True, exist_ok=True)
|
| 401 |
+
|
| 402 |
+
scene_clusters = []
|
| 403 |
+
for i, scene_data in enumerate(scenes_raw):
|
| 404 |
+
scene_id = f"scene_{i:02d}"
|
| 405 |
+
scene_out_dir = scenes_dir / scene_id
|
| 406 |
+
scene_out_dir.mkdir(parents=True, exist_ok=True)
|
| 407 |
+
|
| 408 |
+
# Extract keyframe path from scene data
|
| 409 |
+
keyframe_path = None
|
| 410 |
+
if isinstance(scene_data, str):
|
| 411 |
+
keyframe_path = scene_data
|
| 412 |
+
elif isinstance(scene_data, dict):
|
| 413 |
+
keyframe_path = scene_data.get("path") or scene_data.get("keyframe") or scene_data.get("image")
|
| 414 |
+
|
| 415 |
+
# Download or copy keyframe
|
| 416 |
+
local_keyframe = scene_out_dir / "keyframe.jpg"
|
| 417 |
+
keyframe_saved = False
|
| 418 |
+
|
| 419 |
+
if keyframe_path:
|
| 420 |
+
try:
|
| 421 |
+
if isinstance(keyframe_path, str) and keyframe_path.startswith("http"):
|
| 422 |
+
import requests
|
| 423 |
+
resp = requests.get(keyframe_path, timeout=30)
|
| 424 |
+
if resp.status_code == 200:
|
| 425 |
+
with open(local_keyframe, "wb") as f:
|
| 426 |
+
f.write(resp.content)
|
| 427 |
+
keyframe_saved = True
|
| 428 |
+
elif isinstance(keyframe_path, str) and os.path.exists(keyframe_path):
|
| 429 |
+
shutil.copy2(keyframe_path, local_keyframe)
|
| 430 |
+
keyframe_saved = True
|
| 431 |
+
except Exception as dl_err:
|
| 432 |
+
print(f"[detect_scenes] Error guardando keyframe {i}: {dl_err}")
|
| 433 |
+
|
| 434 |
+
if keyframe_saved:
|
| 435 |
+
scene_clusters.append({
|
| 436 |
+
"id": scene_id,
|
| 437 |
+
"name": f"Escena {i+1}",
|
| 438 |
+
"folder": str(scene_out_dir),
|
| 439 |
+
"image_url": f"/files_scene/{video_name}/{scene_id}/keyframe.jpg",
|
| 440 |
+
"start_time": scene_data.get("start", 0) if isinstance(scene_data, dict) else 0,
|
| 441 |
+
"end_time": scene_data.get("end", 0) if isinstance(scene_data, dict) else 0,
|
| 442 |
+
})
|
| 443 |
+
|
| 444 |
+
print(f"[detect_scenes] ✓ {len(scene_clusters)} escenas procesadas")
|
| 445 |
+
return {"scene_clusters": scene_clusters}
|
| 446 |
+
|
| 447 |
+
except Exception as e:
|
| 448 |
+
print(f"[detect_scenes] Error: {e}")
|
| 449 |
+
import traceback
|
| 450 |
+
traceback.print_exc()
|
| 451 |
+
return {"scene_clusters": [], "error": str(e)}
|
| 452 |
|
| 453 |
|
| 454 |
def process_video_job(job_id: str):
|
|
|
|
| 689 |
|
| 690 |
print(f"[{job_id}] ✓ Total: {len(characters)} personajes")
|
| 691 |
|
| 692 |
+
# ============================================================
|
| 693 |
+
# STEP 5: Audio diarization + voice embeddings using ASR space
|
| 694 |
+
# ============================================================
|
| 695 |
+
voice_max_groups = int(job.get("voice_max_groups", 3))
|
| 696 |
+
voice_min_cluster_size = int(job.get("voice_min_cluster_size", 3))
|
| 697 |
+
voice_sensitivity = float(job.get("voice_sensitivity", 0.5))
|
| 698 |
+
|
| 699 |
+
audio_segments: List[Dict[str, Any]] = []
|
| 700 |
+
voice_labels: List[int] = []
|
| 701 |
+
voice_embeddings: List[List[float]] = []
|
| 702 |
+
diarization_info: Dict[str, Any] = {}
|
| 703 |
+
|
| 704 |
+
print(f"[{job_id}] Procesando audio con ASR space...")
|
| 705 |
+
try:
|
| 706 |
+
# Extract audio and diarize
|
| 707 |
+
diar_result = asr_client.extract_audio_and_diarize(video_path)
|
| 708 |
+
clips = diar_result.get("clips", [])
|
| 709 |
+
segments = diar_result.get("segments", [])
|
| 710 |
+
|
| 711 |
+
print(f"[{job_id}] Diarización: {len(clips)} clips, {len(segments)} segmentos")
|
| 712 |
+
|
| 713 |
+
# Save clips locally
|
| 714 |
+
clips_dir = base / "clips"
|
| 715 |
+
clips_dir.mkdir(parents=True, exist_ok=True)
|
| 716 |
+
|
| 717 |
+
for i, clip_info in enumerate(clips if isinstance(clips, list) else []):
|
| 718 |
+
clip_path = clip_info if isinstance(clip_info, str) else clip_info.get("path") if isinstance(clip_info, dict) else None
|
| 719 |
+
if not clip_path:
|
| 720 |
+
continue
|
| 721 |
+
|
| 722 |
+
# Download or copy clip
|
| 723 |
+
local_clip = clips_dir / f"segment_{i:03d}.wav"
|
| 724 |
+
try:
|
| 725 |
+
if isinstance(clip_path, str) and clip_path.startswith("http"):
|
| 726 |
+
import requests
|
| 727 |
+
resp = requests.get(clip_path, timeout=30)
|
| 728 |
+
if resp.status_code == 200:
|
| 729 |
+
with open(local_clip, "wb") as f:
|
| 730 |
+
f.write(resp.content)
|
| 731 |
+
elif isinstance(clip_path, str) and os.path.exists(clip_path):
|
| 732 |
+
shutil.copy2(clip_path, local_clip)
|
| 733 |
+
except Exception as dl_err:
|
| 734 |
+
print(f"[{job_id}] Error guardando clip {i}: {dl_err}")
|
| 735 |
+
continue
|
| 736 |
+
|
| 737 |
+
# Get segment info
|
| 738 |
+
seg_info = segments[i] if i < len(segments) else {}
|
| 739 |
+
speaker = seg_info.get("speaker", f"SPEAKER_{i:02d}")
|
| 740 |
+
|
| 741 |
+
# Get voice embedding for this clip
|
| 742 |
+
emb = asr_client.get_voice_embedding(str(local_clip))
|
| 743 |
+
if emb:
|
| 744 |
+
voice_embeddings.append(emb)
|
| 745 |
+
|
| 746 |
+
audio_segments.append({
|
| 747 |
+
"index": i,
|
| 748 |
+
"clip_path": str(local_clip),
|
| 749 |
+
"clip_url": f"/audio/{video_name}/segment_{i:03d}.wav",
|
| 750 |
+
"speaker": speaker,
|
| 751 |
+
"start": seg_info.get("start", 0),
|
| 752 |
+
"end": seg_info.get("end", 0),
|
| 753 |
+
})
|
| 754 |
+
|
| 755 |
+
print(f"[{job_id}] ✓ {len(audio_segments)} segmentos de audio procesados")
|
| 756 |
+
|
| 757 |
+
# Cluster voice embeddings
|
| 758 |
+
if voice_embeddings:
|
| 759 |
+
print(f"[{job_id}] Clustering jerárquico de voz...")
|
| 760 |
+
Xv = np.array(voice_embeddings)
|
| 761 |
+
voice_labels = hierarchical_cluster_with_min_size(
|
| 762 |
+
Xv, voice_max_groups, voice_min_cluster_size, voice_sensitivity
|
| 763 |
+
).tolist()
|
| 764 |
+
n_voice_clusters = len(set([l for l in voice_labels if l >= 0]))
|
| 765 |
+
print(f"[{job_id}] ✓ Clustering de voz: {n_voice_clusters} clusters")
|
| 766 |
+
|
| 767 |
+
diarization_info = {
|
| 768 |
+
"num_segments": len(audio_segments),
|
| 769 |
+
"num_voice_clusters": len(set([l for l in voice_labels if l >= 0])) if voice_labels else 0,
|
| 770 |
+
}
|
| 771 |
+
|
| 772 |
+
except Exception as audio_err:
|
| 773 |
+
print(f"[{job_id}] Error en procesamiento de audio: {audio_err}")
|
| 774 |
+
import traceback
|
| 775 |
+
traceback.print_exc()
|
| 776 |
+
|
| 777 |
job["results"] = {
|
| 778 |
"characters": characters,
|
| 779 |
"face_labels": labels,
|
| 780 |
+
"audio_segments": audio_segments,
|
| 781 |
+
"voice_labels": voice_labels,
|
| 782 |
+
"diarization_info": diarization_info,
|
| 783 |
"video_name": video_name,
|
| 784 |
"base_dir": str(base),
|
| 785 |
}
|
|
|
|
| 790 |
print(f"[{job_id}] Error en procesamiento: {proc_error}")
|
| 791 |
import traceback
|
| 792 |
traceback.print_exc()
|
| 793 |
+
job["results"] = {
|
| 794 |
+
"characters": [], "face_labels": [],
|
| 795 |
+
"audio_segments": [], "voice_labels": [], "diarization_info": {},
|
| 796 |
+
"video_name": video_name, "base_dir": str(base)
|
| 797 |
+
}
|
| 798 |
job["status"] = JobStatus.DONE
|
| 799 |
|
| 800 |
except Exception as e:
|
svision_client.py
CHANGED
|
@@ -184,6 +184,9 @@ def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
|
|
| 184 |
})
|
| 185 |
|
| 186 |
print(f"[svision_client] Detected {len(faces)} faces from image")
|
|
|
|
|
|
|
|
|
|
| 187 |
return faces
|
| 188 |
return []
|
| 189 |
except Exception as e:
|
|
|
|
| 184 |
})
|
| 185 |
|
| 186 |
print(f"[svision_client] Detected {len(faces)} faces from image")
|
| 187 |
+
for i, f in enumerate(faces):
|
| 188 |
+
crop_path = f.get("face_crop_path")
|
| 189 |
+
print(f"[svision_client] Face {i}: crop_path={crop_path[:80] if crop_path else 'None'}...")
|
| 190 |
return faces
|
| 191 |
return []
|
| 192 |
except Exception as e:
|