Upload 2 files

- preprocessing_router.py +194 -73
- svision_client.py +37 -9

preprocessing_router.py
CHANGED
@@ -46,46 +46,73 @@ jobs: Dict[str, dict] = {}
 # ---------------------------------------------------------------------------
 
 def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
-    """Hierarchical clustering
+    """Hierarchical clustering using only min_cluster_size and k-target (max_groups).
+
+    - First try to create the maximum possible number of clusters with at
+      least ``min_cluster_size`` elements.
+    - Then merge implicitly (by lowering the number of clusters) until the
+      number of valid clusters (size >= min_cluster_size) is less than or
+      equal to ``max_groups``.
+
+    ``sensitivity`` is kept in the signature for compatibility, but it is unused.
+    """
     from scipy.cluster.hierarchy import linkage, fcluster
-    from sklearn.metrics import silhouette_score
     from collections import Counter
 
-    if len(X) == 0:
+    n_samples = len(X)
+    if n_samples == 0:
         return np.array([])
-    if len(X) < min_cluster_size:
-        return np.full(len(X), -1, dtype=int)
-
-    Z = linkage(X, method='average', metric='cosine')
-    best_n_clusters = 2
-    best_score = -1
-    max_to_try = min(max_groups, len(X) - 1)
-
-    if max_to_try >= 2:
-        for n_clusters in range(2, max_to_try + 1):
-            trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
-            trial_counts = Counter(trial_labels)
-            valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
-            if valid_clusters >= 2:
-                try:
-                    score = silhouette_score(X, trial_labels, metric='cosine')
-                    penalty = 0.14 - (sensitivity * 0.13)
-                    adjusted_score = score - (n_clusters * penalty)
-                    if adjusted_score > best_score:
-                        best_score = adjusted_score
-                        best_n_clusters = n_clusters
-                except Exception:
-                    pass
-
-    …
+
+    # If there are not enough samples to form even one valid cluster,
+    # mark everything as noise (-1).
+    if n_samples < min_cluster_size:
+        return np.full(n_samples, -1, dtype=int)
+
+    # k_target = max_groups (we interpret this parameter as the k-target)
+    k_target = max(0, int(max_groups))
+
+    # Special case: k_target == 0 => no clusters wanted, everything is noise.
+    if k_target == 0:
+        return np.full(n_samples, -1, dtype=int)
+
+    # Hierarchical linkage, computed once
+    Z = linkage(X, method="average", metric="cosine")
+
+    # Maximum number of clusters possible while respecting min_cluster_size
+    max_possible = n_samples // min_cluster_size
+    if max_possible <= 0:
+        return np.full(n_samples, -1, dtype=int)
+
+    max_to_try = min(max_possible, n_samples)
+
+    best_labels = np.full(n_samples, -1, dtype=int)
+
+    # Walk from more clusters to fewer, looking for the first solution
+    # that has between 1 and k_target valid clusters.
+    for n_clusters in range(max_to_try, 0, -1):
+        trial_labels = fcluster(Z, t=n_clusters, criterion="maxclust") - 1
+        counts = Counter(trial_labels)
+
+        # Clusters with a sufficient size
+        valid_clusters = {lbl for lbl, cnt in counts.items() if cnt >= min_cluster_size}
+        num_valid = len(valid_clusters)
+
+        if num_valid == 0:
+            # Too fine-grained: every cluster is too small
+            continue
+
+        if num_valid <= k_target:
+            # Accept this solution
+            final_labels = []
+            for lbl in trial_labels:
+                if lbl in valid_clusters:
+                    final_labels.append(lbl)
+                else:
+                    final_labels.append(-1)
+            best_labels = np.array(final_labels, dtype=int)
+            break
+
+    return best_labels
 
 
 router = APIRouter(tags=["Preprocessing Manager"])
@@ -378,48 +405,63 @@ async def detect_scenes(
     scene_sensitivity: float = Form(default=0.5),
     frame_interval_sec: float = Form(default=0.5),
 ):
-    """Extract
+    """Extract keyframes from video using svision Space (1 per second)."""
+    import requests
+
     video_name = Path(video.filename).stem
     dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
     with dst_video.open("wb") as f:
         shutil.copyfileobj(video.file, f)
 
     try:
-        …
+        import cv2
+        import numpy as np
+
+        print(f"[detect_scenes] Extracting keyframes from {video_name}...")
 
-        # Call svision to extract
-        result = svision_client.
+        # Call svision to extract keyframes (1 per second)
+        result = svision_client.keyframes_every_second_extraction(str(dst_video))
 
-
-        scenes_raw = result if isinstance(result, list) else []
-        print(f"[detect_scenes] svision returned {len(scenes_raw)} scenes")
+        print(f"[detect_scenes] Raw result type: {type(result)}, len: {len(result) if result else 0}")
 
-        #
+        # result is a tuple: (images, frames_info)
+        images_raw = []
+        frames_info = []
+        if result and len(result) >= 2:
+            images_raw = result[0] if result[0] else []
+            frames_info = result[1] if result[1] else []
+
+        n_keyframes = len(images_raw)
+        print(f"[detect_scenes] svision returned {n_keyframes} keyframes")
+
+        # Create base directory for scenes
         base = TEMP_ROOT / video_name
        scenes_dir = base / "scenes"
         scenes_dir.mkdir(parents=True, exist_ok=True)
-        …
-                keyframe_path = scene_data
-            elif isinstance(scene_data, dict):
-                keyframe_path = scene_data.get("path") or scene_data.get("keyframe") or scene_data.get("image")
-
-            # Download or copy keyframe
-            local_keyframe = scene_out_dir / "keyframe.jpg"
+
+        # ------------------------------------------------------------------
+        # STEP 1: Save every keyframe and build simple embeddings
+        # ------------------------------------------------------------------
+        keyframe_paths: List[Path] = []
+        keyframe_infos: List[dict] = []
+        features: List[np.ndarray] = []
+
+        for i, img_data in enumerate(images_raw):
+            local_keyframe = scenes_dir / f"keyframe_{i:03d}.jpg"
             keyframe_saved = False
-            …
+
+            # Extract path from Gradio file object
+            keyframe_path = None
+            if isinstance(img_data, str):
+                keyframe_path = img_data
+            elif isinstance(img_data, dict):
+                keyframe_path = img_data.get("path") or img_data.get("url") or img_data.get("name")
+            elif hasattr(img_data, "name"):
+                keyframe_path = img_data.name
+
             if keyframe_path:
                 try:
                     if isinstance(keyframe_path, str) and keyframe_path.startswith("http"):
-                        import requests
                         resp = requests.get(keyframe_path, timeout=30)
                         if resp.status_code == 200:
                             with open(local_keyframe, "wb") as f:
@@ -430,18 +472,97 @@ async def detect_scenes(
                     keyframe_saved = True
                 except Exception as dl_err:
                     print(f"[detect_scenes] Error saving keyframe {i}: {dl_err}")
-
-            if keyframe_saved:
-                …
+
+            if not keyframe_saved:
+                continue
+
+            # Load the image and build a simple color histogram as embedding
+            try:
+                img = cv2.imread(str(local_keyframe))
+                if img is None:
+                    continue
+                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+                # 8x8x8 RGB histogram, normalized
+                hist = cv2.calcHist([img_rgb], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
+                hist = cv2.normalize(hist, hist).flatten()
+                features.append(hist.astype("float32"))
+            except Exception as fe_err:
+                print(f"[detect_scenes] Error computing embedding for keyframe {i}: {fe_err}")
+                continue
+
+            keyframe_paths.append(local_keyframe)
+            info = frames_info[i] if i < len(frames_info) else {}
+            keyframe_infos.append(info if isinstance(info, dict) else {})
+
+        if not features or len(features) < min_cluster_size:
+            print("[detect_scenes] Not enough valid keyframes to cluster scenes")
+            return {"scene_clusters": []}
+
+        Xs = np.vstack(features)
+
+        # ------------------------------------------------------------------
+        # STEP 2: Hierarchical scene clustering (k-target + minimum size)
+        # ------------------------------------------------------------------
+        print("[detect_scenes] Hierarchical scene clustering...")
+        scene_labels = hierarchical_cluster_with_min_size(Xs, max_groups, min_cluster_size, 0.5)
+        unique_labels = sorted({int(l) for l in scene_labels if int(l) >= 0})
+        print(f"[detect_scenes] Valid scene labels: {unique_labels}")
+
+        # Map keyframe indices to clusters
+        cluster_map: Dict[int, List[int]] = {}
+        for idx, lbl in enumerate(scene_labels):
+            lbl = int(lbl)
+            if lbl >= 0:
+                cluster_map.setdefault(lbl, []).append(idx)
+
+        # ------------------------------------------------------------------
+        # STEP 3: Build scene_clusters in the format expected by the demo
+        # ------------------------------------------------------------------
+        scene_clusters: List[Dict[str, Any]] = []
+        for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
+            if not idxs:
+                continue
+
+            scene_id = f"scene_{ci:02d}"
+            scene_out_dir = scenes_dir / scene_id
+            scene_out_dir.mkdir(parents=True, exist_ok=True)
+
+            # Copy all of the cluster's keyframes into the cluster folder
+            cluster_start = None
+            cluster_end = None
+            representative_file = None
+
+            for j, k_idx in enumerate(idxs):
+                src = keyframe_paths[k_idx]
+                dst = scene_out_dir / src.name
+                try:
+                    shutil.copy2(src, dst)
+                except Exception as cp_err:
+                    print(f"[detect_scenes] Error copying keyframe {src} to cluster {scene_id}: {cp_err}")
+                    continue
+
+                if representative_file is None:
+                    representative_file = dst
+
+                info = keyframe_infos[k_idx]
+                start = info.get("start", k_idx)
+                end = info.get("end", k_idx + 1)
+                cluster_start = start if cluster_start is None else min(cluster_start, start)
+                cluster_end = end if cluster_end is None else max(cluster_end, end)
+
+            if representative_file is None:
+                continue
+
+            scene_clusters.append({
+                "id": scene_id,
+                "name": f"Scene {len(scene_clusters)+1}",
+                "folder": str(scene_out_dir),
+                "image_url": f"/files_scene/{video_name}/{scene_id}/{representative_file.name}",
+                "start_time": float(cluster_start) if cluster_start is not None else 0.0,
+                "end_time": float(cluster_end) if cluster_end is not None else 0.0,
+            })
+
+        print(f"[detect_scenes] ✓ {len(scene_clusters)} scenes clustered")
         return {"scene_clusters": scene_clusters}
 
     except Exception as e:
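
For context, a minimal sketch of the contract the rewritten hierarchical_cluster_with_min_size is meant to satisfy, assuming the function is importable from preprocessing_router (hypothetical import path; adjust to your layout). Two tight groups of vectors go in; at most max_groups valid clusters come back, and members of any cluster smaller than min_cluster_size are folded into the noise label -1:

# Sketch only: toy check of the k-target + min-size contract.
import numpy as np
from preprocessing_router import hierarchical_cluster_with_min_size

rng = np.random.default_rng(0)
a = rng.normal(0.0, 0.01, size=(5, 8))
a[:, 0] += 1.0  # group concentrated around the first axis
b = rng.normal(0.0, 0.01, size=(5, 8))
b[:, 1] += 1.0  # group concentrated around the second axis
X = np.vstack([a, b]).astype("float32")

labels = hierarchical_cluster_with_min_size(X, max_groups=2, min_cluster_size=3)
# At most 2 distinct non-negative labels; if the top-down pass over-splits,
# members of clusters smaller than 3 are marked -1.
print(labels)

The per-keyframe embedding in STEP 1 is deliberately simple: a normalized 8x8x8 RGB color histogram rather than a learned feature. A standalone sketch of that extraction, with frame.jpg as a stand-in path:

import cv2

img = cv2.imread("frame.jpg")  # stand-in path; any readable image works
if img is not None:
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # 8 bins per channel -> a 512-dim feature; cv2.normalize defaults to L2 norm
    hist = cv2.calcHist([img_rgb], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    feat = cv2.normalize(hist, hist).flatten().astype("float32")
    print(feat.shape)  # (512,)
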
svision_client.py
CHANGED

@@ -125,17 +125,39 @@ def extract_descripcion_escena(imagen_path: str) -> str:
 
 
 def _extract_path_from_gradio_file(file_obj) -> Optional[str]:
-    """Extract file path from Gradio file object (can be dict, str, or other).
+    """Extract file path from Gradio file object (can be dict, str, tuple, or other).
+
+    Gradio Gallery returns different formats depending on version:
+    - List of tuples: [(path, caption), ...]
+    - List of dicts: [{"name": path, "data": None, "is_file": True}, ...]
+    - List of FileData: [FileData(path=..., url=...), ...]
+    - List of paths: [path, ...]
+    """
     if file_obj is None:
         return None
+
+    # Handle tuple format: (path, caption)
+    if isinstance(file_obj, tuple) and len(file_obj) >= 1:
+        return _extract_path_from_gradio_file(file_obj[0])
+
+    # Handle string path/URL
     if isinstance(file_obj, str):
         return file_obj
+
+    # Handle dict format: {"path": "...", "url": "...", "name": "..."}
     if isinstance(file_obj, dict):
-        …
+        return file_obj.get("path") or file_obj.get("url") or file_obj.get("name") or file_obj.get("image")
+
+    # Handle FileData or similar object with attributes
+    if hasattr(file_obj, "path") and file_obj.path:
+        return file_obj.path
+    if hasattr(file_obj, "url") and file_obj.url:
+        return file_obj.url
+    if hasattr(file_obj, "name") and file_obj.name:
         return file_obj.name
-    …
+
+    # Last resort: convert to string
+    return str(file_obj) if file_obj else None
 
 
 def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
@@ -162,18 +184,27 @@ def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
             api_name="/face_image_embedding_casting"
         )
 
+        print(f"[svision_client] Raw result type: {type(result)}, len: {len(result) if result else 0}")
+
        # result is a tuple: (list of image paths/dicts, list of embedding dicts)
         if result and len(result) >= 2:
             face_crops_raw = result[0] if result[0] else []
             face_embeddings = result[1] if result[1] else []
 
+            print(f"[svision_client] face_crops_raw type: {type(face_crops_raw)}, len: {len(face_crops_raw) if isinstance(face_crops_raw, list) else 'N/A'}")
+            if face_crops_raw and len(face_crops_raw) > 0:
+                print(f"[svision_client] First crop type: {type(face_crops_raw[0])}, value: {str(face_crops_raw[0])[:200]}")
+
             # Combine into unified structure, extracting paths correctly
             faces = []
             for i, emb_dict in enumerate(face_embeddings):
                 # Extract path from Gradio file object (might be dict or string)
                 crop_path = None
                 if i < len(face_crops_raw):
-                    …
+                    raw_crop = face_crops_raw[i]
+                    crop_path = _extract_path_from_gradio_file(raw_crop)
+                    if not crop_path:
+                        print(f"[svision_client] Could not extract path from crop {i}: {type(raw_crop)} = {str(raw_crop)[:100]}")
 
                 embedding = emb_dict.get("embedding", []) if isinstance(emb_dict, dict) else []
 
@@ -184,9 +215,6 @@ def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
                 })
 
             print(f"[svision_client] Detected {len(faces)} faces from image")
-            for i, f in enumerate(faces):
-                crop_path = f.get("face_crop_path")
-                print(f"[svision_client] Face {i}: crop_path={crop_path[:80] if crop_path else 'None'}...")
             return faces
         return []
     except Exception as e:
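
A quick sketch of the object shapes the hardened _extract_path_from_gradio_file now handles, assuming it is importable from svision_client; SimpleNamespace stands in here for Gradio's FileData, and the /tmp paths are hypothetical:

from types import SimpleNamespace
from svision_client import _extract_path_from_gradio_file

# Hypothetical stand-ins for the shapes a Gradio Gallery can return.
cases = [
    "/tmp/crop_0.png",                                   # plain path string
    ("/tmp/crop_1.png", "caption"),                      # (path, caption) tuple
    {"name": "/tmp/crop_2.png", "is_file": True},        # legacy dict format
    SimpleNamespace(path="/tmp/crop_3.png", url=None, name=None),  # FileData-like
]

for c in cases:
    print(_extract_path_from_gradio_file(c))  # each case prints its /tmp/crop_N.png
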