Spaces:

VeuReu
/

engine

Running

App Files Files Community

VeuReu committed 21 days ago

Commit

a40d539

verified ·

1 Parent(s): 19f6f25

Upload 2 files

Browse files

Files changed (2) hide show

preprocessing_router.py +194 -73
svision_client.py +37 -9

preprocessing_router.py CHANGED Viewed

@@ -46,46 +46,73 @@ jobs: Dict[str, dict] = {}
 # ---------------------------------------------------------------------------
 def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
-    """Hierarchical clustering with silhouette score and minimum cluster size."""
     # Removed (pre-commit) implementation: picks the number of clusters by a
     # penalised silhouette score, then relabels undersized clusters as -1.
     from scipy.cluster.hierarchy import linkage, fcluster
-    from sklearn.metrics import silhouette_score
     from collections import Counter
     # Empty input yields an empty array.
-    if len(X) == 0:
         return np.array([])
     # Fewer samples than one minimum-size cluster: everything is noise (-1).
-    if len(X) < min_cluster_size:
-        return np.full(len(X), -1, dtype=int)
     # Average-linkage tree over cosine distances, built once and cut repeatedly.
-    Z = linkage(X, method='average', metric='cosine')
     # Fallback of 2 clusters is used whenever no candidate below gets scored.
     # NOTE(review): this fallback also applies when max_groups < 2.
-    best_n_clusters = 2
-    best_score = -1
-    max_to_try = min(max_groups, len(X) - 1)
-    if max_to_try >= 2:
-        for n_clusters in range(2, max_to_try + 1):
-            trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
-            trial_counts = Counter(trial_labels)
-            valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
             # Only score cuts with at least two clusters that meet the minimum size.
-            if valid_clusters >= 2:
-                try:
-                    score = silhouette_score(X, trial_labels, metric='cosine')
                     # Per-cluster penalty shrinks as sensitivity grows
                     # (0.14 at sensitivity 0, 0.01 at sensitivity 1).
-                    penalty = 0.14 - (sensitivity * 0.13)
-                    adjusted_score = score - (n_clusters * penalty)
-                    if adjusted_score > best_score:
-                        best_score = adjusted_score
-                        best_n_clusters = n_clusters
                 # Any failure while scoring a candidate is silently ignored.
-                except Exception:
-                    pass
     # Final cut with the chosen cluster count; labels are shifted to start at 0.
-    labels = fcluster(Z, t=best_n_clusters, criterion='maxclust') - 1
-    label_counts = Counter(labels)
     # Members of clusters smaller than min_cluster_size are relabelled -1.
-    filtered_labels = []
-    for lbl in labels:
-        if label_counts[lbl] >= min_cluster_size:
-            filtered_labels.append(lbl)
-        else:
-            filtered_labels.append(-1)
-    return np.array(filtered_labels, dtype=int)
 router = APIRouter(tags=["Preprocessing Manager"])
@@ -378,48 +405,63 @@ async def detect_scenes(
     scene_sensitivity: float = Form(default=0.5),
     frame_interval_sec: float = Form(default=0.5),
 ):
-    """Extract scenes from video using svision Space."""
     video_name = Path(video.filename).stem
     dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
     with dst_video.open("wb") as f:
         shutil.copyfileobj(video.file, f)
     try:
-        print(f"[detect_scenes] Extrayendo escenas de {video_name}...")
-        # Call svision to extract scenes
-        result = svision_client.extract_scenes(str(dst_video), threshold=scene_sensitivity)
-        # result contains scene keyframes
-        scenes_raw = result if isinstance(result, list) else []
-        print(f"[detect_scenes] svision devolvió {len(scenes_raw)} escenas")
-        # Create scene clusters directory
         base = TEMP_ROOT / video_name
         scenes_dir = base / "scenes"
         scenes_dir.mkdir(parents=True, exist_ok=True)
-        scene_clusters = []
-        for i, scene_data in enumerate(scenes_raw):
-            scene_id = f"scene_{i:02d}"
-            scene_out_dir = scenes_dir / scene_id
-            scene_out_dir.mkdir(parents=True, exist_ok=True)
-            # Extract keyframe path from scene data
-            keyframe_path = None
-            if isinstance(scene_data, str):
-                keyframe_path = scene_data
-            elif isinstance(scene_data, dict):
-                keyframe_path = scene_data.get("path") or scene_data.get("keyframe") or scene_data.get("image")
-            # Download or copy keyframe
-            local_keyframe = scene_out_dir / "keyframe.jpg"
             keyframe_saved = False
             if keyframe_path:
                 try:
                     if isinstance(keyframe_path, str) and keyframe_path.startswith("http"):
-                        import requests
                         resp = requests.get(keyframe_path, timeout=30)
                         if resp.status_code == 200:
                             with open(local_keyframe, "wb") as f:
@@ -430,18 +472,97 @@ async def detect_scenes(
                         keyframe_saved = True
                 except Exception as dl_err:
                     print(f"[detect_scenes] Error guardando keyframe {i}: {dl_err}")
-            if keyframe_saved:
-                scene_clusters.append({
-                    "id": scene_id,
-                    "name": f"Escena {i+1}",
-                    "folder": str(scene_out_dir),
-                    "image_url": f"/files_scene/{video_name}/{scene_id}/keyframe.jpg",
-                    "start_time": scene_data.get("start", 0) if isinstance(scene_data, dict) else 0,
-                    "end_time": scene_data.get("end", 0) if isinstance(scene_data, dict) else 0,
-                })
-        print(f"[detect_scenes] ✓ {len(scene_clusters)} escenas procesadas")
         return {"scene_clusters": scene_clusters}
     except Exception as e:

 # ---------------------------------------------------------------------------
 def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
     """Hierarchical clustering driven by ``min_cluster_size`` and a target count.

     One average-linkage tree over cosine distances is built and then cut at
     progressively fewer clusters, starting from the largest count that could
     still give every cluster ``min_cluster_size`` members. The first cut that
     has between 1 and ``max_groups`` sufficiently large clusters is accepted;
     samples belonging to undersized clusters in that cut are labelled ``-1``.

     Args:
         X: Feature vectors, one row per sample (anything ``len()`` and
             ``scipy.cluster.hierarchy.linkage`` accept).
         max_groups: Upper bound on the number of valid clusters (k-target).
             Values <= 0 mean "no clusters": every sample is labelled ``-1``.
         min_cluster_size: Minimum number of members for a cluster to count
             as valid. Values below 1 are treated as 1.
         sensitivity: Unused; kept in the signature for backward compatibility.

     Returns:
         Integer array of length ``len(X)``. Samples in valid clusters keep the
         zero-based label from the accepted cut (labels are not necessarily
         contiguous); every other sample is ``-1``.
     """
     from scipy.cluster.hierarchy import fcluster, linkage

     n_samples = len(X)
     if n_samples == 0:
         # Same integer dtype as every other return path.
         return np.array([], dtype=int)

     # A cluster needs at least one member; clamping also keeps the integer
     # division below from dividing by zero.
     min_cluster_size = max(1, int(min_cluster_size))
     k_target = max(0, int(max_groups))

     # Not enough samples for a single valid cluster, or no clusters wanted.
     if n_samples < min_cluster_size or k_target == 0:
         return np.full(n_samples, -1, dtype=int)

     if n_samples == 1:
         # linkage() requires at least two observations; a lone sample that
         # satisfies min_cluster_size is simply its own cluster.
         return np.zeros(1, dtype=int)

     # Build the linkage tree once; every candidate cut reuses it.
     Z = linkage(X, method="average", metric="cosine")

     # Largest cluster count for which all clusters could still reach the
     # minimum size (always >= 1 here because n_samples >= min_cluster_size).
     max_to_try = n_samples // min_cluster_size

     # Walk from many clusters to few and accept the first cut with 1..k_target
     # valid clusters.
     for n_clusters in range(max_to_try, 0, -1):
         trial_labels = fcluster(Z, t=n_clusters, criterion="maxclust") - 1
         sizes = np.bincount(trial_labels)
         valid_clusters = np.flatnonzero(sizes >= min_cluster_size)
         if valid_clusters.size == 0:
             # Cut is too fine: every cluster is below the minimum size.
             continue
         if valid_clusters.size <= k_target:
             keep = np.isin(trial_labels, valid_clusters)
             return np.where(keep, trial_labels, -1).astype(int)

     # Not expected to be reached (the single-cluster cut is always valid);
     # kept as a safe default.
     return np.full(n_samples, -1, dtype=int)
 router = APIRouter(tags=["Preprocessing Manager"])
     scene_sensitivity: float = Form(default=0.5),
     frame_interval_sec: float = Form(default=0.5),
 ):
+    """Extract keyframes from video using svision Space (1 per second)."""
+    import requests
     video_name = Path(video.filename).stem
     dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
     with dst_video.open("wb") as f:
         shutil.copyfileobj(video.file, f)
     try:
+        import cv2
+        import numpy as np
+        print(f"[detect_scenes] Extrayendo keyframes de {video_name}...")
+        # Call svision to extract keyframes (1 per second)
+        result = svision_client.keyframes_every_second_extraction(str(dst_video))
+        print(f"[detect_scenes] Raw result type: {type(result)}, len: {len(result) if result else 0}")
+        # result is tuple: (images, frames_info)
+        images_raw = []
+        frames_info = []
+        if result and len(result) >= 2:
+            images_raw = result[0] if result[0] else []
+            frames_info = result[1] if result[1] else []
+        n_keyframes = len(images_raw)
+        print(f"[detect_scenes] svision devolvió {n_keyframes} keyframes")
+        # Create base directory for scenes
         base = TEMP_ROOT / video_name
         scenes_dir = base / "scenes"
         scenes_dir.mkdir(parents=True, exist_ok=True)
+        # ------------------------------------------------------------------
+        # STEP 1: Guardar todos los keyframes y construir embeddings sencillos
+        # ------------------------------------------------------------------
+        keyframe_paths: List[Path] = []
+        keyframe_infos: List[dict] = []
+        features: List[np.ndarray] = []
+        for i, img_data in enumerate(images_raw):
+            local_keyframe = scenes_dir / f"keyframe_{i:03d}.jpg"
             keyframe_saved = False
+            # Extract path from Gradio file object
+            keyframe_path = None
+            if isinstance(img_data, str):
+                keyframe_path = img_data
+            elif isinstance(img_data, dict):
+                keyframe_path = img_data.get("path") or img_data.get("url") or img_data.get("name")
+            elif hasattr(img_data, "name"):
+                keyframe_path = img_data.name
             if keyframe_path:
                 try:
                     if isinstance(keyframe_path, str) and keyframe_path.startswith("http"):
                         resp = requests.get(keyframe_path, timeout=30)
                         if resp.status_code == 200:
                             with open(local_keyframe, "wb") as f:
                         keyframe_saved = True
                 except Exception as dl_err:
                     print(f"[detect_scenes] Error guardando keyframe {i}: {dl_err}")
+            if not keyframe_saved:
+                continue
+            # Cargar imagen y construir un histograma de color simple como embedding
+            try:
+                img = cv2.imread(str(local_keyframe))
+                if img is None:
+                    continue
+                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+                # Histograma 8x8x8 en RGB, normalizado
+                hist = cv2.calcHist([img_rgb], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
+                hist = cv2.normalize(hist, hist).flatten()
+                features.append(hist.astype("float32"))
+            except Exception as fe_err:
+                print(f"[detect_scenes] Error calculando embedding para keyframe {i}: {fe_err}")
+                continue
+            keyframe_paths.append(local_keyframe)
+            info = frames_info[i] if i < len(frames_info) else {}
+            keyframe_infos.append(info if isinstance(info, dict) else {})
+        if not features or len(features) < min_cluster_size:
+            print("[detect_scenes] No hay suficientes keyframes válidos para clusterizar escenas")
+            return {"scene_clusters": []}
+        Xs = np.vstack(features)
+        # ------------------------------------------------------------------
+        # STEP 2: Clustering jerárquico de escenas (k-Target + mida mínima)
+        # ------------------------------------------------------------------
+        print("[detect_scenes] Clustering jerárquico de escenas...")
+        scene_labels = hierarchical_cluster_with_min_size(Xs, max_groups, min_cluster_size, 0.5)
+        unique_labels = sorted({int(l) for l in scene_labels if int(l) >= 0})
+        print(f"[detect_scenes] Etiquetas de escena válidas: {unique_labels}")
+        # Mapear índices de keyframes a clusters
+        cluster_map: Dict[int, List[int]] = {}
+        for idx, lbl in enumerate(scene_labels):
+            lbl = int(lbl)
+            if lbl >= 0:
+                cluster_map.setdefault(lbl, []).append(idx)
+        # ------------------------------------------------------------------
+        # STEP 3: Construir scene_clusters con el formato esperado por el demo
+        # ------------------------------------------------------------------
+        scene_clusters: List[Dict[str, Any]] = []
+        for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
+            if not idxs:
+                continue
+            scene_id = f"scene_{ci:02d}"
+            scene_out_dir = scenes_dir / scene_id
+            scene_out_dir.mkdir(parents=True, exist_ok=True)
+            # Copiar todos los keyframes del cluster a la carpeta del cluster
+            cluster_start = None
+            cluster_end = None
+            representative_file = None
+            for j, k_idx in enumerate(idxs):
+                src = keyframe_paths[k_idx]
+                dst = scene_out_dir / src.name
+                try:
+                    shutil.copy2(src, dst)
+                except Exception as cp_err:
+                    print(f"[detect_scenes] Error copiando keyframe {src} a cluster {scene_id}: {cp_err}")
+                    continue
+                if representative_file is None:
+                    representative_file = dst
+                info = keyframe_infos[k_idx]
+                start = info.get("start", k_idx)
+                end = info.get("end", k_idx + 1)
+                cluster_start = start if cluster_start is None else min(cluster_start, start)
+                cluster_end = end if cluster_end is None else max(cluster_end, end)
+            if representative_file is None:
+                continue
+            scene_clusters.append({
+                "id": scene_id,
+                "name": f"Escena {len(scene_clusters)+1}",
+                "folder": str(scene_out_dir),
+                "image_url": f"/files_scene/{video_name}/{scene_id}/{representative_file.name}",
+                "start_time": float(cluster_start) if cluster_start is not None else 0.0,
+                "end_time": float(cluster_end) if cluster_end is not None else 0.0,
+            })
+        print(f"[detect_scenes] ✓ {len(scene_clusters)} escenes clusteritzades")
         return {"scene_clusters": scene_clusters}
     except Exception as e:

svision_client.py CHANGED Viewed

@@ -125,17 +125,39 @@ def extract_descripcion_escena(imagen_path: str) -> str:
 def _extract_path_from_gradio_file(file_obj) -> Optional[str]:
-    """Extract file path from Gradio file object (can be dict, str, or other)."""
     # Removed (pre-commit) implementation; tuples and lists are not handled here
     # and end up in the str() fallback at the bottom.
     if file_obj is None:
         return None
     # Strings are returned unchanged.
     if isinstance(file_obj, str):
         return file_obj
     if isinstance(file_obj, dict):
-        # Gradio returns dicts like {"path": "...", "url": "...", "orig_name": "..."}
         # First truthy value among "path", "url", "name" (None if none is set).
-        return file_obj.get("path") or file_obj.get("url") or file_obj.get("name")
     # Any object exposing a `name` attribute: return that attribute as-is.
-    if hasattr(file_obj, "name"):
         return file_obj.name
     # Fallback: string representation of whatever was passed in.
-    return str(file_obj)
 def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
@@ -162,18 +184,27 @@ def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
             api_name="/face_image_embedding_casting"
         )
         # result is a tuple: (list of image paths/dicts, list of embedding dicts)
         if result and len(result) >= 2:
             face_crops_raw = result[0] if result[0] else []
             face_embeddings = result[1] if result[1] else []
             # Combine into unified structure, extracting paths correctly
             faces = []
             for i, emb_dict in enumerate(face_embeddings):
                 # Extract path from Gradio file object (might be dict or string)
                 crop_path = None
                 if i < len(face_crops_raw):
-                    crop_path = _extract_path_from_gradio_file(face_crops_raw[i])
                 embedding = emb_dict.get("embedding", []) if isinstance(emb_dict, dict) else []
@@ -184,9 +215,6 @@ def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
                 })
             print(f"[svision_client] Detected {len(faces)} faces from image")
-            for i, f in enumerate(faces):
-                crop_path = f.get("face_crop_path")
-                print(f"[svision_client]   Face {i}: crop_path={crop_path[:80] if crop_path else 'None'}...")
             return faces
         return []
     except Exception as e:

 def _extract_path_from_gradio_file(file_obj) -> Optional[str]:
+    """Extract file path from Gradio file object (can be dict, str, tuple, or other).
+    Gradio Gallery returns different formats depending on version:
+    - List of tuples: [(path, caption), ...]
+    - List of dicts: [{"name": path, "data": None, "is_file": True}, ...]
+    - List of FileData: [FileData(path=..., url=...), ...]
+    - List of paths: [path, ...]
+    """
     if file_obj is None:
         return None
+    # Handle tuple format: (path, caption)
+    if isinstance(file_obj, tuple) and len(file_obj) >= 1:
+        return _extract_path_from_gradio_file(file_obj[0])
+    # Handle string path/URL
     if isinstance(file_obj, str):
         return file_obj
+    # Handle dict format: {"path": "...", "url": "...", "name": "..."}
     if isinstance(file_obj, dict):
+        return file_obj.get("path") or file_obj.get("url") or file_obj.get("name") or file_obj.get("image")
+    # Handle FileData or similar object with attributes
+    if hasattr(file_obj, "path") and file_obj.path:
+        return file_obj.path
+    if hasattr(file_obj, "url") and file_obj.url:
+        return file_obj.url
+    if hasattr(file_obj, "name") and file_obj.name:
         return file_obj.name
+    # Last resort: convert to string
+    return str(file_obj) if file_obj else None
 def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
             api_name="/face_image_embedding_casting"
         )
+        print(f"[svision_client] Raw result type: {type(result)}, len: {len(result) if result else 0}")
         # result is a tuple: (list of image paths/dicts, list of embedding dicts)
         if result and len(result) >= 2:
             face_crops_raw = result[0] if result[0] else []
             face_embeddings = result[1] if result[1] else []
+            print(f"[svision_client] face_crops_raw type: {type(face_crops_raw)}, len: {len(face_crops_raw) if isinstance(face_crops_raw, list) else 'N/A'}")
+            if face_crops_raw and len(face_crops_raw) > 0:
+                print(f"[svision_client] First crop type: {type(face_crops_raw[0])}, value: {str(face_crops_raw[0])[:200]}")
             # Combine into unified structure, extracting paths correctly
             faces = []
             for i, emb_dict in enumerate(face_embeddings):
                 # Extract path from Gradio file object (might be dict or string)
                 crop_path = None
                 if i < len(face_crops_raw):
+                    raw_crop = face_crops_raw[i]
+                    crop_path = _extract_path_from_gradio_file(raw_crop)
+                    if not crop_path:
+                        print(f"[svision_client] Could not extract path from crop {i}: {type(raw_crop)} = {str(raw_crop)[:100]}")
                 embedding = emb_dict.get("embedding", []) if isinstance(emb_dict, dict) else []
                 })
             print(f"[svision_client] Detected {len(faces)} faces from image")
             return faces
         return []
     except Exception as e: