VeuReu committed on
Commit
a40d539
·
verified ·
1 Parent(s): 19f6f25

Upload 2 files

Browse files
Files changed (2) hide show
  1. preprocessing_router.py +194 -73
  2. svision_client.py +37 -9
preprocessing_router.py CHANGED
@@ -46,46 +46,73 @@ jobs: Dict[str, dict] = {}
46
  # ---------------------------------------------------------------------------
47
 
48
  def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
49
- """Hierarchical clustering with silhouette score and minimum cluster size."""
 
 
 
 
 
 
 
 
 
50
  from scipy.cluster.hierarchy import linkage, fcluster
51
- from sklearn.metrics import silhouette_score
52
  from collections import Counter
53
 
54
- if len(X) == 0:
 
55
  return np.array([])
56
- if len(X) < min_cluster_size:
57
- return np.full(len(X), -1, dtype=int)
58
-
59
- Z = linkage(X, method='average', metric='cosine')
60
- best_n_clusters = 2
61
- best_score = -1
62
- max_to_try = min(max_groups, len(X) - 1)
63
-
64
- if max_to_try >= 2:
65
- for n_clusters in range(2, max_to_try + 1):
66
- trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
67
- trial_counts = Counter(trial_labels)
68
- valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
69
- if valid_clusters >= 2:
70
- try:
71
- score = silhouette_score(X, trial_labels, metric='cosine')
72
- penalty = 0.14 - (sensitivity * 0.13)
73
- adjusted_score = score - (n_clusters * penalty)
74
- if adjusted_score > best_score:
75
- best_score = adjusted_score
76
- best_n_clusters = n_clusters
77
- except Exception:
78
- pass
79
 
80
- labels = fcluster(Z, t=best_n_clusters, criterion='maxclust') - 1
81
- label_counts = Counter(labels)
82
- filtered_labels = []
83
- for lbl in labels:
84
- if label_counts[lbl] >= min_cluster_size:
85
- filtered_labels.append(lbl)
86
- else:
87
- filtered_labels.append(-1)
88
- return np.array(filtered_labels, dtype=int)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
 
91
  router = APIRouter(tags=["Preprocessing Manager"])
@@ -378,48 +405,63 @@ async def detect_scenes(
378
  scene_sensitivity: float = Form(default=0.5),
379
  frame_interval_sec: float = Form(default=0.5),
380
  ):
381
- """Extract scenes from video using svision Space."""
 
 
382
  video_name = Path(video.filename).stem
383
  dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
384
  with dst_video.open("wb") as f:
385
  shutil.copyfileobj(video.file, f)
386
 
387
  try:
388
- print(f"[detect_scenes] Extrayendo escenas de {video_name}...")
 
 
 
389
 
390
- # Call svision to extract scenes
391
- result = svision_client.extract_scenes(str(dst_video), threshold=scene_sensitivity)
392
 
393
- # result contains scene keyframes
394
- scenes_raw = result if isinstance(result, list) else []
395
- print(f"[detect_scenes] svision devolvió {len(scenes_raw)} escenas")
396
 
397
- # Create scene clusters directory
 
 
 
 
 
 
 
 
 
 
398
  base = TEMP_ROOT / video_name
399
  scenes_dir = base / "scenes"
400
  scenes_dir.mkdir(parents=True, exist_ok=True)
401
-
402
- scene_clusters = []
403
- for i, scene_data in enumerate(scenes_raw):
404
- scene_id = f"scene_{i:02d}"
405
- scene_out_dir = scenes_dir / scene_id
406
- scene_out_dir.mkdir(parents=True, exist_ok=True)
407
-
408
- # Extract keyframe path from scene data
409
- keyframe_path = None
410
- if isinstance(scene_data, str):
411
- keyframe_path = scene_data
412
- elif isinstance(scene_data, dict):
413
- keyframe_path = scene_data.get("path") or scene_data.get("keyframe") or scene_data.get("image")
414
-
415
- # Download or copy keyframe
416
- local_keyframe = scene_out_dir / "keyframe.jpg"
417
  keyframe_saved = False
418
-
 
 
 
 
 
 
 
 
 
419
  if keyframe_path:
420
  try:
421
  if isinstance(keyframe_path, str) and keyframe_path.startswith("http"):
422
- import requests
423
  resp = requests.get(keyframe_path, timeout=30)
424
  if resp.status_code == 200:
425
  with open(local_keyframe, "wb") as f:
@@ -430,18 +472,97 @@ async def detect_scenes(
430
  keyframe_saved = True
431
  except Exception as dl_err:
432
  print(f"[detect_scenes] Error guardando keyframe {i}: {dl_err}")
433
-
434
- if keyframe_saved:
435
- scene_clusters.append({
436
- "id": scene_id,
437
- "name": f"Escena {i+1}",
438
- "folder": str(scene_out_dir),
439
- "image_url": f"/files_scene/{video_name}/{scene_id}/keyframe.jpg",
440
- "start_time": scene_data.get("start", 0) if isinstance(scene_data, dict) else 0,
441
- "end_time": scene_data.get("end", 0) if isinstance(scene_data, dict) else 0,
442
- })
443
-
444
- print(f"[detect_scenes] {len(scene_clusters)} escenas procesadas")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  return {"scene_clusters": scene_clusters}
446
 
447
  except Exception as e:
 
46
  # ---------------------------------------------------------------------------
47
 
48
  def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
49
+ """Hierarchical clustering using only min_cluster_size and k-target (max_groups).
50
+
51
+ - Primero intenta crear el máximo número posible de clusters con al menos
52
+ ``min_cluster_size`` elementos.
53
+ - Después fusiona implícitamente (bajando el número de clusters) hasta
54
+ llegar a un número de clusters válidos (tamaño >= min_cluster_size)
55
+ menor o igual que ``max_groups``.
56
+
57
+ ``sensitivity`` se mantiene en la firma por compatibilidad, pero no se usa.
58
+ """
59
  from scipy.cluster.hierarchy import linkage, fcluster
 
60
  from collections import Counter
61
 
62
+ n_samples = len(X)
63
+ if n_samples == 0:
64
  return np.array([])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
+ # Si no hay suficientes muestras para formar un solo cluster válido,
67
+ # marcamos todo como ruido (-1).
68
+ if n_samples < min_cluster_size:
69
+ return np.full(n_samples, -1, dtype=int)
70
+
71
+ # k_target = max_groups (interpretamos este parámetro como k-Target)
72
+ k_target = max(0, int(max_groups))
73
+
74
+ # Caso especial: k_target == 0 => no queremos clusters, todo ruido.
75
+ if k_target == 0:
76
+ return np.full(n_samples, -1, dtype=int)
77
+
78
+ # Enlace jerárquico una sola vez
79
+ Z = linkage(X, method="average", metric="cosine")
80
+
81
+ # Máximo número de clusters posibles respetando min_cluster_size
82
+ max_possible = n_samples // min_cluster_size
83
+ if max_possible <= 0:
84
+ return np.full(n_samples, -1, dtype=int)
85
+
86
+ max_to_try = min(max_possible, n_samples)
87
+
88
+ best_labels = np.full(n_samples, -1, dtype=int)
89
+
90
+ # Recorremos de más clusters a menos, buscando la primera solución
91
+ # que tenga entre 1 y k_target clusters válidos.
92
+ for n_clusters in range(max_to_try, 0, -1):
93
+ trial_labels = fcluster(Z, t=n_clusters, criterion="maxclust") - 1
94
+ counts = Counter(trial_labels)
95
+
96
+ # Clusters con tamaño suficiente
97
+ valid_clusters = {lbl for lbl, cnt in counts.items() if cnt >= min_cluster_size}
98
+ num_valid = len(valid_clusters)
99
+
100
+ if num_valid == 0:
101
+ # Demasiado fino, todos los clusters son demasiado pequeños
102
+ continue
103
+
104
+ if num_valid <= k_target:
105
+ # Aceptamos esta solución
106
+ final_labels = []
107
+ for lbl in trial_labels:
108
+ if lbl in valid_clusters:
109
+ final_labels.append(lbl)
110
+ else:
111
+ final_labels.append(-1)
112
+ best_labels = np.array(final_labels, dtype=int)
113
+ break
114
+
115
+ return best_labels
116
 
117
 
118
  router = APIRouter(tags=["Preprocessing Manager"])
 
405
  scene_sensitivity: float = Form(default=0.5),
406
  frame_interval_sec: float = Form(default=0.5),
407
  ):
408
+ """Extract keyframes from video using svision Space (1 per second)."""
409
+ import requests
410
+
411
  video_name = Path(video.filename).stem
412
  dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
413
  with dst_video.open("wb") as f:
414
  shutil.copyfileobj(video.file, f)
415
 
416
  try:
417
+ import cv2
418
+ import numpy as np
419
+
420
+ print(f"[detect_scenes] Extrayendo keyframes de {video_name}...")
421
 
422
+ # Call svision to extract keyframes (1 per second)
423
+ result = svision_client.keyframes_every_second_extraction(str(dst_video))
424
 
425
+ print(f"[detect_scenes] Raw result type: {type(result)}, len: {len(result) if result else 0}")
 
 
426
 
427
+ # result is tuple: (images, frames_info)
428
+ images_raw = []
429
+ frames_info = []
430
+ if result and len(result) >= 2:
431
+ images_raw = result[0] if result[0] else []
432
+ frames_info = result[1] if result[1] else []
433
+
434
+ n_keyframes = len(images_raw)
435
+ print(f"[detect_scenes] svision devolvió {n_keyframes} keyframes")
436
+
437
+ # Create base directory for scenes
438
  base = TEMP_ROOT / video_name
439
  scenes_dir = base / "scenes"
440
  scenes_dir.mkdir(parents=True, exist_ok=True)
441
+
442
+ # ------------------------------------------------------------------
443
+ # STEP 1: Guardar todos los keyframes y construir embeddings sencillos
444
+ # ------------------------------------------------------------------
445
+ keyframe_paths: List[Path] = []
446
+ keyframe_infos: List[dict] = []
447
+ features: List[np.ndarray] = []
448
+
449
+ for i, img_data in enumerate(images_raw):
450
+ local_keyframe = scenes_dir / f"keyframe_{i:03d}.jpg"
 
 
 
 
 
 
451
  keyframe_saved = False
452
+
453
+ # Extract path from Gradio file object
454
+ keyframe_path = None
455
+ if isinstance(img_data, str):
456
+ keyframe_path = img_data
457
+ elif isinstance(img_data, dict):
458
+ keyframe_path = img_data.get("path") or img_data.get("url") or img_data.get("name")
459
+ elif hasattr(img_data, "name"):
460
+ keyframe_path = img_data.name
461
+
462
  if keyframe_path:
463
  try:
464
  if isinstance(keyframe_path, str) and keyframe_path.startswith("http"):
 
465
  resp = requests.get(keyframe_path, timeout=30)
466
  if resp.status_code == 200:
467
  with open(local_keyframe, "wb") as f:
 
472
  keyframe_saved = True
473
  except Exception as dl_err:
474
  print(f"[detect_scenes] Error guardando keyframe {i}: {dl_err}")
475
+
476
+ if not keyframe_saved:
477
+ continue
478
+
479
+ # Cargar imagen y construir un histograma de color simple como embedding
480
+ try:
481
+ img = cv2.imread(str(local_keyframe))
482
+ if img is None:
483
+ continue
484
+ img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
485
+ # Histograma 8x8x8 en RGB, normalizado
486
+ hist = cv2.calcHist([img_rgb], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
487
+ hist = cv2.normalize(hist, hist).flatten()
488
+ features.append(hist.astype("float32"))
489
+ except Exception as fe_err:
490
+ print(f"[detect_scenes] Error calculando embedding para keyframe {i}: {fe_err}")
491
+ continue
492
+
493
+ keyframe_paths.append(local_keyframe)
494
+ info = frames_info[i] if i < len(frames_info) else {}
495
+ keyframe_infos.append(info if isinstance(info, dict) else {})
496
+
497
+ if not features or len(features) < min_cluster_size:
498
+ print("[detect_scenes] No hay suficientes keyframes válidos para clusterizar escenas")
499
+ return {"scene_clusters": []}
500
+
501
+ Xs = np.vstack(features)
502
+
503
+ # ------------------------------------------------------------------
504
+ # STEP 2: Clustering jerárquico de escenas (k-Target + mida mínima)
505
+ # ------------------------------------------------------------------
506
+ print("[detect_scenes] Clustering jerárquico de escenas...")
507
+ scene_labels = hierarchical_cluster_with_min_size(Xs, max_groups, min_cluster_size, 0.5)
508
+ unique_labels = sorted({int(l) for l in scene_labels if int(l) >= 0})
509
+ print(f"[detect_scenes] Etiquetas de escena válidas: {unique_labels}")
510
+
511
+ # Mapear índices de keyframes a clusters
512
+ cluster_map: Dict[int, List[int]] = {}
513
+ for idx, lbl in enumerate(scene_labels):
514
+ lbl = int(lbl)
515
+ if lbl >= 0:
516
+ cluster_map.setdefault(lbl, []).append(idx)
517
+
518
+ # ------------------------------------------------------------------
519
+ # STEP 3: Construir scene_clusters con el formato esperado por el demo
520
+ # ------------------------------------------------------------------
521
+ scene_clusters: List[Dict[str, Any]] = []
522
+ for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
523
+ if not idxs:
524
+ continue
525
+
526
+ scene_id = f"scene_{ci:02d}"
527
+ scene_out_dir = scenes_dir / scene_id
528
+ scene_out_dir.mkdir(parents=True, exist_ok=True)
529
+
530
+ # Copiar todos los keyframes del cluster a la carpeta del cluster
531
+ cluster_start = None
532
+ cluster_end = None
533
+ representative_file = None
534
+
535
+ for j, k_idx in enumerate(idxs):
536
+ src = keyframe_paths[k_idx]
537
+ dst = scene_out_dir / src.name
538
+ try:
539
+ shutil.copy2(src, dst)
540
+ except Exception as cp_err:
541
+ print(f"[detect_scenes] Error copiando keyframe {src} a cluster {scene_id}: {cp_err}")
542
+ continue
543
+
544
+ if representative_file is None:
545
+ representative_file = dst
546
+
547
+ info = keyframe_infos[k_idx]
548
+ start = info.get("start", k_idx)
549
+ end = info.get("end", k_idx + 1)
550
+ cluster_start = start if cluster_start is None else min(cluster_start, start)
551
+ cluster_end = end if cluster_end is None else max(cluster_end, end)
552
+
553
+ if representative_file is None:
554
+ continue
555
+
556
+ scene_clusters.append({
557
+ "id": scene_id,
558
+ "name": f"Escena {len(scene_clusters)+1}",
559
+ "folder": str(scene_out_dir),
560
+ "image_url": f"/files_scene/{video_name}/{scene_id}/{representative_file.name}",
561
+ "start_time": float(cluster_start) if cluster_start is not None else 0.0,
562
+ "end_time": float(cluster_end) if cluster_end is not None else 0.0,
563
+ })
564
+
565
+ print(f"[detect_scenes] ✓ {len(scene_clusters)} escenes clusteritzades")
566
  return {"scene_clusters": scene_clusters}
567
 
568
  except Exception as e:
svision_client.py CHANGED
@@ -125,17 +125,39 @@ def extract_descripcion_escena(imagen_path: str) -> str:
125
 
126
 
127
  def _extract_path_from_gradio_file(file_obj) -> Optional[str]:
128
- """Extract file path from Gradio file object (can be dict, str, or other)."""
 
 
 
 
 
 
 
129
  if file_obj is None:
130
  return None
 
 
 
 
 
 
131
  if isinstance(file_obj, str):
132
  return file_obj
 
 
133
  if isinstance(file_obj, dict):
134
- # Gradio returns dicts like {"path": "...", "url": "...", "orig_name": "..."}
135
- return file_obj.get("path") or file_obj.get("url") or file_obj.get("name")
136
- if hasattr(file_obj, "name"):
 
 
 
 
 
137
  return file_obj.name
138
- return str(file_obj)
 
 
139
 
140
 
141
  def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
@@ -162,18 +184,27 @@ def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
162
  api_name="/face_image_embedding_casting"
163
  )
164
 
 
 
165
  # result is a tuple: (list of image paths/dicts, list of embedding dicts)
166
  if result and len(result) >= 2:
167
  face_crops_raw = result[0] if result[0] else []
168
  face_embeddings = result[1] if result[1] else []
169
 
 
 
 
 
170
  # Combine into unified structure, extracting paths correctly
171
  faces = []
172
  for i, emb_dict in enumerate(face_embeddings):
173
  # Extract path from Gradio file object (might be dict or string)
174
  crop_path = None
175
  if i < len(face_crops_raw):
176
- crop_path = _extract_path_from_gradio_file(face_crops_raw[i])
 
 
 
177
 
178
  embedding = emb_dict.get("embedding", []) if isinstance(emb_dict, dict) else []
179
 
@@ -184,9 +215,6 @@ def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
184
  })
185
 
186
  print(f"[svision_client] Detected {len(faces)} faces from image")
187
- for i, f in enumerate(faces):
188
- crop_path = f.get("face_crop_path")
189
- print(f"[svision_client] Face {i}: crop_path={crop_path[:80] if crop_path else 'None'}...")
190
  return faces
191
  return []
192
  except Exception as e:
 
125
 
126
 
127
  def _extract_path_from_gradio_file(file_obj) -> Optional[str]:
128
+ """Extract file path from Gradio file object (can be dict, str, tuple, or other).
129
+
130
+ Gradio Gallery returns different formats depending on version:
131
+ - List of tuples: [(path, caption), ...]
132
+ - List of dicts: [{"name": path, "data": None, "is_file": True}, ...]
133
+ - List of FileData: [FileData(path=..., url=...), ...]
134
+ - List of paths: [path, ...]
135
+ """
136
  if file_obj is None:
137
  return None
138
+
139
+ # Handle tuple format: (path, caption)
140
+ if isinstance(file_obj, tuple) and len(file_obj) >= 1:
141
+ return _extract_path_from_gradio_file(file_obj[0])
142
+
143
+ # Handle string path/URL
144
  if isinstance(file_obj, str):
145
  return file_obj
146
+
147
+ # Handle dict format: {"path": "...", "url": "...", "name": "..."}
148
  if isinstance(file_obj, dict):
149
+ return file_obj.get("path") or file_obj.get("url") or file_obj.get("name") or file_obj.get("image")
150
+
151
+ # Handle FileData or similar object with attributes
152
+ if hasattr(file_obj, "path") and file_obj.path:
153
+ return file_obj.path
154
+ if hasattr(file_obj, "url") and file_obj.url:
155
+ return file_obj.url
156
+ if hasattr(file_obj, "name") and file_obj.name:
157
  return file_obj.name
158
+
159
+ # Last resort: convert to string
160
+ return str(file_obj) if file_obj else None
161
 
162
 
163
  def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
 
184
  api_name="/face_image_embedding_casting"
185
  )
186
 
187
+ print(f"[svision_client] Raw result type: {type(result)}, len: {len(result) if result else 0}")
188
+
189
  # result is a tuple: (list of image paths/dicts, list of embedding dicts)
190
  if result and len(result) >= 2:
191
  face_crops_raw = result[0] if result[0] else []
192
  face_embeddings = result[1] if result[1] else []
193
 
194
+ print(f"[svision_client] face_crops_raw type: {type(face_crops_raw)}, len: {len(face_crops_raw) if isinstance(face_crops_raw, list) else 'N/A'}")
195
+ if face_crops_raw and len(face_crops_raw) > 0:
196
+ print(f"[svision_client] First crop type: {type(face_crops_raw[0])}, value: {str(face_crops_raw[0])[:200]}")
197
+
198
  # Combine into unified structure, extracting paths correctly
199
  faces = []
200
  for i, emb_dict in enumerate(face_embeddings):
201
  # Extract path from Gradio file object (might be dict or string)
202
  crop_path = None
203
  if i < len(face_crops_raw):
204
+ raw_crop = face_crops_raw[i]
205
+ crop_path = _extract_path_from_gradio_file(raw_crop)
206
+ if not crop_path:
207
+ print(f"[svision_client] Could not extract path from crop {i}: {type(raw_crop)} = {str(raw_crop)[:100]}")
208
 
209
  embedding = emb_dict.get("embedding", []) if isinstance(emb_dict, dict) else []
210
 
 
215
  })
216
 
217
  print(f"[svision_client] Detected {len(faces)} faces from image")
 
 
 
218
  return faces
219
  return []
220
  except Exception as e: