"""Client helpers for calling the remote Gradio Space "VeuReu/svision"."""

import os

# Expose only GPU 1 to this process (set before any CUDA-aware library initializes).
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from gradio_client import Client, handle_file
from typing import Any, Dict, List, Optional, Tuple, Union
import requests
import json


_svision_client = None


def _get_svision_client():
    """Get or create the svision client (lazy initialization)."""
    global _svision_client
    if _svision_client is None:
        _svision_client = Client("VeuReu/svision")
    return _svision_client


def extract_scenes(video_path: str, threshold: float = 30.0, offset_frames: int = 5, crop_ratio: float = 0.1):
    """
    Call the /scenes_extraction endpoint of the remote Space VeuReu/svision.

    Parameters
    ----------
    video_path : str
        Path to the input video file.
    threshold : float, optional
        Scene change detection threshold; higher values make detection less sensitive.
    offset_frames : int, optional
        Number of frames to include before and after a detected scene boundary.
    crop_ratio : float, optional
        Ratio for cropping borders before performing scene detection.

    Returns
    -------
    Any
        Response returned by the remote /scenes_extraction endpoint.
    """
    result = _get_svision_client().predict(
        video_file={"video": handle_file(video_path)},
        threshold=threshold,
        offset_frames=offset_frames,
        crop_ratio=crop_ratio,
        api_name="/scenes_extraction"
    )
    return result
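
# Usage sketch (hypothetical path and values): the structure of the returned
# object is defined by the remote /scenes_extraction endpoint, so the print is
# illustrative only.
#
#     scenes = extract_scenes("/data/clip.mp4", threshold=27.0, offset_frames=5)
#     print(scenes)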


def keyframes_every_second_extraction(video_path: str):
    """
    Call the /keyframes_every_second_extraction endpoint of the remote Space VeuReu/svision.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Any
        Response returned by the remote /keyframes_every_second_extraction endpoint.
    """
    result = _get_svision_client().predict(
        video_path={"video": handle_file(video_path)},
        api_name="/keyframes_every_second_extraction"
    )
    return result


def add_ocr_and_faces(imagen_path: str, informacion_image: Dict[str, Any], face_col: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Call the /add_ocr_and_faces endpoint of the remote Space VeuReu/svision.

    This function sends an image together with metadata and face collection data
    to perform OCR, face detection, and annotation enhancement.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.
    informacion_image : Dict[str, Any]
        Dictionary containing image-related metadata.
    face_col : List[Dict[str, Any]]
        List of dictionaries representing detected faces or face metadata.

    Returns
    -------
    Dict[str, Any]
        Processed output containing OCR results, face detection data, and annotations.
    """
    informacion_image_str = json.dumps(informacion_image)
    face_col_str = json.dumps(face_col)
    result = _get_svision_client().predict(
        image=handle_file(imagen_path),
        informacion_image=informacion_image_str,
        face_col=face_col_str,
        api_name="/add_ocr_and_faces"
    )
    return result
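
# Usage sketch (hypothetical inputs): both metadata arguments must be
# JSON-serializable, since they are dumped to strings before the call.
#
#     annotated = add_ocr_and_faces(
#         "/data/frame_0001.png",
#         {"scene_id": 3, "timestamp": 12.4},
#         [{"name": "person_1", "embedding": [0.12, 0.34]}],
#     )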


def extract_descripcion_escena(imagen_path: str) -> str:
    """
    Call the /describe_images endpoint of the remote Space VeuReu/svision.

    This function sends an image to receive a textual description of its visual content.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.

    Returns
    -------
    str
        Description generated for the given image.
    """
    result = _get_svision_client().predict(
        images=[{"image": handle_file(imagen_path)}],
        api_name="/describe_images"
    )
    return result


def _extract_path_from_gradio_file(file_obj) -> Optional[str]:
    """Extract file path from Gradio file object (can be dict, str, tuple, or other).

    Gradio Gallery returns different formats depending on version:
    - List of tuples: [(path, caption), ...]
    - List of dicts: [{"name": path, "data": None, "is_file": True}, ...]
    - List of FileData: [FileData(path=..., url=...), ...]
    - List of paths: [path, ...]
    """
    if file_obj is None:
        return None

    if isinstance(file_obj, tuple) and len(file_obj) >= 1:
        return _extract_path_from_gradio_file(file_obj[0])

    if isinstance(file_obj, str):
        return file_obj

    if isinstance(file_obj, dict):
        return file_obj.get("path") or file_obj.get("url") or file_obj.get("name") or file_obj.get("image")

    if hasattr(file_obj, "path") and file_obj.path:
        return file_obj.path
    if hasattr(file_obj, "url") and file_obj.url:
        return file_obj.url
    if hasattr(file_obj, "name") and file_obj.name:
        return file_obj.name

    return str(file_obj) if file_obj else None
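
# Illustrative behaviour of _extract_path_from_gradio_file (made-up values):
#
#     _extract_path_from_gradio_file(("/tmp/a.png", "caption"))   -> "/tmp/a.png"
#     _extract_path_from_gradio_file({"name": "/tmp/b.png"})      -> "/tmp/b.png"
#     _extract_path_from_gradio_file("/tmp/c.png")                -> "/tmp/c.png"
#     _extract_path_from_gradio_file(None)                        -> None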


def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
    """
    Call the /face_image_embedding_casting endpoint to detect faces and get embeddings.

    This replaces local DeepFace/face_recognition processing by delegating to the svision Space.

    Parameters
    ----------
    image_path : str
        Path to the input image file (a video frame).

    Returns
    -------
    List[Dict[str, Any]]
        List of dicts with 'embedding' (list of floats) and 'face_crop_path' (image path string).
        Returns an empty list if no faces are detected or on error.
    """
    try:
        result = _get_svision_client().predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding_casting"
        )

        print(f"[svision_client] Raw result type: {type(result)}, len: {len(result) if result else 0}")

        if result and len(result) >= 2:
            face_crops_raw = result[0] if result[0] else []
            face_embeddings = result[1] if result[1] else []

            print(f"[svision_client] face_crops_raw type: {type(face_crops_raw)}, len: {len(face_crops_raw) if isinstance(face_crops_raw, list) else 'N/A'}")
            if face_crops_raw and len(face_crops_raw) > 0:
                print(f"[svision_client] First crop type: {type(face_crops_raw[0])}, value: {str(face_crops_raw[0])[:200]}")

            faces = []
            for i, emb_dict in enumerate(face_embeddings):
                crop_path = None
                if i < len(face_crops_raw):
                    raw_crop = face_crops_raw[i]
                    crop_path = _extract_path_from_gradio_file(raw_crop)
                    if not crop_path:
                        print(f"[svision_client] Could not extract path from crop {i}: {type(raw_crop)} = {str(raw_crop)[:100]}")

                embedding = emb_dict.get("embedding", []) if isinstance(emb_dict, dict) else []

                faces.append({
                    "embedding": embedding,
                    "face_crop_path": crop_path,
                    "index": emb_dict.get("index", i) if isinstance(emb_dict, dict) else i,
                })

            print(f"[svision_client] Detected {len(faces)} faces from image")
            return faces
        return []
    except Exception as e:
        print(f"[svision_client] get_face_embeddings_from_image error: {e}")
        import traceback
        traceback.print_exc()
        return []
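
# Usage sketch (hypothetical frame path): each returned item carries the face
# embedding, the crop path resolved via _extract_path_from_gradio_file, and the
# face index.
#
#     for face in get_face_embeddings_from_image("/data/frame_0001.png"):
#         print(face["index"], len(face["embedding"]), face["face_crop_path"])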


def get_face_embeddings_simple(image_path: str) -> List[List[float]]:
    """
    Call the /face_image_embedding endpoint to get face embeddings only.

    Parameters
    ----------
    image_path : str
        Path to the input image file.

    Returns
    -------
    List[List[float]]
        List of embedding vectors (one per detected face).
    """
    try:
        result = _get_svision_client().predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding"
        )
        return result if result else []
    except Exception as e:
        print(f"[svision_client] get_face_embeddings_simple error: {e}")
        return []
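

if __name__ == "__main__":
    # Minimal manual smoke test (sketch): the media path comes from the command
    # line; nothing here assumes a particular output format from the Space.
    import sys

    if len(sys.argv) < 2:
        print(f"Usage: python {os.path.basename(__file__)} <image_or_video_path>")
        sys.exit(1)

    media_path = sys.argv[1]
    if media_path.lower().endswith((".mp4", ".mkv", ".avi", ".mov")):
        print(extract_scenes(media_path))
    else:
        print(get_face_embeddings_simple(media_path))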