Spaces:

VeuReu
/

engine

Running

App Files Files Community

engine / svision_client.py

VeuReu

Update svision_client.py

391b4d9 verified 17 days ago

raw

history blame

8.64 kB

	import os
	# Set at import time, before the remaining imports run.
	# NOTE(review): presumably meant to restrict GPU-aware libraries in this
	# process to device index 1 - confirm it is still needed, since the code
	# visible in this module only calls a remote Space.
	os.environ["CUDA_VISIBLE_DEVICES"] = "1"

	from gradio_client import Client, handle_file
	from typing import Any, Dict, List, Optional, Tuple, Union
	import requests
	import json

	# The client is built on first use instead of at import time, so importing
	# this module does not crash when the remote Space is down.
	_svision_client = None


	def _get_svision_client():
	    """Return the shared svision client, creating it on the first call.

	    Returns
	    -------
	    Client
	        The module-level client connected to "VeuReu/svision". The same
	        object is returned on every later call.
	    """
	    global _svision_client
	    if _svision_client is not None:
	        return _svision_client
	    _svision_client = Client("VeuReu/svision")
	    return _svision_client


	def extract_scenes(video_path: str, threshold: float = 240, offset_frames: int = 5, crop_ratio: float = 0.1):
	    """
	    Run scene extraction on a video through the remote Space VeuReu/svision.

	    The video is wrapped with ``handle_file`` and sent, together with the
	    detection settings, to the /scenes_extraction endpoint.

	    Parameters
	    ----------
	    video_path : str
	        Path to the input video file.
	    threshold : float, optional
	        Scene change detection threshold forwarded to the endpoint; higher
	        values make detection less sensitive.
	    offset_frames : int, optional
	        Number of frames to include before and after a detected scene boundary.
	    crop_ratio : float, optional
	        Ratio for cropping borders before performing scene detection.

	    Returns
	    -------
	    Any
	        Whatever the remote /scenes_extraction endpoint returns, unmodified.
	    """
	    client = _get_svision_client()
	    video_payload = {"video": handle_file(video_path)}
	    return client.predict(
	        video_file=video_payload,
	        threshold=threshold,
	        offset_frames=offset_frames,
	        crop_ratio=crop_ratio,
	        api_name="/scenes_extraction",
	    )


	def keyframes_every_second_extraction(video_path: str):
	    """
	    Request keyframe extraction for a video from the remote Space VeuReu/svision.

	    The video is wrapped with ``handle_file`` and sent to the
	    /keyframes_every_second_extraction endpoint.

	    Parameters
	    ----------
	    video_path : str
	        Path to the input video file.

	    Returns
	    -------
	    Any
	        Whatever the remote /keyframes_every_second_extraction endpoint
	        returns, unmodified.
	    """
	    client = _get_svision_client()
	    video_payload = {"video": handle_file(video_path)}
	    return client.predict(
	        video_path=video_payload,
	        api_name="/keyframes_every_second_extraction",
	    )


	def add_ocr_and_faces(imagen_path: str, informacion_image: Dict[str, Any], face_col: List[Dict[str, Any]]) -> Dict[str, Any]:
	    """
	    Send an image plus its metadata to the /add_ocr_and_faces endpoint.

	    Both ``informacion_image`` and ``face_col`` are serialized to JSON strings
	    before being sent, and the image is wrapped with ``handle_file``. The
	    OCR / face processing itself happens in the remote Space VeuReu/svision.

	    Parameters
	    ----------
	    imagen_path : str
	        Path to the input image file.
	    informacion_image : Dict[str, Any]
	        Dictionary containing image-related metadata; must be JSON-serializable.
	    face_col : List[Dict[str, Any]]
	        List of dictionaries representing detected faces or face metadata;
	        must be JSON-serializable.

	    Returns
	    -------
	    Dict[str, Any]
	        Response of the remote endpoint, returned unmodified.
	    """
	    metadata_json = json.dumps(informacion_image)
	    faces_json = json.dumps(face_col)
	    client = _get_svision_client()
	    return client.predict(
	        image=handle_file(imagen_path),
	        informacion_image=metadata_json,
	        face_col=faces_json,
	        api_name="/add_ocr_and_faces",
	    )


	def extract_descripcion_escena(imagen_path: str) -> str:
	    """
	    Ask the remote Space VeuReu/svision to describe an image.

	    The image is wrapped with ``handle_file`` and sent as a one-element list
	    to the /describe_images endpoint.

	    Parameters
	    ----------
	    imagen_path : str
	        Path to the input image file.

	    Returns
	    -------
	    str
	        Response of the /describe_images endpoint, returned unmodified.
	    """
	    client = _get_svision_client()
	    image_batch = [{"image": handle_file(imagen_path)}]
	    return client.predict(
	        images=image_batch,
	        api_name="/describe_images",
	    )


	def _extract_path_from_gradio_file(file_obj) -> Optional[str]:
	    """Extract a file path or URL from a single Gradio file/gallery item.

	    Gradio Gallery returns different formats depending on version:
	    - Tuples (or, after a JSON round trip, lists): (path, caption)
	    - Dicts: {"name": path, "data": None, "is_file": True}, or
	      {"path": ..., "url": ...}, or {"image": <path or nested dict>, ...}
	    - FileData-like objects exposing ``path`` / ``url`` / ``name`` attributes
	    - Plain paths (``str`` or path-like objects)

	    Parameters
	    ----------
	    file_obj : Any
	        One item as returned by the Gradio client.

	    Returns
	    -------
	    Optional[str]
	        The extracted path/URL, or None when nothing usable is found.
	    """
	    if file_obj is None:
	        return None

	    # Plain string path/URL.
	    if isinstance(file_obj, str):
	        return file_obj

	    # Path-like objects (e.g. pathlib.Path): return the full path. Without this
	    # check a Path would reach the attribute probing below and yield only its
	    # basename through ``.name``.
	    if isinstance(file_obj, os.PathLike):
	        return os.fsdecode(file_obj)

	    # (path, caption) pair. JSON has no tuple type, so the same pair may arrive
	    # as a list; an empty sequence carries no path.
	    if isinstance(file_obj, (tuple, list)):
	        return _extract_path_from_gradio_file(file_obj[0]) if file_obj else None

	    # Dict format. Values are resolved recursively because a key such as
	    # "image" may hold a nested file dict instead of a plain string.
	    if isinstance(file_obj, dict):
	        for key in ("path", "url", "name", "image"):
	            value = file_obj.get(key)
	            if value:
	                resolved = _extract_path_from_gradio_file(value)
	                if resolved:
	                    return resolved
	        return None

	    # FileData or similar object: first non-empty attribute wins.
	    for attr in ("path", "url", "name"):
	        value = getattr(file_obj, attr, None)
	        if value:
	            return value

	    # Last resort: convert to string.
	    return str(file_obj) if file_obj else None


	def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
	    """
	    Detect faces and fetch their embeddings via /face_image_embedding_casting.

	    Face processing is delegated to the svision Space instead of running
	    DeepFace/face_recognition locally. The reply is expected to be a pair
	    ``(face crops, embedding dicts)``; the two sequences are zipped by
	    position into one record per embedding.

	    Parameters
	    ----------
	    image_path : str
	        Path to the input image file (a video frame).

	    Returns
	    -------
	    List[Dict[str, Any]]
	        One dict per embedding with keys 'embedding' (taken from the embedding
	        dict, [] if missing), 'face_crop_path' (path string or None when no
	        crop/path is available) and 'index'.
	        Returns an empty list when the reply has fewer than two parts or when
	        any exception occurs (the error and traceback are printed).
	    """
	    try:
	        # Expected reply: (face_crops: list of images/dicts, face_embeddings: list of dicts)
	        response = _get_svision_client().predict(
	            image=handle_file(image_path),
	            api_name="/face_image_embedding_casting",
	        )

	        print(f"[svision_client] Raw result type: {type(response)}, len: {len(response) if response else 0}")

	        # Anything shorter than a pair cannot hold both crops and embeddings.
	        if not response or len(response) < 2:
	            return []

	        crops = response[0] or []
	        embedding_entries = response[1] or []

	        crops_len_text = len(crops) if isinstance(crops, list) else 'N/A'
	        print(f"[svision_client] face_crops_raw type: {type(crops)}, len: {crops_len_text}")

	        crop_count = len(crops)
	        if crop_count > 0:
	            first_crop = crops[0]
	            print(f"[svision_client] First crop type: {type(first_crop)}, value: {str(first_crop)[:200]}")

	        # Pair each embedding with the crop at the same position, if any.
	        faces = []
	        for idx, entry in enumerate(embedding_entries):
	            crop_path = None
	            if idx < crop_count:
	                crop_item = crops[idx]
	                # The crop may be a dict, tuple, string, ... depending on Gradio.
	                crop_path = _extract_path_from_gradio_file(crop_item)
	                if not crop_path:
	                    print(f"[svision_client] Could not extract path from crop {idx}: {type(crop_item)} = {str(crop_item)[:100]}")

	            # Non-dict entries contribute an empty embedding and their position.
	            if isinstance(entry, dict):
	                vector = entry.get("embedding", [])
	                face_index = entry.get("index", idx)
	            else:
	                vector = []
	                face_index = idx

	            faces.append({
	                "embedding": vector,
	                "face_crop_path": crop_path,
	                "index": face_index,
	            })

	        print(f"[svision_client] Detected {len(faces)} faces from image")
	        return faces
	    except Exception as e:
	        # Best effort: report the failure and let the caller continue without faces.
	        print(f"[svision_client] get_face_embeddings_from_image error: {e}")
	        import traceback
	        traceback.print_exc()
	        return []


	def get_face_embeddings_simple(image_path: str) -> List[List[float]]:
	    """
	    Fetch only the face embeddings for an image via /face_image_embedding.

	    Parameters
	    ----------
	    image_path : str
	        Path to the input image file.

	    Returns
	    -------
	    List[List[float]]
	        The endpoint's reply (one embedding vector per detected face), or an
	        empty list when the reply is empty or any exception occurs (the error
	        is printed).
	    """
	    try:
	        embeddings = _get_svision_client().predict(
	            image=handle_file(image_path),
	            api_name="/face_image_embedding",
	        )
	        return embeddings or []
	    except Exception as e:
	        # Best effort: report the failure and fall back to "no embeddings".
	        print(f"[svision_client] get_face_embeddings_simple error: {e}")
	        return []