engine / asr_client.py
VeuReu's picture
Upload 3 files
924dc7a verified
raw
history blame
5.92 kB
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from gradio_client import Client, handle_file
from typing import Any, Dict, List
from PIL import Image
import json
# The client is built on first use instead of at import time, so that merely
# importing this module does not crash when the remote Space is down.
_asr_client = None


def _get_asr_client():
    """Return the module-wide ASR client, creating it on the first call."""
    global _asr_client
    if _asr_client is not None:
        return _asr_client
    _asr_client = Client("VeuReu/asr")
    return _asr_client
def extract_audio_from_video(video_path: str) -> str:
    """
    Extract the audio track of a video through the remote VeuReu/asr Space.

    The video is wrapped with ``handle_file`` and sent to the
    ``/extract_audio_ffmpeg`` endpoint as the ``video_file`` argument.

    Parameters
    ----------
    video_path : str
        Path to the video file whose audio should be extracted.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged (expected to be the
        path or identifier of the extracted audio file).
    """
    client = _get_asr_client()
    video_payload = {"video": handle_file(video_path)}
    return client.predict(
        video_file=video_payload,
        api_name="/extract_audio_ffmpeg",
    )
def diarize_audio(audio_path: str) -> str:
    """
    Run speaker diarization on an audio file through the remote VeuReu/asr Space.

    The audio is wrapped with ``handle_file`` and sent to the
    ``/diaritzar_audio`` endpoint as the ``wav_archivo`` argument.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to diarize.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged.
        NOTE(review): ``extract_audio_and_diarize`` in this module indexes the
        result as a two-item sequence ``(clips, segments)``, so the ``str``
        annotation may not match the real return shape - confirm against the
        Space's API.
    """
    client = _get_asr_client()
    audio_payload = handle_file(audio_path)
    return client.predict(
        wav_archivo=audio_payload,
        api_name="/diaritzar_audio",
    )
def transcribe_long_audio(audio_path: str) -> str:
    """
    Transcribe a long recording through the remote VeuReu/asr Space.

    The audio is wrapped with ``handle_file`` and sent to the
    ``/transcribe_long_audio`` endpoint as the ``wav_path`` argument.

    Parameters
    ----------
    audio_path : str
        Path to the long audio file to transcribe.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged (expected to be the
        transcribed text).
    """
    client = _get_asr_client()
    audio_payload = handle_file(audio_path)
    return client.predict(
        wav_path=audio_payload,
        api_name="/transcribe_long_audio",
    )
def transcribe_short_audio(audio_path: str) -> str:
    """
    Transcribe a short audio sample through the remote VeuReu/asr Space.

    The audio is wrapped with ``handle_file`` and sent to the
    ``/transcribe_wav`` endpoint as the ``wav_path`` argument.

    Parameters
    ----------
    audio_path : str
        Path to the short audio file to transcribe.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged (expected to be the
        transcribed text).
    """
    client = _get_asr_client()
    audio_payload = handle_file(audio_path)
    return client.predict(
        wav_path=audio_payload,
        api_name="/transcribe_wav",
    )
def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
    """
    Ask the remote VeuReu/asr Space which known voice speaks in a clip.

    ``voice_col`` is serialized to a JSON string and sent, together with the
    clip wrapped by ``handle_file``, to the ``/identificar_veu`` endpoint.

    Parameters
    ----------
    clip_path : str
        Path to the audio clip whose speaker should be identified.
    voice_col : List[Dict[str, Any]]
        Collection of known-voice records; it must be JSON-serializable
        because it is passed through ``json.dumps``.

    Returns
    -------
    Any
        The endpoint's response, passed through unchanged.
    """
    # Serialize before touching the client so a non-serializable collection
    # fails without creating or contacting the remote client.
    serialized_voices = json.dumps(voice_col)
    client = _get_asr_client()
    clip_payload = handle_file(clip_path)
    return client.predict(
        wav_archivo=clip_payload,
        voice_col=serialized_voices,
        api_name="/identificar_veu",
    )
def get_voice_embedding(audio_path: str) -> List[float]:
    """
    Fetch a voice embedding for an audio file from the remote VeuReu/asr Space.

    The audio is wrapped with ``handle_file`` and sent to the
    ``/voice_embedding`` endpoint as the ``wav_archivo`` argument. This is a
    best-effort call: any exception is reported on stdout and swallowed.

    Parameters
    ----------
    audio_path : str
        Path to the audio file (WAV format preferred).

    Returns
    -------
    List[float]
        The embedding returned by the endpoint, or an empty list when the
        response is falsy or any error occurs.
    """
    try:
        client = _get_asr_client()
        embedding = client.predict(
            wav_archivo=handle_file(audio_path),
            api_name="/voice_embedding",
        )
        # Kept inside the try so a failing truthiness check is also treated
        # as an error and mapped to the empty list.
        return embedding or []
    except Exception as e:
        print(f"[asr_client] get_voice_embedding error: {e}")
        return []
def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
    """
    Extract the audio of a video and diarize it in one call.

    Calls ``extract_audio_from_video`` and then ``diarize_audio`` on the
    extracted audio. This is a best-effort helper: any exception is reported
    on stdout and converted into an error dictionary instead of propagating.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Dict[str, Any]
        Always contains ``'clips'`` and ``'segments'`` (empty lists when
        nothing usable was returned). ``'audio_path'`` is present whenever
        audio extraction succeeded; ``'error'`` is present when extraction
        returned nothing or an exception was raised.
    """
    try:
        # Step 1: pull the audio track out of the video.
        audio_path = extract_audio_from_video(video_path)
        if not audio_path:
            return {"clips": [], "segments": [], "error": "Audio extraction failed"}

        # Step 2: diarize; the expected result is a (clips_paths, segments) pair.
        result = diarize_audio(audio_path)

        # Only unpack a real list/tuple. A bare length check would also accept
        # a string (diarize_audio is annotated as returning str) and hand back
        # its first two characters as "clips" and "segments".
        if isinstance(result, (list, tuple)) and len(result) >= 2:
            clips, segments = result[0], result[1]
            return {
                "clips": clips if clips else [],
                "segments": segments if segments else [],
                "audio_path": audio_path,
            }
        return {"clips": [], "segments": [], "audio_path": audio_path}
    except Exception as e:
        print(f"[asr_client] extract_audio_and_diarize error: {e}")
        return {"clips": [], "segments": [], "error": str(e)}