|
|
import os
|
|
|
# Restrict visible GPUs to device 1.  NOTE(review): this must be set before any
# CUDA-using library is imported to take effect, and this module only talks to a
# remote Space — confirm the env var is still needed here.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
|
|
|
|
|
|
from gradio_client import Client, handle_file
|
|
|
from typing import Any, Dict, List
|
|
|
from PIL import Image
|
|
|
import json
|
|
|
|
|
|
|
|
|
# Module-level cache for the gradio_client.Client connected to the VeuReu/asr
# Space; populated lazily by _get_asr_client() so importing this module does
# not open a network connection.
_asr_client = None
|
|
|
|
|
|
|
|
|
def _get_asr_client():
    """Return the shared client for the VeuReu/asr Space, creating it on first use."""
    global _asr_client
    # Fast path: the connection was already established by an earlier call.
    if _asr_client is not None:
        return _asr_client
    _asr_client = Client("VeuReu/asr")
    return _asr_client
|
|
|
|
|
|
|
|
|
def extract_audio_from_video(video_path: str) -> str:
    """
    Extract the audio track of a video via the remote VeuReu/asr Space.

    Uploads the video file and invokes the Space's /extract_audio_ffmpeg
    endpoint, which performs the extraction server-side.

    Parameters
    ----------
    video_path : str
        Local path of the video file whose audio track should be extracted.

    Returns
    -------
    str
        Path or identifier of the extracted audio file, as reported by the
        remote service.
    """
    client = _get_asr_client()
    video_payload = {"video": handle_file(video_path)}
    return client.predict(
        video_file=video_payload,
        api_name="/extract_audio_ffmpeg",
    )
|
|
|
|
|
|
|
|
|
def diarize_audio(audio_path: str) -> str:
    """
    Run speaker diarization on an audio file via the remote VeuReu/asr Space.

    Invokes the Space's /diaritzar_audio endpoint, which partitions the
    recording into segments attributed to individual speakers.

    Parameters
    ----------
    audio_path : str
        Local path of the audio file to diarize.

    Returns
    -------
    str
        Diarization output from the remote service (speaker segments and
        their timings).
    """
    client = _get_asr_client()
    uploaded_audio = handle_file(audio_path)
    return client.predict(
        wav_archivo=uploaded_audio,
        api_name="/diaritzar_audio",
    )
|
|
|
|
|
|
|
|
|
def transcribe_long_audio(audio_path: str) -> str:
    """
    Transcribe a long audio recording via the remote VeuReu/asr Space.

    Invokes the Space's /transcribe_long_audio endpoint, which targets
    extended-duration recordings.

    Parameters
    ----------
    audio_path : str
        Local path of the long audio file to transcribe.

    Returns
    -------
    str
        Transcribed text returned by the remote ASR service.
    """
    client = _get_asr_client()
    uploaded_audio = handle_file(audio_path)
    return client.predict(
        wav_path=uploaded_audio,
        api_name="/transcribe_long_audio",
    )
|
|
|
|
|
|
|
|
|
def transcribe_short_audio(audio_path: str) -> str:
    """
    Transcribe a short audio clip via the remote VeuReu/asr Space.

    Invokes the Space's /transcribe_wav endpoint, the fast path intended
    for short-duration samples.

    Parameters
    ----------
    audio_path : str
        Local path of the short audio file to transcribe.

    Returns
    -------
    str
        Transcribed text returned by the remote service.
    """
    client = _get_asr_client()
    uploaded_audio = handle_file(audio_path)
    return client.predict(
        wav_path=uploaded_audio,
        api_name="/transcribe_wav",
    )
|
|
|
|
|
|
|
|
|
def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
    """
    Identify the speaker in an audio clip via the remote VeuReu/asr Space.

    Invokes the Space's /identificar_veu endpoint, matching the clip against
    a caller-supplied collection of known voice profiles.

    Parameters
    ----------
    clip_path : str
        Local path of the audio clip whose speaker should be identified.
    voice_col : List[Dict[str, Any]]
        Known-voice records (metadata or embeddings); JSON-serialized before
        upload because the endpoint expects a string.

    Returns
    -------
    Any
        Raw output of the remote speaker-identification model.
    """
    client = _get_asr_client()
    serialized_voices = json.dumps(voice_col)
    return client.predict(
        wav_archivo=handle_file(clip_path),
        voice_col=serialized_voices,
        api_name="/identificar_veu",
    )
|
|
|
|
|
|
|
|
|
def get_voice_embedding(audio_path: str) -> List[float]:
    """
    Fetch a voice-embedding vector for an audio file from the VeuReu/asr Space.

    Invokes the Space's /voice_embedding endpoint, delegating speaker-embedding
    computation to the remote service instead of running it locally.

    Parameters
    ----------
    audio_path : str
        Local path of the audio file (WAV preferred, per the remote endpoint).

    Returns
    -------
    List[float]
        Embedding vector for the voice; an empty list on any failure or when
        the service returns a falsy value.
    """
    try:
        embedding = _get_asr_client().predict(
            wav_archivo=handle_file(audio_path),
            api_name="/voice_embedding",
        )
    except Exception as e:
        # Best-effort: callers treat an empty list as "no embedding available".
        print(f"[asr_client] get_voice_embedding error: {e}")
        return []
    return embedding if embedding else []
|
|
|
|
|
|
|
|
|
def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
    """
    Extract a video's audio track and diarize it in a single call.

    Chains extract_audio_from_video() and diarize_audio(); any failure is
    reported through an 'error' key rather than an exception.

    Parameters
    ----------
    video_path : str
        Local path of the input video file.

    Returns
    -------
    Dict[str, Any]
        Dictionary with 'clips' (list of audio file paths) and 'segments'
        (diarization info); includes 'audio_path' on successful extraction
        and 'error' when a step fails.
    """
    try:
        audio_path = extract_audio_from_video(video_path)
        if not audio_path:
            return {"clips": [], "segments": [], "error": "Audio extraction failed"}

        diarization = diarize_audio(audio_path)

        response: Dict[str, Any] = {
            "clips": [],
            "segments": [],
            "audio_path": audio_path,
        }
        # NOTE(review): diarize_audio is annotated -> str but is indexed here
        # like a (clips, segments) pair — confirm the endpoint's actual return.
        if diarization and len(diarization) >= 2:
            response["clips"] = diarization[0] or []
            response["segments"] = diarization[1] or []
        return response
    except Exception as e:
        print(f"[asr_client] extract_audio_and_diarize error: {e}")
        return {"clips": [], "segments": [], "error": str(e)}
|
|
|
|