File size: 5,920 Bytes
c8c329a 924dc7a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 |
import os
# Set before the remaining imports run, so any CUDA-aware library loaded
# afterwards only sees GPU index 1.
# NOTE(review): the functions visible in this module delegate all work to a
# remote Space — confirm this override is still needed here.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from gradio_client import Client, handle_file
from typing import Any, Dict, List
from PIL import Image
import json
# The client is built on first use instead of at import time, so importing
# this module does not crash when the Space is down.
_asr_client = None


def _get_asr_client():
    """Return the shared ASR client, constructing it on the first call."""
    global _asr_client
    if _asr_client is not None:
        return _asr_client
    _asr_client = Client("VeuReu/asr")
    return _asr_client
def extract_audio_from_video(video_path: str) -> str:
    """
    Extract the audio track of a video through the remote VeuReu/asr Space.

    The video is wrapped with ``handle_file`` and submitted to the
    ``/extract_audio_ffmpeg`` endpoint.

    Parameters
    ----------
    video_path : str
        Path to the video file whose audio should be extracted.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged (presumably a path
        or identifier of the extracted audio file — confirm against the
        Space's API).
    """
    client = _get_asr_client()
    return client.predict(
        video_file={"video": handle_file(video_path)},
        api_name="/extract_audio_ffmpeg",
    )
def diarize_audio(audio_path: str) -> str:
    """
    Run speaker diarization on an audio file via the remote VeuReu/asr Space.

    The audio is wrapped with ``handle_file`` and submitted to the
    ``/diaritzar_audio`` endpoint.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to diarize.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged.
        NOTE(review): ``extract_audio_and_diarize`` in this file indexes the
        result as a ``(clips, segments)`` pair, which does not match the
        ``str`` annotation — confirm the endpoint's actual return shape.
    """
    client = _get_asr_client()
    return client.predict(
        wav_archivo=handle_file(audio_path),
        api_name="/diaritzar_audio",
    )
def transcribe_long_audio(audio_path: str) -> str:
    """
    Transcribe a long recording via the remote VeuReu/asr Space.

    The audio is wrapped with ``handle_file`` and submitted to the
    ``/transcribe_long_audio`` endpoint, which is the one intended for
    extended-duration audio.

    Parameters
    ----------
    audio_path : str
        Path to the long audio file to transcribe.

    Returns
    -------
    str
        The endpoint's response (the transcription), passed through
        unchanged.
    """
    client = _get_asr_client()
    return client.predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_long_audio",
    )
def transcribe_short_audio(audio_path: str) -> str:
    """
    Transcribe a short audio sample via the remote VeuReu/asr Space.

    The audio is wrapped with ``handle_file`` and submitted to the
    ``/transcribe_wav`` endpoint, which is the one intended for
    short-duration audio.

    Parameters
    ----------
    audio_path : str
        Path to the short audio file to transcribe.

    Returns
    -------
    str
        The endpoint's response (the transcription), passed through
        unchanged.
    """
    client = _get_asr_client()
    return client.predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_wav",
    )
def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
    """
    Identify the speaker of an audio clip via the remote VeuReu/asr Space.

    The known-voice collection is serialized to a JSON string and sent,
    together with the clip, to the ``/identificar_veu`` endpoint.

    Parameters
    ----------
    clip_path : str
        Path to the audio clip whose speaker should be identified.
    voice_col : List[Dict[str, Any]]
        Known voice entries to match against; must be JSON-serializable.

    Returns
    -------
    Any
        The endpoint's response, passed through unchanged.
    """
    # Serialize before touching the client so an unserializable collection
    # fails without creating a connection.
    serialized_voices = json.dumps(voice_col)
    client = _get_asr_client()
    return client.predict(
        wav_archivo=handle_file(clip_path),
        voice_col=serialized_voices,
        api_name="/identificar_veu",
    )
def get_voice_embedding(audio_path: str) -> List[float]:
    """
    Fetch a voice embedding vector from the ``/voice_embedding`` endpoint.

    The embedding is computed by the remote asr Space rather than locally.
    This call is best-effort: any exception is printed and converted into
    an empty list.

    Parameters
    ----------
    audio_path : str
        Path to the audio file (WAV format preferred).

    Returns
    -------
    List[float]
        The endpoint's response when it is truthy; otherwise an empty list
        (also returned on any error).
    """
    try:
        embedding = _get_asr_client().predict(
            wav_archivo=handle_file(audio_path),
            api_name="/voice_embedding",
        )
        # Kept inside the try so a failing truthiness check is also reported
        # as an empty result.
        if not embedding:
            return []
        return embedding
    except Exception as e:
        print(f"[asr_client] get_voice_embedding error: {e}")
        return []
def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
    """
    Extract the audio track from a video and diarize it in one call.

    Chains ``extract_audio_from_video`` and ``diarize_audio``. This function
    never raises: every failure is printed and reported through the
    returned dictionary.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Dict[str, Any]
        Always contains 'clips' and 'segments' (empty lists when nothing
        usable was produced). 'audio_path' is included once audio extraction
        has succeeded. 'error' is included when extraction returned nothing
        or an exception was raised.
    """
    try:
        audio_path = extract_audio_from_video(video_path)
        if not audio_path:
            return {"clips": [], "segments": [], "error": "Audio extraction failed"}

        result = diarize_audio(audio_path)
        # Only a real sequence is unpacked as (clips, segments). A bare
        # string also has a length and supports indexing, so without this
        # type check a textual response would be split into single
        # characters and returned as clips/segments.
        if isinstance(result, (list, tuple)) and len(result) >= 2:
            clips, segments = result[0], result[1]
            return {
                "clips": clips if clips else [],
                "segments": segments if segments else [],
                "audio_path": audio_path,
            }
        return {"clips": [], "segments": [], "audio_path": audio_path}
    except Exception as e:
        print(f"[asr_client] extract_audio_and_diarize error: {e}")
        return {"clips": [], "segments": [], "error": str(e)}
|