Spaces:

VeuReu
/

engine

Running

File size: 5,920 Bytes

import os
# Set CUDA_VISIBLE_DEVICES to "1" before the remaining imports run.
# NOTE(review): presumably meant to pin CUDA-aware libraries to GPU index 1;
# the code visible in this module only calls a remote Space, so confirm this
# override is still needed.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from gradio_client import Client, handle_file
from typing import Any, Dict, List
from PIL import Image
import json

# Module-level cache for the ASR client. It stays None until
# _get_asr_client() builds the client on first use, so importing this module
# does not crash if the Space is down at import time.
_asr_client = None


def _get_asr_client():
    """
    Return the shared client for the VeuReu/asr Space.

    The client is constructed on the first call and stored in the
    module-level ``_asr_client`` variable; later calls return that same
    object without constructing a new one.
    """
    global _asr_client
    if _asr_client is not None:
        return _asr_client
    _asr_client = Client("VeuReu/asr")
    return _asr_client


def extract_audio_from_video(video_path: str) -> str:
    """
    Extract the audio track of a video via the remote VeuReu/asr Space.

    The video is wrapped with ``handle_file``, placed in a ``{"video": ...}``
    payload and sent to the ``/extract_audio_ffmpeg`` endpoint.

    Parameters
    ----------
    video_path : str
        Path to the input video file from which audio will be extracted.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged; presumably a path
        or identifier of the extracted audio file (not verifiable from this
        module).
    """
    client = _get_asr_client()
    video_payload = {"video": handle_file(video_path)}
    return client.predict(
        video_file=video_payload,
        api_name="/extract_audio_ffmpeg",
    )


def diarize_audio(audio_path: str) -> Any:
    """
    Run speaker diarization on an audio file via the remote VeuReu/asr Space.

    The file is wrapped with ``handle_file`` and sent to the
    ``/diaritzar_audio`` endpoint; the response is returned unchanged.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to be diarized.

    Returns
    -------
    Any
        The endpoint's raw response. ``extract_audio_and_diarize`` in this
        module consumes it as a two-item sequence ``(clips_paths, segments)``,
        which is why the return type is not annotated as ``str``.
    """
    client = _get_asr_client()
    return client.predict(
        wav_archivo=handle_file(audio_path),
        api_name="/diaritzar_audio",
    )


def transcribe_long_audio(audio_path: str) -> str:
    """
    Transcribe a long recording via the remote VeuReu/asr Space.

    The file is wrapped with ``handle_file`` and sent to the
    ``/transcribe_long_audio`` endpoint, which the original notes describe as
    intended for extended-duration audio.

    Parameters
    ----------
    audio_path : str
        Path to the long audio file to be transcribed.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged; presumably the
        transcribed text.
    """
    client = _get_asr_client()
    transcription = client.predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_long_audio",
    )
    return transcription


def transcribe_short_audio(audio_path: str) -> str:
    """
    Transcribe a short audio sample via the remote VeuReu/asr Space.

    The file is wrapped with ``handle_file`` and sent to the
    ``/transcribe_wav`` endpoint, which the original notes describe as
    intended for short-duration audio.

    Parameters
    ----------
    audio_path : str
        Path to the short audio file to be transcribed.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged; presumably the
        transcribed text.
    """
    client = _get_asr_client()
    transcription = client.predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_wav",
    )
    return transcription


def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
    """
    Identify the speaker of an audio clip via the remote VeuReu/asr Space.

    ``voice_col`` is serialized to a JSON string and sent, together with the
    clip (wrapped with ``handle_file``), to the ``/identificar_veu`` endpoint.

    Parameters
    ----------
    clip_path : str
        Path to the audio clip whose speaker is to be identified.
    voice_col : List[Dict[str, Any]]
        Collection of known-voice records; it must be JSON-serializable
        because it is passed through ``json.dumps``. The expected record
        schema is defined by the remote service and is not visible here.

    Returns
    -------
    Any
        The endpoint's response, passed through unchanged.
    """
    # Serialize first so a non-serializable collection fails before any
    # client is created or file is uploaded.
    serialized_voices = json.dumps(voice_col)
    client = _get_asr_client()
    return client.predict(
        wav_archivo=handle_file(clip_path),
        voice_col=serialized_voices,
        api_name="/identificar_veu",
    )


def get_voice_embedding(audio_path: str) -> List[float]:
    """
    Fetch a voice embedding for an audio file from the VeuReu/asr Space.

    The file is wrapped with ``handle_file`` and sent to the
    ``/voice_embedding`` endpoint. Per the original notes, this delegates to
    the Space what used to be local SpeakerRecognition processing.

    This is a best-effort call: any exception is printed with an
    ``[asr_client]`` prefix and swallowed.

    Parameters
    ----------
    audio_path : str
        Path to the audio file (WAV format preferred).

    Returns
    -------
    List[float]
        The embedding as returned by the endpoint (the original documentation
        describes it as normalized; not verifiable from this module), or an
        empty list if the call fails or the response is falsy.
    """
    try:
        client = _get_asr_client()
        embedding = client.predict(
            wav_archivo=handle_file(audio_path),
            api_name="/voice_embedding",
        )
        # Kept inside the try so a failing truthiness check is also swallowed.
        return embedding or []
    except Exception as exc:
        print(f"[asr_client] get_voice_embedding error: {exc}")
        return []


def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
    """
    Extract the audio of a video and diarize it in one call.

    Chains ``extract_audio_from_video`` and ``diarize_audio``. Any exception
    raised along the way is printed with an ``[asr_client]`` prefix and
    reported through the ``error`` key instead of propagating.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Dict[str, Any]
        Always contains ``clips`` and ``segments`` (empty lists when nothing
        usable was produced). ``audio_path`` is added once audio extraction
        has succeeded. ``error`` is added when extraction returned a falsy
        value or an exception was caught.
    """
    try:
        # Step 1: pull the audio track out of the video.
        extracted = extract_audio_from_video(video_path)
        if not extracted:
            return {"clips": [], "segments": [], "error": "Audio extraction failed"}

        # Step 2: diarize; the response is expected to be (clips_paths, segments).
        diarization = diarize_audio(extracted)
        if not diarization or len(diarization) < 2:
            return {"clips": [], "segments": [], "audio_path": extracted}

        clip_paths, speaker_segments = diarization[0], diarization[1]
        return {
            "clips": clip_paths or [],
            "segments": speaker_segments or [],
            "audio_path": extracted,
        }
    except Exception as exc:
        print(f"[asr_client] extract_audio_and_diarize error: {exc}")
        return {"clips": [], "segments": [], "error": str(exc)}