import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
try:
    import librosa
    LIBROSA_AVAILABLE = True
except ImportError:
    LIBROSA_AVAILABLE = False
    print("⚠️ Librosa not available, using scipy fallback")
    # a sketch of such a fallback appears after the imports below
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import io
import time
from typing import Dict, Tuple, Optional
import threading
import queue
from dataclasses import dataclass
from collections import deque
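
# Hypothetical helper (an illustrative assumption, not part of the original
# file) sketching what the scipy fallback mentioned in the import guard above
# might look like for resampling:
def _resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample a 1-D signal, preferring librosa, falling back to scipy."""
    if orig_sr == target_sr:
        return audio
    if LIBROSA_AVAILABLE:
        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
    from math import gcd
    from scipy.signal import resample_poly
    # reduce the ratio so the polyphase filter stays small
    g = gcd(target_sr, orig_sr)
    return resample_poly(audio, target_sr // g, orig_sr // g)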
# The rest of the code (model wrappers, the VADDemo class, etc.) is unchanged
# up to create_interface(); only the streaming portion of the interface changes below.
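
# ---------------------------------------------------------------------------
# Placeholder sketch (an assumption, NOT the original implementation): the
# real VADDemo class is elided above. This minimal stand-in only provides the
# two attributes the interface below relies on -- a `models` dict for the
# dropdowns and a `process_audio_stream()` that returns (plotly figure,
# status string, details dict) -- and scores frames by RMS energy instead of
# running the actual models.
# ---------------------------------------------------------------------------
class VADDemo:
    def __init__(self):
        # Only the two models named in the dropdown defaults are listed here;
        # the remaining models of the five live in the elided code.
        self.models = {"Silero-VAD": None, "E-PANNs": None}

    def process_audio_stream(self, audio, model_a, model_b, threshold):
        if audio is None:
            return None, "🔇 Ready to detect speech", {}
        sr, samples = audio  # gr.Audio(type="numpy") yields (sample_rate, array)
        samples = np.asarray(samples, dtype=np.float32)
        if samples.ndim > 1:
            samples = samples.mean(axis=1)  # downmix to mono
        if samples.size == 0:
            return None, "🔇 No audio received", {}
        if np.abs(samples).max() > 1.0:
            samples = samples / 32768.0  # int16 recordings arrive unscaled
        frame = max(1, sr // 50)  # ~20 ms analysis frames
        n = len(samples) // frame
        if n == 0:
            return None, "🔇 Clip too short to analyse", {}
        rms = np.sqrt((samples[: n * frame].reshape(n, frame) ** 2).mean(axis=1))
        cutoff = threshold * float(rms.max())
        speech = rms > cutoff
        t = np.arange(n) * frame / sr
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=t, y=rms, name="Frame RMS"))
        fig.add_hline(y=cutoff, line_dash="dash", annotation_text="threshold")
        fig.update_layout(xaxis_title="Time (s)", yaxis_title="RMS energy")
        status = "🗣️ Speech detected" if speech.any() else "🔇 No speech detected"
        details = {
            "panel_a_model": model_a,
            "panel_b_model": model_b,
            "threshold": threshold,
            "speech_frames": int(speech.sum()),
            "total_frames": n,
        }
        return fig, status, details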
def create_interface():
    """Create the Gradio interface with click-based (non-streaming) event wiring."""
    with gr.Blocks(title="VAD Demo - Real-time Speech Detection", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🎤 VAD Demo: Real-time Speech Detection Framework

        **Multi-Model Voice Activity Detection with Interactive Visualization**

        This demo showcases 5 different AI models for speech detection, optimized for CPU.
        """)
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🎛️ **Controls**")

                model_a = gr.Dropdown(
                    choices=list(demo_app.models.keys()),
                    value="Silero-VAD",
                    label="Panel A Model"
                )
                model_b = gr.Dropdown(
                    choices=list(demo_app.models.keys()),
                    value="E-PANNs",
                    label="Panel B Model"
                )
                threshold_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.05,
                    label="Detection Threshold"
                )
                status_display = gr.Textbox(
                    label="Status",
                    value="🔇 Ready to detect speech",
                    interactive=False
                )

            with gr.Column(scale=2):
                gr.Markdown("### 🎙️ **Audio Input**")

                # Simplified audio input without streaming, for compatibility
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="Microphone Input"
                )
                process_btn = gr.Button("🎯 Process Audio", variant="primary")

                gr.Markdown("### 📊 **Analysis Results**")
                plot_output = gr.Plot(label="VAD Analysis")
                model_details = gr.JSON(label="Model Details")
        # Event handlers: use click instead of streaming for compatibility
        process_btn.click(
            fn=demo_app.process_audio_stream,
            inputs=[audio_input, model_a, model_b, threshold_slider],
            outputs=[plot_output, status_display, model_details]
        )

        # Auto-process whenever a recording finishes
        audio_input.change(
            fn=demo_app.process_audio_stream,
            inputs=[audio_input, model_a, model_b, threshold_slider],
            outputs=[plot_output, status_display, model_details]
        )
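
        # For reference, the streaming wiring this click-based version replaces
        # would look roughly like the sketch below (an assumption based on the
        # Gradio streaming API, not taken verbatim from the elided code); it
        # requires gr.Audio(streaming=True) and proved less portable on HF Spaces:
        #
        #   audio_input.stream(
        #       fn=demo_app.process_audio_stream,
        #       inputs=[audio_input, model_a, model_b, threshold_slider],
        #       outputs=[plot_output, status_display, model_details],
        #   )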
        gr.Markdown("""
        ### 🔬 **Research Context**

        This demonstration supports research in privacy-preserving audio datasets and real-time speech analysis.

        Original: https://github.com/gbibbo/vad_demo
        """)

    return interface
# Initialize demo
demo_app = VADDemo()
# Create and launch interface
if __name__ == "__main__":
    interface = create_interface()
    interface.queue(max_size=20)

    # Simplified launch for HF Spaces compatibility
    interface.launch(
        share=False,  # HF Spaces handles sharing automatically
        debug=False,
        show_error=True
    )