File size: 4,064 Bytes
eb567a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
try:
    import librosa
    LIBROSA_AVAILABLE = True
except ImportError:
    LIBROSA_AVAILABLE = False
    print("⚠️ Librosa not available, using scipy fallback")

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import io
import time
from typing import Dict, Tuple, Optional
import threading
import queue
from dataclasses import dataclass
from collections import deque

# The rest of the code is unchanged up to create_interface()...
# [All the class definitions go here as-is; only the streaming-related part below was changed]

def create_interface(app=None):
    """Build the Gradio Blocks UI for the VAD demo.

    The UI offers two model-selection dropdowns (panels A and B), a
    detection-threshold slider, a microphone input, and plot/JSON outputs.
    Processing is wired through a button click plus the audio ``change``
    event instead of live streaming, for HF Spaces compatibility.

    Args:
        app: Object exposing a ``models`` dict (model name -> model) and a
            ``process_audio_stream(audio, model_a, model_b, threshold)``
            callable. Defaults to the module-level ``demo_app`` instance,
            so existing zero-argument callers are unaffected.

    Returns:
        The assembled (not yet launched) ``gr.Blocks`` interface.
    """
    if app is None:
        # Backward-compatible late lookup: demo_app is assigned at module
        # level *after* this function is defined, but *before* it is called
        # from the __main__ guard below, so the global exists at call time.
        app = demo_app

    with gr.Blocks(title="VAD Demo - Real-time Speech Detection", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🎤 VAD Demo: Real-time Speech Detection Framework
        
        **Multi-Model Voice Activity Detection with Interactive Visualization**
        
        This demo showcases 5 different AI models for speech detection optimized for CPU.
        """)
        
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🎛️ **Controls**")
                
                model_a = gr.Dropdown(
                    choices=list(app.models.keys()),
                    value="Silero-VAD",
                    label="Panel A Model"
                )
                
                model_b = gr.Dropdown(
                    choices=list(app.models.keys()),
                    value="E-PANNs", 
                    label="Panel B Model"
                )
                
                threshold_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.05,
                    label="Detection Threshold"
                )
                
                status_display = gr.Textbox(
                    label="Status",
                    value="🔇 Ready to detect speech",
                    interactive=False
                )
            
            with gr.Column(scale=2):
                gr.Markdown("### 🎙️ **Audio Input**")
                
                # Simplified audio input without streaming for compatibility
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="Microphone Input"
                )
                
                process_btn = gr.Button("🎯 Process Audio", variant="primary")
                
                gr.Markdown("### 📊 **Analysis Results**")
                
                plot_output = gr.Plot(label="VAD Analysis")
                model_details = gr.JSON(label="Model Details")
        
        # Event handlers — use click instead of streaming for compatibility
        process_btn.click(
            fn=app.process_audio_stream,
            inputs=[audio_input, model_a, model_b, threshold_slider],
            outputs=[plot_output, status_display, model_details]
        )
        
        # Auto-process whenever a new recording is captured, so the user
        # does not have to press the button after recording.
        audio_input.change(
            fn=app.process_audio_stream,
            inputs=[audio_input, model_a, model_b, threshold_slider],
            outputs=[plot_output, status_display, model_details]
        )
        
        gr.Markdown("""
        ### 🔬 **Research Context**
        This demonstration supports research in privacy-preserving audio datasets and real-time speech analysis.
        Original: https://github.com/gbibbo/vad_demo
        """)
    
    return interface

# Create the shared application instance at module level: create_interface()
# reads the global `demo_app` when it is called, so this assignment must run
# at import time, before the launch code below.
# NOTE(review): VADDemo is defined in the elided portion of this file.
demo_app = VADDemo()

# Create and launch interface (only when run as a script, not on import)
if __name__ == "__main__":
    interface = create_interface()
    # Bounded request queue; excess concurrent requests beyond 20 are rejected.
    interface.queue(max_size=20)
    
    # Simplified launch for HF Spaces compatibility
    interface.launch(
        share=False,  # HF Spaces handles public sharing automatically
        debug=False,
        show_error=True  # surface Python errors to the UI for easier debugging
    )