import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F

try:
    import librosa
    LIBROSA_AVAILABLE = True
except ImportError:
    LIBROSA_AVAILABLE = False
    print("⚠️ Librosa not available, using scipy fallback")

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import io
import time
from typing import Dict, Tuple, Optional
import threading
import queue
from dataclasses import dataclass
from collections import deque

def create_interface():
    """Create the Gradio interface and wire up the audio processing events."""

    with gr.Blocks(title="VAD Demo - Real-time Speech Detection", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🎤 VAD Demo: Real-time Speech Detection Framework

        **Multi-Model Voice Activity Detection with Interactive Visualization**

        This demo showcases 5 different AI models for speech detection, optimized for CPU inference.
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🎛️ **Controls**")

                # Two model dropdowns drive the side-by-side comparison panels
                model_a = gr.Dropdown(
                    choices=list(demo_app.models.keys()),
                    value="Silero-VAD",
                    label="Panel A Model"
                )

                model_b = gr.Dropdown(
                    choices=list(demo_app.models.keys()),
                    value="E-PANNs",
                    label="Panel B Model"
                )

                # Probability cutoff for classifying a frame as speech
                threshold_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.05,
                    label="Detection Threshold"
                )

                status_display = gr.Textbox(
                    label="Status",
                    value="🔇 Ready to detect speech",
                    interactive=False
                )

            with gr.Column(scale=2):
                gr.Markdown("### 🎙️ **Audio Input**")

                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="Microphone Input"
                )

                process_btn = gr.Button("🎯 Process Audio", variant="primary")

                gr.Markdown("### 📊 **Analysis Results**")

                plot_output = gr.Plot(label="VAD Analysis")
                model_details = gr.JSON(label="Model Details")

        # Manual trigger: process the captured audio on button click
        process_btn.click(
            fn=demo_app.process_audio_stream,
            inputs=[audio_input, model_a, model_b, threshold_slider],
            outputs=[plot_output, status_display, model_details]
        )

        # Auto trigger: process as soon as a new recording is available
        audio_input.change(
            fn=demo_app.process_audio_stream,
            inputs=[audio_input, model_a, model_b, threshold_slider],
            outputs=[plot_output, status_display, model_details]
        )
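
        # Both events share one handler: it receives (audio, model_a_name,
        # model_b_name, threshold) and must return a (figure, status, details)
        # tuple matching the three output components wired above.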

        gr.Markdown("""
        ### 🔬 **Research Context**

        This demonstration supports research on privacy-preserving audio datasets and real-time speech analysis.

        Original repository: https://github.com/gbibbo/vad_demo
        """)

    return interface


# Instantiated at module level so the event handlers bound inside
# create_interface() can reference it; VADDemo is assumed to be defined
# earlier in this module.
demo_app = VADDemo()


if __name__ == "__main__":
    interface = create_interface()
    interface.queue(max_size=20)
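    # max_size=20 caps the number of pending events so slow CPU inference is
    # not swamped; once the queue is full, additional requests are rejected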

    interface.launch(
        share=False,
        debug=False,
        show_error=True
    )