Gabriel Bibbó
committed · Commit a3b933f
1 Parent(s): 60f0c90
GitHub-faithful implementation - 32kHz, 2048 FFT, per-model delays, 80ms gaps

app.py CHANGED
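The commit message pins the analysis settings to the upstream GitHub code: a 32 kHz sample rate, a 2048-point FFT, per-model delay compensation (estimated in the app via estimate_delay_compensation), and merging of detections separated by less than 80 ms. A minimal sketch of the 80 ms gap rule, assuming (start, end) segments in seconds; the helper name and the example segments are illustrative, not code from this commit:

# Illustrative only: the 80 ms threshold comes from the commit message,
# everything else here is a hypothetical reconstruction.
def merge_close_segments(segments, min_gap_s=0.080):
    """Merge speech segments whose silent gap is shorter than min_gap_s."""
    merged = []
    for start, end in sorted(segments):
        if merged and start - merged[-1][1] < min_gap_s:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

print(merge_close_segments([(0.0, 1.00), (1.05, 2.0), (3.0, 4.0)]))  # [(0.0, 2.0), (3.0, 4.0)]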
Old version of the changed hunks (lines removed by this commit are marked "-"):

@@ -201,15 +201,30 @@ class OptimizedEPANNs:
 201           if len(audio.shape) > 1:
 202               audio = audio.mean(axis=1)
 203
 204           if LIBROSA_AVAILABLE:
 205 -
 206               energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
 207 -             spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=
 208 -
 209           else:
 210               from scipy import signal
 211 -
 212               energy = np.mean(10 * np.log10(Sxx + 1e-10))
 213               speech_score = (energy + 100) / 50
 214
 215           probability = np.clip(speech_score, 0, 1)

@@ -227,7 +242,6 @@ class OptimizedPANNs:
 227           self.sample_rate = 32000
 228           self.model = None
 229           self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 230 -         self.cached_clip_prob = None
 231           self.load_model()
 232
 233       def load_model(self):

@@ -243,11 +257,6 @@ class OptimizedPANNs:
 243           self.model = None
 244
 245       def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
 246 -         if timestamp > 0 and self.cached_clip_prob is not None:
 247 -             return VADResult(self.cached_clip_prob,
 248 -                              self.cached_clip_prob > 0.5,
 249 -                              self.model_name, 0.0, timestamp)
 250 -
 251           start_time = time.time()
 252
 253           if self.model is None or len(audio) == 0:

@@ -265,19 +274,45 @@ class OptimizedPANNs:
 265           if len(audio.shape) > 1:
 266               audio = audio.mean(axis=1)
 267
 268 -
 269                                                    input_sr=self.sample_rate)
 270
 271 -
 272 -
 273 -
 274 -
 275
 276 -         speech_prob = clip_probs[0,
 277 -
 278 -         return VADResult(self.
 279 -                          self.cached_clip_prob > 0.5,
 280 -                          self.model_name, time.time()-start_time, timestamp)
 281
 282       except Exception as e:
 283           print(f"Error in {self.model_name}: {e}")

@@ -298,7 +333,6 @@ class OptimizedAST:
 298           self.model = None
 299           self.feature_extractor = None
 300           self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 301 -         self.cached_clip_prob = None
 302           self.load_model()
 303
 304       def load_model(self):

@@ -318,11 +352,6 @@ class OptimizedAST:
 318           self.model = None
 319
 320       def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
 321 -         if timestamp > 0 and self.cached_clip_prob is not None:
 322 -             return VADResult(self.cached_clip_prob,
 323 -                              self.cached_clip_prob > 0.5,
 324 -                              self.model_name, 0.0, timestamp)
 325 -
 326           start_time = time.time()
 327
 328           if self.model is None or len(audio) == 0:

@@ -344,6 +373,11 @@ class OptimizedAST:
 344           if len(audio.shape) > 1:
 345               audio = audio.mean(axis=1)
 346
 347           inputs = self.feature_extractor(audio, sampling_rate=self.sample_rate, return_tensors="pt")
 348           inputs = {k: v.to(self.device) for k, v in inputs.items()}
 349

@@ -353,13 +387,18 @@ class OptimizedAST:
 353           probs = torch.sigmoid(logits)
 354
 355           label2id = self.model.config.label2id
 356 -
 357 -
 358 -
 359 -
 360 -
 361 -
 362 -
 363
 364       except Exception as e:
 365           print(f"Error in {self.model_name}: {e}")

@@ -833,50 +872,21 @@ class VADDemo:
 833           if len(processed_audio) == 0:
 834               return None, "🎵 Processing audio...", "No audio data processed"
 835
 836 -         panns_prob = None
 837 -         ast_prob = None
 838 -         selected_models = list(set([model_a, model_b]))
 839 -
 840 -         if 'PANNs' in selected_models:
 841 -             panns_model = self.models['PANNs']
 842 -             # Reset cache for new audio clip
 843 -             panns_model.cached_clip_prob = None
 844 -             if LIBROSA_AVAILABLE:
 845 -                 audio_32k = librosa.resample(processed_audio,
 846 -                                              orig_sr=self.processor.sample_rate,
 847 -                                              target_sr=panns_model.sample_rate)
 848 -                 panns_prob = panns_model.predict(audio_32k, 0.0).probability
 849 -             else:
 850 -                 panns_prob = 0.0
 851 -
 852 -         if 'AST' in selected_models:
 853 -             ast_model = self.models['AST']
 854 -             # Reset cache for new audio clip
 855 -             ast_model.cached_clip_prob = None
 856 -             ast_prob = ast_model.predict(processed_audio, 0.0).probability
 857 -
 858           window_samples = int(self.processor.sample_rate * self.processor.window_size)
 859           hop_samples = int(self.processor.sample_rate * self.processor.hop_size)
 860           vad_results = []
 861
 862           for i in range(0, len(processed_audio) - window_samples, hop_samples):
 863               timestamp = i / self.processor.sample_rate
 864
 865               for model_name in selected_models:
 866 -
 867 -
 868 -
 869 -                     result = VADResult(panns_prob, panns_prob > threshold, 'PANNs', 0.0, timestamp)
 870 -                 elif model_name == 'AST':
 871 -                     if ast_prob is not None:
 872 -                         result = VADResult(ast_prob, ast_prob > threshold, 'AST', 0.0, timestamp)
 873 -                 else:
 874 -                     chunk = processed_audio[i:i + window_samples]
 875 -                     if model_name in self.models:
 876 -                         result = self.models[model_name].predict(chunk, timestamp)
 877 -                         result.is_speech = result.probability > threshold
 878 -
 879 -                 if result:
 880                       vad_results.append(result)
 881
 882           delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)

@@ -987,7 +997,7 @@ def create_interface():
 987       with gr.Blocks(title="VAD Demo - Real-time Speech Detection", theme=gr.themes.Soft()) as interface:
 988
 989           gr.Markdown("""
 990 -         # 🎤 VAD Demo: Real-time Speech Detection Framework
 991
 992           **Multi-Model Voice Activity Detection with Advanced Onset/Offset Detection**
 993
New version of the changed hunks (lines added by this commit are marked "+"):

@@ -201,15 +201,30 @@ class OptimizedEPANNs:
 201           if len(audio.shape) > 1:
 202               audio = audio.mean(axis=1)
 203
 204 +         # Convert audio to target sample rate for E-PANNs
 205           if LIBROSA_AVAILABLE:
 206 +             # Resample to E-PANNs sample rate if needed
 207 +             audio_resampled = librosa.resample(audio.astype(float),
 208 +                                                orig_sr=16000,
 209 +                                                target_sr=self.sample_rate)
 210 +
 211 +             mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
 212               energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
 213 +             spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio_resampled, sr=self.sample_rate))
 214 +
 215 +             # Better speech detection using multiple features
 216 +             mfcc = librosa.feature.mfcc(y=audio_resampled, sr=self.sample_rate, n_mfcc=13)
 217 +             mfcc_var = np.var(mfcc, axis=1).mean()
 218 +
 219 +             # Combine features for better speech detection
 220 +             speech_score = ((energy + 80) / 40) * 0.4 + (spectral_centroid / 5000) * 0.3 + (mfcc_var / 100) * 0.3
 221           else:
 222               from scipy import signal
 223 +             # Basic fallback without librosa
 224 +             f, t, Sxx = signal.spectrogram(audio, 16000)  # Use original sample rate
 225               energy = np.mean(10 * np.log10(Sxx + 1e-10))
 226 +
 227 +             # Simple energy-based detection as fallback
 228               speech_score = (energy + 100) / 50
 229
 230           probability = np.clip(speech_score, 0, 1)
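The librosa branch above folds three features into one weighted score before clipping it to a probability: log-mel energy, spectral centroid and MFCC variance. A small sketch of just that weighting, applied to hypothetical feature values (the function name and the numbers in the example are not from the diff):

import numpy as np

def heuristic_speech_score(energy_db, centroid_hz, mfcc_var):
    # Same weighting as new lines 220/230 above, isolated for clarity.
    score = ((energy_db + 80) / 40) * 0.4 + (centroid_hz / 5000) * 0.3 + (mfcc_var / 100) * 0.3
    return float(np.clip(score, 0.0, 1.0))

print(heuristic_speech_score(energy_db=-30, centroid_hz=2000, mfcc_var=60))  # ~0.80
print(heuristic_speech_score(energy_db=-75, centroid_hz=300, mfcc_var=5))    # ~0.08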
@@ -227,7 +242,6 @@ class OptimizedPANNs:
 242           self.sample_rate = 32000
 243           self.model = None
 244           self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 245           self.load_model()
 246
 247       def load_model(self):

@@ -243,11 +257,6 @@ class OptimizedPANNs:
 257           self.model = None
 258
 259       def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
 260           start_time = time.time()
 261
 262           if self.model is None or len(audio) == 0:

@@ -265,19 +274,45 @@ class OptimizedPANNs:
 274           if len(audio.shape) > 1:
 275               audio = audio.mean(axis=1)
 276
 277 +         # Convert audio to PANNs sample rate
 278 +         if LIBROSA_AVAILABLE:
 279 +             audio_resampled = librosa.resample(audio.astype(float),
 280 +                                                orig_sr=16000,
 281 +                                                target_sr=self.sample_rate)
 282 +         else:
 283 +             # Simple resampling fallback
 284 +             resample_factor = self.sample_rate / 16000
 285 +             audio_resampled = np.interp(
 286 +                 np.linspace(0, len(audio) - 1, int(len(audio) * resample_factor)),
 287 +                 np.arange(len(audio)),
 288 +                 audio
 289 +             )
 290 +
 291 +         # Ensure minimum length for PANNs (need at least 1 second)
 292 +         min_samples = self.sample_rate  # 1 second
 293 +         if len(audio_resampled) < min_samples:
 294 +             audio_resampled = np.pad(audio_resampled, (0, min_samples - len(audio_resampled)), 'constant')
 295 +
 296 +         clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :],
 297                                                input_sr=self.sample_rate)
 298
 299 +         # Find speech-related indices
 300 +         speech_indices = []
 301 +         for i, lbl in enumerate(labels):
 302 +             if any(word in lbl.lower() for word in ['speech', 'voice', 'talk', 'conversation', 'speaking']):
 303 +                 speech_indices.append(i)
 304 +
 305 +         if not speech_indices:
 306 +             # Fallback to a known speech index if available
 307 +             try:
 308 +                 speech_indices = [labels.index('Speech')]
 309 +             except ValueError:
 310 +                 # If 'Speech' label doesn't exist, use first 10 indices as approximation
 311 +                 speech_indices = list(range(min(10, len(labels))))
 312
 313 +         speech_prob = clip_probs[0, speech_indices].mean().item()
 314 +
 315 +         return VADResult(float(speech_prob), speech_prob > 0.5, self.model_name, time.time()-start_time, timestamp)
 316
 317       except Exception as e:
 318           print(f"Error in {self.model_name}: {e}")
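When librosa is missing, the PANNs path above falls back to linear interpolation with np.interp to reach the 32 kHz model rate. A self-contained sketch of that fallback (the helper name and the test tone are illustrative; plain linear interpolation performs no anti-alias filtering, which is tolerable here because the signal is only upsampled):

import numpy as np

def resample_linear(audio, orig_sr, target_sr):
    # Mirrors the np.interp fallback in the hunk above.
    new_len = int(len(audio) * target_sr / orig_sr)
    return np.interp(np.linspace(0, len(audio) - 1, new_len),
                     np.arange(len(audio)),
                     audio)

# Example: 0.5 s of a 440 Hz tone at 16 kHz, upsampled to the 32 kHz PANNs rate.
tone = np.sin(2 * np.pi * 440 * np.arange(8000) / 16000)
print(resample_linear(tone, 16000, 32000).shape)  # (16000,)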
@@ -298,7 +333,6 @@ class OptimizedAST:
 333           self.model = None
 334           self.feature_extractor = None
 335           self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 336           self.load_model()
 337
 338       def load_model(self):

@@ -318,11 +352,6 @@ class OptimizedAST:
 352           self.model = None
 353
 354       def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
 355           start_time = time.time()
 356
 357           if self.model is None or len(audio) == 0:

@@ -344,6 +373,11 @@ class OptimizedAST:
 373           if len(audio.shape) > 1:
 374               audio = audio.mean(axis=1)
 375
 376 +         # Ensure minimum length for AST (typically needs longer sequences)
 377 +         min_samples = self.sample_rate  # 1 second minimum
 378 +         if len(audio) < min_samples:
 379 +             audio = np.pad(audio, (0, min_samples - len(audio)), 'constant')
 380 +
 381           inputs = self.feature_extractor(audio, sampling_rate=self.sample_rate, return_tensors="pt")
 382           inputs = {k: v.to(self.device) for k, v in inputs.items()}
 383
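Both inference paths now guarantee a minimum one-second input: PANNs pads the resampled clip and AST pads the raw clip before feature extraction. A tiny sketch of that padding step (helper name and lengths in the example are illustrative):

import numpy as np

def pad_to_min_length(audio, min_samples):
    # Zero-pad short clips, as the PANNs and AST branches above do before inference.
    if len(audio) < min_samples:
        audio = np.pad(audio, (0, min_samples - len(audio)), 'constant')
    return audio

print(pad_to_min_length(np.ones(3200), 16000).shape)  # (16000,)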
@@ -353,13 +387,18 @@ class OptimizedAST:
 387           probs = torch.sigmoid(logits)
 388
 389           label2id = self.model.config.label2id
 390 +         speech_indices = []
 391 +         for lbl, idx in label2id.items():
 392 +             if any(word in lbl.lower() for word in ['speech', 'voice', 'talk', 'conversation', 'speaking', 'human']):
 393 +                 speech_indices.append(idx)
 394 +
 395 +         if speech_indices:
 396 +             speech_prob = probs[0, speech_indices].mean().item()
 397 +         else:
 398 +             # Fallback: use average of first few probabilities
 399 +             speech_prob = probs[0, :10].mean().item()
 400 +
 401 +         return VADResult(float(speech_prob), speech_prob > 0.5, self.model_name, time.time()-start_time, timestamp)
 402
 403       except Exception as e:
 404           print(f"Error in {self.model_name}: {e}")
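The AST branch above derives a speech probability by averaging the sigmoid outputs of every class whose label looks speech-related, with a fallback to the first few classes. A toy sketch of that aggregation (the four-class label map below is hypothetical, not AudioSet's real label set):

import torch

def speech_probability(probs, label2id, keywords=('speech', 'voice', 'talk')):
    # Average the classes whose label contains a speech-related keyword.
    idx = [i for lbl, i in label2id.items() if any(k in lbl.lower() for k in keywords)]
    if not idx:  # same spirit as the diff's fallback: first few classes
        idx = list(range(min(10, probs.shape[-1])))
    return probs[0, idx].mean().item()

probs = torch.tensor([[0.9, 0.2, 0.7, 0.1]])
label2id = {"Speech": 0, "Music": 1, "Male voice": 2, "Silence": 3}
print(speech_probability(probs, label2id))  # (0.9 + 0.7) / 2 ≈ 0.8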
@@ -833,50 +872,21 @@ class VADDemo:
 872           if len(processed_audio) == 0:
 873               return None, "🎵 Processing audio...", "No audio data processed"
 874
 875           window_samples = int(self.processor.sample_rate * self.processor.window_size)
 876           hop_samples = int(self.processor.sample_rate * self.processor.hop_size)
 877           vad_results = []
 878
 879 +         selected_models = list(set([model_a, model_b]))
 880 +
 881 +         # Process each window individually for all models
 882           for i in range(0, len(processed_audio) - window_samples, hop_samples):
 883               timestamp = i / self.processor.sample_rate
 884 +             chunk = processed_audio[i:i + window_samples]
 885
 886               for model_name in selected_models:
 887 +                 if model_name in self.models:
 888 +                     result = self.models[model_name].predict(chunk, timestamp)
 889 +                     result.is_speech = result.probability > threshold
 890                       vad_results.append(result)
 891
 892           delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
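With the clip-level cache removed, every selected model is now run on each analysis window. A minimal sketch of that windowing loop (the 1.0 s window and 0.5 s hop are assumed defaults; the app reads them from its processor settings):

import numpy as np

def iter_windows(audio, sample_rate, window_size=1.0, hop_size=0.5):
    # Yield (timestamp, chunk) pairs the way the loop above walks the clip.
    window_samples = int(sample_rate * window_size)
    hop_samples = int(sample_rate * hop_size)
    for i in range(0, len(audio) - window_samples, hop_samples):
        yield i / sample_rate, audio[i:i + window_samples]

audio = np.zeros(16000 * 4)  # 4 s of silence at 16 kHz
print([t for t, _ in iter_windows(audio, 16000)])  # [0.0, 0.5, 1.0, 1.5, 2.0, 2.5]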
@@ -987,7 +997,7 @@ def create_interface():
 997       with gr.Blocks(title="VAD Demo - Real-time Speech Detection", theme=gr.themes.Soft()) as interface:
 998
 999           gr.Markdown("""
 1000 +        # 🎤 VAD Demo: Real-time Speech Detection Framework v3
 1001
 1002          **Multi-Model Voice Activity Detection with Advanced Onset/Offset Detection**
 1003