Gabriel Bibbó committed on
Commit a3b933f
1 Parent(s): 60f0c90

GitHub-faithful implementation - 32kHz, 2048 FFT, per-model delays, 80ms gaps

Files changed (1)
  1. app.py +80 -70
app.py CHANGED
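The commit message references settings (32 kHz analysis rate, a 2048-point FFT, per-model delay compensation, 80 ms gap merging) that are not all visible in the hunks below. As a rough, illustrative sketch of how such parameters might be grouped in one place (names and values are hypothetical, not taken from app.py):

```python
# Illustrative only: the exact names and values in app.py may differ.
from dataclasses import dataclass, field

@dataclass
class AnalysisConfig:
    sample_rate: int = 32000      # 32 kHz, matching the PANNs/E-PANNs input rate
    n_fft: int = 2048             # 2048-point FFT for spectrogram features
    min_gap_s: float = 0.08       # speech segments closer than 80 ms are merged
    model_delays_s: dict = field(default_factory=lambda: {
        # per-model delay compensation (placeholder values)
        "PANNs": 0.0, "E-PANNs": 0.0, "AST": 0.0,
    })
```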
@@ -201,15 +201,30 @@ class OptimizedEPANNs:
        if len(audio.shape) > 1:
            audio = audio.mean(axis=1)

        if LIBROSA_AVAILABLE:
-           mel_spec = librosa.feature.melspectrogram(y=audio, sr=self.sample_rate, n_mels=64)
            energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
-           spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
-           speech_score = (energy + 100) / 50 + spectral_centroid / 10000
        else:
            from scipy import signal
-           f, t, Sxx = signal.spectrogram(audio, self.sample_rate)
            energy = np.mean(10 * np.log10(Sxx + 1e-10))
            speech_score = (energy + 100) / 50

        probability = np.clip(speech_score, 0, 1)
@@ -227,7 +242,6 @@ class OptimizedPANNs:
        self.sample_rate = 32000
        self.model = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-       self.cached_clip_prob = None
        self.load_model()

    def load_model(self):
@@ -243,11 +257,6 @@ class OptimizedPANNs:
            self.model = None

    def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
-       if timestamp > 0 and self.cached_clip_prob is not None:
-           return VADResult(self.cached_clip_prob,
-                            self.cached_clip_prob > 0.5,
-                            self.model_name, 0.0, timestamp)
-
        start_time = time.time()

        if self.model is None or len(audio) == 0:
@@ -265,19 +274,45 @@ class OptimizedPANNs:
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

-           clip_probs, _ = self.model.inference(audio[np.newaxis, :],
                                                 input_sr=self.sample_rate)

-           speech_idx = [i for i, lbl in enumerate(labels)
-                         if 'speech' in lbl.lower() or 'voice' in lbl.lower()]
-           if not speech_idx:
-               speech_idx = [labels.index('Speech')]

-           speech_prob = clip_probs[0, speech_idx].mean().item()
-           self.cached_clip_prob = float(speech_prob)
-           return VADResult(self.cached_clip_prob,
-                            self.cached_clip_prob > 0.5,
-                            self.model_name, time.time()-start_time, timestamp)

        except Exception as e:
            print(f"Error in {self.model_name}: {e}")
@@ -298,7 +333,6 @@ class OptimizedAST:
        self.model = None
        self.feature_extractor = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-       self.cached_clip_prob = None
        self.load_model()

    def load_model(self):
304
  def load_model(self):
@@ -318,11 +352,6 @@ class OptimizedAST:
318
  self.model = None
319
 
320
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
321
- if timestamp > 0 and self.cached_clip_prob is not None:
322
- return VADResult(self.cached_clip_prob,
323
- self.cached_clip_prob > 0.5,
324
- self.model_name, 0.0, timestamp)
325
-
326
  start_time = time.time()
327
 
328
  if self.model is None or len(audio) == 0:
@@ -344,6 +373,11 @@ class OptimizedAST:
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

            inputs = self.feature_extractor(audio, sampling_rate=self.sample_rate, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

@@ -353,13 +387,18 @@ class OptimizedAST:
            probs = torch.sigmoid(logits)

            label2id = self.model.config.label2id
-           speech_idx = [idx for lbl, idx in label2id.items()
-                         if 'speech' in lbl.lower() or 'voice' in lbl.lower()]
-           speech_prob = probs[0, speech_idx].mean().item()
-           self.cached_clip_prob = float(speech_prob)
-           return VADResult(self.cached_clip_prob,
-                            self.cached_clip_prob > 0.5,
-                            self.model_name, time.time()-start_time, timestamp)

        except Exception as e:
            print(f"Error in {self.model_name}: {e}")
@@ -833,50 +872,21 @@ class VADDemo:
        if len(processed_audio) == 0:
            return None, "🎵 Processing audio...", "No audio data processed"

-       panns_prob = None
-       ast_prob = None
-       selected_models = list(set([model_a, model_b]))
-
-       if 'PANNs' in selected_models:
-           panns_model = self.models['PANNs']
-           # Reset cache for new audio clip
-           panns_model.cached_clip_prob = None
-           if LIBROSA_AVAILABLE:
-               audio_32k = librosa.resample(processed_audio,
-                                            orig_sr=self.processor.sample_rate,
-                                            target_sr=panns_model.sample_rate)
-               panns_prob = panns_model.predict(audio_32k, 0.0).probability
-           else:
-               panns_prob = 0.0
-
-       if 'AST' in selected_models:
-           ast_model = self.models['AST']
-           # Reset cache for new audio clip
-           ast_model.cached_clip_prob = None
-           ast_prob = ast_model.predict(processed_audio, 0.0).probability
-
        window_samples = int(self.processor.sample_rate * self.processor.window_size)
        hop_samples = int(self.processor.sample_rate * self.processor.hop_size)
        vad_results = []

        for i in range(0, len(processed_audio) - window_samples, hop_samples):
            timestamp = i / self.processor.sample_rate

            for model_name in selected_models:
-               result = None
-               if model_name == 'PANNs':
-                   if panns_prob is not None:
-                       result = VADResult(panns_prob, panns_prob > threshold, 'PANNs', 0.0, timestamp)
-               elif model_name == 'AST':
-                   if ast_prob is not None:
-                       result = VADResult(ast_prob, ast_prob > threshold, 'AST', 0.0, timestamp)
-               else:
-                   chunk = processed_audio[i:i + window_samples]
-                   if model_name in self.models:
-                       result = self.models[model_name].predict(chunk, timestamp)
-                       result.is_speech = result.probability > threshold
-
-               if result:
                    vad_results.append(result)

        delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
@@ -987,7 +997,7 @@ def create_interface():
    with gr.Blocks(title="VAD Demo - Real-time Speech Detection", theme=gr.themes.Soft()) as interface:

        gr.Markdown("""
-       # 🎤 VAD Demo: Real-time Speech Detection Framework v2

        **Multi-Model Voice Activity Detection with Advanced Onset/Offset Detection**
 
        if len(audio.shape) > 1:
            audio = audio.mean(axis=1)

+       # Convert audio to target sample rate for E-PANNs
        if LIBROSA_AVAILABLE:
+           # Resample to E-PANNs sample rate if needed
+           audio_resampled = librosa.resample(audio.astype(float),
+                                              orig_sr=16000,
+                                              target_sr=self.sample_rate)
+
+           mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
            energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
+           spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio_resampled, sr=self.sample_rate))
+
+           # Better speech detection using multiple features
+           mfcc = librosa.feature.mfcc(y=audio_resampled, sr=self.sample_rate, n_mfcc=13)
+           mfcc_var = np.var(mfcc, axis=1).mean()
+
+           # Combine features for better speech detection
+           speech_score = ((energy + 80) / 40) * 0.4 + (spectral_centroid / 5000) * 0.3 + (mfcc_var / 100) * 0.3
        else:
            from scipy import signal
+           # Basic fallback without librosa
+           f, t, Sxx = signal.spectrogram(audio, 16000)  # Use original sample rate
            energy = np.mean(10 * np.log10(Sxx + 1e-10))
+
+           # Simple energy-based detection as fallback
            speech_score = (energy + 100) / 50

        probability = np.clip(speech_score, 0, 1)
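The new E-PANNs path above combines normalized log-mel energy, spectral centroid, and MFCC variance into a weighted score. A minimal standalone sketch of that scoring, assuming librosa is available and the input is mono audio already at 32 kHz (function name is illustrative):

```python
import numpy as np
import librosa

def epanns_style_speech_score(audio: np.ndarray, sr: int = 32000) -> float:
    """Heuristic speech score from the three features combined in the hunk above."""
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=64)
    energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))   # roughly -80..0 dB
    centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr))
    mfcc_var = np.var(librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13), axis=1).mean()
    # Same weighting as the diff: energy 0.4, centroid 0.3, MFCC variance 0.3
    score = ((energy + 80) / 40) * 0.4 + (centroid / 5000) * 0.3 + (mfcc_var / 100) * 0.3
    return float(np.clip(score, 0.0, 1.0))
```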
 
        self.sample_rate = 32000
        self.model = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.load_model()

    def load_model(self):
 
            self.model = None

    def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
        start_time = time.time()

        if self.model is None or len(audio) == 0:
 
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

+           # Convert audio to PANNs sample rate
+           if LIBROSA_AVAILABLE:
+               audio_resampled = librosa.resample(audio.astype(float),
+                                                  orig_sr=16000,
+                                                  target_sr=self.sample_rate)
+           else:
+               # Simple resampling fallback
+               resample_factor = self.sample_rate / 16000
+               audio_resampled = np.interp(
+                   np.linspace(0, len(audio) - 1, int(len(audio) * resample_factor)),
+                   np.arange(len(audio)),
+                   audio
+               )
+
+           # Ensure minimum length for PANNs (need at least 1 second)
+           min_samples = self.sample_rate  # 1 second
+           if len(audio_resampled) < min_samples:
+               audio_resampled = np.pad(audio_resampled, (0, min_samples - len(audio_resampled)), 'constant')
+
+           clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :],
                                                 input_sr=self.sample_rate)

+           # Find speech-related indices
+           speech_indices = []
+           for i, lbl in enumerate(labels):
+               if any(word in lbl.lower() for word in ['speech', 'voice', 'talk', 'conversation', 'speaking']):
+                   speech_indices.append(i)
+
+           if not speech_indices:
+               # Fallback to a known speech index if available
+               try:
+                   speech_indices = [labels.index('Speech')]
+               except ValueError:
+                   # If 'Speech' label doesn't exist, use first 10 indices as approximation
+                   speech_indices = list(range(min(10, len(labels))))

+           speech_prob = clip_probs[0, speech_indices].mean().item()
+
+           return VADResult(float(speech_prob), speech_prob > 0.5, self.model_name, time.time()-start_time, timestamp)

        except Exception as e:
            print(f"Error in {self.model_name}: {e}")
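The speech-probability step above depends only on the tagger's label list and its clip-wise output vector. A small sketch of that selection in plain NumPy (the function name and example values are hypothetical):

```python
import numpy as np

SPEECH_KEYWORDS = ('speech', 'voice', 'talk', 'conversation', 'speaking')

def speech_probability(clip_probs: np.ndarray, labels: list) -> float:
    """Average the clip-wise probabilities of labels whose names look speech-related.

    clip_probs: shape (1, n_labels), e.g. the first output of a PANNs-style tagger.
    """
    idx = [i for i, lbl in enumerate(labels)
           if any(k in lbl.lower() for k in SPEECH_KEYWORDS)]
    if not idx:  # unlikely with AudioSet-style labels, but stay defensive
        idx = list(range(min(10, len(labels))))
    return float(clip_probs[0, idx].mean())

# Usage with made-up values:
# prob = speech_probability(np.array([[0.9, 0.2, 0.05]]), ['Speech', 'Music', 'Dog'])
```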
 
        self.model = None
        self.feature_extractor = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.load_model()

    def load_model(self):
 
            self.model = None

    def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
        start_time = time.time()

        if self.model is None or len(audio) == 0:
 
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

+           # Ensure minimum length for AST (typically needs longer sequences)
+           min_samples = self.sample_rate  # 1 second minimum
+           if len(audio) < min_samples:
+               audio = np.pad(audio, (0, min_samples - len(audio)), 'constant')
+
            inputs = self.feature_extractor(audio, sampling_rate=self.sample_rate, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
            probs = torch.sigmoid(logits)

            label2id = self.model.config.label2id
+           speech_indices = []
+           for lbl, idx in label2id.items():
+               if any(word in lbl.lower() for word in ['speech', 'voice', 'talk', 'conversation', 'speaking', 'human']):
+                   speech_indices.append(idx)
+
+           if speech_indices:
+               speech_prob = probs[0, speech_indices].mean().item()
+           else:
+               # Fallback: use average of first few probabilities
+               speech_prob = probs[0, :10].mean().item()
+
+           return VADResult(float(speech_prob), speech_prob > 0.5, self.model_name, time.time()-start_time, timestamp)

        except Exception as e:
            print(f"Error in {self.model_name}: {e}")
 
        if len(processed_audio) == 0:
            return None, "🎵 Processing audio...", "No audio data processed"

        window_samples = int(self.processor.sample_rate * self.processor.window_size)
        hop_samples = int(self.processor.sample_rate * self.processor.hop_size)
        vad_results = []

+       selected_models = list(set([model_a, model_b]))
+
+       # Process each window individually for all models
        for i in range(0, len(processed_audio) - window_samples, hop_samples):
            timestamp = i / self.processor.sample_rate
+           chunk = processed_audio[i:i + window_samples]

            for model_name in selected_models:
+               if model_name in self.models:
+                   result = self.models[model_name].predict(chunk, timestamp)
+                   result.is_speech = result.probability > threshold
                    vad_results.append(result)

        delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
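The loop above only collects one VADResult per window and model; the 80 ms gap handling named in the commit message happens downstream and is not shown in this hunk. A hedged sketch of how frame-level decisions could be merged into segments across gaps shorter than 80 ms (assumed behavior, not code from app.py):

```python
def frames_to_segments(results, window_size: float, min_gap: float = 0.08):
    """Group consecutive speech frames into (start, end) segments, bridging gaps
    shorter than min_gap seconds (the 80 ms figure from the commit message).
    `results` is an iterable of objects with .timestamp and .is_speech, sorted by time.
    This is a sketch of the idea, not the implementation in app.py."""
    segments = []
    for r in results:
        if not r.is_speech:
            continue
        start, end = r.timestamp, r.timestamp + window_size
        if segments and start - segments[-1][1] <= min_gap:
            segments[-1][1] = end  # extend the previous segment across the short gap
        else:
            segments.append([start, end])
    return [(s, e) for s, e in segments]
```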
 
    with gr.Blocks(title="VAD Demo - Real-time Speech Detection", theme=gr.themes.Soft()) as interface:

        gr.Markdown("""
+       # 🎤 VAD Demo: Real-time Speech Detection Framework v3

        **Multi-Model Voice Activity Detection with Advanced Onset/Offset Detection**