Spaces:

hugofara
/

wavlm-phonemizer-word-detection

Sleeping

App Files Files Community

Hugo Farajallah committed on Sep 22

Commit

0e90d9f

1 Parent(s): a9d4833

refactor(code): apply DRY and SRP principles to the code.

Browse files

Files changed (3) hide show

common.py +76 -0
hf_space.py +13 -53
main.py +10 -9

common.py CHANGED Viewed

@@ -1,6 +1,11 @@
 import transformers
 import wavlm_phoneme_fr_it
 def get_model():
     checkpoint = "hugofara/wavlm-base-plus-phonemizer-fr-it"
@@ -12,3 +17,74 @@ def get_model():
         checkpoint
     )
     return model, processor

+import numpy as np
+import torch
+import torchaudio
 import transformers
 import wavlm_phoneme_fr_it
+SAMPLING_RATE = 16_000
 def get_model():
     checkpoint = "hugofara/wavlm-base-plus-phonemizer-fr-it"
         checkpoint
     )
     return model, processor
+def preprocess_audio(audio_data, target_sample_rate=SAMPLING_RATE):
+    """Convert a recording to mono float32 audio at the target sample rate.
+
+    Args:
+        audio_data: ``(sample_rate, samples)`` pair, or None. ``samples`` is
+            a NumPy array; arrays with more than one dimension are averaged
+            over axis 1 (assumed to be the channel axis -- TODO confirm
+            against the caller).
+        target_sample_rate: Sample rate of the returned audio.
+
+    Returns:
+        A float32 NumPy array scaled so that its largest absolute sample is
+        1.0 (an all-zero signal is returned unscaled), or None when there is
+        no audio to process (``audio_data`` is None or holds no samples).
+    """
+    if audio_data is None:
+        return None
+    sample_rate, audio = audio_data
+    # An empty recording cannot be peak-normalized (np.max raises ValueError
+    # on a zero-size array), so report it the same way as missing audio.
+    if audio.size == 0:
+        return None
+    # Collapse multi-channel audio to mono.
+    if audio.ndim > 1:
+        audio = audio.mean(axis=1)
+    # Resample if necessary using torchaudio.
+    if sample_rate != target_sample_rate:
+        audio_tensor = torch.from_numpy(audio).float().unsqueeze(0)
+        resampler = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
+        audio = resampler(audio_tensor).squeeze(0).numpy()
+    # Peak-normalize; the peak is computed once and silence is left as is.
+    audio = audio.astype(np.float32)
+    peak = np.max(np.abs(audio))
+    if peak > 0:
+        audio = audio / peak
+    return audio
+def prepare_model_inputs(audio, processor, sampling_rate=SAMPLING_RATE):
+    """Build the keyword inputs that the model is called with.
+
+    The audio is passed through ``processor`` (requesting padded PyTorch
+    tensors), then a ``"language"`` entry holding the tensor ``[[0]]`` is
+    attached to the result.
+    """
+    processor_options = {
+        "sampling_rate": sampling_rate,
+        "return_tensors": "pt",
+        "padding": True,
+    }
+    model_inputs = processor(audio, **processor_options)
+    # Language tensor (assuming French/Italian model)
+    language_ids = torch.tensor([[0]])
+    model_inputs["language"] = language_ids
+    return model_inputs
+def run_inference(model, inputs):
+    """Call the model without gradient tracking.
+
+    Returns a pair ``(outputs, predicted_ids)``: the raw model output and the
+    index of the highest logit along the last dimension of ``outputs.logits``.
+    """
+    with torch.no_grad():
+        model_output = model(**inputs)
+        best_ids = model_output.logits.argmax(dim=-1)
+    return model_output, best_ids
+def decode_transcription(processor, predicted_ids):
+    """Return the first decoded string for a batch of predicted IDs."""
+    decoded_batch = processor.batch_decode(predicted_ids)
+    first_item = decoded_batch[0]
+    return first_item
+def compare_with_target(transcription, target_word):
+    """Format the transcription and, when a target word is given, a match verdict.
+
+    The verdict is a case-insensitive substring test of the stripped target
+    word against the transcription with any "[pad]" markers removed. With an
+    empty, blank or None target word only the transcription line is returned.
+    """
+    header = f"**Transcription:** {transcription}\n\n"
+    if not target_word or not target_word.strip():
+        return header
+    needle = target_word.strip().lower()
+    haystack = transcription.lower().replace("[pad]", "").strip()
+    if needle in haystack:
+        verdict = f"✅ **Match found!** The target word '{target_word}' appears in the transcription."
+    else:
+        verdict = f"❌ **No exact match.** The target word '{target_word}' was not found in the transcription."
+    return header + verdict

hf_space.py CHANGED Viewed

@@ -1,71 +1,31 @@
 import gradio as gr
-import numpy as np
-import torch
-import torchaudio
 import common
 model, processor = common.get_model()
-SAMPLING_RATE = 16_000
 def process_audio(audio_data, target_word):
     """Process recorded audio and return ASR output with target word comparison"""
     if audio_data is None:
         return "Please record some audio first."
-    # Extract audio data and sample rate
-    sample_rate, audio = audio_data
-    # Ensure audio is in the correct format (mono, float32)
-    if len(audio.shape) > 1:
-        audio = audio.mean(axis=1)  # Convert to mono if stereo
-    # Resample if necessary using torchaudio
-    if sample_rate != SAMPLING_RATE:
-        audio_tensor = torch.from_numpy(audio).float().unsqueeze(0)
-        resampled = torchaudio.transforms.Resample(sample_rate, SAMPLING_RATE)(audio_tensor)
-        audio = resampled.squeeze(0).numpy()
-    # Normalize audio
-    audio = audio.astype(np.float32)
-    if np.max(np.abs(audio)) > 0:
-        audio = audio / np.max(np.abs(audio))
-    # Process with the model
-    inputs = processor(
-        audio,
-        sampling_rate=SAMPLING_RATE,
-        return_tensors="pt",
-        padding=True
-    )
-    # Add language tensor (assuming French/Italian model)
-    inputs["language"] = torch.tensor([[0]])
     # Run inference
-    with torch.no_grad():
-        outputs = model(**inputs)
-        logits = outputs.logits
-        predicted_ids = torch.argmax(logits, dim=-1)
-    # Decode the prediction
-    transcription = processor.batch_decode(predicted_ids)[0]
-    # Compare with target word if provided
-    result = f"**Transcription:** {transcription}\n\n"
-    if target_word and target_word.strip():
-        target_clean = target_word.strip().lower()
-        transcription_clean = transcription.lower().replace("[pad]", "").strip()
-        if target_clean in transcription_clean:
-            result += f"✅ **Match found!** The target word '{target_word}' appears in the transcription."
-        else:
-            result += f"❌ **No exact match.** The target word '{target_word}' was not found in the transcription."
-    return result
 def create_interface():
@@ -114,5 +74,5 @@ def create_interface():
 if __name__ == "__main__":
-    demo = create_interface()
-    demo.launch()

 import gradio as gr
 import common
 model, processor = common.get_model()
 def process_audio(audio_data, target_word):
     """Process recorded audio and return ASR output with target word comparison.
+
+    Runs the shared pipeline from ``common`` (preprocess, prepare inputs,
+    infer, decode, compare) using the module-level ``model`` and
+    ``processor``. Returns a message string: a prompt to record when
+    ``audio_data`` is None, a failure notice when preprocessing yields None,
+    otherwise the result of ``common.compare_with_target``.
+    """
     if audio_data is None:
         return "Please record some audio first."
+    # Preprocess audio
+    audio = common.preprocess_audio(audio_data)
+    if audio is None:
+        return "Failed to process audio."
+    # Prepare model inputs
+    inputs = common.prepare_model_inputs(audio, processor)
     # Run inference
+    # NOTE: only predicted_ids is used below; outputs is currently unused.
+    outputs, predicted_ids = common.run_inference(model, inputs)
+    # Decode transcription
+    transcription = common.decode_transcription(processor, predicted_ids)
+    # Compare with target word
+    return common.compare_with_target(transcription, target_word)
 def create_interface():
 if __name__ == "__main__":
+    my_demo = create_interface()
+    my_demo.launch()

main.py CHANGED Viewed

@@ -7,16 +7,17 @@ import sounddevice as sd
 import torch
 from torchcodec.decoders import AudioDecoder
 import wavlm_phoneme_fr_it
 import common
-SAMPLING_RATE = 16_000
-VOCAB_SIZE = 97
 def fake_model(chunk):
     output_length = int(chunk.shape[0] * 0.02)
-    return np.random.rand(output_length, VOCAB_SIZE)
 def update_frame(frames, ax, matrix_plot, tokenizer=None):
@@ -48,8 +49,8 @@ def main(record_mic=False):
     if record_mic:
         print("Recording the microphone...")
         waveform = sd.rec(
-            int(audio_duration * SAMPLING_RATE),
-            samplerate=SAMPLING_RATE,
             channels=1
         ).T
         sd.wait()  # Wait until recording is finished
@@ -58,7 +59,7 @@ def main(record_mic=False):
         audio_file = "ceci est un test.wav"
         decoded = AudioDecoder(audio_file).get_all_samples()
         waveform = decoded.data.numpy()
-        assert decoded.sample_rate == SAMPLING_RATE, f"Bad audio frequency {decoded.sample_rate}"
     # Split audio
     chunks = []
@@ -71,7 +72,7 @@ def main(record_mic=False):
     inputs = processor(
         chunks,
         return_attention_mask=True,
-        sampling_rate=SAMPLING_RATE,
         padding=True
     )
     inputs.update({
@@ -101,7 +102,7 @@ def main(record_mic=False):
     ax.set_title("Animation Preview")
     matrix_plot = ax.matshow(logit_groups[0][0], animated=True, vmin=0, vmax=1)
     logits_list = []
-    masks = inputs["attention_mask"].sum(dim=1) / SAMPLING_RATE
     for i, chunk in enumerate(chunks):
         # logits = fake_model(chunk)  # for testing purposes only
         logits_list.append(logits)

 import torch
 from torchcodec.decoders import AudioDecoder
 import wavlm_phoneme_fr_it
+import json
 import common
 def fake_model(chunk):
+    """Return random scores shaped like model output, for testing.
+
+    The result has ``int(chunk.shape[0] * 0.02)`` rows and one column per
+    entry of ``vocab.json`` plus 3.
+    """
     output_length = int(chunk.shape[0] * 0.02)
+    # json.load parses straight from the file; the encoding is pinned so the
+    # result does not depend on the platform's default locale.
+    with open("vocab.json", encoding="utf-8") as vocab_file:
+        vocab = json.load(vocab_file)
+    # NOTE(review): the +3 presumably accounts for tokens not listed in
+    # vocab.json -- confirm against the tokenizer.
+    vocab_size = len(vocab) + 3
+    return np.random.rand(output_length, vocab_size)
 def update_frame(frames, ax, matrix_plot, tokenizer=None):
     if record_mic:
         print("Recording the microphone...")
         waveform = sd.rec(
+            int(audio_duration * common.SAMPLING_RATE),
+            samplerate=common.SAMPLING_RATE,
             channels=1
         ).T
         sd.wait()  # Wait until recording is finished
         audio_file = "ceci est un test.wav"
         decoded = AudioDecoder(audio_file).get_all_samples()
         waveform = decoded.data.numpy()
+        assert decoded.sample_rate == common.SAMPLING_RATE, f"Bad audio frequency {decoded.sample_rate}"
     # Split audio
     chunks = []
     inputs = processor(
         chunks,
         return_attention_mask=True,
+        sampling_rate=common.SAMPLING_RATE,
         padding=True
     )
     inputs.update({
     ax.set_title("Animation Preview")
     matrix_plot = ax.matshow(logit_groups[0][0], animated=True, vmin=0, vmax=1)
     logits_list = []
+    masks = inputs["attention_mask"].sum(dim=1) / common.SAMPLING_RATE
     for i, chunk in enumerate(chunks):
         # logits = fake_model(chunk)  # for testing purposes only
         logits_list.append(logits)