Hugo Farajallah committed
Commit b6bd379 · 1 Parent(s): 9bc684b

feat(HF): display alignment matrix as well as two scoring systems.

Files changed (3):
  1. dataset_process.py +29 -12
  2. hf_space.py +65 -25
  3. vocab.json +96 -95
dataset_process.py CHANGED
@@ -121,16 +121,24 @@ def solve_path(prediction, target, path_matrix):
     return matching
 
 
-def display_matrix_result(path_matrix, matching, prediction, target):
-    """Display all the information resulting from a Bellman matching of matrices."""
-    fig, axis = plt.subplots()
-    _model, processor = common.get_model()
+def display_matrix_result(path_matrix, matching, prediction, target, processor=None):
+    """Display all the information resulting from a Bellman matching of matrices.
+
+    Returns the figure instead of showing it directly for use in Gradio.
+    """
+    fig, axis = plt.subplots(figsize=(10, 6))
+
+    if processor is None:
+        _model, processor = common.get_model()
 
     # Display the matrix
-    axis.matshow(path_matrix.T, aspect="auto")
+    im = axis.matshow(path_matrix.T, aspect="auto", cmap='Blues')
+    plt.colorbar(im, ax=axis)
 
     # Set the labels for the axes
-    axis.set_xlabel('Predicted String')
+    axis.set_xlabel('Predicted String', fontsize=12)
+    axis.set_title('Alignment Matrix: Predicted vs Target Phonemes', fontsize=14, pad=20)
+
     # String for the x-axis
     predicted_labels = tuple(map(processor.decode, torch.argmax(prediction, -1)[0]))
     axis.set_xticks(
@@ -143,7 +151,7 @@ def display_matrix_result(path_matrix, matching, prediction, target):
         minor=True
     )
 
-    axis.set_ylabel('Target String')
+    axis.set_ylabel('Target String', fontsize=12)
     target_labels = tuple(map(processor.decode, torch.argmax(target, -1)[0]))
     axis.set_yticks(
         [i for i, label in enumerate(target_labels) if label == ""],
@@ -154,16 +162,25 @@ def display_matrix_result(path_matrix, matching, prediction, target):
         labels=[label for label in target_labels if label != ""],
         minor=True
     )
-    # axis.yaxis.grid(which="major", color='k', linestyle='--')
 
-    axis.grid(which="major", color="black")
-    axis.grid(which="minor", linestyle="--")
+    axis.grid(which="major", color="black", alpha=0.3)
+    axis.grid(which="minor", linestyle="--", alpha=0.2)
+
+    # Plot the optimal path in red
     axis.plot(
         [val[0] for val in matching],
         [val[1] for val in matching],
-        color="red"
+        color="red",
+        linewidth=2,
+        marker='o',
+        markersize=3,
+        label="Optimal Alignment Path"
     )
-    plt.show()
+
+    axis.legend()
+    plt.tight_layout()
+
+    return fig
 
 
 def bellman_matching(prediction, target, insertion_cost=1.3, deletion_cost=3, metric=l2_logit_norm):
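
The key change here is that `display_matrix_result` now returns the Matplotlib figure instead of calling `plt.show()`, which is what lets Gradio render it. A minimal self-contained sketch of that pattern, with stand-in data — the random matrix and diagonal path below are illustrative placeholders, not the repo's real alignment output:

```python
import matplotlib
matplotlib.use("Agg")  # headless backend; a Space's server has no display
import matplotlib.pyplot as plt
import numpy as np
import gradio as gr


def make_alignment_figure():
    # Stand-in data: an 8x5 cost matrix and a roughly diagonal alignment path.
    path_matrix = np.random.rand(8, 5)
    matching = [(i, min(i, 4)) for i in range(8)]

    fig, axis = plt.subplots(figsize=(10, 6))
    im = axis.matshow(path_matrix.T, aspect="auto", cmap="Blues")
    fig.colorbar(im, ax=axis)
    axis.plot([p[0] for p in matching], [p[1] for p in matching],
              color="red", marker="o", label="Optimal Alignment Path")
    axis.legend()
    fig.tight_layout()
    return fig  # Gradio draws a returned Figure inside a gr.Plot component


demo = gr.Interface(fn=make_alignment_figure, inputs=None, outputs=gr.Plot())
```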
hf_space.py CHANGED
@@ -21,10 +21,10 @@ def phonemize_text(text, language):
     return " ".join([word.replace(" ", "") for word in phonemes]) if phonemes and phonemes[0] else ""
 
 
-def process_audio_advanced(audio_data, target_word, language, advanced_mode, insertion_cost, deletion_cost, threshold, temperature):
+def process_audio_advanced(audio_data, target_word, language, advanced_mode, insertion_cost, deletion_cost, threshold, temperature, scoring_method):
     """Process recorded audio with advanced alignment if enabled"""
     if audio_data is None:
-        return "Please record some audio first.", "", ""
+        return "Please record some audio first.", "", "", None
 
     # Convert target word to phonemes if provided
     phonemized_target = ""
@@ -34,7 +34,7 @@ def process_audio_advanced(audio_data, target_word, language, advanced_mode, ins
     # Preprocess audio
     audio = common.preprocess_audio(audio_data)
     if audio is None:
-        return "Failed to process audio.", "", ""
+        return "Failed to process audio.", "", "", None
 
     # Prepare model inputs with correct language
     lang_enum = common.Languages.FR if language == "French" else common.Languages.IT
@@ -51,6 +51,7 @@ def process_audio_advanced(audio_data, target_word, language, advanced_mode, ins
     result += f"**Transcription:** {transcription}\n\n"
 
     alignment_result = ""
+    alignment_plot_fig = None
 
     if target_word and target_word.strip():
         result += f"**Target Word:** {target_word}\n"
@@ -61,7 +62,7 @@ def process_audio_advanced(audio_data, target_word, language, advanced_mode, ins
         try:
             # Encode target phonemes
             target_encoded = dataset_process.encode_phonemes(
-                phonemized_target.split(), processor.tokenizer
+                phonemized_target, processor.tokenizer
             )
 
             # Get model logits (raw outputs before softmax)
@@ -76,26 +77,52 @@ def process_audio_advanced(audio_data, target_word, language, advanced_mode, ins
                 metric=dataset_process.l2_logit_norm
             )
 
-            # Calculate alignment score using user-defined weights
+            # Calculate alignment score using user-defined weights and scoring method
             weights = [insertion_cost, deletion_cost, threshold, temperature]
+            scoring_enum = common.Scoring.NUMBER_CORRECT if scoring_method == "NUMBER_CORRECT" else common.Scoring.PHONEME_DELETION
             score = dataset_process.get_alignment_score(
                 prediction_logits,
                 target_encoded,
-                weights
+                weights,
+                94,
+                scoring=scoring_enum
+            )
+
+            # Generate alignment plot
+            path_matrix = dataset_process.compute_path_matrix(
+                prediction_logits,
+                target_encoded,
+                dataset_process.l2_logit_norm,
+                insertion_cost,
+                deletion_cost
+            )
+            alignment_plot_fig = dataset_process.display_matrix_result(
+                path_matrix, matching, prediction_logits, target_encoded, processor
             )
 
             alignment_result = f"**🔬 Advanced Alignment Analysis:**\n\n"
+            alignment_result += f"**Scoring Method:** {scoring_method}\n"
             alignment_result += f"**Settings:** Insertion={insertion_cost}, Deletion={deletion_cost}, Threshold={threshold}, Temperature={temperature}\n\n"
             alignment_result += f"**Alignment Score:** {alignment_score:.3f}\n"
             alignment_result += f"**Matching Points:** {len(matching)}\n"
-            alignment_result += f"**Classification Score:** {score}/2\n\n"
 
-            if score == 2:
-                alignment_result += "✅ **Perfect Match!** Target phonemes align perfectly with transcription."
-            elif score == 1:
-                alignment_result += "⚠️ **Close Match!** Target phonemes align with 1 minor error."
-            else:
-                alignment_result += "❌ **Poor Match.** Target phonemes don't align well with transcription."
+            if scoring_method == "NUMBER_CORRECT":
+                alignment_result += f"**Correct Phonemes:** {score}/{target_encoded.shape[1]}\n\n"
+                accuracy = score / target_encoded.shape[1] if target_encoded.shape[1] > 0 else 0
+                if accuracy >= 0.9:
+                    alignment_result += "✅ **Excellent Match!** Most target phonemes are correctly aligned."
+                elif accuracy >= 0.7:
+                    alignment_result += "⚠️ **Good Match!** Most target phonemes align well."
+                else:
+                    alignment_result += "❌ **Poor Match.** Many target phonemes don't align correctly."
+            else:  # PHONEME_DELETION
+                alignment_result += f"**Classification Score:** {score}/2\n\n"
+                if score == 2:
+                    alignment_result += "✅ **Perfect Match!** Target phonemes align perfectly with transcription."
+                elif score == 1:
+                    alignment_result += "⚠️ **Close Match!** Target phonemes align with 1 minor error."
+                else:
+                    alignment_result += "❌ **Poor Match.** Target phonemes don't align well with transcription."
 
         except Exception as e:
             alignment_result = f"**⚠️ Alignment Error:** {str(e)}"
@@ -109,13 +136,13 @@ def process_audio_advanced(audio_data, target_word, language, advanced_mode, ins
     else:
         result += f"❌ **No phoneme match.** The phonemized target was not found in the transcription."
 
-    return result, phonemized_target, alignment_result
+    return result, phonemized_target, alignment_result, alignment_plot_fig
 
 
 # Keep the simple function for backward compatibility
 def process_audio(audio_data, target_word, language):
     """Simple audio processing without advanced features"""
-    result, phonemes, _ = process_audio_advanced(audio_data, target_word, language, False, 1.3, 3.0, 0.7, 1.0)
+    result, phonemes, _, _ = process_audio_advanced(audio_data, target_word, language, False, 1.3, 3.0, 0.7, 1.0, "NUMBER_CORRECT")
     return result, phonemes
 
 
@@ -181,6 +208,13 @@ def create_interface():
             info="Softmax temperature for prediction confidence (1.0 = normal)"
         )
 
+        scoring_method = gr.Radio(
+            choices=["NUMBER_CORRECT", "PHONEME_DELETION"],
+            value="NUMBER_CORRECT",
+            label="Scoring Method",
+            info="Method for calculating alignment scores"
+        )
+
         target_word_input = gr.Textbox(
             label="Target Word (optional)",
             placeholder="Enter a word you expect to say...",
@@ -213,6 +247,11 @@ def create_interface():
             label="Alignment Analysis"
         )
 
+        alignment_plot = gr.Plot(
+            label="Alignment Matrix",
+            visible=False
+        )
+
         # Update phonemes when target word or language changes
         def update_phonemes(text, language):
             if text and text.strip():
@@ -223,7 +262,8 @@ def create_interface():
         def toggle_advanced_features(advanced):
             return (
                 gr.update(visible=advanced),  # alignment_output
-                gr.update(visible=advanced)  # weight_controls
+                gr.update(visible=advanced),  # weight_controls
+                gr.update(visible=advanced)  # alignment_plot
             )
 
         target_word_input.change(
@@ -241,29 +281,29 @@ def create_interface():
         advanced_mode.change(
             fn=toggle_advanced_features,
             inputs=advanced_mode,
-            outputs=[alignment_output, weight_controls]
+            outputs=[alignment_output, weight_controls, alignment_plot]
         )
 
         # Main processing function
-        def process_with_mode(audio_data, target_word, language, advanced, ins_cost, del_cost, thresh, temp):
-            result, phonemes, alignment = process_audio_advanced(
-                audio_data, target_word, language, advanced, ins_cost, del_cost, thresh, temp
+        def process_with_mode(audio_data, target_word, language, advanced, ins_cost, del_cost, thresh, temp, score_method):
+            result, phonemes, alignment, plot_fig = process_audio_advanced(
+                audio_data, target_word, language, advanced, ins_cost, del_cost, thresh, temp, score_method
             )
-            return result, phonemes, alignment
+            return result, phonemes, alignment, plot_fig
 
         process_btn.click(
             fn=process_with_mode,
             inputs=[audio_input, target_word_input, language_radio, advanced_mode,
-                    insertion_cost, deletion_cost, threshold, temperature],
-            outputs=[output_text, phonemes_display, alignment_output]
+                    insertion_cost, deletion_cost, threshold, temperature, scoring_method],
+            outputs=[output_text, phonemes_display, alignment_output, alignment_plot]
        )
 
         # Auto-process when audio is recorded
         audio_input.change(
             fn=process_with_mode,
             inputs=[audio_input, target_word_input, language_radio, advanced_mode,
-                    insertion_cost, deletion_cost, threshold, temperature],
-            outputs=[output_text, phonemes_display, alignment_output]
+                    insertion_cost, deletion_cost, threshold, temperature, scoring_method],
+            outputs=[output_text, phonemes_display, alignment_output, alignment_plot]
         )
 
     return demo
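
`bellman_matching`, `compute_path_matrix`, and `get_alignment_score` are this repo's own helpers (defined in dataset_process.py), so their exact semantics are not visible in this diff. As a rough illustration of the dynamic program behind a path matrix like the one plotted above, here is an edit-distance-style accumulated-cost sketch with the same insertion/deletion cost knobs; `sub_cost` stands in for the repo's `l2_logit_norm` frame-distance metric and is an assumption, not the actual implementation:

```python
import numpy as np


def toy_path_matrix(pred_len, target_len, sub_cost, insertion_cost=1.3, deletion_cost=3.0):
    """Accumulated cost of aligning a prediction of pred_len frames to a
    target of target_len phonemes; sub_cost(i, j) is the local match cost."""
    D = np.full((pred_len + 1, target_len + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(pred_len + 1):
        for j in range(target_len + 1):
            if i > 0 and j > 0:  # match / substitute
                D[i, j] = min(D[i, j], D[i - 1, j - 1] + sub_cost(i - 1, j - 1))
            if i > 0:  # extra predicted frame (insertion)
                D[i, j] = min(D[i, j], D[i - 1, j] + insertion_cost)
            if j > 0:  # missed target phoneme (deletion)
                D[i, j] = min(D[i, j], D[i, j - 1] + deletion_cost)
    return D


# Backtracking from D[-1, -1] along the cheapest predecessors yields the list
# of (prediction_index, target_index) pairs — the "matching" that
# display_matrix_result draws as the red path.
```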
vocab.json CHANGED
@@ -1,96 +1,97 @@
 {
-  "[PAD]": 0,
-  "[UNK]": 1,
-  "a": 2,
-  "ã": 3,
-  "b": 4,
-  "c": 5,
-  "d": 6,
-  "d͡z": 7,
-  "d͡ʒ": 8,
-  "e": 9,
-  "ẽ": 10,
-  "f": 11,
-  "g": 12,
-  "h": 13,
-  "i": 14,
-  "j": 15,
-  "k": 16,
-  "l": 17,
-  "m": 18,
-  "": 19,
-  "n": 20,
-  "": 21,
-  "o": 22,
-  "": 23,
-  "p": 24,
-  "": 25,
-  "r": 26,
-  "s": 27,
-  "": 28,
-  "t": 29,
-  "": 30,
-  "t͡s": 31,
-  "t͡ʃ": 32,
-  "u": 33,
-  "": 34,
-  "v": 35,
-  "": 36,
-  "w": 37,
-  "y": 38,
-  "": 39,
-  "z": 40,
-  "": 41,
-  "ø": 42,
-  "øʼ": 43,
-  "ŋ": 44,
-  "ŋʼ": 45,
-  "ɲ": 46,
-  "œ": 47,
-  "œ̃": 48,
-  "ɑ̃": 49,
-  "ɑ̃ʼ": 50,
-  "ɔ": 51,
-  "ɔʼ": 52,
-  "ɔ̃": 53,
-  "ɔ̃ʼ": 54,
-  "ə": 55,
-  "əʼ": 56,
-  "ɛ": 57,
-  "ɛʼ": 58,
-  "ɛː": 59,
-  "ɛ̃": 60,
-  "ɛ̃ʼ": 61,
-  "ɥ": 62,
-  "ʁ": 63,
-  "ʁʼ": 64,
-  "ʃ": 65,
-  "ʈ": 66,
-  "ʒ": 67,
-  "ʒʼ": 68,
-  "ʼ": 69,
-  "ʼa": 70,
-  "ʼe": 71,
-  "ʼi": 72,
-  "ʼj": 73,
-  "ʼo": 74,
-  "ʼu": 75,
-  "ʼy": 76,
-  "ʼœ": 77,
-  "ʼœ̃": 78,
-  "ʼɑ̃": 79,
-  "ʼɔ": 80,
-  "ʼɔ̃": 81,
-  "ʼə": 82,
-  "ʼɛ": 83,
-  "ʼɛ̃": 84,
-  "ʼɥ": 85,
-  "ˈe": 86,
-  "ˈh": 87,
-  "ˈk": 88,
-  "ˈp": 89,
-  "ˈs": 90,
-  "ˈu": 91,
-  "ˈæ": 92,
-  "ˈð": 93
-}
+  "|": 0,
+  "a": 1,
+  "ã": 2,
+  "b": 3,
+  "c": 4,
+  "d": 5,
+  "d͡z": 6,
+  "d͡ʒ": 7,
+  "e": 8,
+  "ẽ": 9,
+  "f": 10,
+  "g": 11,
+  "h": 12,
+  "i": 13,
+  "j": 14,
+  "k": 15,
+  "l": 16,
+  "m": 17,
+  "": 18,
+  "n": 19,
+  "": 20,
+  "o": 21,
+  "": 22,
+  "p": 23,
+  "": 24,
+  "r": 25,
+  "s": 26,
+  "": 27,
+  "t": 28,
+  "": 29,
+  "t͡s": 30,
+  "t͡ʃ": 31,
+  "u": 32,
+  "": 33,
+  "v": 34,
+  "": 35,
+  "w": 36,
+  "y": 37,
+  "": 38,
+  "z": 39,
+  "": 40,
+  "ø": 41,
+  "øʼ": 42,
+  "ŋ": 43,
+  "ŋʼ": 44,
+  "ɲ": 45,
+  "œ": 46,
+  "œ̃": 47,
+  "ɑ̃": 48,
+  "ɑ̃ʼ": 49,
+  "ɔ": 50,
+  "ɔʼ": 51,
+  "ɔ̃": 52,
+  "ɔ̃ʼ": 53,
+  "ə": 54,
+  "əʼ": 55,
+  "ɛ": 56,
+  "ɛʼ": 57,
+  "ɛː": 58,
+  "ɛ̃": 59,
+  "ɛ̃ʼ": 60,
+  "ɥ": 61,
+  "ʁ": 62,
+  "ʁʼ": 63,
+  "ʃ": 64,
+  "ʈ": 65,
+  "ʒ": 66,
+  "ʒʼ": 67,
+  "ʼ": 68,
+  "ʼa": 69,
+  "ʼe": 70,
+  "ʼi": 71,
+  "ʼj": 72,
+  "ʼo": 73,
+  "ʼu": 74,
+  "ʼy": 75,
+  "ʼœ": 76,
+  "ʼœ̃": 77,
+  "ʼɑ̃": 78,
+  "ʼɔ": 79,
+  "ʼɔ̃": 80,
+  "ʼə": 81,
+  "ʼɛ": 82,
+  "ʼɛ̃": 83,
+  "ʼɥ": 84,
+  "ˈe": 85,
+  "ˈh": 86,
+  "ˈk": 87,
+  "ˈp": 88,
+  "ˈs": 89,
+  "ˈu": 90,
+  "ˈæ": 91,
+  "ˈð": 92,
+  "[UNK]": 93,
+  "[PAD]": 94
+}
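
The vocabulary is re-indexed rather than extended: `[PAD]` moves from id 0 to 94, `[UNK]` from 1 to 93, and the word delimiter `|` takes id 0, shifting every phoneme down by one. This is presumably why the literal `94` (the new pad/blank id) is passed to `get_alignment_score` in hf_space.py above. A minimal sketch of loading the updated file with the tokenizer class normally used for CTC vocabularies of this shape — assuming the repo follows the standard Wav2Vec2 setup; `common.get_model()` may construct it differently:

```python
from transformers import Wav2Vec2CTCTokenizer

# The special-token names must match the keys in vocab.json after this commit.
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
print(tokenizer.pad_token_id)  # 94 with the re-indexed vocabulary
```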