Spaces:

ACloudCenter
/

canary-qwen-transcriber-2.5b

Runtime error

App Files Community

ACloudCenter committed on Sep 3

Commit

2bbd27c

1 Parent(s): 3e50cb7

Modify with tabs for architecture section

Browse files

Files changed (4) hide show

app.py +127 -46
public/chart1.png +3 -0
public/chart2.png +3 -0
public/nvidia-speech.png +3 -0

app.py CHANGED Viewed

@@ -132,11 +132,13 @@ with gr.Blocks(theme=theme) as demo:
                 summarize it or answer questions about it. In LLM mode, the model does not "understand" the raw audio anymore - only
                 its transcript. This model is ready for commercial use.''')
-    # State variables
-    transcript_state = gr.State("")
-    # Example questions
-    example_questions = [
         ["Can you summarize this meeting?"],
         ["Please provide bullet points of the key items."],
         ["What is the TL;DR of this meeting?"],
@@ -144,8 +146,8 @@ with gr.Blocks(theme=theme) as demo:
         ["What was the main topic?"],
     ]
-    # Audio Input and Transcript
-    with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### Audio Input")
             audio_input = gr.Audio(
@@ -160,9 +162,9 @@ with gr.Blocks(theme=theme) as demo:
                 label="Example Audio"
             )
             transcribe_btn = gr.Button("Transcribe Audio", variant="primary", size="lg")
-            clear_audio_btn = gr.Button("Clear Audio")
-        with gr.Column(scale=1):
             gr.Markdown("### Transcript")
             transcript_output = gr.Textbox(
                 label="",
@@ -171,22 +173,22 @@ with gr.Blocks(theme=theme) as demo:
                 max_lines=12,
                 autoscroll=True
             )
-            clear_transcript_btn = gr.Button("Clear Transcript")
-    # Spacing
-    gr.Markdown("---")
-    with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### Interactive Q&A")
             gr.Markdown("#### About Context-Aware Q&A")
             gr.Markdown("""The model retains the full transcript context, allowing you to ask follow-up questions
                         naturally without re-stating information. It understands references like 'they', 'it', or 'that topic'.""")
             gr.Markdown("#### Example Questions")
-            # Examples will be added after msg is defined
-            example_container = gr.Column()
-        with gr.Column(scale=3):
             # Add thinking display above chat
             with gr.Accordion("🧠 Model Thinking", open=False):
                 thinking_box = gr.Textbox(
@@ -211,10 +213,10 @@ with gr.Blocks(theme=theme) as demo:
                     lines=1
                 )
                 submit_chat_btn = gr.Button("Send", variant="primary", scale=1)
-            clear_chat_btn = gr.Button("Clear Chat", size="sm")
-    # Event handlers
-    def submit_question(question, transcript):
         if not question or question.strip() == "":
             yield "", [], ""
         answer, thinking = transcript_qa(transcript, question)
@@ -223,21 +225,21 @@ with gr.Blocks(theme=theme) as demo:
             {"role": "user", "content": question},
             {"role": "assistant", "content": answer}
         ]
-        yield "", messages, thinking
-    # Add examples inside the left column container
-    with example_container:
         gr.Examples(
             examples=example_questions,
             inputs=msg,
             outputs=[msg, chatbot, thinking_box],
             fn=lambda q: submit_question(q, transcript_state.value),
             cache_examples=False,
-            label=""
-        )
-    transcribe_btn.click(
         fn=disable_transcribe,
         outputs=[transcribe_btn]
     ).then(
@@ -249,35 +251,114 @@ with gr.Blocks(theme=theme) as demo:
         outputs=[transcript_output, transcript_state]
     ).then(
         fn=enable_transcribe,
-        outputs=[transcribe_btn]
-    )
-    clear_audio_btn.click(
         fn=lambda: None,
-        outputs=[audio_input]
-    )
-    clear_transcript_btn.click(
         fn=lambda: ("", ""),
-        outputs=[transcript_output, transcript_state]
-    )
-    msg.submit(
         fn=submit_question,
         inputs=[msg, transcript_state],
-        outputs=[msg, chatbot, thinking_box]
-    )
-    submit_chat_btn.click(
         fn=submit_question,
         inputs=[msg, transcript_state],
-        outputs=[msg, chatbot, thinking_box]
-    )
-    clear_chat_btn.click(
         fn=lambda: ([], ""),
-        outputs=[chatbot, thinking_box]
-    )
 demo.queue()
 demo.launch()

                 summarize it or answer questions about it. In LLM mode, the model does not "understand" the raw audio anymore - only
                 its transcript. This model is ready for commercial use.''')
+    with gr.Tabs():
+        with gr.Tab("Transcribe"):
+            # State variables
+            transcript_state = gr.State("")
+            # Example questions
+            example_questions = [
         ["Can you summarize this meeting?"],
         ["Please provide bullet points of the key items."],
         ["What is the TL;DR of this meeting?"],
         ["What was the main topic?"],
     ]
+            # Audio Input and Transcript
+            with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### Audio Input")
             audio_input = gr.Audio(
                 label="Example Audio"
             )
             transcribe_btn = gr.Button("Transcribe Audio", variant="primary", size="lg")
+                clear_audio_btn = gr.Button("Clear Audio")
+            with gr.Column(scale=1):
             gr.Markdown("### Transcript")
             transcript_output = gr.Textbox(
                 label="",
                 max_lines=12,
                 autoscroll=True
             )
+                clear_transcript_btn = gr.Button("Clear Transcript")
+            # Spacing
+            gr.Markdown("---")
+            with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### Interactive Q&A")
             gr.Markdown("#### About Context-Aware Q&A")
             gr.Markdown("""The model retains the full transcript context, allowing you to ask follow-up questions
                         naturally without re-stating information. It understands references like 'they', 'it', or 'that topic'.""")
             gr.Markdown("#### Example Questions")
+                # Examples will be added after msg is defined
+                example_container = gr.Column()
+            with gr.Column(scale=3):
             # Add thinking display above chat
             with gr.Accordion("🧠 Model Thinking", open=False):
                 thinking_box = gr.Textbox(
                     lines=1
                 )
                 submit_chat_btn = gr.Button("Send", variant="primary", scale=1)
+                clear_chat_btn = gr.Button("Clear Chat", size="sm")
+            # Event handlers
+            def submit_question(question, transcript):
         if not question or question.strip() == "":
             yield "", [], ""
         answer, thinking = transcript_qa(transcript, question)
             {"role": "user", "content": question},
             {"role": "assistant", "content": answer}
         ]
+                yield "", messages, thinking
+            # Add examples inside the left column container
+            with example_container:
         gr.Examples(
             examples=example_questions,
             inputs=msg,
             outputs=[msg, chatbot, thinking_box],
             fn=lambda q: submit_question(q, transcript_state.value),
             cache_examples=False,
+                label=""
+            )
+            transcribe_btn.click(
         fn=disable_transcribe,
         outputs=[transcribe_btn]
     ).then(
         outputs=[transcript_output, transcript_state]
     ).then(
         fn=enable_transcribe,
+                outputs=[transcribe_btn]
+            )
+            clear_audio_btn.click(
         fn=lambda: None,
+                outputs=[audio_input]
+            )
+            clear_transcript_btn.click(
         fn=lambda: ("", ""),
+                outputs=[transcript_output, transcript_state]
+            )
+            msg.submit(
         fn=submit_question,
         inputs=[msg, transcript_state],
+                outputs=[msg, chatbot, thinking_box]
+            )
+            submit_chat_btn.click(
         fn=submit_question,
         inputs=[msg, transcript_state],
+                outputs=[msg, chatbot, thinking_box]
+            )
+            clear_chat_btn.click(
         fn=lambda: ([], ""),
+                outputs=[chatbot, thinking_box]
+            )
+        with gr.Tab("Architecture"):
+            gr.Markdown("### Model Performance")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("""
+                    #### Industry-Leading Performance
+                    Canary ranks at the top of the HuggingFace Open ASR Leaderboard with an average word error rate (WER) of **6.67%**. It outperforms all other open-source models by a wide margin.
+                    #### Training Data
+                    Canary is trained on a combination of public and in-house data:
+                    - **85K hours** of transcribed speech for speech recognition
+                    - NVIDIA NeMo text translation models used to generate translations of the original transcripts in all supported languages
+                    Despite using an order of magnitude less data, Canary outperforms the similarly sized Whisper-large-v3 and SeamlessM4T-Medium-v1 models on both transcription and translation tasks.
+                    """)
+                with gr.Column(scale=1):
+                    gr.HTML("""
+                    <div style="text-align: center; padding: 20px;">
+                        <img src="https://developer-blogs.nvidia.com/wp-content/uploads/2024/12/fig-6-nvidia-nemo-canary-architecture.png"
+                             style="width: 100%; max-width: 500px; height: auto; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);"
+                             alt="NVIDIA Canary Architecture">
+                        <p style="margin-top: 10px; color: #666; font-size: 14px;">NVIDIA Canary Architecture</p>
+                    </div>
+                    """)
+            gr.Markdown("### Benchmark Results")
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""
+                    #### Word Error Rate (WER) on MCV 16.1 Test Sets
+                    On the MCV 16.1 test sets for English, Spanish, French, and German, Canary achieved a WER of **5.77** (lower is better).
+                    | Model | Average WER |
+                    |-------|-------------|
+                    | **Canary** | **5.77** |
+                    | SeamlessM4T-v2 | 6.41 |
+                    | Whisper-large-v3 | 8.05 |
+                    | SeamlessM4T-v1 | 9.48 |
+                    """)
+                with gr.Column():
+                    gr.Markdown("""
+                    #### Translation BLEU Scores
+                    **From English** (ES, FR, DE on FLEURS & MExpresso):
+                    - Canary: **30.57** BLEU
+                    **To English** (ES, FR, DE on FLEURS & CoVoST):
+                    - Canary: **34.25** BLEU
+                    *(Higher BLEU scores indicate better translation quality)*
+                    """)
+            gr.Markdown("---")
+            gr.Markdown("""
+            ### Canary Architecture Details
+            Canary is an encoder-decoder model built on NVIDIA innovations:
+            - **Encoder**: Fast-Conformer - an efficient Conformer architecture optimized for ~3x savings on compute and ~4x savings on memory
+            - **Processing**: Audio is processed as log-mel spectrogram features
+            - **Decoder**: Transformer decoder generates output text tokens auto-regressively
+            - **Control**: Special tokens control whether Canary performs transcription or translation
+            - **Tokenizer**: Concatenated tokenizer offers explicit control of output token space
+            #### Licensing
+            - **Model weights**: CC BY-NC 4.0 license (research-friendly, non-commercial)
+            - **Training code**: Apache 2.0 license (available from NeMo)
+            For more information about accessing Canary locally and building on top of it, see the [NVIDIA/NeMo GitHub repository](https://github.com/NVIDIA/NeMo).
+            """)
 demo.queue()
 demo.launch()

public/chart1.png ADDED Viewed

Git LFS Details

SHA256: be21d130cfeb41e4102d09d2d0775321bf371f4bd520f3bb2461312524501907
Pointer size: 131 Bytes
Size of remote file: 202 kB

public/chart2.png ADDED Viewed

Git LFS Details

SHA256: 47374c51a88264715fc8d71ba001bd1ffe9883e7ee6d8b9b45e575c4f995ad26
Pointer size: 130 Bytes
Size of remote file: 12.8 kB

public/nvidia-speech.png ADDED Viewed

Git LFS Details

SHA256: 36487144d11d4f613ff713e4cafefe120bd85b2aabcdb9c1a597553bddef2a27
Pointer size: 131 Bytes
Size of remote file: 150 kB