Commit 2bbd27c by ACloudCenter
Parent: 3e50cb7

Modify with tabs for architecture section

Files changed (4):
  1. app.py +127 -46
  2. public/chart1.png +3 -0
  3. public/chart2.png +3 -0
  4. public/nvidia-speech.png +3 -0
app.py CHANGED
@@ -132,11 +132,13 @@ with gr.Blocks(theme=theme) as demo:
  summarize it or answer questions about it. In LLM mode, the model does not "understand" the raw audio anymore - only
  its transcript. This model is ready for commercial use.''')

- # State variables
- transcript_state = gr.State("")
+ with gr.Tabs():
+ with gr.Tab("Transcribe"):
+ # State variables
+ transcript_state = gr.State("")

- # Example questions
- example_questions = [
+ # Example questions
+ example_questions = [
  ["Can you summarize this meeting?"],
  ["Please provide bullet points of the key items."],
  ["What is the TL;DR of this meeting?"],
@@ -144,8 +146,8 @@ with gr.Blocks(theme=theme) as demo:
  ["What was the main topic?"],
  ]

- # Audio Input and Transcript
- with gr.Row():
+ # Audio Input and Transcript
+ with gr.Row():
  with gr.Column(scale=1):
  gr.Markdown("### Audio Input")
  audio_input = gr.Audio(
@@ -160,9 +162,9 @@
  label="Example Audio"
  )
  transcribe_btn = gr.Button("Transcribe Audio", variant="primary", size="lg")
- clear_audio_btn = gr.Button("Clear Audio")
+ clear_audio_btn = gr.Button("Clear Audio")

- with gr.Column(scale=1):
+ with gr.Column(scale=1):
  gr.Markdown("### Transcript")
  transcript_output = gr.Textbox(
  label="",
@@ -171,22 +173,22 @@
  max_lines=12,
  autoscroll=True
  )
- clear_transcript_btn = gr.Button("Clear Transcript")
+ clear_transcript_btn = gr.Button("Clear Transcript")

- # Spacing
- gr.Markdown("---")
-
- with gr.Row():
+ # Spacing
+ gr.Markdown("---")
+
+ with gr.Row():
  with gr.Column(scale=1):
  gr.Markdown("### Interactive Q&A")
  gr.Markdown("#### About Context-Aware Q&A")
  gr.Markdown("""The model retains the full transcript context, allowing you to ask follow-up questions
  naturally without re-stating information. It understands references like 'they', 'it', or 'that topic'.""")
  gr.Markdown("#### Example Questions")
- # Examples will be added after msg is defined
- example_container = gr.Column()
-
- with gr.Column(scale=3):
+ # Examples will be added after msg is defined
+ example_container = gr.Column()
+
+ with gr.Column(scale=3):
  # Add thinking display above chat
  with gr.Accordion("🧠 Model Thinking", open=False):
  thinking_box = gr.Textbox(
@@ -211,10 +213,10 @@
  lines=1
  )
  submit_chat_btn = gr.Button("Send", variant="primary", scale=1)
- clear_chat_btn = gr.Button("Clear Chat", size="sm")
-
- # Event handlers
- def submit_question(question, transcript):
+ clear_chat_btn = gr.Button("Clear Chat", size="sm")
+
+ # Event handlers
+ def submit_question(question, transcript):
  if not question or question.strip() == "":
  yield "", [], ""
  answer, thinking = transcript_qa(transcript, question)
@@ -223,21 +225,21 @@
  {"role": "user", "content": question},
  {"role": "assistant", "content": answer}
  ]
- yield "", messages, thinking
+ yield "", messages, thinking

-
- # Add examples inside the left column container
- with example_container:
+
+ # Add examples inside the left column container
+ with example_container:
  gr.Examples(
  examples=example_questions,
  inputs=msg,
  outputs=[msg, chatbot, thinking_box],
  fn=lambda q: submit_question(q, transcript_state.value),
  cache_examples=False,
- label=""
- )
+ label=""
+ )

- transcribe_btn.click(
+ transcribe_btn.click(
  fn=disable_transcribe,
  outputs=[transcribe_btn]
  ).then(
@@ -249,35 +251,114 @@
  outputs=[transcript_output, transcript_state]
  ).then(
  fn=enable_transcribe,
- outputs=[transcribe_btn]
- )
-
- clear_audio_btn.click(
+ outputs=[transcribe_btn]
+ )
+
+ clear_audio_btn.click(
  fn=lambda: None,
- outputs=[audio_input]
- )
+ outputs=[audio_input]
+ )

- clear_transcript_btn.click(
+ clear_transcript_btn.click(
  fn=lambda: ("", ""),
- outputs=[transcript_output, transcript_state]
- )
+ outputs=[transcript_output, transcript_state]
+ )

- msg.submit(
+ msg.submit(
  fn=submit_question,
  inputs=[msg, transcript_state],
- outputs=[msg, chatbot, thinking_box]
- )
+ outputs=[msg, chatbot, thinking_box]
+ )

- submit_chat_btn.click(
+ submit_chat_btn.click(
  fn=submit_question,
  inputs=[msg, transcript_state],
- outputs=[msg, chatbot, thinking_box]
- )
+ outputs=[msg, chatbot, thinking_box]
+ )

- clear_chat_btn.click(
+ clear_chat_btn.click(
  fn=lambda: ([], ""),
- outputs=[chatbot, thinking_box]
- )
+ outputs=[chatbot, thinking_box]
+ )
+
+ with gr.Tab("Architecture"):
+ gr.Markdown("### Model Performance")
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ gr.Markdown("""
+ #### Industry-Leading Performance
+
+ Canary ranks at the top of the HuggingFace Open ASR Leaderboard with an average word error rate (WER) of **6.67%**. It outperforms all other open-source models by a wide margin.
+
+ #### Training Data
+
+ Canary is trained on a combination of public and in-house data:
+ - **85K hours** of transcribed speech for speech recognition
+ - NVIDIA NeMo text translation models used to generate translations of the original transcripts in all supported languages
+
+ Despite using an order of magnitude less data, Canary outperforms the similarly sized Whisper-large-v3 and SeamlessM4T-Medium-v1 models on both transcription and translation tasks.
+ """)
+
+ with gr.Column(scale=1):
+ gr.HTML("""
+ <div style="text-align: center; padding: 20px;">
+ <img src="https://developer-blogs.nvidia.com/wp-content/uploads/2024/12/fig-6-nvidia-nemo-canary-architecture.png"
+ style="width: 100%; max-width: 500px; height: auto; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);"
+ alt="NVIDIA Canary Architecture">
+ <p style="margin-top: 10px; color: #666; font-size: 14px;">NVIDIA Canary Architecture</p>
+ </div>
+ """)
+
+ gr.Markdown("### Benchmark Results")
+
+ with gr.Row():
+ with gr.Column():
+ gr.Markdown("""
+ #### Word Error Rate (WER) on MCV 16.1 Test Sets
+ On the MCV 16.1 test sets for English, Spanish, French, and German, Canary achieved a WER of **5.77** (lower is better).
+
+ | Model | Average WER |
+ |-------|-------------|
+ | **Canary** | **5.77** |
+ | SeamlessM4T-v2 | 6.41 |
+ | Whisper-large-v3 | 8.05 |
+ | SeamlessM4T-v1 | 9.48 |
+ """)
+
+ with gr.Column():
+ gr.Markdown("""
+ #### Translation BLEU Scores
+
+ **From English** (ES, FR, DE on FLEURS & MExpresso):
+ - Canary: **30.57** BLEU
+
+ **To English** (ES, FR, DE on FLEURS & CoVoST):
+ - Canary: **34.25** BLEU
+
+ *(Higher BLEU scores indicate better translation quality)*
+ """)
+
+ gr.Markdown("---")
+
+ gr.Markdown("""
+ ### Canary Architecture Details
+
+ Canary is an encoder-decoder model built on NVIDIA innovations:
+
+ - **Encoder**: Fast-Conformer - an efficient Conformer architecture optimized for ~3x savings on compute and ~4x savings on memory
+ - **Processing**: Audio is processed as log-mel spectrogram features
+ - **Decoder**: Transformer decoder generates output text tokens auto-regressively
+ - **Control**: Special tokens control whether Canary performs transcription or translation
+ - **Tokenizer**: Concatenated tokenizer offers explicit control of output token space
+
+ #### Licensing
+
+ - **Model weights**: CC BY-NC 4.0 license (research-friendly, non-commercial)
+ - **Training code**: Apache 2.0 license (available from NeMo)
+
+ For more information about accessing Canary locally and building on top of it, see the [NVIDIA/NeMo GitHub repository](https://github.com/NVIDIA/NeMo).
+ """)

  demo.queue()
  demo.launch()
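The core of this change is wrapping the existing Transcribe UI in `gr.Tabs()` / `gr.Tab(...)` while keeping the chained `.click().then()` wiring that disables the transcribe button while the job runs. The sketch below is a minimal, self-contained illustration of that pattern, not the Space's actual code: `fake_transcribe` stands in for the real Canary call, and the disable/enable steps are inlined as lambdas that return updated buttons (Gradio 4+ style).

```python
import gradio as gr

# Stand-in for the Space's real transcription helper; the actual app calls a
# Canary model here. Name and behavior are placeholders for illustration.
def fake_transcribe(audio_path):
    return f"Transcript of {audio_path}" if audio_path else ""

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("Transcribe"):
            transcript_state = gr.State("")
            audio_input = gr.Audio(type="filepath", label="Audio Input")
            transcribe_btn = gr.Button("Transcribe Audio", variant="primary")
            transcript_output = gr.Textbox(label="Transcript", lines=8)

            # Same chaining idea as the commit: disable the button, run the
            # slow job, then re-enable the button once it finishes.
            transcribe_btn.click(
                fn=lambda: gr.Button(interactive=False),
                outputs=[transcribe_btn],
            ).then(
                fn=lambda audio: (fake_transcribe(audio),) * 2,
                inputs=[audio_input],
                outputs=[transcript_output, transcript_state],
            ).then(
                fn=lambda: gr.Button(interactive=True),
                outputs=[transcribe_btn],
            )

        with gr.Tab("Architecture"):
            gr.Markdown("### Model Performance")  # static markdown only

demo.queue()
demo.launch()
```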
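For readers unfamiliar with the WER numbers quoted in the new Benchmark Results table: word error rate counts substitutions, insertions, and deletions against a reference transcript, normalized by the number of reference words, so lower is better. A tiny illustration with the third-party `jiwer` package (the sentences are made up):

```python
# Illustrative only: how a word error rate like those in the table is computed.
from jiwer import wer

reference = "the quarterly numbers were up by five percent"
hypothesis = "the quarterly numbers where up by five percent"

# One substitution out of eight reference words -> 12.50%
print(f"WER: {wer(reference, hypothesis):.2%}")
```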
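The Architecture tab points to the NVIDIA/NeMo repository for running Canary locally. Below is a rough sketch of what that looks like, assuming `nemo_toolkit[asr]` is installed and using the `nvidia/canary-1b` checkpoint as an example; the Space itself may load a different Canary variant, and the exact `transcribe()` arguments vary between NeMo releases.

```python
# Rough sketch: load a Canary checkpoint with NeMo and transcribe one file.
from nemo.collections.asr.models import EncDecMultiTaskModel

# Example checkpoint, not necessarily the one this Space uses.
canary_model = EncDecMultiTaskModel.from_pretrained("nvidia/canary-1b")

# Keep decoding simple for the example: beam size 1 (greedy).
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)

# "sample.wav" is a placeholder path to a 16 kHz mono recording.
predictions = canary_model.transcribe(["sample.wav"], batch_size=1)
print(predictions)
```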
public/chart1.png ADDED

Git LFS Details

  • SHA256: be21d130cfeb41e4102d09d2d0775321bf371f4bd520f3bb2461312524501907
  • Pointer size: 131 Bytes
  • Size of remote file: 202 kB
public/chart2.png ADDED

Git LFS Details

  • SHA256: 47374c51a88264715fc8d71ba001bd1ffe9883e7ee6d8b9b45e575c4f995ad26
  • Pointer size: 130 Bytes
  • Size of remote file: 12.8 kB
public/nvidia-speech.png ADDED

Git LFS Details

  • SHA256: 36487144d11d4f613ff713e4cafefe120bd85b2aabcdb9c1a597553bddef2a27
  • Pointer size: 131 Bytes
  • Size of remote file: 150 kB