# Spaces: runtime-error log header left over from the Hugging Face Spaces page.
# Runtime error
# Runtime error
# Standard library
import os
import tempfile

# Third-party
import gradio as gr
import numpy as np
import whisper

# Load the Whisper model once at import time so every request reuses it.
# Available sizes: tiny, base, small, medium, large.
print("Loading Whisper model...")
model = whisper.load_model("base")
print("Model loaded successfully!")
def transcribe_audio(audio):
    """
    Transcribe an audio input with the module-level Whisper model.

    Args:
        audio: One of:
            - str: filesystem path to an uploaded audio file
            - tuple (sample_rate, samples): microphone capture from Gradio
            - None: nothing was provided

    Returns:
        tuple[str, str]: (transcription text, detected language code), or
        (error message, "Error") if transcription fails.
    """
    if audio is None:
        return "No audio provided", "N/A"

    temp_path = None  # set only when we create a temp file for mic input
    try:
        if isinstance(audio, tuple):
            # Microphone input arrives as (sample_rate, raw samples).
            sr, audio_data = audio
            audio_data = audio_data.astype(np.float32)

            # Downmix multi-channel capture to mono; Whisper expects one channel.
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)

            # Normalize to [-1, 1]; guard against all-zero (silent) input,
            # which would otherwise divide by zero and produce NaNs.
            peak = np.max(np.abs(audio_data))
            if peak > 0:
                audio_data = audio_data / peak

            import scipy.io.wavfile as wavfile
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
                # Write at the actual capture rate (not a hard-coded 16000):
                # mislabeling the rate changes playback speed/pitch and corrupts
                # transcription. Whisper resamples to 16 kHz internally.
                wavfile.write(temp_audio.name, sr, audio_data)
                temp_path = temp_audio.name
            audio_path = temp_path
        else:
            # File upload: Gradio hands us a filesystem path directly.
            audio_path = audio

        print(f"Transcribing: {audio_path}")
        result = model.transcribe(audio_path, fp16=False)  # fp16=False for CPU

        transcription = result["text"]
        language = result["language"]
        print(f"Transcription complete. Language: {language}")
        return transcription, language
    except Exception as e:
        error_msg = f"Error during transcription: {str(e)}"
        print(error_msg)
        return error_msg, "Error"
    finally:
        # Always remove the temp file, even when transcription raises
        # (the original leaked it on any exception after creation).
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)
# Build the Gradio UI: two tabs (file upload / microphone), each wired to the
# same transcribe_audio handler with its own output textboxes.
with gr.Blocks(title="Whisper ASR API") as demo:
    gr.Markdown(
        """
        # ๐๏ธ Whisper ASR - Speech to Text
        Upload an audio file or record your voice to get transcription.
        Supports multiple languages automatically!
        """
    )

    # --- Tab 1: file upload (handler receives a filepath string) ---
    with gr.Tab("Upload Audio"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload Audio File",
                )
                upload_button = gr.Button("Transcribe", variant="primary")
            with gr.Column():
                transcription_output = gr.Textbox(
                    label="Transcription",
                    lines=5,
                    placeholder="Transcription will appear here...",
                )
                language_output = gr.Textbox(label="Detected Language", lines=1)
        upload_button.click(
            fn=transcribe_audio,
            inputs=audio_input,
            outputs=[transcription_output, language_output],
        )

    # --- Tab 2: microphone (handler receives a (sample_rate, ndarray) tuple) ---
    with gr.Tab("Record Audio"):
        with gr.Row():
            with gr.Column():
                mic_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="Record Audio",
                )
                record_button = gr.Button("Transcribe Recording", variant="primary")
            with gr.Column():
                mic_transcription_output = gr.Textbox(
                    label="Transcription",
                    lines=5,
                    placeholder="Transcription will appear here...",
                )
                mic_language_output = gr.Textbox(label="Detected Language", lines=1)
        record_button.click(
            fn=transcribe_audio,
            inputs=mic_input,
            outputs=[mic_transcription_output, mic_language_output],
        )

    gr.Markdown(
        """
        ### ๐ Notes:
        - Supports 99+ languages
        - Model: Whisper Base (fastest, good accuracy)
        - Processing time: ~5-15 seconds depending on audio length
        - Max audio length: 30 seconds recommended for best performance
        """
    )
# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    # Bind to all interfaces on the standard Gradio port; no public share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)