import os
import tempfile

import numpy as np

import gradio as gr
import whisper

# Load the Whisper model once at startup; reloading per request would be slow.
print("Loading Whisper model...")
model = whisper.load_model("base")  # Options: tiny, base, small, medium, large
print("Model loaded successfully!")


def transcribe_audio(audio):
    """Transcribe audio using Whisper.

    Args:
        audio: One of
            - str: file path of an uploaded audio file
            - tuple: (sample_rate, audio_data) from the microphone component
            - None: nothing uploaded/recorded

    Returns:
        Tuple of (transcription text, detected language). On failure the
        first element carries the error message and the second is "Error".
    """
    if audio is None:
        return "No audio provided", "N/A"

    temp_path = None  # set only for microphone input; cleaned up in finally
    try:
        if isinstance(audio, tuple):
            # Microphone input: (sample_rate, audio_data)
            sr, audio_data = audio

            # Down-mix stereo to mono; Whisper expects single-channel audio.
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)

            # Convert to float32 and normalize to [-1, 1]; guard against an
            # all-silent recording to avoid a divide-by-zero (NaN output).
            audio_data = audio_data.astype(np.float32)
            peak = np.max(np.abs(audio_data))
            if peak > 0:
                audio_data = audio_data / peak

            # Write with the ACTUAL capture rate. Whisper resamples to 16 kHz
            # internally via ffmpeg; hardcoding 16000 here would time-stretch
            # recordings captured at 44.1/48 kHz and wreck transcription.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
                import scipy.io.wavfile as wavfile
                wavfile.write(temp_audio.name, int(sr), audio_data)
                temp_path = temp_audio.name
            audio_path = temp_path
        else:
            # File upload: Gradio hands us a path on disk.
            audio_path = audio

        print(f"Transcribing: {audio_path}")
        result = model.transcribe(audio_path, fp16=False)  # fp16=False for CPU

        transcription = result["text"]
        language = result["language"]
        print(f"Transcription complete. Language: {language}")
        return transcription, language

    except Exception as e:
        error_msg = f"Error during transcription: {str(e)}"
        print(error_msg)
        return error_msg, "Error"
    finally:
        # Always remove the temp WAV, even when transcription raised —
        # the original only deleted it on the success path (leak).
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)


# Build the Gradio interface: two tabs (file upload / microphone recording)
# sharing the same transcription backend.
with gr.Blocks(title="Whisper ASR API") as demo:
    gr.Markdown(
        """
        # 🎙️ Whisper ASR - Speech to Text
        Upload an audio file or record your voice to get transcription.
        Supports multiple languages automatically!
        """
    )

    with gr.Tab("Upload Audio"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload Audio File",
                )
                upload_button = gr.Button("Transcribe", variant="primary")
            with gr.Column():
                transcription_output = gr.Textbox(
                    label="Transcription",
                    lines=5,
                    placeholder="Transcription will appear here...",
                )
                language_output = gr.Textbox(
                    label="Detected Language",
                    lines=1,
                )
        upload_button.click(
            fn=transcribe_audio,
            inputs=audio_input,
            outputs=[transcription_output, language_output],
        )

    with gr.Tab("Record Audio"):
        with gr.Row():
            with gr.Column():
                mic_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="Record Audio",
                )
                record_button = gr.Button("Transcribe Recording", variant="primary")
            with gr.Column():
                mic_transcription_output = gr.Textbox(
                    label="Transcription",
                    lines=5,
                    placeholder="Transcription will appear here...",
                )
                mic_language_output = gr.Textbox(
                    label="Detected Language",
                    lines=1,
                )
        record_button.click(
            fn=transcribe_audio,
            inputs=mic_input,
            outputs=[mic_transcription_output, mic_language_output],
        )

    gr.Markdown(
        """
        ### 📝 Notes:
        - Supports 99+ languages
        - Model: Whisper Base (fastest, good accuracy)
        - Processing time: ~5-15 seconds depending on audio length
        - Max audio length: 30 seconds recommended for best performance
        """
    )


# Launch the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )