# whisper_test / app.py
# Gradio demo: speech-to-text with OpenAI Whisper (HF Spaces commit 81c1475)
import gradio as gr
import whisper
import os
import tempfile
import numpy as np
# Load the Whisper model once at module import time (startup), so every
# request reuses the same in-memory model instead of reloading it.
print("Loading Whisper model...")
# "base" balances speed and accuracy on CPU; larger variants are slower.
model = whisper.load_model("base") # Options: tiny, base, small, medium, large
print("Model loaded successfully!")
def transcribe_audio(audio):
    """
    Transcribe audio with the globally loaded Whisper model.

    Args:
        audio: One of:
            - str: filesystem path to an uploaded audio file
            - tuple: (sample_rate, audio_data) from the microphone widget
            - None: no audio was provided

    Returns:
        tuple[str, str]: (transcription text, detected language code);
        on failure, (error message, "Error"), or ("No audio provided", "N/A")
        when the input is None.
    """
    if audio is None:
        return "No audio provided", "N/A"

    temp_path = None  # set only for microphone input; cleaned up in finally
    try:
        if isinstance(audio, tuple):
            # Microphone input arrives as (sample_rate, numpy array).
            sr, audio_data = audio
            audio_data = audio_data.astype(np.float32)
            # Peak-normalize, guarding against silent clips: dividing by a
            # zero peak would turn the whole buffer into NaNs.
            peak = np.max(np.abs(audio_data))
            if peak > 0:
                audio_data = audio_data / peak
            import scipy.io.wavfile as wavfile
            # mkstemp + close, then write by path: safe on Windows too,
            # where a file held open by NamedTemporaryFile can't be reopened.
            fd, temp_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            # Write with the *actual* capture rate. Whisper resamples to
            # 16 kHz internally via ffmpeg; hard-coding 16000 here would
            # mislabel e.g. 48 kHz recordings and slow/pitch-shift them.
            wavfile.write(temp_path, sr, audio_data)
            audio_path = temp_path
        else:
            # File upload: Gradio hands us a filesystem path string.
            audio_path = audio

        print(f"Transcribing: {audio_path}")
        result = model.transcribe(audio_path, fp16=False)  # fp16=False for CPU
        transcription = result["text"]
        language = result["language"]
        print(f"Transcription complete. Language: {language}")
        return transcription, language
    except Exception as e:
        error_msg = f"Error during transcription: {str(e)}"
        print(error_msg)
        return error_msg, "Error"
    finally:
        # Remove the temp WAV even when transcription raised (the original
        # only cleaned up on the success path, leaking files on error).
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)
# Create the Gradio interface: two tabs (file upload / microphone recording)
# that both feed the same transcribe_audio backend.
# NOTE: the original markdown contained mojibake (UTF-8 emoji bytes decoded
# with the wrong codec); restored to the intended 🎙️ / 📝 characters.
with gr.Blocks(title="Whisper ASR API") as demo:
    gr.Markdown(
        """
# 🎙️ Whisper ASR - Speech to Text
Upload an audio file or record your voice to get transcription.
Supports multiple languages automatically!
"""
    )
    with gr.Tab("Upload Audio"):
        with gr.Row():
            with gr.Column():
                # type="filepath" -> transcribe_audio receives a path string
                audio_input = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload Audio File"
                )
                upload_button = gr.Button("Transcribe", variant="primary")
            with gr.Column():
                transcription_output = gr.Textbox(
                    label="Transcription",
                    lines=5,
                    placeholder="Transcription will appear here..."
                )
                language_output = gr.Textbox(
                    label="Detected Language",
                    lines=1
                )
        upload_button.click(
            fn=transcribe_audio,
            inputs=audio_input,
            outputs=[transcription_output, language_output]
        )
    with gr.Tab("Record Audio"):
        with gr.Row():
            with gr.Column():
                # type="numpy" -> transcribe_audio receives (sample_rate, data)
                mic_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="Record Audio"
                )
                record_button = gr.Button("Transcribe Recording", variant="primary")
            with gr.Column():
                mic_transcription_output = gr.Textbox(
                    label="Transcription",
                    lines=5,
                    placeholder="Transcription will appear here..."
                )
                mic_language_output = gr.Textbox(
                    label="Detected Language",
                    lines=1
                )
        record_button.click(
            fn=transcribe_audio,
            inputs=mic_input,
            outputs=[mic_transcription_output, mic_language_output]
        )
    gr.Markdown(
        """
### 📝 Notes:
- Supports 99+ languages
- Model: Whisper Base (fastest, good accuracy)
- Processing time: ~5-15 seconds depending on audio length
- Max audio length: 30 seconds recommended for best performance
"""
    )
# Start the web server only when executed as a script (not on import).
if __name__ == "__main__":
    # Bind on all interfaces: required inside containers such as HF Spaces;
    # 7860 is the conventional Gradio/Spaces port. No public share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)