import gradio as gr
import torch
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration
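
# Runtime dependencies: gradio, transformers, torch, and librosa; note that
# librosa typically relies on soundfile/audioread (and often ffmpeg) to decode
# compressed formats such as MP3 and OGG.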

# Load the model and processor once at startup so they are reused across requests
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# Clear forced decoder ids so the language and task are detected automatically
model.config.forced_decoder_ids = None
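

# Whisper operates on 16 kHz mono audio, and the processor pads/truncates each
# clip to a single 30-second window, so longer recordings are only partially
# transcribed unless they are chunked.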
def transcribe_audio(audio_path):
    try:
        if audio_path is None:
            return "Please provide an audio input."

        # Read the audio file and resample it to the 16 kHz rate Whisper expects
        audio, sr = librosa.load(audio_path, sr=16000)

        # Convert the waveform to log-mel input features
        input_features = processor(
            audio,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features

        # Generate token ids and decode them to text
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )
        return transcription[0]
    except Exception as e:
        return f"Error processing audio: {str(e)}"

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Whisper Audio Transcription")

    with gr.Tabs():
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column():
                    audio_file = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Upload Audio File"
                    )
                    upload_button = gr.Button("Transcribe")
                with gr.Column():
                    output_text1 = gr.Textbox(
                        label="Transcription",
                        placeholder="Transcription will appear here...",
                        lines=5
                    )
            upload_button.click(
                fn=transcribe_audio,
                inputs=audio_file,
                outputs=output_text1
            )

        with gr.TabItem("Record Audio"):
            with gr.Row():
                with gr.Column():
                    audio_mic = gr.Audio(
                        sources=["microphone"],
                        type="filepath",
                        label="Record Audio"
                    )
                    record_button = gr.Button("Transcribe")
                with gr.Column():
                    output_text2 = gr.Textbox(
                        label="Transcription",
                        placeholder="Transcription will appear here...",
                        lines=5
                    )
            record_button.click(
                fn=transcribe_audio,
                inputs=audio_mic,
                outputs=output_text2
            )
gr.Markdown("""
### Instructions:
1. Choose either 'Upload Audio' or 'Record Audio' tab
2. Upload an audio file or record using your microphone
3. Click 'Transcribe' to get the transcription
4. The transcribed text will appear in the output box
### Supported Audio Formats:
- WAV
- MP3
- FLAC
- OGG
""")
if __name__ == "__main__":
    demo.launch()