"""Gradio web app that transcribes audio using OpenAI's Whisper (small) model.

Offers two tabs — file upload and microphone recording — both wired to the
same `transcribe_audio` handler.
"""

import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import numpy as np

# Load model and processor once at import time so every request reuses them.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# Clear forced decoder ids so generate() infers language/task on its own.
model.config.forced_decoder_ids = None


def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* and return the text.

    Parameters
    ----------
    audio_path : str or None
        Filesystem path to an audio file. Gradio passes ``None`` when the
        user submitted no input.

    Returns
    -------
    str
        The transcription on success, a prompt when no audio was given, or
        an ``"Error processing audio: ..."`` message on failure.
    """
    if audio_path is None:
        return "Please provide an audio input."

    try:
        # Lazy import keeps app startup fast; librosa is only needed here.
        import librosa

        # Whisper expects 16 kHz audio; librosa resamples on load.
        audio, _sr = librosa.load(audio_path, sr=16000)

        input_features = processor(
            audio, sampling_rate=16000, return_tensors="pt"
        ).input_features

        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )
        return transcription[0]
    except Exception as e:
        # UI boundary: surface the error as text rather than crashing the app.
        return f"Error processing audio: {str(e)}"


def _build_transcription_tab(tab_name, source, input_label):
    """Build one tab (audio input, Transcribe button, output box) and wire it.

    Must be called inside an active ``gr.Blocks`` context. The button is
    connected to :func:`transcribe_audio`.
    """
    with gr.TabItem(tab_name):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=[source],
                    type="filepath",
                    label=input_label,
                )
                button = gr.Button("Transcribe")
            with gr.Column():
                output_text = gr.Textbox(
                    label="Transcription",
                    placeholder="Transcription will appear here...",
                    lines=5,
                )
        button.click(fn=transcribe_audio, inputs=audio_input, outputs=output_text)


# Create Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("# Whisper Audio Transcription")

    with gr.Tabs():
        _build_transcription_tab("Upload Audio", "upload", "Upload Audio File")
        _build_transcription_tab("Record Audio", "microphone", "Record Audio")

    gr.Markdown("""
    ### Instructions:
    1. Choose either 'Upload Audio' or 'Record Audio' tab
    2. Upload an audio file or record using your microphone
    3. Click 'Transcribe' to get the transcription
    4. The transcribed text will appear in the output box

    ### Supported Audio Formats:
    - WAV
    - MP3
    - FLAC
    - OGG
    """)

if __name__ == "__main__":
    demo.launch()