import os
import tempfile

import gradio as gr
import whisper
from pydub import AudioSegment

# Load the Whisper model once at startup, not per request.
model = whisper.load_model("base")


def process_audio(upload):
    """Transcribe an uploaded audio file with Whisper.

    Parameters
    ----------
    upload : str or file-like
        The value supplied by the Gradio File component: a filepath string
        (Gradio 4+) or a tempfile-like object with a ``.name`` attribute
        (older Gradio). NOTE: upload objects have no ``.save()`` method —
        the original code crashed on that call.

    Returns
    -------
    str
        The transcribed text.
    """
    # Resolve the path of the uploaded file for either Gradio API shape.
    src_path = upload if isinstance(upload, str) else upload.name

    # Convert to WAV in a private temp file so concurrent requests cannot
    # clobber each other (the original used a fixed "uploaded_audio" path).
    wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(wav_fd)
    try:
        AudioSegment.from_file(src_path).export(wav_path, format="wav")

        # Load and pre-process; pad_or_trim fits the audio to Whisper's
        # 30-second analysis window.
        audio = whisper.load_audio(wav_path)
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(model.device)

        # Detect the spoken language and actually feed it to the decoder
        # (previously the result was computed and then discarded).
        _, probs = model.detect_language(mel)
        detected_language = max(probs, key=probs.get)

        options = whisper.DecodingOptions(language=detected_language)
        result = whisper.decode(model, mel, options)
        return result.text
    finally:
        # Always remove the temporary WAV, even if transcription fails.
        os.remove(wav_path)


# gr.inputs / gr.outputs were removed in modern Gradio; use top-level
# components. `file_types` replaces the invalid `accept` keyword.
audio_input = gr.File(label="Upload Audio", file_types=[".wav", ".mp3"])
text_output = gr.Textbox(label="Transcription")

gr.Interface(
    fn=process_audio,
    inputs=audio_input,
    outputs=text_output,
    title="Audio Transcription",
).launch()