Spaces:

frogcho123
/

s2s

Build error

App Files Community

frogcho123 committed on Jun 6, 2023

Commit

b3ba25a

•

1 Parent(s): 2920572

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -12

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 import os
 import whisper
 # Load the Whisper model
 model = whisper.load_model("base")
@@ -8,34 +9,41 @@ model = whisper.load_model("base")
 # Function to process the uploaded audio file and perform transcription
 def process_audio(upload):
     # Save the uploaded audio file
-    file_path = "uploaded_audio.wav"
-    with open(file_path, "wb") as f:
-        f.write(upload.read())
     # Load the audio file and perform preprocessing
-    audio = whisper.load_audio(file_path)
     audio = whisper.pad_or_trim(audio)
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
     # Detect the spoken language
     _, probs = model.detect_language(mel)
     detected_language = max(probs, key=probs.get)
     # Perform transcription using Whisper ASR
     options = whisper.DecodingOptions()
     result = whisper.decode(model, mel, options)
     transcription = result.text
-    # Delete the temporary audio file
-    os.remove(file_path)
     return transcription
 # Create a file input component for uploading the audio file
-audio_input = gr.inputs.File(label="Upload Audio")
 # Create a text output component for displaying the transcription
 text_output = gr.outputs.Textbox(label="Transcription")
 # Create a Gradio interface
 gr.Interface(fn=process_audio, inputs=audio_input, outputs=text_output, title="Audio Transcription").launch()

 import gradio as gr
 import os
 import whisper
+from pydub import AudioSegment
 # Load the Whisper model
 model = whisper.load_model("base")
 # Function to process the uploaded audio file and perform transcription
 def process_audio(upload):
     """Transcribe an uploaded audio file with the module-level Whisper model.

     Args:
         upload: Value Gradio hands over for the file input. Either a path
             string or a file-like object whose ``name`` attribute is the
             path of the uploaded file on disk (assumed from Gradio's File
             component -- confirm against the installed Gradio version).
             ``None`` means no file was provided.

     Returns:
         The transcribed text, or an empty string when nothing was uploaded.
     """
     if upload is None:
         return ""

     # Function-scope import: nothing else in this file needs tempfile.
     import tempfile

     # The uploaded file already lives on disk; there is no ``save`` method
     # on the object Gradio passes in, so read it from its path instead.
     if isinstance(upload, (str, os.PathLike)):
         source_path = upload
     else:
         source_path = upload.name

     # Convert to WAV in a uniquely named temp file so concurrent requests
     # cannot overwrite each other's audio the way a fixed file name would.
     fd, wav_path = tempfile.mkstemp(suffix=".wav")
     os.close(fd)
     try:
         # export() returns the open output handle; close it before reading.
         AudioSegment.from_file(source_path).export(wav_path, format="wav").close()
         # Load the audio fully into memory; the file is not needed afterwards.
         audio = whisper.load_audio(wav_path)
     finally:
         # Always remove the temp file, even if conversion or loading fails.
         os.remove(wav_path)

     # Fit the audio to the model's fixed-length input window and build the
     # log-Mel spectrogram on the same device as the model.
     audio = whisper.pad_or_trim(audio)
     mel = whisper.log_mel_spectrogram(audio).to(model.device)

     # Detect the spoken language
     _, probs = model.detect_language(mel)
     detected_language = max(probs, key=probs.get)

     # Perform transcription using Whisper ASR, reusing the detected language
     # so the decoder does not have to detect it a second time.
     options = whisper.DecodingOptions(language=detected_language)
     result = whisper.decode(model, mel, options)
     return result.text
 # File input for the audio upload, limited to the formats the app accepts.
 # gr.inputs.File has no `accept` keyword (passing it raises TypeError), so the
 # restriction is expressed through the File component's `file_types` list.
 audio_input = gr.File(label="Upload Audio", file_types=[".wav", ".mp3"])
 # Text output component that displays the transcription
 text_output = gr.outputs.Textbox(label="Transcription")
 # Build the Gradio interface around process_audio and start serving it
 gr.Interface(fn=process_audio, inputs=audio_input, outputs=text_output, title="Audio Transcription").launch()