frogcho123 committed on
Commit
bbee8bf
1 Parent(s): a287ca7

Update app.py

Files changed (1)
  1. app.py +53 -39
app.py CHANGED
@@ -1,49 +1,63 @@
  import gradio as gr
- import os
  import whisper
- from pydub import AudioSegment
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from gtts import gTTS
+ import sounddevice as sd
+ import scipy.io.wavfile as wav
+ import os

- # Load the Whisper model
- model = whisper.load_model("base")
-
- # Function to process the uploaded audio file and perform transcription
- def process_audio(upload):
-     # Save the uploaded audio file
-     file_path = "uploaded_audio"
-     upload_path = f"{file_path}.mp3"
-     upload.save(upload_path)
-
-     # Convert the audio file to WAV format
-     wav_path = f"{file_path}.wav"
-     audio = AudioSegment.from_file(upload_path)
-     audio.export(wav_path, format="wav")
-
-     # Load the audio file and perform preprocessing
-     audio = whisper.load_audio(wav_path)
+ def translate_speech_to_speech(input_audio):
+     # Save the input audio to a temporary file
+     input_file = "input_audio" + os.path.splitext(input_audio.name)[1]
+     input_audio.save(input_file)
+
+     # Language detection and translation code from the first code snippet
+     model = whisper.load_model("base")
+     audio = whisper.load_audio(input_file)
      audio = whisper.pad_or_trim(audio)
      mel = whisper.log_mel_spectrogram(audio).to(model.device)
-
-     # Detect the spoken language
      _, probs = model.detect_language(mel)
-     detected_language = max(probs, key=probs.get)
-
-     # Perform transcription using Whisper ASR
+
      options = whisper.DecodingOptions()
      result = whisper.decode(model, mel, options)
-     transcription = result.text
-
-     # Delete the temporary audio files
-     os.remove(upload_path)
-     os.remove(wav_path)
-
-     return transcription
-
- # Create a file input component for uploading the audio file
- audio_input = gr.inputs.File(label="Upload Audio", accept=".wav, .mp3")
-
- # Create a text output component for displaying the transcription
- text_output = gr.outputs.Textbox(label="Transcription")
-
- # Create a Gradio interface
- gr.Interface(fn=process_audio, inputs=audio_input, outputs=text_output, title="Audio Transcription").launch()
+
+     text = result.text
+     lang = max(probs, key=probs.get)
+
+     # Translation code from the first code snippet
+     to_lang = 'ru'
+     tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
+     model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
+
+     tokenizer.src_lang = lang
+     encoded_bg = tokenizer(text, return_tensors="pt")
+     generated_tokens = model.generate(**encoded_bg)
+     translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
+     # Text-to-speech (TTS) code from the first code snippet
+     tts = gTTS(text=translated_text, lang=to_lang)
+     output_file = "translated_speech.mp3"
+     tts.save(output_file)
+
+     # Load the translated audio and return as an output
+     translated_audio = open(output_file, "rb")
+
+     return translated_audio
+
+ title = "Speech-to-Speech Translator"
+
+ input_audio = gr.inputs.Audio(type=["mp3", "wav"])
+ output_audio = gr.outputs.Audio(type=["mp3", "wav"])
+
+ stt_demo = gr.Interface(
+     fn=translate_speech_to_speech,
+     inputs=input_audio,
+     outputs=output_audio,
+     title=title,
+     description="Speak in any language, and the translator will convert it to speech in the target language.",
+ )
+
+ if __name__ == "__main__":
+     stt_demo.launch()
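
A note on the detect-and-decode block carried over into this commit: openai-whisper also exposes that whole pipeline as model.transcribe(), which loads, pads, computes the log-mel spectrogram, detects the language, and decodes internally, and which chunks the full file rather than decoding only the first 30-second window that pad_or_trim leaves. A minimal sketch using only that documented API; the input path is a placeholder for this example:

import whisper

model = whisper.load_model("base")

# transcribe() runs load/pad/log-mel/detect/decode internally and
# processes the whole file in 30-second windows.
result = model.transcribe("input_audio.mp3")  # placeholder path

text = result["text"]      # the transcription
lang = result["language"]  # detected language code, e.g. "en"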
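The translation step is worth double-checking: the commit sets tokenizer.src_lang, but nothing in the generate() call ever requests Russian; to_lang is only passed to gTTS. As I recall the alirezamsh/small100 model card, SMALL-100 is a target-language-conditioned M2M100 variant that pairs M2M100ForConditionalGeneration with a dedicated SMALL100Tokenizer (shipped as tokenization_small100.py in the model repo), whose tgt_lang selects the output language. A sketch under those assumptions; treat the tokenizer class and its import as details to confirm against the model card:

# Sketch assuming the SMALL-100 model card's usage: the tokenizer is
# conditioned on the *target* language, so tgt_lang (not src_lang)
# is what selects Russian output.
from transformers import M2M100ForConditionalGeneration
from tokenization_small100 import SMALL100Tokenizer  # file from the model repo

model = M2M100ForConditionalGeneration.from_pretrained("alirezamsh/small100")
tokenizer = SMALL100Tokenizer.from_pretrained("alirezamsh/small100", tgt_lang="ru")

encoded = tokenizer("Hello, how are you?", return_tensors="pt")
generated = model.generate(**encoded)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])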
 
 
 
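Finally, the interface wiring is likely to fail on any recent Gradio: the gr.inputs and gr.outputs namespaces are deprecated (and later removed), Audio's type parameter takes "filepath" or "numpy" rather than a list of extensions, and with type="filepath" the handler receives a plain path string, which has no .save() or .name to call. A minimal sketch of the same interface against that API, with a placeholder body standing in for the Whisper -> SMALL-100 -> gTTS pipeline:

import gradio as gr

def translate_speech_to_speech(input_path: str) -> str:
    # Placeholder: echo the input back. The commit's pipeline would run
    # here and return the path of the generated translated_speech.mp3.
    return input_path

demo = gr.Interface(
    fn=translate_speech_to_speech,
    inputs=gr.Audio(type="filepath", label="Source speech"),
    outputs=gr.Audio(type="filepath", label="Translated speech"),
    title="Speech-to-Speech Translator",
    description="Speak in any language, and the translator will convert it to speech in the target language.",
)

if __name__ == "__main__":
    demo.launch()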