Spaces:
Runtime error
Runtime error
File size: 2,083 Bytes
d7dfa49 da5250a 13b10f1 d7dfa49 d195d40 d7dfa49 726d965 da5250a 726d965 da5250a d7dfa49 726d965 d7dfa49 726d965 d7dfa49 da5250a 726d965 da5250a 726d965 da5250a d7dfa49 da5250a 726d965 d195d40 d7dfa49 da5250a d195d40 da5250a d195d40 b2604a4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
import gradio as gr
import os
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
import numpy as np
# Load every model once at import time so each request reuses them.
# The translation model and its tokenizer come from the same checkpoint.
_TRANSLATION_CHECKPOINT = "alirezamsh/small100"

model_stt = whisper.load_model("base")
tokenizer_translation = AutoTokenizer.from_pretrained(_TRANSLATION_CHECKPOINT)
model_translation = AutoModelForSeq2SeqLM.from_pretrained(
    _TRANSLATION_CHECKPOINT
)
def speech_to_speech(input_audio, to_lang):
    """Translate spoken audio into synthesised speech in another language.

    The pipeline has three stages: Whisper detects the spoken language and
    transcribes the audio, the seq2seq translation model translates the
    transcript, and gTTS synthesises the translated text.

    Args:
        input_audio: The uploaded audio. Either a file-like object exposing
            its on-disk path as ``name`` or a plain path string.
        to_lang: Target language code, used both as the translation target
            and as the gTTS voice language.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``samples`` is the decoded
        waveform returned by ``whisper.load_audio`` for the synthesised file.

    Raises:
        ValueError: If the transcript or the translation comes back empty.
    """
    # Rate the synthesised MP3 is resampled to when it is decoded below.
    output_sample_rate = 16000

    # The upload already lives on disk; read it from its own path instead of
    # calling ``.save()``, which the upload object is not known to provide.
    input_path = getattr(input_audio, "name", input_audio)

    # Speech-to-Text (STT)
    audio = whisper.load_audio(input_path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model_stt.device)
    _, probs = model_stt.detect_language(mel)
    lang = max(probs, key=probs.get)
    # Half precision is only requested on CUDA; passing the detected language
    # spares ``decode`` from running detection a second time.
    options = whisper.DecodingOptions(
        language=lang,
        fp16=model_stt.device.type == "cuda",
    )
    result = whisper.decode(model_stt, mel, options)
    text = result.text.strip()
    if not text:
        raise ValueError("No speech could be transcribed from the input audio.")

    # Translate
    # NOTE(review): the checkpoint may need its own tokenizer class for
    # ``tgt_lang`` to take effect -- confirm against the model card.
    tokenizer_translation.src_lang = lang
    tokenizer_translation.tgt_lang = to_lang
    encoded = tokenizer_translation(text, return_tensors="pt")
    generated_tokens = model_translation.generate(**encoded)
    translated_text = tokenizer_translation.batch_decode(
        generated_tokens, skip_special_tokens=True
    )[0]
    if not translated_text.strip():
        raise ValueError("Translation produced no text to synthesise.")

    # Text-to-Speech (TTS)
    output_file = "output_audio.mp3"
    gTTS(text=translated_text, lang=to_lang).save(output_file)

    # Decode the MP3 into an actual waveform; ``np.array(output_file)`` would
    # merely wrap the filename string in a 0-d array.
    samples = whisper.load_audio(output_file, sr=output_sample_rate)
    return output_sample_rate, samples
# Target languages offered in the dropdown: Russian, French, Spanish, German.
languages = ["ru", "fr", "es", "de"]

# The legacy ``gr.inputs.File`` wrapper does not take an ``accept`` keyword,
# so passing one raises ``TypeError`` before the app can start. The expected
# formats are stated in the interface description instead.
file_input = gr.inputs.File(label="Upload Audio")
dropdown = gr.inputs.Dropdown(languages, label="Translation Language")
audio_output = gr.outputs.Audio(label="Translated Voice", type="numpy")

# Build the UI around ``speech_to_speech`` and start serving it.
gr.Interface(
    fn=speech_to_speech,
    inputs=[file_input, dropdown],
    outputs=audio_output,
    title="Speech-to-Speech Translator",
    description="Upload an audio file (MP3, WAV, or FLAC) and choose the target language for translation.",
    theme="default",
).launch()
|