import os
import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
import sentencepiece
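
# Assumed runtime dependencies: gradio, openai-whisper, transformers, torch,
# gtts and sentencepiece (needed by the SMALL-100 tokenizer).
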
def translate_voice(file, target_lang):
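    """Transcribe an uploaded audio file with Whisper, translate the text into
    target_lang with SMALL-100, and synthesize the translation with gTTS.

    Returns the path of the generated MP3 plus the original text, the
    translated text, and the target language code."""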
    # Transcribe the uploaded audio with Whisper (fp32 to avoid fp16-on-CPU issues)
    model = whisper.load_model("base").float()

    # Gradio may pass either a file path (newer versions) or a tempfile-like object
    audio_path = file if isinstance(file, str) else file.name
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(model.device).float()
    _, probs = model.detect_language(mel)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    text = result.text
    lang = max(probs, key=probs.get)  # detected source language (informational only)

    # Translate with SMALL-100; the model expects the *target* language code as the
    # prefix of the source text, which is why it is assigned to src_lang here
    tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
    model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
    tokenizer.src_lang = target_lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Synthesize the translated text back to speech with gTTS
    tts = gTTS(text=translated_text, lang=target_lang)
    filename = "to_speech.mp3"
    tts.save(filename)

    return filename, text, translated_text, target_lang

# Gradio UI: modern component classes are used here, since the old
# gr.inputs / gr.outputs namespaces were removed in Gradio 4
iface = gr.Interface(
    fn=translate_voice,
    inputs=[
        gr.File(label="Your Audio"),
        gr.Dropdown(choices=['en', 'ru', 'de', 'fr', 'bg'], label="Target Language"),
    ],
    outputs=[
        gr.Audio(type="filepath", label="Translated Audio"),
        gr.Textbox(label="Original Text"),
        gr.Textbox(label="Translated Text"),
        gr.Textbox(label="Target Language"),
    ],
)
iface.launch()