File size: 2,109 Bytes
d84f405
 
 
 
 
3420bec
 
d84f405
17c527a
 
 
 
c89a3ea
17c527a
 
30318a6
17c527a
 
30318a6
17c527a
 
d84f405
17c527a
 
3420bec
17c527a
30318a6
17c527a
 
d84f405
17c527a
 
 
d84f405
17c527a
 
 
 
d84f405
17c527a
 
 
 
d84f405
17c527a
d84f405
17c527a
 
d84f405
 
 
 
17c527a
c89a3ea
d84f405
 
0123058
c89a3ea
 
 
d84f405
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
import sentencepiece


def translate_voice(file, target_lang):
    """Transcribe an audio file, translate the transcript, and speak the result.

    Pipeline: Whisper (speech-to-text) -> SMaLL-100 (translation) ->
    gTTS (text-to-speech).

    Parameters
    ----------
    file : object
        Uploaded audio; must expose a ``.name`` filesystem path
        (the shape gr.inputs.File provides).
    target_lang : str
        ISO 639-1 code of the language to translate into (e.g. 'en', 'de').
        Must be supported by both SMaLL-100 and gTTS.

    Returns
    -------
    tuple[str, str, str, str]
        (path to the synthesized mp3, original transcript, translated text,
        target_lang). On any failure the first element is the error message
        and the remaining elements are empty strings.
    """
    try:
        # Whisper defaults to fp16 weights; cast to float32 so inference
        # works on CPU.
        model = whisper.load_model("base").float()

        audio = whisper.load_audio(file.name)
        # Whisper operates on a fixed 30-second window.
        audio = whisper.pad_or_trim(audio)

        # Build the log-Mel spectrogram on the model's device, also in
        # float32 to match the model weights.
        mel = whisper.log_mel_spectrogram(audio).to(model.device).float()

        # Default DecodingOptions let Whisper detect the spoken language
        # itself; the previous explicit detect_language() result was
        # computed but never used, so it is omitted.
        result = whisper.decode(model, mel, whisper.DecodingOptions())
        text = result.text

        # SMaLL-100 is conditioned on the *target* language: the model card
        # requires tokenizer.tgt_lang. The original code set src_lang, so
        # the requested target language was never applied to generation.
        tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
        translator = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
        tokenizer.tgt_lang = target_lang

        encoded = tokenizer(text, return_tensors="pt")
        generated = translator.generate(**encoded)
        translated_text = tokenizer.batch_decode(
            generated, skip_special_tokens=True
        )[0]

        # Synthesize the translation to an mp3 for the Audio output slot.
        tts = gTTS(text=translated_text, lang=target_lang)
        filename = "to_speech.mp3"
        tts.save(filename)

        return filename, text, translated_text, target_lang

    except Exception as e:
        # Best-effort UI error path: surface the message in the first
        # output slot rather than crashing the gradio worker.
        return str(e), "", "", ""

# Gradio UI wiring. The gr.inputs / gr.outputs namespaces are deprecated and
# were removed in Gradio 3.x+; the equivalent top-level components are used
# instead (same components, labels, and ordering).
iface = gr.Interface(
    fn=translate_voice,
    inputs=[
        gr.File(label="Your Audio"),
        gr.Dropdown(choices=['en', 'ru', 'de', 'fr'], label="Target Language"),
    ],
    outputs=[
        # type="filepath" matches translate_voice returning an mp3 path.
        gr.Audio(type="filepath", label="Translated Audio"),
        gr.Textbox(label="Original Text"),
        gr.Textbox(label="Translated Text"),
        gr.Textbox(label="Target Language"),
    ],
)
# Launch at import time, as the original script does.
iface.launch()