import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
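# Assumed runtime dependencies (not declared in this file): openai-whisper, transformers,
# torch, sentencepiece, gtts, and gradio; whisper.load_audio also expects the ffmpeg
# binary to be installed on the system.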

# Load models
model_stt = whisper.load_model("base")
model_translation = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
tokenizer_translation = AutoTokenizer.from_pretrained("alirezamsh/small100")
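# Note: the SMaLL-100 model card ships a custom SMALL100Tokenizer (tokenization_small100.py)
# where only the target language is set; AutoTokenizer is kept here as in the original and
# typically resolves to the stock M2M100-style tokenizer instead.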

def speech_to_speech(input_audio, to_lang):
    # gr.inputs.File passes a temporary-file wrapper whose .name attribute is the
    # on-disk path, so the upload can be read directly without re-saving it.
    input_file = input_audio.name

    # Speech-to-Text (STT)
    audio = whisper.load_audio(input_file)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model_stt.device)
    _, probs = model_stt.detect_language(mel)
    options = whisper.DecodingOptions()
    result = whisper.decode(model_stt, mel, options)
    text = result.text
    lang = max(probs, key=probs.get)  # most likely source-language code, e.g. "en"

    # Translate
    tokenizer_translation.src_lang = lang
    tokenizer_translation.tgt_lang = to_lang
    encoded_text = tokenizer_translation(text, return_tensors="pt")
    generated_tokens = model_translation.generate(**encoded_text)
    translated_text = tokenizer_translation.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Text-to-Speech (TTS)
    tts = gTTS(text=translated_text, lang=to_lang)
    output_file = "output_audio.mp3"
    tts.save(output_file)

    # Return the path to the generated MP3; the Audio output below is configured
    # to read it from disk rather than as a raw numpy array.
    return output_file

languages = ["ru", "fr", "es", "de"]  # Example languages: Russian, French, Spanish, German
file_input = gr.inputs.File(label="Upload Audio", accept="audio/*")
dropdown = gr.inputs.Dropdown(languages, label="Translation Language")
audio_output = gr.outputs.Audio(label="Translated Voice", type="numpy")

gr.Interface(
    fn=speech_to_speech,
    inputs=[file_input, dropdown],
    outputs=audio_output,
    title="Speech-to-Speech Translator",
    description="Upload an audio file (MP3, WAV, or FLAC) and choose the target language for translation.",
    theme="default"
).launch()
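# Running this file (e.g. `python app.py`) starts a local Gradio server; passing
# share=True to launch() would additionally create a temporary public link.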