File size: 2,028 Bytes
d7dfa49
da5250a
13b10f1
d7dfa49
 
292172d
d7dfa49
292172d
 
726d965
292172d
 
 
726d965
292172d
 
 
d7dfa49
292172d
 
 
 
 
 
 
d7dfa49
292172d
d7dfa49
726d965
292172d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2604a4
292172d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import gradio as gr
import os
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
import IPython.display as ipd

# Load Whisper speech-to-text model ("base" checkpoint, multilingual).
whisper_model = whisper.load_model("base")

# Load the SMALL-100 many-to-many translation model and its tokenizer.
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")

def translate_speech(audio_file, target_lang):
    """Transcribe speech from *audio_file*, translate it into *target_lang*,
    and synthesize the translation as an MP3 file.

    Parameters
    ----------
    audio_file : str
        Path to the input audio file on disk.
    target_lang : str
        ISO-639-1 code of the language to translate into (e.g. "en", "fr").

    Returns
    -------
    str
        Path to the generated MP3 ("translated_audio.mp3" in the CWD).
    """
    # Load audio and prepare the fixed-length log-Mel spectrogram Whisper expects.
    audio = whisper.load_audio(audio_file)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the spoken language (pick the most probable candidate).
    _, probs = whisper_model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # Decode audio into text, pinning the detected language so decoding
    # does not have to re-detect it internally.
    options = whisper.DecodingOptions(language=lang)
    result = whisper.decode(whisper_model, mel, options)
    text = result.text

    # Translate text. BUG FIX: the SMALL-100 tokenizer selects the OUTPUT
    # language via `tgt_lang` (a target-language token is prepended to the
    # input). The original code set `src_lang` to the detected language and
    # never told the model which language to translate INTO, so
    # `target_lang` only affected the TTS voice, not the translation.
    tokenizer.tgt_lang = target_lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Text-to-speech (TTS) in the requested target language.
    tts = gTTS(text=translated_text, lang=target_lang)
    audio_path = "translated_audio.mp3"
    tts.save(audio_path)

    return audio_path

def translate_speech_interface(audio, target_lang):
    """Gradio callback: persist the recorded audio to disk, run the
    speech-to-speech translation pipeline, and return the translated audio.

    Parameters
    ----------
    audio : file-like
        Recorded audio object supplied by the Gradio Audio input
        (assumed to expose ``.read()`` — TODO confirm against the
        installed Gradio version's Audio ``type`` semantics).
    target_lang : str
        ISO-639-1 code of the language to translate into.

    Returns
    -------
    file object
        Open binary handle on the translated MP3 (what the legacy
        Gradio Audio output consumed).
    """
    audio_path = "recorded_audio.wav"
    # BUG FIX: use a context manager so the recording is flushed and closed
    # before Whisper opens the path — the original left the write handle
    # open for the interpreter to collect, risking a partially-written file.
    with open(audio_path, "wb") as f:
        f.write(audio.read())

    translated_path = translate_speech(audio_path, target_lang)

    # NOTE(review): returning an open handle relies on Gradio closing it.
    # Kept for interface compatibility; consider returning the file path if
    # the configured Audio output accepts one.
    return open(translated_path, "rb")

# Define the Gradio interface.
# NOTE(review): `gr.inputs` / `gr.outputs` is the legacy (pre-3.x) component
# namespace, removed in newer Gradio releases — the modern equivalents are
# gr.Audio(...) and gr.Dropdown(...) passed directly to gr.Interface.
# Confirm the pinned Gradio version before upgrading these calls.
audio_recording = gr.inputs.Audio(source="microphone", type="wav", label="Record your speech")
target_language = gr.inputs.Dropdown(["en", "ru", "fr"], label="Target Language")
# "audio/mpeg" matches the MP3 produced by gTTS in translate_speech.
output_audio = gr.outputs.Audio(type="audio/mpeg", label="Translated Audio")

# Launch the app: microphone + language dropdown in, translated audio out.
gr.Interface(fn=translate_speech_interface, inputs=[audio_recording, target_language], outputs=output_audio, title="Speech Translator").launch()