|
import gradio as gr |
|
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM |
|
from gtts import gTTS |
|
import tempfile |
|
|
|
|
|
def initialize_model():
    """Load every model the chatbot needs, once, at startup.

    Returns:
        A 4-tuple ``(asr_pipeline, translation_pipeline, tokenizer, causal_lm)``,
        or ``(None, None, None, None)`` if any download/initialization fails.
    """
    try:
        # Speech-to-text: Whisper small checkpoint via the HF pipeline API.
        speech_recognizer = pipeline(
            "automatic-speech-recognition", model="openai/whisper-small"
        )

        # Text translation pipeline (Helsinki-NLP multilingual checkpoint).
        translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-mul")

        # Conversational reply generator: DialoGPT tokenizer + causal LM.
        dialog_checkpoint = "microsoft/DialoGPT-medium"
        dialog_tokenizer = AutoTokenizer.from_pretrained(dialog_checkpoint)
        dialog_model = AutoModelForCausalLM.from_pretrained(dialog_checkpoint)
    except Exception as e:
        # Broad catch is deliberate: any load failure (network, disk, config)
        # is reported and signalled to the caller with a tuple of Nones.
        print(f"Error initializing models: {e}")
        return None, None, None, None

    return speech_recognizer, translator, dialog_tokenizer, dialog_model
|
|
|
|
|
# Load all models once at import time; on failure each name is None, which
# the launch guard at the bottom of the file checks before starting the UI.
asr_model, translation_model, tokenizer, conversation_model = initialize_model()
|
|
|
def chatbot_speech_to_speech(audio_input, target_language):
    """Transcribe the user's audio, generate a chat reply, and speak it back.

    Args:
        audio_input: Path to the recorded audio file (Gradio "filepath" mode).
        target_language: ISO 639-1 code the user speaks ("en", "fr", ...).

    Returns:
        Path to an MP3 file containing the spoken reply, or ``None`` on
        failure. (The previous version returned an error-message string,
        which the ``gr.Audio`` output component cannot render.)
    """
    try:
        # Speech -> text. Whisper transcribes in the spoken language.
        text_input = asr_model(audio_input)["text"]

        # Normalize the user's text to English for the English-only chat model.
        # NOTE(review): the loaded checkpoint is opus-mt-en-mul (English -> X);
        # using it for X -> English may need opus-mt-mul-en instead — confirm.
        if target_language != "en":
            translated_text = translation_model(
                text_input, src_lang=target_language, tgt_lang="en"
            )[0]['translation_text']
        else:
            translated_text = text_input

        # Generate a conversational reply with DialoGPT.
        inputs = tokenizer.encode(translated_text + tokenizer.eos_token, return_tensors='pt')
        response_ids = conversation_model.generate(
            inputs, max_length=100, pad_token_id=tokenizer.eos_token_id
        )
        # Decode only the newly generated tokens (skip the echoed prompt).
        response_text = tokenizer.decode(
            response_ids[:, inputs.shape[-1]:][0], skip_special_tokens=True
        )

        # Translate the English reply back into the user's language.
        if target_language != "en":
            final_response = translation_model(
                response_text, src_lang="en", tgt_lang=target_language
            )[0]['translation_text']
        else:
            final_response = response_text

        # Text -> speech. Close the temp-file handle before gTTS writes to it:
        # the previous version left the handle open, leaking the descriptor and
        # failing on platforms that forbid reopening an open NamedTemporaryFile.
        tts = gTTS(final_response, lang=target_language)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            output_path = tmp.name
        tts.save(output_path)
        return output_path
    except Exception as e:
        # Best-effort boundary: log the failure and return None so the Audio
        # output simply stays empty instead of receiving an unplayable string.
        print(f"Error in processing: {e}")
        return None
|
|
|
|
|
def interface(audio, language):
    """Gradio click handler: delegate straight to the speech-to-speech pipeline."""
    return chatbot_speech_to_speech(audio, language)
|
|
|
|
|
# Build the Gradio UI: a recorded-audio input plus a language selector,
# wired through the `interface` callback to an audio output.
with gr.Blocks() as gradio_ui:

    gr.Markdown("# Multilingual Voice-to-Voice Chatbot for Kids")

    gr.Markdown("### Speak to the chatbot in your selected language and receive a spoken response.")

    # "filepath" mode hands the callback a path on disk rather than raw samples.
    audio_input = gr.Audio(type="filepath", label="Record your message")

    language_dropdown = gr.Dropdown(choices=["en", "fr", "es", "de", "zh", "ur"], label="Select Language")

    # Output component: plays the MP3 path returned by the callback.
    result_audio = gr.Audio(type="filepath", label="Chatbot Response")

    submit_btn = gr.Button("Submit")

    submit_btn.click(fn=interface, inputs=[audio_input, language_dropdown], outputs=result_audio)
|
|
|
|
|
# Only start the web UI when every model loaded successfully; each name is
# either a loaded model object or None (see initialize_model's failure path).
if all(m is not None for m in (asr_model, translation_model, tokenizer, conversation_model)):
    gradio_ui.launch()
else:
    print("Error initializing one or more models. Please check your model configuration.")
|
|