import tempfile

import gradio as gr
from gtts import gTTS
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


# Function to initialize models with exception handling
def initialize_model():
    try:
        # Load ASR (Automatic Speech Recognition) model for voice-to-text
        asr_model = pipeline("automatic-speech-recognition", model="openai/whisper-small")

        # Load a many-to-many translation model. The Helsinki-NLP/opus-mt-en-mul checkpoint
        # only translates English -> other languages, so M2M100 is used here instead:
        # it handles both directions and accepts src_lang/tgt_lang arguments.
        translation_model = pipeline("translation", model="facebook/m2m100_418M")

        # Load conversational model (fine-tuned on dialogues)
        model_name = "microsoft/DialoGPT-medium"  # Example conversational model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)

        return asr_model, translation_model, tokenizer, model
    except Exception as e:
        print(f"Error initializing models: {e}")
        return None, None, None, None


# Initialize the models
asr_model, translation_model, tokenizer, conversation_model = initialize_model()


def chatbot_speech_to_speech(audio_input, target_language):
    try:
        # Step 1: Convert audio to text
        text_input = asr_model(audio_input)["text"]

        # Step 2: Translate the transcript to English if the selected language is not English
        if target_language != "en":
            translated_text = translation_model(
                text_input, src_lang=target_language, tgt_lang="en"
            )[0]["translation_text"]
        else:
            translated_text = text_input

        # Step 3: Generate a conversational response with the dialogue model
        inputs = tokenizer.encode(translated_text + tokenizer.eos_token, return_tensors="pt")
        response_ids = conversation_model.generate(
            inputs, max_length=100, pad_token_id=tokenizer.eos_token_id
        )
        response_text = tokenizer.decode(
            response_ids[:, inputs.shape[-1]:][0], skip_special_tokens=True
        )

        # Step 4: Translate the response back to the target language
        if target_language != "en":
            final_response = translation_model(
                response_text, src_lang="en", tgt_lang=target_language
            )[0]["translation_text"]
        else:
            final_response = response_text

        # Step 5: Convert the response text to speech using gTTS
        tts = gTTS(final_response, lang=target_language)
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        tts.save(temp_file.name)
        temp_file.close()

        return temp_file.name
    except Exception as e:
        # Return None so the Audio output stays empty instead of receiving an
        # error string that Gradio would try to load as a file path.
        print(f"Error in processing: {e}")
        return None


# Gradio interface function
def interface(audio, language):
    return chatbot_speech_to_speech(audio, language)


# Define the Gradio app with Blocks using the latest syntax
with gr.Blocks() as gradio_ui:
    gr.Markdown("# Multilingual Voice-to-Voice Chatbot for Kids")
    gr.Markdown("### Speak to the chatbot in your selected language and receive a spoken response.")

    audio_input = gr.Audio(type="filepath", label="Record your message")
    language_dropdown = gr.Dropdown(choices=["en", "fr", "es", "de", "zh", "ur"], label="Select Language")
    result_audio = gr.Audio(type="filepath", label="Chatbot Response")

    submit_btn = gr.Button("Submit")
    submit_btn.click(fn=interface, inputs=[audio_input, language_dropdown], outputs=result_audio)

# Launch the app only if every model loaded successfully
if asr_model and translation_model and tokenizer and conversation_model:
    gradio_ui.launch()
else:
    print("Error initializing one or more models. Please check your model configuration.")
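

# --- Optional quick test (sketch) ---
# A minimal, commented-out sketch for exercising chatbot_speech_to_speech directly,
# without the Gradio UI. "sample_recording.wav" is a hypothetical local file used
# only for illustration; replace it with any real recording before uncommenting.
#
# if all([asr_model, translation_model, tokenizer, conversation_model]):
#     response_path = chatbot_speech_to_speech("sample_recording.wav", "fr")
#     print(f"Chatbot response audio saved to: {response_path}")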