# Install required libraries (pydub also needs ffmpeg available on the system)
!pip install gradio transformers torchaudio pydub

import gradio as gr
import torch
from transformers import pipeline
import torchaudio
from pydub import AudioSegment
from io import BytesIO

# Load Whisper model for speech-to-text
whisper_model = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Mock Groq LLM interaction (replace with real Groq LLM API integration)
def groq_llm_interaction(prompt):
    # Replace this with actual Groq LLM API calls
    return f"Echoing your input: {prompt}"

# Text-to-speech conversion
def text_to_speech(text, output_path="output.wav"):
    # Use a simple TTS model that loads directly with the transformers
    # text-to-speech pipeline (swap in a more advanced model if desired)
    tts = pipeline("text-to-speech", model="facebook/mms-tts-eng")
    result = tts(text)
    # torchaudio.save expects a (channels, samples) tensor
    waveform = torch.tensor(result["audio"]).reshape(1, -1)
    torchaudio.save(output_path, waveform, result["sampling_rate"])
    return output_path

# Convert the recorded audio to WAV bytes that the ASR pipeline accepts
def process_audio(audio_path):
    audio = AudioSegment.from_file(audio_path)
    buf = BytesIO()
    audio.export(buf, format="wav")
    return buf.getvalue()

# Chatbot function
def voice_to_voice_chat(audio_file):
    # Process the audio input
    processed_audio = process_audio(audio_file)
    # Step 1: Speech-to-text (Whisper)
    transcript = whisper_model(processed_audio)["text"]
    # Step 2: LLM interaction
    response_text = groq_llm_interaction(transcript)
    # Step 3: Text-to-speech (TTS)
    response_audio_path = text_to_speech(response_text)
    return response_text, response_audio_path

interface = gr.Interface(
    fn=voice_to_voice_chat,
    inputs=gr.Audio(type="filepath"),   # Recorded/uploaded audio passed as a file path
    outputs=[
        "text",                         # LLM response text
        gr.Audio(type="filepath"),      # Generated audio response as a file path
    ],
    live=True  # Ensure real-time interaction
)

# Launch Gradio app
interface.launch()
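
# --- Optional: real Groq LLM integration ---
# A minimal sketch of replacing the mock groq_llm_interaction above with the
# Groq Python SDK. Assumes `pip install groq`, a GROQ_API_KEY environment
# variable, and an illustrative model name ("llama3-8b-8192") -- adjust the
# model to whatever your Groq account exposes.
import os
from groq import Groq

groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

def groq_llm_interaction(prompt):
    # Send the transcript as a single user message and return the reply text
    completion = groq_client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content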