# Install required libraries (pydub also needs ffmpeg available on the system)
!pip install gradio transformers torchaudio pydub

import gradio as gr
import torch
from transformers import pipeline
import torchaudio
from pydub import AudioSegment
from io import BytesIO

# Load Whisper model for speech-to-text
whisper_model = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Mock Groq LLM interaction (replace with real Groq LLM API integration)
def groq_llm_interaction(prompt):
    # Replace this with actual Groq LLM API calls
    return f"Echoing your input: {prompt}"

# Text-to-speech conversion
def text_to_speech(text, output_path="output.wav"):
    # Use a simple TTS model that loads directly with the transformers
    # text-to-speech pipeline (swap in a more advanced model if desired)
    tts = pipeline("text-to-speech", model="facebook/mms-tts-eng")
    result = tts(text)
    # torchaudio.save expects a (channels, samples) tensor
    waveform = torch.tensor(result["audio"]).reshape(1, -1)
    torchaudio.save(output_path, waveform, result["sampling_rate"])
    return output_path

# Convert the recorded audio to WAV bytes that the ASR pipeline accepts
def process_audio(audio_path):
    audio = AudioSegment.from_file(audio_path)
    buf = BytesIO()
    audio.export(buf, format="wav")
    return buf.getvalue()

# Chatbot function
def voice_to_voice_chat(audio_file):
    # Process the audio input
    processed_audio = process_audio(audio_file)
    # Step 1: Speech-to-text (Whisper)
    transcript = whisper_model(processed_audio)["text"]
    # Step 2: LLM interaction
    response_text = groq_llm_interaction(transcript)
    # Step 3: Text-to-speech (TTS)
    response_audio_path = text_to_speech(response_text)
    return response_text, response_audio_path

interface = gr.Interface(
    fn=voice_to_voice_chat,
    inputs=gr.Audio(type="filepath"),   # Recorded/uploaded audio passed as a file path
    outputs=[
        "text",                         # LLM response text
        gr.Audio(type="filepath"),      # Generated audio response as a file path
    ],
    live=True  # Ensure real-time interaction
)

# Launch Gradio app
interface.launch()
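
# --- Optional: real Groq LLM integration ---
# A minimal sketch of replacing the mock groq_llm_interaction above with the
# Groq Python SDK. Assumes `pip install groq`, a GROQ_API_KEY environment
# variable, and an illustrative model name ("llama3-8b-8192") -- adjust the
# model to whatever your Groq account exposes.
import os
from groq import Groq

groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

def groq_llm_interaction(prompt):
    # Send the transcript as a single user message and return the reply text
    completion = groq_client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content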