import tempfile

import edge_tts
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline

# Inference client for the chat model. The client picks up your Hugging Face
# token from the local login (`huggingface-cli login`) or the HF_TOKEN
# environment variable, if set.
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")

# ASR pipeline for transcribing microphone input.
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")

INITIAL_MESSAGE = (
    "Hi! I'm your music buddy. Tell me how you're feeling and what kind of "
    "tunes you're in the mood for today!"
)


def speech_to_text(speech):
    """Convert recorded speech to text using the ASR pipeline."""
    return asr(speech)["text"]


def classify_mood(input_string):
    """Return (mood, True) if a mood keyword appears in the string, else (None, False)."""
    input_string = input_string.lower()
    mood_words = {"happy", "sad", "instrumental", "party"}
    for word in mood_words:
        if word in input_string:
            return word, True
    return None, False


def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8,
             repetition_penalty=1.0):
    """Stream a model response and swap in a playlist message once a mood is detected."""
    temperature = max(float(temperature), 1e-2)  # the API rejects temperature == 0
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(prompt, history)
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True,
        return_full_text=False,
    )

    output = ""
    for response in stream:
        output += response.token.text

    mood, is_classified = classify_mood(output)
    if is_classified:
        return f"Playing {mood.capitalize()} playlist for you!"
    return output


def format_prompt(message, history):
    """Build the full prompt from the fixed instructions plus the conversation history."""
    fixed_prompt = """
    You are a smart mood analyzer tasked with determining the user's mood for a music recommendation system. Your goal is to classify the user's mood into one of four categories: Happy, Sad, Instrumental, or Party.

    Instructions:
    1. Engage in a conversation with the user to understand their mood.
    2. Ask relevant questions to guide the conversation towards mood classification.
    3. If the user's mood is clear, respond with a single word: "Happy", "Sad", "Instrumental", or "Party".
    4. If the mood is unclear, continue the conversation with a follow-up question.
    5. Limit the conversation to a maximum of 5 exchanges.
    6. Do not classify the mood prematurely if it's not evident from the user's responses.
    7. Focus on the user's emotional state rather than specific activities or preferences.
    8. If unable to classify after 5 exchanges, respond with "Unclear" to indicate the need for more information.

    Remember: Your primary goal is mood classification. Stay on topic and guide the conversation towards understanding the user's emotional state.
    """
    prompt = f"{fixed_prompt}\n"

    # Append the conversation history.
    for i, (user_prompt, bot_response) in enumerate(history):
        prompt += f"User: {user_prompt}\nAssistant: {bot_response}\n"
        if i == 3:  # the 4th exchange (0-indexed), so the next reply is the last
            prompt += "Note: This is the last exchange. Classify the mood if possible or respond with 'Unclear'.\n"

    prompt += f"User: {message}\nAssistant:"
    return prompt


async def text_to_speech(text):
    """Synthesize speech with edge-tts and return the path to the audio file."""
    communicate = edge_tts.Communicate(text)
    # edge-tts produces MP3 audio by default, so use an .mp3 suffix. Close the
    # temp file handle before saving so the write also works on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path


def process_input(input_text, history):
    """Run one chat turn, then clear the textbox and the voice input."""
    if not input_text:
        return history, history, "", None
    response = generate(input_text, history)
    history.append((input_text, response))
    return history, history, "", None


async def generate_audio(history):
    """Speak the most recent assistant response, if any."""
    if history:
        return await text_to_speech(history[-1][1])
    return None


async def init_chat():
    """Seed the chat with the greeting and its spoken version."""
    history = [("", INITIAL_MESSAGE)]
    audio_path = await text_to_speech(INITIAL_MESSAGE)
    return history, history, audio_path


# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("# Mood-Based Music Recommender with Continuous Voice Chat")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Type your message here or use the microphone to speak...")
    audio_output = gr.Audio(label="AI Response", autoplay=True)
    state = gr.State([])

    with gr.Row():
        submit = gr.Button("Send")
        voice_input = gr.Audio(sources="microphone", type="filepath", label="Voice Input")

    # Initialize the chat with the greeting.
    demo.load(init_chat, outputs=[state, chatbot, audio_output])

    # Handle text input (Enter key and Send button), then speak the reply.
    msg.submit(
        process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]
    ).then(generate_audio, inputs=[state], outputs=[audio_output])
    submit.click(
        process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]
    ).then(generate_audio, inputs=[state], outputs=[audio_output])

    # Handle voice input: transcribe, then run the same text pipeline.
    voice_input.stop_recording(
        lambda x: speech_to_text(x) if x else "",
        inputs=[voice_input],
        outputs=[msg],
    ).then(
        process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]
    ).then(generate_audio, inputs=[state], outputs=[audio_output])


if __name__ == "__main__":
    demo.launch(share=True)