import tempfile

import edge_tts
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline

# Inference client for the chat model. The client picks up your Hugging Face
# token from the local login (`huggingface-cli login`) or the HF_TOKEN
# environment variable, if set.
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")

# ASR pipeline for transcribing microphone input.
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")

INITIAL_MESSAGE = (
    "Hi! I'm your music buddy. Tell me how you're feeling and what kind of "
    "tunes you're in the mood for today!"
)


def speech_to_text(speech):
    """Convert recorded speech to text using the ASR pipeline."""
    return asr(speech)["text"]


def classify_mood(input_string):
    """Return (mood, True) if a mood keyword appears in the string, else (None, False)."""
    input_string = input_string.lower()
    mood_words = {"happy", "sad", "instrumental", "party"}
    for word in mood_words:
        if word in input_string:
            return word, True
    return None, False


def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8,
             repetition_penalty=1.0):
    """Stream a model response and swap in a playlist message once a mood is detected."""
    temperature = max(float(temperature), 1e-2)  # the API rejects temperature == 0
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(prompt, history)
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True,
        return_full_text=False,
    )

    output = ""
    for response in stream:
        output += response.token.text

    mood, is_classified = classify_mood(output)
    if is_classified:
        return f"Playing {mood.capitalize()} playlist for you!"
    return output


def format_prompt(message, history):
    """Build the full prompt from the fixed instructions plus the conversation history."""
    fixed_prompt = """
    You are a smart mood analyzer tasked with determining the user's mood for a music recommendation system. Your goal is to classify the user's mood into one of four categories: Happy, Sad, Instrumental, or Party.

    Instructions:
    1. Engage in a conversation with the user to understand their mood.
    2. Ask relevant questions to guide the conversation towards mood classification.
    3. If the user's mood is clear, respond with a single word: "Happy", "Sad", "Instrumental", or "Party".
    4. If the mood is unclear, continue the conversation with a follow-up question.
    5. Limit the conversation to a maximum of 5 exchanges.
    6. Do not classify the mood prematurely if it's not evident from the user's responses.
    7. Focus on the user's emotional state rather than specific activities or preferences.
    8. If unable to classify after 5 exchanges, respond with "Unclear" to indicate the need for more information.

    Remember: Your primary goal is mood classification. Stay on topic and guide the conversation towards understanding the user's emotional state.
    """
    prompt = f"{fixed_prompt}\n"

    # Append the conversation history.
    for i, (user_prompt, bot_response) in enumerate(history):
        prompt += f"User: {user_prompt}\nAssistant: {bot_response}\n"
        if i == 3:  # the 4th exchange (0-indexed), so the next reply is the last
            prompt += "Note: This is the last exchange. Classify the mood if possible or respond with 'Unclear'.\n"

    prompt += f"User: {message}\nAssistant:"
    return prompt


async def text_to_speech(text):
    """Synthesize speech with edge-tts and return the path to the audio file."""
    communicate = edge_tts.Communicate(text)
    # edge-tts produces MP3 audio by default, so use an .mp3 suffix. Close the
    # temp file handle before saving so the write also works on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path


def process_input(input_text, history):
    """Run one chat turn, then clear the textbox and the voice input."""
    if not input_text:
        return history, history, "", None
    response = generate(input_text, history)
    history.append((input_text, response))
    return history, history, "", None


async def generate_audio(history):
    """Speak the most recent assistant response, if any."""
    if history:
        return await text_to_speech(history[-1][1])
    return None


async def init_chat():
    """Seed the chat with the greeting and its spoken version."""
    history = [("", INITIAL_MESSAGE)]
    audio_path = await text_to_speech(INITIAL_MESSAGE)
    return history, history, audio_path


# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("# Mood-Based Music Recommender with Continuous Voice Chat")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Type your message here or use the microphone to speak...")
    audio_output = gr.Audio(label="AI Response", autoplay=True)
    state = gr.State([])

    with gr.Row():
        submit = gr.Button("Send")
        voice_input = gr.Audio(sources="microphone", type="filepath", label="Voice Input")

    # Initialize the chat with the greeting.
    demo.load(init_chat, outputs=[state, chatbot, audio_output])

    # Handle text input (Enter key and Send button), then speak the reply.
    msg.submit(
        process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]
    ).then(generate_audio, inputs=[state], outputs=[audio_output])
    submit.click(
        process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]
    ).then(generate_audio, inputs=[state], outputs=[audio_output])

    # Handle voice input: transcribe, then run the same text pipeline.
    voice_input.stop_recording(
        lambda x: speech_to_text(x) if x else "",
        inputs=[voice_input],
        outputs=[msg],
    ).then(
        process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]
    ).then(generate_audio, inputs=[state], outputs=[audio_output])


if __name__ == "__main__":
    demo.launch(share=True)