import gradio as gr
import torch
import numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

# Initialize model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "Hatman/audio-emotion-detection"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
model.to(device)

# Define emotion labels
EMOTION_LABELS = {
    0: "angry",
    1: "disgust",
    2: "fear",
    3: "happy",
    4: "neutral",
    5: "sad",
    6: "surprise",
}


def process_audio(audio):
    """Process an audio chunk and return the detected emotion label."""
    if audio is None:
        return ""

    # gr.Audio with type="numpy" yields a (sample_rate, data) tuple
    sample_rate = 16000
    if isinstance(audio, tuple):
        sample_rate, audio = audio

    # Convert to float32; microphone input typically arrives as int16
    audio = np.asarray(audio)
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0
    else:
        audio = audio.astype(np.float32)

    # Ensure we have mono audio
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Wav2Vec2 expects 16 kHz audio; resample with linear interpolation if needed
    if sample_rate != 16000 and len(audio) > 1:
        target_len = int(len(audio) * 16000 / sample_rate)
        audio = np.interp(
            np.linspace(0, len(audio) - 1, target_len),
            np.arange(len(audio)),
            audio,
        )

    try:
        # Prepare input for the model
        inputs = feature_extractor(
            audio,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True,
        )

        # Move tensors to the appropriate device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Get prediction
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_id = torch.argmax(logits, dim=-1).item()

        return EMOTION_LABELS[predicted_id]

    except Exception as e:
        print(f"Error processing audio: {e}")
        return "Error processing audio"


# Create Gradio interface
demo = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(
            sources=["microphone"],
            type="numpy",
            streaming=True,
            label="Speak into your microphone",
            show_label=True,
        )
    ],
    outputs=gr.Textbox(label="Detected Emotion"),
    title="Live Emotion Detection",
    description="Speak into your microphone to detect emotions in real-time.",
    live=True,
    allow_flagging="never",
)

# Launch with a small queue for better real-time performance
demo.queue(max_size=1).launch(share=True)