import gradio as gr
import torch
import numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

# Initialize model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "Hatman/audio-emotion-detection"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
model.to(device)

# Define emotion labels
EMOTION_LABELS = {
    0: "angry",
    1: "disgust",
    2: "fear",
    3: "happy",
    4: "neutral",
    5: "sad",
    6: "surprise",
}


def process_audio(audio):
    """Process an audio chunk and return the detected emotion label."""
    if audio is None:
        return ""

    # gr.Audio with type="numpy" yields a (sample_rate, data) tuple
    sample_rate = 16000
    if isinstance(audio, tuple):
        sample_rate, audio = audio

    # Convert to float32; microphone input typically arrives as int16
    audio = np.asarray(audio)
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0
    else:
        audio = audio.astype(np.float32)

    # Ensure we have mono audio
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Wav2Vec2 expects 16 kHz audio; resample with linear interpolation if needed
    if sample_rate != 16000 and len(audio) > 1:
        target_len = int(len(audio) * 16000 / sample_rate)
        audio = np.interp(
            np.linspace(0, len(audio) - 1, target_len),
            np.arange(len(audio)),
            audio,
        )

    try:
        # Prepare input for the model
        inputs = feature_extractor(
            audio,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True,
        )

        # Move tensors to the appropriate device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Get prediction
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_id = torch.argmax(logits, dim=-1).item()

        return EMOTION_LABELS[predicted_id]

    except Exception as e:
        print(f"Error processing audio: {e}")
        return "Error processing audio"


# Create Gradio interface
demo = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(
            sources=["microphone"],
            type="numpy",
            streaming=True,
            label="Speak into your microphone",
            show_label=True,
        )
    ],
    outputs=gr.Textbox(label="Detected Emotion"),
    title="Live Emotion Detection",
    description="Speak into your microphone to detect emotions in real-time.",
    live=True,
    allow_flagging="never",
)

# Launch with a small queue for better real-time performance
demo.queue(max_size=1).launch(share=True)