import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import librosa
import warnings

warnings.filterwarnings("ignore")

class EmotionRecognizer:
    def __init__(self):
        # Initialize the model and feature extractor
        self.model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
        self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(self.model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.sample_rate = 16000
        
        # Emotion labels come from the model's own config so they always match
        # the classifier head of the fine-tuned checkpoint
        self.labels = [self.model.config.id2label[i] for i in range(self.model.config.num_labels)]

    def process_audio(self, audio):
        """Process audio and return emotions with confidence scores"""
        try:
            # Check if audio is a tuple (new Gradio audio format)
            if isinstance(audio, tuple):
                sample_rate, audio_data = audio
            else:
                return "Error: Invalid audio format", None

            # Gradio's numpy audio is typically int16 PCM and may be stereo;
            # normalize to mono float32 in [-1, 1] before resampling
            if np.issubdtype(audio_data.dtype, np.integer):
                audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
            else:
                audio_data = audio_data.astype(np.float32)
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)

            # Resample to the 16 kHz rate the model expects
            if sample_rate != self.sample_rate:
                audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=self.sample_rate)

            # Extract features
            inputs = self.feature_extractor(
                audio_data,
                sampling_rate=self.sample_rate,
                return_tensors="pt",
                padding=True
            ).to(self.device)

            # Get model predictions
            with torch.no_grad():
                outputs = self.model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

            # Process results
            scores = predictions[0].cpu().numpy()
            results = [
                {"label": label, "score": float(score)}
                for label, score in zip(self.labels, scores)
            ]

            # Sort by confidence
            results.sort(key=lambda x: x["score"], reverse=True)

            # Format results for display
            output_text = "Emotion Analysis Results:\n\n"
            output_text += "\n".join([
                f"{result['label'].title()}: {result['score']*100:.2f}%"
                for result in results
            ])

            # Prepare plot data (gr.BarPlot expects a pandas DataFrame)
            plot_data = pd.DataFrame({
                "Emotion": [r["label"].title() for r in results],
                "Confidence (%)": [r["score"] * 100 for r in results],
            })

            return output_text, plot_data

        except Exception as e:
            return f"Error processing audio: {str(e)}", None

def create_interface():
    # Initialize the emotion recognizer
    recognizer = EmotionRecognizer()
    
    # Define processing function for Gradio
    def process_audio_file(audio):
        if audio is None:
            return "Please provide an audio input.", None
            
        output_text, plot_data = recognizer.process_audio(audio)
        
        if plot_data is not None:
            # Gradio 4+ updates a component by returning a new instance
            # (gr.BarPlot.update() was removed)
            return (
                output_text,
                gr.BarPlot(
                    value=plot_data,
                    x="Emotion",
                    y="Confidence (%)",
                    title="Emotion Confidence Scores",
                )
            )
        return output_text, None

    # Create the Gradio interface
    with gr.Blocks(title="Audio Emotion Recognition") as interface:
        gr.Markdown("# 🎭 Audio Emotion Recognition")
        gr.Markdown("""
        Upload an audio file or record directly to analyze the emotional content.
        The model will detect emotions like angry, happy, sad, neutral, and fearful.
        """)
        
        with gr.Row():
            with gr.Column():
                # Input audio component (updated format)
                audio_input = gr.Audio(
                    label="Upload or Record Audio",
                    type="numpy",
                    sources=["microphone", "upload"]
                )
                
                # Process button
                process_btn = gr.Button("Analyze Emotion", variant="primary")
            
            with gr.Column():
                # Output components
                output_text = gr.Textbox(
                    label="Analysis Results",
                    lines=6
                )
                output_plot = gr.BarPlot(
                    title="Emotion Confidence Scores",
                    x_title="Emotions",
                    y_title="Confidence (%)"
                )
        
        # Set up event handler
        process_btn.click(
            fn=process_audio_file,
            inputs=[audio_input],
            outputs=[output_text, output_plot]
        )
        
        gr.Markdown("""
        ### Usage Instructions:
        1. Click the microphone button to record audio or upload an audio file
        2. Click "Analyze Emotion" to process the audio
        3. View the results and confidence scores
        
        ### Notes:
        - For best results, ensure clear audio with minimal background noise
        - Speak naturally and clearly when recording
        - The model works best with speech in English
        """)
    
    return interface

def main():
    # Create and launch the interface
    interface = create_interface()
    interface.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860
    )

if __name__ == "__main__":
    main()
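
# A minimal sketch of using EmotionRecognizer outside the Gradio UI, assuming a
# local file "sample.wav" (hypothetical path). librosa.load returns mono float32
# audio plus its native sample rate, which process_audio accepts as the same
# (sample_rate, ndarray) tuple Gradio would pass in:
#
#   recognizer = EmotionRecognizer()
#   audio_data, sr = librosa.load("sample.wav", sr=None)
#   text, scores = recognizer.process_audio((sr, audio_data))
#   print(text)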