import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import pipeline
import librosa
import soundfile as sf

class EmotionRecognizer:
    """Speech emotion recognition using a pretrained wav2vec2 audio classifier."""

    def __init__(self):
        # Load the pretrained emotion classifier; run on GPU when available.
        self.classifier = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1
        )
        self.target_sr = 16000  # sampling rate expected by wav2vec2 models
        self.max_duration = 10  # seconds of audio to keep per clip

    def process_audio(self, audio_path):
        """Load, preprocess, and classify an audio file; return text and plot data."""
        try:
            # Load the audio and downmix stereo to mono.
            audio, orig_sr = sf.read(audio_path)
            if len(audio.shape) > 1:
                audio = np.mean(audio, axis=1)

            # Resample to the 16 kHz rate the model expects.
            if orig_sr != self.target_sr:
                audio = librosa.resample(
                    y=audio.astype(np.float32),
                    orig_sr=orig_sr,
                    target_sr=self.target_sr
                )
            else:
                audio = audio.astype(np.float32)

            # Peak-normalize, then trim or zero-pad to a fixed-length window.
            audio = librosa.util.normalize(audio)
            max_samples = self.max_duration * self.target_sr
            if len(audio) > max_samples:
                audio = audio[:max_samples]
            else:
                audio = np.pad(audio, (0, max(0, max_samples - len(audio))))

            # The pipeline accepts a raw array plus its sampling rate and
            # returns a list of {"label", "score"} dicts.
            results = self.classifier(
                {"array": audio, "sampling_rate": self.target_sr}
            )
            
            labels = [res["label"] for res in results]
            scores = [res["score"] * 100 for res in results]

            text_output = "\n".join(f"{label}: {score:.2f}%" for label, score in zip(labels, scores))
            # gr.BarPlot expects tabular data, so build a DataFrame rather than a plain dict.
            plot_data = pd.DataFrame({"labels": labels, "values": scores})

            return text_output, plot_data
            
        except Exception as e:
            return f"Error processing audio: {str(e)}", None

def create_interface():
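    """Build the Gradio Blocks UI around an EmotionRecognizer instance."""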
    recognizer = EmotionRecognizer()
    
    with gr.Blocks(title="Audio Emotion Recognition") as interface:
        gr.Markdown("# 🎙️ Audio Emotion Recognition")
        gr.Markdown("Record or upload English speech (3-10 seconds)")
        
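        # Two-column layout: audio input and button on the left, results on the right.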
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Input Audio"
                )
                submit_btn = gr.Button("Analyze", variant="primary")
                
            with gr.Column():
                text_output = gr.Textbox(label="Results", interactive=False)
                plot_output = gr.BarPlot(
                    label="Confidence Scores",
                    x="labels",
                    y="values",
                    color="labels",
                    height=300
                )
        
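        # Wire the button to the recognizer; outputs map to the textbox and bar plot.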
        submit_btn.click(
            fn=recognizer.process_audio,
            inputs=audio_input,
            outputs=[text_output, plot_output]
        )
    
    return interface

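# Build and launch the demo when run as a script.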
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()