Boltz79 committed (verified)
Commit 904a0dd · 1 Parent(s): d372a6f

Update app.py

Files changed (1)
  1. app.py +55 -45
app.py CHANGED
@@ -3,88 +3,98 @@ import numpy as np
 import torch
 from transformers import pipeline
 import librosa
-import soundfile as sf

 class EmotionRecognizer:
     def __init__(self):
-        self.classifier = pipeline(
+        self.device = 0 if torch.cuda.is_available() else -1
+        self.model = pipeline(
             "audio-classification",
-            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
-            device=0 if torch.cuda.is_available() else -1
+            model="superb/wav2vec2-base-superb-er",
+            device=self.device
         )
-        self.target_sr = 16000
-        self.max_duration = 10
+        self.target_sr = 16000  # Model's required sample rate
+        self.max_duration = 6  # Optimal duration for this model

-    def process_audio(self, audio_path):
+    def process_audio(self, audio):
         try:
-            audio, orig_sr = sf.read(audio_path)
-            if len(audio.shape) > 1:
-                audio = np.mean(audio, axis=1)
-
-            if orig_sr != self.target_sr:
-                audio = librosa.resample(
-                    y=audio.astype(np.float32),
-                    orig_sr=orig_sr,
+            # Handle Gradio audio input (sample_rate, audio_array)
+            sample_rate, audio_array = audio
+
+            # Convert stereo to mono if needed
+            if len(audio_array.shape) > 1:
+                audio_array = np.mean(audio_array, axis=1)
+
+            # Convert to float32 and normalize
+            audio_array = audio_array.astype(np.float32)
+            audio_array /= np.max(np.abs(audio_array))
+
+            # Resample if necessary
+            if sample_rate != self.target_sr:
+                audio_array = librosa.resample(
+                    audio_array,
+                    orig_sr=sample_rate,
                     target_sr=self.target_sr
                 )
-            else:
-                audio = audio.astype(np.float32)

-            audio = librosa.util.normalize(audio)
+            # Trim to max duration
             max_samples = self.max_duration * self.target_sr
-            if len(audio) > max_samples:
-                audio = audio[:max_samples]
-            else:
-                audio = np.pad(audio, (0, max(0, max_samples - len(audio))))
+            if len(audio_array) > max_samples:
+                audio_array = audio_array[:max_samples]

-            results = self.classifier(
-                {"array": audio, "sampling_rate": self.target_sr}
-            )
+            # Run inference
+            results = self.model({
+                "array": audio_array,
+                "sampling_rate": self.target_sr
+            })

-            labels = [res["label"] for res in results]
-            scores = [res["score"] * 100 for res in results]
-
-            text_output = "\n".join([f"{label}: {score:.2f}%" for label, score in zip(labels, scores)])
-            plot_data = {"labels": labels, "values": scores}
+            # Format output
+            output_text = "\n".join(
+                [f"{res['label']}: {res['score']*100:.1f}%"
+                 for res in results]
+            )
+            plot_data = {
+                "labels": [res["label"] for res in results],
+                "scores": [res["score"]*100 for res in results]
+            }

-            return text_output, plot_data
+            return output_text, plot_data

         except Exception as e:
-            return f"Error processing audio: {str(e)}", None
+            return f"Error: {str(e)}", None

 def create_interface():
     recognizer = EmotionRecognizer()

-    with gr.Blocks(title="Audio Emotion Recognition") as interface:
-        gr.Markdown("# 🎙️ Audio Emotion Recognition")
-        gr.Markdown("Record or upload English speech (3-10 seconds)")
+    with gr.Blocks(title="Voice Emotion Analysis") as app:
+        gr.Markdown("# 🎤 Real-time Voice Emotion Analysis")
+        gr.Markdown("Record or upload short audio clips (3-6 seconds)")

         with gr.Row():
             with gr.Column():
                 audio_input = gr.Audio(
                     sources=["microphone", "upload"],
-                    type="filepath",
+                    type="numpy",
                     label="Input Audio"
                 )
-                submit_btn = gr.Button("Analyze", variant="primary")
-
+                analyze_btn = gr.Button("Analyze Emotion", variant="primary")
+
             with gr.Column():
-                text_output = gr.Textbox(label="Results", interactive=False)
-                plot_output = gr.BarPlot(
-                    label="Confidence Scores",
+                output_text = gr.Textbox(label="Emotion Results", lines=4)
+                output_plot = gr.BarPlot(
                     x="labels",
-                    y="values",
+                    y="scores",
+                    title="Emotion Distribution",
                     color="labels",
                     height=300
                 )

-        submit_btn.click(
+        analyze_btn.click(
             fn=recognizer.process_audio,
             inputs=audio_input,
-            outputs=[text_output, plot_output]
+            outputs=[output_text, output_plot]
         )

-    return interface
+    return app

 if __name__ == "__main__":
     demo = create_interface()
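
Note: with type="numpy", Gradio now hands process_audio a (sample_rate, audio_array) tuple instead of a file path. Below is a minimal sketch of exercising the new input path outside the UI; it assumes app.py is importable as `app` and that the parts of the file outside this hunk (top-of-file imports and the launch call) are unchanged.

# Sketch: local check of the new numpy input path.
# Assumes `from app import EmotionRecognizer` works and the usual
# torch/transformers/librosa dependencies are installed.
import numpy as np
from app import EmotionRecognizer

recognizer = EmotionRecognizer()

# Gradio's type="numpy" audio component yields (sample_rate, audio_array);
# fake 3 seconds of noise at 44.1 kHz so the resampling branch is exercised.
sample_rate = 44100
audio_array = np.random.uniform(-1.0, 1.0, sample_rate * 3).astype(np.float32)

text, plot_data = recognizer.process_audio((sample_rate, audio_array))
print(text)       # one "label: score%" line per emotion, or the error string
print(plot_data)  # {"labels": [...], "scores": [...]} for the BarPlot, or None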