Boltz79 committed
Commit 7539cee · verified · 1 Parent(s): 904a0dd

Update app.py

Files changed (1)
  1. app.py +120 -92
app.py CHANGED
@@ -1,101 +1,129 @@
  import gradio as gr
- import numpy as np
- import torch
- from transformers import pipeline
  import librosa
-
- class EmotionRecognizer:
-     def __init__(self):
-         self.device = 0 if torch.cuda.is_available() else -1
-         self.model = pipeline(
-             "audio-classification",
-             model="superb/wav2vec2-base-superb-er",
-             device=self.device
-         )
-         self.target_sr = 16000  # Model's required sample rate
-         self.max_duration = 6  # Optimal duration for this model
-
-     def process_audio(self, audio):
-         try:
-             # Handle Gradio audio input (sample_rate, audio_array)
-             sample_rate, audio_array = audio
-
-             # Convert stereo to mono if needed
-             if len(audio_array.shape) > 1:
-                 audio_array = np.mean(audio_array, axis=1)
-
-             # Convert to float32 and normalize
-             audio_array = audio_array.astype(np.float32)
-             audio_array /= np.max(np.abs(audio_array))
-
-             # Resample if necessary
-             if sample_rate != self.target_sr:
-                 audio_array = librosa.resample(
-                     audio_array,
-                     orig_sr=sample_rate,
-                     target_sr=self.target_sr
-                 )
-
-             # Trim to max duration
-             max_samples = self.max_duration * self.target_sr
-             if len(audio_array) > max_samples:
-                 audio_array = audio_array[:max_samples]
-
-             # Run inference
-             results = self.model({
-                 "array": audio_array,
-                 "sampling_rate": self.target_sr
-             })
-
-             # Format output
-             output_text = "\n".join(
-                 [f"{res['label']}: {res['score']*100:.1f}%"
-                  for res in results]
-             )
-             plot_data = {
-                 "labels": [res["label"] for res in results],
-                 "scores": [res["score"]*100 for res in results]
-             }
-
-             return output_text, plot_data
-
-         except Exception as e:
-             return f"Error: {str(e)}", None
-
- def create_interface():
-     recognizer = EmotionRecognizer()
-
-     with gr.Blocks(title="Voice Emotion Analysis") as app:
-         gr.Markdown("# 🎤 Real-time Voice Emotion Analysis")
-         gr.Markdown("Record or upload short audio clips (3-6 seconds)")
-
-         with gr.Row():
-             with gr.Column():
-                 audio_input = gr.Audio(
-                     sources=["microphone", "upload"],
-                     type="numpy",
-                     label="Input Audio"
-                 )
-                 analyze_btn = gr.Button("Analyze Emotion", variant="primary")
-
-             with gr.Column():
-                 output_text = gr.Textbox(label="Emotion Results", lines=4)
-                 output_plot = gr.BarPlot(
-                     x="labels",
-                     y="scores",
-                     title="Emotion Distribution",
-                     color="labels",
-                     height=300
-                 )
-
-         analyze_btn.click(
-             fn=recognizer.process_audio,
-             inputs=audio_input,
-             outputs=[output_text, output_plot]
-         )
-
-     return app
-
+ import numpy as np
+ import os
+ import tempfile
+ import soundfile as sf
+ from collections import Counter
+ from speechbrain.inference.interfaces import foreign_class
+
+ # Load the pre-trained SpeechBrain classifier (Emotion Recognition with wav2vec2 on IEMOCAP)
+ classifier = foreign_class(
+     source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
+     pymodule_file="custom_interface.py",
+     classname="CustomEncoderWav2vec2Classifier",
+     run_opts={"device": "cpu"}  # Change to {"device": "cuda"} if a GPU is available
+ )
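+ # classify_file returns a 4-tuple (out_prob, score, index, text_lab);
+ # only the final text label is used below.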
+
+ # Try to import noisereduce (if not available, noise reduction will be skipped)
+ try:
+     import noisereduce as nr
+     NOISEREDUCE_AVAILABLE = True
+ except ImportError:
+     NOISEREDUCE_AVAILABLE = False
+
+
+ def preprocess_audio(audio_file, apply_noise_reduction=False):
+     """
+     Load and preprocess the audio file:
+     - Convert to 16 kHz mono.
+     - Optionally apply noise reduction.
+     - Normalize the audio.
+     The processed audio is saved to a temporary file and its path is returned.
+     """
+     # Load audio (resampled to 16 kHz and in mono)
+     y, sr = librosa.load(audio_file, sr=16000, mono=True)
+
+     # Apply noise reduction if requested and available
+     if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
+         y = nr.reduce_noise(y=y, sr=sr)
+
+     # Normalize the audio (scale to -1 to 1); skip silent clips to avoid division by zero
+     if np.max(np.abs(y)) > 0:
+         y = y / np.max(np.abs(y))
+
+     # Write the preprocessed audio to a temporary WAV file
+     # (classify_file expects a file path rather than an in-memory array)
+     temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+     sf.write(temp_file.name, y, sr)
+     return temp_file.name
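+ # Example (hypothetical file name): preprocess_audio("clip.wav") returns a
+ # path such as "/tmp/tmpab12cd.wav"; callers delete the file once it has
+ # been classified (see the os.remove calls below).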
+
+
+ def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
+     """
+     For audio files longer than a given segment duration, split the file into overlapping segments,
+     predict the emotion for each segment, and then return the majority-voted label.
+     """
+     # Load audio
+     y, sr = librosa.load(audio_file, sr=16000, mono=True)
+     total_duration = librosa.get_duration(y=y, sr=sr)
+
+     # If the audio is short, just process it directly
+     if total_duration <= segment_duration:
+         temp_file = preprocess_audio(audio_file, apply_noise_reduction)
+         _, _, _, label = classifier.classify_file(temp_file)
+         os.remove(temp_file)
+         return label
+
+     # Split the audio into overlapping segments
+     step = segment_duration - overlap
+     segments = []
+     for start in np.arange(0, total_duration - segment_duration + 0.001, step):
+         start_sample = int(start * sr)
+         end_sample = int((start + segment_duration) * sr)
+         segment_audio = y[start_sample:end_sample]
+         # Save the segment as a temporary file
+         temp_seg = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+         sf.write(temp_seg.name, segment_audio, sr)
+         segments.append(temp_seg.name)
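+     # Example: a 10 s clip with segment_duration=3.0 and overlap=1.0 gives
+     # starts 0, 2, 4, 6 s (segments 0-3, 2-5, 4-7, 6-9 s); any tail shorter
+     # than a full segment is not classified.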
+
+     # Process each segment and collect predictions
+     predictions = []
+     for seg in segments:
+         temp_file = preprocess_audio(seg, apply_noise_reduction)
+         _, _, _, label = classifier.classify_file(temp_file)
+         # classify_file may return the text label as a one-element list;
+         # unwrap it so the votes are hashable strings for Counter
+         if isinstance(label, (list, tuple)):
+             label = label[0]
+         predictions.append(label)
+         os.remove(temp_file)
+         os.remove(seg)
+
+     # Determine the final label via majority vote
+     vote = Counter(predictions)
+     most_common = vote.most_common(1)[0][0]
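+     # Example: ['neu', 'ang', 'neu'] -> Counter({'neu': 2, 'ang': 1}) -> 'neu';
+     # ties are broken by first appearance in the predictions list.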
+     return most_common
+
+
+ def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False):
+     """
+     Main prediction function.
+     - If use_ensemble is True, the audio is split into segments and ensemble prediction is used.
+     - Otherwise, the audio is processed as a whole.
+     """
+     try:
+         if use_ensemble:
+             label = ensemble_prediction(audio_file, apply_noise_reduction)
+         else:
+             temp_file = preprocess_audio(audio_file, apply_noise_reduction)
+             _, _, _, label = classifier.classify_file(temp_file)
+             os.remove(temp_file)
+         # Unwrap a one-element list label into a plain string for display
+         if isinstance(label, (list, tuple)):
+             label = label[0]
+         return label
+     except Exception as e:
+         return f"Error processing file: {str(e)}"
+
+
+ # Define the Gradio interface with additional options for ensemble prediction and noise reduction
+ iface = gr.Interface(
+     fn=predict_emotion,
+     inputs=[
+         gr.Audio(type="filepath", label="Upload Audio"),
+         gr.Checkbox(label="Use Ensemble Prediction (for long audio)", value=False),
+         gr.Checkbox(label="Apply Noise Reduction", value=False)
+     ],
+     outputs="text",
+     title="Enhanced Emotion Recognition",
+     description=(
+         "Upload an audio file (expected 16kHz, mono) and the model will predict the emotion "
+         "using a wav2vec2 model fine-tuned on IEMOCAP data.\n\n"
+         "Options:\n"
+         " - Use Ensemble Prediction: For long audio, the file is split into segments and predictions are aggregated.\n"
+         " - Apply Noise Reduction: Applies a noise reduction filter before classification (requires noisereduce library)."
+     )
+ )
+
  if __name__ == "__main__":
-     demo = create_interface()
-     demo.launch()
+     iface.launch()
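
To sanity-check the new pipeline outside the Gradio UI, a minimal sketch (assuming gradio, librosa, soundfile, speechbrain, and optionally noisereduce are installed; "sample.wav" is a hypothetical path to any short speech clip, and importing app downloads the model on first run):

# Minimal smoke test for the updated app.py; "sample.wav" is a stand-in path
from app import predict_emotion

print(predict_emotion("sample.wav"))                      # whole-clip prediction
print(predict_emotion("sample.wav", use_ensemble=True))   # segmented majority vote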