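"""Gradio demo for speech emotion recognition.

Uses a SpeechBrain wav2vec2 classifier fine-tuned on IEMOCAP, with optional
noise reduction and segment-level ensemble prediction for long recordings.
"""
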
import gradio as gr
import librosa
import numpy as np
import os
import tempfile
import soundfile as sf
from collections import Counter
from speechbrain.inference.interfaces import foreign_class

# Load the pre-trained SpeechBrain classifier (Emotion Recognition with wav2vec2 on IEMOCAP)
classifier = foreign_class(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    run_opts={"device": "cpu"}  # Change to {"device": "cuda"} if GPU is available
)
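# Note: per the model card, classify_file returns (out_prob, score, index, text_lab),
# where text_lab is the predicted emotion label; only that label is used below.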

# Try to import noisereduce (if not available, noise reduction will be skipped)
try:
    import noisereduce as nr
    NOISEREDUCE_AVAILABLE = True
except ImportError:
    NOISEREDUCE_AVAILABLE = False
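# (noisereduce can be installed with `pip install noisereduce`)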

def preprocess_audio(audio_file, apply_noise_reduction=False):
    """
    Load and preprocess the audio file:
      - Convert to 16kHz mono.
      - Optionally apply noise reduction.
      - Normalize the audio.
    The processed audio is saved to a temporary file and its path is returned.
    """
    # Load audio (resampled to 16kHz and in mono)
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    
    # Apply noise reduction if requested and available
    if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
        y = nr.reduce_noise(y=y, sr=sr)
    
    # Normalize the audio (scale to -1 to 1)
    if np.max(np.abs(y)) > 0:
        y = y / np.max(np.abs(y))
    
    # Write the preprocessed audio to a temporary WAV file
    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    temp_file.close()  # close the handle so soundfile can write to the path on Windows
    sf.write(temp_file.name, y, sr)
    return temp_file.name
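
# Quick sanity check (hypothetical file name):
#   path = preprocess_audio("speech.wav", apply_noise_reduction=True)
#   y, sr = librosa.load(path, sr=None)  # expect sr == 16000 and max(abs(y)) <= 1.0
#   os.remove(path)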

def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
    """
    For audio files longer than a given segment duration, split the file into overlapping segments,
    predict the emotion for each segment, and then return the majority-voted label.
    """
    # Load audio
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    total_duration = librosa.get_duration(y=y, sr=sr)
    
    # If the audio is short, just process it directly
    if total_duration <= segment_duration:
        temp_file = preprocess_audio(audio_file, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        os.remove(temp_file)
        return label
    
    # Split the audio into overlapping segments; any tail shorter than a full segment is dropped
    step = segment_duration - overlap
    segments = []
    for start in np.arange(0, total_duration - segment_duration + 0.001, step):
        start_sample = int(start * sr)
        end_sample = int((start + segment_duration) * sr)
        segment_audio = y[start_sample:end_sample]
        # Save the segment as a temporary WAV file
        temp_seg = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        temp_seg.close()  # close the handle so soundfile can write to the path on Windows
        sf.write(temp_seg.name, segment_audio, sr)
        segments.append(temp_seg.name)
    
    # Process each segment and collect predictions
    predictions = []
    for seg in segments:
        temp_file = preprocess_audio(seg, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        # classify_file returns the text label as a list; unwrap it so the votes are hashable
        predictions.append(label[0] if isinstance(label, (list, tuple)) else label)
        os.remove(temp_file)
        os.remove(seg)
    
    # Determine the final label via majority vote
    vote = Counter(predictions)
    most_common = vote.most_common(1)[0][0]
    return most_common
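
# Example with the defaults (3 s segments, 1 s overlap, i.e. a 2 s step): a 10 s clip
# produces windows starting at 0, 2, 4, and 6 s, and the final second past the last
# full window is dropped.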

def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False):
    """
    Main prediction function.
      - If use_ensemble is True, the audio is split into segments and ensemble prediction is used.
      - Otherwise, the audio is processed as a whole.
    """
    try:
        if use_ensemble:
            label = ensemble_prediction(audio_file, apply_noise_reduction)
        else:
            temp_file = preprocess_audio(audio_file, apply_noise_reduction)
            _, _, _, label = classifier.classify_file(temp_file)
            os.remove(temp_file)
        # Unwrap the list returned by classify_file so the UI shows a plain string
        return label[0] if isinstance(label, (list, tuple)) else label
    except Exception as e:
        return f"Error processing file: {str(e)}"

# Define the Gradio interface with additional options for ensemble prediction and noise reduction
iface = gr.Interface(
    fn=predict_emotion,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Checkbox(label="Use Ensemble Prediction (for long audio)", value=False),
        gr.Checkbox(label="Apply Noise Reduction", value=False)
    ],
    outputs="text",
    title="Enhanced Emotion Recognition",
    description=(
        "Upload an audio file (it will be resampled to 16 kHz mono) and the model will "
        "predict the emotion using a wav2vec2 model fine-tuned on IEMOCAP data.\n\n"
        "Options:\n"
        " - Use Ensemble Prediction: long audio is split into overlapping segments and the "
        "per-segment predictions are aggregated by majority vote.\n"
        " - Apply Noise Reduction: applies a noise-reduction filter before classification "
        "(requires the noisereduce library)."
    )
)

if __name__ == "__main__":
    iface.launch()
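    # Tip: iface.launch(share=True) also creates a temporary public link.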