Spaces:

Boltz79
/

Sentiment-Analysis

Sleeping

App Files Files Community

Boltz79 committed on Feb 8

Commit

7539cee

verified ·

1 Parent(s): 904a0dd

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -92

app.py CHANGED Viewed

@@ -1,101 +1,129 @@
 import gradio as gr
-import numpy as np
-import torch
-from transformers import pipeline
 import librosa
-class EmotionRecognizer:
-    """
-    Wraps a transformers "audio-classification" pipeline (superb/wav2vec2-base-superb-er)
-    and converts a Gradio audio tuple into per-label emotion scores.
-    """
-    def __init__(self):
-        """Create the pipeline on device 0 when CUDA is available, otherwise on device -1."""
-        self.device = 0 if torch.cuda.is_available() else -1
-        self.model = pipeline(
-            "audio-classification",
-            model="superb/wav2vec2-base-superb-er",
-            device=self.device
-        )
-        self.target_sr = 16000  # Model's required sample rate
-        self.max_duration = 6   # Optimal duration for this model
-    def process_audio(self, audio):
-        """
-        Classify one audio clip.
-        `audio` is unpacked as (sample_rate, audio_array). Returns (output_text, plot_data),
-        where output_text has one "label: score%" line per result and plot_data is a dict
-        with "labels" and "scores" lists. Any exception is caught and reported as
-        ("Error: <message>", None) instead of being raised.
-        """
-        try:
-            # Handle Gradio audio input (sample_rate, audio_array)
-            sample_rate, audio_array = audio
-            # Convert stereo to mono if needed
-            # (assumes channels are on axis 1 of a 2-D array - TODO confirm)
-            if len(audio_array.shape) > 1:
-                audio_array = np.mean(audio_array, axis=1)
-            # Convert to float32 and normalize
-            audio_array = audio_array.astype(np.float32)
-            # NOTE(review): an all-zero clip makes this divisor 0; there is no guard here.
-            audio_array /= np.max(np.abs(audio_array))
-            # Resample if necessary
-            if sample_rate != self.target_sr:
-                audio_array = librosa.resample(
-                    audio_array,
-                    orig_sr=sample_rate,
-                    target_sr=self.target_sr
-                )
-            # Trim to max duration (keeps only the first max_duration seconds)
-            max_samples = self.max_duration * self.target_sr
-            if len(audio_array) > max_samples:
-                audio_array = audio_array[:max_samples]
-            # Run inference
-            results = self.model({
-                "array": audio_array,
-                "sampling_rate": self.target_sr
-            })
-            # Format output: scores are multiplied by 100 for display as percentages
-            output_text = "\n".join(
-                [f"{res['label']}: {res['score']*100:.1f}%"
-                 for res in results]
-            )
-            plot_data = {
-                "labels": [res["label"] for res in results],
-                "scores": [res["score"]*100 for res in results]
-            }
-            return output_text, plot_data
-        except Exception as e:
-            # Errors are returned as text so they land in the first output slot.
-            return f"Error: {str(e)}", None
-def create_interface():
-    """
-    Build and return the Gradio Blocks app: an audio input with an "Analyze Emotion" button
-    in one column, and a results textbox plus a bar plot in the other.
-    """
-    # A single recognizer instance is created here and shared by every button click.
-    recognizer = EmotionRecognizer()
-    with gr.Blocks(title="Voice Emotion Analysis") as app:
-        gr.Markdown("# 🎤 Real-time Voice Emotion Analysis")
-        gr.Markdown("Record or upload short audio clips (3-6 seconds)")
-        with gr.Row():
-            with gr.Column():
-                # type="numpy" matches the (sample_rate, audio_array) tuple process_audio unpacks.
-                audio_input = gr.Audio(
-                    sources=["microphone", "upload"],
-                    type="numpy",
-                    label="Input Audio"
-                )
-                analyze_btn = gr.Button("Analyze Emotion", variant="primary")
-            with gr.Column():
-                output_text = gr.Textbox(label="Emotion Results", lines=4)
-                # The x/y names match the "labels"/"scores" keys that process_audio returns.
-                output_plot = gr.BarPlot(
-                    x="labels",
-                    y="scores",
-                    title="Emotion Distribution",
-                    color="labels",
-                    height=300
-                )
-        # The two return values of process_audio fill output_text and output_plot, in that order.
-        analyze_btn.click(
-            fn=recognizer.process_audio,
-            inputs=audio_input,
-            outputs=[output_text, output_plot]
-        )
-    return app
 if __name__ == "__main__":
-    # Build the UI and start the Gradio server only when run as a script.
-    demo = create_interface()
-    demo.launch()

 import gradio as gr
 import librosa
+import numpy as np
+import os
+import tempfile
+from collections import Counter
+from speechbrain.inference.interfaces import foreign_class
+# Load the pre-trained SpeechBrain classifier (Emotion Recognition with wav2vec2 on IEMOCAP).
+# This runs once at import time; the functions below all use this single `classifier` object.
+classifier = foreign_class(
+    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
+    pymodule_file="custom_interface.py",
+    classname="CustomEncoderWav2vec2Classifier",
+    run_opts={"device": "cpu"}  # Change to {"device": "cuda"} if GPU is available
+)
+# Try to import noisereduce (if not available, noise reduction will be skipped).
+# NOISEREDUCE_AVAILABLE is the flag preprocess_audio checks before calling nr.reduce_noise.
+try:
+    import noisereduce as nr
+    NOISEREDUCE_AVAILABLE = True
+except ImportError:
+    NOISEREDUCE_AVAILABLE = False
+def preprocess_audio(audio_file, apply_noise_reduction=False):
+    """
+    Load an audio file, clean it up, and write the result to a temporary WAV file.
+    Steps:
+      - Decode with librosa as 16kHz mono.
+      - Optionally apply noisereduce (skipped when the flag is off or the library is missing).
+      - Peak-normalize so the largest absolute sample becomes 1.0; silent or empty audio
+        is left unchanged.
+    Returns the path of the temporary WAV file. The file is created with delete=False,
+    so the caller is responsible for removing it.
+    """
+    import soundfile as sf
+    # Load audio (resampled to 16kHz and in mono)
+    y, sr = librosa.load(audio_file, sr=16000, mono=True)
+    # Apply noise reduction if requested and available
+    if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
+        y = nr.reduce_noise(y=y, sr=sr)
+    # np.max raises on a zero-length array, so check the size before taking the peak;
+    # the peak is computed once and reused for the division.
+    peak = np.max(np.abs(y)) if y.size else 0.0
+    if peak > 0:
+        y = y / peak
+    # Reserve a unique path and close the handle immediately: the original kept the handle
+    # open for the life of the process, and an open handle can block the write on Windows.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+        temp_path = temp_file.name
+    try:
+        sf.write(temp_path, y, sr)
+    except Exception:
+        # Do not leave an orphaned temp file behind when the write fails.
+        os.remove(temp_path)
+        raise
+    return temp_path
+def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
+    """
+    Predict one emotion label for a recording by majority vote over overlapping segments.
+    Audio no longer than segment_duration seconds is preprocessed and classified in one pass.
+    Longer audio is cut into segment_duration-second segments whose start times are
+    (segment_duration - overlap) seconds apart; each segment is preprocessed and classified,
+    and the most frequent label is returned. Audio after the last full segment is not used.
+    Raises ValueError for long audio when overlap is not smaller than segment_duration,
+    because the segment window would never advance.
+    """
+    import soundfile as sf
+    # Load audio
+    y, sr = librosa.load(audio_file, sr=16000, mono=True)
+    total_duration = librosa.get_duration(y=y, sr=sr)
+    # If the audio is short, just process it directly
+    if total_duration <= segment_duration:
+        temp_file = preprocess_audio(audio_file, apply_noise_reduction)
+        try:
+            _, _, _, label = classifier.classify_file(temp_file)
+        finally:
+            # Remove the temp file even when classification fails.
+            os.remove(temp_file)
+        return label
+    step = segment_duration - overlap
+    if step <= 0:
+        # A zero step makes np.arange fail and a negative step yields no segments at all.
+        raise ValueError("overlap must be smaller than segment_duration")
+    votes = Counter()
+    first_label = {}
+    # Write, classify and delete one segment at a time so that a failure part-way through
+    # cannot leave the remaining segment files on disk.
+    for start in np.arange(0, total_duration - segment_duration + 0.001, step):
+        start_sample = int(start * sr)
+        end_sample = int((start + segment_duration) * sr)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
+            seg_path = handle.name
+        temp_file = None
+        try:
+            sf.write(seg_path, y[start_sample:end_sample], sr)
+            temp_file = preprocess_audio(seg_path, apply_noise_reduction)
+            _, _, _, label = classifier.classify_file(temp_file)
+        finally:
+            os.remove(seg_path)
+            if temp_file is not None:
+                os.remove(temp_file)
+        # NOTE(review): the SpeechBrain interface appears to return the text label as a list
+        # (e.g. ['hap']), which Counter cannot hash - confirm against custom_interface.py.
+        # Vote on a hashable key and hand back the label object exactly as the classifier gave it.
+        key = tuple(label) if isinstance(label, list) else label
+        votes[key] += 1
+        first_label.setdefault(key, label)
+    # Determine the final label via majority vote
+    winner = votes.most_common(1)[0][0]
+    return first_label[winner]
+def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False):
+    """
+    Main prediction function, used as the Gradio callback.
+      - If use_ensemble is True, the audio is split into segments and ensemble prediction is used.
+      - Otherwise, the audio is preprocessed and classified as a whole.
+    Returns the predicted label. Any exception is caught and returned as the string
+    "Error processing file: <message>" so the failure shows up in the text output.
+    """
+    try:
+        if use_ensemble:
+            return ensemble_prediction(audio_file, apply_noise_reduction)
+        temp_file = preprocess_audio(audio_file, apply_noise_reduction)
+        try:
+            _, _, _, label = classifier.classify_file(temp_file)
+        finally:
+            # Remove the temp file even when classification fails; the original skipped
+            # this cleanup whenever classify_file raised.
+            os.remove(temp_file)
+        return label
+    except Exception as e:
+        return f"Error processing file: {str(e)}"
+# Define the Gradio interface with additional options for ensemble prediction and noise reduction.
+# The three inputs are listed in the same order as predict_emotion's parameters
+# (audio_file, use_ensemble, apply_noise_reduction).
+iface = gr.Interface(
+    fn=predict_emotion,
+    inputs=[
+        # type="filepath" supplies a path, which preprocess_audio passes to librosa.load.
+        gr.Audio(type="filepath", label="Upload Audio"),
+        gr.Checkbox(label="Use Ensemble Prediction (for long audio)", value=False),
+        gr.Checkbox(label="Apply Noise Reduction", value=False)
+    ],
+    # predict_emotion returns either the label or an "Error processing file: ..." string.
+    outputs="text",
+    title="Enhanced Emotion Recognition",
+    description=(
+        "Upload an audio file (expected 16kHz, mono) and the model will predict the emotion "
+        "using a wav2vec2 model fine-tuned on IEMOCAP data.\n\n"
+        "Options:\n"
+        " - Use Ensemble Prediction: For long audio, the file is split into segments and predictions are aggregated.\n"
+        " - Apply Noise Reduction: Applies a noise reduction filter before classification (requires noisereduce library)."
+    )
+)
 if __name__ == "__main__":
+    # Start the Gradio server only when this file is run as a script.
+    iface.launch()