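"""Gradio Space: speech emotion recognition.

Loads a wav2vec2-based audio classification model from the Hugging Face Hub
and serves it through a Gradio Blocks UI that reports per-emotion confidence
scores as text and as a bar plot.
"""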
import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import librosa
import os
import warnings

warnings.filterwarnings("ignore")

class EmotionRecognizer:
    def __init__(self):
        # Initialize the model and feature extractor
        self.model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
        self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(self.model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.sample_rate = 16000
        # Read the emotion labels from the model config so they match the order
        # and number of the model's output logits
        self.labels = [self.model.config.id2label[i] for i in range(self.model.config.num_labels)]
    def process_audio(self, audio):
        """Process audio and return emotions with confidence scores."""
        try:
            # Gradio's numpy audio format is a (sample_rate, data) tuple
            if isinstance(audio, tuple):
                sample_rate, audio_data = audio
            else:
                return "Error: Invalid audio format", None

            # Convert to mono float32 in [-1, 1] before resampling
            audio_data = audio_data.astype(np.float32)
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)
            if np.max(np.abs(audio_data)) > 1.0:
                audio_data = audio_data / 32768.0  # scale 16-bit PCM samples

            # Resample if necessary
            if sample_rate != self.sample_rate:
                audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=self.sample_rate)

            # Extract features
            inputs = self.feature_extractor(
                audio_data,
                sampling_rate=self.sample_rate,
                return_tensors="pt",
                padding=True
            ).to(self.device)

            # Get model predictions
            with torch.no_grad():
                outputs = self.model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

            # Process results
            scores = predictions[0].cpu().numpy()
            results = [
                {"label": label, "score": float(score)}
                for label, score in zip(self.labels, scores)
            ]

            # Sort by confidence
            results.sort(key=lambda x: x["score"], reverse=True)

            # Format results for display
            output_text = "Emotion Analysis Results:\n\n"
            output_text += "\n".join([
                f"{result['label'].title()}: {result['score'] * 100:.2f}%"
                for result in results
            ])

            # Prepare plot data
            plot_data = {
                "labels": [r["label"].title() for r in results],
                "values": [r["score"] * 100 for r in results]
            }

            return output_text, plot_data

        except Exception as e:
            return f"Error processing audio: {str(e)}", None
def create_interface():
    # Initialize the emotion recognizer
    recognizer = EmotionRecognizer()

    # Define processing function for Gradio
    def process_audio_file(audio):
        if audio is None:
            return "Please provide an audio input.", None
        output_text, plot_data = recognizer.process_audio(audio)
        if plot_data is not None:
            # Gradio 4.x: return a component instance (instead of the removed
            # .update() helper) and pass the data as a DataFrame
            return (
                output_text,
                gr.BarPlot(
                    value=pd.DataFrame(plot_data),
                    x="labels",
                    y="values",
                    title="Emotion Confidence Scores",
                    x_title="Emotions",
                    y_title="Confidence (%)"
                )
            )
        return output_text, None
    # Create the Gradio interface
    with gr.Blocks(title="Audio Emotion Recognition") as interface:
        gr.Markdown("# 🎭 Audio Emotion Recognition")
        gr.Markdown("""
        Upload an audio file or record directly to analyze the emotional content.
        The model will detect emotions like angry, happy, sad, neutral, and fearful.
        """)

        with gr.Row():
            with gr.Column():
                # Input audio component (numpy format)
                audio_input = gr.Audio(
                    label="Upload or Record Audio",
                    type="numpy",
                    sources=["microphone", "upload"]
                )
                # Process button
                process_btn = gr.Button("Analyze Emotion", variant="primary")

            with gr.Column():
                # Output components
                output_text = gr.Textbox(
                    label="Analysis Results",
                    lines=6
                )
                output_plot = gr.BarPlot(
                    title="Emotion Confidence Scores",
                    x_title="Emotions",
                    y_title="Confidence (%)"
                )

        # Set up event handler
        process_btn.click(
            fn=process_audio_file,
            inputs=[audio_input],
            outputs=[output_text, output_plot]
        )

        gr.Markdown("""
        ### Usage Instructions:
        1. Click the microphone button to record audio or upload an audio file
        2. Click "Analyze Emotion" to process the audio
        3. View the results and confidence scores

        ### Notes:
        - For best results, ensure clear audio with minimal background noise
        - Speak naturally and clearly when recording
        - The model works best with speech in English
        """)
    return interface


def main():
    # Create and launch the interface
    interface = create_interface()
    interface.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860
    )


if __name__ == "__main__":
    main()
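
# Dependencies assumed by this script (versions are not pinned in the source):
# gradio, transformers, torch, librosa, numpy, pandas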