# Gradio app for English speech emotion recognition, built on a wav2vec2
# audio-classification pipeline from the Hugging Face Hub.
import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import pipeline
import librosa
import soundfile as sf

class EmotionRecognizer:
    def __init__(self):
        # Speech emotion classifier; runs on GPU if available, otherwise CPU.
        self.classifier = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1
        )
        self.target_sr = 16000   # sampling rate expected by the model
        self.max_duration = 10   # seconds of audio to keep

    def process_audio(self, audio_path):
        try:
            # Load the file and collapse multi-channel recordings to mono.
            audio, orig_sr = sf.read(audio_path)
            if len(audio.shape) > 1:
                audio = np.mean(audio, axis=1)

            # Resample to the model's expected sampling rate.
            if orig_sr != self.target_sr:
                audio = librosa.resample(
                    y=audio.astype(np.float32),
                    orig_sr=orig_sr,
                    target_sr=self.target_sr
                )
            else:
                audio = audio.astype(np.float32)

            # Peak-normalize, then trim or zero-pad to a fixed length.
            audio = librosa.util.normalize(audio)
            max_samples = self.max_duration * self.target_sr
            if len(audio) > max_samples:
                audio = audio[:max_samples]
            else:
                audio = np.pad(audio, (0, max(0, max_samples - len(audio))))

            # Classify and format the per-emotion scores for display.
            results = self.classifier(
                {"array": audio, "sampling_rate": self.target_sr}
            )
            labels = [res["label"] for res in results]
            scores = [res["score"] * 100 for res in results]
            text_output = "\n".join(
                f"{label}: {score:.2f}%" for label, score in zip(labels, scores)
            )
            # gr.BarPlot expects a pandas DataFrame rather than a plain dict.
            plot_data = pd.DataFrame({"labels": labels, "values": scores})
            return text_output, plot_data
        except Exception as e:
            return f"Error processing audio: {str(e)}", None

def create_interface():
    recognizer = EmotionRecognizer()

    with gr.Blocks(title="Audio Emotion Recognition") as interface:
        gr.Markdown("# 🎙️ Audio Emotion Recognition")
        gr.Markdown("Record or upload English speech (3-10 seconds)")

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Input Audio"
                )
                submit_btn = gr.Button("Analyze", variant="primary")
            with gr.Column():
                text_output = gr.Textbox(label="Results", interactive=False)
                plot_output = gr.BarPlot(
                    label="Confidence Scores",
                    x="labels",
                    y="values",
                    color="labels",
                    height=300
                )

        submit_btn.click(
            fn=recognizer.process_audio,
            inputs=audio_input,
            outputs=[text_output, plot_output]
        )

    return interface

if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
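
# Likely runtime dependencies for this Space, inferred from the imports above
# (an assumption, e.g. for requirements.txt):
#   gradio, transformers, torch, librosa, soundfile, numpy, pandas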