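"""Speech Emotion Recognition demo (app.py).

Loads a pretrained 1D-CNN + bidirectional LSTM classifier
('best-model-emotion-recognition.bin'), turns recorded audio into a normalized
mel-spectrogram feature vector, and serves predictions for six emotions
(Angry, Disgusting, Fear, Happy, Neutral, Sad) through a Gradio microphone
interface.
"""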
import gradio as gr
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import numpy as np
from sklearn.preprocessing import LabelEncoder

class CNN1DLSTMAudioClassifier(nn.Module):
    def __init__(self, num_classes, input_channels=1, sample_rate=16000, n_fft=400, hop_length=160):
        super(CNN1DLSTMAudioClassifier, self).__init__()
        # 1D CNN layers
        self.conv1 = nn.Conv1d(input_channels, 8, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(8)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(16)
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=5, stride=1, padding=2)
        self.bn3 = nn.BatchNorm1d(32)
        self.pool3 = nn.MaxPool1d(kernel_size=2)
        # Calculate the output size of the last CNN layer
        self._to_linear = None
        self._calculate_to_linear(input_channels, sample_rate, n_fft, hop_length)
        # LSTM layers
        self.lstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=3, batch_first=True, bidirectional=True)
        # Fully connected layers
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)
        # Dropout
        self.dropout = nn.Dropout(0.2)
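
    # Note: the bidirectional LSTM (hidden_size=64) emits 2 * 64 = 128 features
    # per time step, which is why fc1 expects 128 inputs.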

    def _calculate_to_linear(self, input_channels, sample_rate, n_fft, hop_length):
        # Calculate the size of the input to the LSTM layer by running a dummy
        # input through the conv stack
        num_frames = (sample_rate - n_fft) // hop_length + 1
        x = torch.randn(1, input_channels, num_frames)
        x = self.convs(x)
        # Channel dimension of the conv output (32), matching the LSTM input_size
        self._to_linear = x.shape[1]

    def convs(self, x):
        x = self.pool1(self.bn1(F.relu(self.conv1(x))))
        x = self.pool2(self.bn2(F.relu(self.conv2(x))))
        x = self.pool3(self.bn3(F.relu(self.conv3(x))))
        return x
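
    # Note: each MaxPool1d halves the time axis, so convs() downsamples the
    # input length by a factor of 8 before it reaches the LSTM.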

    def forward(self, x):
        x = x.view(x.size(0), 1, -1)  # (batch, channels=1, samples)
        x = self.convs(x)
        x = x.permute(0, 2, 1)  # (batch, time, features) for the LSTM
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # keep the last time step of the BiLSTM output
        # Fully connected layers
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.fc3(x)  # project down to one logit per emotion class
        return x
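
# Shape walk-through for the 12288-value inputs produced by
# preprocess_single_audio below (a sketch, derived from the layer definitions):
# (B, 12288) -> view (B, 1, 12288) -> convs (B, 32, 1536) -> permute (B, 1536, 32)
# -> BiLSTM (B, 1536, 128) -> last step (B, 128) -> fc1/fc2/fc3 -> (B, num_classes)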
num_class = 6
device = torch.device('cpu')
state_dict = torch.load('best-model-emotion-recognition.bin', map_location=device)
model = CNN1DLSTMAudioClassifier(num_class)
model.load_state_dict(state_dict)
model.eval()
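
# The checkpoint is loaded on CPU, and eval() disables dropout / batch-norm
# updates for inference. Optional sanity check (a minimal sketch):
#   with torch.no_grad():
#       assert model(torch.zeros(1, 12288)).shape == (1, num_class)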

def preprocess_single_audio(file_path, sample_rate=16000, n_mels=128, n_fft=2048, hop_length=512):
    # Load the audio file
    waveform, sr = torchaudio.load(file_path)

    # Resample if necessary
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)

    # Ensure consistent audio length (2 seconds)
    target_length = 2 * sample_rate
    if waveform.size(1) > target_length:
        waveform = waveform[:, :target_length]
    else:
        waveform = torch.nn.functional.pad(waveform, (0, target_length - waveform.size(1)))

    # Apply Mel Spectrogram transform
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length
    )
    mel_spectrogram = mel_transform(waveform)

    # Normalize (use the mean and std from your training data)
    mean = 12.65
    std = 117.07
    normalized_mel_spectrogram = (mel_spectrogram - mean) / std

    # Flatten the mel spectrogram
    flattened = normalized_mel_spectrogram.flatten()
    if flattened.shape[0] < 12288:
        flattened = torch.nn.functional.pad(flattened, (0, 12288 - flattened.shape[0]))
    elif flattened.shape[0] > 12288:
        flattened = flattened[:12288]
    return flattened
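
# The classifier expects a fixed-length vector of 12288 values (128 mel bins x
# 96 frames in the training setup, presumably), hence the pad/truncate step above.
# Example usage (hypothetical path):
#   feats = preprocess_single_audio("clip.wav")   # feats.shape == torch.Size([12288])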

def decode_emotion_prediction(prediction_tensor, label_encoder):
    """
    Decodes the prediction tensor into an emotion label.

    Args:
        prediction_tensor (torch.Tensor): The model's output tensor of shape [1, 6]
        label_encoder (LabelEncoder): The LabelEncoder used during training

    Returns:
        str: The predicted emotion label
        float: The confidence score for the prediction
    """
    # Get the index of the highest probability
    max_index = torch.argmax(prediction_tensor, dim=1).item()

    # Get the confidence score (probability) for the prediction
    confidence = torch.softmax(prediction_tensor, dim=1)[0, max_index].item()

    # Decode the index to get the emotion label
    predicted_emotion = label_encoder.inverse_transform([max_index])[0]

    return predicted_emotion, confidence
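
# Note: the model returns raw logits, so softmax is applied here to turn the
# winning logit into a probability-style confidence score in [0, 1].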
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)

def predict(wave):
    # With live=True the microphone callback can fire before a recording exists,
    # so skip empty inputs instead of crashing on torchaudio.load(None).
    if wave is None:
        return ""
    wave = preprocess_single_audio(wave)
    le = LabelEncoder()
    le.classes_ = np.array(['Angry', 'Disgusting', 'Fear', 'Happy', 'Neutral', 'Sad'])
    wave = wave.unsqueeze(0)
    with torch.no_grad():
        prediction = model(wave)
    predicted_emotion, confidence = decode_emotion_prediction(prediction, le)
    return f"Predicted emotion: {predicted_emotion} (Confidence: {confidence:.2f})"

# Gradio Interface
iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
    live=True,
    title="Speech Emotion Recognition",
    description="Record your voice and get the predicted emotion."
)

iface.launch()