import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import numpy as np
from sklearn.preprocessing import LabelEncoder

class CNN1DLSTMAudioClassifier(nn.Module):
    def __init__(self, num_classes, input_channels=1, sample_rate=16000, n_fft=400, hop_length=160):
        super(CNN1DLSTMAudioClassifier, self).__init__()

        # 1D convolutional feature extractor: each block halves the time dimension
        self.conv1 = nn.Conv1d(input_channels, 8, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(8)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(16)
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=5, stride=1, padding=2)
        self.bn3 = nn.BatchNorm1d(32)
        self.pool3 = nn.MaxPool1d(kernel_size=2)

        self._to_linear = None
        self._calculate_to_linear(input_channels, sample_rate, n_fft, hop_length)

        # Bidirectional LSTM over the convolutional feature sequence
        self.lstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=3, batch_first=True, bidirectional=True)

        # Classification head (128 = 2 * 64 from the bidirectional LSTM)
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

        self.dropout = nn.Dropout(0.2)

    def _calculate_to_linear(self, input_channels, sample_rate, n_fft, hop_length):
        # Pass a dummy input through the conv stack to record its output channel count
        num_frames = (sample_rate - n_fft) // hop_length + 1
        x = torch.randn(1, input_channels, num_frames)
        x = self.convs(x)
        self._to_linear = x.shape[1]

    def convs(self, x):
        x = self.pool1(self.bn1(F.relu(self.conv1(x))))
        x = self.pool2(self.bn2(F.relu(self.conv2(x))))
        x = self.pool3(self.bn3(F.relu(self.conv3(x))))
        return x

    def forward(self, x):
        # Treat the flattened mel spectrogram as a single-channel 1D signal
        x = x.view(x.size(0), 1, -1)
        x = self.convs(x)

        # (batch, channels, time) -> (batch, time, channels) for the LSTM
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # keep the last time step of the bidirectional LSTM output

        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.fc3(x)  # project to num_classes logits

        return x


num_class = 6
device = torch.device('cpu')
state_dict = torch.load('best-model-emotion-recognition.bin', map_location=device)
model = CNN1DLSTMAudioClassifier(num_class)
model.load_state_dict(state_dict)
model.eval()
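
# The checkpoint is loaded onto CPU; the network takes the flattened 12288-value
# mel spectrogram produced by preprocess_single_audio() below and returns one
# logit per emotion class.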


def preprocess_single_audio(file_path, sample_rate=16000, n_mels=128, n_fft=2048, hop_length=512):
    waveform, sr = torchaudio.load(file_path)

    # Downmix to mono; the model expects a single-channel input
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to the target sample rate if necessary
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)

    # Trim or zero-pad to exactly 2 seconds of audio
    target_length = 2 * sample_rate
    if waveform.size(1) > target_length:
        waveform = waveform[:, :target_length]
    else:
        waveform = torch.nn.functional.pad(waveform, (0, target_length - waveform.size(1)))

    # Mel spectrogram, normalized with the statistics used during training
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length
    )
    mel_spectrogram = mel_transform(waveform)

    mean = 12.65
    std = 117.07
    normalized_mel_spectrogram = (mel_spectrogram - mean) / std

    # Flatten and force a fixed feature length of 12288 values, as expected by the model
    flattened = normalized_mel_spectrogram.flatten()
    if flattened.shape[0] < 12288:
        flattened = torch.nn.functional.pad(flattened, (0, 12288 - flattened.shape[0]))
    elif flattened.shape[0] > 12288:
        flattened = flattened[:12288]

    return flattened
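
# Quick sanity check (hypothetical file path):
#   features = preprocess_single_audio("example.wav")
#   features.shape  # torch.Size([12288]) regardless of the clip's length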


def decode_emotion_prediction(prediction_tensor, label_encoder):
    """
    Decodes the prediction tensor into an emotion label.

    Args:
        prediction_tensor (torch.Tensor): The model's output tensor of shape [1, 6]
        label_encoder (LabelEncoder): The LabelEncoder used during training

    Returns:
        str: The predicted emotion label
        float: The confidence score for the prediction
    """
    # Index of the highest logit and its softmax probability as a confidence score
    max_index = torch.argmax(prediction_tensor, dim=1).item()
    confidence = torch.softmax(prediction_tensor, dim=1)[0, max_index].item()

    # Map the class index back to its emotion label
    predicted_emotion = label_encoder.inverse_transform([max_index])[0]

    return predicted_emotion, confidence


def predict(wave):
    # Gradio's live mode can fire before a recording exists; skip empty input
    if wave is None:
        return ""

    wave = preprocess_single_audio(wave)

    # Label encoder with the emotion classes in the order used during training
    le = LabelEncoder()
    le.classes_ = np.array(['Angry', 'Disgusting', 'Fear', 'Happy', 'Neutral', 'Sad'])

    wave = wave.unsqueeze(0)  # add a batch dimension
    with torch.no_grad():
        prediction = model(wave)

    predicted_emotion, confidence = decode_emotion_prediction(prediction, le)
    return f"Predicted emotion: {predicted_emotion} (Confidence: {confidence:.2f})"


iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    live=True,
    title="Speech Emotion Recognition",
    description="Record your voice and get the predicted emotion."
)

iface.launch()