import gradio as gr
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import numpy as np
from sklearn.preprocessing import LabelEncoder


class CNN1DLSTMAudioClassifier(nn.Module):
    def __init__(self, num_classes, input_channels=1, sample_rate=16000, n_fft=400, hop_length=160):
        super(CNN1DLSTMAudioClassifier, self).__init__()

        # 1D CNN layers
        self.conv1 = nn.Conv1d(input_channels, 8, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(8)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(16)
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=5, stride=1, padding=2)
        self.bn3 = nn.BatchNorm1d(32)
        self.pool3 = nn.MaxPool1d(kernel_size=2)

        # Calculate the output size of the last CNN layer
        self._to_linear = None
        self._calculate_to_linear(input_channels, sample_rate, n_fft, hop_length)

        # LSTM layers (bidirectional, so the output feature size is 2 * 64 = 128)
        self.lstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=3,
                            batch_first=True, bidirectional=True)

        # Fully connected layers
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

        # Dropout
        self.dropout = nn.Dropout(0.2)

    def _calculate_to_linear(self, input_channels, sample_rate, n_fft, hop_length):
        # Run a dummy input through the CNN stack to record the channel size fed to the LSTM
        num_frames = (sample_rate - n_fft) // hop_length + 1
        x = torch.randn(1, input_channels, num_frames)
        x = self.convs(x)
        self._to_linear = x.shape[1]

    def convs(self, x):
        x = self.pool1(self.bn1(F.relu(self.conv1(x))))
        x = self.pool2(self.bn2(F.relu(self.conv2(x))))
        x = self.pool3(self.bn3(F.relu(self.conv3(x))))
        return x

    def forward(self, x):
        x = x.view(x.size(0), 1, -1)
        x = self.convs(x)
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # keep only the last time step

        # Fully connected layers
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.fc3(x)  # project to num_classes logits
        return x


num_class = 6
device = torch.device('cpu')
state_dict = torch.load('best_model.pth', map_location=device)
model = CNN1DLSTMAudioClassifier(num_class)
model.load_state_dict(state_dict)
model.eval()


def preprocess_single_audio(file_path, sample_rate=16000, n_mels=128, n_fft=2048, hop_length=512):
    # Load the audio file
    waveform, sr = torchaudio.load(file_path)

    # Resample if necessary
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)

    # Ensure a consistent audio length (2 seconds)
    target_length = 2 * sample_rate
    if waveform.size(1) > target_length:
        waveform = waveform[:, :target_length]
    else:
        waveform = torch.nn.functional.pad(waveform, (0, target_length - waveform.size(1)))

    # Apply the Mel spectrogram transform
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length
    )
    mel_spectrogram = mel_transform(waveform)

    # Normalize (mean and std computed on the training data)
    mean = 12.65
    std = 117.07
    normalized_mel_spectrogram = (mel_spectrogram - mean) / std

    # Flatten the mel spectrogram and pad/truncate to a fixed length of 12288 values
    flattened = normalized_mel_spectrogram.flatten()
    if flattened.shape[0] < 12288:
        flattened = torch.nn.functional.pad(flattened, (0, 12288 - flattened.shape[0]))
    elif flattened.shape[0] > 12288:
        flattened = flattened[:12288]

    return flattened


def decode_emotion_prediction(prediction_tensor, label_encoder):
    """
    Decodes the prediction tensor into an emotion label.
    Args:
        prediction_tensor (torch.Tensor): The model's output tensor of shape [1, 6]
        label_encoder (LabelEncoder): The LabelEncoder used during training

    Returns:
        str: The predicted emotion label
        float: The confidence score for the prediction
    """
    # Get the index of the highest probability
    max_index = torch.argmax(prediction_tensor, dim=1).item()

    # Get the confidence score (probability) for the prediction
    confidence = torch.softmax(prediction_tensor, dim=1)[0, max_index].item()

    # Decode the index to get the emotion label
    predicted_emotion = label_encoder.inverse_transform([max_index])[0]

    return predicted_emotion, confidence


def predict(wave):
    if wave is None or wave == '':
        return "No audio input provided."
    try:
        wave = preprocess_single_audio(wave)

        # Rebuild the label encoder with the class order used during training
        le = LabelEncoder()
        le.classes_ = np.array(['Angry', 'Disgusting', 'Fear', 'Happy', 'Neutral', 'Sad'])

        wave = wave.unsqueeze(0)
        with torch.no_grad():
            prediction = model(wave)
        predicted_emotion, confidence = decode_emotion_prediction(prediction, le)
        return f"Your emotion is: {predicted_emotion} with {confidence * 100:.2f}% confidence."
    except Exception as e:
        return f"Error in processing audio: {str(e)}"


# Gradio Interface
article = """
### How It Works
- The model classifies the recorded speech into one of 6 emotions (Angry, Happy, Sad, Disgusting, Fear, Neutral).
- It returns the most likely emotion along with its confidence level.
- The model is built with a 1D CNN architecture combined with an LSTM architecture.
- Please use English when recording your voice.
"""

iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    live=True,
    title="Speech Emotion Recognition",
    description="Record your voice expressing an emotion and get the predicted emotion. The model only supports English; record about 2-3 seconds of audio.",
    article=article
)

iface.launch()
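
# Optional local sanity check (a minimal sketch): exercises the preprocessing and
# inference path without the Gradio UI. "sample_clip.wav" is a hypothetical
# ~2-second English recording, not a file shipped with this app. To try it,
# comment out iface.launch() above and uncomment the line below.
#
# print(predict("sample_clip.wav"))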