import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import numpy as np
from sklearn.preprocessing import LabelEncoder

class CNN1DLSTMAudioClassifier(nn.Module):
    def __init__(self, num_classes, input_channels=1, sample_rate=16000, n_fft=400, hop_length=160):
        super(CNN1DLSTMAudioClassifier, self).__init__()

        # 1D convolutional feature extractor: each block halves the time dimension
        self.conv1 = nn.Conv1d(input_channels, 8, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(8)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(16)
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=5, stride=1, padding=2)
        self.bn3 = nn.BatchNorm1d(32)
        self.pool3 = nn.MaxPool1d(kernel_size=2)

        self._to_linear = None
        self._calculate_to_linear(input_channels, sample_rate, n_fft, hop_length)

        # Bidirectional LSTM over the convolutional feature sequence
        self.lstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=3, batch_first=True, bidirectional=True)

        # Classification head (128 = 2 * 64 from the bidirectional LSTM)
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

        self.dropout = nn.Dropout(0.2)

    def _calculate_to_linear(self, input_channels, sample_rate, n_fft, hop_length):
        # Pass a dummy input through the conv stack to record its output channel count
        num_frames = (sample_rate - n_fft) // hop_length + 1
        x = torch.randn(1, input_channels, num_frames)
        x = self.convs(x)
        self._to_linear = x.shape[1]

    def convs(self, x):
        x = self.pool1(self.bn1(F.relu(self.conv1(x))))
        x = self.pool2(self.bn2(F.relu(self.conv2(x))))
        x = self.pool3(self.bn3(F.relu(self.conv3(x))))
        return x

    def forward(self, x):
        # Treat the flattened mel spectrogram as a single-channel 1D signal
        x = x.view(x.size(0), 1, -1)
        x = self.convs(x)

        # (batch, channels, time) -> (batch, time, channels) for the LSTM
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # keep the last time step of the bidirectional LSTM output

        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.fc3(x)  # project to num_classes logits

        return x


num_class = 6
device = torch.device('cpu')
state_dict = torch.load('best-model-emotion-recognition.bin', map_location=device)
model = CNN1DLSTMAudioClassifier(num_class)
model.load_state_dict(state_dict)
model.eval()
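
# The checkpoint is loaded onto CPU; the network takes the flattened 12288-value
# mel spectrogram produced by preprocess_single_audio() below and returns one
# logit per emotion class.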


def preprocess_single_audio(file_path, sample_rate=16000, n_mels=128, n_fft=2048, hop_length=512):
    waveform, sr = torchaudio.load(file_path)

    # Downmix to mono; the model expects a single-channel input
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to the target sample rate if necessary
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)

    # Trim or zero-pad to exactly 2 seconds of audio
    target_length = 2 * sample_rate
    if waveform.size(1) > target_length:
        waveform = waveform[:, :target_length]
    else:
        waveform = torch.nn.functional.pad(waveform, (0, target_length - waveform.size(1)))

    # Mel spectrogram, normalized with the statistics used during training
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length
    )
    mel_spectrogram = mel_transform(waveform)

    mean = 12.65
    std = 117.07
    normalized_mel_spectrogram = (mel_spectrogram - mean) / std

    # Flatten and force a fixed feature length of 12288 values, as expected by the model
    flattened = normalized_mel_spectrogram.flatten()
    if flattened.shape[0] < 12288:
        flattened = torch.nn.functional.pad(flattened, (0, 12288 - flattened.shape[0]))
    elif flattened.shape[0] > 12288:
        flattened = flattened[:12288]

    return flattened
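
# Quick sanity check (hypothetical file path):
#   features = preprocess_single_audio("example.wav")
#   features.shape  # torch.Size([12288]) regardless of the clip's length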


def decode_emotion_prediction(prediction_tensor, label_encoder):
    """
    Decodes the prediction tensor into an emotion label.

    Args:
        prediction_tensor (torch.Tensor): The model's output tensor of shape [1, 6]
        label_encoder (LabelEncoder): The LabelEncoder used during training

    Returns:
        str: The predicted emotion label
        float: The confidence score for the prediction
    """
    # Index of the highest logit and its softmax probability as a confidence score
    max_index = torch.argmax(prediction_tensor, dim=1).item()
    confidence = torch.softmax(prediction_tensor, dim=1)[0, max_index].item()

    # Map the class index back to its emotion label
    predicted_emotion = label_encoder.inverse_transform([max_index])[0]

    return predicted_emotion, confidence


def predict(wave):
    # Gradio's live mode can fire before a recording exists; skip empty input
    if wave is None:
        return ""

    wave = preprocess_single_audio(wave)

    # Label encoder with the emotion classes in the order used during training
    le = LabelEncoder()
    le.classes_ = np.array(['Angry', 'Disgusting', 'Fear', 'Happy', 'Neutral', 'Sad'])

    wave = wave.unsqueeze(0)  # add a batch dimension
    with torch.no_grad():
        prediction = model(wave)

    predicted_emotion, confidence = decode_emotion_prediction(prediction, le)
    return f"Predicted emotion: {predicted_emotion} (Confidence: {confidence:.2f})"


iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    live=True,
    title="Speech Emotion Recognition",
    description="Record your voice and get the predicted emotion."
)

iface.launch()