import os
import sys

import librosa
import numpy as np
import torch

from model.emotion_classifier import EmotionClassifier
from utils.preprocessing import collate_fn
from config import DEVICE, NUM_LABELS, BEST_MODEL_NAME

# MFCC feature dimension used at training time (n_mfcc=40)
feature_dim = 40

# Rebuild the classifier and load the best checkpoint for inference
model = EmotionClassifier(feature_dim, NUM_LABELS).to(DEVICE)
model.load_state_dict(torch.load(BEST_MODEL_NAME, map_location=DEVICE))
model.eval()

# Class index -> emotion label (French dataset labels: anger, neutral, joy)
LABELS = {0: "colère", 1: "neutre", 2: "joie"}


def predict_emotion(audio_path, max_length=128):
    """Predict the emotion of a single audio file.

    Returns the predicted label and a dict of per-class probabilities.
    """
    # Load the audio at the 16 kHz sample rate used during training
    y, sr = librosa.load(audio_path, sr=16000)

    # Extract 40 MFCC coefficients, shape (n_mfcc, n_frames)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)

    # Truncate or zero-pad along the time axis to a fixed number of frames
    if mfcc.shape[1] > max_length:
        mfcc = mfcc[:, :max_length]
    else:
        pad_width = max_length - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')

    # (time, features) layout with a batch dimension: (1, max_length, n_mfcc)
    input_tensor = torch.tensor(mfcc.T, dtype=torch.float32).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        logits = model(input_tensor)
        probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy().flatten()
        predicted_class = torch.argmax(logits, dim=-1).item()

    probabilities_dict = {LABELS[i]: float(probabilities[i]) for i in range(NUM_LABELS)}

    return LABELS[predicted_class], probabilities_dict
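

# Illustrative helper, not part of the original script: walks a directory tree and runs
# predict_emotion on every .wav file found. The layout (one sub-folder per emotion,
# e.g. data/colere/) is an assumption based on the demo path used in __main__ below.
def predict_directory(root_dir):
    """Yield (relative_path, predicted_label, probabilities) for each .wav under root_dir."""
    for dirpath, _, filenames in os.walk(root_dir):
        for name in sorted(filenames):
            if not name.lower().endswith(".wav"):
                continue
            path = os.path.join(dirpath, name)
            label, probs = predict_emotion(path)
            yield os.path.relpath(path, root_dir), label, probs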


if __name__ == "__main__":
    # Demo file: an "anger" (colère) sample from the project's data folder
    base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "data"))
    audio_file = os.path.join(base_path, "colere", "c1ac.wav")
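
    # Illustrative convenience, not in the original script: allow overriding the demo
    # file with the first command-line argument instead of the hard-coded sample.
    if len(sys.argv) > 1:
        audio_file = sys.argv[1]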

    predicted_emotion, probabilities = predict_emotion(audio_file)

    print(f"🎤 Predicted emotion: {predicted_emotion}")
    print(f"📊 Per-class probabilities: {probabilities}")