"""Speech emotion recognition demo.

Records a short clip from the microphone, extracts MFCC features, and
predicts an emotion with a RandomForest classifier. The classifier below
is fit on random placeholder data, so retrain it on a real labeled
dataset before trusting its output.
"""

import os

import numpy as np
import librosa
import sounddevice as sd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

def extract_features(audio, sample_rate=16000):
    """Summarize a 1-D audio signal as 40 MFCCs averaged over time."""
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    # Average over time frames to get a fixed-length vector of 40 values.
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    return mfccs_scaled

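# A minimal sketch of how real training data could be assembled. It assumes
# a hypothetical directory layout data_dir/<emotion>/<clip>.wav; the layout
# and the name load_dataset are illustrative, not part of the original script.
def load_dataset(data_dir, sample_rate=16000):
    """Extract MFCC features for every .wav file under data_dir/<emotion>/."""
    features, labels = [], []
    for emotion in os.listdir(data_dir):
        emotion_dir = os.path.join(data_dir, emotion)
        if not os.path.isdir(emotion_dir):
            continue
        for name in os.listdir(emotion_dir):
            if not name.endswith('.wav'):
                continue
            # librosa.load resamples to the target rate and returns float32.
            audio, _ = librosa.load(os.path.join(emotion_dir, name), sr=sample_rate)
            features.append(extract_features(audio, sample_rate))
            labels.append(emotion)
    return np.array(features), np.array(labels)
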
def predict_emotion(audio):
    """Predict an emotion label for a 1-D audio signal."""
    # reshape(1, -1): the classifier expects a 2-D batch of feature vectors.
    features = extract_features(audio).reshape(1, -1)
    predicted_emotion = model_rf.predict(features)
    return predicted_emotion[0]

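# Optional: class probabilities can be more informative than a hard label.
# This helper is an illustrative addition built on scikit-learn's standard
# predict_proba API; it is not part of the original pipeline.
def predict_emotion_proba(audio):
    """Return a dict mapping each emotion label to its predicted probability."""
    features = extract_features(audio).reshape(1, -1)
    proba = model_rf.predict_proba(features)[0]
    return dict(zip(model_rf.classes_, proba))
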
# Emotion classes the classifier can output.
emotions = ['happy', 'sad', 'angry', 'fear', 'surprise']

model_rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Placeholder training data: random feature vectors with random labels.
# Swap in real features and labels (e.g. from load_dataset above) before
# expecting meaningful predictions.
features_dummy = np.random.rand(100, 40)  # 100 samples x 40 MFCCs
labels_dummy = np.random.choice(emotions, 100)
model_rf.fit(features_dummy, labels_dummy)

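# A quick sanity check on a held-out split of the same placeholder data.
# With random labels, accuracy should hover around chance (~20% for five
# classes), which confirms that nothing real has been learned yet.
X_train, X_test, y_train, y_test = train_test_split(
    features_dummy, labels_dummy, test_size=0.2, random_state=42)
eval_model = RandomForestClassifier(n_estimators=100, random_state=42)
eval_model.fit(X_train, y_train)
y_pred = eval_model.predict(X_test)
print(f'Dummy-data accuracy: {accuracy_score(y_test, y_pred):.2f}')
print(classification_report(y_test, y_pred, zero_division=0))
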
def record_and_predict():
    print("Recording... Please speak with emotion...")
    duration = 5  # seconds
    sample_rate = 16000  # Hz; must match the rate used in extract_features

    # Record mono float32 audio and block until the buffer is filled.
    audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate,
                   channels=1, dtype='float32')
    sd.wait()
    print("Recording finished.")

    # sd.rec returns shape (n_samples, 1); flatten to 1-D for librosa.
    emotion = predict_emotion(audio.flatten())
    print(f'Predicted Emotion: {emotion}')

if __name__ == "__main__":
    record_and_predict()