File size: 2,445 Bytes
1473306 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
import os
import numpy as np
import librosa
import sounddevice as sd
import soundfile as sf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import torch
# Load the Hugging Face Wav2Vec2 checkpoint and its tokenizer at import time.
# NOTE(review): neither `tokenizer` nor `model` is referenced by any other code
# visible in this file -- confirm whether they are still needed, since
# `from_pretrained` runs on every import of this module.
# NOTE(review): Wav2Vec2Tokenizer appears to be deprecated in recent
# `transformers` releases -- verify against the installed version.
model_name = "facebook/wav2vec2-large-xlsr-53"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
def extract_features(audio, sample_rate=16000, n_mfcc=40):
    """Summarise an audio signal as a vector of time-averaged MFCCs.

    Args:
        audio: Audio samples, forwarded to ``librosa.feature.mfcc`` as ``y``.
        sample_rate: Sampling rate of ``audio`` in Hz.
        n_mfcc: Number of MFCC coefficients to compute. The default of 40
            matches the feature width the demo classifier in this file is
            fitted on.

    Returns:
        The MFCC matrix averaged over its frame axis; for 1-D (mono) input
        this is a vector with ``n_mfcc`` entries.

    Raises:
        ValueError: If ``audio`` contains no samples.
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        # Fail early with a clear message instead of an opaque error from
        # deep inside the spectrogram computation.
        raise ValueError("audio must contain at least one sample")

    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # Average each coefficient over time (this is a mean, not a scaling step)
    # so clips of any length map to a feature vector of the same size.
    return np.mean(mfccs.T, axis=0)
# Classify the emotion expressed in a raw audio signal
def predict_emotion(audio):
    """Return the emotion label the classifier assigns to ``audio``.

    The samples are turned into a feature vector by ``extract_features`` and
    handed to the module-level ``model_rf`` classifier as a single-row batch.
    """
    # The classifier expects a 2-D (n_samples, n_features) input, so present
    # the one feature vector as a batch of one.
    feature_row = extract_features(audio).reshape(1, -1)
    return model_rf.predict(feature_row)[0]
# ---------------------------------------------------------------------------
# Emotion classifier setup (demonstration only)
#
# `model_rf` below is a stand-in: it is fitted on random features and random
# labels so the rest of the script can run end to end. Its predictions carry
# no meaning. Replace this section with real training code, or load a
# previously trained model, before relying on the output.
# ---------------------------------------------------------------------------

# Example emotion categories used as class labels.
emotions = ['happy', 'sad', 'angry', 'fear', 'surprise']

# Placeholder classifier.
model_rf = RandomForestClassifier(random_state=42, n_estimators=100)

# Placeholder training set: 100 rows of 40 random features, each paired with
# a randomly chosen label.
features_dummy = np.random.rand(100, 40)
labels_dummy = np.random.choice(emotions, size=100)

# Placeholder fit so that `model_rf.predict` can be called.
model_rf.fit(features_dummy, labels_dummy)
# Function to record audio and analyze emotion
def record_and_predict(duration=5):
    """Record mono audio with ``sounddevice`` and print the predicted emotion.

    Args:
        duration: Recording length in seconds. Defaults to 5.

    Returns:
        The label returned by ``predict_emotion`` for the recording.

    Raises:
        ValueError: If ``duration`` does not yield at least one audio frame.
    """
    # Kept fixed at 16 kHz: predict_emotion() does not forward a rate, so
    # extract_features() falls back to its 16000 Hz default and the recording
    # must match it.
    sample_rate = 16000

    num_frames = int(duration * sample_rate)
    if num_frames <= 0:
        # Validate before touching the audio device.
        raise ValueError("duration must be long enough to record at least one frame")

    print("Recording... Please speak with emotion...")
    audio = sd.rec(num_frames, samplerate=sample_rate, channels=1, dtype='float32')
    sd.wait()  # Block until the recording buffer has been filled
    print("Recording finished.")

    # sd.rec was asked for one channel; flatten the result to a 1-D signal
    # before feature extraction.
    emotion = predict_emotion(audio.flatten())
    print(f'Predicted Emotion: {emotion}')
    return emotion
if __name__ == "__main__":
    # Run the interactive recording demo only when executed as a script,
    # not when this module is imported.
    record_and_predict()