"""Gradio demo: speech emotion recognition with several models and feature sets.

The script loads pre-trained classical models (joblib) and Keras models (HDF5)
from ``root_path``, extracts four kinds of features (mfcc, spectral, prosodic,
full) from an uploaded audio file, and shows each model's predicted emotion
for each feature set in a table.
"""
import os

# The oneDNN switch is read from the process environment when TensorFlow is
# imported, so it has to be exported before the keras import below. A plain
# Python variable alone has no effect. ``setdefault`` keeps any value the user
# already exported.
TF_ENABLE_ONEDNN_OPTS = 0
os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", str(TF_ENABLE_ONEDNN_OPTS))

import librosa
import joblib
from keras.models import load_model
import numpy as np
import pandas as pd
import gradio as gr
import h5py

# Directory holding every serialized model and scaler.
root_path = "./model/"

# Maps the integer class index predicted by the models to a readable label.
num2label = {0: "Neutral", 1: "Calm", 2: "Happy", 3: "Sad", 4: "Angry", 5: "Fearful", 6: "Disgust", 7: "Surprised"}

# NOTE(review): joblib.load unpickles its input; only load model files that
# come from a trusted source.
SVM_spectral = joblib.load(root_path + "SVM_spectral.joblib")
SVM_prosodic = joblib.load(root_path + "SVM_prosodic.joblib")
SVM_full = joblib.load(root_path + "SVM_full.joblib")
SVM_mfcc = joblib.load(root_path + "SVM_mfcc.joblib")

NB_spectral = joblib.load(root_path + "NB_spectral.joblib")
NB_prosodic = joblib.load(root_path + "NB_prosodic.joblib")
NB_full = joblib.load(root_path + "NB_full.joblib")
NB_mfcc = joblib.load(root_path + "NB_mfcc.joblib")

DT_spectral = joblib.load(root_path + "DT_spectral.joblib")
DT_prosodic = joblib.load(root_path + "DT_prosodic.joblib")
DT_full = joblib.load(root_path + "DT_full.joblib")
DT_mfcc = joblib.load(root_path + "DT_mfcc.joblib")

MLP_spectral = joblib.load(root_path + "MLP_spectral.joblib")
MLP_prosodic = joblib.load(root_path + "MLP_prosodic.joblib")
MLP_full = joblib.load(root_path + "MLP_full.joblib")
MLP_mfcc = joblib.load(root_path + "MLP_mfcc.joblib")

RF_spectral = joblib.load(root_path + "RF_spectral.joblib")
RF_prosodic = joblib.load(root_path + "RF_prosodic.joblib")
RF_full = joblib.load(root_path + "RF_full.joblib")
RF_mfcc = joblib.load(root_path + "RF_mfcc.joblib")


def load_model_from_h5(file_path):
    """Load a Keras model from the HDF5 file at *file_path* without compiling it.

    The file is opened read-only through h5py and closed again once the model
    has been loaded.
    """
    with h5py.File(file_path, 'r') as file:
        model = load_model(file, compile=False)
    return model


LSTM_spectral = load_model_from_h5(root_path + "LSTM_spectral.h5")
LSTM_prosodic = load_model_from_h5(root_path + "LSTM_prosodic.h5")
LSTM_full = load_model_from_h5(root_path + "LSTM_full.h5")
LSTM_mfcc = load_model_from_h5(root_path + "LSTM_mfcc.h5")

LSTM_CNN_spectral = load_model_from_h5(root_path + "LSTM_CNN_spectral.h5")
LSTM_CNN_prosodic = load_model_from_h5(root_path + "LSTM_CNN_prosodic.h5")
LSTM_CNN_full = load_model_from_h5(root_path + "LSTM_CNN_full.h5")
LSTM_CNN_mfcc = load_model_from_h5(root_path + "LSTM_CNN_mfcc.h5")

CNN_spectral = load_model_from_h5(root_path + "CNN_spectral.h5")
CNN_prosodic = load_model_from_h5(root_path + "CNN_prosodic.h5")
CNN_full = load_model_from_h5(root_path + "CNN_full.h5")
CNN_mfcc = load_model_from_h5(root_path + "CNN_mfcc.h5")

# model name -> feature-set name -> fitted model
total_model = {
    "SVM": {'mfcc': SVM_mfcc, 'spectral': SVM_spectral, 'prosodic': SVM_prosodic, 'full': SVM_full},
    "NB": {'mfcc': NB_mfcc, 'spectral': NB_spectral, 'prosodic': NB_prosodic, 'full': NB_full},
    "DT": {'mfcc': DT_mfcc, 'spectral': DT_spectral, 'prosodic': DT_prosodic, 'full': DT_full},
    "MLP": {'mfcc': MLP_mfcc, 'spectral': MLP_spectral, 'prosodic': MLP_prosodic, 'full': MLP_full},
    "RF": {'mfcc': RF_mfcc, 'spectral': RF_spectral, 'prosodic': RF_prosodic, 'full': RF_full},
    "LSTM": {'mfcc': LSTM_mfcc, 'spectral': LSTM_spectral, 'prosodic': LSTM_prosodic, 'full': LSTM_full},
    "LSTM_CNN": {'mfcc': LSTM_CNN_mfcc, 'spectral': LSTM_CNN_spectral, 'prosodic': LSTM_CNN_prosodic, 'full': LSTM_CNN_full},
    "CNN": {'mfcc': CNN_mfcc, 'spectral': CNN_spectral, 'prosodic': CNN_prosodic, 'full': CNN_full},
}

spectral_scaler = joblib.load(root_path + 'spectral_features_standard_scaler.joblib')
prosodic_scaler = joblib.load(root_path + 'prosodic_features_standard_scaler.joblib')
full_scaler = joblib.load(root_path + 'full_features_standard_scaler.joblib')
mfcc_scaler = joblib.load(root_path + 'mfcc_features_standard_scaler.joblib')

# feature-set name -> scaler applied to that feature vector before prediction
scaler = {'mfcc': mfcc_scaler, 'spectral': spectral_scaler, 'prosodic': prosodic_scaler, 'full': full_scaler}

# Models queried through the scikit-learn style ``predict`` that returns class
# labels directly; every other model is treated as a Keras network whose
# ``predict`` returns per-class scores.
_CLASSICAL_MODELS = frozenset(('SVM', 'NB', 'DT', 'MLP', 'RF'))


def Load_audio(audio_path):
    """Read the audio file at *audio_path*, resampled to 48 kHz.

    Returns only the waveform; the sampling rate is discarded.
    """
    y, _sr = librosa.load(audio_path, sr=48000)
    return y


def Spectral_extract_features(audio):
    """Return a 1-D spectral feature vector for a single waveform.

    The vector is the per-row time average of, in order: 40 MFCCs, chroma,
    spectral contrast, tonnetz and the mel spectrogram (sizes other than the
    MFCC count follow librosa's defaults).
    """
    # NOTE(review): no ``sr`` is passed, so librosa uses its default sampling
    # rate although Load_audio resamples to 48 kHz. Presumably the saved
    # scalers/models were fitted on features computed the same way, so this is
    # left unchanged -- confirm against the training code.
    mfccs = librosa.feature.mfcc(y=audio, n_mfcc=40)
    chroma = librosa.feature.chroma_stft(y=audio)
    spectral_contrast = librosa.feature.spectral_contrast(y=audio)
    tonal_centroid = librosa.feature.tonnetz(y=audio)
    mel_spectrogram = librosa.feature.melspectrogram(y=audio)

    feature_vector = np.concatenate((
        mfccs.mean(axis=1),
        chroma.mean(axis=1),
        spectral_contrast.mean(axis=1),
        tonal_centroid.mean(axis=1),
        mel_spectrogram.mean(axis=1),
    ))
    return np.array(feature_vector)


def mfcc_extract_features(audio):
    """Return the time-averaged 40-coefficient MFCC vector of *audio*."""
    mfccs = librosa.feature.mfcc(y=audio, n_mfcc=40)
    mfcc_features = mfccs.mean(axis=1)
    return mfcc_features


def Prosodic_extract_features(audio):
    """Return a 1-D prosodic feature vector for a single waveform.

    The vector concatenates the time-averaged ``piptrack`` pitch matrix (one
    value per frequency bin), the duration, and the time-averaged RMS energy.
    """
    # piptrack also returns magnitudes; only the pitch matrix is used.
    pitch, _ = librosa.piptrack(y=audio, n_fft=128, hop_length=512)
    # NOTE(review): duration is computed with librosa's default sampling rate,
    # not the 48 kHz used when loading -- confirm this matches training.
    duration = librosa.get_duration(y=audio)
    energy = librosa.feature.rms(y=audio)

    # Give the scalar duration a (1, 1) shape so it can be averaged along
    # axis 1 like the other features.
    duration = np.array([duration]).reshape(1, 1)

    feature_vector = np.concatenate((
        pitch.mean(axis=1),
        duration.mean(axis=1),
        energy.mean(axis=1),
    ))
    return np.array(feature_vector)


def Spectral_Prosodic(audio):
    """Return the spectral features followed by the prosodic features."""
    Spectral_features = Spectral_extract_features(audio)
    Prosodic_features = Prosodic_extract_features(audio)
    full_features = np.concatenate((Spectral_features, Prosodic_features))
    return full_features


def Total_features(audio, scaler):
    """Extract and scale all four feature sets for one waveform.

    *scaler* maps each feature-set name ('spectral', 'prosodic', 'full',
    'mfcc') to an object with a ``transform`` method. Returns a dict with the
    same keys whose values are the scaled features as single-row 2-D arrays.
    """
    features = {}
    features['spectral'] = scaler['spectral'].transform(Spectral_extract_features(audio).reshape(1, -1))
    features['prosodic'] = scaler['prosodic'].transform(Prosodic_extract_features(audio).reshape(1, -1))
    features['full'] = scaler['full'].transform(Spectral_Prosodic(audio).reshape(1, -1))
    features['mfcc'] = scaler['mfcc'].transform(mfcc_extract_features(audio).reshape(1, -1))
    return features


def total_predict(feature, total_model):
    """Predict an emotion label with every model on every feature set.

    *feature* is the dict of the four scaled feature sets and *total_model*
    maps model name -> feature-set name -> model. Returns a dict
    feature-set name -> model name -> label. A model that fails is reported on
    stdout and gets the placeholder "Error", so the result always has one
    entry per (feature set, model) pair.
    """
    result = {'mfcc': {}, 'spectral': {}, 'prosodic': {}, 'full': {}}
    f_keys = ['mfcc', 'spectral', 'prosodic', 'full']
    m_keys = ['SVM', 'NB', 'DT', 'MLP', 'RF', 'LSTM', 'LSTM_CNN', 'CNN']

    for f in f_keys:
        for m in m_keys:
            try:
                model = total_model[m][f]
                if m in _CLASSICAL_MODELS:
                    # predict() returns the class index directly.
                    result[f][m] = num2label[model.predict(feature[f])[0]]
                else:
                    # predict() returns per-class scores; take the best class.
                    temp = [np.array(feature[f]).reshape((1, -1))]
                    y_pred = model.predict(temp)
                    y_pred_labels = np.argmax(y_pred, axis=1)[0]
                    result[f][m] = num2label[y_pred_labels]
            except Exception as exc:
                # Best effort: one failing model must not abort the whole
                # table, but the failure has to stay visible. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                print(f, m, repr(exc))
                result[f][m] = "Error"
    return result


def main_function(audio_path, scaler, total_model):
    """Run the full pipeline on one audio file and return the result table.

    The returned DataFrame has one row per feature set, a leading
    'Đặc trưng' column naming the feature set, and one column per model.
    """
    audio = Load_audio(audio_path)
    feature = Total_features(audio, scaler)
    labels = total_predict(feature, total_model)
    table = pd.DataFrame.from_dict(labels).T
    table.insert(0, 'Đặc trưng', ['mfcc', 'spectral', 'prosodic', 'full'])
    return table


def main_interface(audio_file):
    """Gradio callback: *audio_file* is the path of the uploaded audio."""
    result_table = main_function(audio_file, scaler, total_model)
    return result_table


# Create Gradio Interface
iface = gr.Interface(
    fn=main_interface,
    inputs=gr.Audio(type='filepath'),
    outputs=gr.Dataframe(headers=['Đặc trưng', 'SVM', 'NB', 'DT', 'MLP', 'RF', 'LSTM', 'LSTM_CNN', 'CNN']),
)

# Launch the Gradio Interface
iface.launch()