import librosa
import joblib
from keras.models import load_model
import numpy as np
import pandas as pd
import gradio as gr
import h5py
# NOTE(review): this binds a plain Python variable; it does not write to
# os.environ. If the goal was to switch off TensorFlow's oneDNN optimisations,
# the environment variable would presumably have to be set before TensorFlow
# is imported -- TODO confirm the intent.
TF_ENABLE_ONEDNN_OPTS = 0

# Directory (relative to the working directory) that holds every serialized
# model and scaler loaded by this script.
root_path = "./model/"

# Class index -> emotion label, used to turn model predictions into text.
num2label = {
    0: "Neutral",
    1: "Calm",
    2: "Happy",
    3: "Sad",
    4: "Angry",
    5: "Fearful",
    6: "Disgust",
    7: "Surprised",
}


def _load_joblib(file_name):
    """Load the joblib artefact called *file_name* from ``root_path``."""
    # NOTE: joblib.load is pickle-based, so the files under root_path must
    # come from a trusted source.
    return joblib.load(root_path + file_name)


# One artefact per (model family, feature set) pair; each file is named
# "<family>_<feature set>.joblib". All of them are loaded at import time.
SVM_spectral = _load_joblib("SVM_spectral.joblib")
SVM_prosodic = _load_joblib("SVM_prosodic.joblib")
SVM_full = _load_joblib("SVM_full.joblib")
SVM_mfcc = _load_joblib("SVM_mfcc.joblib")

NB_spectral = _load_joblib("NB_spectral.joblib")
NB_prosodic = _load_joblib("NB_prosodic.joblib")
NB_full = _load_joblib("NB_full.joblib")
NB_mfcc = _load_joblib("NB_mfcc.joblib")

DT_spectral = _load_joblib("DT_spectral.joblib")
DT_prosodic = _load_joblib("DT_prosodic.joblib")
DT_full = _load_joblib("DT_full.joblib")
DT_mfcc = _load_joblib("DT_mfcc.joblib")

MLP_spectral = _load_joblib("MLP_spectral.joblib")
MLP_prosodic = _load_joblib("MLP_prosodic.joblib")
MLP_full = _load_joblib("MLP_full.joblib")
MLP_mfcc = _load_joblib("MLP_mfcc.joblib")

RF_spectral = _load_joblib("RF_spectral.joblib")
RF_prosodic = _load_joblib("RF_prosodic.joblib")
RF_full = _load_joblib("RF_full.joblib")
RF_mfcc = _load_joblib("RF_mfcc.joblib")

def load_model_from_h5(file_path):
    """Load a Keras model from the HDF5 file at *file_path*.

    The file is opened read-only through h5py and the open handle (not the
    path) is handed to ``load_model``. The handle is closed when loading
    finishes or raises.

    Args:
        file_path: Location of the ``.h5`` file to read.

    Returns:
        Whatever ``load_model(handle, compile=False)`` returns.
    """
    with h5py.File(file_path, mode='r') as h5_handle:
        # compile=False: the model is loaded without its training setup.
        return load_model(h5_handle, compile=False)

# Keras models stored as "<family>_<feature set>.h5" under root_path. Each
# family is loaded in the order spectral, prosodic, full, mfcc, and every
# file is read at import time.
LSTM_spectral, LSTM_prosodic, LSTM_full, LSTM_mfcc = (
    load_model_from_h5(root_path + h5_name)
    for h5_name in (
        "LSTM_spectral.h5",
        "LSTM_prosodic.h5",
        "LSTM_full.h5",
        "LSTM_mfcc.h5",
    )
)

LSTM_CNN_spectral, LSTM_CNN_prosodic, LSTM_CNN_full, LSTM_CNN_mfcc = (
    load_model_from_h5(root_path + h5_name)
    for h5_name in (
        "LSTM_CNN_spectral.h5",
        "LSTM_CNN_prosodic.h5",
        "LSTM_CNN_full.h5",
        "LSTM_CNN_mfcc.h5",
    )
)

CNN_spectral, CNN_prosodic, CNN_full, CNN_mfcc = (
    load_model_from_h5(root_path + h5_name)
    for h5_name in (
        "CNN_spectral.h5",
        "CNN_prosodic.h5",
        "CNN_full.h5",
        "CNN_mfcc.h5",
    )
)

def _per_feature(mfcc, spectral, prosodic, full):
    """Bundle the four variants of one model family under their feature keys."""
    return {'mfcc': mfcc, 'spectral': spectral, 'prosodic': prosodic, 'full': full}


# Registry of every loaded model: total_model[<family>][<feature set>].
total_model = {
    "SVM": _per_feature(SVM_mfcc, SVM_spectral, SVM_prosodic, SVM_full),
    "NB": _per_feature(NB_mfcc, NB_spectral, NB_prosodic, NB_full),
    "DT": _per_feature(DT_mfcc, DT_spectral, DT_prosodic, DT_full),
    "MLP": _per_feature(MLP_mfcc, MLP_spectral, MLP_prosodic, MLP_full),
    "RF": _per_feature(RF_mfcc, RF_spectral, RF_prosodic, RF_full),
    "LSTM": _per_feature(LSTM_mfcc, LSTM_spectral, LSTM_prosodic, LSTM_full),
    "LSTM_CNN": _per_feature(
        LSTM_CNN_mfcc, LSTM_CNN_spectral, LSTM_CNN_prosodic, LSTM_CNN_full
    ),
    "CNN": _per_feature(CNN_mfcc, CNN_spectral, CNN_prosodic, CNN_full),
}

# Scaler objects, one per feature set, loaded from root_path (the file names
# suggest fitted standard scalers -- their ``transform`` method is what
# Total_features relies on).
spectral_scaler, prosodic_scaler, full_scaler, mfcc_scaler = (
    joblib.load(root_path + scaler_file)
    for scaler_file in (
        'spectral_features_standard_scaler.joblib',
        'prosodic_features_standard_scaler.joblib',
        'full_features_standard_scaler.joblib',
        'mfcc_features_standard_scaler.joblib',
    )
)

# Lookup table: feature-set name -> scaler for that feature set.
scaler = {
    'mfcc': mfcc_scaler,
    'spectral': spectral_scaler,
    'prosodic': prosodic_scaler,
    'full': full_scaler,
}

def Load_audio(audio_path):
    """Read an audio file with librosa, requesting a 48 kHz sampling rate.

    Args:
        audio_path: Path of the audio file, passed straight to ``librosa.load``.

    Returns:
        The sample array returned by ``librosa.load``. The sampling rate that
        comes back with it is discarded.
    """
    # Read the audio file; only the samples are kept, not the sampling rate.
    waveform, _ = librosa.load(audio_path, sr=48000)
    return waveform

# y (the waveform) and sr (the sampling rate) can be used for subsequent audio processing.

def Spectral_extract_features(audio):  # audio is a single waveform, not a batch
    """Build one spectral feature vector for a waveform.

    Five librosa features are computed (40 MFCCs, chroma, spectral contrast,
    tonnetz and mel spectrogram). Each is averaged along axis 1 and the
    averages are concatenated in that order.

    NOTE(review): none of the librosa calls receives ``sr``, so they presumably
    fall back to librosa's default rate rather than the 48 kHz requested in
    Load_audio -- confirm this matches how the models were trained before
    changing it.

    Args:
        audio: Waveform array as accepted by the ``y=`` argument of librosa.

    Returns:
        A 1-D numpy array holding the concatenated per-feature means.
    """
    frame_level = (
        librosa.feature.mfcc(y=audio, n_mfcc=40),
        librosa.feature.chroma_stft(y=audio),
        librosa.feature.spectral_contrast(y=audio),
        librosa.feature.tonnetz(y=audio),
        librosa.feature.melspectrogram(y=audio),
    )
    # Collapse every feature matrix to one value per row.
    pooled = [matrix.mean(axis=1) for matrix in frame_level]
    return np.array(np.concatenate(pooled))

def mfcc_extract_features(audio):
    """Return the 40 MFCCs of *audio*, each averaged along axis 1.

    ``sr`` is not passed to librosa, so librosa's own default applies.

    Args:
        audio: Waveform array as accepted by the ``y=`` argument of librosa.

    Returns:
        The result of ``mean(axis=1)`` on the MFCC matrix.
    """
    return librosa.feature.mfcc(y=audio, n_mfcc=40).mean(axis=1)

def Prosodic_extract_features(audio):
    """Build one prosodic feature vector (pitch, duration, energy) for a waveform.

    The vector is the concatenation of:
      * the axis-1 mean of the pitch matrix from ``librosa.piptrack``
        (``n_fft=128``, ``hop_length=512``),
      * the clip duration reported by ``librosa.get_duration``,
      * the axis-1 mean of the RMS energy from ``librosa.feature.rms``.

    NOTE(review): ``get_duration`` is called without ``sr``, so the value is
    presumably computed with librosa's default rate, not the 48 kHz used when
    loading -- confirm against the training pipeline.

    Args:
        audio: Waveform array as accepted by the ``y=`` argument of librosa.

    Returns:
        A 1-D numpy array with the pitch means followed by duration and energy.
    """
    pitch_bins, _magnitudes = librosa.piptrack(y=audio, n_fft=128, hop_length=512)
    clip_length = librosa.get_duration(y=audio)
    rms_energy = librosa.feature.rms(y=audio)

    # Wrap the scalar duration as a 1x1 matrix so it is pooled like the others.
    length_matrix = np.array([[clip_length]])

    parts = [
        pitch_bins.mean(axis=1),
        length_matrix.mean(axis=1),
        rms_energy.mean(axis=1),
    ]
    return np.array(np.concatenate(parts))

def Spectral_Prosodic(audio):
    """Return the spectral feature vector followed by the prosodic one.

    Args:
        audio: Waveform array forwarded to both feature extractors.

    Returns:
        A single numpy array: spectral features first, prosodic features after.
    """
    return np.concatenate(
        (Spectral_extract_features(audio), Prosodic_extract_features(audio))
    )

def Total_features(audio, scaler):
    """Extract and scale the four feature sets for one waveform.

    Spectral and prosodic features are extracted once each and reused to build
    the "full" vector, instead of being recomputed for it.

    Args:
        audio: Waveform array forwarded to the feature extractors.
        scaler: Mapping with the keys 'spectral', 'prosodic', 'full' and
            'mfcc'; each value must provide ``transform``.

    Returns:
        Dict with the keys 'spectral', 'prosodic', 'full' and 'mfcc' (in that
        order); each value is the scaler output for that feature vector
        reshaped to a single row.

    Raises:
        KeyError: If *scaler* lacks one of the four keys.
    """
    spectral = Spectral_extract_features(audio)
    prosodic = Prosodic_extract_features(audio)
    raw = {
        'spectral': spectral,
        'prosodic': prosodic,
        # Same layout as Spectral_Prosodic(audio): spectral first, then
        # prosodic -- built from the vectors already computed above.
        'full': np.concatenate((spectral, prosodic)),
        'mfcc': mfcc_extract_features(audio),
    }
    # Every scaler is fed a single-row 2-D array.
    return {
        name: scaler[name].transform(vector.reshape(1, -1))
        for name, vector in raw.items()
    }


def total_predict(feature, total_model):  # feature: dict holding the four feature sets
    """Run every model on its matching feature set and collect emotion labels.

    Prediction is best-effort: when a (feature set, model) pair fails, the
    pair and the error are printed and the entry is left out of the result.
    Only ``Exception`` subclasses are absorbed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate.

    Args:
        feature: Mapping with the keys 'mfcc', 'spectral', 'prosodic' and
            'full'; each value is the input passed to the models.
        total_model: Nested mapping ``total_model[model_name][feature_name]``
            of objects that provide ``predict``.

    Returns:
        Dict ``result[feature_name][model_name] -> label`` taken from
        ``num2label``; failed pairs are missing.
    """
    f_keys = ['mfcc', 'spectral', 'prosodic', 'full']
    m_keys = ['SVM', 'NB', 'DT', 'MLP', 'RF', 'LSTM', 'LSTM_CNN', 'CNN']
    # Models whose predict() result is used directly as the class index.
    direct_label_models = {'SVM', 'NB', 'DT', 'MLP', 'RF'}

    result = {f: {} for f in f_keys}
    for f in f_keys:
        for m in m_keys:
            try:
                model = total_model[m][f]
                if m in direct_label_models:
                    class_index = model.predict(feature[f])[0]
                else:
                    # The remaining models return one score per class, so the
                    # index of the highest score is taken.
                    batch = [np.array(feature[f]).reshape((1, -1))]
                    scores = model.predict(batch)
                    class_index = np.argmax(scores, axis=1)[0]
                result[f][m] = num2label[class_index]
            except Exception as exc:
                # Keep going with the other models, but report what failed.
                print(f, m, repr(exc))
    return result

# def main_function(audio_path, scaler, total_model):
#   audio = Load_audio(audio_path)
#   feature = Total_features(audio, scaler)
#   labels = total_predict(feature, total_model)
#   table = pd.DataFrame.from_dict(labels).T
#   return table
def main_function(audio_path, scaler, total_model):
    """Classify one audio file with every model and tabulate the labels.

    Args:
        audio_path: Path of the audio file to analyse.
        scaler: Mapping feature-set name -> scaler, forwarded to Total_features.
        total_model: Nested mapping of models, forwarded to total_predict.

    Returns:
        A pandas DataFrame with one row per feature set and one column per
        model that produced a prediction, preceded by a 'Đặc trưng' column
        naming the feature set.
    """
    predictions = total_predict(
        Total_features(Load_audio(audio_path), scaler),
        total_model,
    )
    # from_dict puts feature sets in columns; transpose so they become rows.
    table = pd.DataFrame.from_dict(predictions).transpose()
    # First column names the feature set of each row (header is Vietnamese
    # for "feature"); the order matches the keys produced by total_predict.
    table.insert(
        loc=0,
        column='Đặc trưng',
        value=['mfcc', 'spectral', 'prosodic', 'full'],
    )
    return table

def main_interface(audio_file):
    """Gradio callback: classify the submitted audio and return the result table.

    Args:
        audio_file: Value delivered by the Gradio audio input; it is passed to
            main_function as the path of the audio file.

    Returns:
        The DataFrame produced by main_function, using the module-level
        ``scaler`` and ``total_model``.
    """
    # A previous variant took a (sample rate, samples) tuple and resampled it
    # to 48 kHz by hand; the input is now forwarded to main_function as-is.
    return main_function(audio_file, scaler, total_model)


# Build the Gradio UI: an audio input delivered as a file path, and a table
# with one column per model after the feature-set column.
_audio_input = gr.Audio(type= 'filepath')
_result_table = gr.Dataframe(
    headers=[
        'Đặc trưng',
        'SVM',
        'NB',
        'DT',
        'MLP',
        'RF',
        'LSTM',
        'LSTM_CNN',
        'CNN',
    ],
)
iface = gr.Interface(
    fn=main_interface,
    inputs=_audio_input,
    outputs=_result_table,
)

# Start the app as soon as this module is executed.
iface.launch()