from __future__ import absolute_import, division, print_function, unicode_literals

import os
import random
import string
import warnings
import datetime

import librosa
import numpy as np
import gradio as gr

from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Activation
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization

from save_data import flag

warnings.filterwarnings("ignore")

timestamp = datetime.datetime.now()
current_date = timestamp.strftime('%d-%m-%Y')
current_time = timestamp.strftime('%I:%M:%S')
IP = ''
cwd = os.getcwd()

classLabels = ('Angry', 'Fear', 'Disgust', 'Happy', 'Sad', 'Surprised', 'Neutral')
numLabels = len(classLabels)
in_shape = (39, 216)  # (n_mfcc, time frames) expected by the network

# Small CNN that classifies a 39x216 MFCC matrix into one of seven emotions.
model = Sequential()
model.add(Conv2D(8, (13, 13), input_shape=(in_shape[0], in_shape[1], 1)))
model.add(BatchNormalization(axis=-1))
model.add(Activation('relu'))
model.add(Conv2D(8, (13, 13)))
model.add(BatchNormalization(axis=-1))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 1)))
model.add(Conv2D(8, (3, 3)))
model.add(BatchNormalization(axis=-1))
model.add(Activation('relu'))
model.add(Conv2D(8, (1, 1)))
model.add(BatchNormalization(axis=-1))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 1)))
model.add(Flatten())
model.add(Dense(64))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(numLabels, activation='softmax'))

# categorical_crossentropy is the correct loss for a multi-class softmax head
# (the original compiled with binary_crossentropy); the loss does not affect
# inference anyway, since we only load pre-trained weights and call predict().
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.load_weights('speech_emotion_detection_ravdess_savee.h5')


def selected_audio(audio):
    """Predict the emotion for one of the bundled pre-recorded clips."""
    try:
        if audio and audio != 'Please select any of the following options':
            post_file_name = audio.lower() + '.wav'
            filepath = os.path.join("pre_recoreded", post_file_name)
            if os.path.exists(filepath):
                print("SELECT file name => ", filepath)
                result = predict_speech_emotion(filepath)
                print("result = ", result)
                return result
    except Exception as e:
        print(e)
        return "ERROR"


def recorded_audio(audio):
    """Predict the emotion for a microphone recording and log it via flag()."""
    if audio:
        # Random 5-character file name under which the recording is flagged.
        get_audio_name = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(5)) + '.wav'
        audio_file_path = audio.name
        final_output = predict_speech_emotion(audio_file_path)
        flag(audio_file_path, get_audio_name, final_output)
        return final_output
    else:
        raise gr.Error("Please record audio first!")


def predict_speech_emotion(filepath):
    """Extract MFCC features from an audio file and run the CNN on them."""
    if os.path.exists(filepath):
        print("last file name => ", filepath)
        # 2.5 s of audio at 44.1 kHz (skipping the first 0.5 s) yields the
        # 216 MFCC frames the network expects; shorter clips will fail the
        # reshape below.
        X, sample_rate = librosa.load(filepath, res_type='kaiser_best', duration=2.5, sr=22050 * 2, offset=0.5)
        mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=39)
        feature = mfccs.reshape(39, 216, 1)
        np_array = np.array([feature])  # add the batch dimension
        prediction = model.predict(np_array)
        np_argmax = np.argmax(prediction)
        return classLabels[np_argmax]


def return_audio_clip(audio_text):
    """Return the filepath of the selected pre-recorded clip for playback."""
    post_file_name = audio_text.lower() + '.wav'
    return os.path.join("pre_recoreded", post_file_name)


with gr.Blocks(css=".gradio-container {background-color: lightgray;} #btn {background-color: orange;}") as blocks:
    # The original widget definitions did not survive extraction; the layout
    # below is reconstructed from the click() bindings and the callbacks, so
    # the labels and dropdown choices are assumptions. gr.Audio(type="file")
    # is the legacy Gradio 3.x API whose value exposes the .name attribute
    # used in recorded_audio().
    with gr.Row():
        with gr.Column():
            input_audio_text = gr.Dropdown(
                ['Please select any of the following options'] + list(classLabels),
                value='Please select any of the following options',
                label="Pre-recorded audio")
            sub_btn = gr.Button("Submit", elem_id="btn")
            output_text = gr.Textbox(label="Detected emotion")
        with gr.Column():
            audio = gr.Audio(source="microphone", type="file", label="Record your voice")
            sub_btn2 = gr.Button("Submit", elem_id="btn")
            recorded_text = gr.Textbox(label="Detected emotion")
    gr.Markdown("""
Feel free to share your feedback, and contact us at letstalk@pragnakalp.com if you would like your own speech emotion detection system. We are just one click away. And don't forget to check out the other interesting NLP services we offer.
Developed by: Pragnakalp Techlabs
""") sub_btn.click(selected_audio, inputs=input_audio_text, outputs=output_text) sub_btn2.click(recorded_audio, inputs=audio, outputs=recorded_text) blocks.launch()