import numpy as np
import tensorflow as tf
from tensorflow import keras
import gradio as gr
from huggingface_hub import from_pretrained_keras

classes_names = [
    'Benjamin_Netanyau',
    'Jens_Stoltenberg',
    'Julia_Gillard',
    'Magaret_Tarcher',
    'Nelson_Mandela',
]

# The sampling rate to use.
# This is the one used in all of the audio samples.
# It is also the output size of the audio wave samples,
# since all samples are one second long.
SAMPLING_RATE = 16000

# During training, noise was mixed into the samples as
#   noisy_sample = sample + noise * prop * scale
# where prop = sample_amplitude / noise_amplitude and scale = 0.5,
# which is why the bundled example clips sound noisy.

model = from_pretrained_keras("keras-io/speaker-recognition")


def path_to_audio(path):
    """Reads and decodes an audio file."""
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE)
    return audio


def audio_to_fft(audio):
    # Since tf.signal.fft applies the FFT on the innermost dimension,
    # we need to squeeze the channel dimension and then expand it again
    # after the FFT.
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)
    # Return the absolute value of the first half of the FFT,
    # which represents the positive frequencies.
    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])


def predict(actual_audio_path, actual_label):
    # Decode the uploaded file and add a batch dimension.
    actual_audio = tf.expand_dims(path_to_audio(actual_audio_path), axis=0)
    # Get the signal FFT.
    ffts = audio_to_fft(actual_audio)
    # Predict the speaker and return the class name with the highest score.
    y_pred = model.predict(ffts)
    y_pred = np.argmax(y_pred, axis=-1)
    return classes_names[y_pred[0]], actual_audio_path


# The app takes one audio sample to be recognised and the actual speaker's name.
inputs = [
    gr.inputs.Audio(source="upload", type="filepath", label="Take audio sample"),
    gr.inputs.Textbox(label="Actual Speaker"),
]
# The app outputs the predicted speaker and plays back the corresponding audio.
outputs = [
    gr.outputs.Textbox(label="Predicted Speaker"),
    gr.outputs.Audio(label="Corresponding Audio"),
]

# It's good practice to pass examples, a description and a title to guide users.
examples = [
    ['audios/260.wav', 'Benjamin_Netanyau'],
    ['audios/611.wav', 'Jens_Stoltenberg'],
    ['audios/65.wav', 'Julia_Gillard'],
    ['audios/1083.wav', 'Magaret_Tarcher'],
    ['audios/605.wav', 'Nelson_Mandela'],
]
title = "Speaker Recognition"
description = (
    "Select a noisy audio sample from the examples to check whether the model "
    "recognises the speaker correctly, even in the presence of noise."
)

gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=outputs,
    examples=examples,
    live=True,
    allow_flagging=False,
    analytics_enabled=False,
    title=title,
    description=description,
).launch()
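
# A minimal sanity check of predict() outside the Gradio UI; this assumes one
# of the bundled example files (e.g. 'audios/260.wav') is available locally:
#
#   speaker, audio_path = predict('audios/260.wav', 'Benjamin_Netanyau')
#   print("Predicted speaker:", speaker)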