import os
import io
import csv
import gradio as gr
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
import matplotlib.pyplot as plt
from tensorflow import keras
from huggingface_hub import from_pretrained_keras

# Configuration
class_names = [
    "Irish",
    "Midlands",
    "Northern",
    "Scottish",
    "Southern",
    "Welsh",
    "Not a speech",
]

# Download the YAMNet model from TF Hub
yamnet_model = hub.load("https://tfhub.dev/google/yamnet/1")

# Download the dense model from the HF Hub
model = from_pretrained_keras(
    pretrained_model_name_or_path="fbadine/uk_ireland_accent_classification"
)
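
# Note (added for clarity): per the YAMNet model card, yamnet_model maps a 1-D
# float32 waveform at 16 kHz to (scores, embeddings, log_mel_spectrogram);
# only the 1024-dim embeddings are fed to the dense classifier loaded above.
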
# Function that reads a wav audio file and resamples it to 16000 Hz
# This function is copied from the tutorial:
# https://www.tensorflow.org/tutorials/audio/transfer_learning_audio
def load_16k_audio_wav(filename):
    # Read file content
    file_content = tf.io.read_file(filename)

    # Decode audio wave
    audio_wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1)
    audio_wav = tf.squeeze(audio_wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)

    # Resample to 16k
    audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)

    return audio_wav
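
# Illustrative helper (not part of the original app): a quick sanity check
# that a local wav file round-trips through the loader. "path" is any local
# 16-bit PCM wav file; this function is never called by the demo itself.
def _debug_wav_stats(path):
    wav = load_16k_audio_wav(path)
    # One float32 sample per tick at 16 kHz, mono
    print(f"{path}: {wav.shape[0]} samples (~{wav.shape[0] / 16000:.2f}s)")
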
# Function that takes the audio produced by the gr.Audio component as a
# (sample_rate, numpy array) tuple and returns a tensor after applying the
# following transformations:
#   - Downmix to mono if needed
#   - Resample to 16000 Hz
#   - Normalize
def mic_to_tensor(recorded_audio_file):
    sample_rate, audio = recorded_audio_file

    audio_wav = tf.constant(audio, dtype=tf.float32)
    if tf.rank(audio_wav) > 1:
        audio_wav = tf.reduce_mean(audio_wav, axis=1)
    audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)

    # Normalize to [-1, 1]; the small epsilon guards against an all-silent clip
    audio_wav = tf.divide(audio_wav, tf.maximum(tf.reduce_max(tf.abs(audio_wav)), 1e-9))

    return audio_wav
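
# Illustrative only (assumption, not from the original app): mic_to_tensor
# expects the (sample_rate, samples) tuple that gr.Audio(type="numpy") emits.
# This helper builds a synthetic stand-in for a recording and is never called.
def _example_mic_input():
    # A 1-second 440 Hz sine at 44.1 kHz, as if it came from the microphone
    t = np.linspace(0.0, 1.0, 44100, endpoint=False)
    tone = np.sin(2.0 * np.pi * 440.0 * t).astype(np.float32)
    return mic_to_tensor((44100, tone))  # ~16000 samples, normalized to [-1, 1]
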
# Function that takes a tensor and applies the following:
#   - Passes it through the YAMNet model to get the embeddings, which are the
#     input of the dense model
#   - Passes the embeddings through the dense model to get the predictions
def tensor_to_predictions(audio_tensor):
    # Get audio embeddings & scores
    scores, embeddings, mel_spectrogram = yamnet_model(audio_tensor)

    # Predict the output of the accent recognition model with embeddings as input
    predictions = model.predict(embeddings)

    return predictions, mel_spectrogram
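
# Shape note (added for clarity, per the YAMNet model card): YAMNet frames the
# waveform into 0.96 s windows with a 0.48 s hop, so `embeddings` has shape
# (num_frames, 1024) and `predictions` has shape (num_frames, len(class_names));
# averaging over axis 0 yields one score per accent for the whole clip.
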
# Function that is called when the user clicks the "Predict" button. It does the following:
#   - Calls tensor_to_predictions() to get the predictions
#   - Generates the top scoring labels
#   - Generates the top scoring plot
def predict_accent(recorded_audio_file, uploaded_audio_file=None):
    # Transform input to tensor. The demo wires a single gr.Audio component
    # (numpy tuples), so uploaded_audio_file defaults to None; the wav-file
    # path is kept for reuse.
    if recorded_audio_file:
        audio_tensor = mic_to_tensor(recorded_audio_file)
    else:
        audio_tensor = load_16k_audio_wav(uploaded_audio_file)

    # Model inference
    predictions, mel_spectrogram = tensor_to_predictions(audio_tensor)

    # Get the inferred class
    inferred_class = class_names[predictions.mean(axis=0).argmax()]

    # Generate Output 1 - Accents
    top_scoring_labels_output = {
        class_names[i]: float(predictions.mean(axis=0)[i])
        for i in range(len(class_names))
    }

    # Generate Output 2 - Plot
    top_scoring_plot_output = generate_top_scoring_plot(predictions)

    return [top_scoring_labels_output, top_scoring_plot_output]

# Clears all inputs and outputs when the user clicks the "Clear" button
def clear_inputs_and_outputs():
    # One None per wired output component: src_input, lbl_output, plt_output
    return [None, None, None]

# Function that generates the top scoring plot
# This function is copied from the tutorial and adjusted to our needs:
# https://keras.io/examples/audio/uk_ireland_accent_recognition/
def generate_top_scoring_plot(predictions):
    # Plot and label the model output scores for the top-scoring classes
    mean_predictions = np.mean(predictions, axis=0)
    top_class_indices = np.argsort(mean_predictions)[::-1]

    fig = plt.figure(figsize=(10, 2))
    plt.imshow(
        predictions[:, top_class_indices].T,
        aspect="auto",
        interpolation="nearest",
        cmap="gray_r",
    )

    # patch_padding = (PATCH_WINDOW_SECONDS / 2) / PATCH_HOP_SECONDS
    # values from the model documentation
    patch_padding = (0.025 / 2) / 0.01
    plt.xlim([-patch_padding - 0.5, predictions.shape[0] + patch_padding - 0.5])

    # Label the top_N classes
    yticks = range(0, len(class_names), 1)
    plt.yticks(yticks, [class_names[top_class_indices[x]] for x in yticks])
    _ = plt.ylim(-0.5 + np.array([len(class_names), 0]))

    return fig
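
# Reading the plot (note added for clarity): each column is one YAMNet frame,
# rows are the accents sorted by mean score, and darker cells indicate a
# higher predicted score for that accent in that time slot.
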
# Main function
if __name__ == "__main__":
    demo = gr.Blocks()

    with demo:
        gr.Markdown(
            """
            <center><h1>English speaker accent recognition using Transfer Learning</h1></center> \
            This space is a demo of an English (more precisely, UK & Ireland) accent classification model using Keras.<br> \
            In this space, you can record your voice or upload a wav file and the model will predict the English accent spoken in the audio.<br><br>
            """
        )
        with gr.Row():
            # Inputs
            with gr.Column():
                src_input = gr.Audio(sources=["microphone", "upload"])

                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")

            # Outputs
            with gr.Column():
                lbl_output = gr.Label(label="Top Predictions")
                with gr.Group():
                    gr.Markdown("<center>Prediction per time slot</center>")
                    plt_output = gr.Plot(
                        label="Prediction per time slot", show_label=False
                    )

        # Credits
        with gr.Row():
            gr.Markdown(
                """
                <h4>Credits</h4>
                Author: <a href="https://twitter.com/fadibadine">Fadi Badine</a>.<br>
                Based on the Keras example <a href="https://keras.io/examples/audio/uk_ireland_accent_recognition">English speaker accent recognition using Transfer Learning</a> by Fadi Badine.<br>
                Check out the model <a href="https://huggingface.co/keras-io/english-speaker-accent-recognition-using-transfer-learning">here</a>.
                """
            )

        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[src_input, lbl_output, plt_output],
        )
        prd_btn.click(
            fn=predict_accent,
            inputs=[src_input],
            outputs=[lbl_output, plt_output],
        )

    demo.launch(debug=True, share=True)