|
import os |
|
import io |
|
import csv |
|
import gradio as gr |
|
import numpy as np |
|
import tensorflow as tf |
|
import tensorflow_hub as hub |
|
import tensorflow_io as tfio |
|
import matplotlib.pyplot as plt |
|
from tensorflow import keras |
|
from huggingface_hub import from_pretrained_keras |
|
|
|
|
|
# Labels reported to the user. predict_accent() and generate_top_scoring_plot()
# index this list by prediction column, so the order here must match the
# classifier's output order.
class_names = [
    "Irish", "Midlands", "Northern", "Scottish", "Southern", "Welsh",
    "Not a speech",
]
|
|
|
|
|
# Both models are fetched once at import time and shared by every request.
_YAMNET_HANDLE = "https://tfhub.dev/google/yamnet/1"
_ACCENT_MODEL_REPO = "fbadine/uk_ireland_accent_classification"

# Feature extractor: tensor_to_predictions() uses the embeddings it returns.
yamnet_model = hub.load(_YAMNET_HANDLE)

# Accent classifier that consumes the YAMNet embeddings.
model = from_pretrained_keras(pretrained_model_name_or_path=_ACCENT_MODEL_REPO)
|
|
|
|
|
|
|
|
|
def load_16k_audio_wav(filename):
    """Load a WAV file as a single-channel waveform resampled to 16 kHz.

    Args:
        filename: Path of the WAV file to read.

    Returns:
        The decoded waveform with the channel axis squeezed away, resampled
        from the file's own sample rate to 16000 Hz.
    """
    raw_bytes = tf.io.read_file(filename)

    # Force one channel while decoding, then drop the trailing channel axis.
    decoded_audio, source_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    waveform = tf.squeeze(decoded_audio, axis=-1)

    # The resampler is given the input rate as an int64 tensor.
    source_rate = tf.cast(source_rate, dtype=tf.int64)

    return tfio.audio.resample(waveform, rate_in=source_rate, rate_out=16000)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def mic_to_tensor(recorded_audio_file):
    """Convert an in-memory recording to a peak-normalised 16 kHz waveform.

    Args:
        recorded_audio_file: A ``(sample_rate, samples)`` pair. ``samples`` is
            converted to a float32 tensor; if it has more than one axis it is
            averaged along axis 1 (presumably ``(samples, channels)`` as
            delivered by the Gradio audio component -- confirm).

    Returns:
        The waveform resampled to 16000 Hz and divided by its peak absolute
        value. A silent clip (peak of 0) yields zeros rather than NaNs.
    """
    sample_rate, audio = recorded_audio_file

    audio_wav = tf.constant(audio, dtype=tf.float32)
    if tf.rank(audio_wav) > 1:
        # Down-mix by averaging across axis 1.
        audio_wav = tf.reduce_mean(audio_wav, axis=1)
    audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)

    # Peak-normalise. divide_no_nan returns 0 where the divisor is 0, so an
    # all-zero recording no longer turns into a tensor of NaNs.
    peak = tf.reduce_max(tf.abs(audio_wav))
    return tf.math.divide_no_nan(audio_wav, peak)
|
|
|
|
|
|
|
|
|
|
|
def tensor_to_predictions(audio_tensor):
    """Run YAMNet on a waveform and classify the embeddings it produces.

    Args:
        audio_tensor: Waveform tensor passed straight to ``yamnet_model``.

    Returns:
        A ``(predictions, mel_spectrogram)`` tuple: the output of
        ``model.predict`` on the YAMNet embeddings, and the third value
        returned by YAMNet.
    """
    # YAMNet returns three values; its own class scores (the first) are not
    # used here.
    _, embeddings, mel_spectrogram = yamnet_model(audio_tensor)

    return model.predict(embeddings), mel_spectrogram
|
|
|
|
|
|
|
|
|
|
|
|
|
def predict_accent(recorded_audio_file, uploaded_audio_file=None):
    """Predict the accent of an audio clip and build the UI outputs.

    Args:
        recorded_audio_file: ``(sample_rate, samples)`` pair handled by
            ``mic_to_tensor``, or ``None`` when no in-memory audio is given.
        uploaded_audio_file: Optional path to a WAV file, used only when
            ``recorded_audio_file`` is ``None``. Defaults to ``None`` so the
            function can be wired to a single audio input component.

    Returns:
        A two-element list: a dict mapping each name in ``class_names`` to its
        prediction averaged over axis 0, and the figure produced by
        ``generate_top_scoring_plot``.

    Raises:
        gr.Error: If neither audio source is provided.
    """
    if recorded_audio_file is not None:
        audio_tensor = mic_to_tensor(recorded_audio_file)
    elif uploaded_audio_file is not None:
        audio_tensor = load_16k_audio_wav(uploaded_audio_file)
    else:
        # Show a readable message in the UI instead of failing inside TF.
        raise gr.Error("Please record or upload an audio clip before predicting.")

    # The mel spectrogram is returned by the helper but not displayed.
    predictions, _ = tensor_to_predictions(audio_tensor)

    # Average over axis 0 once and reuse it for every class.
    mean_scores = predictions.mean(axis=0)
    top_scoring_labels_output = {
        name: float(mean_scores[i]) for i, name in enumerate(class_names)
    }

    top_scoring_plot_output = generate_top_scoring_plot(predictions)

    return [top_scoring_labels_output, top_scoring_plot_output]
|
|
|
|
|
|
|
def clear_inputs_and_outputs():
    """Return the values that reset the UI when "Clear" is pressed.

    Returns:
        A list with one ``None`` per component wired to the Clear button
        (audio input, label output, plot output).
    """
    # Exactly three values: the Clear handler is registered with three output
    # components, so a fourth value had no component to go to.
    return [None, None, None]
|
|
|
|
|
|
|
|
|
|
|
def generate_top_scoring_plot(predictions):
    """Draw a heat map of the predictions with classes ordered by mean score.

    Args:
        predictions: 2-D array indexed as ``[row, class]``; rows are laid out
            along the x axis (presumably one row per time slot -- confirm).

    Returns:
        A matplotlib ``Figure`` showing one row of the image per class, the
        class with the highest mean prediction at the top.
    """
    # Build the figure without pyplot: plt.figure() registers every figure in
    # a global registry until it is closed, which leaks memory when this runs
    # once per request in a long-lived server process.
    from matplotlib.figure import Figure

    # Rank classes by their prediction averaged over axis 0, best first.
    mean_predictions = np.mean(predictions, axis=0)
    top_class_indices = np.argsort(mean_predictions)[::-1]

    fig = Figure(figsize=(10, 2))
    ax = fig.subplots()
    ax.imshow(
        predictions[:, top_class_indices].T,
        aspect="auto",
        interpolation="nearest",
        cmap="gray_r",
    )

    # Pad the x range slightly beyond the first and last column.
    patch_padding = (0.025 / 2) / 0.01
    ax.set_xlim(-patch_padding - 0.5, predictions.shape[0] + patch_padding - 0.5)

    # Label each row with its class name, in ranked order.
    yticks = range(len(class_names))
    ax.set_yticks(list(yticks))
    ax.set_yticklabels([class_names[top_class_indices[x]] for x in yticks])
    # Inverted limits keep the top-ranked class on the first (upper) row.
    ax.set_ylim(len(class_names) - 0.5, -0.5)

    return fig
|
|
|
|
|
|
|
if __name__ == "__main__":
    # NOTE(review): the component nesting below is reconstructed from a source
    # whose indentation was lost -- confirm the layout against the running app.
    with gr.Blocks() as demo:
        # Page header and short description.
        gr.Markdown(
            """
            <center><h1>English speaker accent recognition using Transfer Learning</h1></center> \
            This space is a demo of an English (precisely UK & Ireland) accent classification model using Keras.<br> \
            In this space, you can record your voice or upload a wav file and the model will predict the English accent spoken in the audio<br><br>
        """
        )

        with gr.Row():
            # Left column: audio input and the two action buttons.
            with gr.Column():
                src_input = gr.Audio(sources=["microphone", "upload"])

                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")

            # Right column: label scores and the prediction heat map.
            with gr.Column():
                lbl_output = gr.Label(label="Top Predictions")
                with gr.Group():
                    gr.Markdown("<center>Prediction per time slot</center>")
                    plt_output = gr.Plot(
                        label="Prediction per time slot", show_label=False
                    )

        # "Clear" resets the input and both outputs.
        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[src_input, lbl_output, plt_output],
        )

        # "Predict" runs the classifier on the current audio input.
        prd_btn.click(
            fn=predict_accent,
            inputs=[src_input],
            outputs=[lbl_output, plt_output],
        )

    demo.launch(debug=True, share=True)
|
|