fbadine committed on
Commit
4595449
1 Parent(s): eba2aff

Added app.py and requirements.txt

Browse files
Files changed (2) hide show
  1. app.py +186 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import csv
4
+ import gradio as gr
5
+ import numpy as np
6
+ import tensorflow as tf
7
+ import tensorflow_hub as hub
8
+ import tensorflow_io as tfio
9
+ import matplotlib.pyplot as plt
10
+ from tensorflow import keras
11
+ from huggingface_hub import from_pretrained_keras
12
+
# Configuration
# Accent labels; position i in this list names output column i of the dense model
class_names = [
    "Irish",
    "Midlands",
    "Northern",
    "Scottish",
    "Southern",
    "Welsh",
    "Not a speech",
]

# Locations of the two pretrained models used by this app
_YAMNET_HANDLE = "https://tfhub.dev/google/yamnet/1"
_ACCENT_MODEL_REPO = "fbadine/uk_ireland_accent_classification"

# Download Yamnet model from TF Hub (provides the audio embeddings)
yamnet_model = hub.load(_YAMNET_HANDLE)

# Download dense model from HF Hub (classifies the embeddings)
model = from_pretrained_keras(pretrained_model_name_or_path=_ACCENT_MODEL_REPO)
31
+
# Reads a wav audio file and resamples it to 16000 Hz.
# This function is copied from the tutorial:
# https://www.tensorflow.org/tutorials/audio/transfer_learning_audio
#
# filename: path of the wav file to read
# Returns the decoded audio, with the channel axis squeezed out, resampled to 16 kHz.
def load_16k_audio_wav(filename):
    # Read the file and decode it, asking the decoder for a single channel
    decoded_wav, source_rate = tf.audio.decode_wav(
        tf.io.read_file(filename), desired_channels=1
    )

    # Drop the trailing channel axis and hand the rate to the resampler as int64
    mono_wav = tf.squeeze(decoded_wav, axis=-1)
    source_rate = tf.cast(source_rate, dtype=tf.int64)

    # Resample to 16k
    return tfio.audio.resample(mono_wav, rate_in=source_rate, rate_out=16000)
48
+
49
+
# Function that takes the audio produced by gr.Audio(source="microphone") and
# returns a tensor applying the following transformations:
# - Average the channels when the recording has more than one axis
# - Resample to 16000 Hz
# - Normalize by the peak absolute amplitude
#
# recorded_audio_file: (sample_rate, samples) pair, unpacked on the first line
# Returns the float32 audio tensor after the steps above.
def mic_to_tensor(recorded_audio_file):
    sample_rate, audio = recorded_audio_file

    audio_wav = tf.constant(audio, dtype=tf.float32)
    if tf.rank(audio_wav) > 1:
        # Collapse axis 1 so a single series of samples remains
        audio_wav = tf.reduce_mean(audio_wav, axis=1)
    audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)

    # A silent recording has a peak of 0; a plain division would turn every
    # sample into NaN, whereas divide_no_nan returns 0 for those entries.
    peak = tf.reduce_max(tf.abs(audio_wav))
    audio_wav = tf.math.divide_no_nan(audio_wav, peak)

    return audio_wav
66
+
67
+
# Runs the two-stage inference on an audio tensor:
# - Yamnet turns the audio into embeddings, which are the input of the dense model
# - The dense model turns the embeddings into the accent predictions
#
# Returns the pair (predictions, mel_spectrogram).
def tensor_to_predictions(audio_tensor):
    # Yamnet yields (scores, embeddings, mel spectrogram); the scores are not needed
    _, embeddings, mel_spectrogram = yamnet_model(audio_tensor)

    # Feed the embeddings to the accent recognition model
    return model.predict(embeddings), mel_spectrogram
79
+
80
+
# Function that is called when the user clicks the "Predict" button. It does the following:
# - Converts whichever input was provided into an audio tensor
# - Calls tensor_to_predictions() to get the predictions
# - Generates the top scoring labels
# - Generates the top scoring plot
#
# recorded_audio_file: value of the microphone input (takes precedence when present)
# uploaded_audio_file: value of the upload input (path of a wav file)
# Returns [labels, plot]; both entries are None when no audio was provided.
def predict_accent(recorded_audio_file, uploaded_audio_file):
    # Transform input to tensor
    if recorded_audio_file:
        audio_tensor = mic_to_tensor(recorded_audio_file)
    elif uploaded_audio_file:
        audio_tensor = load_16k_audio_wav(uploaded_audio_file)
    else:
        # Nothing was recorded or uploaded: blank both outputs rather than
        # passing None to the wav loader, which would raise.
        return [None, None]

    # Model inference (the spectrogram is not used by this handler)
    predictions, _ = tensor_to_predictions(audio_tensor)

    # Average over axis 0 once to get a single score per class
    mean_predictions = predictions.mean(axis=0)

    # Generate Output 1 - Accents
    top_scoring_labels_output = {
        name: float(score) for name, score in zip(class_names, mean_predictions)
    }

    # Generate Output 2 - plot of the per-row predictions
    top_scoring_plot_output = generate_top_scoring_plot(predictions)

    return [top_scoring_labels_output, top_scoring_plot_output]
108
+
109
+
# Handler for the "Clear" button: returns one None per component to reset,
# i.e. four in total, which blanks all inputs and outputs.
def clear_inputs_and_outputs():
    return [None] * 4
113
+
114
+
# Function that generates the top scoring plot
# This function is copied from the tutorial and adjusted to our needs
# https://keras.io/examples/audio/uk_ireland_accent_recognition/ (tinyurl.com/4a8xn7at)
#
# predictions: 2-D array of model outputs; columns are indexed like class_names
# Returns a matplotlib Figure with one image row per class, best mean score on top.
def generate_top_scoring_plot(predictions):
    # Build the figure directly instead of through plt.figure(): pyplot keeps
    # every figure it creates until it is closed, so a handler called once per
    # request would pile figures up for the lifetime of the server.
    from matplotlib.figure import Figure

    # Order the classes by their mean score, highest first
    mean_predictions = np.mean(predictions, axis=0)
    top_class_indices = np.argsort(mean_predictions)[::-1]

    fig = Figure(figsize=(10, 2))
    ax = fig.subplots()
    ax.imshow(
        predictions[:, top_class_indices].T,
        aspect="auto",
        interpolation="nearest",
        cmap="gray_r",
    )

    # patch_padding = (PATCH_WINDOW_SECONDS / 2) / PATCH_HOP_SECONDS
    # NOTE(review): 0.025 and 0.01 look like Yamnet's STFT window/hop rather
    # than its patch window/hop - confirm against the model documentation.
    # The value is kept unchanged so the plot margins stay as they were.
    patch_padding = (0.025 / 2) / 0.01
    ax.set_xlim(-patch_padding - 0.5, predictions.shape[0] + patch_padding - 0.5)

    # Label every class, in the sorted order used for the image rows
    yticks = list(range(len(class_names)))
    ax.set_yticks(yticks)
    ax.set_yticklabels([class_names[top_class_indices[x]] for x in yticks])
    # Inverted limits keep the first (top-scoring) row at the top
    ax.set_ylim(len(class_names) - 0.5, -0.5)

    return fig
141
+
142
+
# Main function: builds the Gradio interface and launches it
if __name__ == "__main__":
    with gr.Blocks() as demo:
        # Page header and short description
        gr.Markdown(
            """
            <center><h1>English speaker accent recognition using Transfer Learning</h1></center> \
            This space is a demo of an English (precisely UK & Ireland) accent classification model using Keras.<br> \
            In this space, you can record your voice or upload a wav file and the model will predict the English accent spoken in the audio<br><br>
            """
        )
        with gr.Row():
            ## Input: microphone recording or uploaded file, plus the action buttons
            with gr.Column():
                mic_input = gr.Audio(source="microphone", label="Record your own voice")
                upl_input = gr.Audio(source="upload", type="filepath", label="Upload a wav file")

                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")

            ## Output: label scores and the per-time-slot plot
            with gr.Column():
                lbl_output = gr.Label(label="Top Predictions")
                with gr.Group():
                    gr.Markdown("<center>Prediction per time slot</center>")
                    plt_output = gr.Plot(label="Prediction per time slot", show_label=False)

        # "Clear" resets the two inputs and the two outputs
        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[mic_input, upl_input, lbl_output, plt_output],
        )
        # "Predict" runs inference on whichever input was provided
        prd_btn.click(
            fn=predict_accent,
            inputs=[mic_input, upl_input],
            outputs=[lbl_output, plt_output],
        )

    demo.launch(debug=True, share=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ numpy
2
+ matplotlib
3
+ tensorflow==2.8.2
4
+ tensorflow_io==0.25.0
5
+ tensorflow_hub==0.12.0