import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio
from huggingface_hub import from_pretrained_keras
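
# Load the pretrained Keras CTC ASR model from the Hugging Face Hub (compile=False: inference only)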
model = from_pretrained_keras("keras-io/ctc_asr", compile=False)
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
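
# Illustrative round trip: char_to_num(["c", "a", "t"]) returns the integer ids of those
# characters, and num_to_char maps the ids back to the characters "c", "a", "t".
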
# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalar Tensor. The number of samples to step.
frame_step = 160
# An integer scalar Tensor. The size of the FFT to apply.
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384
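
# LJSpeech recordings are sampled at 22.05 kHz, so all audio is resampled to this rate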
SAMPLE_RATE = 22050

def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search.
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and get back the text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text

def load_16k_audio_wav(filename):
    # Read file content
    file_content = tf.io.read_file(filename)
    # Decode the wav file into a mono waveform
    audio_wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1)
    audio_wav = tf.squeeze(audio_wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    # Resample to SAMPLE_RATE
    audio_wav = tfio.audio.resample(
        audio_wav, rate_in=sample_rate, rate_out=SAMPLE_RATE
    )
    return audio_wav

def mic_to_tensor(recorded_audio_file):
    # Gradio's microphone component returns a (sample_rate, numpy array) tuple
    sample_rate, audio = recorded_audio_file
    audio_wav = tf.constant(audio, dtype=tf.float32)
    # Mix multi-channel recordings down to mono
    if tf.rank(audio_wav) > 1:
        audio_wav = tf.reduce_mean(audio_wav, axis=1)
    # Resample to SAMPLE_RATE
    audio_wav = tfio.audio.resample(
        audio_wav, rate_in=sample_rate, rate_out=SAMPLE_RATE
    )
    # Normalise the waveform to the [-1, 1] range
    audio_wav = tf.divide(audio_wav, tf.reduce_max(tf.abs(audio_wav)))
    return audio_wav

def tensor_to_predictions(audio_tensor):
    # Make sure the waveform is float32
    audio_tensor = tf.cast(audio_tensor, tf.float32)
    # Get the spectrogram via the short-time Fourier transform
    spectrogram = tf.signal.stft(
        audio_tensor,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )
    # We only need the magnitude, which can be derived by applying tf.abs
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # Normalise each frame to zero mean and unit variance
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # Add a batch dimension, run the model, and decode the CTC output to text
    spectrogram = tf.expand_dims(spectrogram, axis=0)
    batch_predictions = model.predict(spectrogram)
    batch_predictions = decode_batch_predictions(batch_predictions)
    return batch_predictions

def clear_inputs_and_outputs():
    return [None, None, None]

def predict(recorded_audio_file, uploaded_audio_file):
    # Prefer the microphone recording if present, otherwise fall back to the uploaded wav file
    if recorded_audio_file:
        audio_tensor = mic_to_tensor(recorded_audio_file)
    else:
        audio_tensor = load_16k_audio_wav(uploaded_audio_file)
    prediction = tensor_to_predictions(audio_tensor)[0]
    return prediction

# gr.Interface(
# infer,
# inputs=gr.Audio(source="microphone", type="filepath"),
# outputs=gr.Textbox(lines=5, label="Input Text"),
# #title=title,
# #description=description,
# #article=article,
# #examples=examples,
# enable_queue=True,
# ).launch(debug=True)

# Build and launch the Gradio demo
if __name__ == "__main__":
    demo = gr.Blocks()
    with demo:
        gr.Markdown(
            """
            <center><h1>Automatic Speech Recognition using CTC</h1></center> \
            This space is a demo of Automatic Speech Recognition using Keras trained on the LJSpeech dataset.<br> \
            In this space, you can record your voice or upload a wav file, and the model will predict the words spoken in English.<br><br>
            """
        )
        with gr.Row():
            # Inputs
            with gr.Column():
                mic_input = gr.Audio(source="microphone", label="Record your own voice")
                upl_input = gr.Audio(
                    source="upload", type="filepath", label="Upload a wav file"
                )
                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")
            # Outputs
            with gr.Column():
                lbl_output = gr.Label(label="Text")
        # Credits
        with gr.Row():
            gr.Markdown(
                """
                <h4>Credits</h4>
                Author: <a href="https://twitter.com/anuragcomm">Anurag Singh</a>.<br>
                Based on the Keras example <a href="https://keras.io/examples/audio/ctc_asr">Automatic Speech Recognition using CTC</a> by <a href="https://rbouadjenek.github.io/">Mohamed Reda Bouadjenek</a> and <a href="https://www.linkedin.com/in/parkerhuynh/">Ngoc Dung Huynh</a>.<br>
                Check out the model <a href="https://huggingface.co/keras-io/ctc_asr">here</a>.
                """
            )

        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[mic_input, upl_input, lbl_output],
        )
        prd_btn.click(
            fn=predict,
            inputs=[mic_input, upl_input],
            outputs=[lbl_output],
        )

    demo.launch(debug=True)