# ctc_asr / app.py
import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio
from huggingface_hub import from_pretrained_keras
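
# Load the pretrained CTC ASR model from the Hugging Face Hub;
# compile=False because the model is only used for inference here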
model = from_pretrained_keras("keras-io/ctc_asr", compile=False)
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
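
# Illustrative sanity check: with this configuration, char_to_num(["a", "b"])
# yields [1, 2] and num_to_char([1, 2]) yields ["a", "b"]; index 0 is the "" OOV slot.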
# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalar Tensor. The number of samples to step.
frame_step = 160
# An integer scalar Tensor. The size of the FFT to apply.
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384
# Target sample rate in Hz; LJSpeech audio is sampled at 22.05 kHz.
SAMPLE_RATE = 22050

def decode_batch_predictions(pred):
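    """Greedily decode CTC output frames back into text strings."""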
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search.
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and get back the text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text

def load_16k_audio_wav(filename):
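    """Read a wav file from disk, convert it to mono, and resample to SAMPLE_RATE."""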
    # Read file content
    file_content = tf.io.read_file(filename)
    # Decode the wav file into a mono float tensor
    audio_wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1)
    audio_wav = tf.squeeze(audio_wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    # Resample to SAMPLE_RATE (22.05 kHz, despite the "16k" in the function name)
    audio_wav = tfio.audio.resample(
        audio_wav, rate_in=sample_rate, rate_out=SAMPLE_RATE
    )
    return audio_wav

def mic_to_tensor(recorded_audio_file):
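    """Convert a Gradio microphone tuple (sample_rate, samples) into a
    peak-normalized mono float tensor resampled to SAMPLE_RATE."""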
    sample_rate, audio = recorded_audio_file
    audio_wav = tf.constant(audio, dtype=tf.float32)
    # Mix down to mono if the recording has more than one channel
    if tf.rank(audio_wav) > 1:
        audio_wav = tf.reduce_mean(audio_wav, axis=1)
    audio_wav = tfio.audio.resample(
        audio_wav, rate_in=sample_rate, rate_out=SAMPLE_RATE
    )
    # Peak-normalize to [-1, 1]; the epsilon guards against an all-zero recording
    audio_wav = tf.divide(audio_wav, tf.reduce_max(tf.abs(audio_wav)) + 1e-9)
    return audio_wav

def tensor_to_predictions(audio_tensor):
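    """Turn a waveform tensor into a normalized magnitude spectrogram,
    run the model on it, and return the decoded transcriptions."""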
    # 1. Cast the audio to float32
    audio_tensor = tf.cast(audio_tensor, tf.float32)
    # 2. Get the spectrogram via the short-time Fourier transform
    spectrogram = tf.signal.stft(
        audio_tensor,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )
    # 3. We only need the magnitude, which can be derived by applying tf.abs;
    #    the square root compresses the dynamic range
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # 4. Normalize each frame to zero mean and unit variance
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # Add a batch dimension, run the model, and decode the predictions
    spectrogram = tf.expand_dims(spectrogram, axis=0)
    batch_predictions = model.predict(spectrogram)
    batch_predictions = decode_batch_predictions(batch_predictions)
    return batch_predictions

def clear_inputs_and_outputs():
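    """Reset the two audio inputs and the text output."""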
    return [None, None, None]

def predict(recorded_audio_file, uploaded_audio_file):
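    """Transcribe either the recorded clip or the uploaded wav file."""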
    # Prefer the microphone recording; fall back to the uploaded wav file
    if recorded_audio_file:
        audio_tensor = mic_to_tensor(recorded_audio_file)
    else:
        audio_tensor = load_16k_audio_wav(uploaded_audio_file)
    prediction = tensor_to_predictions(audio_tensor)[0]
    return prediction

# Entry point: build and launch the Gradio demo
if __name__ == "__main__":
    demo = gr.Blocks()
    with demo:
        gr.Markdown(
            """
            <center><h1>Automatic Speech Recognition using CTC</h1></center>
            This space demos Automatic Speech Recognition with a Keras model trained on the LJSpeech dataset.<br>
            Record your voice or upload a wav file, and the model will transcribe the English speech.<br><br>
            """
        )
        with gr.Row():
            # Inputs
            with gr.Column():
                mic_input = gr.Audio(source="microphone", label="Record your own voice")
                upl_input = gr.Audio(
                    source="upload", type="filepath", label="Upload a wav file"
                )
                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")
            # Outputs
            with gr.Column():
                lbl_output = gr.Label(label="Text")
        # Credits
        with gr.Row():
            gr.Markdown(
                """
                <h4>Credits</h4>
                Author: <a href="https://twitter.com/anuragcomm">Anurag Singh</a>.<br>
                Based on the Keras example <a href="https://keras.io/examples/audio/ctc_asr">Automatic Speech Recognition using CTC</a> by <a href="https://rbouadjenek.github.io/">Mohamed Reda Bouadjenek</a> and <a href="https://www.linkedin.com/in/parkerhuynh/">Ngoc Dung Huynh</a>.<br>
                Check out the model <a href="https://huggingface.co/keras-io/ctc_asr">here</a>.
                """
            )
        # Wire the buttons to their callbacks
        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[mic_input, upl_input, lbl_output],
        )
        prd_btn.click(
            fn=predict,
            inputs=[mic_input, upl_input],
            outputs=[lbl_output],
        )

    demo.launch(debug=True)