|
from deepspeech import Model |
|
import gradio as gr |
|
import numpy as np |
|
import urllib.request |
|
import wave |
|
import subprocess |
|
import sys |
|
import shlex |
|
from shlex import quote |
|
|
|
# File names of the DeepSpeech 0.9.3 acoustic model and external scorer; they
# are appended to `url` to form the download location and are also used as
# the local file names.
model_file_path = "deepspeech-0.9.3-models.pbmm"
lm_file_path = "deepspeech-0.9.3-models.scorer"
url = "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/"


def _fetch_if_missing(file_name):
    """Download ``url + file_name`` to ``file_name`` unless it already exists.

    The data is first written to ``<file_name>.part`` and renamed once the
    transfer has finished, so an interrupted download never leaves a
    truncated file under the final name.
    """
    import os  # function-scope import keeps this block self-contained

    if os.path.exists(file_name):
        return
    partial_name = file_name + ".part"
    urllib.request.urlretrieve(url + file_name, filename=partial_name)
    os.replace(partial_name, file_name)


# The release assets are versioned, so an existing local copy is reused
# instead of being downloaded again on every start.
_fetch_if_missing(model_file_path)
_fetch_if_missing(lm_file_path)

# Decoder settings handed to the model below.
beam_width = 100
lm_alpha = 0.93
lm_beta = 1.18

# Load the acoustic model, attach the external scorer, and apply the
# decoder settings.
model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)
|
|
|
|
|
def convert_samplerate(audio_path, desired_sample_rate):
    """Convert an audio file to raw mono 16-bit PCM at a given rate via SoX.

    Args:
        audio_path: Path of the audio file handed to the ``sox`` executable.
        desired_sample_rate: Target sample rate passed to ``sox --rate``.

    Returns:
        A ``(desired_sample_rate, samples)`` tuple, where ``samples`` is a
        ``numpy.int16`` array built from the raw bytes SoX writes to stdout.

    Raises:
        RuntimeError: If SoX exits with a non-zero status; the message
            contains SoX's decoded stderr output.
        OSError: If the ``sox`` executable could not be started.
    """
    # Pass the arguments as a list so no shell quoting/splitting round-trip
    # is needed; the trailing '-' makes SoX write the result to stdout.
    sox_args = [
        'sox', audio_path,
        '--type', 'raw',
        '--bits', '16',
        '--channels', '1',
        '--rate', str(desired_sample_rate),
        '--encoding', 'signed-integer',
        '--endian', 'little',
        '--compression', '0.0',
        '--no-dither',
        '-',
    ]
    try:
        output = subprocess.check_output(sox_args, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        # stderr is captured as bytes; decode it so the message is readable
        # instead of showing a b'...' repr.
        sox_stderr = (e.stderr or b'').decode(errors='replace').strip()
        raise RuntimeError('SoX returned non-zero status: {}'.format(sox_stderr)) from e
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(desired_sample_rate, e.strerror)) from e

    return desired_sample_rate, np.frombuffer(output, np.int16)
|
|
|
|
|
|
|
def transcribe(audio_file):
    """Return the transcript the loaded model produces for a WAV file.

    Args:
        audio_file: Path of the WAV file to transcribe. ``None`` or an empty
            value (nothing was supplied) yields an empty transcript.

    Returns:
        The text returned by ``model.stt`` for the file's samples, or ``""``
        when no file was supplied.

    Raises:
        RuntimeError, OSError: Propagated from ``convert_samplerate`` when
            the file has to be resampled and SoX fails or is missing.
    """
    if not audio_file:
        # Nothing to transcribe; avoid failing inside wave.open.
        return ""

    desired_sample_rate = model.sampleRate()

    # The context manager closes the file even if reading raises.
    with wave.open(audio_file, 'rb') as fin:
        fs_orig = fin.getframerate()
        frames = None
        if fs_orig == desired_sample_rate:
            frames = fin.readframes(fin.getnframes())
        # NOTE(review): channel count and sample width are not checked here;
        # the frames are interpreted as int16 samples as-is.

    if frames is None:
        # Sample rate differs from what the model expects: resample via SoX.
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        _, audio = convert_samplerate(audio_file, desired_sample_rate)
    else:
        audio = np.frombuffer(frames, np.int16)

    return model.stt(audio)
|
|
|
|
|
# Web UI: one uploaded audio file (delivered to `transcribe` as a file path)
# in, the resulting transcript shown in a text box.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(label="Upload Audio File", source="upload", type="filepath"),
    outputs=gr.Textbox(label="Transcript"),
)

# Start the interface only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()