Spaces:
Runtime error
Runtime error
File size: 2,786 Bytes
4a43745 e80f2b5 4a43745 5058a94 b002b3d 4a43745 a7d3f45 4a43745 e80f2b5 4a43745 5058a94 b002b3d 4e11af0 4a43745 d722032 887f6b2 4a43745 887f6b2 4a43745 e80f2b5 887f6b2 4a43745 4db4dc7 4a43745 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
'''Librispeech 100h English ASR demo
@ML2 --> @HuggingFace
2022-02-23 jkang first created
'''
import os
from difflib import Differ
from glob import glob
from loguru import logger
# import librosa
import wavio
import gradio as gr
from espnet2.bin.asr_inference import Speech2Text
# ---------- Settings ----------
# '-1' hides all GPUs from CUDA, forcing CPU inference (typical for a free Space).
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'
# SERVER_PORT = 42208
# SERVER_NAME = "0.0.0.0"
MODEL_DIR = './model'
EXAMPLE_DIR = './examples'
# Collect example wav files; Gradio expects one [input] list per example row.
examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))
examples = [[example] for example in examples]
# Differ instance for the (currently disabled) transcript comparison output.
d = Differ()
# ---------- Logging ----------
# Append to a persistent log file across restarts.
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')
# ---------- Model ----------
logger.info('download model')
logger.info('model downloaded')
# Download (or load from cache) both pretrained ESPnet2 models from the HF Hub:
# one trained with word tokens, one with BPE tokens, both on Librispeech clean-100h.
model_word = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer_word")
model_bpe = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer")
logger.info('model loaded')
def predict(wav_file):
    '''Transcribe a wav file with both ASR models.

    Args:
        wav_file: path to a wav file (Gradio Audio input, type='filepath').

    Returns:
        Tuple of (word-token transcript, BPE-token transcript).
    '''
    logger.info('wav file loaded')
    # Read samples and sampling rate from disk.
    # NOTE(review): wavio returns integer samples shaped (n, channels) —
    # presumably the models accept this as-is; confirm for stereo inputs.
    recording = wavio.read(wav_file)
    samples = recording.data
    sample_rate = recording.rate
    # Each model returns n-best hypotheses; take the top one (index 0),
    # whose first element is the transcript text.
    top_word_hyp = model_word(samples)[0]
    top_bpe_hyp = model_bpe(samples)[0]
    logger.info('predicted')
    return top_word_hyp[0], top_bpe_hyp[0]
# Build the Gradio UI: one microphone/file audio input, two text outputs
# (one transcript per model). NOTE(review): gr.inputs / gr.outputs is the
# pre-3.x Gradio namespace, deprecated in newer releases — keep the gradio
# version pinned, or migrate to gr.Audio / gr.Textbox when upgrading.
iface = gr.Interface(
    predict,
    title='Comparison between word vs BPE tokens based on ESPNet2 ASR models',
    description='Two models were trained on Librispeech (clean-100h)',
    inputs=[
        # type='filepath' hands predict() a path on disk rather than raw samples.
        gr.inputs.Audio(label='wav file', source='microphone', type='filepath')
    ],
    outputs=[
        gr.outputs.Textbox(label='Decoding result (word-token model)'),
        gr.outputs.Textbox(label='Decoding result (BPE-token model)'),
        # gr.outputs.Textbox(label='Comparison'),
    ],
    examples=examples,
    examples_per_page=5,
    # article='<p style="text-align:center">Model URL<a target="_blank" href="https://huggingface.co/jkang/espnet2_librispeech_100_conformer">🤗</a></p>',
)
if __name__ == '__main__':
    try:
        # Queueing keeps long-running ASR requests from hitting HTTP timeouts.
        iface.launch(debug=True, enable_queue=True)
    except KeyboardInterrupt as interrupt:
        print(interrupt)
    finally:
        # Always release the server, even after Ctrl-C.
        iface.close()