File size: 2,786 Bytes
4a43745
 
 
 
 
 
 
e80f2b5
4a43745
 
5058a94
b002b3d
4a43745
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a7d3f45
4a43745
e80f2b5
 
4a43745
 
 
 
 
 
 
 
 
 
 
 
 
 
5058a94
b002b3d
 
 
 
4e11af0
 
 
4a43745
 
 
d722032
 
887f6b2
4a43745
 
887f6b2
 
4a43745
 
 
 
 
 
 
 
 
e80f2b5
 
887f6b2
4a43745
 
4db4dc7
4a43745
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
'''Librispeech 100h English ASR demo
@ML2 --> @HuggingFace

2022-02-23 jkang first created
'''

import os
from difflib import Differ
from glob import glob
from loguru import logger
# import librosa
import wavio
import gradio as gr

from espnet2.bin.asr_inference import Speech2Text

# ---------- Settings ----------
# Select compute device via CUDA_VISIBLE_DEVICES: '-1' hides all GPUs,
# which forces CPU inference.
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'

# SERVER_PORT = 42208
# SERVER_NAME = "0.0.0.0"

# Local directory for model files (presumably the HF cache target — confirm).
MODEL_DIR = './model'

# Collect demo clips; Gradio expects a list of per-example input lists,
# hence the extra [example] wrapping.
EXAMPLE_DIR = './examples'
examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))
examples = [[example] for example in examples]

# Text differ, kept for the commented-out word-vs-BPE comparison output below.
d = Differ()

# ---------- Logging ----------
# Append (mode='a') to app.log so logs survive restarts.
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')

# ---------- Model ----------
# Download (on first run) and load both pretrained ASR models from the
# Hugging Face Hub. NOTE: the original logged "model downloaded" before
# `from_pretrained` even ran; the log calls now bracket the actual work.
logger.info('downloading/loading models')
model_word = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer_word")
model_bpe = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer")
logger.info('models loaded')

def predict(wav_file):
    """Transcribe a wav file with both ASR models.

    Args:
        wav_file: Path to a wav file on disk (Gradio passes a filepath
            because the Audio input uses type='filepath').

    Returns:
        tuple: (word-token decoding, BPE-token decoding), both strings.
    """
    logger.info('wav file loaded')

    # Load raw PCM samples. NOTE(review): wavio returns integer data of
    # shape (n_samples, n_channels) and no resampling is done — this
    # assumes mono 16-kHz input matching the models' training rate; confirm.
    wav = wavio.read(wav_file)
    speech = wav.data

    # Run inference: each model returns n-best hypotheses as
    # (text, tokens, ...) tuples — take the 1-best hypothesis text.
    word_decoded = model_word(speech)[0][0]
    bpe_decoded = model_bpe(speech)[0][0]

    logger.info('predicted')
    return word_decoded, bpe_decoded

# Build the Gradio UI: one microphone audio input, two text outputs so the
# word-token and BPE-token decodings can be compared side by side.
iface = gr.Interface(
    predict,
    title='Comparison between word vs BPE tokens based on ESPNet2 ASR models',
    description='Two models were trained on Librispeech (clean-100h)',
    inputs=[
        # type='filepath' hands predict() a path on disk rather than raw samples.
        gr.inputs.Audio(label='wav file', source='microphone', type='filepath')
    ],
    outputs=[
        gr.outputs.Textbox(label='Decoding result (word-token model)'),
        gr.outputs.Textbox(label='Decoding result (BPE-token model)'),
        # gr.outputs.Textbox(label='Comparison'),
    ],
    examples=examples,
    examples_per_page=5,
    # article='<p style="text-align:center">Model URL<a target="_blank" href="https://huggingface.co/jkang/espnet2_librispeech_100_conformer">🤗</a></p>',
)

if __name__ == '__main__':
    try:
        iface.launch(debug=True,
                    #  server_name=SERVER_NAME,
                    #  server_port=SERVER_PORT,
                     enable_queue=True,
                     )
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop the demo. The original printed
        # the exception (an empty message); log it instead so the shutdown
        # is recorded in app.log like every other event.
        logger.info('interrupted by user')
    finally:
        # Always release the server, even if launch() raised.
        iface.close()