jaekookang committed on
Commit
49041a5
•
1 Parent(s): a07e2df

first upload

Browse files
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ *~
2
+ __pycache__
3
+ *.log
4
+ *.db
5
+ *.nohup
6
+ .vscode
7
+ keyble_ssl/*
8
+ README_github.md
examples/gentleman.wav ADDED
Binary file (153 kB). View file
 
examples/jaekoo_numbers.wav ADDED
Binary file (218 kB). View file
 
examples/maybe_next_time.wav ADDED
Binary file (25.7 kB). View file
 
examples/old_oily_rag.wav ADDED
Binary file (67.8 kB). View file
 
gradio_asr_en_libri100.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''Librispeech 100h English ASR demo
2
+
3
+ @ML2
4
+
5
+ 2022-02-11
6
+ '''
7
+
8
+ import os
9
+ from glob import glob
10
+ from loguru import logger
11
+ import soundfile as sf
12
+ import gradio as gr
13
+
14
+ from espnet_model_zoo.downloader import ModelDownloader
15
+ from espnet2.bin.asr_inference import Speech2Text
16
+
17
+
18
+ # ---------- Settings ----------
19
+ GPU_ID = '-1'
20
+ os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
21
+ DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'
22
+
23
+ SERVER_PORT = 42208
24
+ SERVER_NAME = "0.0.0.0"
25
+
26
+ SSL_DIR = './keyble_ssl'
27
+ MODEL_DIR = '/home/jkang/HDD4T/jkang/huggingface'
28
+
29
+ EXAMPLE_DIR = './examples'
30
+ examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))
31
+
32
+ # ---------- Logging ----------
33
+ logger.add('app.log', mode='a')
34
+ logger.info('============================= App restarted =============================')
35
+
36
+ # ---------- Model ----------
37
+ logger.info('download model')
38
+ d = ModelDownloader(MODEL_DIR)
39
+ out = d.download_and_unpack("jkang/espnet2_librispeech_100_conformer")
40
+ logger.info('model downloaded')
41
+ model = Speech2Text.from_pretrained(
42
+ asr_train_config=out['asr_train_config'],
43
+ asr_model_file=out['asr_model_file']
44
+ )
45
+ logger.info('model loaded')
46
+
47
+ def predict(wav_file):
48
+ logger.info('wav file loaded')
49
+ speech, rate = sf.read(wav_file)
50
+ nbests = model(speech)
51
+ text, *_ = nbests[0]
52
+ logger.info('predicted')
53
+ return text
54
+
55
+ iface = gr.Interface(
56
+ predict,
57
+ title='영어 음성인식 데모 (espnet libri100) -- 프로토타입',
58
+ description='영어 음성 파일을 업로드하면 텍스트 내용을 결과로 보여줍니다.',
59
+ inputs=[
60
+ gr.inputs.Audio(label='영어 음성', source='upload', type='filepath')
61
+ ],
62
+ outputs=[
63
+ gr.outputs.Textbox(label='음성 인식 디코딩결과'),
64
+ ],
65
+ examples=examples,
66
+ article='<p style="text-align:center">i-Scream AI</p>',
67
+ )
68
+
69
+ if __name__ == '__main__':
70
+ try:
71
+ iface.launch(debug=True,
72
+ server_name=SERVER_NAME,
73
+ server_port=SERVER_PORT,
74
+ enable_queue=True,
75
+ # ssl_keyfile=SSL_DIR,
76
+ # ssl_certfile=SSL_DIR
77
+ )
78
+ except KeyboardInterrupt as e:
79
+ print(e)
80
+
81
+ finally:
82
+ iface.close()