csukuangfj committed
Commit 3330d20 · 1 Parent(s): 3761eac

first commit

Files changed (4)
  1. README.md +6 -5
  2. app.py +290 -0
  3. model.py +126 -0
  4. requirements.txt +6 -0
README.md CHANGED
@@ -1,10 +1,11 @@
---
- title: Automatic Speech Recognition With Whisper
- emoji: 👀
- colorFrom: purple
- colorTo: purple
+ title: Automatic Speech Recognition
+ emoji: 🌖
+ colorFrom: yellow
+ colorTo: green
sdk: gradio
- sdk_version: 3.40.1
+ python_version: 3.8.9
+ sdk_version: 3.0.26
app_file: app.py
pinned: false
license: apache-2.0
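
Applied, the front matter that configures this Space becomes:

---
title: Automatic Speech Recognition
emoji: 🌖
colorFrom: yellow
colorTo: green
sdk: gradio
python_version: 3.8.9
sdk_version: 3.0.26
app_file: app.py
pinned: false
license: apache-2.0
---

These keys tell Hugging Face Spaces to serve `app.py` as a Gradio app with the pinned Python and SDK versions.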
app.py ADDED
@@ -0,0 +1,290 @@
#!/usr/bin/env python3
#
# Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# References:
# https://gradio.app/docs/#dropdown

import logging
import os
import tempfile
import time
import urllib.request
from datetime import datetime

import gradio as gr
import soundfile as sf

from model import decode, get_pretrained_model, whisper_models


def convert_to_wav(in_filename: str) -> str:
    """Convert the input audio file to a 16 kHz, single-channel wave file."""
    out_filename = in_filename + ".wav"
    logging.info(f"Converting '{in_filename}' to '{out_filename}'")

    _ = os.system(
        f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 -ac 1 '{out_filename}'"
    )

    return out_filename
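
# For instance, for a hypothetical upload "/tmp/abc.mp3" the call runs:
#   ffmpeg -hide_banner -i '/tmp/abc.mp3' -ar 16000 -ac 1 '/tmp/abc.mp3.wav'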


def build_html_output(s: str, style: str = "result_item_success"):
    return f"""
    <div class='result'>
        <div class='result_item {style}'>
        {s}
        </div>
    </div>
    """


def process_url(
    repo_id: str,
    url: str,
):
    logging.info(f"Processing URL: {url}")
    with tempfile.NamedTemporaryFile() as f:
        try:
            urllib.request.urlretrieve(url, f.name)

            return process(
                in_filename=f.name,
                repo_id=repo_id,
            )
        except Exception as e:
            logging.info(str(e))
            return "", build_html_output(str(e), "result_item_error")


def process_uploaded_file(
    repo_id: str,
    in_filename: str,
):
    if in_filename is None or in_filename == "":
        return "", build_html_output(
            "Please first upload a file and then click "
            'the button "submit for recognition"',
            "result_item_error",
        )

    logging.info(f"Processing uploaded file: {in_filename}")
    try:
        return process(
            in_filename=in_filename,
            repo_id=repo_id,
        )
    except Exception as e:
        logging.info(str(e))
        return "", build_html_output(str(e), "result_item_error")


def process_microphone(
    repo_id: str,
    in_filename: str,
):
    if in_filename is None or in_filename == "":
        return "", build_html_output(
            "Please first click 'Record from microphone', speak, "
            "click 'Stop recording', and then "
            "click the button 'submit for recognition'",
            "result_item_error",
        )

    logging.info(f"Processing microphone: {in_filename}")
    try:
        return process(
            in_filename=in_filename,
            repo_id=repo_id,
        )
    except Exception as e:
        logging.info(str(e))
        return "", build_html_output(str(e), "result_item_error")


def process(
    repo_id: str,
    in_filename: str,
):
    logging.info(f"repo_id: {repo_id}")
    logging.info(f"in_filename: {in_filename}")

    filename = convert_to_wav(in_filename)

    now = datetime.now()
    date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
    logging.info(f"Started at {date_time}")

    start = time.time()

    recognizer = get_pretrained_model(
        repo_id,
    )

    text = decode(recognizer, filename)

    date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
    end = time.time()

    # sf.info() reports the duration of the wave file in seconds.
    info = sf.info(filename)
    duration = info.duration

    elapsed = end - start
    rtf = elapsed / duration

    logging.info(f"Finished at {date_time}. Elapsed: {elapsed:.3f} s")

    info = f"""
    Wave duration  : {duration:.3f} s <br/>
    Processing time: {elapsed:.3f} s <br/>
    RTF: {elapsed:.3f}/{duration:.3f} = {rtf:.3f} <br/>
    """
    if rtf > 1:
        info += (
            "<br/>We are loading the model for the first run. "
            "Please run again to measure the real RTF.<br/>"
        )

    logging.info(info)
    logging.info(f"\nrepo_id: {repo_id}\nhyp: {text}")

    return text, build_html_output(info)
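
# Real-time factor (RTF) = processing time / audio duration. For example,
# taking 2.5 s to decode a 10 s recording gives RTF = 2.5 / 10 = 0.25;
# values below 1 mean the model runs faster than real time.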


title = "# Automatic Speech Recognition with Next-gen Kaldi using Whisper models"
description = """
This space shows how to do automatic speech recognition with Next-gen Kaldi
using Whisper models.

It is running on CPU within a docker container provided by Hugging Face.

For more information, please visit the following links:

- <https://github.com/k2-fsa/sherpa-onnx>

If you want to deploy it locally, please see
<https://k2-fsa.github.io/sherpa/>
"""

# css style is copied from
# https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
css = """
.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
"""


demo = gr.Blocks(css=css)


with demo:
    gr.Markdown(title)
    model_choices = list(whisper_models.keys())

    model_dropdown = gr.Dropdown(
        choices=model_choices,
        label="Select a model",
        value=model_choices[0],
    )

    with gr.Tabs():
        with gr.TabItem("Upload from disk"):
            uploaded_file = gr.Audio(
                source="upload",  # Choose between "microphone", "upload"
                type="filepath",
                optional=False,
                label="Upload from disk",
            )
            upload_button = gr.Button("Submit for recognition")
            uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
            uploaded_html_info = gr.HTML(label="Info")

        with gr.TabItem("Record from microphone"):
            microphone = gr.Audio(
                source="microphone",  # Choose between "microphone", "upload"
                type="filepath",
                optional=False,
                label="Record from microphone",
            )

            record_button = gr.Button("Submit for recognition")
            recorded_output = gr.Textbox(label="Recognized speech from recordings")
            recorded_html_info = gr.HTML(label="Info")

        with gr.TabItem("From URL"):
            url_textbox = gr.Textbox(
                max_lines=1,
                placeholder="URL to an audio file",
                label="URL",
                interactive=True,
            )

            url_button = gr.Button("Submit for recognition")
            url_output = gr.Textbox(label="Recognized speech from URL")
            url_html_info = gr.HTML(label="Info")

    # Each click handler receives the selected model name first, then the
    # audio source; it returns the recognized text and an HTML info panel.
    upload_button.click(
        process_uploaded_file,
        inputs=[
            model_dropdown,
            uploaded_file,
        ],
        outputs=[uploaded_output, uploaded_html_info],
    )

    record_button.click(
        process_microphone,
        inputs=[
            model_dropdown,
            microphone,
        ],
        outputs=[recorded_output, recorded_html_info],
    )

    url_button.click(
        process_url,
        inputs=[
            model_dropdown,
            url_textbox,
        ],
        outputs=[url_output, url_html_info],
    )

    gr.Markdown(description)

if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)

    demo.launch()
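
# A local run is a short sketch (assuming ffmpeg is on PATH and gradio is
# installed alongside the packages in requirements.txt):
#
#   pip install -r requirements.txt gradio
#   python app.py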
model.py ADDED
@@ -0,0 +1,126 @@
# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import wave
from functools import lru_cache
from typing import Tuple

import numpy as np
import sherpa_onnx
from huggingface_hub import hf_hub_download

sample_rate = 16000


def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
    """
    Args:
      wave_filename:
        Path to a wave file. It should be single channel and each sample
        should be 16-bit. Its sample rate does not need to be 16 kHz.
    Returns:
      Return a tuple containing:
       - A 1-D array of dtype np.float32 containing the samples, which are
         normalized to the range [-1, 1].
       - The sample rate of the wave file.
    """

    with wave.open(wave_filename) as f:
        assert f.getnchannels() == 1, f.getnchannels()
        assert f.getsampwidth() == 2, f.getsampwidth()  # it is in bytes
        num_samples = f.getnframes()
        samples = f.readframes(num_samples)
        samples_int16 = np.frombuffer(samples, dtype=np.int16)
        samples_float32 = samples_int16.astype(np.float32)

        # int16 covers [-32768, 32767], so dividing by 32768 maps the
        # samples into [-1, 1).
        samples_float32 = samples_float32 / 32768
        return samples_float32, f.getframerate()
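
# For example, the int16 sample value 16384 is normalized to
# 16384 / 32768 = 0.5, and -32768 to -1.0.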


def decode(
    recognizer: sherpa_onnx.OfflineRecognizer,
    filename: str,
) -> str:
    s = recognizer.create_stream()
    samples, sample_rate = read_wave(filename)
    s.accept_waveform(sample_rate, samples)
    recognizer.decode_stream(s)

    return s.result.text.lower()
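
# A minimal usage sketch (the wave file path below is hypothetical):
#
#   recognizer = get_pretrained_model("tiny.en")
#   print(decode(recognizer, "/path/to/mono-16bit.wav"))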


def _get_nn_model_filename(
    repo_id: str,
    filename: str,
    subfolder: str = ".",
) -> str:
    nn_model_filename = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
    return nn_model_filename


def _get_token_filename(
    repo_id: str,
    filename: str,
    subfolder: str = ".",
) -> str:
    token_filename = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
    return token_filename


@lru_cache(maxsize=8)  # cache recognizers so repeated requests skip reloading
def get_pretrained_model(name: str) -> sherpa_onnx.OfflineRecognizer:
    assert name in ("tiny.en", "base.en", "small.en", "tiny", "base", "small"), name
    full_repo_id = "csukuangfj/sherpa-onnx-whisper-" + name
    encoder = _get_nn_model_filename(
        repo_id=full_repo_id,
        filename=f"{name}-encoder.int8.ort",
    )

    decoder = _get_nn_model_filename(
        repo_id=full_repo_id,
        filename=f"{name}-decoder.int8.ort",
    )

    tokens = _get_token_filename(repo_id=full_repo_id, filename=f"{name}-tokens.txt")

    recognizer = sherpa_onnx.OfflineRecognizer.from_whisper(
        encoder=encoder,
        decoder=decoder,
        tokens=tokens,
        num_threads=2,
    )

    return recognizer
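
# For example, get_pretrained_model("tiny.en") fetches
# tiny.en-encoder.int8.ort, tiny.en-decoder.int8.ort, and tiny.en-tokens.txt
# from the Hugging Face repo csukuangfj/sherpa-onnx-whisper-tiny.en.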


# app.py uses only the keys of this dict (as dropdown choices); every value
# is the same factory function.
whisper_models = {
    "tiny.en": get_pretrained_model,
    "base.en": get_pretrained_model,
    "small.en": get_pretrained_model,
    "tiny": get_pretrained_model,
    "base": get_pretrained_model,
    "small": get_pretrained_model,
}
requirements.txt ADDED
@@ -0,0 +1,6 @@
soundfile
sentencepiece>=0.1.96
numpy

huggingface_hub
sherpa-onnx>=1.7.7
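
Note that gradio itself is not pinned here: on Hugging Face Spaces the SDK is installed automatically from the `sdk_version` field in the README front matter, so only the extra runtime dependencies need to be listed.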