Spaces:

IhebJettPilot
/

meetingsum

Build error

App Files Files Community

IhebJettPilot committed on Aug 31, 2023

Commit

c293cf4

1 Parent(s): 49280c9

Upload 5 files

Browse files

Files changed (5) hide show

app.py +385 -0
decode.py +121 -0
giga-tokens.txt +500 -0
model.py +1001 -0
requirements (1).txt +12 -0

app.py ADDED Viewed

	@@ -0,0 +1,385 @@

+#!/usr/bin/env python3
+#
+# Copyright      2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# References:
+# https://gradio.app/docs/#dropdown
+import logging
+import os
+import tempfile
+import time
+from datetime import datetime
+import gradio as gr
+import torch
+import torchaudio
+import urllib.request
+from examples import examples
+from model import decode, get_pretrained_model, language_to_models, sample_rate
+# Names of all supported languages, i.e., the keys of language_to_models.
+languages = list(language_to_models.keys())
+def convert_to_wav(in_filename: str) -> str:
+    """Convert the input audio file to a wave file"""
+    out_filename = in_filename + ".wav"
+    logging.info(f"Converting '{in_filename}' to '{out_filename}'")
+    _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 '{out_filename}'")
+    _ = os.system(
+        f"ffmpeg -hide_banner -loglevel error -i '{in_filename}' -ar 16000 '{out_filename}.flac'"
+    )
+    return out_filename
+def build_html_output(s: str, style: str = "result_item_success"):
+    return f"""
+    <div class='result'>
+        <div class='result_item {style}'>
+          {s}
+        </div>
+    </div>
+    """
+def process_url(
+    language: str,
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+    url: str,
+):
+    logging.info(f"Processing URL: {url}")
+    with tempfile.NamedTemporaryFile() as f:
+        try:
+            urllib.request.urlretrieve(url, f.name)
+            return process(
+                in_filename=f.name,
+                language=language,
+                repo_id=repo_id,
+                decoding_method=decoding_method,
+                num_active_paths=num_active_paths,
+            )
+        except Exception as e:
+            logging.info(str(e))
+            return "", build_html_output(str(e), "result_item_error")
+def process_uploaded_file(
+    language: str,
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+    in_filename: str,
+):
+    if in_filename is None or in_filename == "":
+        return "", build_html_output(
+            "Please first upload a file and then click "
+            'the button "submit for recognition"',
+            "result_item_error",
+        )
+    logging.info(f"Processing uploaded file: {in_filename}")
+    try:
+        return process(
+            in_filename=in_filename,
+            language=language,
+            repo_id=repo_id,
+            decoding_method=decoding_method,
+            num_active_paths=num_active_paths,
+        )
+    except Exception as e:
+        logging.info(str(e))
+        return "", build_html_output(str(e), "result_item_error")
+def process_microphone(
+    language: str,
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+    in_filename: str,
+):
+    if in_filename is None or in_filename == "":
+        return "", build_html_output(
+            "Please first click 'Record from microphone', speak, "
+            "click 'Stop recording', and then "
+            "click the button 'submit for recognition'",
+            "result_item_error",
+        )
+    logging.info(f"Processing microphone: {in_filename}")
+    try:
+        return process(
+            in_filename=in_filename,
+            language=language,
+            repo_id=repo_id,
+            decoding_method=decoding_method,
+            num_active_paths=num_active_paths,
+        )
+    except Exception as e:
+        logging.info(str(e))
+        return "", build_html_output(str(e), "result_item_error")
+@torch.no_grad()
+def process(
+    language: str,
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+    in_filename: str,
+):
+    logging.info(f"language: {language}")
+    logging.info(f"repo_id: {repo_id}")
+    logging.info(f"decoding_method: {decoding_method}")
+    logging.info(f"num_active_paths: {num_active_paths}")
+    logging.info(f"in_filename: {in_filename}")
+    filename = convert_to_wav(in_filename)
+    now = datetime.now()
+    date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
+    logging.info(f"Started at {date_time}")
+    start = time.time()
+    recognizer = get_pretrained_model(
+        repo_id,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+    text = decode(recognizer, filename)
+    date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
+    end = time.time()
+    metadata = torchaudio.info(filename)
+    duration = metadata.num_frames / sample_rate
+    rtf = (end - start) / duration
+    logging.info(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")
+    info = f"""
+    Wave duration  : {duration: .3f} s <br/>
+    Processing time: {end - start: .3f} s <br/>
+    RTF: {end - start: .3f}/{duration: .3f} = {rtf:.3f} <br/>
+    """
+    if rtf > 1:
+        info += (
+            "<br/>We are loading the model for the first run. "
+            "Please run again to measure the real RTF.<br/>"
+        )
+    logging.info(info)
+    logging.info(f"\nrepo_id: {repo_id}\nhyp: {text}")
+    return text, build_html_output(info)
+# Markdown heading rendered first inside the gr.Blocks() page.
+title = "# Automatic Speech Recognition with Next-gen Kaldi"
+# Markdown text rendered last inside the gr.Blocks() page.
+description = """
+This space shows how to do automatic speech recognition with Next-gen Kaldi.
+Please visit
+<https://huggingface.co/spaces/k2-fsa/streaming-automatic-speech-recognition>
+for streaming speech recognition with **Next-gen Kaldi**.
+It is running on CPU within a docker container provided by Hugging Face.
+See more information by visiting the following links:
+- <https://github.com/k2-fsa/icefall>
+- <https://github.com/k2-fsa/sherpa>
+- <https://github.com/k2-fsa/k2>
+- <https://github.com/lhotse-speech/lhotse>
+If you want to deploy it locally, please see
+<https://k2-fsa.github.io/sherpa/>
+"""
+# css style is copied from
+# https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
+# The classes defined here are the ones emitted by build_html_output().
+css = """
+.result {display:flex;flex-direction:column}
+.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
+.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
+.result_item_error {background-color:#ff7070;color:white;align-self:start}
+"""
+def update_model_dropdown(language: str):
+    if language in language_to_models:
+        choices = language_to_models[language]
+        return gr.Dropdown.update(choices=choices, value=choices[0])
+    raise ValueError(f"Unsupported language: {language}")
+# Build the web UI. Everything created inside ``with demo:`` becomes part
+# of the page.
+demo = gr.Blocks(css=css)
+with demo:
+    gr.Markdown(title)
+    # Language selector; the first language is selected initially.
+    language_choices = list(language_to_models.keys())
+    language_radio = gr.Radio(
+        label="Language",
+        choices=language_choices,
+        value=language_choices[0],
+    )
+    # Model selector, initially filled with the models of the first language.
+    model_dropdown = gr.Dropdown(
+        choices=language_to_models[language_choices[0]],
+        label="Select a model",
+        value=language_to_models[language_choices[0]][0],
+    )
+    # Refresh the model list whenever another language is selected.
+    language_radio.change(
+        update_model_dropdown,
+        inputs=language_radio,
+        outputs=model_dropdown,
+    )
+    decoding_method_radio = gr.Radio(
+        label="Decoding method",
+        choices=["greedy_search", "modified_beam_search"],
+        value="greedy_search",
+    )
+    num_active_paths_slider = gr.Slider(
+        minimum=1,
+        value=4,
+        step=1,
+        label="Number of active paths for modified_beam_search",
+    )
+    # One tab per input source: uploaded file, microphone recording, URL.
+    with gr.Tabs():
+        with gr.TabItem("Upload from disk"):
+            uploaded_file = gr.Audio(
+                source="upload",  # Choose between "microphone", "upload"
+                type="filepath",
+                optional=False,
+                label="Upload from disk",
+            )
+            upload_button = gr.Button("Submit for recognition")
+            uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
+            uploaded_html_info = gr.HTML(label="Info")
+            # The input order matches the parameters of process_uploaded_file().
+            gr.Examples(
+                examples=examples,
+                inputs=[
+                    language_radio,
+                    model_dropdown,
+                    decoding_method_radio,
+                    num_active_paths_slider,
+                    uploaded_file,
+                ],
+                outputs=[uploaded_output, uploaded_html_info],
+                fn=process_uploaded_file,
+            )
+        with gr.TabItem("Record from microphone"):
+            microphone = gr.Audio(
+                source="microphone",  # Choose between "microphone", "upload"
+                type="filepath",
+                optional=False,
+                label="Record from microphone",
+            )
+            record_button = gr.Button("Submit for recognition")
+            recorded_output = gr.Textbox(label="Recognized speech from recordings")
+            recorded_html_info = gr.HTML(label="Info")
+            # Same example list as the upload tab, routed to process_microphone().
+            gr.Examples(
+                examples=examples,
+                inputs=[
+                    language_radio,
+                    model_dropdown,
+                    decoding_method_radio,
+                    num_active_paths_slider,
+                    microphone,
+                ],
+                outputs=[recorded_output, recorded_html_info],
+                fn=process_microphone,
+            )
+        with gr.TabItem("From URL"):
+            url_textbox = gr.Textbox(
+                    max_lines=1,
+                    placeholder="URL to an audio file",
+                    label="URL",
+                    interactive=True,
+            )
+            url_button = gr.Button("Submit for recognition")
+            url_output = gr.Textbox(label="Recognized speech from URL")
+            url_html_info = gr.HTML(label="Info")
+        # Connect each submit button to its handler. Every handler receives
+        # (language, repo_id, decoding_method, num_active_paths, file-or-url)
+        # and fills a textbox plus an HTML info area.
+        upload_button.click(
+            process_uploaded_file,
+            inputs=[
+                language_radio,
+                model_dropdown,
+                decoding_method_radio,
+                num_active_paths_slider,
+                uploaded_file,
+            ],
+            outputs=[uploaded_output, uploaded_html_info],
+        )
+        record_button.click(
+            process_microphone,
+            inputs=[
+                language_radio,
+                model_dropdown,
+                decoding_method_radio,
+                num_active_paths_slider,
+                microphone,
+            ],
+            outputs=[recorded_output, recorded_html_info],
+        )
+        url_button.click(
+            process_url,
+            inputs=[
+                language_radio,
+                model_dropdown,
+                decoding_method_radio,
+                num_active_paths_slider,
+                url_textbox,
+            ],
+            outputs=[url_output, url_html_info],
+        )
+    gr.Markdown(description)
+# Restrict PyTorch to a single intra-op thread and a single inter-op thread.
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+# Switch off the TorchScript profiling executor, profiling mode and
+# graph-executor optimization through private torch._C hooks.
+# NOTE(review): presumably done to avoid slow warm-up runs of the
+# torchscript models -- confirm.
+torch._C._jit_set_profiling_executor(False)
+torch._C._jit_set_profiling_mode(False)
+torch._C._set_graph_executor_optimize(False)
+if __name__ == "__main__":
+    # Log lines carry the time, the level and the source location.
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    demo.launch()

decode.py ADDED Viewed

	@@ -0,0 +1,121 @@

+# Copyright      2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# Copied from https://github.com/k2-fsa/sherpa/blob/master/sherpa/bin/conformer_rnnt/decode.py
+#
+# See LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import List
+import torch
+from sherpa import RnntConformerModel, greedy_search, modified_beam_search
+from torch.nn.utils.rnn import pad_sequence
+# log(1e-10); used below as the padding value when batching features.
+LOG_EPS = math.log(1e-10)
+@torch.no_grad()
+def run_model_and_do_greedy_search(
+    model: RnntConformerModel,
+    features: List[torch.Tensor],
+) -> List[List[int]]:
+    """Run RNN-T model with the given features and use greedy search
+    to decode the output of the model.
+    Args:
+      model:
+        The RNN-T model.
+      features:
+        A list of 2-D tensors. Each entry is of shape
+        (num_frames, feature_dim).
+    Returns:
+      Return a list-of-list containing the decoding token IDs.
+    """
+    features_length = torch.tensor(
+        [f.size(0) for f in features],
+        dtype=torch.int64,
+    )
+    features = pad_sequence(
+        features,
+        batch_first=True,
+        padding_value=LOG_EPS,
+    )
+    device = model.device
+    features = features.to(device)
+    features_length = features_length.to(device)
+    encoder_out, encoder_out_length = model.encoder(
+        features=features,
+        features_length=features_length,
+    )
+    hyp_tokens = greedy_search(
+        model=model,
+        encoder_out=encoder_out,
+        encoder_out_length=encoder_out_length.cpu(),
+    )
+    return hyp_tokens
+@torch.no_grad()
+def run_model_and_do_modified_beam_search(
+    model: RnntConformerModel,
+    features: List[torch.Tensor],
+    num_active_paths: int,
+) -> List[List[int]]:
+    """Run RNN-T model with the given features and use modified beam search
+    to decode the output of the model.
+    Args:
+      model:
+        The RNN-T model.
+      features:
+        A list of 2-D tensors. Each entry is of shape
+        (num_frames, feature_dim).
+      num_active_paths:
+        It specifies number of active paths for each utterance. Due to
+        merging paths with identical token sequences, the actual number
+        may be less than "num_active_paths".
+    Returns:
+      Return a list-of-list containing the decoding token IDs.
+    """
+    # Number of frames of every utterance before padding.
+    features_length = torch.tensor(
+        [f.size(0) for f in features],
+        dtype=torch.int64,
+    )
+    # Pad all utterances to the same length to form a single batch.
+    features = pad_sequence(
+        features,
+        batch_first=True,
+        padding_value=LOG_EPS,
+    )
+    device = model.device
+    features = features.to(device)
+    features_length = features_length.to(device)
+    encoder_out, encoder_out_length = model.encoder(
+        features=features,
+        features_length=features_length,
+    )
+    # The lengths are moved to the CPU before they are given to the search.
+    hyp_tokens = modified_beam_search(
+        model=model,
+        encoder_out=encoder_out,
+        encoder_out_length=encoder_out_length.cpu(),
+        num_active_paths=num_active_paths,
+    )
+    return hyp_tokens

giga-tokens.txt ADDED Viewed

	@@ -0,0 +1,500 @@

+<blk> 0
+<sos/eos> 1
+<unk> 2
+S 3
+T 4
+▁THE 5
+▁A 6
+E 7
+▁AND 8
+▁TO 9
+N 10
+D 11
+▁OF 12
+' 13
+ING 14
+▁I 15
+Y 16
+▁IN 17
+ED 18
+▁THAT 19
+▁ 20
+P 21
+R 22
+▁YOU 23
+M 24
+RE 25
+ER 26
+C 27
+O 28
+▁IT 29
+L 30
+A 31
+U 32
+G 33
+▁WE 34
+▁IS 35
+▁SO 36
+AL 37
+I 38
+▁S 39
+▁RE 40
+AR 41
+B 42
+▁FOR 43
+▁C 44
+▁BE 45
+LE 46
+F 47
+W 48
+▁E 49
+▁HE 50
+LL 51
+▁WAS 52
+LY 53
+OR 54
+IN 55
+▁F 56
+VE 57
+▁THIS 58
+TH 59
+K 60
+▁ON 61
+IT 62
+▁B 63
+▁WITH 64
+▁BUT 65
+EN 66
+CE 67
+RI 68
+▁DO 69
+UR 70
+▁HAVE 71
+▁DE 72
+▁ME 73
+▁T 74
+ENT 75
+CH 76
+▁THEY 77
+▁NOT 78
+ES 79
+V 80
+▁AS 81
+RA 82
+▁P 83
+ON 84
+TER 85
+▁ARE 86
+▁WHAT 87
+IC 88
+▁ST 89
+▁LIKE 90
+ATION 91
+▁OR 92
+▁CA 93
+▁AT 94
+H 95
+▁KNOW 96
+▁G 97
+AN 98
+▁CON 99
+IL 100
+ND 101
+RO 102
+▁HIS 103
+▁CAN 104
+▁ALL 105
+TE 106
+▁THERE 107
+▁SU 108
+▁MO 109
+▁MA 110
+LI 111
+▁ONE 112
+▁ABOUT 113
+LA 114
+▁CO 115
+- 116
+▁MY 117
+▁HAD 118
+CK 119
+NG 120
+▁NO 121
+MENT 122
+AD 123
+LO 124
+ME 125
+▁AN 126
+▁FROM 127
+NE 128
+▁IF 129
+VER 130
+▁JUST 131
+▁PRO 132
+ION 133
+▁PA 134
+▁WHO 135
+▁SE 136
+EL 137
+IR 138
+▁US 139
+▁UP 140
+▁YOUR 141
+CI 142
+RY 143
+▁GO 144
+▁SHE 145
+▁LE 146
+▁OUT 147
+▁PO 148
+▁HO 149
+ATE 150
+▁BO 151
+▁BY 152
+▁FA 153
+▁MI 154
+AS 155
+MP 156
+▁HER 157
+VI 158
+▁THINK 159
+▁SOME 160
+▁WHEN 161
+▁AH 162
+▁PEOPLE 163
+IG 164
+▁WA 165
+▁TE 166
+▁LA 167
+▁WERE 168
+▁LI 169
+▁WOULD 170
+▁SEE 171
+▁WHICH 172
+DE 173
+GE 174
+▁K 175
+IGHT 176
+▁HA 177
+▁OUR 178
+UN 179
+▁HOW 180
+▁GET 181
+IS 182
+UT 183
+Z 184
+CO 185
+ET 186
+UL 187
+IES 188
+IVE 189
+AT 190
+▁O 191
+▁DON 192
+LU 193
+▁TIME 194
+▁WILL 195
+▁MORE 196
+▁SP 197
+▁NOW 198
+RU 199
+▁THEIR 200
+▁UN 201
+ITY 202
+OL 203
+X 204
+TI 205
+US 206
+▁VERY 207
+TION 208
+▁FI 209
+▁SAY 210
+▁BECAUSE 211
+▁EX 212
+▁RO 213
+ERS 214
+IST 215
+▁DA 216
+TING 217
+▁EN 218
+OM 219
+▁BA 220
+▁BEEN 221
+▁LO 222
+▁UM 223
+AGE 224
+ABLE 225
+▁WO 226
+▁RA 227
+▁OTHER 228
+▁REALLY 229
+ENCE 230
+▁GOING 231
+▁HIM 232
+▁HAS 233
+▁THEM 234
+▁DIS 235
+▁WANT 236
+ID 237
+TA 238
+▁LOOK 239
+KE 240
+▁DID 241
+▁SA 242
+▁VI 243
+▁SAID 244
+▁RIGHT 245
+▁THESE 246
+▁WORK 247
+▁COM 248
+ALLY 249
+FF 250
+QU 251
+AC 252
+▁DR 253
+▁WAY 254
+▁INTO 255
+MO 256
+TED 257
+EST 258
+▁HERE 259
+OK 260
+▁COULD 261
+▁WELL 262
+MA 263
+▁PRE 264
+▁DI 265
+MAN 266
+▁COMP 267
+▁THEN 268
+IM 269
+▁PER 270
+▁NA 271
+▁WHERE 272
+▁TWO 273
+▁WI 274
+▁FE 275
+INE 276
+▁ANY 277
+TURE 278
+▁OVER 279
+BO 280
+ACH 281
+OW 282
+▁MAKE 283
+▁TRA 284
+HE 285
+UND 286
+▁EVEN 287
+ANCE 288
+▁YEAR 289
+HO 290
+AM 291
+▁CHA 292
+▁BACK 293
+VO 294
+ANT 295
+DI 296
+▁ALSO 297
+▁THOSE 298
+▁MAN 299
+CTION 300
+ICAL 301
+▁JO 302
+▁OP 303
+▁NEW 304
+▁MU 305
+▁HU 306
+▁KIND 307
+▁NE 308
+CA 309
+END 310
+TIC 311
+FUL 312
+▁YEAH 313
+SH 314
+▁APP 315
+▁THINGS 316
+SIDE 317
+▁GOOD 318
+ONE 319
+▁TAKE 320
+CU 321
+▁EVERY 322
+▁MEAN 323
+▁FIRST 324
+OP 325
+▁TH 326
+▁MUCH 327
+▁PART 328
+UGH 329
+▁COME 330
+J 331
+▁THAN 332
+▁EXP 333
+▁AGAIN 334
+▁LITTLE 335
+MB 336
+▁NEED 337
+▁TALK 338
+IF 339
+FOR 340
+▁SH 341
+ISH 342
+▁STA 343
+ATED 344
+▁GU 345
+▁LET 346
+IA 347
+▁MAR 348
+▁DOWN 349
+▁DAY 350
+▁GA 351
+▁SOMETHING 352
+▁BU 353
+DUC 354
+HA 355
+▁LOT 356
+▁RU 357
+▁THOUGH 358
+▁GREAT 359
+AIN 360
+▁THROUGH 361
+▁THING 362
+OUS 363
+▁PRI 364
+▁GOT 365
+▁SHOULD 366
+▁AFTER 367
+▁HEAR 368
+▁TA 369
+▁ONLY 370
+▁CHI 371
+IOUS 372
+▁SHA 373
+▁MOST 374
+▁ACTUALLY 375
+▁START 376
+LIC 377
+▁VA 378
+▁RI 379
+DAY 380
+IAN 381
+▁DOES 382
+ROW 383
+▁GRA 384
+ITION 385
+▁MANY 386
+▁BEFORE 387
+▁GIVE 388
+PORT 389
+QUI 390
+▁LIFE 391
+▁WORLD 392
+▁PI 393
+▁LONG 394
+▁THREE 395
+IZE 396
+NESS 397
+▁SHOW 398
+PH 399
+▁WHY 400
+▁QUESTION 401
+WARD 402
+▁THANK 403
+▁PH 404
+▁DIFFERENT 405
+▁OWN 406
+▁FEEL 407
+▁MIGHT 408
+▁HAPPEN 409
+▁MADE 410
+▁BRO 411
+IBLE 412
+▁HI 413
+▁STATE 414
+▁HAND 415
+▁NEVER 416
+▁PLACE 417
+▁LOVE 418
+▁DU 419
+▁POINT 420
+▁HELP 421
+▁COUNT 422
+▁STILL 423
+▁MR 424
+▁FIND 425
+▁PERSON 426
+▁CAME 427
+▁SAME 428
+▁LAST 429
+▁HIGH 430
+▁OLD 431
+▁UNDER 432
+▁FOUR 433
+▁AROUND 434
+▁SORT 435
+▁CHANGE 436
+▁YES 437
+SHIP 438
+▁ANOTHER 439
+ATIVE 440
+▁FOUND 441
+▁JA 442
+▁ALWAYS 443
+▁NEXT 444
+▁TURN 445
+▁JU 446
+▁SIX 447
+▁FACT 448
+▁INTEREST 449
+▁WORD 450
+▁THOUSAND 451
+▁HUNDRED 452
+▁NUMBER 453
+▁IDEA 454
+▁PLAN 455
+▁COURSE 456
+▁SCHOOL 457
+▁HOUSE 458
+▁TWENTY 459
+▁JE 460
+▁PLAY 461
+▁AWAY 462
+▁LEARN 463
+▁HARD 464
+▁WEEK 465
+▁BETTER 466
+▁WHILE 467
+▁FRIEND 468
+▁OKAY 469
+▁NINE 470
+▁UNDERSTAND 471
+▁KEEP 472
+▁GONNA 473
+▁SYSTEM 474
+▁AMERICA 475
+▁POWER 476
+▁IMPORTANT 477
+▁WITHOUT 478
+▁MAYBE 479
+▁SEVEN 480
+▁BETWEEN 481
+▁BUILD 482
+▁CERTAIN 483
+▁PROBLEM 484
+▁MONEY 485
+▁BELIEVE 486
+▁SECOND 487
+▁REASON 488
+▁TOGETHER 489
+▁PUBLIC 490
+▁ANYTHING 491
+▁SPEAK 492
+▁BUSINESS 493
+▁EVERYTHING 494
+▁CLOSE 495
+▁QUITE 496
+▁ANSWER 497
+▁ENOUGH 498
+Q 499

model.py ADDED Viewed

	@@ -0,0 +1,1001 @@

+# Copyright      2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from functools import lru_cache
+from typing import Union
+import torch
+import torchaudio
+from huggingface_hub import hf_hub_download
+# Copy the shared libraries shipped with k2 into sherpa's lib directory.
+# This runs at import time, before k2 and sherpa are imported below.
+# NOTE(review): presumably needed so that sherpa can find the k2 libraries;
+# the paths are hard-coded for /home/user and Python 3.8 -- confirm.
+os.system(
+    "cp -v /home/user/.local/lib/python3.8/site-packages/k2/lib/*.so /home/user/.local/lib/python3.8/site-packages/sherpa/lib/"
+)
+import k2  # noqa
+import sherpa
+import sherpa_onnx
+import numpy as np
+from typing import Tuple
+import wave
+# Sample rate in Hz; used for the fbank feature config and checked against
+# the audio loaded in decode_online_recognizer().
+sample_rate = 16000
+def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
+    """
+    Args:
+      wave_filename:
+        Path to a wave file. It should be single channel and each sample should
+        be 16-bit. Its sample rate does not need to be 16kHz.
+    Returns:
+      Return a tuple containing:
+       - A 1-D array of dtype np.float32 containing the samples, which are
+       normalized to the range [-1, 1].
+       - sample rate of the wave file
+    """
+    with wave.open(wave_filename) as f:
+        assert f.getnchannels() == 1, f.getnchannels()
+        assert f.getsampwidth() == 2, f.getsampwidth()  # it is in bytes
+        num_samples = f.getnframes()
+        samples = f.readframes(num_samples)
+        samples_int16 = np.frombuffer(samples, dtype=np.int16)
+        samples_float32 = samples_int16.astype(np.float32)
+        samples_float32 = samples_float32 / 32768
+        return samples_float32, f.getframerate()
+def decode_offline_recognizer(
+    recognizer: sherpa.OfflineRecognizer,
+    filename: str,
+) -> str:
+    s = recognizer.create_stream()
+    s.accept_wave_file(filename)
+    recognizer.decode_stream(s)
+    text = s.result.text.strip()
+    return text.lower()
+def decode_online_recognizer(
+    recognizer: sherpa.OnlineRecognizer,
+    filename: str,
+) -> str:
+    samples, actual_sample_rate = torchaudio.load(filename)
+    assert sample_rate == actual_sample_rate, (
+        sample_rate,
+        actual_sample_rate,
+    )
+    samples = samples[0].contiguous()
+    s = recognizer.create_stream()
+    tail_padding = torch.zeros(int(sample_rate * 0.3), dtype=torch.float32)
+    s.accept_waveform(sample_rate, samples)
+    s.accept_waveform(sample_rate, tail_padding)
+    s.input_finished()
+    while recognizer.is_ready(s):
+        recognizer.decode_stream(s)
+    text = recognizer.get_result(s).text
+    return text.strip().lower()
+def decode_offline_recognizer_sherpa_onnx(
+    recognizer: sherpa_onnx.OfflineRecognizer,
+    filename: str,
+) -> str:
+    s = recognizer.create_stream()
+    samples, sample_rate = read_wave(filename)
+    s.accept_waveform(sample_rate, samples)
+    recognizer.decode_stream(s)
+    return s.result.text.lower()
+def decode_online_recognizer_sherpa_onnx(
+    recognizer: sherpa_onnx.OnlineRecognizer,
+    filename: str,
+) -> str:
+    s = recognizer.create_stream()
+    samples, sample_rate = read_wave(filename)
+    s.accept_waveform(sample_rate, samples)
+    tail_paddings = np.zeros(int(0.3 * sample_rate), dtype=np.float32)
+    s.accept_waveform(sample_rate, tail_paddings)
+    s.input_finished()
+    while recognizer.is_ready(s):
+        recognizer.decode_stream(s)
+    return recognizer.get_result(s).lower()
+def decode(
+    recognizer: Union[
+        sherpa.OfflineRecognizer,
+        sherpa.OnlineRecognizer,
+        sherpa_onnx.OfflineRecognizer,
+        sherpa_onnx.OnlineRecognizer,
+    ],
+    filename: str,
+) -> str:
+    if isinstance(recognizer, sherpa.OfflineRecognizer):
+        return decode_offline_recognizer(recognizer, filename)
+    elif isinstance(recognizer, sherpa.OnlineRecognizer):
+        return decode_online_recognizer(recognizer, filename)
+    elif isinstance(recognizer, sherpa_onnx.OfflineRecognizer):
+        return decode_offline_recognizer_sherpa_onnx(recognizer, filename)
+    elif isinstance(recognizer, sherpa_onnx.OnlineRecognizer):
+        return decode_online_recognizer_sherpa_onnx(recognizer, filename)
+    else:
+        raise ValueError(f"Unknown recognizer type {type(recognizer)}")
+@lru_cache(maxsize=30)
+def get_pretrained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+) -> Union[sherpa.OfflineRecognizer, sherpa.OnlineRecognizer]:
+    if repo_id in chinese_models:
+        return chinese_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
+    elif repo_id in english_models:
+        return english_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
+    elif repo_id in chinese_english_mixed_models:
+        return chinese_english_mixed_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
+    elif repo_id in tibetan_models:
+        return tibetan_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
+    elif repo_id in arabic_models:
+        return arabic_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
+    elif repo_id in german_models:
+        return german_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
+    elif repo_id in french_models:
+        return french_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
+    elif repo_id in japanese_models:
+        return japanese_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
+    elif repo_id in russian_models:
+        return russian_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
+    else:
+        raise ValueError(f"Unsupported repo_id: {repo_id}")
+def _get_nn_model_filename(
+    repo_id: str,
+    filename: str,
+    subfolder: str = "exp",
+) -> str:
+    nn_model_filename = hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        subfolder=subfolder,
+    )
+    return nn_model_filename
+def _get_bpe_model_filename(
+    repo_id: str,
+    filename: str = "bpe.model",
+    subfolder: str = "data/lang_bpe_500",
+) -> str:
+    bpe_model_filename = hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        subfolder=subfolder,
+    )
+    return bpe_model_filename
+def _get_token_filename(
+    repo_id: str,
+    filename: str = "tokens.txt",
+    subfolder: str = "data/lang_char",
+) -> str:
+    token_filename = hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        subfolder=subfolder,
+    )
+    return token_filename
+@lru_cache(maxsize=10)
+def _get_aishell2_pretrained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+) -> sherpa.OfflineRecognizer:
+    assert repo_id in [
+        # context-size 1
+        "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",  # noqa
+        # context-size 2
+        "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",  # noqa
+    ], repo_id
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="cpu_jit.pt",
+    )
+    tokens = _get_token_filename(repo_id=repo_id)
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+    recognizer = sherpa.OfflineRecognizer(config)
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_russian_pre_trained_model(
+    repo_id: str, decoding_method: str, num_active_paths: int
+) -> sherpa_onnx.OfflineRecognizer:
+    assert repo_id in (
+        "alphacep/vosk-model-ru",
+        "alphacep/vosk-model-small-ru",
+    ), repo_id
+    if repo_id == "alphacep/vosk-model-ru":
+        model_dir = "am-onnx"
+    elif repo_id == "alphacep/vosk-model-small-ru":
+        model_dir = "am"
+    encoder_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="encoder.onnx",
+        subfolder=model_dir,
+    )
+    decoder_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="decoder.onnx",
+        subfolder=model_dir,
+    )
+    joiner_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="joiner.onnx",
+        subfolder=model_dir,
+    )
+    tokens = _get_token_filename(repo_id=repo_id, subfolder="lang")
+    recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+        tokens=tokens,
+        encoder=encoder_model,
+        decoder=decoder_model,
+        joiner=joiner_model,
+        num_threads=2,
+        sample_rate=16000,
+        feature_dim=80,
+        decoding_method=decoding_method,
+    )
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_whisper_model(
+    repo_id: str, decoding_method: str, num_active_paths: int
+) -> sherpa_onnx.OfflineRecognizer:
+    name = repo_id.split("-")[1]
+    assert name in ("tiny.en", "base.en", "small.en", "medium.en"), repo_id
+    full_repo_id = "csukuangfj/sherpa-onnx-whisper-" + name
+    encoder = _get_nn_model_filename(
+        repo_id=full_repo_id,
+        filename=f"{name}-encoder.int8.ort",
+        subfolder=".",
+    )
+    decoder = _get_nn_model_filename(
+        repo_id=full_repo_id,
+        filename=f"{name}-decoder.int8.ort",
+        subfolder=".",
+    )
+    tokens = _get_token_filename(
+        repo_id=full_repo_id, subfolder=".", filename=f"{name}-tokens.txt"
+    )
+    recognizer = sherpa_onnx.OfflineRecognizer.from_whisper(
+        encoder=encoder,
+        decoder=decoder,
+        tokens=tokens,
+        num_threads=2,
+    )
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_gigaspeech_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+) -> sherpa.OfflineRecognizer:
+    assert repo_id in [
+        "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
+    ], repo_id
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="cpu_jit-iter-3488000-avg-20.pt",
+    )
+    tokens = "./giga-tokens.txt"
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+    recognizer = sherpa.OfflineRecognizer(config)
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_english_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+) -> sherpa.OfflineRecognizer:
+    assert repo_id in [
+        "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02",  # noqa
+        "yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04",  # noqa
+        "yfyeung/icefall-asr-finetune-mux-pruned_transducer_stateless7-2023-05-19",  # noqa
+        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",  # noqa
+        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11",  # noqa
+        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14",  # noqa
+        "Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16",  # noqa
+        "Zengwei/icefall-asr-librispeech-zipformer-2023-05-15",  # noqa
+        "Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16",  # noqa
+        "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+        "pkufool/icefall_asr_librispeech_conformer_ctc",
+        "WayneWiser/icefall-asr-librispeech-conformer-ctc2-jit-bpe-500-2022-07-21",
+    ], repo_id
+    filename = "cpu_jit.pt"
+    if (
+        repo_id
+        == "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11"
+    ):
+        filename = "cpu_jit-torch-1.10.0.pt"
+    if (
+        repo_id
+        == "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02"
+    ):
+        filename = "cpu_jit-torch-1.10.pt"
+    if (
+        repo_id
+        == "yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04"
+    ):
+        filename = "cpu_jit-epoch-30-avg-4.pt"
+    if (
+        repo_id
+        == "yfyeung/icefall-asr-finetune-mux-pruned_transducer_stateless7-2023-05-19"
+    ):
+        filename = "cpu_jit-epoch-20-avg-5.pt"
+    if repo_id in (
+        "Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16",
+        "Zengwei/icefall-asr-librispeech-zipformer-2023-05-15",
+        "Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16",
+    ):
+        filename = "jit_script.pt"
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename=filename,
+    )
+    subfolder = "data/lang_bpe_500"
+    if repo_id in (
+        "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+        "pkufool/icefall_asr_librispeech_conformer_ctc",
+    ):
+        subfolder = "data/lang_bpe"
+    tokens = _get_token_filename(repo_id=repo_id, subfolder=subfolder)
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+    recognizer = sherpa.OfflineRecognizer(config)
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_wenetspeech_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
+    assert repo_id in [
+        "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
+    ], repo_id
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="cpu_jit_epoch_10_avg_2_torch_1.7.1.pt",
+    )
+    tokens = _get_token_filename(repo_id=repo_id)
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+    recognizer = sherpa.OfflineRecognizer(config)
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_chinese_english_mixed_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
+    assert repo_id in [
+        "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5",
+        "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
+    ], repo_id
+    if repo_id == "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5":
+        filename = "cpu_jit.pt"
+        subfolder = "data/lang_char"
+    elif repo_id == "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh":
+        filename = "cpu_jit-epoch-11-avg-1.pt"
+        subfolder = "data/lang_char_bpe"
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename=filename,
+    )
+    tokens = _get_token_filename(repo_id=repo_id, subfolder=subfolder)
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+    recognizer = sherpa.OfflineRecognizer(config)
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_alimeeting_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
+    assert repo_id in [
+        "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
+        "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
+    ], repo_id
+    if repo_id == "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7":
+        filename = "cpu_jit.pt"
+    elif repo_id == "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2":
+        filename = "cpu_jit_torch_1.7.1.pt"
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename=filename,
+    )
+    tokens = _get_token_filename(repo_id=repo_id)
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+    recognizer = sherpa.OfflineRecognizer(config)
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_wenet_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
+    assert repo_id in [
+        "csukuangfj/wenet-chinese-model",
+        "csukuangfj/wenet-english-model",
+    ], repo_id
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="final.zip",
+        subfolder=".",
+    )
+    tokens = _get_token_filename(
+        repo_id=repo_id,
+        filename="units.txt",
+        subfolder=".",
+    )
+    feat_config = sherpa.FeatureConfig(normalize_samples=False)
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+    recognizer = sherpa.OfflineRecognizer(config)
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_aidatatang_200zh_pretrained_mode(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
+    assert repo_id in [
+        "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
+    ], repo_id
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="cpu_jit_torch.1.7.1.pt",
+    )
+    tokens = _get_token_filename(repo_id=repo_id)
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+    recognizer = sherpa.OfflineRecognizer(config)
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_tibetan_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
+    assert repo_id in [
+        "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
+        "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29",
+    ], repo_id
+    filename = "cpu_jit.pt"
+    if (
+        repo_id
+        == "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29"
+    ):
+        filename = "cpu_jit-epoch-28-avg-23-torch-1.10.0.pt"
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename=filename,
+    )
+    tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500")
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+    recognizer = sherpa.OfflineRecognizer(config)
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_arabic_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
+    assert repo_id in [
+        "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
+    ], repo_id
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="cpu_jit.pt",
+    )
+    tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_5000")
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+    recognizer = sherpa.OfflineRecognizer(config)
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_german_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
+    assert repo_id in [
+        "csukuangfj/wav2vec2.0-torchaudio",
+    ], repo_id
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="voxpopuli_asr_base_10k_de.pt",
+        subfolder=".",
+    )
+    tokens = _get_token_filename(
+        repo_id=repo_id,
+        filename="tokens-de.txt",
+        subfolder=".",
+    )
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+    recognizer = sherpa.OfflineRecognizer(config)
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_french_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
+    assert repo_id in [
+        "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14",
+    ], repo_id
+    encoder_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="encoder-epoch-29-avg-9-with-averaged-model.onnx",
+        subfolder=".",
+    )
+    decoder_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="decoder-epoch-29-avg-9-with-averaged-model.onnx",
+        subfolder=".",
+    )
+    joiner_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="joiner-epoch-29-avg-9-with-averaged-model.onnx",
+        subfolder=".",
+    )
+    tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+    recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
+        tokens=tokens,
+        encoder=encoder_model,
+        decoder=decoder_model,
+        joiner=joiner_model,
+        num_threads=2,
+        sample_rate=16000,
+        feature_dim=80,
+        decoding_method=decoding_method,
+        max_active_paths=num_active_paths,
+    )
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_japanese_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+) -> sherpa.OnlineRecognizer:
+    repo_id, kind = repo_id.rsplit("-", maxsplit=1)
+    assert repo_id in [
+        "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208"
+    ], repo_id
+    assert kind in ("fluent", "disfluent"), kind
+    encoder_model = _get_nn_model_filename(
+        repo_id=repo_id, filename="encoder_jit_trace.pt", subfolder=f"exp_{kind}"
+    )
+    decoder_model = _get_nn_model_filename(
+        repo_id=repo_id, filename="decoder_jit_trace.pt", subfolder=f"exp_{kind}"
+    )
+    joiner_model = _get_nn_model_filename(
+        repo_id=repo_id, filename="joiner_jit_trace.pt", subfolder=f"exp_{kind}"
+    )
+    tokens = _get_token_filename(repo_id=repo_id)
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+    config = sherpa.OnlineRecognizerConfig(
+        nn_model="",
+        encoder_model=encoder_model,
+        decoder_model=decoder_model,
+        joiner_model=joiner_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+        chunk_size=32,
+    )
+    recognizer = sherpa.OnlineRecognizer(config)
+    return recognizer
+@lru_cache(maxsize=10)
+def _get_paraformer_zh_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+) -> sherpa_onnx.OfflineRecognizer:
+    assert repo_id in [
+        "csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28",
+    ], repo_id
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename="model.onnx",
+        subfolder=".",
+    )
+    tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+    recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
+        paraformer=nn_model,
+        tokens=tokens,
+        num_threads=2,
+        sample_rate=sample_rate,
+        feature_dim=80,
+        decoding_method="greedy_search",
+        debug=False,
+    )
+    return recognizer
+chinese_models = {  # Chinese: model id -> factory function that builds its recognizer
+    "csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28": _get_paraformer_zh_pre_trained_model,
+    "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2": _get_wenetspeech_pre_trained_model,  # noqa
+    "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7": _get_alimeeting_pre_trained_model,
+    "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12": _get_aishell2_pretrained_model,  # noqa
+    "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12": _get_aishell2_pretrained_model,  # noqa
+    "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2": _get_aidatatang_200zh_pretrained_mode,  # noqa
+    "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2": _get_alimeeting_pre_trained_model,  # noqa
+    "csukuangfj/wenet-chinese-model": _get_wenet_model,
+    #  "csukuangfj/icefall-asr-wenetspeech-lstm-transducer-stateless-2022-10-14": _get_lstm_transducer_model,
+}
+english_models = {  # English; the "whisper-*" keys are short ids that _get_whisper_model expands to a full repo id
+    "whisper-tiny.en": _get_whisper_model,
+    "whisper-base.en": _get_whisper_model,
+    "whisper-small.en": _get_whisper_model,
+    #  "whisper-medium.en": _get_whisper_model,
+    "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2": _get_gigaspeech_pre_trained_model,  # noqa
+    "yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04": _get_english_model,  # noqa
+    "yfyeung/icefall-asr-finetune-mux-pruned_transducer_stateless7-2023-05-19": _get_english_model,  # noqa
+    "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": _get_english_model,  # noqa
+    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14": _get_english_model,  # noqa
+    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": _get_english_model,  # noqa
+    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13": _get_english_model,  # noqa
+    "Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16": _get_english_model,  # noqa
+    "Zengwei/icefall-asr-librispeech-zipformer-2023-05-15": _get_english_model,  # noqa
+    "Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16": _get_english_model,  # noqa
+    "videodanchik/icefall-asr-tedlium3-conformer-ctc2": _get_english_model,
+    "pkufool/icefall_asr_librispeech_conformer_ctc": _get_english_model,
+    "WayneWiser/icefall-asr-librispeech-conformer-ctc2-jit-bpe-500-2022-07-21": _get_english_model,
+    "csukuangfj/wenet-english-model": _get_wenet_model,
+}
+chinese_english_mixed_models = {  # code-switching Chinese+English: model id -> factory
+    "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh": _get_chinese_english_mixed_model,
+    "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5": _get_chinese_english_mixed_model,  # noqa
+}
+tibetan_models = {  # Tibetan: model id -> factory
+    "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02": _get_tibetan_pre_trained_model,  # noqa
+    "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29": _get_tibetan_pre_trained_model,  # noqa
+}
+arabic_models = {  # Arabic: model id -> factory
+    "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06": _get_arabic_pre_trained_model,  # noqa
+}
+german_models = {  # German: model id -> factory
+    "csukuangfj/wav2vec2.0-torchaudio": _get_german_pre_trained_model,
+}
+french_models = {  # French: model id -> factory (this one returns an online recognizer)
+    "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": _get_french_pre_trained_model,
+}
+japanese_models = {  # keys end in -fluent / -disfluent, which the factory splits off; not merged into all_models below
+    "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-fluent": _get_japanese_pre_trained_model,
+    "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-disfluent": _get_japanese_pre_trained_model,
+}
+russian_models = {  # Russian: model id -> factory (the factory is defined outside this part of the file)
+    "alphacep/vosk-model-ru": _get_russian_pre_trained_model,
+    "alphacep/vosk-model-small-ru": _get_russian_pre_trained_model,
+}
+all_models = {
+    **chinese_models,
+    **english_models,
+    **chinese_english_mixed_models,
+    #  **japanese_models,
+    **tibetan_models,
+    **arabic_models,
+    **german_models,
+    **french_models,
+    **russian_models,
+}
+language_to_models = {
+    "Chinese": list(chinese_models.keys()),
+    "English": list(english_models.keys()),
+    "Chinese+English": list(chinese_english_mixed_models.keys()),
+    #  "Japanese": list(japanese_models.keys()),
+    "Tibetan": list(tibetan_models.keys()),
+    "Arabic": list(arabic_models.keys()),
+    "German": list(german_models.keys()),
+    "French": list(french_models.keys()),
+    "Russian": list(russian_models.keys()),
+}

requirements (1).txt ADDED Viewed

	@@ -0,0 +1,12 @@

+https://download.pytorch.org/whl/cpu/torch-1.13.1%2Bcpu-cp38-cp38-linux_x86_64.whl
+https://download.pytorch.org/whl/cpu/torchaudio-0.13.1%2Bcpu-cp38-cp38-linux_x86_64.whl
+https://huggingface.co/csukuangfj/wheels/resolve/main/2023-01-30/k2-1.23.4.dev20230130%2Bcpu.torch1.13.1-cp38-cp38-linux_x86_64.whl
+https://huggingface.co/csukuangfj/wheels/resolve/main/2023-01-30/k2_sherpa-1.1-cp38-cp38-linux_x86_64.whl
+https://huggingface.co/csukuangfj/wheels/resolve/main/2023-01-30/kaldifeat-1.22-cp38-cp38-linux_x86_64.whl
+sentencepiece>=0.1.96
+numpy
+huggingface_hub
+sherpa-onnx>=1.7.0