HoneyTian committed on
Commit
302c392
1 Parent(s): d49cecc
toolbox/k2_sherpa/examples.py CHANGED
@@ -43,6 +43,30 @@ examples = [
43
  "Yes",
44
  "./data/test_wavs/cantonese/1.wav",
45
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  [
47
  "German",
48
  "csukuangfj/wav2vec2.0-torchaudio",
@@ -84,11 +108,43 @@ examples = [
84
  "./data/test_wavs/french/common_voice_fr_27024649.wav",
85
  ],
86
  [
87
- "Tibetan",
88
- "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
89
  "greedy_search",
90
  4,
91
  "No",
92
- "./data/test_wavs/tibetan/a_0_cacm-A70_31117.wav",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  ],
94
  ]
 
43
  "Yes",
44
  "./data/test_wavs/cantonese/1.wav",
45
  ],
46
+ [
47
+ "Tibetan",
48
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
49
+ "greedy_search",
50
+ 4,
51
+ "No",
52
+ "./data/test_wavs/tibetan/a_0_cacm-A70_31117.wav",
53
+ ],
54
+ [
55
+ "Tibetan",
56
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
57
+ "greedy_search",
58
+ 4,
59
+ "No",
60
+ "./data/test_wavs/tibetan/a_0_cacm-A70_31116.wav",
61
+ ],
62
+ [
63
+ "Tibetan",
64
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
65
+ "greedy_search",
66
+ 4,
67
+ "No",
68
+ "./data/test_wavs/tibetan/a_0_cacm-A70_31118.wav",
69
+ ],
70
  [
71
  "German",
72
  "csukuangfj/wav2vec2.0-torchaudio",
 
108
  "./data/test_wavs/french/common_voice_fr_27024649.wav",
109
  ],
110
  [
111
+ "Russian",
112
+ "alphacep/vosk-model-ru",
113
  "greedy_search",
114
  4,
115
  "No",
116
+ "./data/test_wavs/russian/russian-i-love-you.wav",
117
+ ],
118
+ [
119
+ "Russian",
120
+ "alphacep/vosk-model-ru",
121
+ "greedy_search",
122
+ 4,
123
+ "No",
124
+ "./data/test_wavs/russian/test.wav",
125
+ ],
126
+ [
127
+ "Arabic",
128
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
129
+ "greedy_search",
130
+ 4,
131
+ "No",
132
+ "./data/test_wavs/arabic/a.wav",
133
+ ],
134
+ [
135
+ "Arabic",
136
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
137
+ "greedy_search",
138
+ 4,
139
+ "No",
140
+ "./data/test_wavs/arabic/b.wav",
141
+ ],
142
+ [
143
+ "Arabic",
144
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
145
+ "greedy_search",
146
+ 4,
147
+ "No",
148
+ "./data/test_wavs/arabic/c.wav",
149
  ],
150
  ]
toolbox/k2_sherpa/nn_models.py CHANGED
@@ -206,6 +206,7 @@ model_map = {
206
  "nn_model_file_sub_folder": "exp",
207
  "tokens_file": "./giga-tokens.txt",
208
  "tokens_file_sub_folder": ".",
 
209
  "loader": "load_sherpa_offline_recognizer",
210
  },
211
  {
@@ -214,6 +215,7 @@ model_map = {
214
  "nn_model_file_sub_folder": "exp",
215
  "tokens_file": "tokens.txt",
216
  "tokens_file_sub_folder": "data/lang_bpe_500",
 
217
  "loader": "load_sherpa_offline_recognizer",
218
  },
219
  {
@@ -222,6 +224,7 @@ model_map = {
222
  "nn_model_file_sub_folder": "exp",
223
  "tokens_file": "tokens.txt",
224
  "tokens_file_sub_folder": "data/lang_bpe_500",
 
225
  "loader": "load_sherpa_offline_recognizer",
226
  },
227
  {
@@ -230,6 +233,7 @@ model_map = {
230
  "nn_model_file_sub_folder": "exp",
231
  "tokens_file": "tokens.txt",
232
  "tokens_file_sub_folder": "data/lang_bpe_500",
 
233
  "loader": "load_sherpa_offline_recognizer",
234
  },
235
  {
@@ -238,6 +242,7 @@ model_map = {
238
  "nn_model_file_sub_folder": "exp",
239
  "tokens_file": "tokens.txt",
240
  "tokens_file_sub_folder": "data/lang_bpe_500",
 
241
  "loader": "load_sherpa_offline_recognizer",
242
  },
243
  {
@@ -246,6 +251,7 @@ model_map = {
246
  "nn_model_file_sub_folder": "exp",
247
  "tokens_file": "tokens.txt",
248
  "tokens_file_sub_folder": "data/lang_bpe_500",
 
249
  "loader": "load_sherpa_offline_recognizer",
250
  },
251
  {
@@ -254,6 +260,7 @@ model_map = {
254
  "nn_model_file_sub_folder": "exp",
255
  "tokens_file": "tokens.txt",
256
  "tokens_file_sub_folder": "data/lang_bpe_500",
 
257
  "loader": "load_sherpa_offline_recognizer",
258
  },
259
  {
@@ -270,6 +277,7 @@ model_map = {
270
  "nn_model_file_sub_folder": "exp",
271
  "tokens_file": "tokens.txt",
272
  "tokens_file_sub_folder": "data/lang_bpe_500",
 
273
  "loader": "load_sherpa_offline_recognizer",
274
  },
275
  {
@@ -278,6 +286,7 @@ model_map = {
278
  "nn_model_file_sub_folder": "exp",
279
  "tokens_file": "tokens.txt",
280
  "tokens_file_sub_folder": "data/lang_bpe_500",
 
281
  "loader": "load_sherpa_offline_recognizer",
282
  },
283
  {
@@ -286,6 +295,7 @@ model_map = {
286
  "nn_model_file_sub_folder": "exp",
287
  "tokens_file": "tokens.txt",
288
  "tokens_file_sub_folder": "data/lang_bpe_500",
 
289
  "loader": "load_sherpa_offline_recognizer",
290
  },
291
  {
@@ -294,6 +304,7 @@ model_map = {
294
  "nn_model_file_sub_folder": "exp",
295
  "tokens_file": "tokens.txt",
296
  "tokens_file_sub_folder": "data/lang_bpe",
 
297
  "loader": "load_sherpa_offline_recognizer",
298
  },
299
  {
@@ -302,6 +313,7 @@ model_map = {
302
  "nn_model_file_sub_folder": "exp",
303
  "tokens_file": "tokens.txt",
304
  "tokens_file_sub_folder": "data/lang_bpe",
 
305
  "loader": "load_sherpa_offline_recognizer",
306
  },
307
  {
@@ -310,6 +322,7 @@ model_map = {
310
  "nn_model_file_sub_folder": "exp",
311
  "tokens_file": "tokens.txt",
312
  "tokens_file_sub_folder": "data/lang_bpe_500",
 
313
  "loader": "load_sherpa_offline_recognizer",
314
  },
315
  {
@@ -318,6 +331,7 @@ model_map = {
318
  "nn_model_file_sub_folder": ".",
319
  "tokens_file": "units.txt",
320
  "tokens_file_sub_folder": ".",
 
321
  "loader": "load_sherpa_offline_recognizer",
322
  },
323
  ],
@@ -348,6 +362,7 @@ model_map = {
348
  "nn_model_file_sub_folder": "exp",
349
  "tokens_file": "tokens.txt",
350
  "tokens_file_sub_folder": "data/lang_char_bpe",
 
351
  "loader": "load_sherpa_offline_recognizer",
352
  },
353
  {
@@ -356,6 +371,7 @@ model_map = {
356
  "nn_model_file_sub_folder": "exp",
357
  "tokens_file": "tokens.txt",
358
  "tokens_file_sub_folder": "data/lang_char",
 
359
  "loader": "load_sherpa_offline_recognizer",
360
  },
361
  ],
@@ -393,34 +409,34 @@ model_map = {
393
  "loader": "load_sherpa_onnx_offline_recognizer_from_transducer",
394
  },
395
  ],
396
- "Japanese": [
397
- {
398
- "repo_id": "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-fluent",
399
- "encoder_model_file": "encoder_jit_trace.pt",
400
- "encoder_model_file_sub_folder": "exp_fluent",
401
- "decoder_model_file": "decoder_jit_trace.pt",
402
- "decoder_model_file_sub_folder": "exp_fluent",
403
- "joiner_model_file": "joiner_jit_trace.pt",
404
- "joiner_model_file_sub_folder": "exp_fluent",
405
- "tokens_file": "tokens.txt",
406
- "tokens_file_sub_folder": "data/lang_char",
407
- "normalize_samples": True,
408
- "loader": "load_sherpa_online_recognizer",
409
- },
410
- {
411
- "repo_id": "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-disfluent",
412
- "encoder_model_file": "encoder_jit_trace.pt",
413
- "encoder_model_file_sub_folder": "exp_disfluent",
414
- "decoder_model_file": "decoder_jit_trace.pt",
415
- "decoder_model_file_sub_folder": "exp_disfluent",
416
- "joiner_model_file": "joiner_jit_trace.pt",
417
- "joiner_model_file_sub_folder": "exp_disfluent",
418
- "tokens_file": "tokens.txt",
419
- "tokens_file_sub_folder": "data/lang_char",
420
- "normalize_samples": True,
421
- "loader": "load_sherpa_online_recognizer",
422
- },
423
- ],
424
  "German": [
425
  {
426
  "repo_id": "csukuangfj/wav2vec2.0-torchaudio",
@@ -428,8 +444,7 @@ model_map = {
428
  "nn_model_file_sub_folder": ".",
429
  "tokens_file": "tokens-de.txt",
430
  "tokens_file_sub_folder": ".",
431
- "normalize_samples": False,
432
- "loader": "load_sherpa_offline_recognizer",
433
  },
434
  ],
435
  "French": [
@@ -446,6 +461,42 @@ model_map = {
446
  "loader": "load_sherpa_onnx_online_recognizer_from_transducer",
447
  },
448
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  "Tibetan": [
450
  {
451
  "repo_id": "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
@@ -453,6 +504,7 @@ model_map = {
453
  "nn_model_file_sub_folder": "exp",
454
  "tokens_file": "tokens.txt",
455
  "tokens_file_sub_folder": "data/lang_bpe_500",
 
456
  "loader": "load_sherpa_offline_recognizer",
457
  },
458
  {
@@ -461,6 +513,7 @@ model_map = {
461
  "nn_model_file_sub_folder": "exp",
462
  "tokens_file": "tokens.txt",
463
  "tokens_file_sub_folder": "data/lang_bpe_500",
 
464
  "loader": "load_sherpa_offline_recognizer",
465
  },
466
  ],
@@ -551,6 +604,24 @@ def load_sherpa_offline_recognizer(nn_model_file: str,
551
  return recognizer
552
 
553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
  def load_sherpa_onnx_offline_recognizer_from_paraformer(nn_model_file: str,
555
  tokens_file: str,
556
  sample_rate: int = 16000,
@@ -730,6 +801,11 @@ def load_recognizer(local_model_dir: Path,
730
  num_active_paths=num_active_paths,
731
  **kwargs_
732
  )
 
 
 
 
 
733
  elif loader == "load_sherpa_onnx_offline_recognizer_from_paraformer":
734
  recognizer = load_sherpa_onnx_offline_recognizer_from_paraformer(
735
  decoding_method=decoding_method,
 
206
  "nn_model_file_sub_folder": "exp",
207
  "tokens_file": "./giga-tokens.txt",
208
  "tokens_file_sub_folder": ".",
209
+ "normalize_samples": True,
210
  "loader": "load_sherpa_offline_recognizer",
211
  },
212
  {
 
215
  "nn_model_file_sub_folder": "exp",
216
  "tokens_file": "tokens.txt",
217
  "tokens_file_sub_folder": "data/lang_bpe_500",
218
+ "normalize_samples": True,
219
  "loader": "load_sherpa_offline_recognizer",
220
  },
221
  {
 
224
  "nn_model_file_sub_folder": "exp",
225
  "tokens_file": "tokens.txt",
226
  "tokens_file_sub_folder": "data/lang_bpe_500",
227
+ "normalize_samples": True,
228
  "loader": "load_sherpa_offline_recognizer",
229
  },
230
  {
 
233
  "nn_model_file_sub_folder": "exp",
234
  "tokens_file": "tokens.txt",
235
  "tokens_file_sub_folder": "data/lang_bpe_500",
236
+ "normalize_samples": True,
237
  "loader": "load_sherpa_offline_recognizer",
238
  },
239
  {
 
242
  "nn_model_file_sub_folder": "exp",
243
  "tokens_file": "tokens.txt",
244
  "tokens_file_sub_folder": "data/lang_bpe_500",
245
+ "normalize_samples": True,
246
  "loader": "load_sherpa_offline_recognizer",
247
  },
248
  {
 
251
  "nn_model_file_sub_folder": "exp",
252
  "tokens_file": "tokens.txt",
253
  "tokens_file_sub_folder": "data/lang_bpe_500",
254
+ "normalize_samples": True,
255
  "loader": "load_sherpa_offline_recognizer",
256
  },
257
  {
 
260
  "nn_model_file_sub_folder": "exp",
261
  "tokens_file": "tokens.txt",
262
  "tokens_file_sub_folder": "data/lang_bpe_500",
263
+ "normalize_samples": True,
264
  "loader": "load_sherpa_offline_recognizer",
265
  },
266
  {
 
277
  "nn_model_file_sub_folder": "exp",
278
  "tokens_file": "tokens.txt",
279
  "tokens_file_sub_folder": "data/lang_bpe_500",
280
+ "normalize_samples": True,
281
  "loader": "load_sherpa_offline_recognizer",
282
  },
283
  {
 
286
  "nn_model_file_sub_folder": "exp",
287
  "tokens_file": "tokens.txt",
288
  "tokens_file_sub_folder": "data/lang_bpe_500",
289
+ "normalize_samples": True,
290
  "loader": "load_sherpa_offline_recognizer",
291
  },
292
  {
 
295
  "nn_model_file_sub_folder": "exp",
296
  "tokens_file": "tokens.txt",
297
  "tokens_file_sub_folder": "data/lang_bpe_500",
298
+ "normalize_samples": True,
299
  "loader": "load_sherpa_offline_recognizer",
300
  },
301
  {
 
304
  "nn_model_file_sub_folder": "exp",
305
  "tokens_file": "tokens.txt",
306
  "tokens_file_sub_folder": "data/lang_bpe",
307
+ "normalize_samples": True,
308
  "loader": "load_sherpa_offline_recognizer",
309
  },
310
  {
 
313
  "nn_model_file_sub_folder": "exp",
314
  "tokens_file": "tokens.txt",
315
  "tokens_file_sub_folder": "data/lang_bpe",
316
+ "normalize_samples": True,
317
  "loader": "load_sherpa_offline_recognizer",
318
  },
319
  {
 
322
  "nn_model_file_sub_folder": "exp",
323
  "tokens_file": "tokens.txt",
324
  "tokens_file_sub_folder": "data/lang_bpe_500",
325
+ "normalize_samples": True,
326
  "loader": "load_sherpa_offline_recognizer",
327
  },
328
  {
 
331
  "nn_model_file_sub_folder": ".",
332
  "tokens_file": "units.txt",
333
  "tokens_file_sub_folder": ".",
334
+ "normalize_samples": False,
335
  "loader": "load_sherpa_offline_recognizer",
336
  },
337
  ],
 
362
  "nn_model_file_sub_folder": "exp",
363
  "tokens_file": "tokens.txt",
364
  "tokens_file_sub_folder": "data/lang_char_bpe",
365
+ "normalize_samples": True,
366
  "loader": "load_sherpa_offline_recognizer",
367
  },
368
  {
 
371
  "nn_model_file_sub_folder": "exp",
372
  "tokens_file": "tokens.txt",
373
  "tokens_file_sub_folder": "data/lang_char",
374
+ "normalize_samples": True,
375
  "loader": "load_sherpa_offline_recognizer",
376
  },
377
  ],
 
409
  "loader": "load_sherpa_onnx_offline_recognizer_from_transducer",
410
  },
411
  ],
412
+ # "Japanese": [
413
+ # {
414
+ # "repo_id": "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-fluent",
415
+ # "encoder_model_file": "encoder_jit_trace.pt",
416
+ # "encoder_model_file_sub_folder": "exp_fluent",
417
+ # "decoder_model_file": "decoder_jit_trace.pt",
418
+ # "decoder_model_file_sub_folder": "exp_fluent",
419
+ # "joiner_model_file": "joiner_jit_trace.pt",
420
+ # "joiner_model_file_sub_folder": "exp_fluent",
421
+ # "tokens_file": "tokens.txt",
422
+ # "tokens_file_sub_folder": "data/lang_char",
423
+ # "normalize_samples": True,
424
+ # "loader": "load_sherpa_online_recognizer",
425
+ # },
426
+ # {
427
+ # "repo_id": "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-disfluent",
428
+ # "encoder_model_file": "encoder_jit_trace.pt",
429
+ # "encoder_model_file_sub_folder": "exp_disfluent",
430
+ # "decoder_model_file": "decoder_jit_trace.pt",
431
+ # "decoder_model_file_sub_folder": "exp_disfluent",
432
+ # "joiner_model_file": "joiner_jit_trace.pt",
433
+ # "joiner_model_file_sub_folder": "exp_disfluent",
434
+ # "tokens_file": "tokens.txt",
435
+ # "tokens_file_sub_folder": "data/lang_char",
436
+ # "normalize_samples": True,
437
+ # "loader": "load_sherpa_online_recognizer",
438
+ # },
439
+ # ],
440
  "German": [
441
  {
442
  "repo_id": "csukuangfj/wav2vec2.0-torchaudio",
 
444
  "nn_model_file_sub_folder": ".",
445
  "tokens_file": "tokens-de.txt",
446
  "tokens_file_sub_folder": ".",
447
+ "loader": "load_sherpa_offline_recognizer_without_feat_config",
 
448
  },
449
  ],
450
  "French": [
 
461
  "loader": "load_sherpa_onnx_online_recognizer_from_transducer",
462
  },
463
  ],
464
+ "Russian": [
465
+ {
466
+ "repo_id": "alphacep/vosk-model-ru",
467
+ "encoder_model_file": "encoder.onnx",
468
+ "encoder_model_file_sub_folder": "am-onnx",
469
+ "decoder_model_file": "decoder.onnx",
470
+ "decoder_model_file_sub_folder": "am-onnx",
471
+ "joiner_model_file": "joiner.onnx",
472
+ "joiner_model_file_sub_folder": "am-onnx",
473
+ "tokens_file": "tokens.txt",
474
+ "tokens_file_sub_folder": "lang",
475
+ "loader": "load_sherpa_onnx_offline_recognizer_from_transducer",
476
+ },
477
+ {
478
+ "repo_id": "alphacep/vosk-model-small-ru",
479
+ "encoder_model_file": "encoder.onnx",
480
+ "encoder_model_file_sub_folder": "am",
481
+ "decoder_model_file": "decoder.onnx",
482
+ "decoder_model_file_sub_folder": "am",
483
+ "joiner_model_file": "joiner.onnx",
484
+ "joiner_model_file_sub_folder": "am",
485
+ "tokens_file": "tokens.txt",
486
+ "tokens_file_sub_folder": "lang",
487
+ "loader": "load_sherpa_onnx_offline_recognizer_from_transducer",
488
+ },
489
+ ],
490
+ "Arabic": [
491
+ {
492
+ "repo_id": "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
493
+ "nn_model_file": "cpu_jit.pt",
494
+ "nn_model_file_sub_folder": "exp",
495
+ "tokens_file": "tokens.txt",
496
+ "tokens_file_sub_folder": "data/lang_bpe_5000",
497
+ "loader": "load_sherpa_offline_recognizer_without_feat_config",
498
+ },
499
+ ],
500
  "Tibetan": [
501
  {
502
  "repo_id": "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
 
504
  "nn_model_file_sub_folder": "exp",
505
  "tokens_file": "tokens.txt",
506
  "tokens_file_sub_folder": "data/lang_bpe_500",
507
+ "normalize_samples": True,
508
  "loader": "load_sherpa_offline_recognizer",
509
  },
510
  {
 
513
  "nn_model_file_sub_folder": "exp",
514
  "tokens_file": "tokens.txt",
515
  "tokens_file_sub_folder": "data/lang_bpe_500",
516
+ "normalize_samples": True,
517
  "loader": "load_sherpa_offline_recognizer",
518
  },
519
  ],
 
604
  return recognizer
605
 
606
 
607
+ def load_sherpa_offline_recognizer_without_feat_config(nn_model_file: str,
608
+ tokens_file: str,
609
+ num_active_paths: int = 2,
610
+ decoding_method: str = "greedy_search",
611
+ ):
612
+ config = sherpa.OfflineRecognizerConfig(
613
+ nn_model=nn_model_file,
614
+ tokens=tokens_file,
615
+ use_gpu=False,
616
+ decoding_method=decoding_method,
617
+ num_active_paths=num_active_paths,
618
+ )
619
+
620
+ recognizer = sherpa.OfflineRecognizer(config)
621
+
622
+ return recognizer
623
+
624
+
625
  def load_sherpa_onnx_offline_recognizer_from_paraformer(nn_model_file: str,
626
  tokens_file: str,
627
  sample_rate: int = 16000,
 
801
  num_active_paths=num_active_paths,
802
  **kwargs_
803
  )
804
+ elif loader == "load_sherpa_offline_recognizer_without_feat_config":
805
+ recognizer = load_sherpa_offline_recognizer_without_feat_config(
806
+ decoding_method=decoding_method,
807
+ **kwargs_
808
+ )
809
  elif loader == "load_sherpa_onnx_offline_recognizer_from_paraformer":
810
  recognizer = load_sherpa_onnx_offline_recognizer_from_paraformer(
811
  decoding_method=decoding_method,