Axelisme committed
Commit def3e88
Parent: ef8800d

Upload 7 files


ASR-API version 1.0.0

ASR_model/infer.py ADDED
@@ -0,0 +1,41 @@
+ from typing import List
+ import torch
+ import argparse
+ import shutil
+ import tempfile
+ from speechbrain.pretrained import EncoderDecoderASR
+
+
+ def asr_model_inference(model: EncoderDecoderASR, audios: List[str]) -> List[str]:
+     """
+     Transcribe each input audio file and return the list of transcriptions.
+     """
+     tmp_dir = tempfile.mkdtemp()
+     results = [process_audio(model, audio, tmp_dir) for audio in audios]
+     shutil.rmtree(tmp_dir)
+     return results
+
+ def process_audio(model: EncoderDecoderASR, audio: str, savedir: str) -> str:
+     """
+     Transcribe a single audio file and return the predicted text.
+     """
+     waveform = model.load_audio(audio, savedir=savedir)
+     # Fake a batch of size 1 with full relative length:
+     batch = waveform.unsqueeze(0)
+     rel_length = torch.tensor([1.0])
+     predicted_words, predicted_tokens = model.transcribe_batch(
+         batch, rel_length
+     )
+     return predicted_words[0]
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("-I", dest="audio_file", required=True)
+
+     args = parser.parse_args()
+
+     asr_model = EncoderDecoderASR.from_hparams(
+         source="./inference", hparams_file="hyperparams.yaml", savedir="inference", run_opts={"device": "cpu"})
+
+     print(asr_model_inference(asr_model, [args.audio_file]))
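
A minimal usage sketch of driving this module from Python rather than the CLI, assuming it runs from the ASR_model/ directory so that source="./inference" resolves and using the bundled test.wav; it is equivalent to running: python infer.py -I test.wav

    from infer import asr_model_inference
    from speechbrain.pretrained import EncoderDecoderASR

    # Build the pipeline described by inference/hyperparams.yaml (CPU here)
    asr_model = EncoderDecoderASR.from_hparams(
        source="./inference",
        hparams_file="hyperparams.yaml",
        savedir="inference",
        run_opts={"device": "cpu"},
    )

    # Transcribe one or more files in a single call
    transcripts = asr_model_inference(asr_model, ["test.wav"])
    print(transcripts[0])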
ASR_model/inference/asr.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5533d036f8c2922e4e0246d4543b1936f9b1d80df1e09f624e927f5609e8f75f
+ size 126714188
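
This and the other *.ckpt entries below are Git LFS pointers rather than the weights themselves; the actual asr.ckpt (about 127 MB) is downloaded when the repository is fetched with LFS enabled. A minimal sketch, assuming the file has already been fetched locally, of checking it against the oid and size recorded in the pointer:

    import hashlib
    import os

    path = "ASR_model/inference/asr.ckpt"
    expected_oid = "5533d036f8c2922e4e0246d4543b1936f9b1d80df1e09f624e927f5609e8f75f"
    expected_size = 126714188

    # Stream through SHA-256 so the 127 MB file is never fully in memory
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)

    assert os.path.getsize(path) == expected_size, "size mismatch"
    assert digest.hexdigest() == expected_oid, "sha256 mismatch"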
ASR_model/inference/hyperparams.yaml ADDED
@@ -0,0 +1,139 @@
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         lm: !ref <lm_model>
+         tokenizer: !ref <tokenizer>
+         normalizer: !ref <normalizer>
+         asr: !ref <asr_model>
+
+ # Feature parameters
+ sample_rate: 16000
+ n_fft: 400
+ n_mels: 80
+ hop_length: 20
+
+ compute_features: !new:speechbrain.lobes.features.Fbank
+     sample_rate: !ref <sample_rate>
+     n_fft: !ref <n_fft>
+     n_mels: !ref <n_mels>
+     hop_length: !ref <hop_length>
+
+ ####################### Model parameters ###########################
+ # Transformer
+ d_model: 256
+ nhead: 4
+ num_encoder_layers: 12
+ num_decoder_layers: 6
+ d_ffn: 2048
+ transformer_dropout: 0.1
+ activation: !name:torch.nn.GELU
+ output_neurons: 5000
+ vocab_size: 5000
+
+ # Outputs
+ blank_index: 0
+ label_smoothing: 0.1
+ pad_index: 0
+ bos_index: 1
+ eos_index: 2
+ unk_index: 0
+
+ # Decoding parameters
+ min_decode_ratio: 0.0
+ max_decode_ratio: 1.0
+ valid_search_interval: 10
+ valid_beam_size: 10
+ test_beam_size: 10
+ ctc_weight_decode: 0.3
+ lm_weight: 0.2
+
+ ############################## models ################################
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
+     input_shape: !!python/tuple [8, 10, 8]
+     num_blocks: 2
+     num_layers_per_block: 1
+     out_channels: !!python/tuple [256, 256]
+     kernel_sizes: !!python/tuple [3, 3]
+     strides: !!python/tuple [2, 2]
+     residuals: !!python/tuple [False, False]
+
+ Transformer:
+     !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
+     input_size: 5120
+     tgt_vocab: !ref <output_neurons>
+     d_model: !ref <d_model>
+     nhead: !ref <nhead>
+     num_encoder_layers: !ref <num_encoder_layers>
+     num_decoder_layers: !ref <num_decoder_layers>
+     d_ffn: !ref <d_ffn>
+     dropout: !ref <transformer_dropout>
+     activation: !ref <activation>
+     normalize_before: True
+
+ lm_model:
+     !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM # yamllint disable-line rule:line-length
+     vocab: !ref <output_neurons>
+     d_model: 576
+     nhead: 6
+     num_encoder_layers: 6
+     num_decoder_layers: 0
+     d_ffn: 1538
+     dropout: 0.2
+     activation: !name:torch.nn.GELU
+     normalize_before: False
+
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <d_model>
+     n_neurons: !ref <output_neurons>
+
+ seq_lin: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <d_model>
+     n_neurons: !ref <output_neurons>
+
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
+     input_shape: [null, null, !ref <n_mels>]
+     compute_features: !ref <compute_features>
+     normalize: !ref <normalizer>
+     cnn: !ref <CNN>
+     transformer_encoder: !ref <Tencoder>
+
+ asr_model: !new:torch.nn.ModuleList
+     - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+
+ decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
+     modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+     bos_index: !ref <bos_index>
+     eos_index: !ref <eos_index>
+     blank_index: !ref <blank_index>
+     min_decode_ratio: !ref <min_decode_ratio>
+     max_decode_ratio: !ref <max_decode_ratio>
+     beam_size: !ref <test_beam_size>
+     ctc_weight: !ref <ctc_weight_decode>
+     lm_weight: !ref <lm_weight>
+     lm_modules: !ref <lm_model>
+     temperature: 1.15
+     temperature_lm: 1.15
+     using_eos_threshold: False
+     length_normalization: True
+
+ Tencoder:
+     !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
+     transformer: !ref <Transformer>
+
+ normalizer: !new:speechbrain.processing.features.InputNormalization
+     norm_type: global
+     update_until_epoch: 4
+
+
+
+ modules:
+     normalizer: !ref <normalizer>
+     encoder: !ref <encoder>
+     decoder: !ref <decoder>
+ # define two optimizers here for two-stage training
+
+
+
+ log_softmax: !new:torch.nn.LogSoftmax
+     dim: -1
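
For orientation, a sketch of how this HyperPyYAML file is consumed: !new: tags instantiate classes, !ref tags resolve cross-references, and EncoderDecoderASR.from_hparams uses the modules and pretrainer entries to assemble the pipeline and load the checkpoints. A minimal example of loading it directly, assuming the hyperpyyaml package that ships with SpeechBrain:

    from hyperpyyaml import load_hyperpyyaml

    # Resolve !new:/!ref tags into live Python objects
    with open("ASR_model/inference/hyperparams.yaml") as f:
        hparams = load_hyperpyyaml(f)

    # `modules` holds the runnable pieces; `pretrainer.loadables` maps the
    # *.ckpt files in this directory onto them (lm, tokenizer, normalizer, asr)
    print(list(hparams["modules"].keys()))    # ['normalizer', 'encoder', 'decoder']
    print(type(hparams["decoder"]).__name__)  # S2STransformerBeamSearch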
ASR_model/inference/lm.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:44e9e1dbc2935debdd681293515d27a3bb93578e8f4e3b7e01ee1d87c47bb10c
+ size 104725990
ASR_model/inference/normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:99a037e5582c33311d233f5465436f79de0331230e5ec433b9534a928325de69
+ size 1783
ASR_model/inference/tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:448a66ac83788337506bff24c5be0f0ee97d59491af4f92e2341eca40a3b832c
+ size 288715
ASR_model/test.wav ADDED
Binary file (263 kB).
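
test.wav gives a quick end-to-end check of the upload. The pipeline expects 16 kHz input (sample_rate in hyperparams.yaml); EncoderDecoderASR.load_audio resamples on load, but the sample's properties can also be inspected directly. A minimal sketch, assuming torchaudio is installed:

    import torchaudio

    # Sample rate should be (or will be resampled to) the 16 kHz the model expects
    info = torchaudio.info("ASR_model/test.wav")
    print(info.sample_rate, info.num_channels, info.num_frames)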