sdelangen committed on
Commit
1576489
·
verified ·
1 Parent(s): fe63ce0

Upload 4 files

Browse files
Files changed (4) hide show
  1. hyperparams.yaml +189 -0
  2. model.ckpt +3 -0
  3. normalizer.ckpt +3 -0
  4. tokenizer.ckpt +3 -0
hyperparams.yaml ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ############################################################################
2
+ # Model: E2E ASR with transformer and transducer
3
+ # Encoder: Conformer
4
+ # Decoder: LSTM + greedy search (beam search is commented out below; no RNNLM is defined in this file)
5
+ # Tokens: BPE with unigram
6
+ # Losses: Transducer + CTC (optional) + CE (optional)
7
+ # Training: GigaSpeech
8
+ # Authors: Titouan Parcollet 2024
9
+ # ############################################################################
10
+
11
+ # Feature parameters
12
+ sample_rate: 16000
13
+ n_fft: 512
14
+ n_mels: 80
15
+ win_length: 32
16
+
17
+ # BPE parameters
18
+ token_type: unigram # ["unigram", "bpe", "char"]
19
+ character_coverage: 1.0
20
+
21
+ ####################### Model Parameters #######################################
22
+
23
+ # Transformer
24
+ d_model: 768
25
+ joint_dim: 512
26
+ nhead: 8
27
+ num_encoder_layers: 12
28
+ num_decoder_layers: 0
29
+ d_ffn: 2048
30
+ transformer_dropout: 0.1
31
+ activation: !name:torch.nn.GELU
32
+ output_neurons: 1024
33
+ dec_dim: 512
34
+ dec_emb_dropout: 0.2
35
+ dec_dropout: 0.1
36
+
37
+ # Decoding parameters
38
+ blank_index: 0
39
+ bos_index: 1
40
+ eos_index: 2
41
+ pad_index: 0
42
+ beam_size: 10
43
+ nbest: 1
44
+ # By default, {state,expand}_beam = 2.3, as mentioned in the paper:
45
+ # https://arxiv.org/abs/1904.02619
46
+ state_beam: 2.3
47
+ expand_beam: 2.3
48
+
49
+ normalize: !new:speechbrain.processing.features.InputNormalization
50
+ norm_type: global
51
+ update_until_epoch: 4
52
+
53
+ compute_features: !new:speechbrain.lobes.features.Fbank
54
+ sample_rate: !ref <sample_rate>
55
+ n_fft: !ref <n_fft>
56
+ n_mels: !ref <n_mels>
57
+ win_length: !ref <win_length>
58
+
59
+ ############################## Models ##########################################
60
+
61
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
62
+ input_shape: (8, 10, 80)
63
+ num_blocks: 2
64
+ num_layers_per_block: 1
65
+ out_channels: (64, 32)
66
+ kernel_sizes: (3, 3)
67
+ strides: (2, 2)
68
+ residuals: (False, False)
69
+
70
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
71
+ input_size: 640
72
+ tgt_vocab: !ref <output_neurons>
73
+ d_model: !ref <d_model>
74
+ nhead: !ref <nhead>
75
+ num_encoder_layers: !ref <num_encoder_layers>
76
+ num_decoder_layers: !ref <num_decoder_layers>
77
+ d_ffn: !ref <d_ffn>
78
+ dropout: !ref <transformer_dropout>
79
+ activation: !ref <activation>
80
+ encoder_module: conformer
81
+ attention_type: RelPosMHAXL
82
+ normalize_before: True
83
+ causal: False
84
+
85
+ # We must call an encoder wrapper so the decoder isn't run (we don't have any)
86
+ enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
87
+ transformer: !ref <Transformer>
88
+
89
+ # For MTL CTC over the encoder
90
+ proj_ctc: !new:speechbrain.nnet.linear.Linear
91
+ input_size: !ref <joint_dim>
92
+ n_neurons: !ref <output_neurons>
93
+
94
+ # Define some projection layers to make sure that enc and dec
95
+ # output dim are the same before joining
96
+ proj_enc: !new:speechbrain.nnet.linear.Linear
97
+ input_size: !ref <d_model>
98
+ n_neurons: !ref <joint_dim>
99
+ bias: False
100
+
101
+ proj_dec: !new:speechbrain.nnet.linear.Linear
102
+ input_size: !ref <dec_dim>
103
+ n_neurons: !ref <joint_dim>
104
+ bias: False
105
+
106
+ emb: !new:speechbrain.nnet.embedding.Embedding
107
+ num_embeddings: !ref <output_neurons>
108
+ consider_as_one_hot: True
109
+ blank_id: !ref <blank_index>
110
+
111
+ dec: !new:speechbrain.nnet.RNN.LSTM
112
+ input_shape: [null, null, !ref <output_neurons> - 1]
113
+ hidden_size: !ref <dec_dim>
114
+ num_layers: 1
115
+ re_init: True
116
+
117
+ Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
118
+ joint: sum # joint [sum | concat]
119
+ nonlinearity: !ref <activation>
120
+
121
+ transducer_lin: !new:speechbrain.nnet.linear.Linear
122
+ input_size: !ref <joint_dim>
123
+ n_neurons: !ref <output_neurons>
124
+ bias: False
125
+
126
+ # for MTL
127
+ # update model if any HEAD module is added
128
+ modules:
129
+ CNN: !ref <CNN>
130
+ enc: !ref <enc>
131
+ emb: !ref <emb>
132
+ dec: !ref <dec>
133
+ Tjoint: !ref <Tjoint>
134
+ transducer_lin: !ref <transducer_lin>
135
+ normalize: !ref <normalize>
136
+ proj_ctc: !ref <proj_ctc>
137
+ proj_dec: !ref <proj_dec>
138
+ proj_enc: !ref <proj_enc>
139
+
140
+
141
+ # update model if any HEAD module is added
142
+ model: !new:torch.nn.ModuleList
143
+ - [!ref <CNN>, !ref <enc>, !ref <emb>, !ref <dec>, !ref <proj_enc>, !ref <proj_dec>, !ref <proj_ctc>, !ref <transducer_lin>]
144
+
145
+ ############################## Decoding & optimiser ############################
146
+
147
+ Greedysearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
148
+ decode_network_lst: [!ref <emb>, !ref <dec>, !ref <proj_dec>]
149
+ tjoint: !ref <Tjoint>
150
+ classifier_network: [!ref <transducer_lin>]
151
+ blank_id: !ref <blank_index>
152
+ beam_size: 1
153
+ nbest: 1
154
+
155
+ #Beamsearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
156
+ # decode_network_lst: [!ref <emb>, !ref <dec>, !ref <proj_dec>]
157
+ # tjoint: !ref <Tjoint>
158
+ # classifier_network: [!ref <transducer_lin>]
159
+ # blank_id: !ref <blank_index>
160
+ # beam_size: !ref <beam_size>
161
+ # nbest: !ref <nbest>
162
+ # state_beam: !ref <state_beam>
163
+ # expand_beam: !ref <expand_beam>
164
+
165
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
166
+
167
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
168
+ loadables:
169
+ model: !ref <model>
170
+ normalizer: !ref <normalize>
171
+ tokenizer: !ref <tokenizer>
172
+
173
+ make_tokenizer_streaming_context: !name:speechbrain.tokenizers.SentencePiece.SentencePieceDecoderStreamingContext
174
+ tokenizer_decode_streaming: !name:speechbrain.tokenizers.SentencePiece.spm_decode_preserve_leading_space
175
+
176
+ make_decoder_streaming_context: !name:speechbrain.decoders.transducer.TransducerGreedySearcherStreamingContext # default constructor
177
+ decoding_function: !name:speechbrain.decoders.transducer.TransducerBeamSearcher.transducer_greedy_decode_streaming
178
+ - !ref <Greedysearcher> # self
179
+
180
+ fea_streaming_extractor: !new:speechbrain.lobes.features.StreamingFeatureWrapper
181
+ module: !new:speechbrain.nnet.containers.LengthsCapableSequential
182
+ - !ref <compute_features>
183
+ - !ref <normalize>
184
+ - !ref <CNN>
185
+ # don't consider normalization as part of the input filter chain.
186
+ # normalization will operate at chunk level, which mismatches training
187
+ # somewhat, but does not appear to result in noticeable degradation.
188
+ properties: !apply:speechbrain.utils.filter_analysis.stack_filter_properties
189
+ - [!ref <compute_features>, !ref <CNN>]
model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbabf39e84aaf378006ced3db8d3d0128f38fb5e77aed2ff663a6ccfc1603edb
3
+ size 564108886
normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9635569f61529346ab582d03890dbcd426b840db410102cf7f69ca74418c6f48
3
+ size 2218
tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:210d07217677a925cde1900017fde1741d6b746862b249e5f93e2596e5571c3c
3
+ size 253722