poonehmousavi committed
Commit de44b22
1 Parent(s): 9f403ea

Upload 2 files

Files changed (2)
  1. config.json +3 -0
  2. hyperparams.yaml +140 -0
config.json ADDED
@@ -0,0 +1,3 @@
+ {
+     "speechbrain_interface": "EncoderDecoderASR"
+ }
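The "speechbrain_interface" key tells the SpeechBrain hub loader which pretrained wrapper to build around hyperparams.yaml; here it is EncoderDecoderASR. A minimal loading sketch, assuming SpeechBrain is installed; the repository id and audio path below are placeholders, not taken from this commit:

# Minimal inference sketch for the declared EncoderDecoderASR interface.
# The `source` repo id and the audio path are placeholders.
from speechbrain.pretrained import EncoderDecoderASR  # speechbrain.inference.ASR in SpeechBrain >= 1.0

asr_model = EncoderDecoderASR.from_hparams(
    source="poonehmousavi/<this-repo>",          # placeholder: the Hub id of this repository
    savedir="pretrained_models/asr-transducer",  # local cache directory
)
print(asr_model.transcribe_file("example.wav"))  # placeholder audio file

from_hparams fetches hyperparams.yaml, runs its pretrainer to load the checkpoints, and wires modules.encoder / modules.decoder (defined in the YAML below) into transcribe_file.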
hyperparams.yaml ADDED
@@ -0,0 +1,140 @@
+ # ################################
+ # Model: Transducer ASR
+ # Augmentation: SpecAugment
+ # Authors: Pooneh Mousavi 2023
+ # ################################
+ # Feature parameters (FBANKS etc)
+ sample_rate: 16000
+ n_fft: 400
+ n_mels: 80
+
+ # Model parameters
+ activation: !name:torch.nn.LeakyReLU
+ dropout: 0.15
+ cnn_blocks: 3
+ cnn_channels: (128, 200, 256)
+ inter_layer_pooling_size: (2, 2, 2)
+ cnn_kernelsize: (3, 3)
+ time_pooling_size: 4
+ rnn_class: !name:speechbrain.nnet.RNN.LSTM
+ rnn_layers: 5
+ rnn_neurons: 1024
+ rnn_bidirectional: True
+ dnn_blocks: 2
+ dnn_neurons: 1024
+ dec_neurons: 1024
+ joint_dim: 1024
+
+ # Outputs
+ output_neurons: 1000 # BPE size, index(blank/eos/bos) = 0
+ # transducer_beam_search : True
+ # Decoding parameters
+ # Be sure that the bos and eos index match with the BPEs ones
+ blank_index: 0
+ bos_index: 0
+ eos_index: 0
+
+ min_decode_ratio: 0.0
+ max_decode_ratio: 1.0
+ beam_size: 4
+ nbest: 1
+ # by default {state,expand}_beam = 2.3 as mentioned in the paper
+ # https://arxiv.org/abs/1904.02619
+ state_beam: 2.3
+ expand_beam: 2.3
+ transducer_beam_search: True
+
+
+ normalizer: !new:speechbrain.processing.features.InputNormalization
+     norm_type: global
+
+ compute_features: !new:speechbrain.lobes.features.Fbank
+     sample_rate: !ref <sample_rate>
+     n_fft: !ref <n_fft>
+     n_mels: !ref <n_mels>
+
+ enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
+     input_shape: [null, null, !ref <n_mels>]
+     activation: !ref <activation>
+     dropout: !ref <dropout>
+     cnn_blocks: !ref <cnn_blocks>
+     cnn_channels: !ref <cnn_channels>
+     cnn_kernelsize: !ref <cnn_kernelsize>
+     inter_layer_pooling_size: !ref <inter_layer_pooling_size>
+     time_pooling: True
+     using_2d_pooling: False
+     time_pooling_size: !ref <time_pooling_size>
+     rnn_class: !ref <rnn_class>
+     rnn_layers: !ref <rnn_layers>
+     rnn_neurons: !ref <rnn_neurons>
+     rnn_bidirectional: !ref <rnn_bidirectional>
+     rnn_re_init: True
+     dnn_blocks: !ref <dnn_blocks>
+     dnn_neurons: !ref <dnn_neurons>
+
+ enc_lin: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <dnn_neurons>
+     n_neurons: !ref <joint_dim>
+
+ emb: !new:speechbrain.nnet.embedding.Embedding
+     num_embeddings: !ref <output_neurons>
+     consider_as_one_hot: True
+     blank_id: !ref <blank_index>
+
+ dec: !new:speechbrain.nnet.RNN.GRU
+     input_shape: [null, null, !ref <output_neurons> - 1]
+     hidden_size: !ref <dec_neurons>
+     num_layers: 1
+     re_init: True
+
+ # For MTL with LM over the decoder
+ dec_lin: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <dec_neurons>
+     n_neurons: !ref <joint_dim>
+     bias: False
+
+ Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
+     joint: sum # joint [sum | concat]
+     nonlinearity: !ref <activation>
+
+ transducer_lin: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <joint_dim>
+     n_neurons: !ref <output_neurons>
+     bias: False
+
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
+     apply_log: True
+
+ asr_model: !new:torch.nn.ModuleList
+     - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <transducer_lin>]
+
+
+
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
+ # We compose the inference (encoder) pipeline.
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
+     input_shape: [null, null, !ref <n_mels>]
+     compute_features: !ref <compute_features>
+     normalize: !ref <normalizer>
+     model: !ref <enc>
+
+ decoder: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
+     decode_network_lst: [!ref <emb>, !ref <dec>]
+     tjoint: !ref <Tjoint>
+     classifier_network: [!ref <transducer_lin>]
+     blank_id: !ref <blank_index>
+     beam_size: !ref <beam_size>
+     nbest: !ref <nbest>
+     state_beam: !ref <state_beam>
+     expand_beam: !ref <expand_beam>
+
+ modules:
+     normalizer: !ref <normalizer>
+     encoder: !ref <encoder>
+     decoder: !ref <decoder>
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         normalizer: !ref <normalizer>
+         asr: !ref <asr_model>
+         tokenizer: !ref <tokenizer>
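The last blocks are what the EncoderDecoderASR wrapper consumes: `modules` exposes the normalizer, encoder, and decoder, while `pretrainer` declares which checkpoints (normalizer, asr, tokenizer) must be fetched and loaded into them. A rough sketch of that flow, assuming the checkpoint files sit next to hyperparams.yaml in a local copy of the repo; the local path and dummy audio below are placeholders:

# Sketch: load hyperparams.yaml directly and drive the encoder/decoder by hand.
# Assumes the repo (with its checkpoints) has been downloaded to `local_dir`;
# all paths are placeholders, not taken from this commit.
import torch
from hyperpyyaml import load_hyperpyyaml

local_dir = "path/to/local/copy"                 # placeholder
with open(f"{local_dir}/hyperparams.yaml") as fin:
    hparams = load_hyperpyyaml(fin)

# Resolve and load the checkpoints declared under `pretrainer`.
pretrainer = hparams["pretrainer"]
pretrainer.collect_files(default_source=local_dir)
pretrainer.load_collected()

wavs = torch.rand(1, 16000)                      # 1 s of dummy 16 kHz audio
wav_lens = torch.tensor([1.0])                   # relative lengths in the batch
encoded = hparams["encoder"](wavs, wav_lens)     # Fbank -> InputNormalization -> CRDNN
results = hparams["decoder"](encoded)            # transducer beam search (beam_size: 4, nbest: 1)

EncoderDecoderASR.from_hparams performs roughly these steps (plus the Hub download) and then maps the decoded token ids back to text with the loaded SentencePiece tokenizer.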