File size: 4,960 Bytes
929f8fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3eaae3
 
76e238a
c3eaae3
 
 
 
 
 
 
 
 
 
 
 
 
 
929f8fe
 
 
 
 
 
 
 
 
 
 
c3eaae3
929f8fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# ############################################################################
# Model: E2E ASR with Transformer
# Encoder: Transformer Encoder
# Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch + TransformerLM
# Tokens: unigram
# Losses: CTC + KLdiv (Label Smoothing loss)
# Training: Switchboard
# Authors:  Jianyuan Zhong, Titouan Parcollet, Samuele Cornell, Dominik Wagner
# ############################################################################

# Feature parameters
# All three values are forwarded via `!ref` to the `compute_features` (Fbank)
# entry; `n_mels` is additionally the last dimension of `encoder.input_shape`.
sample_rate: 16000
n_fft: 400
n_mels: 80

####################### Model parameters  ###########################
# Transformer
# These scalars are consumed through `!ref` by the `Transformer`, `lm_model`,
# `ctc_lin` and `seq_lin` entries of this file.
# NOTE(review): 1280 presumably equals the flattened `CNN` output
# (64 channels x 80 mel bins / 4 after the two stride-2 blocks) -- confirm
# against ConvolutionFrontEnd before changing either side.
transformer_input_size: 1280
d_model: 256
nhead: 4
num_encoder_layers: 12
num_decoder_layers: 6
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
# Referenced as `tgt_vocab` of the Transformer, `vocab` of the LM, and
# `n_neurons` of both output linear layers.
output_neurons: 2000

# Outputs
# `blank_index` and `eos_index` are passed to `ctc_scorer`; `bos_index` and
# `eos_index` are passed to `decoder`.
# `label_smoothing` and `pad_index` are not the target of any `!ref` in the
# visible part of this file.
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2
# unk_index: 0

# Decoding parameters
# Consumed via `!ref` by the scorers and the `decoder` entry:
#   - `lm_weight` / `ctc_weight_decode` -> `scorer.weights`
#   - `temperature_lm`                  -> `transformerlm_scorer.temperature`
#   - `test_beam_size`                  -> `decoder.beam_size`
#   - the remaining keys                -> same-named `decoder` arguments
# `valid_search_interval` and `valid_beam_size` have no `!ref` in the visible
# part of this file.
# NOTE(review): they are presumably leftovers from the training recipe --
# confirm before removing.
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_search_interval: 10
valid_beam_size: 10
lm_weight: 0.30
test_beam_size: 60
ctc_weight_decode: 0.30
temperature: 1.0
temperature_lm: 1.0
using_eos_threshold: False
eos_threshold: 1.5
length_normalization: True
using_max_attn_shift: False
max_attn_shift: 30

# Convolutional front-end. Referenced as the `cnn` stage of `encoder`, as
# `pre_transformer` in `modules`, and as the first member of `asr_model`.
# NOTE(review): the parenthesised values are written as plain scalars and are
# presumably turned into tuples by HyperPyYAML -- keep the `(a, b, c)` spelling.
# NOTE(review): the trailing 80 in `input_shape` is hard-coded rather than a
# `!ref <n_mels>`; presumably the two must stay equal -- confirm.
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
  input_shape: (8, 10, 80)
  num_blocks: 3
  num_layers_per_block: 1
  out_channels: (64, 64, 64)
  kernel_sizes: (5, 5, 1)
  strides: (2, 2, 1)
  residuals: (False, False, True)

# Transformer ASR model. The same instance is referenced by `Tencoder`,
# `decoder.modules`, `asr_model` and `modules.transformer`.
# Size-related arguments come from the scalars in the model-parameters section;
# the last four arguments are literals specific to this entry.
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
  input_size: !ref <transformer_input_size>
  tgt_vocab: !ref <output_neurons>
  d_model: !ref <d_model>
  nhead: !ref <nhead>
  num_encoder_layers: !ref <num_encoder_layers>
  num_decoder_layers: !ref <num_decoder_layers>
  d_ffn: !ref <d_ffn>
  dropout: !ref <transformer_dropout>
  activation: !ref <activation>
  encoder_module: transformer
  attention_type: regularMHA
  normalize_before: True
  causal: False

# Transformer language model. Referenced by `transformerlm_scorer`, by
# `modules.lm_model`, and as the `lm` loadable of `pretrainer`.
# Only `vocab` is shared with the ASR model; every other hyperparameter is a
# literal here and is independent of the ASR `d_model` / `nhead` / `d_ffn`.
# `num_decoder_layers: 0` configures the LM with encoder layers only.
lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM # yamllint disable-line rule:line-length
  vocab: !ref <output_neurons>
  d_model: 264
  d_embedding: 128
  nhead: 12
  num_encoder_layers: 12
  num_decoder_layers: 0
  d_ffn: 1024
  dropout: 0.1
  activation: !name:torch.nn.ReLU
  normalize_before: False

# SentencePiece processor, constructed here with no arguments. It is listed as
# the `tokenizer` loadable of `pretrainer`.
tokenizer: !new:sentencepiece.SentencePieceProcessor

# Linear layer from `d_model` to `output_neurons`; handed to `ctc_scorer` as
# `ctc_fc` and included in `asr_model`.
ctc_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <d_model>
  n_neurons: !ref <output_neurons>

# Linear layer with the same dimensions; paired with the Transformer in
# `decoder.modules` and included in `asr_model`.
seq_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <d_model>
  n_neurons: !ref <output_neurons>

# Bundles the ASR sub-modules into one ModuleList so that `pretrainer` can
# treat them as a single `asr` loadable.
# The outer `-` makes the inner list the single positional argument.
# NOTE(review): member order presumably has to match the order used when the
# checkpoint was saved -- do not reorder without checking.
asr_model: !new:torch.nn.ModuleList
  - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]

# Log-softmax over the last dimension. No other entry in the visible part of
# this file references it.
log_softmax: !new:torch.nn.LogSoftmax
  dim: -1

# Input normalization configured with `norm_type: global`. Used as the
# `normalize` stage of `encoder`, exposed in `modules`, and listed as the
# `normalizer` loadable of `pretrainer`.
normalizer: !new:speechbrain.processing.features.InputNormalization
  norm_type: global

# Fbank feature extractor, parameterised by the feature-parameter scalars at
# the top of the file. Used as the first stage of `encoder` and exposed in
# `modules`.
compute_features: !new:speechbrain.lobes.features.Fbank
  sample_rate: !ref <sample_rate>
  n_fft: !ref <n_fft>
  n_mels: !ref <n_mels>

# Wraps the shared `Transformer` instance so it can be used as the last stage
# of `encoder`.
Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
  transformer: !ref <Transformer>

# Encoder pipeline: feature extraction, normalization, CNN front-end, then the
# wrapped Transformer encoder.
# NOTE(review): the key order below presumably defines the execution order of
# the stages -- do not reorder without checking LengthsCapableSequential.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
  input_shape: [null, null, !ref <n_mels>]
  compute_features: !ref <compute_features>
  normalize: !ref <normalizer>
  cnn: !ref <CNN>
  transformer_encoder: !ref <Tencoder>

# Beam-search decoding stack: a Transformer-LM scorer and a CTC scorer are
# combined by `ScorerBuilder` and handed to the beam searcher as `scorer`.
transformerlm_scorer: !new:speechbrain.decoders.scorer.TransformerLMScorer
  language_model: !ref <lm_model>
  temperature: !ref <temperature_lm>

# CTC scorer built on the `ctc_lin` output layer.
ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
  eos_index: !ref <eos_index>
  blank_index: !ref <blank_index>
  ctc_fc: !ref <ctc_lin>

# `weights` assigns each scorer its decode-time weight.
# NOTE(review): the keys `transformerlm` / `ctc` are presumably the names
# ScorerBuilder associates with the scorer classes above -- confirm before
# renaming them.
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
  full_scorers: [!ref <transformerlm_scorer>, !ref <ctc_scorer>]
  weights:
    transformerlm: !ref <lm_weight>
    ctc: !ref <ctc_weight_decode>

# Beam searcher over the Transformer and its `seq_lin` output layer.
# `beam_size` is taken from `test_beam_size`; `valid_beam_size` is not used
# by this entry.
decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
  modules: [!ref <Transformer>, !ref <seq_lin>]
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  min_decode_ratio: !ref <min_decode_ratio>
  max_decode_ratio: !ref <max_decode_ratio>
  beam_size: !ref <test_beam_size>
  temperature: !ref <temperature>
  using_eos_threshold: !ref <using_eos_threshold>
  eos_threshold: !ref <eos_threshold>
  length_normalization: !ref <length_normalization>
  using_max_attn_shift: !ref <using_max_attn_shift>
  max_attn_shift: !ref <max_attn_shift>
  scorer: !ref <scorer>

# Name -> module mapping exposing the objects defined in this file.
# NOTE(review): presumably read by the SpeechBrain inference interface, which
# looks modules up by these key names -- confirm before renaming any key.
modules:
  compute_features: !ref <compute_features>
  normalizer: !ref <normalizer>
  pre_transformer: !ref <CNN>
  transformer: !ref <Transformer>
  asr_model: !ref <asr_model>
  lm_model: !ref <lm_model>
  encoder: !ref <encoder>
  decoder: !ref <decoder>

# Pretrainer that loads saved parameters into the four objects listed under
# `loadables`. No `paths` mapping is given in the visible part of this file.
# NOTE(review): checkpoint locations are presumably derived from the loadable
# names by the caller -- confirm before renaming any key.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    normalizer: !ref <normalizer>
    asr: !ref <asr_model>
    lm: !ref <lm_model>
    tokenizer: !ref <tokenizer>