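############################################################################
# Inference hyperparameters for a Transformer ASR system: a convolutional
# front-end plus Transformer encoder-decoder, decoded with joint
# CTC/attention beam search and Transformer LM shallow fusion.
# Loaded with SpeechBrain's HyperPyYAML: `!new:` instantiates a class,
# `!name:` references a callable, and `!ref <key>` points at another entry.
############################################################################

# SentencePiece tokenizer; its model file is loaded by the pretrainer below.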
tokenizer: !new:sentencepiece.SentencePieceProcessor

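# The Pretrainer maps each name under `loadables` to an object defined in
# this file; calling collect_files() and then load_collected() fetches and
# loads the matching checkpoints (tokenizer model, LM, normalizer
# statistics, and ASR weights).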
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    lm: !ref <lm_model>
    tokenizer: !ref <tokenizer>
    normalizer: !ref <normalizer>
    asr: !ref <asr_model>

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80
hop_length: 20
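# Note: n_fft is in samples (400 samples = 25 ms at 16 kHz), while
# hop_length follows SpeechBrain's Fbank convention of milliseconds,
# i.e. a 20 ms frame shift (50 frames per second).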

compute_features: !new:speechbrain.lobes.features.Fbank
  sample_rate: !ref <sample_rate>
  n_fft: !ref <n_fft>
  n_mels: !ref <n_mels>
  hop_length: !ref <hop_length>

####################### Model parameters ###########################
# Transformer
d_model: 256
nhead: 4
num_encoder_layers: 12
num_decoder_layers: 6
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 5000
vocab_size: 5000

# Outputs
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2
unk_index: 0

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_search_interval: 10
valid_beam_size: 10
test_beam_size: 10
ctc_weight_decode: 0.3
lm_weight: 0.2
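# ctc_weight_decode blends CTC scores into the attention-decoder beam search
# (joint CTC/attention decoding); lm_weight scales the Transformer LM scores
# added during the search (shallow fusion).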

############################## models ################################
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
  input_shape: !!python/tuple [8, 10, 80]  # (batch, time, mels); last dim must equal n_mels
  num_blocks: 2
  num_layers_per_block: 1
  out_channels: !!python/tuple [256, 256]
  kernel_sizes: !!python/tuple [3, 3]
  strides: !!python/tuple [2, 2]
  residuals: !!python/tuple [False, False]
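# The two stride-2 blocks downsample both the time and mel axes by 4, so
# each frame flattens to 256 channels x (80 / 4) = 20 bins = 5120 features,
# which must match the Transformer's input_size below.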

Transformer:
  !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
  input_size: 5120
  tgt_vocab: !ref <output_neurons>
  d_model: !ref <d_model>
  nhead: !ref <nhead>
  num_encoder_layers: !ref <num_encoder_layers>
  num_decoder_layers: !ref <num_decoder_layers>
  d_ffn: !ref <d_ffn>
  dropout: !ref <transformer_dropout>
  activation: !ref <activation>
  normalize_before: True

lm_model:
  !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM # yamllint disable-line rule:line-length
  vocab: !ref <output_neurons>
  d_model: 576
  nhead: 6
  num_encoder_layers: 6
  num_decoder_layers: 0
  d_ffn: 1538
  dropout: 0.2
  activation: !name:torch.nn.GELU
  normalize_before: False

ctc_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <d_model>
  n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <d_model>
  n_neurons: !ref <output_neurons>

encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
  input_shape: [null, null, !ref <n_mels>]
  compute_features: !ref <compute_features>
  normalize: !ref <normalizer>
  cnn: !ref <CNN>
  transformer_encoder: !ref <Tencoder>

asr_model: !new:torch.nn.ModuleList
  - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]

decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
  modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  blank_index: !ref <blank_index>
  min_decode_ratio: !ref <min_decode_ratio>
  max_decode_ratio: !ref <max_decode_ratio>
  beam_size: !ref <test_beam_size>
  ctc_weight: !ref <ctc_weight_decode>
  lm_weight: !ref <lm_weight>
  lm_modules: !ref <lm_model>
  temperature: 1.15
  temperature_lm: 1.15
  using_eos_threshold: False
  length_normalization: True

Tencoder:
  !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
  transformer: !ref <Transformer>

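# Global mean/variance normalization. update_until_epoch only matters during
# training (the statistics freeze after epoch 4); at inference the saved
# statistics are restored via the pretrainer's `normalizer` loadable.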
normalizer: !new:speechbrain.processing.features.InputNormalization
  norm_type: global
  update_until_epoch: 4
modules:
  normalizer: !ref <normalizer>
  encoder: !ref <encoder>
  decoder: !ref <decoder>
log_softmax: !new:torch.nn.LogSoftmax
  dim: -1
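
# ---------------------------------------------------------------------------
# Minimal loading sketch (Python), kept commented out so this file stays
# valid YAML. It assumes the file is saved as hyperparams.yaml and that the
# checkpoint files live under the hypothetical directory
# "pretrained_models/asr-transformer"; adjust both to your setup.
#
#   from hyperpyyaml import load_hyperpyyaml
#
#   with open("hyperparams.yaml") as fin:
#       hparams = load_hyperpyyaml(fin)
#
#   # Fetch the files named in `loadables` and load them into the objects
#   # defined above (tokenizer model, LM, normalizer stats, ASR weights).
#   pretrainer = hparams["pretrainer"]
#   pretrainer.collect_files(default_source="pretrained_models/asr-transformer")
#   pretrainer.load_collected()
#
# SpeechBrain's higher-level EncoderDecoderASR.from_hparams() performs the
# same steps and adds a transcribe_file() convenience method.
# ---------------------------------------------------------------------------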