# File size: 4,594 Bytes
# 929f8fe
# ############################################################################
# Model: E2E ASR with Transformer
# Encoder: Transformer Encoder
# Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch + TransformerLM
# Tokens: unigram
# losses: CTC + KLdiv (Label Smoothing loss)
# Training: Switchboard
# Authors: Jianyuan Zhong, Titouan Parcollet, Samuele Cornell, Dominik Wagner
# ############################################################################
# Feature parameters
# All three values are forwarded to the `compute_features` (Fbank) entry
# through !ref; `n_mels` is also the last dimension of the `encoder`
# entry's `input_shape`.
# NOTE(review): sample_rate is presumably in Hz -- confirm.
sample_rate: 16000
n_fft: 400
n_mels: 80
####################### Model parameters ###########################
# Transformer
# These scalars are wired into the `Transformer` entry through !ref.
# NOTE(review): 1280 presumably equals 64 CNN output channels times the 20
# mel bins left after two stride-2 blocks over 80 mels -- confirm before
# changing either the CNN or n_mels.
transformer_input_size: 1280
# Also used as `input_size` of the `ctc_lin` and `seq_lin` linear layers.
d_model: 256
nhead: 4
num_encoder_layers: 12
num_decoder_layers: 6
d_ffn: 2048
transformer_dropout: 0.1
# Tagged with !name (not !new); the value is passed to `Transformer` as its
# `activation` argument.
activation: !name:torch.nn.GELU
# Shared by `Transformer.tgt_vocab`, `lm_model.vocab` and the `n_neurons`
# of both `ctc_lin` and `seq_lin`.
output_neurons: 2000
# Outputs
# Special token ids. `blank_index`, `bos_index` and `eos_index` are forwarded
# to the `decoder` entry through !ref.
# NOTE(review): `blank_index` and `pad_index` share id 0, and neither
# `label_smoothing` nor `pad_index` is referenced in the visible part of this
# file -- presumably kept for parity with the training recipe; confirm.
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2
# unk_index: 0
# Decoding parameters
# Every value below except `valid_search_interval` and `valid_beam_size` is
# forwarded to the `decoder` entry through !ref.
min_decode_ratio: 0.0
max_decode_ratio: 1.0
# NOTE(review): the two valid_* values are not referenced in the visible part
# of this file -- presumably only meaningful during training; confirm.
valid_search_interval: 10
valid_beam_size: 10
# Passed to the decoder as `lm_weight`.
lm_weight: 0.30
# Passed to the decoder as `beam_size`.
test_beam_size: 60
# Passed to the decoder as `ctc_weight`.
ctc_weight_decode: 0.30
temperature: 1.0
temperature_lm: 1.0
using_eos_threshold: False
eos_threshold: 1.5
length_normalization: True
using_max_attn_shift: False
max_attn_shift: 30
# Convolutional front-end. It is reused as the `cnn` stage of `encoder`, as
# the first member of `asr_model`, and as `pre_transformer` in `modules`.
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    # NOTE(review): the last dimension matches n_mels (80); the leading two
    # look like placeholder batch/time sizes -- confirm.
    input_shape: (8, 10, 80)
    num_blocks: 3
    num_layers_per_block: 1
    # The tuples below carry three entries, matching num_blocks
    # (presumably one value per block -- confirm).
    out_channels: (64, 64, 64)
    kernel_sizes: (5, 5, 1)
    strides: (2, 2, 1)
    residuals: (False, False, True)
# Transformer ASR model. Its sizes come from the scalar section at the top of
# the file; it is wrapped by `Tencoder` and handed to `decoder`, `asr_model`
# and `modules`.
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    # Input/output dimensions.
    input_size: !ref <transformer_input_size>
    tgt_vocab: !ref <output_neurons>
    # Layer geometry.
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    # Architecture switches.
    encoder_module: transformer
    attention_type: regularMHA
    normalize_before: True
    causal: False
# Transformer language model used by `decoder` (as `lm_modules`) and loaded by
# the pretrainer under the `lm` key. Its hyperparameters are literal values
# rather than !ref links to the ASR model; only `vocab` is shared.
lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM # yamllint disable-line rule:line-length
    vocab: !ref <output_neurons>
    d_model: 264
    d_embedding: 128
    nhead: 12
    # Twelve encoder layers and no decoder layers.
    num_encoder_layers: 12
    num_decoder_layers: 0
    d_ffn: 1024
    dropout: 0.1
    activation: !name:torch.nn.ReLU
    normalize_before: False
# SentencePiece processor constructed without arguments; it is registered as
# the `tokenizer` loadable of the `pretrainer` entry.
tokenizer: !new:sentencepiece.SentencePieceProcessor
# Linear layer from d_model to output_neurons, named for the CTC branch.
# Referenced by `asr_model` and by the decoder's `modules` list.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
# Linear layer from d_model to output_neurons, named for the seq2seq branch.
# Referenced by `asr_model` and by the decoder's `modules` list.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
# Bundles every trainable ASR component into one ModuleList, which the
# pretrainer loads under the `asr` key.
# NOTE(review): the member order presumably has to match the saved
# checkpoint -- do not reorder without confirming.
asr_model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
# Log-softmax over the last dimension.
# NOTE(review): not referenced by any other entry in the visible part of this
# file -- presumably looked up by name from calling code; confirm.
log_softmax: !new:torch.nn.LogSoftmax
    dim: -1
# Input normalization with global statistics. Used as the `normalize` stage
# of `encoder`, exposed in `modules`, and restored by the pretrainer under the
# `normalizer` key.
normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
# Filterbank feature extractor, configured from the feature parameters at the
# top of the file. It is the first stage of `encoder`.
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
# Wraps the `Transformer` entry so it can be used as the final
# `transformer_encoder` stage of `encoder`.
Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper # yamllint disable-line rule:line-length
    transformer: !ref <Transformer>
# Full encoding pipeline as a sequential container. The stages are listed in
# the order feature extraction -> normalization -> CNN -> Transformer encoder;
# keep that order when editing.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    # Only the last dimension (number of mel bins) is fixed.
    input_shape:
        - null
        - null
        - !ref <n_mels>
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    cnn: !ref <CNN>
    transformer_encoder: !ref <Tencoder>
# Beam-search decoder over the Transformer, combining the seq2seq and CTC
# heads with the language model. Every setting is a !ref to a scalar defined
# in the "Outputs" and "Decoding parameters" sections.
decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
    # Model, seq2seq head, CTC head -- in this order.
    modules:
        - !ref <Transformer>
        - !ref <seq_lin>
        - !ref <ctc_lin>
    # Special token ids.
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    # Search limits and beam width (the test-time beam size is used).
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
    # Weights for the CTC score and the language-model score.
    ctc_weight: !ref <ctc_weight_decode>
    lm_weight: !ref <lm_weight>
    lm_modules: !ref <lm_model>
    # Temperatures, EOS handling and remaining search switches.
    temperature: !ref <temperature>
    temperature_lm: !ref <temperature_lm>
    using_eos_threshold: !ref <using_eos_threshold>
    eos_threshold: !ref <eos_threshold>
    length_normalization: !ref <length_normalization>
    using_max_attn_shift: !ref <using_max_attn_shift>
    max_attn_shift: !ref <max_attn_shift>
# Name -> object table of the components defined in this file.
# NOTE(review): presumably consumed by the inference interface that loads
# these hyperparameters -- confirm which keys it requires before renaming.
modules:
    # Front-end stages.
    compute_features: !ref <compute_features>
    normalizer: !ref <normalizer>
    pre_transformer: !ref <CNN>
    # Acoustic model and language model.
    transformer: !ref <Transformer>
    asr_model: !ref <asr_model>
    lm_model: !ref <lm_model>
    # Assembled pipelines.
    encoder: !ref <encoder>
    decoder: !ref <decoder>
# Pretrainer that maps loadable names to the objects whose saved parameters
# should be restored: the input normalizer, the bundled ASR model, the
# language model and the tokenizer.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        normalizer: !ref <normalizer>
        asr: !ref <asr_model>
        lm: !ref <lm_model>
        tokenizer: !ref <tokenizer>
|