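############################ Pretrained loading ##############################
# The tokenizer is instantiated empty here; its trained SentencePiece model,
# along with the LM, normalizer, and ASR weights, is loaded into place by the
# Pretrainer below.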
tokenizer: !new:sentencepiece.SentencePieceProcessor

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        lm: !ref <lm_model>
        tokenizer: !ref <tokenizer>
        normalizer: !ref <normalizer>
        asr: !ref <asr_model>

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80
hop_length: 20
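# 80-dimensional log-Mel filterbanks from 16 kHz audio. n_fft: 400 samples is
# a 25 ms analysis window at 16 kHz; hop_length follows SpeechBrain's STFT
# convention of milliseconds, so frames advance every 20 ms.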
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
    hop_length: !ref <hop_length>

####################### Model parameters ###########################
# Transformer
d_model: 256
nhead: 4
num_encoder_layers: 12
num_decoder_layers: 6
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
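# Size of the SentencePiece vocabulary, shared by the seq2seq/CTC output
# heads and the LM.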
output_neurons: 5000
vocab_size: 5000
# Outputs
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2
unk_index: 0
# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_search_interval: 10
valid_beam_size: 10
test_beam_size: 10
ctc_weight_decode: 0.3
lm_weight: 0.2
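# Decoding mixes three scores: the attention decoder, the CTC head
# (ctc_weight_decode: 0.3), and the LM via shallow fusion (lm_weight: 0.2).
# Hypothesis length is bounded between 0.0x and 1.0x the encoder output length
# (min/max_decode_ratio); valid_search_interval is a training-recipe setting
# that runs the expensive beam search only every 10th validation epoch.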
############################## models ################################
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (256, 256)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)
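# The two stride-2 blocks downsample time by 4x and frequency from 80 to 20
# bins; flattening 20 bins x 256 channels gives the Transformer input_size
# of 5120.
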
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    input_size: 5120
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    normalize_before: True
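# normalize_before: True selects pre-LayerNorm Transformer blocks, which are
# generally more stable to train than the original post-LN arrangement.

# Transformer LM used for shallow fusion during decoding. With
# num_decoder_layers: 0 it is a causal, encoder-only Transformer over the
# same vocabulary as the ASR model.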
lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM # yamllint disable-line rule:line-length
    vocab: !ref <output_neurons>
    d_model: 576
    nhead: 6
    num_encoder_layers: 6
    num_decoder_layers: 0
    d_ffn: 1538
    dropout: 0.2
    activation: !name:torch.nn.GELU
    normalize_before: False
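# Linear heads projecting d_model-dimensional states onto the vocabulary:
# ctc_lin feeds the CTC branch, seq_lin the attention decoder.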
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
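# Full encoding pipeline: waveform -> Fbank features -> normalization ->
# CNN front end -> Transformer encoder.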
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    cnn: !ref <CNN>
    transformer_encoder: !ref <Tencoder>

asr_model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
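# Grouping the trainable modules in one ModuleList lets the Pretrainer restore
# all ASR weights from the single "asr" entry declared at the top.
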
decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
    modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
    ctc_weight: !ref <ctc_weight_decode>
    lm_weight: !ref <lm_weight>
    lm_modules: !ref <lm_model>
    temperature: 1.15
    temperature_lm: 1.15
    using_eos_threshold: False
    length_normalization: True
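# (temperature / temperature_lm slightly above 1 mildly flatten the acoustic
# and LM posteriors before fusion; length_normalization removes the bias
# toward short hypotheses.)

# Encoder-only view of the full TransformerASR, used in <encoder> above.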
Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>
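# Global mean/variance normalization; statistics stop updating after epoch 4
# of training and are restored by the Pretrainer at inference time.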
normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
    update_until_epoch: 4
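# Modules exposed to the inference interface; EncoderDecoderASR looks up the
# "encoder" and "decoder" entries here.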
modules:
    normalizer: !ref <normalizer>
    encoder: !ref <encoder>
    decoder: !ref <decoder>
log_softmax: !new:torch.nn.LogSoftmax
    dim: -1
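############################### Usage (sketch) ###############################
# A minimal usage sketch, assuming this YAML is saved as hyperparams.yaml
# beside the fetched checkpoints; the source path below is a placeholder:
#
#     from speechbrain.inference.ASR import EncoderDecoderASR
#     # (speechbrain.pretrained.EncoderDecoderASR on SpeechBrain < 1.0)
#     asr = EncoderDecoderASR.from_hparams(source="path/to/model_dir")
#     print(asr.transcribe_file("example.wav"))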