# ############################################################################
# Model: E2E ASR with transformer and transducer
# Encoder: Conformer
# Decoder: LSTM + beamsearch + RNNLM
# Tokens: BPE with unigram
# losses: Transducer + CTC (optional) + CE (optional)
# Training: GigaSpeech
# Authors: Titouan Parcollet 2024
# ############################################################################
# Feature parameters
sample_rate: 16000  # Hz; audio sampling rate fed to the Fbank extractor below
n_fft: 512  # FFT size for the spectrogram underlying the filterbanks
n_mels: 80  # number of mel filterbank channels (feature dimension)
win_length: 32  # analysis window length (presumably ms, per SpeechBrain Fbank) -- TODO confirm
# BPE parameters
token_type: unigram # ["unigram", "bpe", "char"]
character_coverage: 1.0  # SentencePiece: fraction of input characters covered by the model
####################### Model Parameters #######################################
# Transformer
d_model: 768  # Conformer encoder hidden dimension
joint_dim: 512  # common projection dim for encoder/decoder outputs before the joint network
nhead: 8  # number of self-attention heads
num_encoder_layers: 12  # Conformer encoder depth
num_decoder_layers: 0  # no transformer decoder: the transducer uses the LSTM predictor below
d_ffn: 2048  # feed-forward dimension inside each encoder layer
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 1024  # token vocabulary size (also the transducer output dimension)
dec_dim: 512  # LSTM prediction-network hidden size
dec_emb_dropout: 0.2
dec_dropout: 0.1
# Decoding parameters
blank_index: 0
bos_index: 1
eos_index: 2
pad_index: 0
beam_size: 10  # only used by the (commented-out) beam searcher below; greedy uses 1
nbest: 1
# by default {state,expand}_beam = 2.3 as mentioned in the paper
# https://arxiv.org/abs/1904.02619
state_beam: 2.3
expand_beam: 2.3
# Global mean/variance normalization of the input features;
# statistics stop updating after epoch <update_until_epoch>.
normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
    update_until_epoch: 4
# Log-mel filterbank feature extraction (waveform -> [batch, time, n_mels])
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
    win_length: !ref <win_length>
############################## Models ##########################################
# Convolutional front-end: two stride-2 blocks, i.e. 4x downsampling of the
# feature sequence before the Conformer. Parenthesized tuples are hyperpyyaml
# syntax, not plain YAML.
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)  # (batch, time, n_mels) template used for shape inference
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (64, 32)
    kernel_sizes: (3, 3)
    strides: (2, 2)  # stride 2 per block -> overall 4x reduction
    residuals: (False, False)
# Conformer encoder (num_decoder_layers = 0, so no transformer decoder is built)
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    input_size: 640  # presumably CNN output: 80 mels / 4 (two stride-2 blocks) * 32 channels -- TODO confirm
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    encoder_module: conformer
    attention_type: RelPosMHAXL
    normalize_before: True
    causal: False
# We must call an encoder wrapper so the decoder isn't run (we don't have any)
enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>
# For MTL CTC over the encoder
proj_ctc: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <joint_dim>
    n_neurons: !ref <output_neurons>
# Define some projection layers to make sure that enc and dec
# output dim are the same before joining
proj_enc: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <joint_dim>
    bias: False
proj_dec: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_dim>
    n_neurons: !ref <joint_dim>
    bias: False
# Prediction-network input embedding. With consider_as_one_hot the effective
# embedding dim is <output_neurons> - 1 (presumably the blank maps to the
# all-zero vector), which is why the LSTM input size below subtracts 1 --
# TODO confirm against speechbrain.nnet.embedding.Embedding.
emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    consider_as_one_hot: True
    blank_id: !ref <blank_index>
# Transducer prediction network: single-layer LSTM
dec: !new:speechbrain.nnet.RNN.LSTM
    input_shape: [null, null, !ref <output_neurons> - 1]
    hidden_size: !ref <dec_dim>
    num_layers: 1
    re_init: True
# Joint network combining projected encoder and predictor outputs
Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
    joint: sum # joint [sum | concat]
    nonlinearity: !ref <activation>
# Final output layer: joint features -> token logits
transducer_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <joint_dim>
    n_neurons: !ref <output_neurons>
    bias: False
# for MTL
# update model if any HEAD module is added
# Dict of modules exposed to the recipe (presumably consumed by the
# Brain class for device placement / checkpointing -- TODO confirm)
modules:
    CNN: !ref <CNN>
    enc: !ref <enc>
    emb: !ref <emb>
    dec: !ref <dec>
    Tjoint: !ref <Tjoint>
    transducer_lin: !ref <transducer_lin>
    normalize: !ref <normalize>
    proj_ctc: !ref <proj_ctc>
    proj_dec: !ref <proj_dec>
    proj_enc: !ref <proj_enc>
# update model if any HEAD module is added
# Flat parameter container referenced by the pretrainer below
model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <enc>, !ref <emb>, !ref <dec>, !ref <proj_enc>, !ref <proj_dec>, !ref <proj_ctc>, !ref <transducer_lin>]
############################## Decoding & optimiser ############################
# Greedy decoding (beam_size: 1); also reused as the streaming decoder below.
Greedysearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
    decode_network_lst: [!ref <emb>, !ref <dec>, !ref <proj_dec>]
    tjoint: !ref <Tjoint>
    classifier_network: [!ref <transducer_lin>]
    blank_id: !ref <blank_index>
    beam_size: 1
    nbest: 1
# Uncomment (and comment Greedysearcher usages) for full beam search:
#Beamsearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
#    decode_network_lst: [!ref <emb>, !ref <dec>, !ref <proj_dec>]
#    tjoint: !ref <Tjoint>
#    classifier_network: [!ref <transducer_lin>]
#    blank_id: !ref <blank_index>
#    beam_size: !ref <beam_size>
#    nbest: !ref <nbest>
#    state_beam: !ref <state_beam>
#    expand_beam: !ref <expand_beam>
# Empty SentencePiece processor; its weights are loaded by the pretrainer.
tokenizer: !new:sentencepiece.SentencePieceProcessor
# Maps checkpoint names to the objects whose parameters they populate.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        normalizer: !ref <normalize>
        tokenizer: !ref <tokenizer>
# ---- Streaming-inference helpers (constructors passed as callables) ----
make_tokenizer_streaming_context: !name:speechbrain.tokenizers.SentencePiece.SentencePieceDecoderStreamingContext
tokenizer_decode_streaming: !name:speechbrain.tokenizers.SentencePiece.spm_decode_preserve_leading_space
make_decoder_streaming_context: !name:speechbrain.decoders.transducer.TransducerGreedySearcherStreamingContext # default constructor
# Partial application: binds <Greedysearcher> as `self` of the unbound method.
decoding_function: !name:speechbrain.decoders.transducer.TransducerBeamSearcher.transducer_greedy_decode_streaming
    - !ref <Greedysearcher> # self
# Chunk-wise feature pipeline for streaming: features -> normalization -> CNN.
fea_streaming_extractor: !new:speechbrain.lobes.features.StreamingFeatureWrapper
    module: !new:speechbrain.nnet.containers.LengthsCapableSequential
        - !ref <compute_features>
        - !ref <normalize>
        - !ref <CNN>
    # don't consider normalization as part of the input filter chain.
    # normalization will operate at chunk level, which mismatches training
    # somewhat, but does not appear to result in noticeable degradation.
    properties: !apply:speechbrain.utils.filter_analysis.stack_filter_properties
        - [!ref <compute_features>, !ref <CNN>]