speechbrain
/

asr-conformer-transformerlm-librispeech

Automatic Speech Recognition

hf-asr-leaderboard

Model card | Files and versions | Community

Titouan Parcollet committed on Jun 21, 2023

Commit

964e281

•

1 Parent(s): 56773ef

update params

Files changed (1) hide show

hyperparams.yaml +13 -15

hyperparams.yaml CHANGED Viewed

@@ -10,24 +10,23 @@
 # Feature parameters
 sample_rate: 16000
-n_fft: 400
 n_mels: 80
 ####################### Model parameters ###########################
 # Transformer
 d_model: 512
-nhead: 4
 num_encoder_layers: 12
 num_decoder_layers: 6
 d_ffn: 2048
 transformer_dropout: 0.1
 activation: !name:torch.nn.GELU
 output_neurons: 5000
-vocab_size: 5000
 # Outputs
 blank_index: 0
-label_smoothing: 0.0
 pad_index: 0
 bos_index: 1
 eos_index: 2
@@ -45,16 +44,15 @@ ctc_weight_decode: 0.40
 CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
     input_shape: (8, 10, 80)
-    num_blocks: 3
     num_layers_per_block: 1
-    out_channels: (64, 64, 64)
-    kernel_sizes: (5, 5, 1)
-    strides: (2, 2, 1)
-    residuals: (False, False, True)
-    norm: !name:speechbrain.nnet.normalization.LayerNorm
-Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR
-    input_size: 1280
     tgt_vocab: !ref <output_neurons>
     d_model: !ref <d_model>
     nhead: !ref <nhead>
@@ -63,8 +61,8 @@ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.Transforme
     d_ffn: !ref <d_ffn>
     dropout: !ref <transformer_dropout>
     activation: !ref <activation>
-    encoder_module: transformer
-    attention_type: regularMHA
     normalize_before: True
     causal: False

 # Feature parameters
 sample_rate: 16000
+n_fft: 512
 n_mels: 80
 ####################### Model parameters ###########################
 # Transformer
 d_model: 512
+nhead: 8
 num_encoder_layers: 12
 num_decoder_layers: 6
 d_ffn: 2048
 transformer_dropout: 0.1
 activation: !name:torch.nn.GELU
 output_neurons: 5000
 # Outputs
 blank_index: 0
+label_smoothing: 0.1
 pad_index: 0
 bos_index: 1
 eos_index: 2
 CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
     input_shape: (8, 10, 80)
+    num_blocks: 2
     num_layers_per_block: 1
+    out_channels: (64, 32)
+    kernel_sizes: (3, 3)
+    strides: (2, 2)
+    residuals: (False, False)
+Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
+    input_size: 640
     tgt_vocab: !ref <output_neurons>
     d_model: !ref <d_model>
     nhead: !ref <nhead>
     d_ffn: !ref <d_ffn>
     dropout: !ref <transformer_dropout>
     activation: !ref <activation>
+    encoder_module: conformer
+    attention_type: RelPosMHAXL
     normalize_before: True
     causal: False