# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 18      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.0
    attention_dropout_rate: 0.0
    input_layer: conv2d6    # encoder input type; choose from conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    macaron_style: True
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'abs_selfattn'
    nonorm: False
    cnn_prev: True
    cnn_after: False
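    # Note: in WeNet-style recipes, input_layer selects the convolutional
    # subsampling front-end: conv2d downsamples frames by 4x, conv2d6 by 6x,
    # and conv2d8 by 8x (standard reading; confirm against your fork).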
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 1
    dropout_rate: 0.0
    positional_dropout_rate: 0.0
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 1.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
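    # Note: assuming the standard WeNet/ESPnet hybrid objective,
    # loss = ctc_weight * L_ctc + (1 - ctc_weight) * L_attention.
    # With ctc_weight: 1.0 the attention branch (the 1-block decoder above)
    # contributes nothing to the training loss.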
# pretraining related
raw_wav: False
data_save: True
use_gc: True
w2v_encoder: True
pretrain: True
random_pretrain: False
wav2vec: True
w2v_coef: 1.0
mpc_didi_ver: False
wav2mpc: False
wav2mpc_reduction: False
mpc_mask_loss: False
mpc_coef: 0.0
mask: True
quantize_targets: True
project_targets: True
latent_vars: 320
w2v_reduct: True
w2v_ext_loss: True
w2v_loss_weights: [0.1, 0]
w2v_mask_prob: 0.65
mpc_prob: 0.5
remove_valbest: False
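# Note on w2v_loss_weights: assuming these mirror fairseq's wav2vec 2.0
# loss_weights, [0.1, 0] would mean a 0.1-weighted codebook-diversity penalty
# and a zeroed feature L2 penalty on top of the contrastive loss. This mapping
# is an assumption, not confirmed by this file.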
model:
    method: 'npc'                   # Accepts npc/apc/vqapc
    paras:
        kernel_size: 15             # Receptive field size (R) = kernel_size + 2*(n_blocks)
        mask_size: 5                # Desired input mask size (M_in) as described in NPC paper
        n_blocks: 4                 # Number of ConvBlocks stacked in NPC model
        hidden_size: 512            # Dimension of feature of all layers
        dropout: 0.1                # Dropout in ConvBlock
        residual: True              # Residual connection in ConvBlock
        batch_norm: True            # Apply BatchNorm in ConvBlock
        activate: 'relu'            # Activation function of ConvBlock
        disable_cross_layer: False  # Apply Masked ConvBlock at last layer only
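        # Worked example, using the formula above: with kernel_size = 15 and
        # n_blocks = 4, R = 15 + 2*4 = 23 frames of context per prediction.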
        vq:
            codebook_size: [64, 64, 64, 64]  # Codebook size of each group in VQ-layer
            code_dim: [128, 128, 128, 128]   # Dim of each group summing up to hidden_size
            gumbel_temperature: 1.0          # Temperature of Gumbel Softmax in VQ-layer
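            # Sanity check: 4 groups x 128 dims = 512, matching hidden_size above.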
collate_conf:
    spec_aug: false
    # specaugmentation related
    spec_aug_conf:
        num_time_mask: 2
        num_freq_mask: 2
        max_time_mask: 50
        max_freq_mask: 10
        max_time_warp: 80
        gauss_mask_for_time: False
        warp_for_time: False
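    # Note: spec_aug is false here, so the spec_aug_conf values above are
    # presumably inert; they only take effect once spec_aug is set to true.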
# dataset related
dataset_conf:
    max_length: 4500
    min_length: 80
    max_frames_in_batch: 16000
    batch_type: 'dynamic'   # static or dynamic
    batch_size: 20
    sort: true
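    # Note: with batch_type 'dynamic', batches are assumed to be filled up to
    # roughly max_frames_in_batch (16000) total frames, and batch_size: 20 only
    # applies in 'static' mode (WeNet-style reading). With accum_grad: 2 below,
    # one optimizer step then covers about 2 * 16000 = 32000 frames.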
grad_clip: 10
accum_grad: 2
max_epoch: 180
log_interval: 100
optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 10000
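# Note: assuming the ESPnet/WeNet WarmupLR schedule,
# lr(step) = lr * warmup_steps^0.5 * min(step^-0.5, step * warmup_steps^-1.5),
# i.e. linear warmup to the peak lr of 0.001 at step 10000, followed by
# inverse square-root decay.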