DATA_ROOT=/home/wangrui/projects/SpeechT5/manifest
SAVE_DIR=/home/wangrui/projects/SpeechT5/experimental/s2c
TRAIN_SET=train
VALID_SET=valid
# Multi-GPU training requires USER_DIR to point at the speecht5 example inside fairseq's examples/.
USER_DIR=/home/wangrui/projects/SpeechT5/SpeechT5/fairseq/examples/speecht5
PT_CHECKPOINT_PATH=/nfs-data/user1/PhDHub/ckpt/speecht5_base.pt

mkdir -p ${SAVE_DIR}

# Batch settings: 8 GPUs / batch-size 32 / update-freq 2, or
# 4 GPUs / batch-size 8 / update-freq 2 (the latter is used below).
# Flag groups, top to bottom: distributed and logging setup; task and data;
# criterion and checkpoint-selection metric; optimizer and triangular LR
# schedule; run limits and checkpointing; model architecture; and the
# pre-trained checkpoint to fine-tune from.
fairseq-train ${DATA_ROOT} \
  --save-dir ${SAVE_DIR} \
  --tensorboard-logdir ${SAVE_DIR} \
  --train-subset ${TRAIN_SET} \
  --valid-subset ${VALID_SET} \
  --user-dir ${USER_DIR} \
  --distributed-world-size 4 \
  --distributed-port 0 \
  --ddp-backend legacy_ddp \
  --log-format json \
  --seed 1 \
  --fp16 \
  \
  --task speecht5 \
  --t5-task s2c \
  --sample-rate 16000 \
  --num-workers 4 \
  --batch-size 8 \
  --update-freq 2 \
  --data-buffer-size 0 \
  \
  --criterion speecht5 \
  --report-accuracy \
  --best-checkpoint-metric "s2c_accuracy" \
  --maximize-best-checkpoint-metric \
  \
  --optimizer adam \
  --dropout 0.1 \
  --activation-dropout 0.1 \
  --attention-dropout 0.1 \
  --encoder-layerdrop 0.05 \
  --lr-scheduler triangular \
  --max-lr 2e-4 \
  --lr-period-updates 60000 \
  --lr-shrink 0.5 \
  --lr 1e-8 \
  --feature-grad-mult 1.0 \
  --weight-decay 0.1 \
  \
  --max-update 60000 \
  --max-text-positions 600 \
  --max-speech-positions 8000 \
  --required-batch-size-multiple 1 \
  --skip-invalid-size-inputs-valid-test \
  --save-interval-updates 10000 \
  --validate-after-updates 20000 \
  --no-epoch-checkpoints \
  --log-interval 10 \
  \
  --arch t5_transformer_base_asr \
  --share-input-output-embed \
  --find-unused-parameters \
  --bert-init \
  --relative-position-embedding \
  --mask-prob 0.0 \
  --mask-channel-prob 0.0 \
  --sid-no-pooling-bn \
  --sid-no-embed-postnet \
  \
  --finetune-from-model ${PT_CHECKPOINT_PATH} >> ${SAVE_DIR}/train.log

echo "SID finetuning finished"
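
# After the run completes, surface the last few reported validation accuracies
# from the appended log. This is a minimal sketch: it assumes the speecht5
# criterion writes the "s2c_accuracy" metric configured above into the
# JSON-formatted log lines; adjust the pattern if your fairseq version
# prefixes it (e.g. as "valid_s2c_accuracy").
grep "s2c_accuracy" ${SAVE_DIR}/train.log | tail -n 5

# fairseq tracks --best-checkpoint-metric across validations and keeps the
# best-scoring model (highest s2c_accuracy here, per
# --maximize-best-checkpoint-metric) at ${SAVE_DIR}/checkpoint_best.pt.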