---
# Training configuration for a BART-based transformer run.
# NOTE(review): the original file was collapsed onto one physical line,
# which is invalid YAML (plain scalars cannot contain ": "). Restored to
# one key per line; every key and value is preserved verbatim, in the
# original order.

# Model
framework: bart
data_dir: ../../dataset
# NOTE(review): "balence" spelling kept as-is — presumably a dataset/dir
# name the loader matches literally; verify before "fixing" the typo.
train_data: msd_balence
text_type: all
arch: transformer

# Dataloading / schedule
workers: 12
epochs: 4096
warmup_epochs: 125
start_epoch: 0
batch_size: 256

# Optimization
lr: 0.0001
min_lr: 1.0e-09
cos: true  # presumably cosine LR decay — confirm against the trainer
label_smoothing: 0.1

# Distributed setup
world_size: 1
rank: 0
dist_url: "tcp://localhost:12312"  # quoted: value contains colons
dist_backend: nccl
multiprocessing_distributed: false
distributed: false
gpu: 0

# Misc
seed: null  # no fixed seed
print_freq: 100
bart_pretrain: false
use_early_stopping: false
eval_sample: 0  # NOTE(review): semantics unclear from here — 0 may mean "disabled"
max_length: 110