---
# Training configuration for a BART-based transformer run.
# NOTE(review): the original file was collapsed onto one physical line,
# which is invalid YAML (plain scalars cannot contain ": "). Restored to
# one key per line; every key and value is preserved verbatim, in the
# original order.

# Model
framework: bart
data_dir: ../../dataset
# NOTE(review): "balence" spelling kept as-is — presumably a dataset/dir
# name the loader matches literally; verify before "fixing" the typo.
train_data: msd_balence
text_type: all
arch: transformer

# Dataloading / schedule
workers: 12
epochs: 4096
warmup_epochs: 125
start_epoch: 0
batch_size: 256

# Optimization
lr: 0.0001
min_lr: 1.0e-09
cos: true  # presumably cosine LR decay — confirm against the trainer
label_smoothing: 0.1

# Distributed setup
world_size: 1
rank: 0
dist_url: "tcp://localhost:12312"  # quoted: value contains colons
dist_backend: nccl
multiprocessing_distributed: false
distributed: false
gpu: 0

# Misc
seed: null  # no fixed seed
print_freq: 100
bart_pretrain: false
use_early_stopping: false
eval_sample: 0  # NOTE(review): semantics unclear from here — 0 may mean "disabled"
max_length: 110