---
# Run configuration snapshot. Every top-level key wraps its setting under a
# single `value` key.
#
# NOTE(review): the nesting below was reconstructed from a copy of this file
# in which all line breaks and indentation had been collapsed. Placement of
# trailing keys after a nested mapping (`weights_l2`, `random_init`) follows
# the alphabetical key order and their sibling keys -- confirm against the
# original run config.

# Tool/runtime metadata. The numeric codes under `t` are opaque here; their
# meaning is not established by this file.
_wandb:
  value:
    cli_version: "0.18.1"
    m: []
    python_version: "3.11.10"
    t:
      "1":
        - 1
        - 11
        - 41
        - 49
        - 50
        - 51
        - 55
        - 71
        - 100
      "2":
        - 1
        - 11
        - 41
        - 49
        - 50
        - 51
        - 55
        - 71
        - 100
      "3":
        - 2
        - 15
        - 16
        - 23
        - 55
        - 61
      "4": "3.11.10"
      "5": "0.18.1"
      "6": "4.44.2"
      "8":
        - 5
      "12": "0.18.1"
      "13": linux-x86_64

checkpoint:
  value:
    every_steps: 2500

data:
  value:
    before_mask_input_length: 1137
    input_length: 1024
    mean_noise_span_length: 3
    mlm_probability: 0.15
    num_workers: 16
    target_length: 229

device:
  value: gpu

eval:
  value:
    corrected_steps: 500
    every_steps: 1000000000
    steps: 500

eval_only:
  value: false

logging:
  value:
    every_steps: 25
    grad_l2: true
    use_wandb: true
    wandb_config:
      entity: pszemraj
      mode: online
      project: nanoT5
      tags:
        - 24x24
        # Quoted so the tag stays a string rather than the integer 1024.
        - "1024"
    # NOTE(review): assumed to be a sibling of `grad_l2`, not part of
    # `wandb_config` -- confirm.
    weights_l2: true

mode:
  value: pt

model:
  value:
    checkpoint_path: ""
    compile: true
    klass: hf_t5
    name: pszemraj/tFINE-850m-24x24-512ctx
    overwrite:
      dropout_rate: 0
    # NOTE(review): assumed to belong to `model`, not `overwrite` -- confirm.
    random_init: false

n_all_param:
  value: 853929472

optim:
  value:
    base_lr: 0.01
    batch_size: 128
    epochs: -1
    # NOTE(review): exponent form without a decimal point; some YAML 1.1
    # loaders resolve this as a string instead of a float. Value kept as
    # written -- confirm how the consumer parses it.
    final_cosine: 2e-05
    grad_acc: 8
    grad_clip: 1
    lr_scheduler: cosine
    name: adamwscale
    total_steps: 20000
    warmup_steps: 5000
    weight_decay: 0

precision:
  value: bf16

predict_only:
  value: false

seed:
  value: 34534

# The plain scalar `none` is a string here, not a YAML null.
slurm_id:
  value: none

tokenizer:
  value:
    name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5

working_dir:
  value: /workspace/nanoT5/outputs/2024-09-26/05-19-51