_wandb: | |
value: | |
cli_version: 0.18.1 | |
m: [] | |
python_version: 3.11.10 | |
t: | |
"1": | |
- 1 | |
- 11 | |
- 41 | |
- 49 | |
- 50 | |
- 51 | |
- 55 | |
- 71 | |
- 100 | |
"2": | |
- 1 | |
- 11 | |
- 41 | |
- 49 | |
- 50 | |
- 51 | |
- 55 | |
- 71 | |
- 100 | |
"3": | |
- 2 | |
- 15 | |
- 16 | |
- 23 | |
- 55 | |
- 61 | |
"4": 3.11.10 | |
"5": 0.18.1 | |
"6": 4.44.2 | |
"8": | |
- 5 | |
"12": 0.18.1 | |
"13": linux-x86_64 | |
checkpoint: | |
value: | |
every_steps: 2500 | |
data: | |
value: | |
before_mask_input_length: 1137 | |
input_length: 1024 | |
mean_noise_span_length: 3 | |
mlm_probability: 0.15 | |
num_workers: 16 | |
target_length: 229 | |
device: | |
value: gpu | |
eval: | |
value: | |
corrected_steps: 500 | |
every_steps: 1000000000 | |
steps: 500 | |
eval_only: | |
value: false | |
logging: | |
value: | |
every_steps: 25 | |
grad_l2: true | |
use_wandb: true | |
wandb_config: | |
entity: pszemraj | |
mode: online | |
project: nanoT5 | |
tags: | |
- 24x24 | |
- "1024" | |
weights_l2: true | |
mode: | |
value: pt | |
model: | |
value: | |
checkpoint_path: "" | |
compile: true | |
klass: hf_t5 | |
name: pszemraj/tFINE-850m-24x24-512ctx | |
overwrite: | |
dropout_rate: 0 | |
random_init: false | |
n_all_param: | |
value: 853929472 | |
optim: | |
value: | |
base_lr: 0.01 | |
batch_size: 128 | |
epochs: -1 | |
final_cosine: 2e-05 | |
grad_acc: 8 | |
grad_clip: 1 | |
lr_scheduler: cosine | |
name: adamwscale | |
total_steps: 20000 | |
warmup_steps: 5000 | |
weight_decay: 0 | |
precision: | |
value: bf16 | |
predict_only: | |
value: false | |
seed: | |
value: 34534 | |
slurm_id: | |
value: none | |
tokenizer: | |
value: | |
name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5 | |
working_dir: | |
value: /workspace/nanoT5/outputs/2024-09-26/05-19-51 | |