_wandb: | |
value: | |
cli_version: 0.18.1 | |
m: [] | |
python_version: 3.10.12 | |
t: | |
"1": | |
- 1 | |
- 55 | |
"2": | |
- 1 | |
- 55 | |
"3": | |
- 2 | |
- 13 | |
- 16 | |
- 23 | |
- 55 | |
- 61 | |
"4": 3.10.12 | |
"5": 0.18.1 | |
"8": | |
- 5 | |
"12": 0.18.1 | |
"13": linux-x86_64 | |
always_save_checkpoint: | |
value: true | |
attention_types: | |
value: | |
- default | |
backend: | |
value: nccl | |
batch_size: | |
value: 120 | |
beta1: | |
value: 0.9 | |
beta2: | |
value: 0.95 | |
bias: | |
value: false | |
block_size: | |
value: 512 | |
checkpoint_path: | |
value: "" | |
collect_activations: | |
value: false | |
collect_attention_patterns: | |
value: false | |
compile: | |
value: true | |
dataset: | |
value: fineweb | |
decay_lr: | |
value: true | |
device: | |
value: cuda | |
dropout: | |
value: 0 | |
dtype: | |
value: bfloat16 | |
embedding_types: | |
value: | |
- polynomial_legendre | |
- polynomial_chebyshev | |
- random_fourier | |
- wavelet | |
eval_datasets: | |
value: | |
- wikitext-103-v1 | |
- ptb | |
- lambada | |
eval_interval: | |
value: 100 | |
eval_iters: | |
value: 100 | |
eval_only: | |
value: false | |
grad_clip: | |
value: 1 | |
gradient_accumulation_steps: | |
value: 40 | |
init_from: | |
value: scratch | |
learning_rate: | |
value: 0.0006 | |
log_interval: | |
value: 1 | |
lr_decay_iters: | |
value: 10000 | |
max_iters: | |
value: 10000 | |
min_lr: | |
value: 6e-05 | |
n_embd: | |
value: 256 | |
n_head: | |
value: 4 | |
n_layer: | |
value: 4 | |
out_dir: | |
value: out | |
seed: | |
value: 1337 | |
wandb_log: | |
value: true | |
wandb_project: | |
value: gpt2_positional_encodings_100B | |
wandb_run_name: | |
value: experiment | |
warmup_iters: | |
value: 100 | |
weight_decay: | |
value: 0.1 | |