RashiAgarwal committed
Commit 4015e59
Parent: c1c0b27

Upload 6 files

config/eval_gpt2.py ADDED
@@ -0,0 +1,8 @@
+ # evaluate the base gpt2
+ # n_layer=12, n_head=12, n_embd=768
+ # 124M parameters
+ batch_size = 8
+ eval_iters = 500 # use more iterations to get good estimate
+ eval_only = True
+ wandb_log = False
+ init_from = 'gpt2'
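
These eval configs contain nothing but plain key = value overrides. As a rough illustration of how a train.py-style script can apply such a file over its built-in defaults, here is a minimal configurator-style sketch; the exact loading mechanism in this repo may differ, and the default values below are placeholders rather than the script's real defaults.

# A minimal configurator-style sketch (an assumption about how these override
# files are consumed; not necessarily the exact loader used by this repo).
import sys
from ast import literal_eval

# placeholder defaults a train/eval script might define before applying overrides
batch_size = 12
eval_iters = 200
eval_only = False
wandb_log = False
init_from = 'scratch'

for arg in sys.argv[1:]:
    if not arg.startswith('--'):
        # assume the argument is a config file, e.g. config/eval_gpt2.py,
        # and exec its plain key = value assignments over the defaults
        print(f"Overriding config with {arg}")
        exec(open(arg).read())
    else:
        # assume a --key=value command-line override
        key, val = arg[2:].split('=', 1)
        try:
            val = literal_eval(val)  # parse numbers/booleans; fall back to string
        except (SyntaxError, ValueError):
            pass
        globals()[key] = val

print(dict(batch_size=batch_size, eval_iters=eval_iters,
           eval_only=eval_only, init_from=init_from))

Invoked as, say, python sketch.py config/eval_gpt2.py --batch_size=4 (a hypothetical script name), the config file is exec'd first and the command-line flag then overrides it.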
config/eval_gpt2_large.py ADDED
@@ -0,0 +1,8 @@
+ # evaluate gpt2-large
+ # n_layer=36, n_head=20, n_embd=1280
+ # 774M parameters
+ batch_size = 8
+ eval_iters = 500 # use more iterations to get good estimate
+ eval_only = True
+ wandb_log = False
+ init_from = 'gpt2-large'
config/eval_gpt2_medium.py ADDED
@@ -0,0 +1,8 @@
+ # evaluate gpt2-medium
+ # n_layer=24, n_head=16, n_embd=1024
+ # 350M parameters
+ batch_size = 8
+ eval_iters = 500 # use more iterations to get good estimate
+ eval_only = True
+ wandb_log = False
+ init_from = 'gpt2-medium'
config/eval_gpt2_xl.py ADDED
@@ -0,0 +1,8 @@
+ # evaluate gpt2-xl
+ # n_layer=48, n_head=25, n_embd=1600
+ # 1558M parameters
+ batch_size = 8
+ eval_iters = 500 # use more iterations to get good estimate
+ eval_only = True
+ wandb_log = False
+ init_from = 'gpt2-xl'
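
The parameter counts quoted in the comments of these four eval configs (124M, 350M, 774M, 1558M) can be sanity-checked from n_layer and n_embd alone. A rough estimate, assuming GPT-2's 50257-token vocabulary and 1024-position context and ignoring biases and layernorm parameters, lands within a few percent of each quoted figure:

# Rough GPT-2 parameter estimate: each transformer block has about
# 12 * n_embd^2 weights (4*n_embd^2 in attention, 8*n_embd^2 in the MLP),
# plus token and position embeddings. Biases and layernorms are ignored.
VOCAB_SIZE = 50257   # GPT-2 BPE vocabulary size
BLOCK_SIZE = 1024    # GPT-2 context length

def approx_params(n_layer, n_embd):
    return 12 * n_layer * n_embd ** 2 + (VOCAB_SIZE + BLOCK_SIZE) * n_embd

for name, n_layer, n_embd in [
    ('gpt2',        12,  768),   # quoted as 124M
    ('gpt2-medium', 24, 1024),   # quoted as 350M
    ('gpt2-large',  36, 1280),   # quoted as 774M
    ('gpt2-xl',     48, 1600),   # quoted as 1558M
]:
    print(f"{name:13s} ~{approx_params(n_layer, n_embd) / 1e6:.0f}M parameters")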
config/finetune_shakespeare.py ADDED
@@ -0,0 +1,25 @@
+ import time
+
+ out_dir = 'out-shakespeare'
+ eval_interval = 5
+ eval_iters = 40
+ wandb_log = False # feel free to turn on
+ wandb_project = 'shakespeare'
+ wandb_run_name = 'ft-' + str(time.time())
+
+ dataset = 'shakespeare'
+ init_from = 'gpt2-xl' # this is the largest GPT-2 model
+
+ # only save checkpoints if the validation loss improves
+ always_save_checkpoint = False
+
+ # the number of examples per iter:
+ # 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
+ # shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
+ batch_size = 1
+ gradient_accumulation_steps = 32
+ max_iters = 20
+
+ # finetune at constant LR
+ learning_rate = 3e-5
+ decay_lr = False
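
The schedule arithmetic in the comments checks out: with a block size of 1024 (an assumed default, since this config does not override it), 20 iterations cover roughly two passes over the Shakespeare data.

# Sanity-check the tokens-per-iteration and epoch figures quoted above.
batch_size = 1
gradient_accumulation_steps = 32
block_size = 1024            # assumed default; this config does not override it
max_iters = 20
shakespeare_tokens = 301_966

tokens_per_iter = batch_size * gradient_accumulation_steps * block_size
print(tokens_per_iter)                                    # 32768 tokens/iter
print(shakespeare_tokens / tokens_per_iter)               # ~9.2 iters per epoch
print(max_iters * tokens_per_iter / shakespeare_tokens)   # ~2.2 epochs in 20 iters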
config/train_gpt2.py ADDED
@@ -0,0 +1,25 @@
+ # config for training GPT-2 (124M) down to a very nice loss of ~2.85 on 1 node of 8X A100 40GB
+ # launch as follows (e.g. in a screen session) and wait ~5 days:
+ # $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py
+
+ wandb_log = True
+ wandb_project = 'owt'
+ wandb_run_name = 'gpt2-124M'
+
+ # these make the total batch size ~0.5M tokens
+ # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
+ batch_size = 12
+ block_size = 1024
+ gradient_accumulation_steps = 5 * 8
+
+ # this makes the total number of tokens 300B
+ max_iters = 600000
+ lr_decay_iters = 600000
+
+ # eval stuff
+ eval_interval = 1000
+ eval_iters = 200
+ log_interval = 10
+
+ # weight decay
+ weight_decay = 1e-1
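
The batch-size and token-budget comments in this file are mutually consistent; a quick check of the arithmetic (the 8 GPUs being the single 8X A100 node mentioned at the top of the file):

# Sanity-check the effective batch size and total token budget quoted above.
batch_size = 12
block_size = 1024
grad_accum_per_gpu = 5
n_gpus = 8
max_iters = 600_000

tokens_per_iter = batch_size * block_size * grad_accum_per_gpu * n_gpus
print(tokens_per_iter)                    # 491520, i.e. ~0.5M tokens per iteration
print(tokens_per_iter * max_iters / 1e9)  # ~294.9, i.e. roughly 300B tokens total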