""" Configuration for character-level language model on enwik8 Targeting ~44M parameters for comparison with baseline models """ # Model configuration config = { # Dataset params 'dataset': 'enwik8', 'vocab_size': 256, # Character-level, so 256 possible byte values 'block_size': 1024, # Context length # Model params (tuned for ~44M parameters) 'n_layer': 12, 'n_head': 8, 'n_embd': 512, 'dropout': 0.1, 'bias': False, # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster # Training params 'learning_rate': 6e-4, 'max_iters': 100000, 'weight_decay': 1e-1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, # Learning rate decay settings 'decay_lr': True, 'warmup_iters': 2000, 'lr_decay_iters': 100000, 'min_lr': 6e-5, # Evaluation and logging 'eval_interval': 500, 'log_interval': 100, 'eval_iters': 200, # System 'device': 'cuda', # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc. 'dtype': 'bfloat16', # 'float32', 'bfloat16', or 'float16' 'compile': True, # use PyTorch 2.0 to compile the model to be faster }