""" Configuration for Baseline Transformer on enwik8. Matches DTAT's training setup for fair comparison. """ class BaselineConfig: def __init__(self): # Model architecture (exactly matching DTAT) self.n_layer = 12 self.n_head = 8 # Same as DTAT self.n_embd = 512 # Same as DTAT self.dropout = 0.1 self.bias = True # Sequence parameters self.block_size = 1024 # Same as DTAT self.vocab_size = 256 # For character-level model # Training parameters (matched with DTAT) self.learning_rate = 6e-4 self.min_lr = 1e-5 # Lower minimum to allow fine-tuning self.warmup_iters = 367 # 5% of total iterations self.max_iters = 7334 # Exactly 4 epochs with batch_size=24 self.weight_decay = 0.1 # Same as DTAT self.beta1 = 0.9 self.beta2 = 0.95 self.grad_clip = 1.0 # Learning rate schedule self.decay_lr = True self.lr_decay_iters = 5000 # Same as DTAT # Early stopping self.patience = 15 # Same as DTAT self.min_delta = 0.005 # Same as DTAT self.eval_interval = 250 # Same as DTAT self.eval_iters = 200 # Same as DTAT # Logging self.log_interval = 10 # Mixed precision training self.mixed_precision = True self.dtype = 'bfloat16' # Memory optimization self.gradient_checkpointing = True self.batch_size = 24 # Same as DTAT # System self.device = 'cuda' self.compile = True # Performance optimization self.compile_model = True self.cudnn_benchmark = True # Git config for model versioning self.git_name = "Your Name" self.git_email = "your.email@example.com" def get_config(self): return self def get_config(): """Helper function to get config instance.""" return BaselineConfig()