"""
Configuration for Dynamic Token-Aware Transformer (DTAT) on enwik8
"""


class DTATConfig:
    """Hyperparameters and runtime settings for a DTAT training run.

    Every option is a plain instance attribute set in ``__init__``.
    Calling ``DTATConfig()`` with no arguments yields the defaults below;
    any default can be replaced by passing it as a keyword argument, e.g.
    ``DTATConfig(batch_size=16, compile=False)``.

    Raises:
        TypeError: if a keyword argument does not name an existing option
            (guards against silently ignored typos).
    """

    def __init__(self, **overrides: object) -> None:
        # Model architecture
        self.n_layer = 12
        self.n_head = 8  # Reduced from 12
        self.n_embd = 512  # Reduced from 768
        self.dropout = 0.1  # Reduced for more stability
        self.bias = True

        # Sequence parameters
        self.block_size = 1024  # Reduced from 2048
        self.vocab_size = 256  # For character-level model

        # Training parameters
        self.learning_rate = 6e-4
        self.min_lr = 1e-5  # Lower minimum to allow fine-tuning
        self.warmup_iters = 367  # ~5% of max_iters (7334)
        # NOTE(review): this was described as "exactly 4 epochs with
        # batch_size=24", but batch_size below is 32 -- confirm the intended
        # epoch count before relying on it.
        self.max_iters = 7334
        self.weight_decay = 0.1  # Reduced for more stability
        self.beta1 = 0.9
        self.beta2 = 0.95
        self.grad_clip = 1.0

        # Learning rate schedule
        self.decay_lr = True
        # NOTE(review): smaller than max_iters, so the schedule's decay
        # horizon ends before training does -- confirm this is intended.
        self.lr_decay_iters = 5000  # Slower decay since we're improving

        # Early stopping
        self.patience = 15  # Increased patience since we're improving
        self.min_delta = 0.005  # Smaller improvements still count
        self.eval_interval = 250  # Keep frequent evaluation
        self.eval_iters = 200  # Keep same number of eval iterations

        # Logging
        self.log_interval = 10

        # Sparse attention parameters
        self.sparse_topk = 32  # Number of tokens to attend to
        self.importance_dropout = 0.1  # Reduced for more stability

        # Mixed precision training
        self.mixed_precision = True
        self.dtype = 'bfloat16'

        # Memory optimization
        self.gradient_checkpointing = True
        self.batch_size = 32  # Increased for more stable gradients

        # System
        self.device = 'cuda'
        # NOTE(review): `compile` and `compile_model` look like duplicate
        # switches; both are kept because callers may read either one.
        self.compile = True

        # Performance optimization
        self.compile_model = True  # Enable torch.compile
        self.cudnn_benchmark = True  # Enable cuDNN benchmarking

        # Git config for model versioning
        self.git_name = "Your Name"
        self.git_email = "your.email@example.com"

        # Apply caller-supplied overrides last, rejecting unknown names so a
        # misspelled option fails loudly instead of creating a dead attribute.
        unknown = sorted(name for name in overrides if name not in vars(self))
        if unknown:
            raise TypeError(
                f"Unknown DTATConfig option(s): {', '.join(unknown)}"
            )
        vars(self).update(overrides)

    def __repr__(self) -> str:
        """Return a ``DTATConfig(name=value, ...)`` listing of all options."""
        options = ", ".join(
            f"{name}={value!r}" for name, value in vars(self).items()
        )
        return f"{type(self).__name__}({options})"

    def get_config(self) -> "DTATConfig":
        """Return this configuration object itself (kept for compatibility)."""
        return self


def get_config(**overrides: object) -> DTATConfig:
    """Build a ``DTATConfig``, optionally overriding defaults by keyword.

    With no arguments this returns a fresh configuration holding the
    default values.

    Raises:
        TypeError: if a keyword does not name an existing option.
    """
    return DTATConfig(**overrides)