"""
Configuration for Dynamic Token-Aware Transformer (DTAT) on enwik8
"""

class DTATConfig:
    """Hyperparameter container for the Dynamic Token-Aware Transformer.

    Every setting is stored as a plain instance attribute. Constructing the
    class with no arguments yields the default configuration; any default can
    be replaced at construction time by passing it as a keyword argument,
    e.g. ``DTATConfig(n_layer=6, device='cpu')``.

    Raises:
        TypeError: if a keyword argument does not name an existing setting
            (guards against silently ignored typos such as ``n_layers``).
    """

    def __init__(self, **overrides):
        # Model architecture
        self.n_layer = 12
        self.n_head = 8  # Reduced from 12
        self.n_embd = 512  # Reduced from 768
        self.dropout = 0.1  # Reduced for more stability
        self.bias = True

        # Sequence parameters
        self.block_size = 1024  # Reduced from 2048
        self.vocab_size = 256  # For character-level model

        # Training parameters
        self.learning_rate = 6e-4
        self.min_lr = 1e-5  # Lower minimum to allow fine-tuning
        self.warmup_iters = 367  # ~5% of max_iters (7334)
        # NOTE(review): this value was originally described as "exactly 4
        # epochs with batch_size=24", but batch_size below is 32 -- confirm
        # the intended epoch count before relying on it.
        self.max_iters = 7334
        self.weight_decay = 0.1  # Reduced for more stability
        self.beta1 = 0.9
        self.beta2 = 0.95
        self.grad_clip = 1.0

        # Learning rate schedule
        self.decay_lr = True
        # NOTE(review): lr_decay_iters (5000) is smaller than max_iters
        # (7334); how the remaining iterations are scheduled depends on the
        # training loop -- confirm this gap is intentional.
        self.lr_decay_iters = 5000

        # Early stopping
        self.patience = 15  # Increased patience since we're improving
        self.min_delta = 0.005  # Smaller improvements still count
        self.eval_interval = 250  # Keep frequent evaluation
        self.eval_iters = 200  # Keep same number of eval iterations

        # Logging
        self.log_interval = 10

        # Sparse attention parameters
        self.sparse_topk = 32  # Number of tokens to attend to
        self.importance_dropout = 0.1  # Reduced for more stability

        # Mixed precision training
        self.mixed_precision = True
        self.dtype = 'bfloat16'

        # Memory optimization
        self.gradient_checkpointing = True
        self.batch_size = 32  # Increased for more stable gradients

        # System
        self.device = 'cuda'
        # NOTE(review): `compile` and `compile_model` are two separate flags
        # that both default to True; both are kept because it is not visible
        # here which one the consumers read.
        self.compile = True

        # Performance optimization
        self.compile_model = True  # Enable torch.compile
        self.cudnn_benchmark = True  # Enable cuDNN benchmarking

        # Git config for model versioning (placeholder values)
        self.git_name = "Your Name"
        self.git_email = "your.email@example.com"

        # Apply caller-supplied overrides last. Only names that already exist
        # as settings are accepted, so a misspelt option fails loudly instead
        # of creating a new, unused attribute.
        unknown = sorted(set(overrides) - set(vars(self)))
        if unknown:
            raise TypeError(
                "DTATConfig got unexpected option(s): " + ", ".join(unknown)
            )
        for name, value in overrides.items():
            setattr(self, name, value)

    def __repr__(self):
        """Return ``DTATConfig(name=value, ...)`` listing every setting."""
        settings = ", ".join(
            f"{name}={value!r}" for name, value in vars(self).items()
        )
        return f"{type(self).__name__}({settings})"

    def get_config(self):
        """Return this configuration object itself (not a copy)."""
        return self

def get_config():
    """Build and return a new DTATConfig instance."""
    config = DTATConfig()
    return config