eyad-silx
/

llm

eyad-silx committed on Dec 31, 2024

Commit

73d9487

verified ·

1 Parent(s): b95e4a2

Update config/dtat_config.py

Files changed (1) hide show

config/dtat_config.py CHANGED Viewed

@@ -8,7 +8,7 @@ class DTATConfig:
         self.n_layer = 12
         self.n_head = 8  # Reduced from 12
         self.n_embd = 512  # Reduced from 768
-        self.dropout = 0.3  # Increased from 0.1
         self.bias = True
         # Sequence parameters
@@ -20,7 +20,7 @@ class DTATConfig:
         self.min_lr = 1e-5         # Lower minimum to allow fine-tuning
         self.warmup_iters = 367     # ~5% of max_iters (7,334 iterations)
         self.max_iters = 7334      # Exactly 4 epochs with batch_size=24
-        self.weight_decay = 0.2     # Increased from 0.1
         self.beta1 = 0.9
         self.beta2 = 0.95
         self.grad_clip = 1.0
@@ -40,7 +40,7 @@ class DTATConfig:
         # Sparse attention parameters
         self.sparse_topk = 32  # Number of tokens to attend to
-        self.importance_dropout = 0.2  # Added dropout for importance scores
         # Mixed precision training
         self.mixed_precision = True
@@ -48,12 +48,16 @@ class DTATConfig:
         # Memory optimization
         self.gradient_checkpointing = True
-        self.batch_size = 24        # Reduced for better GPU utilization
         # System
         self.device = 'cuda'
         self.compile = True
         # Git config for model versioning
         self.git_name = "Your Name"
         self.git_email = "your.email@example.com"

         self.n_layer = 12
         self.n_head = 8  # Reduced from 12
         self.n_embd = 512  # Reduced from 768
+        self.dropout = 0.1  # Reduced for more stability
         self.bias = True
         # Sequence parameters
         self.min_lr = 1e-5         # Lower minimum to allow fine-tuning
         self.warmup_iters = 367     # ~5% of max_iters (7,334 iterations)
         self.max_iters = 7334      # Was exactly 4 epochs at batch_size=24; batch_size is now 32 (TODO: re-derive if 4 epochs is still intended)
+        self.weight_decay = 0.1     # Reduced for more stability
         self.beta1 = 0.9
         self.beta2 = 0.95
         self.grad_clip = 1.0
         # Sparse attention parameters
         self.sparse_topk = 32  # Number of tokens to attend to
+        self.importance_dropout = 0.1  # Reduced for more stability
         # Mixed precision training
         self.mixed_precision = True
         # Memory optimization
         self.gradient_checkpointing = True
+        self.batch_size = 32        # Increased for more stable gradients
         # System
         self.device = 'cuda'
         self.compile = True
+        # Performance optimization
+        self.compile_model = True  # Enable torch.compile
+        self.cudnn_benchmark = True  # Enable cuDNN benchmarking
         # Git config for model versioning
         self.git_name = "Your Name"
         self.git_email = "your.email@example.com"