Create qa1.0.0
qa1.0.0
ADDED
@@ -0,0 +1,232 @@
"""
Quantumaurora: Advanced Transformer-based Language Model
Version: 1.0.0
Created: 2025
"""

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
import math
from typing import Optional, Dict, List, Tuple
from torch.cuda.amp import autocast, GradScaler
from torch.nn.parallel import DistributedDataParallel
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.checkpoint import checkpoint
import json
import os
from datetime import datetime

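# NOTE (added sketch): the tokenizers imports above (Tokenizer, models, trainers,
# pre_tokenizers, decoders) are never used in the original file. The helper below is
# an assumed, minimal example of how a byte-level BPE tokenizer could be trained and
# later attached to the model; the function name and special tokens are illustrative,
# not part of the original code.
def build_bpe_tokenizer(files: List[str], vocab_size: int = 50000) -> Tokenizer:
    """Train a byte-level BPE tokenizer on a list of text files (illustrative sketch)."""
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]", "[MASK]"],
    )
    tokenizer.train(files, trainer)
    return tokenizer
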
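# NOTE (added sketch): PositionalEncoding is referenced by Quantumaurora but never
# defined in this file. The standard sinusoidal encoding below is an assumed
# stand-in so the module is self-contained; the original implementation may differ
# (e.g. learned or rotary embeddings). max_len should match config.max_sequence_length.
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding added to token embeddings."""

    def __init__(self, d_model: int, max_len: int = 2048):
        super().__init__()
        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))  # shape: (1, max_len, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, d_model)
        return x + self.pe[:, :x.size(1)]
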
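# NOTE (added sketch): TransformerBlock is referenced but not defined in this file.
# The pre-norm block below is an assumed reconstruction using standard multi-head
# self-attention; the sparse/local attention patterns advertised in the model
# docstring are NOT reproduced here -- attention_type is stored, but only full
# attention is implemented in this sketch.
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: self-attention followed by a feed-forward network."""

    def __init__(self, d_model: int, num_heads: int, d_ff: int,
                 dropout: float = 0.1, attention_type: str = "full"):
        super().__init__()
        self.attention_type = attention_type
        self.attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Self-attention sub-layer (mask, if given, is a standard attention mask).
        h = self.norm1(x)
        attn_out, _ = self.attn(h, h, h, attn_mask=mask, need_weights=False)
        x = x + self.dropout(attn_out)
        # Feed-forward sub-layer.
        x = x + self.dropout(self.ff(self.norm2(x)))
        return x
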
class QuantumauroraConfig:
    """Configuration class for Quantumaurora model"""
    def __init__(self,
                 vocab_size: int = 50000,
                 d_model: int = 512,
                 num_heads: int = 8,
                 num_layers: int = 6,
                 d_ff: int = 2048,
                 dropout: float = 0.1,
                 attention_type: str = "full",
                 use_checkpointing: bool = True,
                 max_sequence_length: int = 2048,
                 model_version: str = "1.0.0"):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.d_ff = d_ff
        self.dropout = dropout
        self.attention_type = attention_type
        self.use_checkpointing = use_checkpointing
        self.max_sequence_length = max_sequence_length
        self.model_version = model_version
        self.model_type = "quantumaurora"

    def save(self, path: str):
        """Save configuration to JSON file"""
        # Copy the attribute dict so the timestamp is not written back onto the instance.
        config_dict = dict(self.__dict__)
        config_dict['timestamp'] = datetime.now().isoformat()

        with open(path, 'w') as f:
            json.dump(config_dict, f, indent=2)

    @classmethod
    def load(cls, path: str) -> 'QuantumauroraConfig':
        """Load configuration from JSON file"""
        with open(path, 'r') as f:
            config_dict = json.load(f)

        # Drop keys that are not constructor arguments (timestamp, model_type),
        # otherwise cls(**config_dict) raises a TypeError.
        config_dict.pop('timestamp', None)
        config_dict.pop('model_type', None)

        return cls(**config_dict)

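# NOTE (added sketch): PreTrainingObjectives is referenced but not defined in this
# file. The head below is an assumed minimal version exposing causal-LM and masked-LM
# logits (matching the "multiple pre-training objectives" claim in the model
# docstring); the dict keys and exact heads are illustrative, not the original design.
class PreTrainingObjectives(nn.Module):
    """Output heads for the pre-training objectives; returns a dict of tensors."""

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size)   # causal language modelling
        self.mlm_head = nn.Linear(d_model, vocab_size)  # masked language modelling

    def forward(self, hidden_states: torch.Tensor) -> Dict[str, torch.Tensor]:
        hidden_states = self.norm(hidden_states)
        return {
            'hidden_states': hidden_states,
            'lm_logits': self.lm_head(hidden_states),
            'mlm_logits': self.mlm_head(hidden_states),
        }
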
class Quantumaurora(nn.Module):
    """
    Quantumaurora: Advanced Transformer-based Language Model

    A state-of-the-art language model featuring:
    - Multi-head attention with sparse/local patterns
    - Multiple pre-training objectives
    - Gradient checkpointing
    - Mixed precision training
    - Distributed training support
    """

    def __init__(self, config: QuantumauroraConfig):
        super().__init__()
        self.config = config

        # Model components
        self.token_embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.positional_encoding = PositionalEncoding(config.d_model)

        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(
                config.d_model,
                config.num_heads,
                config.d_ff,
                config.dropout,
                config.attention_type
            ) for _ in range(config.num_layers)
        ])

        self.pretraining_objectives = PreTrainingObjectives(
            config.d_model,
            config.vocab_size
        )

        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        x = self.token_embedding(x)
        x = self.positional_encoding(x)
        x = self.dropout(x)

        for transformer_block in self.transformer_blocks:
            if self.config.use_checkpointing and self.training:
                # Gradient checkpointing trades extra compute for lower activation memory.
                x = checkpoint(transformer_block, x, mask)
            else:
                x = transformer_block(x, mask)

        return self.pretraining_objectives(x)

    def save_pretrained(self, path: str):
        """Save model and configuration"""
        os.makedirs(path, exist_ok=True)

        # Save configuration
        config_path = os.path.join(path, 'config.json')
        self.config.save(config_path)

        # Save model weights
        model_path = os.path.join(path, 'model.pt')
        torch.save(self.state_dict(), model_path)

        # Save tokenizer if available (handles both a raw tokenizers.Tokenizer
        # and a wrapped PreTrainedTokenizerFast)
        if hasattr(self, 'tokenizer'):
            tokenizer_path = os.path.join(path, 'tokenizer.json')
            if isinstance(self.tokenizer, PreTrainedTokenizerFast):
                self.tokenizer.backend_tokenizer.save(tokenizer_path)
            else:
                self.tokenizer.save(tokenizer_path)

    @classmethod
    def from_pretrained(cls, path: str) -> 'Quantumaurora':
        """Load pretrained model and configuration"""
        config = QuantumauroraConfig.load(os.path.join(path, 'config.json'))
        model = cls(config)

        model_path = os.path.join(path, 'model.pt')
        model.load_state_dict(torch.load(model_path, map_location='cpu'))

        # Load tokenizer if available (PreTrainedTokenizerFast has no from_file method;
        # it is constructed from a saved tokenizer.json via tokenizer_file=...)
        tokenizer_path = os.path.join(path, 'tokenizer.json')
        if os.path.exists(tokenizer_path):
            model.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)

        return model

class QuantumauroraTrainer:
    """Training manager for Quantumaurora model"""

    def __init__(self,
                 model: Quantumaurora,
                 train_dataloader: DataLoader,
                 optimizer: torch.optim.Optimizer,
                 device: str = "cuda",
                 use_mixed_precision: bool = True,
                 distributed: bool = True):
        self.model = model.to(device)
        self.train_dataloader = train_dataloader
        self.optimizer = optimizer
        self.device = device
        self.use_mixed_precision = use_mixed_precision
        self.distributed = distributed

        if use_mixed_precision:
            self.scaler = GradScaler()

        if distributed:
            # Assumes the caller has already initialised the default process group
            # (e.g. via dist.init_process_group) before the model is wrapped.
            self.model = DistributedDataParallel(self.model)

    def train(self, num_epochs: int, save_dir: str = None):
        """Main training loop"""
        best_loss = float('inf')

        for epoch in range(num_epochs):
            losses = self.train_epoch(epoch)

            # Save checkpoint if this is the best model
            if save_dir and losses['total'] < best_loss:
                best_loss = losses['total']
                # Unwrap DDP before saving so save_pretrained is available.
                model_to_save = self.model.module if isinstance(self.model, DistributedDataParallel) else self.model
                model_to_save.save_pretrained(os.path.join(save_dir, f'checkpoint-{epoch}'))

            print(f"Epoch {epoch+1}/{num_epochs}")
            for loss_name, loss_value in losses.items():
                print(f"{loss_name}: {loss_value:.4f}")
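
    # NOTE (added sketch): train_epoch is called by train() but is not defined in the
    # original file. The version below is a minimal assumed implementation: it expects
    # each batch to be a dict with an 'input_ids' tensor and optimises a shifted
    # causal-LM loss on the 'lm_logits' output; the batch format and loss mix of the
    # original code are unknown.
    def train_epoch(self, epoch: int) -> Dict[str, float]:
        """Run one epoch and return average losses (keys must include 'total')."""
        self.model.train()
        total_loss, num_batches = 0.0, 0

        for batch in self.train_dataloader:
            input_ids = batch['input_ids'].to(self.device)
            self.optimizer.zero_grad()

            with autocast(enabled=self.use_mixed_precision):
                outputs = self.model(input_ids)
                # Shift logits/targets by one position for next-token prediction.
                logits = outputs['lm_logits'][:, :-1].contiguous()
                targets = input_ids[:, 1:].contiguous()
                loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

            if self.use_mixed_precision:
                self.scaler.scale(loss).backward()
                self.scaler.step(self.optimizer)
                self.scaler.update()
            else:
                loss.backward()
                self.optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        return {'total': total_loss / max(num_batches, 1)}
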
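# NOTE (added sketch): train_distributed is passed to mp.spawn in main() but never
# defined in the original file. The worker below is an assumed minimal version: it
# initialises the default process group, builds a DataLoader with a DistributedSampler,
# and delegates to QuantumauroraTrainer. The environment variables, 'nccl' backend,
# port and batch size are illustrative choices, not the original configuration.
def train_distributed(rank: int, world_size: int, model: Quantumaurora, dataset: Dataset):
    """Per-process entry point for multi-GPU training (rank is injected by mp.spawn)."""
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('nccl', rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    # Each process sees a disjoint shard of the dataset.
    sampler = torch.utils.data.DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    dataloader = DataLoader(dataset, batch_size=8, sampler=sampler)

    trainer = QuantumauroraTrainer(
        model=model,
        train_dataloader=dataloader,
        optimizer=torch.optim.Adam(model.parameters()),
        device=f'cuda:{rank}',
        use_mixed_precision=True,
        distributed=True
    )
    trainer.train(num_epochs=10, save_dir='quantumaurora_checkpoints')

    dist.destroy_process_group()
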
def main():
    """Example usage of Quantumaurora"""

    # Initialize configuration
    config = QuantumauroraConfig(
        vocab_size=50000,
        d_model=768,
        num_heads=12,
        num_layers=12,
        attention_type="sparse"
    )

    # Initialize model
    model = Quantumaurora(config)

    # The original file never builds the training data: a Dataset yielding dicts
    # with an 'input_ids' tensor is assumed to be supplied here.
    dataset = ...  # placeholder for the user's tokenised corpus (torch Dataset)
    train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

    # Multi-GPU training if available
    world_size = torch.cuda.device_count()
    if world_size > 1:
        mp.spawn(
            train_distributed,
            args=(world_size, model, dataset),
            nprocs=world_size,
            join=True
        )
    else:
        # Single-device training (GPU if available, otherwise CPU)
        trainer = QuantumauroraTrainer(
            model=model,
            train_dataloader=train_dataloader,
            optimizer=torch.optim.Adam(model.parameters()),
            device="cuda" if torch.cuda.is_available() else "cpu",
            use_mixed_precision=True,
            distributed=False
        )

        trainer.train(
            num_epochs=10,
            save_dir='quantumaurora_checkpoints'
        )

if __name__ == "__main__":
    main()