Zai committed on
Commit c4b84ea
1 Parent(s): d9526a7

added model

notebooks/mini_training.ipynb CHANGED
@@ -2,12 +2,31 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'yume'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01myume\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Yume\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'yume'"
+     ]
+    }
+   ],
    "source": [
-    "## What's the goal here"
+    "from yume import Yume"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -26,7 +45,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.7"
   }
  },
 "nbformat": 4,
training.py CHANGED
@@ -4,17 +4,15 @@ config = Config()
 
 dataset = Trainset()
 
-dataset._load_dataset()
-
-dataset._tokenize(tiktoken=True)
+dataset.build_dataset()
 
 yume = Yume(config)
 
-assert len(dataset.data) > 0
+# assert len(dataset.data) > 0
 
-yume.pretrain(dataset.data)
+# yume.pretrain(dataset)
 
-yume.sample()
+# yume.sample()
 
 # optional
 # yume.huggingface_login("your hf tokens")
yume/config.py CHANGED
@@ -22,7 +22,7 @@ class Config:
         self.lr = lr
 
 # Small Yume model (around 100M parameters)
-small_yume_config = Config(
+yume_small = Config(
     num_epoch=10,
     block_size=512,
     vocab_size=30522,
@@ -35,7 +35,7 @@ small_yume_config = Config(
 )
 
 # Medium Yume model (around 500M parameters)
-medium_yume_config = Config(
+yume_medium = Config(
    num_epoch=10,
    block_size=1024,
    vocab_size=30522,
@@ -48,7 +48,7 @@ medium_yume_config = Config(
 )
 
 # Large Yume model (around 1B parameters)
-large_yume_config = Config(
+yume_large = Config(
    num_epoch=10,
    block_size=2048,
    vocab_size=30522,
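The renamed presets only show part of the constructor call. For orientation, the fields read elsewhere in this commit (models.py and yume.py use n_embd, n_head, n_layer, dropout, bias, device, lr, num_epoch, block_size, vocab_size) imply a Config signature roughly like the sketch below. The names come from usage; the defaults are illustrative assumptions and the real yume/config.py may differ.

# Sketch of the Config constructor implied by usage elsewhere in this commit.
# Defaults are placeholders, not the project's actual values.
class Config:
    def __init__(self, num_epoch=10, block_size=512, vocab_size=30522,
                 n_embd=768, n_head=12, n_layer=12,
                 dropout=0.0, bias=True, lr=3e-4, device="cpu"):
        self.num_epoch = num_epoch
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_head = n_head
        self.n_layer = n_layer
        self.dropout = dropout
        self.bias = bias
        self.lr = lr
        self.device = device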
yume/dataset.py CHANGED
@@ -23,6 +23,7 @@ class Trainset(Dataset):
         loaded_dataset = load_dataset(url)
         self.texts = loaded_dataset["animanga"]["texts"]
         dummy_logger("Successfully loaded the dataset")
+
 
     def _tokenize(self, tiktoken=True):
         if tiktoken:
@@ -34,4 +35,14 @@ class Trainset(Dataset):
         else:
             self.tokenizer = Tokenizer()
             self.tokenizer.load_pretrained()
-            self.tokenizer.encode(self.texts)
+            self.tokenizer.encode(self.texts)
+
+    def _prep_bin(self):
+        pass
+
+    def get_batch(self):
+        pass
+
+    # from loading to tokenizing in one function
+    def build_dataset(self):
+        pass
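build_dataset, _prep_bin and get_batch land here as stubs. A minimal sketch of how build_dataset could chain the existing helpers, assuming _load_dataset takes no required arguments and that _prep_bin will eventually persist the token ids; this is an illustration, not the commit's implementation:

# Illustrative sketch only: the commit leaves build_dataset as `pass`.
# Assumes _load_dataset() needs no arguments and populates self.texts.
def build_dataset(self, tiktoken=True):
    self._load_dataset()                # load the raw texts
    self._tokenize(tiktoken=tiktoken)   # encode self.texts with tiktoken or the custom Tokenizer
    self._prep_bin()                    # still a stub: intended to prepare packed/binary token data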
yume/models.py CHANGED
@@ -3,48 +3,230 @@ from torch import nn
 import torch.nn.functional as F
 from .config import Config
 from .utils import encode, decode
+import inspect
+import math
 from huggingface_hub import PyTorchModelHubMixin
 
+
+# adapted from karpathy's nanoGPT
+class LayerNorm(nn.Module):
+    """LayerNorm, but with an optional bias. PyTorch doesn't support simply bias=False."""
+
+    def __init__(self, ndim, bias):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(ndim))
+        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
+
+    def forward(self, input):
+        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
+
 
 # TODO setup models
 class SelfAttention(nn.Module, PyTorchModelHubMixin):
     def __init__(self, config: Config) -> None:
         super().__init__()
+        self.attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+        self.proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+        self.attn_dropout = nn.Dropout(config.dropout)
+        self.resid_dropout = nn.Dropout(config.dropout)
+        self.config = config
+
+        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+        if not self.flash:
+            print("Using slow attention. Use PyTorch >= 2.0 for Flash Attention")
+
+        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
+                             .view(1, 1, config.block_size, config.block_size))
+
 
     def forward(self, x):
-        pass
+        B, T, C = x.size()  # batch size, sequence length, embedding dim (n_embd)
+        q, k, v = self.attn(x).split(self.config.n_embd, dim=2)
+        k = k.view(B, T, self.config.n_head, C // self.config.n_head).transpose(1, 2)
+        q = q.view(B, T, self.config.n_head, C // self.config.n_head).transpose(1, 2)
+        v = v.view(B, T, self.config.n_head, C // self.config.n_head).transpose(1, 2)
+
+        if self.flash:
+            # efficient attention using Flash Attention CUDA kernels
+            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.config.dropout if self.training else 0, is_causal=True)
+        else:
+            # manual implementation of attention
+            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+            att = F.softmax(att, dim=-1)
+            att = self.attn_dropout(att)
+            y = att @ v
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C)
+
+        # output projection
+        y = self.resid_dropout(self.proj(y))
+        return y
 
 
 class MLP(nn.Module, PyTorchModelHubMixin):
     def __init__(self, config: Config) -> None:
         super().__init__()
-
+        self.fully_connected = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+        self.gelu = nn.GELU()
+        self.proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+        self.dropout = nn.Dropout(config.dropout)
+
     def forward(self, x):
-        pass
+        x = self.fully_connected(x)
+        x = self.gelu(x)
+        x = self.proj(x)
+        x = self.dropout(x)
+        return x
 
 
 class Block(nn.Module, PyTorchModelHubMixin):
     def __init__(self, config: Config) -> None:
         super().__init__()
+        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+        self.attn = SelfAttention(config)
+        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+        self.mlp = MLP(config)
+
 
     def forward(self, x):
-        pass
-
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
 
 class GPT(nn.Module, PyTorchModelHubMixin):
     def __init__(self, config: Config):
         super().__init__()
-
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    def forward(self, x):
-        pass
-
-    def _init_weights(self):
-        pass
+        assert config.vocab_size is not None
+        assert config.block_size is not None
+        self.config = config
+        self.device = config.device
+
+        self.transformer = nn.ModuleDict(dict(
+            wte=nn.Embedding(config.vocab_size, config.n_embd),
+            wpe=nn.Embedding(config.block_size, config.n_embd),
+            drop=nn.Dropout(config.dropout),
+            blocks=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f=LayerNorm(config.n_embd, config.bias)
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+
+    def get_num_params(self, non_embedding=True):
+        """
+        Return the number of parameters in the model.
+        For non-embedding count (default), the position embeddings get subtracted.
+        The token embeddings would too, except due to the parameter sharing these
+        params are actually used as weights in the final layer, so we include them.
+        """
+        n_params = sum(p.numel() for p in self.parameters())
+        if non_embedding:
+            n_params -= self.transformer.wpe.weight.numel()
+        return n_params
+
+    def forward(self, idx, targets=None):
+        b, t = idx.size()
+        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+        pos = torch.arange(0, t, dtype=torch.long, device=self.device)  # shape (t)
+
+        tok_emb = self.transformer.wte(idx)  # token embeddings, shape (b, t, n_embd)
+        pos_emb = self.transformer.wpe(pos)  # position embeddings, shape (t, n_embd)
+
+        x = self.transformer.drop(tok_emb + pos_emb)
+
+        for block in self.transformer.blocks:
+            x = block(x)
+        x = self.transformer.ln_f(x)
+
+        if targets is not None:
+            # if we are given some desired targets also calculate the loss
+            logits = self.lm_head(x)
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+        else:
+            # inference-time mini-optimization: only forward the lm_head on the very last position
+            logits = self.lm_head(x[:, [-1], :])  # note: using list [-1] to preserve the time dim
+            loss = None
+
+        return logits, loss
+
+    def crop_block_size(self, block_size):
+        # model surgery to decrease the block size if necessary
+        # e.g. we may load the GPT-2 pretrained model checkpoint (block size 1024)
+        # but want to use a smaller block size for some smaller, simpler model
+        assert block_size <= self.config.block_size
+        self.config.block_size = block_size
+        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
+        for block in self.transformer.blocks:
+            if hasattr(block.attn, 'bias'):
+                block.attn.bias = block.attn.bias[:, :, :block_size, :block_size]
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
 
     def configure_optimizer(self):
         pass
-
-    def generate(self):
-        pass
+
+    @torch.no_grad()
+    def generate(self, idx, max_token, temperature=1.0, top_k=None):
+
+        for _ in range(max_token):
+            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
+            logits, _ = self(idx_cond)
+            logits = logits[:, -1, :] / temperature
+            if top_k is not None:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float('Inf')
+            # apply softmax to convert logits to (normalized) probabilities
+            probs = F.softmax(logits, dim=-1)
+            # sample from the distribution
+            idx_next = torch.multinomial(probs, num_samples=1)
+            # append sampled index to the running sequence and continue
+            idx = torch.cat((idx, idx_next), dim=1)
+
+        return idx
+
+    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
+        # start with all of the candidate parameters
+        param_dict = {pn: p for pn, p in self.named_parameters()}
+        # filter out those that do not require grad
+        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
+        # create optim groups. Any parameter that is 2D will be weight decayed, otherwise no.
+        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
+        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
+        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
+        optim_groups = [
+            {'params': decay_params, 'weight_decay': weight_decay},
+            {'params': nodecay_params, 'weight_decay': 0.0}
+        ]
+        num_decay_params = sum(p.numel() for p in decay_params)
+        num_nodecay_params = sum(p.numel() for p in nodecay_params)
+        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
+        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
+        # Create AdamW optimizer and use the fused version if it is available
+        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
+        use_fused = fused_available and device_type == 'cuda'
+        extra_args = dict(fused=True) if use_fused else dict()
+        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
+        print(f"using fused AdamW: {use_fused}")
+
+        return optimizer
+
+    def estimate_mfu(self, fwdbwd_per_iter, dt):
+        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
+        # first estimate the number of flops we do per iteration.
+        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
+        N = self.get_num_params()
+        cfg = self.config
+        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd // cfg.n_head, cfg.block_size
+        flops_per_token = 6 * N + 12 * L * H * Q * T
+        flops_per_fwdbwd = flops_per_token * T
+        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
+        # express our flops throughput as ratio of A100 bfloat16 peak flops
+        flops_achieved = flops_per_iter * (1.0 / dt)  # per second
+        flops_promised = 312e12  # A100 GPU bfloat16 peak flops is 312 TFLOPS
+        mfu = flops_achieved / flops_promised
+        return mfu
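A quick way to sanity-check the new modules is to build the model from one of the presets and run a short generation. This is a usage sketch, not part of the commit; it assumes yume_small fits in memory and that the tensors are created on the same device that Config carries (GPT.__init__ reads config.device).

# Smoke-test sketch for the new GPT module.
import torch
from yume.config import yume_small
from yume.models import GPT

model = GPT(yume_small)
print(f"{model.get_num_params() / 1e6:.1f}M parameters")

device = yume_small.device                                   # assumed Config field, read in GPT.__init__
model = model.to(device)
idx = torch.zeros((1, 1), dtype=torch.long, device=device)   # single dummy prompt token
logits, loss = model(idx)                                    # loss is None when no targets are given
out = model.generate(idx, max_token=8, top_k=50)
print(out.shape)                                             # torch.Size([1, 9]): prompt + 8 sampled tokens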
yume/yume.py CHANGED
@@ -3,9 +3,10 @@ from torch import nn
 import torch.nn.functional as F
 from huggingface_hub import login
 
-from .config import Config
+from .config import Config, yume_small
 from .models import GPT
 from .utils import dummy_logger, training_logger
+from .dataset import Trainset
 
 
 class Yume:
@@ -13,7 +14,7 @@ class Yume:
         assert config is not None
         super().__init__()
         self.gpt = GPT
-        self.model = GPT(config=config)
+        self.model = GPT(config=yume_small)
         self.config = config
 
     def generate(self):
@@ -22,11 +23,13 @@ class Yume:
     def sample(self):
         pass
 
-    def pretrain(self, tokens):
+    def pretrain(self, dataset: Trainset):
         lr = self.config.lr
-        num_epochs = self.config.num_epoch
-
-        pass
+        dataset = Trainset()
+        for epoch in range(self.config.num_epoch):
+            # TODO: real training loop over the dataset
+            pass
+
 
     def fine_tune(self):
         pass
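pretrain still has no body beyond the epoch loop. A minimal sketch of the training step it gestures at, reusing GPT.configure_optimizers from models.py; the weight_decay and betas values are assumptions, Trainset.get_batch() is assumed to eventually return (x, y) token-id tensors, and training_logger is assumed to accept a message string.

# Illustrative pretrain sketch; the commit's pretrain is still a stub.
def pretrain(self, dataset: Trainset):
    optimizer = self.model.configure_optimizers(
        weight_decay=0.1, learning_rate=self.config.lr,
        betas=(0.9, 0.95), device_type=self.config.device,
    )
    self.model.train()
    for epoch in range(self.config.num_epoch):
        x, y = dataset.get_batch()                 # hypothetical: get_batch is not implemented yet
        logits, loss = self.model(x, targets=y)    # GPT.forward returns (logits, loss)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        training_logger(f"epoch {epoch}: loss {loss.item():.4f}")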