Zai committed · Commit d2d5f50 · 1 Parent(s): c4b84ea

poertry inited

Files changed:
- pyproject.toml +14 -0
- setup.py +0 -21
- version.py +0 -1
- yume/config.py +2 -1
- yume/dataset.py +4 -5
- yume/models.py +100 -68
- yume/yume.py +2 -3
pyproject.toml
ADDED
@@ -0,0 +1,14 @@
+[tool.poetry]
+name = "yume"
+version = "0.1.0"
+description = "GPT from scratch trained with Japanese dataset"
+authors = ["Zai <130903099+zaibutcooler@users.noreply.github.com>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.11"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
setup.py
DELETED
@@ -1,21 +0,0 @@
-from setuptools import setup, find_packages
-
-with open("requirements.txt") as f:
-    requirements = f.read().splitlines()
-
-setup(
-    name="yume",
-    version="0.1",
-    packages=find_packages(),
-    install_requires=requirements,
-    author="Zai",
-    author_email="zaiyellyintaung@gmail.com",
-    description="LLM trained with Animanga dataset",
-    long_description="Inspired by Andrej Karpathy trained with japanese animanga dataset",
-    url="https://github.com/zaibutcooler/yume",
-    # classifiers=[
-    #     'Programming Language :: Python :: 3',
-    #     'License :: OSI Approved :: MIT License',
-    #     'Operating System :: OS Independent',
-    # ],
-)
version.py
DELETED
@@ -1 +0,0 @@
-__version__ = "20231117"
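With setup.py and version.py removed, the package version now lives only in pyproject.toml. A minimal sketch of exposing it at runtime, assuming the package is installed under the name "yume" (this helper is not part of the commit):

from importlib.metadata import PackageNotFoundError, version

try:
    __version__ = version("yume")  # resolved from installed package metadata
except PackageNotFoundError:
    __version__ = "0.1.0"  # fallback when running from an uninstalled source checkout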
yume/config.py
CHANGED
@@ -21,6 +21,7 @@ class Config:
         self.bias = bias
         self.lr = lr
 
+
 # Small Yume model (around 100M parameters)
 yume_small = Config(
     num_epoch=10,
@@ -58,4 +59,4 @@ yume_large = Config(
     dropout=0.1,
     bias=True,
     lr=0.001,
-)
+)
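Only two hunks of yume/config.py appear in this commit, so the constructor itself is not visible. Judging from the fields referenced across the diff (num_epoch, vocab_size, block_size, n_embd, n_head, n_layer, dropout, bias, lr, device), a hedged sketch of what Config presumably looks like; the real class may differ:

import torch

class Config:
    def __init__(self, num_epoch, vocab_size, block_size, n_embd,
                 n_head, n_layer, dropout, bias, lr):
        self.num_epoch = num_epoch    # training epochs (yume_small uses 10)
        self.vocab_size = vocab_size  # tokenizer vocabulary size
        self.block_size = block_size  # maximum context length
        self.n_embd = n_embd          # embedding width
        self.n_head = n_head          # attention heads per block
        self.n_layer = n_layer        # number of transformer blocks
        self.dropout = dropout
        self.bias = bias              # bias in Linear/LayerNorm layers
        self.lr = lr
        self.device = "cuda" if torch.cuda.is_available() else "cpu"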
yume/dataset.py
CHANGED
@@ -23,7 +23,6 @@ class Trainset(Dataset):
         loaded_dataset = load_dataset(url)
         self.texts = loaded_dataset["animanga"]["texts"]
         dummy_logger("Successfully loaded the dataset")
-
 
     def _tokenize(self, tiktoken=True):
         if tiktoken:
@@ -36,13 +35,13 @@ class Trainset(Dataset):
         self.tokenizer = Tokenizer()
         self.tokenizer.load_pretrained()
         self.tokenizer.encode(self.texts)
-
+
     def _prep_bin(self):
         pass
-
+
     def get_batch(self):
         pass
-
+
     # from loading to installing in one function
     def build_dataset(self):
-        pass
+        pass
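_prep_bin, get_batch, and build_dataset are still pass stubs after this commit. For orientation, a minimal nanoGPT-style get_batch sketch, assuming the tokenized corpus were held as a 1-D tensor of token ids (the names and storage here are hypothetical, not from the repo):

import torch

def get_batch(data: torch.Tensor, block_size: int, batch_size: int, device: str = "cpu"):
    # Sample batch_size random windows of length block_size; targets are the
    # same windows shifted one token to the right.
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + 1 + block_size] for i in ix])
    return x.to(device), y.to(device)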
yume/models.py
CHANGED
@@ -6,9 +6,10 @@ from .utils import encode, decode
 import math
 from huggingface_hub import PyTorchModelHubMixin
 
+
 # took from karpthy's
 class LayerNorm(nn.Module):
-    """
+    """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""
 
     def __init__(self, ndim, bias):
         super().__init__()
@@ -23,39 +24,49 @@
 class SelfAttention(nn.Module, PyTorchModelHubMixin):
     def __init__(self, config: Config) -> None:
         super().__init__()
-        self.attn = nn.Linear(config.n_embd,3*config.n_embd,bias=config.bias)
-        self.proj = nn.Linear(config.n_embd,config.n_embd,bias=config.bias)
+        self.attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+        self.proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
         self.attn_dropout = nn.Dropout(config.dropout)
         self.resid_dropout = nn.Dropout(config.dropout)
         self.config = config
 
-        self.flash = hasattr(torch.nn.functional,
+        self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
         if not self.flash:
             print("Using Slow Attention. Use PyTorch >= 2.0")
-
-        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
-                             .view(1, 1, config.block_size, config.block_size))
 
+        self.register_buffer(
+            "bias",
+            torch.tril(torch.ones(config.block_size, config.block_size)).view(
+                1, 1, config.block_size, config.block_size
+            ),
+        )
 
     def forward(self, x):
-        B,T,C = x.size()
-        q,k,v = self.attn(x).split(self.config.n_embd,dim=2)
+        B, T, C = x.size()
+        q, k, v = self.attn(x).split(self.config.n_embd, dim=2)
         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
-        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
-
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+
         if self.flash:
             # efficient attention using Flash Attention CUDA kernels
-            y = torch.nn.functional.scaled_dot_product_attention(
+            y = torch.nn.functional.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                attn_mask=None,
+                dropout_p=self.dropout if self.training else 0,
+                is_causal=True,
+            )
         else:
             # manual implementation of attention
             att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
-            att = att.masked_fill(self.bias[
+            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
             att = F.softmax(att, dim=-1)
             att = self.attn_dropout(att)
             y = att @ v
-
-        y = y.transpose(1, 2).contiguous().view(B, T, C)
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C)
 
         # output projection
         y = self.resid_dropout(self.c_proj(y))
@@ -65,11 +76,13 @@ class SelfAttention(nn.Module, PyTorchModelHubMixin):
 class MLP(nn.Module, PyTorchModelHubMixin):
     def __init__(self, config: Config) -> None:
         super().__init__()
-        self.fully_connected = nn.Linear(
+        self.fully_connected = nn.Linear(
+            config.n_embd, 4 * config.n_embd, bias=config.bias
+        )
         self.gelu = nn.GELU()
-        self.proj = nn.Linear(4*config.n_embd,config.n_embd,bias=config.bias)
+        self.proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
         self.dropout = nn.Dropout(config.dropout)
-
+
     def forward(self, x):
         x = self.fully_connected(x)
         x = self.gelu(x)
@@ -81,17 +94,17 @@ class MLP(nn.Module, PyTorchModelHubMixin):
 class Block(nn.Module, PyTorchModelHubMixin):
     def __init__(self, config: Config) -> None:
         super().__init__()
-        self.ln_1 = LayerNorm(config.n_embd,bias=config.bias)
+        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
         self.attn = SelfAttention(config)
-        self.ln_2 = LayerNorm(config.n_embd,bias=config.bias)
+        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
         self.mlp = MLP(config)
-
 
     def forward(self, x):
-        x = x+ self.attn(self.ln_1(x))
-        x = x+ self.mlp(self.ln_2(x))
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
         return x
 
+
 class GPT(nn.Module, PyTorchModelHubMixin):
     def __init__(self, config: Config):
         super().__init__()
@@ -99,17 +112,18 @@ class GPT(nn.Module, PyTorchModelHubMixin):
         assert config.block_size is not None
         self.config = config
         self.device = config.device
-
-        self.transformer= nn.ModuleDict(
-
-
-
-
-
-
-
-
-
+
+        self.transformer = nn.ModuleDict(
+            dict(
+                wte=nn.Embedding(config.vocab_size, config.n_embd),
+                wpe=nn.Embedding(config.block_size, config.n_embd),
+                drop=nn.Dropout(config.dropout),
+                blocks=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+                ln_f=LayerNorm(config.n_embd, config.bias),
+            )
+        )
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
     def get_num_params(self, non_embedding=True):
         """
         Return the number of parameters in the model.
@@ -121,28 +135,34 @@ class GPT(nn.Module, PyTorchModelHubMixin):
         if non_embedding:
             n_params -= self.transformer.wpe.weight.numel()
         return n_params
-
-    def forward(self, idx,targets=None):
-        b,t = x.size()
-        assert
-
+
+    def forward(self, idx, targets=None):
+        b, t = x.size()
+        assert (
+            t <= self.config.block_size
+        ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+        pos = torch.arange(0, t, dtype=torch.long, device=self.device)  # shape (t)
 
         tok_emb = self.transformer.wte(idx)
         pos_emb = self.transformer.wpe(idx)
-
-        x = self.transformer.drop(tok_emb+pos_emb)
-
+
+        x = self.transformer.drop(tok_emb + pos_emb)
+
         for block in self.transformer.blocks:
             x = block(x)
         x = self.transformer.ln_f(x)
-
+
         if targets is not None:
             # if we are given some desired targets also calculate the loss
             logits = self.lm_head(x)
-            loss = F.cross_entropy(
+            loss = F.cross_entropy(
+                logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1
+            )
         else:
             # inference-time mini-optimization: only forward the lm_head on the very last position
-            logits = self.lm_head(
+            logits = self.lm_head(
+                x[:, [-1], :]
+            )  # note: using list [-1] to preserve the time dim
             loss = None
 
         return logits, loss
@@ -153,10 +173,12 @@ class GPT(nn.Module, PyTorchModelHubMixin):
         # but want to use a smaller block size for some smaller, simpler model
         assert block_size <= self.config.block_size
         self.config.block_size = block_size
-        self.transformer.wpe.weight = nn.Parameter(
+        self.transformer.wpe.weight = nn.Parameter(
+            self.transformer.wpe.weight[:block_size]
+        )
         for block in self.transformer.h:
-            if hasattr(block.attn,
-                block.attn.bias = block.attn.bias[
+            if hasattr(block.attn, "bias"):
+                block.attn.bias = block.attn.bias[:, :, :block_size, :block_size]
 
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
@@ -168,17 +190,21 @@ class GPT(nn.Module, PyTorchModelHubMixin):
 
     def configure_optimizer(self):
        pass
-
+
     @torch.no_grad()
-    def generate(self,idx,max_token,temperature=1.0,top_k=None):
-
+    def generate(self, idx, max_token, temperature=1.0, top_k=None):
+
         for _ in range(max_token):
-            idx_cond =
-
-
+            idx_cond = (
+                idx
+                if idx.size(1) <= self.config.block_size
+                else idx[:, -self.config.block_size :]
+            )
+            logits, _ = self(idx_cond)
+
             if top_k is not None:
                 v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-                logits[logits < v[:, [-1]]] = -float(
+                logits[logits < v[:, [-1]]] = -float("Inf")
             # apply softmax to convert logits to (normalized) probabilities
             probs = F.softmax(logits, dim=-1)
             # sample from the distribution
@@ -198,34 +224,40 @@ class GPT(nn.Module, PyTorchModelHubMixin):
         decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
         nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
         optim_groups = [
-            {
-            {
+            {"params": decay_params, "weight_decay": weight_decay},
+            {"params": nodecay_params, "weight_decay": 0.0},
         ]
         num_decay_params = sum(p.numel() for p in decay_params)
         num_nodecay_params = sum(p.numel() for p in nodecay_params)
-        print(
-
+        print(
+            f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters"
+        )
+        print(
+            f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters"
+        )
         # Create AdamW optimizer and use the fused version if it is available
-        fused_available =
-        use_fused = fused_available and device_type ==
+        fused_available = "fused" in inspect.signature(torch.optim.AdamW).parameters
+        use_fused = fused_available and device_type == "cuda"
         extra_args = dict(fused=True) if use_fused else dict()
-        optimizer = torch.optim.AdamW(
+        optimizer = torch.optim.AdamW(
+            optim_groups, lr=learning_rate, betas=betas, **extra_args
+        )
         print(f"using fused AdamW: {use_fused}")
 
         return optimizer
 
     def estimate_mfu(self, fwdbwd_per_iter, dt):
-        """
+        """estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS"""
         # first estimate the number of flops we do per iteration.
         # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
         N = self.get_num_params()
         cfg = self.config
-        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
-        flops_per_token = 6*N + 12*L*H*Q*T
+        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd // cfg.n_head, cfg.block_size
+        flops_per_token = 6 * N + 12 * L * H * Q * T
         flops_per_fwdbwd = flops_per_token * T
         flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
         # express our flops throughput as ratio of A100 bfloat16 peak flops
-        flops_achieved = flops_per_iter * (1.0/dt)
-        flops_promised = 312e12
+        flops_achieved = flops_per_iter * (1.0 / dt)  # per second
+        flops_promised = 312e12  # A100 GPU bfloat16 peak flops is 312 TFLOPS
         mfu = flops_achieved / flops_promised
-        return mfu
+        return mfu
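The reformatted SelfAttention.forward keeps both nanoGPT paths: the fused scaled_dot_product_attention kernel when PyTorch >= 2.0 provides it, and a masked-softmax fallback otherwise. A standalone sketch of the fused path with hypothetical tensor sizes (not the repo's module):

import torch
import torch.nn.functional as F

B, T, n_head, head_dim = 2, 8, 4, 16  # hypothetical sizes
q = torch.randn(B, n_head, T, head_dim)
k = torch.randn(B, n_head, T, head_dim)
v = torch.randn(B, n_head, T, head_dim)

# Causal attention in one call; equivalent to masking the upper triangle,
# applying softmax, and multiplying by v as in the manual branch above.
y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
print(y.shape)  # torch.Size([2, 4, 8, 16])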
yume/yume.py
CHANGED
@@ -3,7 +3,7 @@ from torch import nn
 import torch.nn.functional as F
 from huggingface_hub import login
 
-from .config import Config,yume_small
+from .config import Config, yume_small
 from .models import GPT
 from .utils import dummy_logger, training_logger
 from .dataset import Trainset
@@ -23,13 +23,12 @@ class Yume:
     def sample(self):
         pass
 
-    def pretrain(self, dataset:Trainset):
+    def pretrain(self, dataset: Trainset):
         lr = self.config.lr
         dataset = Trainset()
         for epoch in range(self.config.num_epoch):
             # real trainset
             pass
-
 
     def fine_tune(self):
         pass
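For orientation, a hypothetical end-to-end call based only on what this diff shows (the Yume constructor is not visible, so passing a Config is an assumption, and pretrain is still a stub):

from yume.config import yume_small
from yume.dataset import Trainset
from yume.yume import Yume

yume = Yume(yume_small)  # assumed constructor signature
dataset = Trainset()     # pretrain() currently re-creates its own Trainset anyway
yume.pretrain(dataset)   # loops over config.num_epoch but performs no real training yet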