Spaces (status: Sleeping)

Commit 213d16f by ravi.naik · Parent: 73580b8

Added training, inference and gradio UI code
Browse files:
- README.md +55 -2
- app.py +74 -0
- checkpoints/model.pth +3 -0
- data/input.txt +0 -0
- experiments/bigram.py +107 -0
- experiments/bigram_v2.py +200 -0
- experiments/exp.ipynb +468 -0
- gpt.ipynb +211 -0
- src/__pycache__/inference.cpython-310.pyc +0 -0
- src/__pycache__/model.cpython-310.pyc +0 -0
- src/__pycache__/training.cpython-310.pyc +0 -0
- src/__pycache__/utils.cpython-310.pyc +0 -0
- src/inference.py +9 -0
- src/model.py +120 -0
- src/training.py +53 -0
- src/utils.py +32 -0
README.md CHANGED

@@ -1,5 +1,5 @@
 ---
-title: ERA SESSION21
+title: "ERA SESSION21: GPT from scratch"
 emoji: 🌍
 colorFrom: indigo
 colorTo: blue
@@ -10,4 +10,57 @@ pinned: false
 license: mit
 ---
 
-
+### Results
+**Bigram Base model training and results**
+
+![image](https://github.com/RaviNaik/ERA-SESSION21/assets/23289802/4cc02d93-98fc-4114-a4c9-8a3c249eaad3)
+
+**GPT Model training results**
+
+![image](https://github.com/RaviNaik/ERA-SESSION21/assets/23289802/95dcde00-bf20-4853-ad20-fa67c1046f6b)
+
+#### Generation Output
+```python
+# Load the trained checkpoint and sample 1000 tokens from the prompt "hello".
+model = torch.load("checkpoints/model.pth", map_location=device)
+results = generate("hello", model, block_size, 1000, device)
+print(results)
+```
+```
+hellows thence grown from thee.
+Since thou hast raim, thou thast well were quarterned; and
+ever man tree can saw for words word from her at hour
+Whiles contrations or devoided from ere years;
+Yea, foul vice, indelice on the bird of the
+noble of Hermione.
+
+PARIS:
+Sir, adies, sir, hate no choping but to your good.
+
+HENRY BOLINGBROKE:
+Yes, to ask you might, foreweed.
+
+WARCK:
+'Tis he made moust true.
+
+RORSET:
+It is an hour fastal that cracknaf at the chase
+Upon; you are your hearing news a daughter.
+
+KING EDWARD IV:
+Tut, Lord Warwick, thou shouldst aft Rutlansps?
+Thou tust but back hild, he countemn'd my lady's seal,
+For access dead the treature moon! and the Englisting!
+Thy vage for yonder see thou be donen?
+O, count thou dost not Romeo, thou pratheeo sir,
+That sweet thou feigh with no past blood on
+Be see, here through on that find bears, if an
+pretterinctors three and aspect die meeds thou,
+Behing mine of thy denigning state lain business?
+
+SAMPSA:
+Sir, ha! but thou refused? thyself food, gr
+```
+### Gradio Interface
+![image](https://github.com/RaviNaik/ERA-SESSION21/assets/23289802/f339ec6b-17b3-4de6-bbef-14eb2b3fac84)
app.py ADDED
@@ -0,0 +1,74 @@

```python
import gradio as gr
import torch

from src.model import GPTModel
from src.inference import generate as generate_text
from src.utils import vocab_size

# Hyperparameters matching the trained checkpoint (see experiments/bigram_v2.py).
batch_size = 64
block_size = 256
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = "cuda:1" if torch.cuda.is_available() else "cpu"
eval_iters = 200
n_embeds = 384
n_heads = 6
n_layers = 6
dropout = 0.2


def load_model():
    # map_location=device remaps every saved tensor to the active device,
    # so the checkpoint also loads on CPU-only hosts.
    model = torch.load("checkpoints/model.pth", map_location=device)
    return model


model = load_model()


def generate(prompt, max_new_tokens):
    prompt = prompt.strip()
    out = generate_text(prompt, model, block_size, max_new_tokens, device)
    return {gpt_output: out}


with gr.Blocks() as app:
    gr.Markdown("## ERA Session21 - GPT from scratch")
    gr.Markdown(
        """This is an implementation of GPT [Let's build GPT: from scratch, in code, spelled out.](https://www.youtube.com/watch?v=kCc8FmEb1nY&t=2s) by Andrej Karpathy.

Please find the source code and training details [here](https://github.com/RaviNaik/ERA-SESSION21).

Dataset used to train: [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt).
"""
    )
    with gr.Row():
        with gr.Column():
            prompt_box = gr.Textbox(label="Initial Prompt", interactive=True)
            max_new_tokens = gr.Slider(
                minimum=10,
                maximum=2500,
                value=100,
                step=10,
                label="Select Number of Tokens to be Generated",
                interactive=True,
            )
            submit_btn = gr.Button(value="Generate")

        with gr.Column():
            gpt_output = gr.TextArea(
                label="Text Generated by GPT",
                show_label=True,
                max_lines=100,
                interactive=False,
            )

    # Wire the button to the inference function: prompt and slider in, text out.
    submit_btn.click(
        generate,
        inputs=[prompt_box, max_new_tokens],
        outputs=[gpt_output],
    )

app.launch()
```
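To exercise the same inference path outside the Gradio UI, here is a minimal sketch, assuming the LFS checkpoint has been fetched; `"ROMEO:"` is just an arbitrary example prompt, and `model.device` is reset because the pickled `GPTModel` keeps the device string it was trained with:

```python
import torch
from src.inference import generate as generate_text

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.load("checkpoints/model.pth", map_location=device)
model.device = device  # GPTModel.forward builds position ids on self.device
model.eval()           # disable dropout for sampling
print(generate_text("ROMEO:", model, block_size=256, max_new_tokens=200, device=device))
```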
checkpoints/model.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a8b930ee87e1eecc6a03bc49983a81fd11aaa95f4cd5e1d64091d6107827811b
size 52698997
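Only this Git LFS pointer lives in the repository itself; after cloning, the actual weights (about 52.7 MB per the `size` field) are fetched with the standard LFS workflow: `git lfs install` once, then `git lfs pull` inside the repo.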
data/input.txt ADDED
The diff for this file is too large to render. See raw diff.
experiments/bigram.py ADDED
@@ -0,0 +1,107 @@

```python
import torch
from torch import nn
import torch.nn.functional as F

# Hyperparameters
batch_size = 32
block_size = 8
max_iters = 3000
eval_interval = 300
learning_rate = 1e-2
device = "cuda:1" if torch.cuda.is_available() else "cpu"
eval_iters = 200

torch.manual_seed(1123)

with open("input.txt") as f:
    text = f.read()

# Character-level vocabulary built from the corpus.
chars = sorted(list(set(text)))
vocab_size = len(chars)

stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

# 90/10 train/validation split of the encoded corpus.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]


def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x, y


@torch.no_grad()
def estimate_loss(model: nn.Module):
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X, Y = X.to(device), Y.to(device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # Each token directly reads off the logits for the next token.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)  # (B,T,C)
        loss = None
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)  # (B,T,C)
            logits = logits[:, -1, :]  # (B,C): keep only the last time step
            probs = F.softmax(logits, dim=-1)  # (B,C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B,T+1)

        return idx


model = BigramLanguageModel(vocab_size)

model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_interval == 0:
        losses = estimate_loss(model)
        print(
            f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
        )

    xb, yb = get_batch("train")
    xb, yb = xb.to(device), yb.to(device)

    logits, loss = model(xb, yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


context = torch.zeros((1, 1), dtype=torch.long, device=device)
results = decode(model.generate(context, max_new_tokens=100)[0].tolist())
print(results)
```
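A sanity check on the loss values such a script prints: with 65 characters, an untrained model that assigns uniform probability should score a cross-entropy of ln(65), about 4.17, so the first measured loss should sit near that (the notebook below reports 4.5262 at initialization). A minimal sketch:

```python
import math

vocab_size = 65  # size of the tinyshakespeare character set
baseline = -math.log(1.0 / vocab_size)  # cross-entropy of a uniform guess
print(f"{baseline:.4f}")  # 4.1744
```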
experiments/bigram_v2.py ADDED
@@ -0,0 +1,200 @@

```python
import torch
from torch import nn
import torch.nn.functional as F

# Hyperparameters
batch_size = 64
block_size = 256
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = "cuda:1" if torch.cuda.is_available() else "cpu"
eval_iters = 200
n_embeds = 384
n_heads = 6
n_layers = 6
dropout = 0.2

torch.manual_seed(1123)

with open("input.txt") as f:
    text = f.read()

chars = sorted(list(set(text)))
vocab_size = len(chars)

stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    return [stoi[c] for c in s]


def decode(l):
    return "".join([itos[i] for i in l])


data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]


def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x, y


@torch.no_grad()
def estimate_loss(model: nn.Module):
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X, Y = X.to(device), Y.to(device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


class Head(nn.Module):
    def __init__(self, n_embed, head_size) -> None:
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular mask: position t only attends to positions <= t.
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        # (B,T,hs) @ (B,hs,T) --> (B,T,T); note: scaled by C**-0.5 (C = n_embeds)
        # rather than the usual head_size**-0.5.
        wei = q @ k.transpose(-2, -1) * (C**-0.5)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out


class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, n_embeds, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(n_embeds, head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_embeds, n_embeds)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Run the heads in parallel and concatenate along the channel dim.
        x = torch.cat([h(x) for h in self.heads], dim=-1)
        x = self.proj(x)
        x = self.dropout(x)
        return x


class FeedForward(nn.Module):
    def __init__(self, n_embeds):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embeds, 4 * n_embeds),
            nn.ReLU(),
            nn.Linear(4 * n_embeds, n_embeds),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    def __init__(self, n_embeds, n_heads):
        super().__init__()
        head_size = n_embeds // n_heads
        self.sa_heads = MultiHeadAttention(n_heads, n_embeds, head_size)
        self.ffwd = FeedForward(n_embeds)
        self.ln1 = nn.LayerNorm(n_embeds)
        self.ln2 = nn.LayerNorm(n_embeds)

    def forward(self, x):
        # Pre-norm residual connections around attention and the MLP.
        x = x + self.sa_heads(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embeds, block_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embeds)
        self.position_embedding_table = nn.Embedding(block_size, n_embeds)
        self.blocks = nn.Sequential(
            *[Block(n_embeds, n_heads) for _ in range(n_layers)]
        )
        self.lnf = nn.LayerNorm(n_embeds)
        self.lm_head = nn.Linear(n_embeds, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_embeds = self.token_embedding_table(idx)  # (B,T,n_embeds)
        pos_embeds = self.position_embedding_table(
            torch.arange(T, device=device)
        )  # (T,n_embeds)

        x = tok_embeds + pos_embeds  # (B,T,n_embeds)
        x = self.blocks(x)
        x = self.lnf(x)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        loss = None
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # Crop the context to the last block_size tokens.
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)  # (B,T,C)
            logits = logits[:, -1, :]  # (B,C)
            probs = F.softmax(logits, dim=-1)  # (B,C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B,T+1)

        return idx


model = BigramLanguageModel(vocab_size, n_embeds, block_size)

model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_interval == 0:
        losses = estimate_loss(model)
        print(
            f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
        )

    xb, yb = get_batch("train")
    xb, yb = xb.to(device), yb.to(device)

    logits, loss = model(xb, yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


context = torch.zeros((1, 1), dtype=torch.long, device=device)
results = decode(model.generate(context, max_new_tokens=100)[0].tolist())
print(results)
```
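The `Head` class here (and in `src/model.py` below) is standard causal scaled dot-product attention; in the notation of Vaswani et al.:

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}} + M\right)V$$

where $M$ is the causal mask ($0$ where a query may look, $-\infty$ above the diagonal), so position $t$ attends only to positions $\le t$. One deviation worth noting: this script scales by `C**-0.5` with `C = n_embeds`, whereas $d_k$ in the paper (and in the single-head demo in `experiments/exp.ipynb` below) is the per-head size.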
experiments/exp.ipynb ADDED
@@ -0,0 +1,468 @@

```python
# experiments/exp.ipynb, shown as percent-format cells; outputs quoted as comments.

# %%
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# Output:
# --2023-10-27 16:11:32--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# Resolving raw.githubusercontent.com ... connected.
# HTTP request sent, awaiting response... 200 OK
# Length: 1115394 (1.1M) [text/plain]
# Saving to: 'input.txt.1'
# 2023-10-27 16:11:36 (734 KB/s) - 'input.txt.1' saved [1115394/1115394]

# %%
with open("input.txt") as f:
    text = f.read()

# %%
text[:50]
# Output: 'First Citizen:\nBefore we proceed any further, hear'

# %%
chars = sorted(list(set(text)))
vocab_size = len(chars)

print("".join(chars))
print(vocab_size)
# Output:
#
#  !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
# 65

# %%
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

print(encode("hi there"))
print(decode(encode("hi there")))
# Output:
# [46, 47, 1, 58, 46, 43, 56, 43]
# hi there

# %%
import torch

data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])
# Output:
# torch.Size([1115394]) torch.int64
# tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
#         53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
#          1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
#         57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
#          6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
#         58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

# %%
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

# %%
torch.manual_seed(1337)
batch_size = 4
block_size = 8


def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x, y


xb, yb = get_batch("train")
print("Inputs:")
print(xb.shape)
print(xb)

print("-----------")
print("Targets:")
print(yb.shape)
print(yb)
# Output:
# Inputs:
# torch.Size([4, 8])
# tensor([[24, 43, 58,  5, 57,  1, 46, 43],
#         [44, 53, 56,  1, 58, 46, 39, 58],
#         [52, 58,  1, 58, 46, 39, 58,  1],
#         [25, 17, 27, 10,  0, 21,  1, 54]])
# -----------
# Targets:
# torch.Size([4, 8])
# tensor([[43, 58,  5, 57,  1, 46, 43, 39],
#         [53, 56,  1, 58, 46, 39, 58,  1],
#         [58,  1, 58, 46, 39, 58,  1, 46],
#         [17, 27, 10,  0, 21,  1, 54, 39]])

# %%
import torch.nn as nn
from torch.nn import functional as F


# First pass: logits only, no loss yet.
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets):
        logits = self.token_embedding_table(idx)
        return logits

# %%
m = BigramLanguageModel(vocab_size)
out = m(xb, yb)
print(out.shape)  # B,T,C -> 4x8x65
# Output: torch.Size([4, 8, 65])

# %%
# Second pass: add the loss and an autoregressive generate() loop.
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)  # (B,T,C)
        loss = None
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)  # (B,T,C)
            logits = logits[:, -1, :]  # (B,C)
            probs = F.softmax(logits, dim=-1)  # (B,C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B,T+1)

        return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)  # flattened to (B*T, C) = 32x65 when targets are given
print(loss)
# Output:
# torch.Size([32, 65])
# tensor(4.5262, grad_fn=<NllLossBackward0>)

# %%
idx = torch.zeros((1, 1), dtype=torch.long)

results = decode(m.generate(idx, max_new_tokens=100)[0].tolist())

print(results)
# Output (untrained model):
#
# 'JgC.JZWqUkpdtkSpmzjM-,RqzgaN?vC:hgjnAnBZDga-APqGUH!WdCbIb;$DefOYbEvcaKGMmnO'q$KdS-'ZH
# .YSqr'X!Q! d;

# %%
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

# %%
batch_size = 32

for steps in range(10000):
    xb, yb = get_batch("train")

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())
# Output: 2.4206888675689697

# %%
idx = torch.zeros((1, 1), dtype=torch.long)

results = decode(m.generate(idx, max_new_tokens=100)[0].tolist())

print(results)
# Output:
#
# Hou'sy'ting'stis's w ys'stholealy woawhimedy it 'save,
# Too:Had wh fo an, ZCENERUCHENar ee onds, th h

# %%
# Single self-attention head, worked through by hand.
B, T, C = 4, 8, 32

x = torch.randn(B, T, C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)
q = query(x)
wei = q @ k.transpose(-2, -1) * (head_size**-0.5)  # (B,T,16) @ (B,16,T) --> (B,T,T)

tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)
v = value(x)
out = wei @ v

out.shape
# Output: torch.Size([4, 8, 16])

# %%
wei[0]
# Output:
# tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
#         [0.3325, 0.6675, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
#         [0.3578, 0.2873, 0.3550, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
#         [0.2281, 0.1964, 0.2733, 0.3022, 0.0000, 0.0000, 0.0000, 0.0000],
#         [0.2851, 0.1588, 0.2068, 0.1436, 0.2057, 0.0000, 0.0000, 0.0000],
#         [0.2429, 0.1547, 0.1550, 0.1475, 0.2049, 0.0951, 0.0000, 0.0000],
#         [0.1573, 0.1838, 0.1123, 0.1680, 0.1528, 0.1194, 0.1063, 0.0000],
#         [0.1139, 0.1704, 0.0766, 0.1134, 0.1600, 0.1466, 0.1228, 0.0963]],
#        grad_fn=<SelectBackward0>)
```
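The `wei[0]` dump above makes the causal structure visible: row t has nonzero weight only on columns 0..t. Each row should also be a probability distribution; a quick check, continuing from the notebook's variables:

```python
# Every attention row sums to 1 after the masked softmax.
assert torch.allclose(wei.sum(dim=-1), torch.ones(B, T))
```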
gpt.ipynb ADDED
@@ -0,0 +1,211 @@

```python
# gpt.ipynb, shown as percent-format cells; outputs quoted as comments.

# %% [markdown]
# ## Import Dependencies

# %%
import torch

# %%
from src.model import GPTModel
from src.training import train
from src.inference import generate
from src.utils import vocab_size

# %% [markdown]
# ## Declare Hyperparams

# %%
batch_size = 64
block_size = 256
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = "cuda:1" if torch.cuda.is_available() else "cpu"
eval_iters = 200
n_embeds = 384
n_heads = 6
n_layers = 6
dropout = 0.2

# %% [markdown]
# ## Initialize Model and Optimizer

# %%
model = GPTModel(vocab_size, n_embeds, block_size, n_heads, n_layers, dropout, device)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# %% [markdown]
# ## Model Training

# %%
train(
    model,
    optimizer,
    max_iters,
    eval_interval,
    eval_iters,
    block_size,
    batch_size,
    device,
)
# Output:
# Step 0: train loss 4.3249, val loss 4.3219
# Step 500: train loss 2.0213, val loss 2.0953
# Step 1000: train loss 1.6067, val loss 1.7813
# Step 1500: train loss 1.4462, val loss 1.6380
# Step 2000: train loss 1.3516, val loss 1.5810
# Step 2500: train loss 1.2836, val loss 1.5376
# Step 3000: train loss 1.2309, val loss 1.5148
# Step 3500: train loss 1.1910, val loss 1.4904
# Step 4000: train loss 1.1522, val loss 1.4822
# Step 4500: train loss 1.1186, val loss 1.4838

# %% [markdown]
# ## Load the model and Generate text

# %%
model = torch.load("checkpoints/model.pth", map_location=device)
results = generate("hello", model, block_size, 1000, device)
print(results)
# Output: the 1000-token sample beginning "hellows thence grown from thee."
# (reproduced in full in the README above)
```
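For a sense of scale: with these hyperparameters (6 layers, 6 heads, 384-dim embeddings, 65-token vocabulary), the model comes out to roughly 10.8M parameters by my count, in line with the roughly 10M model in Karpathy's video. A one-line check, assuming `model` from the notebook above:

```python
print(sum(p.numel() for p in model.parameters()))  # roughly 10.8M with the settings above
```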
src/__pycache__/inference.cpython-310.pyc ADDED
Binary file (527 Bytes).

src/__pycache__/model.cpython-310.pyc ADDED
Binary file (4.77 kB).

src/__pycache__/training.cpython-310.pyc ADDED
Binary file (1.27 kB).

src/__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.75 kB).
src/inference.py ADDED
@@ -0,0 +1,9 @@

```python
import torch
from src.utils import encode, decode


def generate(prompt, model, block_size, max_new_tokens, device):
    # Encode the prompt into token ids and truncate it to the context window.
    X = torch.tensor(encode(prompt), dtype=torch.long, device=device)
    X = X[:block_size].unsqueeze(0)  # add a batch dimension: (1, T)
    # Autoregressively sample max_new_tokens tokens and decode back to text.
    results = decode(model.generate(X, max_new_tokens=max_new_tokens)[0].tolist())
    return results
```
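One caveat worth noting: `encode` indexes `stoi`, which only contains the 65 characters present in tinyshakespeare, so a prompt containing any other character (most digits, for example) raises a `KeyError`. A defensive sketch with a hypothetical helper, not part of the commit:

```python
from src.utils import stoi


def safe_encode(prompt):
    # Hypothetical helper: silently drop characters the tinyshakespeare
    # vocabulary has never seen instead of raising KeyError in encode().
    return [stoi[c] for c in prompt if c in stoi]
```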
src/model.py ADDED
@@ -0,0 +1,120 @@

```python
import torch
from torch import nn
import torch.nn.functional as F


class Head(nn.Module):
    """One head of masked (causal) self-attention."""

    def __init__(self, n_embeds, head_size, block_size, dropout) -> None:
        super().__init__()
        self.key = nn.Linear(n_embeds, head_size, bias=False)
        self.query = nn.Linear(n_embeds, head_size, bias=False)
        self.value = nn.Linear(n_embeds, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular mask: position t only attends to positions <= t.
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        # (B,T,hs) @ (B,hs,T) --> (B,T,T); note the checkpoint was trained with
        # this C**-0.5 scaling (C = n_embeds), not the paper's head_size**-0.5.
        wei = q @ k.transpose(-2, -1) * (C**-0.5)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out


class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, n_embeds, head_size, block_size, dropout):
        super().__init__()
        self.heads = nn.ModuleList(
            [Head(n_embeds, head_size, block_size, dropout) for _ in range(n_heads)]
        )
        self.proj = nn.Linear(n_embeds, n_embeds)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Run the heads in parallel and concatenate along the channel dim.
        x = torch.cat([h(x) for h in self.heads], dim=-1)
        x = self.proj(x)
        x = self.dropout(x)
        return x


class FeedForward(nn.Module):
    def __init__(self, n_embeds, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embeds, 4 * n_embeds),
            nn.ReLU(),
            nn.Linear(4 * n_embeds, n_embeds),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Decoder(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual."""

    def __init__(self, n_embeds, n_heads, block_size, dropout):
        super().__init__()
        head_size = n_embeds // n_heads
        self.sa_heads = MultiHeadAttention(
            n_heads, n_embeds, head_size, block_size, dropout
        )
        self.ffwd = FeedForward(n_embeds, dropout)
        self.ln1 = nn.LayerNorm(n_embeds)
        self.ln2 = nn.LayerNorm(n_embeds)

    def forward(self, x):
        x = x + self.sa_heads(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class GPTModel(nn.Module):
    def __init__(
        self, vocab_size, n_embeds, block_size, n_heads, n_layers, dropout, device
    ):
        super().__init__()
        self.device = device
        self.block_size = block_size
        self.token_embedding_table = nn.Embedding(vocab_size, n_embeds)
        self.position_embedding_table = nn.Embedding(block_size, n_embeds)
        self.blocks = nn.Sequential(
            *[Decoder(n_embeds, n_heads, block_size, dropout) for _ in range(n_layers)]
        )
        self.lnf = nn.LayerNorm(n_embeds)
        self.lm_head = nn.Linear(n_embeds, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_embeds = self.token_embedding_table(idx)  # (B,T,n_embeds)
        pos_embeds = self.position_embedding_table(
            torch.arange(T, device=self.device)
        )  # (T,n_embeds)

        x = tok_embeds + pos_embeds  # (B,T,n_embeds)
        x = self.blocks(x)
        x = self.lnf(x)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        loss = None
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # Crop the context to the last block_size tokens.
            idx_cond = idx[:, -self.block_size :]
            logits, loss = self(idx_cond)  # (B,T,C)
            logits = logits[:, -1, :]  # (B,C)
            probs = F.softmax(logits, dim=-1)  # (B,C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B,T+1)

        return idx
```
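A minimal shape smoke test for the module, a sketch using the same hyperparameters as elsewhere in the commit:

```python
import torch
from src.model import GPTModel

model = GPTModel(
    vocab_size=65, n_embeds=384, block_size=256,
    n_heads=6, n_layers=6, dropout=0.2, device="cpu",
)
idx = torch.randint(0, 65, (2, 16))     # batch of 2 sequences, 16 tokens each
logits, loss = model(idx, targets=idx)  # real training uses targets shifted by one
print(logits.shape)                     # torch.Size([32, 65]): flattened to (B*T, vocab)
```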
src/training.py ADDED
@@ -0,0 +1,53 @@

```python
import torch
from torch import nn

from src.utils import get_batch


@torch.no_grad()
def estimate_loss(model: nn.Module, eval_iters, block_size, batch_size, device):
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split, block_size, batch_size)
            X, Y = X.to(device), Y.to(device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


def train(
    model,
    optimizer,
    max_iters,
    eval_interval,
    eval_iters,
    block_size,
    batch_size,
    device,
):
    val_loss = None
    for iter in range(max_iters):
        if iter % eval_interval == 0:
            losses = estimate_loss(model, eval_iters, block_size, batch_size, device)
            print(
                f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
            )
            # Checkpoint whenever validation loss improves on the best seen so far.
            if val_loss is None or losses["val"] < val_loss:
                val_loss = losses["val"]
                torch.save(model, "checkpoints/model.pth")

        xb, yb = get_batch("train", block_size, batch_size)
        xb, yb = xb.to(device), yb.to(device)

        logits, loss = model(xb, yb)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
```
src/utils.py ADDED
@@ -0,0 +1,32 @@

```python
import torch

with open("data/input.txt") as f:
    text = f.read()

# Character-level vocabulary built from the tinyshakespeare corpus.
chars = sorted(list(set(text)))
vocab_size = len(chars)

stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    return [stoi[c] for c in s]


def decode(l):
    return "".join([itos[i] for i in l])


# 90/10 train/validation split of the encoded corpus.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]


def get_batch(split, block_size, batch_size):
    # Sample batch_size random windows; targets are the inputs shifted by one.
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x, y
```
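A quick usage sketch of the batch sampler; the shapes follow directly from the slicing above:

```python
from src.utils import get_batch, vocab_size

x, y = get_batch("train", block_size=8, batch_size=4)
print(x.shape, y.shape)  # torch.Size([4, 8]) torch.Size([4, 8]); y[i] holds next-token targets for x[i]
print(vocab_size)        # 65 for tinyshakespeare
```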