Spaces:

jkeisling
/

gpt

Sleeping

App Files Files Community

jkeisling committed on Mar 30, 2023

Commit

f747ce5

1 Parent(s): fb24f54

Fix training objective; lower model size

Browse files

Files changed (2) hide show

.gitignore +3 -0
gpt.ipynb +315 -203

.gitignore CHANGED Viewed

@@ -2,6 +2,9 @@
 checkpoints/
 datasets/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

 checkpoints/
 datasets/
+# Training Tensorboard runs
+runs/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

gpt.ipynb CHANGED Viewed

@@ -10,20 +10,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
     "import os\n",
     "\n",
-    "# We always start with a dataset to train on. Let's download the tiny shakespeare dataset\n",
     "if not os.path.isfile(\"./datasets/corpora/shakespeare.txt\"):\n",
-    "    !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt > datasets/corpora/shakespeare.txt"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -31,6 +30,21 @@
     "    text = f.read()"
    ]
   },
   {
    "attachments": {},
    "cell_type": "markdown",
@@ -41,16 +55,86 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<torch._C.Generator at 0x7f7b543cb430>"
       ]
      },
-     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -71,7 +155,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -85,16 +169,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
    "metadata": {},
-   "outputs": [],
    "source": [
     "# Tensorify data, put it in dataset\n",
     "data = torch.tensor(encode_text(text), dtype=torch.int32)\n",
     "\n",
-    "split_idx = int(0.9 * len(data))\n",
-    "train_data = data[:split_idx]\n",
-    "test_data = data[split_idx:]"
    ]
   },
   {
@@ -107,7 +202,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -117,92 +212,15 @@
     "        self.context_size = context_size\n",
     "    \n",
     "    def __len__(self):\n",
-    "        return len(self.data_tensor)\n",
     "\n",
     "    def __getitem__(self, index):\n",
-    "        if index < self.context_size:\n",
-    "            x = F.pad(self.data_tensor[:index], (self.context_size - index, 0), value=0)\n",
-    "        else:\n",
-    "            x = self.data_tensor[index - self.context_size:index]\n",
     "        \n",
-    "        y = self.data_tensor[index]\n",
     "        return x, y"
    ]
   },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "NOTE 2023-03-25: I think this is bugged, and that's the reason the training loss is so damn high. Testing:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Step 0:\n",
-      "[0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "---\n",
-      "[0, 0, 0, 0, 0, 0, 0, 70]\n",
-      "---\n",
-      "['F', 'i']\n",
-      "Step 1:\n",
-      "[0, 0, 0, 0, 0, 0, 70, 105]\n",
-      "---\n",
-      "[0, 0, 0, 0, 0, 70, 105, 114]\n",
-      "---\n",
-      "['r', 's']\n",
-      "Step 2:\n",
-      "[0, 0, 0, 0, 70, 105, 114, 115]\n",
-      "---\n",
-      "[0, 0, 0, 70, 105, 114, 115, 116]\n",
-      "---\n",
-      "['t', ' ']\n",
-      "Step 3:\n",
-      "[0, 0, 70, 105, 114, 115, 116, 32]\n",
-      "---\n",
-      "[0, 70, 105, 114, 115, 116, 32, 67]\n",
-      "---\n",
-      "['C', 'i']\n",
-      "Step 4:\n",
-      "[70, 105, 114, 115, 116, 32, 67, 105]\n",
-      "---\n",
-      "[105, 114, 115, 116, 32, 67, 105, 116]\n",
-      "---\n",
-      "['t', 'i']\n",
-      "Step 5:\n",
-      "[114, 115, 116, 32, 67, 105, 116, 105]\n",
-      "---\n",
-      "[115, 116, 32, 67, 105, 116, 105, 122]\n",
-      "---\n",
-      "['z', 'e']\n"
-     ]
-    }
-   ],
-   "source": [
-    "train_dataset = TextDataset(train_data, 8)\n",
-    "train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=False)\n",
-    "\n",
-    "step = 0\n",
-    "for x, y in train_dataloader:\n",
-    "    print(f\"Step {step}:\")\n",
-    "    for b in x.tolist():\n",
-    "        print(b)\n",
-    "        print(\"---\")\n",
-    "\n",
-    "    print(decode_text(y.tolist()))\n",
-    "    step += 1\n",
-    "    if step > 5:\n",
-    "        break\n",
-    "\n"
-   ]
-  },
   {
    "attachments": {},
    "cell_type": "markdown",
@@ -213,7 +231,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -226,15 +244,14 @@
     "        self.num_heads = num_heads\n",
     "        self.d_k = embed_dim // num_heads\n",
     "\n",
-    "        self.Q = nn.Linear(embed_dim, embed_dim, bias=bias)\n",
-    "        self.K = nn.Linear(embed_dim, embed_dim, bias=bias)\n",
-    "        self.V = nn.Linear(embed_dim, embed_dim, bias=bias)\n",
     "\n",
     "        self.dropout = nn.Dropout(dropout)\n",
     "        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)\n",
-    "        nn.init.kaiming_normal_(self.out_proj.weight, mode='fan_in', nonlinearity='linear')\n",
     "\n",
-    "    def forward(self, query, key, value, key_padding_mask=None):\n",
     "        batch_size = query.size(0)\n",
     "\n",
     "        # Apply linear layers\n",
@@ -251,7 +268,7 @@
     "        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) # [B, num_heads, C, C]\n",
     "\n",
     "        # Apply mask, if necessary\n",
-    "        if key_padding_mask is not None:\n",
     "            \"\"\"\n",
     "            MAY BE WORTH DEBUGGING\n",
     "\n",
@@ -263,7 +280,7 @@
     "                key_padding_mask = key_padding_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]\n",
     "            \"\"\"\n",
     "            # Apply the mask to attention scores\n",
-    "            scores = scores.masked_fill(key_padding_mask, float('-inf'))\n",
     "\n",
     "        # Softmax over the last axis turns the already-scaled scores into attention weights\n",
     "        attn = F.softmax(scores, dim=-1)\n",
@@ -275,13 +292,13 @@
     "        out = out.transpose(1, 2).contiguous().view(batch_size, -1, self.embed_dim)\n",
     "        # Project: give attention \"time to think\". Maybe this should be part of a different module but whatever\n",
     "        out = self.out_proj(out)\n",
-    "        return(out)\n",
     "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -290,9 +307,9 @@
     "        super().__init__()\n",
     "        self.net = nn.Sequential(\n",
     "            nn.Linear(embed_dim, 4 * embed_dim),\n",
-    "            nn.ReLU(),\n",
-    "            nn.Dropout(dropout)\n",
     "            nn.Linear(4 * embed_dim, embed_dim),\n",
     "        )\n",
     "\n",
     "    def forward(self, x):\n",
@@ -301,7 +318,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -311,6 +328,7 @@
     "        super(Block, self).__init__()  \n",
     "        self.register_buffer(\"mask\", mask)\n",
     "        self.head = MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout)\n",
     "        self.ffwd = FeedForward(embed_dim=embed_dim, dropout=dropout)\n",
     "        self.ln1 = nn.LayerNorm(embed_dim)\n",
     "        self.ln2 = nn.LayerNorm(embed_dim)\n",
@@ -318,58 +336,57 @@
     "    def forward(self, x):\n",
     "        # Residual connections\n",
     "        x = self.ln1(x)\n",
-    "        x = x + self.head.forward(x, x, x, key_padding_mask=self.mask) \n",
     "        out = x + self.ffwd(self.ln2(x))\n",
     "        return out\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
     "class GPT(nn.Module):\n",
-    "    def __init__(self, embedding_dim, vocab_size, context_size, lr=1e-3):\n",
-    "        # Inherit PyTorch stuff\n",
     "        super(GPT, self).__init__()\n",
     "\n",
-    "        # Save variables for later\n",
     "        self.embedding_dim = embedding_dim\n",
     "        self.output_dim = vocab_size\n",
     "        self.context_size = context_size\n",
     "\n",
-    "        # Initialize layers. Sadly this breaks the whole \"self.layers: concept but whatever\n",
     "        self.tok_embed = nn.Embedding(vocab_size, embedding_dim)\n",
     "        self.pos_embed = nn.Embedding(context_size, embedding_dim)\n",
     "\n",
-    "        NUM_HEADS=6\n",
-    "        NUM_LAYERS=6\n",
-    "        \n",
     "        mask = torch.tril(torch.ones(self.context_size, self.context_size)).bool()\n",
     "        mask = ~mask\n",
-    "        self.register_buffer(mask)\n",
     "\n",
     "        self.blocks = nn.Sequential(\n",
-    "            *[Block(embed_dim=embedding_dim, num_heads=NUM_HEADS, mask=mask) for _ in range(NUM_LAYERS)],\n",
-    "            nn.Dropout(0.2)\n",
     "        )\n",
     "\n",
     "        # Final feed-forward layer from embeddings\n",
-    "        self.ffwd = nn.Linear(embedding_dim, out_features=vocab_size)\n",
     "\n",
     "    def forward(self, x):\n",
     "        tok_embed = self.tok_embed(x)\n",
-    "        tok_embed = tok_embed.view(-1, self.context_size, self.embedding_dim)\n",
-    "        pos_embed = self.pos_embed(torch.arange(0, self.context_size, device=\"cuda\")).unsqueeze(0)\n",
     "        x = tok_embed + pos_embed\n",
     "\n",
-    "        # The actual attention is all you need here!\n",
-    "        # B*C*C cutting out the future\n",
     "        x = self.blocks(x)\n",
     "\n",
-    "        preds = self.ffwd(x)\n",
-    "        return(preds)\n",
     "    \n",
     "    def infer(self, x):\n",
     "        with torch.no_grad():\n",
@@ -387,93 +404,114 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
     "def compute_loss(model, criterion, x, y):\n",
     "    logits = model(x)\n",
-    "    last_logits = logits[:, -1, :]\n",
-    "    log_probs = nn.LogSoftmax(dim=1)(last_logits)\n",
-    "    loss = criterion(log_probs, y.view(-1).long())\n",
     "    return loss"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
    "metadata": {},
    "outputs": [],
    "source": [
-    "EMBEDDING_NDIM = 384\n",
-    "VOCAB_SIZE = 128\n",
-    "BATCH_SIZE=64\n",
-    "# \"Context window\"\n",
-    "BLOCK_SIZE=256\n",
-    "LR=1e-3\n",
     "\n",
     "train_dataset = TextDataset(train_data, BLOCK_SIZE)\n",
-    "test_dataset = TextDataset(train_data, BLOCK_SIZE)\n",
     "\n",
     "# Janky training code\n",
     "model = GPT(\n",
     "    embedding_dim=EMBEDDING_NDIM, \n",
     "    vocab_size=VOCAB_SIZE,\n",
     "    context_size=BLOCK_SIZE,\n",
-    "    lr=LR\n",
     "    )\n",
     "\n",
     "model = model.to('cuda')\n",
     "optimizer = optim.AdamW(model.parameters(), lr=LR)\n",
-    "# TODO Fix this!\n",
-    "scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10000, gamma=0.2)\n",
-    "criterion = nn.NLLLoss()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Step 0; loss: 3.3686537742614746\n",
-      "Step 100; loss: 3.3535483678181968\n",
-      "Step 200; loss: 3.3484479188919067\n",
-      "Step 300; loss: 3.344235420227051\n",
-      "Step 400; loss: 3.338580369949341\n",
-      "Step 500; loss: 3.330465725490025\n",
-      "Step 600; loss: 3.333183079957962\n",
-      "Step 700; loss: 3.3319032986958823\n",
-      "Step 800; loss: 3.332624101638794\n",
-      "Step 900; loss: 3.3325188810175117\n",
-      "Step 1000; loss: 3.331260542074839\n",
-      "Step 1100; loss: 3.3311657355381894\n"
-     ]
-    },
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-      "\u001b[1;32m/home/ritsuko/projects/ai/micrograd/gpt.ipynb Cell 20\u001b[0m in \u001b[0;36m2\n\u001b[1;32m     <a href='vscode-notebook-cell:/home/ritsuko/projects/ai/micrograd/gpt.ipynb#X24sZmlsZQ%3D%3D?line=24'>25</a>\u001b[0m \u001b[39m# Backward pass\u001b[39;00m\n\u001b[1;32m     <a href='vscode-notebook-cell:/home/ritsuko/projects/ai/micrograd/gpt.ipynb#X24sZmlsZQ%3D%3D?line=25'>26</a>\u001b[0m optimizer\u001b[39m.\u001b[39mzero_grad()\n\u001b[0;32m---> <a href='vscode-notebook-cell:/home/ritsuko/projects/ai/micrograd/gpt.ipynb#X24sZmlsZQ%3D%3D?line=26'>27</a>\u001b[0m loss\u001b[39m.\u001b[39;49mbackward()\n\u001b[1;32m     <a href='vscode-notebook-cell:/home/ritsuko/projects/ai/micrograd/gpt.ipynb#X24sZmlsZQ%3D%3D?line=27'>28</a>\u001b[0m optimizer\u001b[39m.\u001b[39mstep()\n\u001b[1;32m     <a href='vscode-notebook-cell:/home/ritsuko/projects/ai/micrograd/gpt.ipynb#X24sZmlsZQ%3D%3D?line=28'>29</a>\u001b[0m scheduler\u001b[39m.\u001b[39mstep()\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/_tensor.py:396\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m    387\u001b[0m \u001b[39mif\u001b[39;00m has_torch_function_unary(\u001b[39mself\u001b[39m):\n\u001b[1;32m    388\u001b[0m     \u001b[39mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m    389\u001b[0m         Tensor\u001b[39m.\u001b[39mbackward,\n\u001b[1;32m    390\u001b[0m         (\u001b[39mself\u001b[39m,),\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    394\u001b[0m         create_graph\u001b[39m=\u001b[39mcreate_graph,\n\u001b[1;32m    395\u001b[0m         inputs\u001b[39m=\u001b[39minputs)\n\u001b[0;32m--> 396\u001b[0m torch\u001b[39m.\u001b[39;49mautograd\u001b[39m.\u001b[39;49mbackward(\u001b[39mself\u001b[39;49m, gradient, retain_graph, create_graph, inputs\u001b[39m=\u001b[39;49minputs)\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/autograd/__init__.py:173\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m    168\u001b[0m     retain_graph \u001b[39m=\u001b[39m create_graph\n\u001b[1;32m    170\u001b[0m \u001b[39m# The reason we repeat same the comment below is that\u001b[39;00m\n\u001b[1;32m    171\u001b[0m \u001b[39m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m    172\u001b[0m \u001b[39m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 173\u001b[0m Variable\u001b[39m.\u001b[39;49m_execution_engine\u001b[39m.\u001b[39;49mrun_backward(  \u001b[39m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m    174\u001b[0m     tensors, grad_tensors_, retain_graph, create_graph, inputs,\n\u001b[1;32m    175\u001b[0m     allow_unreachable\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, accumulate_grad\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
      ]
     }
    ],
    "source": [
-    "from torch.utils.\n",
     "EPOCHS = 1\n",
     "STEPS = 5000\n",
     "VAL_INTERVAL = 100\n",
     "\n",
-    "losses = []\n",
     "model.train()\n",
     "\n",
     "train_dataloader = DataLoader(\n",
@@ -485,7 +523,10 @@
     "\n",
     "test_dataloader = DataLoader(test_dataset, batch_size=512, num_workers=4, shuffle=True)\n",
     "\n",
     "step = 0\n",
     "for epoch in range(EPOCHS):\n",
     "    for data, target in train_dataloader:\n",
     "        data = data.to('cuda')\n",
@@ -497,11 +538,16 @@
     "        optimizer.zero_grad()\n",
     "        loss.backward()\n",
     "        optimizer.step()\n",
-    "        scheduler.step()\n",
     "\n",
-    "        losses.append(loss.cpu().detach().numpy())\n",
     "\n",
     "        if step % VAL_INTERVAL == 0:\n",
     "            with torch.no_grad():\n",
     "                model.eval()\n",
     "                for x, y in test_dataloader:\n",
@@ -514,18 +560,23 @@
     "                    if total_samples > 10:\n",
     "                        break\n",
     "\n",
-    "                average_loss = total_loss / total_samples\n",
-    "                print(f\"Step {step}; loss: {average_loss}\")\n",
-    "                model.train()\n",
     "\n",
     "        step += 1\n",
     "        if step >= STEPS:\n",
-    "            break\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -534,7 +585,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -568,16 +619,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "2399"
       ]
      },
-     "execution_count": 37,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -589,14 +640,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "3.4188449382781982\n"
      ]
     }
    ],
@@ -605,16 +656,17 @@
     "total_loss = 0.0\n",
     "total_samples = 0\n",
     "\n",
-    "test_dataloader = DataLoader(test_dataset, batch_size=512, num_workers=4)\n",
     "with torch.no_grad():\n",
-    "    for x, y in test_dataloader:\n",
     "        x = x.to(\"cuda\")\n",
     "        y = y.to(\"cuda\")\n",
     "\n",
     "        batch_loss = compute_loss(model, criterion, x, y)\n",
     "        total_loss += batch_loss.item() * x.size(0)\n",
     "        total_samples += x.size(0)\n",
-    "        if total_samples > 100:\n",
     "            break\n",
     "\n",
     "    average_loss = total_loss / total_samples\n",
@@ -623,44 +675,98 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
-   "source": []
   },
   {
    "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Finally, we generate:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      ",n  aon mr\n",
-      "nr\n",
-      "egtel  s.mangtVk h\n",
-      " -hinSfii ol ihIraddeioi akpshaC.n trU d aamooaa eoeEhl:daoUabo'm-fddE auh hpyHs wv'erstiInnmwt hnAuNu ufl\n",
-      "I: rl.T   l!eool'lIhl:aynet nna:i yaneehtea hdel\n",
-      "  hse l;imi\n",
-      "  hgy f iuto eoh gBum.umhemvt\n",
-      "a hFo lNsute oaaenh;byeon"
      ]
     }
    ],
    "source": [
     "g_cuda = torch.Generator(device='cuda')\n",
     "\n",
-    "contexts = torch.tensor(encode_text(\"God\"), dtype=torch.int32).to('cuda')\n",
-    "GEN_LENGTH=256\n",
     "\n",
     "model.eval()\n",
     "for i in range(GEN_LENGTH):\n",
@@ -668,13 +774,19 @@
     "    # What happens if GEN_LENGTH > CONTEXT? don't worry about it\n",
     "    #x = F.pad(contexts[:, -BLOCK_SIZE:], (0, BLOCK_SIZE - contexts.size(0)), \"constant\", 0)\n",
     "    x = contexts[-BLOCK_SIZE:]\n",
-    "    x = F.pad(x, (0, BLOCK_SIZE - x.size(0)), \"constant\", 0).unsqueeze(0) # B*T\n",
     "    preds = model.infer(x)\n",
     "    preds = preds.squeeze(0)\n",
     "    probs = torch.softmax(preds, dim=-1)\n",
     "\n",
     "    # TODO: Broken because of bug with the trailing 0s. FIX THIS\n",
-    "    next_char = torch.multinomial(torch.exp(preds[(-1 if i >= BLOCK_SIZE else i), :]), num_samples=1, generator=g_cuda)\n",
     "    #context = torch.cat(context, next_char)\n",
     "    contexts = torch.cat((contexts, next_char), dim=0)\n",
     "    print(decode_text(next_char.cpu().numpy())[-1], end=\"\")\n",

   },
   {
    "cell_type": "code",
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [
     "import os\n",
     "\n",
     "if not os.path.isfile(\"./datasets/corpora/shakespeare.txt\"):\n",
+    "    !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O datasets/corpora/shakespeare.txt"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 40,
    "metadata": {},
    "outputs": [],
    "source": [
     "    text = f.read()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Putting hyperparameters at the top because I learned this the hard way\n",
+    "# 64 * NUM_HEADS\n",
+    "EMBEDDING_NDIM=256\n",
+    "VOCAB_SIZE=128\n",
+    "BATCH_SIZE=64\n",
+    "# \"Context window\"\n",
+    "BLOCK_SIZE=256"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",
   },
   {
    "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: torch in ./venv/lib/python3.10/site-packages (2.0.0)\n",
+      "Requirement already satisfied: pandas in ./venv/lib/python3.10/site-packages (1.5.3)\n",
+      "Requirement already satisfied: numpy in ./venv/lib/python3.10/site-packages (1.24.1)\n",
+      "Requirement already satisfied: tensorboard in ./venv/lib/python3.10/site-packages (2.12.0)\n",
+      "Requirement already satisfied: nvidia-nccl-cu11==2.14.3 in ./venv/lib/python3.10/site-packages (from torch) (2.14.3)\n",
+      "Requirement already satisfied: nvidia-cudnn-cu11==8.5.0.96 in ./venv/lib/python3.10/site-packages (from torch) (8.5.0.96)\n",
+      "Requirement already satisfied: nvidia-cusolver-cu11==11.4.0.1 in ./venv/lib/python3.10/site-packages (from torch) (11.4.0.1)\n",
+      "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.7.99 in ./venv/lib/python3.10/site-packages (from torch) (11.7.99)\n",
+      "Requirement already satisfied: networkx in ./venv/lib/python3.10/site-packages (from torch) (3.0)\n",
+      "Requirement already satisfied: nvidia-curand-cu11==10.2.10.91 in ./venv/lib/python3.10/site-packages (from torch) (10.2.10.91)\n",
+      "Requirement already satisfied: filelock in ./venv/lib/python3.10/site-packages (from torch) (3.10.4)\n",
+      "Requirement already satisfied: nvidia-nvtx-cu11==11.7.91 in ./venv/lib/python3.10/site-packages (from torch) (11.7.91)\n",
+      "Requirement already satisfied: typing-extensions in ./venv/lib/python3.10/site-packages (from torch) (4.5.0)\n",
+      "Requirement already satisfied: nvidia-cublas-cu11==11.10.3.66 in ./venv/lib/python3.10/site-packages (from torch) (11.10.3.66)\n",
+      "Requirement already satisfied: sympy in ./venv/lib/python3.10/site-packages (from torch) (1.11.1)\n",
+      "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.7.99 in ./venv/lib/python3.10/site-packages (from torch) (11.7.99)\n",
+      "Requirement already satisfied: nvidia-cufft-cu11==10.9.0.58 in ./venv/lib/python3.10/site-packages (from torch) (10.9.0.58)\n",
+      "Requirement already satisfied: jinja2 in ./venv/lib/python3.10/site-packages (from torch) (3.1.2)\n",
+      "Requirement already satisfied: triton==2.0.0 in ./venv/lib/python3.10/site-packages (from torch) (2.0.0)\n",
+      "Requirement already satisfied: nvidia-cuda-cupti-cu11==11.7.101 in ./venv/lib/python3.10/site-packages (from torch) (11.7.101)\n",
+      "Requirement already satisfied: nvidia-cusparse-cu11==11.7.4.91 in ./venv/lib/python3.10/site-packages (from torch) (11.7.4.91)\n",
+      "Requirement already satisfied: wheel in ./venv/lib/python3.10/site-packages (from nvidia-cublas-cu11==11.10.3.66->torch) (0.40.0)\n",
+      "Requirement already satisfied: setuptools in ./venv/lib/python3.10/site-packages (from nvidia-cublas-cu11==11.10.3.66->torch) (65.5.0)\n",
+      "Requirement already satisfied: lit in ./venv/lib/python3.10/site-packages (from triton==2.0.0->torch) (16.0.0)\n",
+      "Requirement already satisfied: cmake in ./venv/lib/python3.10/site-packages (from triton==2.0.0->torch) (3.26.1)\n",
+      "Requirement already satisfied: python-dateutil>=2.8.1 in ./venv/lib/python3.10/site-packages (from pandas) (2.8.2)\n",
+      "Requirement already satisfied: pytz>=2020.1 in ./venv/lib/python3.10/site-packages (from pandas) (2023.2)\n",
+      "Requirement already satisfied: requests<3,>=2.21.0 in ./venv/lib/python3.10/site-packages (from tensorboard) (2.28.2)\n",
+      "Requirement already satisfied: werkzeug>=1.0.1 in ./venv/lib/python3.10/site-packages (from tensorboard) (2.2.3)\n",
+      "Requirement already satisfied: google-auth<3,>=1.6.3 in ./venv/lib/python3.10/site-packages (from tensorboard) (2.16.3)\n",
+      "Requirement already satisfied: protobuf>=3.19.6 in ./venv/lib/python3.10/site-packages (from tensorboard) (4.22.1)\n",
+      "Requirement already satisfied: markdown>=2.6.8 in ./venv/lib/python3.10/site-packages (from tensorboard) (3.4.3)\n",
+      "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in ./venv/lib/python3.10/site-packages (from tensorboard) (0.4.6)\n",
+      "Requirement already satisfied: grpcio>=1.48.2 in ./venv/lib/python3.10/site-packages (from tensorboard) (1.51.3)\n",
+      "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in ./venv/lib/python3.10/site-packages (from tensorboard) (0.7.0)\n",
+      "Requirement already satisfied: absl-py>=0.4 in ./venv/lib/python3.10/site-packages (from tensorboard) (1.4.0)\n",
+      "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in ./venv/lib/python3.10/site-packages (from tensorboard) (1.8.1)\n",
+      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in ./venv/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard) (5.3.0)\n",
+      "Requirement already satisfied: pyasn1-modules>=0.2.1 in ./venv/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard) (0.2.8)\n",
+      "Requirement already satisfied: six>=1.9.0 in ./venv/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard) (1.16.0)\n",
+      "Requirement already satisfied: rsa<5,>=3.1.4 in ./venv/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard) (4.9)\n",
+      "Requirement already satisfied: requests-oauthlib>=0.7.0 in ./venv/lib/python3.10/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard) (1.3.1)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in ./venv/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard) (3.4)\n",
+      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in ./venv/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard) (1.26.15)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in ./venv/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard) (3.1.0)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in ./venv/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard) (2022.12.7)\n",
+      "Requirement already satisfied: MarkupSafe>=2.1.1 in ./venv/lib/python3.10/site-packages (from werkzeug>=1.0.1->tensorboard) (2.1.2)\n",
+      "Requirement already satisfied: mpmath>=0.19 in ./venv/lib/python3.10/site-packages (from sympy->torch) (1.3.0)\n",
+      "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in ./venv/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard) (0.4.8)\n",
+      "Requirement already satisfied: oauthlib>=3.0.0 in ./venv/lib/python3.10/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard) (3.2.2)\n",
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install torch pandas numpy tensorboard"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "<torch._C.Generator at 0x7fef50768610>"
       ]
      },
+     "execution_count": 43,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 44,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 45,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1115394 chars of data\n"
+     ]
+    }
+   ],
    "source": [
     "# Tensorify data, put it in dataset\n",
     "data = torch.tensor(encode_text(text), dtype=torch.int32)\n",
     "\n",
+    "test_split_idx = int(0.8 * len(data))\n",
+    "val_split_idx = int(0.9 * len(data))\n",
+    "train_data = data[:test_split_idx]\n",
+    "test_data = data[test_split_idx:val_split_idx]\n",
+    "val_data = data[val_split_idx:]\n",
+    "print(f\"{len(data)} chars of data\")"
    ]
   },
   {
   },
   {
    "cell_type": "code",
+   "execution_count": 46,
    "metadata": {},
    "outputs": [],
    "source": [
     "        self.context_size = context_size\n",
     "    \n",
     "    def __len__(self):\n",
+    "        return len(self.data_tensor) - self.context_size\n",
     "\n",
     "    def __getitem__(self, index):\n",
+    "        x = self.data_tensor[index:index + self.context_size]\n",
+    "        y = self.data_tensor[index + 1:index + self.context_size + 1]\n",
     "        \n",
     "        return x, y"
    ]
   },
   {
    "attachments": {},
    "cell_type": "markdown",
   },
   {
    "cell_type": "code",
+   "execution_count": 66,
    "metadata": {},
    "outputs": [],
    "source": [
     "        self.num_heads = num_heads\n",
     "        self.d_k = embed_dim // num_heads\n",
     "\n",
+    "        self.Q = nn.Linear(embed_dim, embed_dim, bias=False)\n",
+    "        self.K = nn.Linear(embed_dim, embed_dim, bias=False)\n",
+    "        self.V = nn.Linear(embed_dim, embed_dim, bias=False)\n",
     "\n",
     "        self.dropout = nn.Dropout(dropout)\n",
     "        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)\n",
     "\n",
+    "    def forward(self, query, key, value, attn_mask=None):\n",
     "        batch_size = query.size(0)\n",
     "\n",
     "        # Apply linear layers\n",
     "        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) # [B, num_heads, C, C]\n",
     "\n",
     "        # Apply mask, if necessary\n",
+    "        if attn_mask is not None:\n",
     "            \"\"\"\n",
     "            MAY BE WORTH DEBUGGING\n",
     "\n",
     "                key_padding_mask = key_padding_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]\n",
     "            \"\"\"\n",
     "            # Apply the mask to attention scores\n",
+    "            scores = scores.masked_fill(attn_mask, float('-inf'))\n",
     "\n",
     "        # Softmax over the last axis turns the already-scaled scores into attention weights\n",
     "        attn = F.softmax(scores, dim=-1)\n",
     "        out = out.transpose(1, 2).contiguous().view(batch_size, -1, self.embed_dim)\n",
     "        # Project: give attention \"time to think\". Maybe this should be part of a different module but whatever\n",
     "        out = self.out_proj(out)\n",
+    "        return((out, None))\n",
     "\n"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 48,
    "metadata": {},
    "outputs": [],
    "source": [
     "        super().__init__()\n",
     "        self.net = nn.Sequential(\n",
     "            nn.Linear(embed_dim, 4 * embed_dim),\n",
+    "            nn.GELU(),\n",
     "            nn.Linear(4 * embed_dim, embed_dim),\n",
+    "            nn.Dropout(dropout),\n",
     "        )\n",
     "\n",
     "    def forward(self, x):\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 60,
    "metadata": {},
    "outputs": [],
    "source": [
     "        super(Block, self).__init__()  \n",
     "        self.register_buffer(\"mask\", mask)\n",
     "        self.head = MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout)\n",
+    "        #self.head = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout, batch_first=True)\n",
     "        self.ffwd = FeedForward(embed_dim=embed_dim, dropout=dropout)\n",
     "        self.ln1 = nn.LayerNorm(embed_dim)\n",
     "        self.ln2 = nn.LayerNorm(embed_dim)\n",
     "    def forward(self, x):\n",
     "        # Residual connections\n",
     "        x = self.ln1(x)\n",
+    "        attn_output, _ = self.head(x, x, x, attn_mask=self.mask) \n",
+    "        x = x + attn_output\n",
     "        out = x + self.ffwd(self.ln2(x))\n",
     "        return out\n"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 50,
    "metadata": {},
    "outputs": [],
    "source": [
     "class GPT(nn.Module):\n",
+    "    def __init__(self, embedding_dim, vocab_size, context_size):\n",
     "        super(GPT, self).__init__()\n",
     "\n",
     "        self.embedding_dim = embedding_dim\n",
     "        self.output_dim = vocab_size\n",
     "        self.context_size = context_size\n",
     "\n",
+    "        NUM_HEADS=4\n",
+    "        NUM_LAYERS=4\n",
+    "        \n",
+    "        # Initialize layers\n",
     "        self.tok_embed = nn.Embedding(vocab_size, embedding_dim)\n",
     "        self.pos_embed = nn.Embedding(context_size, embedding_dim)\n",
     "\n",
     "        mask = torch.tril(torch.ones(self.context_size, self.context_size)).bool()\n",
     "        mask = ~mask\n",
+    "        self.register_buffer(\"mask\", mask)\n",
     "\n",
     "        self.blocks = nn.Sequential(\n",
+    "            *[Block(embed_dim=embedding_dim, num_heads=NUM_HEADS, mask=mask, dropout=0.2) for _ in range(NUM_LAYERS)]\n",
     "        )\n",
     "\n",
+    "        self.ln_f = nn.LayerNorm(self.embedding_dim)\n",
     "        # Final feed-forward layer from embeddings\n",
+    "        self.ffwd = nn.Linear(embedding_dim, out_features=vocab_size, bias=False)\n",
     "\n",
     "    def forward(self, x):\n",
     "        tok_embed = self.tok_embed(x)\n",
+    "        pos_embed = self.pos_embed(\n",
+    "            torch.arange(0, self.context_size, device=\"cuda\")\n",
+    "        )\n",
     "        x = tok_embed + pos_embed\n",
     "\n",
     "        x = self.blocks(x)\n",
+    "        x = self.ln_f(x)\n",
     "\n",
+    "        logits = self.ffwd(x)\n",
+    "        return(logits)\n",
     "    \n",
     "    def infer(self, x):\n",
     "        with torch.no_grad():\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 51,
    "metadata": {},
    "outputs": [],
    "source": [
     "def compute_loss(model, criterion, x, y):\n",
     "    logits = model(x)\n",
+    "    B,C,V = logits.shape\n",
+    "    logits = logits.view(B*C, V)\n",
+    "    y = y.view(B*C)\n",
+    "    loss = F.cross_entropy(logits, y.long())\n",
     "    return loss"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 67,
    "metadata": {},
    "outputs": [],
    "source": [
+    "LR=3e-4\n",
     "\n",
     "train_dataset = TextDataset(train_data, BLOCK_SIZE)\n",
+    "test_dataset = TextDataset(test_data, BLOCK_SIZE)\n",
     "\n",
     "# Janky training code\n",
     "model = GPT(\n",
     "    embedding_dim=EMBEDDING_NDIM, \n",
     "    vocab_size=VOCAB_SIZE,\n",
     "    context_size=BLOCK_SIZE,\n",
     "    )\n",
     "\n",
     "model = model.to('cuda')\n",
     "optimizer = optim.AdamW(model.parameters(), lr=LR)\n",
+    "#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.1)\n",
+    "criterion = F.cross_entropy\n",
+    "\n",
+    "global_step = 0"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 68,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Step 0; loss: 4.62758731842041\n",
+      "Step 100; loss: 2.5372843742370605\n",
+      "Step 200; loss: 2.486722946166992\n",
+      "Step 300; loss: 2.3916263580322266\n",
+      "Step 400; loss: 2.269087314605713\n",
+      "Step 500; loss: 2.1484358310699463\n",
+      "Step 600; loss: 2.057586193084717\n",
+      "Step 700; loss: 1.9845455884933472\n",
+      "Step 800; loss: 1.910020351409912\n",
+      "Step 900; loss: 1.8550803661346436\n",
+      "Step 1000; loss: 1.8193731307983398\n",
+      "Step 1100; loss: 1.767741322517395\n",
+      "Step 1200; loss: 1.7612113952636719\n",
+      "Step 1300; loss: 1.7009034156799316\n",
+      "Step 1400; loss: 1.6827564239501953\n",
+      "Step 1500; loss: 1.6604313850402832\n",
+      "Step 1600; loss: 1.633068323135376\n",
+      "Step 1700; loss: 1.6335963010787964\n",
+      "Step 1800; loss: 1.6095472574234009\n",
+      "Step 1900; loss: 1.6086715459823608\n",
+      "Step 2000; loss: 1.5876469612121582\n",
+      "Step 2100; loss: 1.5713247060775757\n",
+      "Step 2200; loss: 1.5546257495880127\n",
+      "Step 2300; loss: 1.5589814186096191\n",
+      "Step 2400; loss: 1.5507397651672363\n",
+      "Step 2500; loss: 1.5470337867736816\n",
+      "Step 2600; loss: 1.547551155090332\n",
+      "Step 2700; loss: 1.5338884592056274\n",
+      "Step 2800; loss: 1.5179914236068726\n",
+      "Step 2900; loss: 1.5240544080734253\n",
+      "Step 3000; loss: 1.5162924528121948\n",
+      "Step 3100; loss: 1.5197933912277222\n",
+      "Step 3200; loss: 1.5107413530349731\n",
+      "Step 3300; loss: 1.5017006397247314\n",
+      "Step 3400; loss: 1.4874128103256226\n",
+      "Step 3500; loss: 1.4917751550674438\n",
+      "Step 3600; loss: 1.5251762866973877\n",
+      "Step 3700; loss: 1.4957225322723389\n",
+      "Step 3800; loss: 1.507473111152649\n",
+      "Step 3900; loss: 1.4815101623535156\n",
+      "Step 4000; loss: 1.4824676513671875\n",
+      "Step 4100; loss: 1.4799575805664062\n",
+      "Step 4200; loss: 1.4820805788040161\n",
+      "Step 4300; loss: 1.4852553606033325\n",
+      "Step 4400; loss: 1.469815731048584\n",
+      "Step 4500; loss: 1.4853312969207764\n",
+      "Step 4600; loss: 1.4830256700515747\n",
+      "Step 4700; loss: 1.468559741973877\n",
+      "Step 4800; loss: 1.4680243730545044\n",
+      "Step 4900; loss: 1.464580774307251\n"
      ]
     }
    ],
    "source": [
+    "from torch.utils.tensorboard import SummaryWriter\n",
+    "\n",
     "EPOCHS = 1\n",
     "STEPS = 5000\n",
     "VAL_INTERVAL = 100\n",
     "\n",
     "model.train()\n",
     "\n",
     "train_dataloader = DataLoader(\n",
     "\n",
     "test_dataloader = DataLoader(test_dataset, batch_size=512, num_workers=4, shuffle=True)\n",
     "\n",
+    "writer = SummaryWriter()\n",
+    "\n",
     "step = 0\n",
+    "\n",
     "for epoch in range(EPOCHS):\n",
     "    for data, target in train_dataloader:\n",
     "        data = data.to('cuda')\n",
     "        optimizer.zero_grad()\n",
     "        loss.backward()\n",
     "        optimizer.step()\n",
+    "        #scheduler.step()\n",
     "\n",
+    "        writer.add_scalar(\"Loss/train\", loss.cpu().detach().numpy(), global_step)\n",
+    "        global_step += 1\n",
     "\n",
+    "        # TODO!!! WTF???\n",
     "        if step % VAL_INTERVAL == 0:\n",
+    "            total_loss = 0\n",
+    "            total_samples = 0\n",
+    "\n",
     "            with torch.no_grad():\n",
     "                model.eval()\n",
     "                for x, y in test_dataloader:\n",
     "                    if total_samples > 10:\n",
     "                        break\n",
     "\n",
+    "            model.train()\n",
+    "            average_loss = total_loss / total_samples\n",
+    "\n",
+    "            print(f\"Step {step}; loss: {average_loss}\")\n",
+    "            writer.add_scalar(\"Loss/val\", average_loss, global_step)\n",
+    "\n",
     "\n",
     "        step += 1\n",
     "        if step >= STEPS:\n",
+    "            break\n",
+    "\n",
+    "writer.close()\n"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 69,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 70,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "841"
       ]
      },
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 57,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "1.7774584962397206\n"
      ]
     }
    ],
     "total_loss = 0.0\n",
     "total_samples = 0\n",
     "\n",
+    "val_dataset = TextDataset(val_data, BLOCK_SIZE)\n",
+    "val_dataloader = DataLoader(val_dataset, batch_size=512, num_workers=4)\n",
     "with torch.no_grad():\n",
+    "    for x, y in val_dataloader:\n",
     "        x = x.to(\"cuda\")\n",
     "        y = y.to(\"cuda\")\n",
     "\n",
     "        batch_loss = compute_loss(model, criterion, x, y)\n",
     "        total_loss += batch_loss.item() * x.size(0)\n",
     "        total_samples += x.size(0)\n",
+    "        if total_samples > 100000:\n",
     "            break\n",
     "\n",
     "    average_loss = total_loss / total_samples\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 71,
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3286528"
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
+    "num_params"
+   ]
   },
   {
    "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "Finally, we generate text. NOTE: seeds shorter than 256 characters produce nonsense until the generated text fills the context window. I think this is because Karpathy put all of Shakespeare into a single file with no act/scene breaks, and neither he nor I split it, so there is only one padded context that the model ever sees. TODO: fix this in the data-loading step."
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 58,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Tutus, to Marcius, noble Marcius\n",
+      "Made to my voices! doing and hangs upon them!\n",
+      "Take it to down our foes and hates with stain,\n",
+      "Which thus follows slay with on I meland,\n",
+      "What I am after her to her fearful haunt it?\n",
+      "\n",
+      "PAULINA:\n",
+      "But you are well to hold the king.\n",
+      "\n",
+      "ISABELLA:\n",
+      "And I will not go royalty to thy hand.\n",
+      "\n",
+      "LUCIO:\n",
+      "Since I do not well in such goodly talk of.\n",
+      "I think I have a stay of it!\n",
+      "\n",
+      "HENRY BOLINGBROKE:\n",
+      "Who say I hate been a day's mind;\n",
+      "Till we here and so very little and way,\n",
+      "And wash the city has nest seen the feast.\n",
+      "\n",
+      "DUCHESS OF YORK:\n",
+      "No, by the matter.\n",
+      "\n",
+      "ISABELLA:\n",
+      "Flitter than desire never yet looks so.\n",
+      "\n",
+      "HENRY BOLINGBROKE:\n",
+      "I am not possible perceived\n",
+      "And both place, where I may not rafes,\n",
+      "And like me one air. What you'll your love day?\n",
+      "\n",
+      "KING RICHARD II:\n",
+      "Then be thou--\n",
+      "\n",
+      "GLOUCESTER:\n",
+      "No, Lord Hastings:\n",
+      "Else queen, though my trowbers grands me to-morrow\n",
+      "Here to Bolingbroke's match;\n",
+      "When the your life and spur at homely speak.\n",
+      "\n",
+      "BUCKINGHAM:\n",
+      "My father was I follow: if you be your your kingdom,\n",
+      "My approbations an"
      ]
     }
    ],
    "source": [
     "g_cuda = torch.Generator(device='cuda')\n",
     "\n",
+    "seed = \"\"\"\n",
+    "Plot histograms of the gradient values during training. If you notice a significant number of gradients are near zero (vanishing gradients) or very large values (exploding gradients), it could be a problem. TensorBoard is a useful tool for visualizing these histograms.\n",
+    "\"\"\"\n",
+    "\n",
+    "contexts = torch.tensor(encode_text(seed), dtype=torch.int32).to('cuda')\n",
+    "GEN_LENGTH=1024\n",
     "\n",
     "model.eval()\n",
     "for i in range(GEN_LENGTH):\n",
     "    # What happens if GEN_LENGTH > CONTEXT? don't worry about it\n",
     "    #x = F.pad(contexts[:, -BLOCK_SIZE:], (0, BLOCK_SIZE - contexts.size(0)), \"constant\", 0)\n",
     "    x = contexts[-BLOCK_SIZE:]\n",
+    "    if x.size(0) < BLOCK_SIZE:\n",
+    "        x = F.pad(x, (0, BLOCK_SIZE - x.size(0)), \"constant\", 0).unsqueeze(0) # B*T\n",
+    "    else:\n",
+    "        x = x.unsqueeze(0)\n",
+    "\n",
     "    preds = model.infer(x)\n",
     "    preds = preds.squeeze(0)\n",
     "    probs = torch.softmax(preds, dim=-1)\n",
     "\n",
     "    # TODO: Broken because of bug with the trailing 0s. FIX THIS\n",
+    "    # next_char = torch.multinomial(torch.exp(preds[(-1 if i >= BLOCK_SIZE else i), :]), num_samples=1, generator=g_cuda)\n",
+    "    next_char = torch.multinomial(torch.exp(preds[-1, :]), num_samples=1, generator=g_cuda)\n",
+    "\n",
     "    #context = torch.cat(context, next_char)\n",
     "    contexts = torch.cat((contexts, next_char), dim=0)\n",
     "    print(decode_text(next_char.cpu().numpy())[-1], end=\"\")\n",