Maykeye committed
Commit • be19c03 • 1 Parent(s): 476ca24
Initial commit: code w/o weights
Browse files
- README.md +13 -0
- mambabit.py +141 -0
- trainer.ipynb +237 -0
README.md
CHANGED
@@ -1,3 +1,16 @@
 ---
 license: apache-2.0
 ---
+
+Mamba Bit!
+
+Mamba with vocab size 2 bites again! This time we bite at TinyStories.
+I didn't bother preprocessing the stories at all: during training, the model picked a random character offset, converted the text to a bit string, and fed it to Mamba. This time I didn't forget the residual connections or the norm. As a result, the model was trained in BF16.
+
+Training code included.
+
+Example of running the model from the CLI:
+
+$ python mambabit.py "Run, kitten, run"
+
+Run, kitten, running and jumping. She saw a big tree and thought it would be fun to share the tree. So, she went to the tree and started to climb the tree. She saw a big tree and thought it would be fun to share the tree. So, she went to the tree and saw a big red ball.
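The bit conversion described above is the string_to_bits / bits_to_string pair defined in mambabit.py below. A minimal round-trip sketch (assuming mambabit.py is importable; 'A' is byte 0x41, i.e. 01000001 MSB-first):

```python
from mambabit import string_to_bits, bits_to_string

bits = string_to_bits("A")      # 8 bits per encoded byte, MSB first
print(bits.tolist())            # [0, 1, 0, 0, 0, 0, 0, 1]
print(bits_to_string(bits))     # 'A'
```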
mambabit.py
ADDED
import sys

import torch
import torch.nn as nn
from mamba_ssm.modules.mamba_simple import Mamba
from mamba_ssm.utils.generation import InferenceParams
from torch import Tensor
from tqdm.auto import tqdm

dim_model = 512  # model width
n_vocab = 2      # the entire vocabulary: bits 0 and 1
n_layers = 4     # number of Mamba blocks


@torch.no_grad()
def string_to_bits(text: str, msb=True, _cache={}) -> Tensor:
    # Unpack each byte of the UTF-8 encoding into 8 bits (MSB first by default).
    # The 256x8 lookup table is memoized in _cache.
    all_values = torch.arange(0, 256)
    if msb not in _cache:
        if msb:
            bits = [((all_values & (1 << i)) != 0).int()
                    for i in range(7, -1, -1)]
        else:
            bits = [((all_values & (1 << i)) != 0).int() for i in range(8)]
        bits_tensor = torch.stack(bits).mT
        _cache[msb] = bits_tensor
    else:
        bits_tensor = _cache[msb]
    binary = text.encode()
    raw = torch.frombuffer(binary, dtype=torch.uint8).int()
    return bits_tensor[raw].long().ravel()


@torch.no_grad()
def bits_to_string(bits: Tensor, msb=True):
    # Inverse of string_to_bits; chr() maps byte values directly,
    # so anything non-ASCII comes back Latin-1 style.
    if bits.dim() == 2:
        return [bits_to_string(t) for t in bits]
    assert bits.dim() == 1
    assert len(bits) % 8 == 0
    if msb:
        factors = torch.tensor([2**i for i in range(7, -1, -1)])
    else:
        factors = torch.tensor([2**i for i in range(8)])
    factors = factors.to(device=bits.device)
    as_bytes = bits.view(-1, 8)
    as_bytes = (as_bytes * factors).sum(-1)
    return ''.join([chr(x) for x in as_bytes])  # type: ignore


class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(n_vocab, dim_model)

    def forward(self, x):
        return self.emb(x)


class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.norm = nn.LayerNorm(dim_model)
        self.decoder = nn.Linear(dim_model, n_vocab, False)

    def forward(self, x):
        x = self.norm(x)
        x = self.decoder(x)
        return x


class MambaLayer(nn.Module):
    # Pre-norm Mamba block with a residual connection.
    def __init__(self, layer_idx=None):
        super().__init__()
        self.in_norm = nn.LayerNorm(dim_model)
        self.mamba = Mamba(dim_model, layer_idx=layer_idx)

    def forward(self, x, inference_params=None):
        residual = x
        x = self.in_norm(x)
        x = self.mamba(x, inference_params=inference_params)
        x = residual + x
        return x


class MambaBit(nn.Module):
    def __init__(self):
        super().__init__()
        self.enc = Encoder()
        self.layers = nn.ModuleList(
            [MambaLayer(layer_idx=idx) for idx in range(n_layers)])
        self.dec = Decoder()

    def forward(self, x, inference_params=None):
        x = self.enc(x)
        for layer in self.layers:
            # note: each layer also adds its own residual internally
            x = x + layer(x, inference_params=inference_params)
        x = self.dec(x)
        return x


# test using O(N^2) cacheless, stateless algorithm:
# re-run the whole prefix for every generated bit
@torch.no_grad()
def test_n2(m: MambaBit, prompt: str, chars=10):
    x = string_to_bits(prompt).cuda()[None]
    process = chars * 8
    for i in tqdm(range(process)):
        y = m(x)
        new = y[:, -1:].argmax(-1)
        x = torch.cat((x, new), 1)
    return bits_to_string(x)


# test using O(N) by reusing the recurrent state between steps
@torch.no_grad()
def test_n(m: MambaBit, prompt: str, chars=10):
    x = string_to_bits(prompt).cuda()[None]
    process = chars * 8

    inference_params = InferenceParams(
        max_seqlen=x.numel() + process,
        max_batch_size=1)

    y = m(x, inference_params=inference_params)
    new = y[:, -1:].argmax(-1)
    for i in tqdm(range(process)):
        x = torch.cat((x, new), 1)
        # a non-zero offset switches the Mamba layers to single-step mode
        inference_params.seqlen_offset = x.numel() + i
        y = m(new, inference_params=inference_params)
        new = y[:, -1:].argmax(-1)
    return bits_to_string(x)


def run():
    mamba_bit = MambaBit().bfloat16().cuda()
    # the trained weights are not part of this commit
    mamba_bit.load_state_dict(torch.load("mamba_bit.tiny.bin"))

    prompt = "Once upon a time" if len(sys.argv) != 2 else sys.argv[1]
    s = test_n(mamba_bit, prompt, chars=256)[0]
    print(s)


def model_numel(m: nn.Module):
    return sum(p.numel() for p in m.parameters())


if __name__ == "__main__":
    run()
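A quick way to see the model's size even without the weights; a minimal sketch, assuming mamba_ssm is installed (the exact total depends on the mamba_ssm version's internal layout, so no number is asserted here):

```python
from mambabit import MambaBit, model_numel

model = MambaBit()  # dim_model=512, 4 Mamba blocks, 2-token vocab
print(f"{model_numel(model):,} parameters")
```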
trainer.ipynb
ADDED
In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F  # cross_entropy, used by train() below
from torch import Tensor
import random
from tqdm.auto import tqdm
from mamba_ssm.modules.mamba_simple import Mamba
from pathlib import Path
from mambabit import string_to_bits, bits_to_string

def model_numel(m: nn.Module):
    return sum(p.numel() for p in m.parameters())

In [2]:
train_txt = Path("~/Downloads/TinyStories/TinyStoriesV2-GPT4-train.txt").expanduser().read_text()

In [3]:
len(train_txt)

Out[3]:
2226845268

In [4]:
def random_batches(raw_text: str, n_batch: int, bs: int):
    assert bs % 8 == 0, "have mercy"
    bs_bytes = bs // 8
    max_allowed_pos = len(raw_text) - bs_bytes

    texts = []
    for i in range(n_batch):
        pos = random.randint(0, max_allowed_pos)
        texts.append(raw_text[pos:pos+bs_bytes])

    tensors = [string_to_bits(text) for text in texts]
    # multi-byte unicode encodes to more than one byte per char,
    # so the bit lengths can be non-uniform; trim them to the shortest
    common_len = min(t.shape[0] for t in tensors)
    tensors = [t[:common_len] for t in tensors]
    batch = torch.stack(tensors)
    return batch.to("cuda")
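A minimal sanity sketch of what the sampler returns (illustrative only; the `.to("cuda")` move assumes a GPU, and exact values depend on the random offsets). With bs=256 bits, each sample covers 256 // 8 = 32 characters of text:

```python
# Hypothetical check of the sampler's output shape: (n_batch, bs) zeros and ones,
# possibly trimmed shorter if a slice hit multi-byte unicode.
demo = random_batches("Once upon a time there was a tiny story. " * 10, n_batch=4, bs=256)
print(demo.shape, demo.dtype)  # torch.Size([4, 256]) torch.int64
```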
In [5]:
from mambabit import MambaBit, n_vocab

In [6]:
mamba_bit = MambaBit().cuda().bfloat16()

In [7]:
if False:
    mamba_bit.load_state_dict(torch.load("mamba_bit.tiny.bin"))

In [8]:
def train(m: nn.Module,
          n_epoch: int = 100,
          n_batch: int = 4,
          bs: int = 256):
    opt = torch.optim.AdamW(m.parameters(), lr=0.0005, fused=True)

    for e in (bar := tqdm(range(n_epoch))):
        b = random_batches(train_txt, n_batch, bs)

        # next-bit objective: logits at position t are scored against bit t+1
        y_pred = m(b)
        y_pred = y_pred[:, :-1].reshape(-1, n_vocab)
        y_true = b[:, 1:].ravel()

        loss = F.cross_entropy(y_pred, y_true)
        loss.backward()
        opt.step()
        opt.zero_grad()

        l = loss.item()
        bar.set_description(f"L:{l:.10f}")
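The shift in train() is the standard next-token objective, just at bit level. A toy illustration (values are made up; shapes mirror what m(b) returns):

```python
import torch
import torch.nn.functional as F

b = torch.tensor([[0, 1, 1, 0]])         # a batch of 4 bits: b0..b3
logits = torch.randn(1, 4, 2)            # stand-in for m(b): (batch, seq, n_vocab)
y_pred = logits[:, :-1].reshape(-1, 2)   # positions 0..2 predict the *next* bit
y_true = b[:, 1:].ravel()                # targets b1..b3 -> tensor([1, 1, 0])
loss = F.cross_entropy(y_pred, y_true)   # the last position has no target
```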
In [34]:
if True:
    train(mamba_bit, 10000, 10, 8*2560)

L:0.0805664062: 100%|██████████| 10000/10000 [6:15:25<00:00, 2.25s/it]
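For scale: each step samples 10 windows of 8*2560 = 20480 bits, i.e. 2560 characters, so the run covers about 10000 * 10 * 2560 = 2.56e8 characters, roughly a tenth of the ~2.2e9-character training file (offsets are drawn with replacement, so some text repeats and some is never seen).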
In [36]:
torch.save(mamba_bit.state_dict(), "mamba_bit.tiny.bin")

In [42]:
# TEST
@torch.no_grad()
def test(prompt: str, chars=10):
    x0 = string_to_bits(prompt).cuda()[None]
    x = x0.clone()
    process = chars * 8
    for _ in tqdm(range(process)):
        y = mamba_bit(x)
        new = y[:, -1:].argmax(-1)
        x = torch.cat((x, new), 1)
    return bits_to_string(x)

print(test("Once upon a time, there lived a kitten", chars=128))

100%|██████████| 1024/1024 [00:01<00:00, 760.83it/s]
['Once upon a time, there lived a kitten named Lily. Lily loved to play with her friends, and they all liked to play together.\nOne day, Lily and Ben were playing in the']

(Notebook metadata: kernel "sd", Python 3.12.3, nbformat 4.)