{ "cells": [ { "cell_type": "markdown", "id": "29d14fe0", "metadata": { "cellId": "hwmcjwsucnwczi4u66ftg", "id": "e13eff4e-c134-4dac-9523-07b297164250" }, "source": [ "# Example of Quantizating 7.1 billion Bloom with 8-bit weights\n", "\n", "Heavily inspired by [Hivemind's work](https://nbviewer.org/urls/huggingface.co/hivemind/gpt-j-6B-8bit/raw/main/convert-gpt-j.ipynb) and [joaoalvarenga's work](https://huggingface.co/joaoalvarenga/bloom-8bit)" ] }, { "cell_type": "code", "execution_count": 1, "id": "39f137ae", "metadata": { "cellId": "wg56t50s3la38havqevkme", "colab": { "base_uri": "https://localhost:8080/" }, "id": "699e94eb-3ce1-4788-999b-fb6d593ba7e9", "outputId": "764a6719-66d0-4ef7-df2d-4cfda0914f65" }, "outputs": [], "source": [ "#%pip install transformers==4.20.1\n", "#%pip install bitsandbytes\n", "#%pip install datasets\n", "#%pip install accelerate" ] }, { "cell_type": "markdown", "id": "53e4dd05", "metadata": { "cellId": "aklenvay105v0md7yy679m", "id": "0afea72c-691d-4719-a84a-663f1891af6e" }, "source": [ "### Load and convert original Bloom structure to 8-bit\n", "\n", "You can load an already compressed 8-bit version of Bloom from [OpenDungeon/bloom-7b1-8bit](https://huggingface.co/OpenDungeon/bloom-7b1-8bit/tree/main) with small monkey patching. But this notebook focuses on compression of Bloom, not usage." ] }, { "cell_type": "code", "execution_count": 2, "id": "e1ca3df9", "metadata": { "cellId": "ktgxcupgtcf8hhh2k1r2ij", "colab": { "base_uri": "https://localhost:8080/" }, "id": "xcdQSnYIk12Z", "outputId": "8d0fff65-4d34-41bd-f750-278a35ac9533" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/dm/.local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (3.0.4) doesn't match a supported version!\n", " warnings.warn(\"urllib3 ({}) or chardet ({}) doesn't match a supported \"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "===================================BUG REPORT===================================\n", "Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", "For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link\n", "================================================================================\n", "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching /usr/local/cuda/lib64...\n", "WARNING: No libcudart.so found! 
Install CUDA or the cudatoolkit package (anaconda)!\n", "CUDA SETUP: Loading binary /home/dm/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/dm/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/paths.py:27: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('8bitexperiments/f746d450-b748-4d1f-b3e3-9e4fd3f72d6e')}\n", " warn(\n", "/home/dm/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/paths.py:27: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n", " warn(\n", "/home/dm/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/paths.py:27: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/cuda/lib64')}\n", " warn(\n", "/home/dm/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:48: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.\n", " warn(\n" ] } ], "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import transformers\n", "from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise\n", "\n", "model_name = \"bigscience/bloom-7b1\"\n", "gpt = transformers.BloomForCausalLM.from_pretrained(model_name, cache_dir=\"mycache\")\n", "tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, cache_dir=\"mycache\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "b37255b0", "metadata": { "cellId": "wmew4wc0e3pztbva18lggg", "id": "YjLHVyIOkdCH" }, "outputs": [], "source": [ "def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):\n", " assert chunk_size % 4096 == 0\n", " code = None\n", " chunks = []\n", " absmaxes = []\n", " flat_tensor = matrix.view(-1)\n", " for i in range((matrix.numel() - 1) // chunk_size + 1):\n", " input_chunk = flat_tensor[i * chunk_size: (i + 1) * chunk_size].clone()\n", " quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)\n", " chunks.append(quantized_chunk)\n", " absmaxes.append(absmax_chunk)\n", " \n", " matrix_i8 = torch.cat(chunks).reshape_as(matrix)\n", " absmax = torch.cat(absmaxes)\n", " return matrix_i8, (absmax, code)\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "5c03f13b", "metadata": { "cellId": "zwcfu5ypstmsusllfldemc", "id": "StJJ6oickpZs" }, "outputs": [], "source": [ "from typing import Tuple\n", "from torch.cuda.amp import custom_fwd, custom_bwd\n", "\n", "\n", "class DequantizeAndLinear(torch.autograd.Function):\n", " @staticmethod\n", " @custom_fwd\n", " def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,\n", " absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):\n", "\n", " weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)\n", " ctx.save_for_backward(input, weights_quantized, absmax, code)\n", " ctx._has_bias = bias is not None\n", " return F.linear(input, weights_deq, bias)\n", "\n", " @staticmethod\n", " @custom_bwd\n", " def backward(ctx, grad_output: torch.Tensor):\n", " assert not ctx.needs_input_grad[1] and not ctx.needs_input_grad[2] and not ctx.needs_input_grad[3]\n", " input, weights_quantized, absmax, code = ctx.saved_tensors\n", " # grad_output: [*batch, 
out_features]\n", " weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)\n", " grad_input = grad_output @ weights_deq\n", " grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None\n", " return grad_input, None, None, None, grad_bias\n", "\n", "\n", "class BNBLinearWithAdapter(nn.Module):\n", " def __init__(self, weight, absmax, code, bias=None, adapter_dim=0):\n", " assert isinstance(bias, nn.Parameter) or bias is None\n", " super().__init__()\n", " self.out_features, self.in_features = weight.shape\n", " self.register_buffer(\"weight\", weight.requires_grad_(False))\n", " self.register_buffer(\"absmax\", absmax.requires_grad_(False))\n", " self.register_buffer(\"code\", code.requires_grad_(False))\n", " self.bias = bias\n", "\n", " if adapter_dim > 0:\n", " self.adapter = nn.Sequential(\n", " nn.Linear(self.in_features, adapter_dim, bias=False),\n", " nn.Linear(adapter_dim, self.out_features, bias=False),\n", " )\n", "\n", " nn.init.zeros_(self.adapter[1].weight)\n", " else:\n", " self.adapter = None\n", "\n", " def forward(self, input):\n", " out = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)\n", "\n", " if self.adapter:\n", " return self.adapter(input) + out\n", "\n", " return out\n", "\n", "\n", " @classmethod\n", " def from_linear(cls, linear: nn.Linear, **kwargs) -> \"FrozenBNBLinear\":\n", " weights_int8, state = quantize_blockise_lowmemory(linear.weight)\n", " return cls(weights_int8, *state, linear.bias, **kwargs)\n", "\n", " def __repr__(self):\n", " return f\"{self.__class__.__name__}({self.in_features}, {self.out_features})\"\n", "\n", "\n", "class BNBEmbeddingWithAdapter(nn.Module):\n", " def __init__(self, weight, absmax, code, adapter_dim=0):\n", " super().__init__()\n", " self.num_embeddings, self.embedding_dim = weight.shape\n", " self.register_buffer(\"weight\", weight.requires_grad_(False))\n", " self.register_buffer(\"absmax\", absmax.requires_grad_(False))\n", " self.register_buffer(\"code\", code.requires_grad_(False))\n", "\n", " if adapter_dim > 0:\n", " self.adapter = nn.Sequential(\n", " nn.Embedding(self.num_embeddings, adapter_dim),\n", " nn.Linear(adapter_dim, self.embedding_dim, bias=False),\n", " )\n", "\n", " nn.init.zeros_(self.adapter[1].weight)\n", " else:\n", " self.adapter = None\n", "\n", " def forward(self, input, **kwargs):\n", " with torch.no_grad():\n", " # note: both quantuized weights and input indices are *not* differentiable\n", " weight_deq = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)\n", " out = F.embedding(input, weight_deq, **kwargs)\n", " if self.adapter:\n", " return out + self.adapter(input, **kwargs)\n", "\n", " return out\n", "\n", " @classmethod\n", " def from_embedding(cls, embedding: nn.Embedding, **kwargs) -> \"FrozenBNBEmbedding\":\n", " weights_int8, state = quantize_blockise_lowmemory(embedding.weight)\n", " return cls(weights_int8, *state, **kwargs)\n", "\n", " def __repr__(self):\n", " return f\"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "92a58957", "metadata": { "cellId": "due8kcyko4fv3vxzrbin3", "id": "6LafYNhlktnt" }, "outputs": [], "source": [ "def bnbfy_(model, adapter_dim: int = 0): \n", " for module in list(model.transformer.h.modules()):\n", " for name, child in module.named_children():\n", " if isinstance(child, nn.Linear):\n", " print(name, child)\n", " setattr(module, name, BNBLinearWithAdapter.from_linear(child, 
adapter_dim=adapter_dim))\n", "\n", " elif isinstance(child, nn.Embedding):\n", " print(name, child)\n", " setattr(module, name, BNBEmbeddingWithAdapter.from_embedding(child, adapter_dim=adapter_dim))" ] }, { "cell_type": "code", "execution_count": 6, "id": "f2d513c6-cd72-411d-9a25-9a21e5c2b87c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "model size: 26966.156MB\n" ] } ], "source": [ "#!g1.1\n", "param_size = 0\n", "for param in gpt.parameters():\n", " param_size += param.nelement() * param.element_size()\n", "buffer_size = 0\n", "for buffer in gpt.buffers():\n", " buffer_size += buffer.nelement() * buffer.element_size()\n", "\n", "size_all_mb = (param_size + buffer_size) / 1024**2\n", "print('model size: {:.3f}MB'.format(size_all_mb))" ] }, { "cell_type": "code", "execution_count": 7, "id": "ab52cd5c", "metadata": { "cellId": "5269rte0cil8omgnvcif" }, "outputs": [ { "data": { "text/plain": [ "BloomForCausalLM(\n", " (transformer): BloomModel(\n", " (word_embeddings): Embedding(250880, 4096)\n", " (word_embeddings_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (h): ModuleList(\n", " (0): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (1): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (2): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (3): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " 
(post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (4): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (5): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (6): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (7): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (8): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, 
bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (9): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (10): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (11): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (12): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (13): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (14): BloomBlock(\n", " 
(input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (15): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (16): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (17): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (18): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (19): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, 
out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (20): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (21): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (22): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (23): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (24): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", 
" (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (25): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (26): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (27): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (28): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " (29): BloomBlock(\n", " (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (self_attention): BloomAttention(\n", " (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n", " (dense): Linear(in_features=4096, out_features=4096, bias=True)\n", " (attention_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (mlp): BloomMLP(\n", " (dense_h_to_4h): Linear(in_features=4096, 
out_features=16384, bias=True)\n", " (gelu_impl): BloomGelu()\n", " (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n", " )\n", " )\n", " )\n", " (ln_f): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (lm_head): Linear(in_features=4096, out_features=250880, bias=False)\n", ")" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#!g1.1\n", "gpt" ] }, { "cell_type": "code", "execution_count": 8, "id": "9280b510", "metadata": { "cellId": "a7nstbdt9vo9qikpzvo48c", "id": "jV3pGEGalDwz" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, 
out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", 
"query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n", "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n", "dense Linear(in_features=4096, out_features=4096, bias=True)\n", "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n", "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n" ] } ], "source": [ "#!g1.1\n", "bnbfy_(gpt, adapter_dim=0)" ] }, { "cell_type": "code", "execution_count": null, "id": "e35305f2", "metadata": { "cellId": "q5jafg9w9x0hg355icd4vo" }, "outputs": [], "source": [ "#!g1.1\n", "param_size = 0\n", "for param in gpt.parameters():\n", " param_size += param.nelement() * param.element_size()\n", "buffer_size = 0\n", "for buffer in gpt.buffers():\n", " buffer_size += buffer.nelement() * buffer.element_size()\n", "\n", "size_all_mb = (param_size + buffer_size) / 1024**2\n", "print('model size: {:.3f}MB'.format(size_all_mb))\n", "gpt.save_pretrained('bloom-7b1-8bit')" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "notebookId": "8f3ce20e-06a1-44f2-9373-2b6424b859a3", "notebookPath": "bloom8bit.ipynb" }, "nbformat": 4, "nbformat_minor": 5 }