kyrylokumar committed on
Commit 35e23cc · verified · 1 Parent(s): 13542a6

Added extra files

Files changed (15)
  1. .gitattributes +1 -0
  2. README.md +69 -0
  3. bnb-4.pt +3 -0
  4. bnb-8.pt +3 -0
  5. bnb-nf4.pt +3 -0
  6. config.json +31 -0
  7. ggml_models/gpt2-hf.pt +3 -0
  8. gpt2.ggml +3 -0
  9. main1.py +114 -0
  10. main2.py +97 -0
  11. pytorch_model.bin +3 -0
  12. q1-full-quant.pt +3 -0
  13. q3.ipynb +309 -0
  14. quant.py +68 -0
  15. tokenizer.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ gpt2.ggml filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,69 @@
+ ## Part 1
+
+ Normal model
+ Memory usage of model alone = 510.342192
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 838.783488
+ 100%|█████████████████████████████████████▊| 489/491 [00:25<00:00, 18.97it/s]
+ Loss = 26.38488006591797
+ Time taken: 25.795103549957275
+
+ Full model quant
+ Memory usage of model alone = 294.250369
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 1465.776128
+ 100%|█████████████████████████████████████▊| 489/491 [00:21<00:00, 22.39it/s]
+ Loss = 26.954803466796875
+ Time taken: 21.855380058288574
+
+ Full model without lm_head
+ Memory usage of model alone = 255.602736
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 1269.30176
+ 100%|█████████████████████████████████████▊| 489/491 [00:21<00:00, 22.68it/s]
+ Loss = 26.41402816772461
+ Time taken: 21.578929662704468
+
+ Only LM head
+ Memory usage of model alone = 548.989825
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 1036.319744
+ 100%|█████████████████████████████████████▊| 489/491 [00:20<00:00, 23.39it/s]
+ Loss = 26.924053192138672
+ Time taken: 20.919220209121704
+
+ Last 4 attention layers
+ Memory usage of model alone = 425.42904
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 983.949824
+ 100%|█████████████████████████████████████▊| 489/491 [00:20<00:00, 23.40it/s]
+ Loss = 26.39584732055664
+ Time taken: 20.912957668304443
+
+ Only q,k,v
+ Memory usage of model alone = 425.425968
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 989.827584
+ 100%|█████████████████████████████████████▊| 489/491 [00:21<00:00, 23.11it/s]
+ Loss = 26.396583557128906
+ Time taken: 21.17274236679077
+
+
+ ## Part 2
+ 4 bit model
+ Memory usage of model alone = 134.060568
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 308.803072
+ 100%|█████████████████████████████████████▊| 489/491 [00:16<00:00, 29.78it/s]
+ Loss = 31.296875
+ Time taken: 16.42749333381653
+
+ `low_cpu_mem_usage` was None, now set to True since model is quantized.
+ 8 bit model
+ Memory usage of model alone = 176.527896
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 494.142976
+ 100%|█████████████████████████████████████▊| 489/491 [00:29<00:00, 16.70it/s]
+ Loss = 26.5625
+ Time taken: 29.27569341659546
+
+ `low_cpu_mem_usage` was None, now set to True since model is quantized.
+ 4 bit nf4 model
+ Memory usage of model alone = 134.060568
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 494.85824
+ 100%|█████████████████████████████████████▊| 489/491 [00:15<00:00, 30.64it/s]
+ Loss = 28.375
+ Time taken: 15.961309671401978
+
+
bnb-4.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9145e9bbbad9f4b20ab918ff7d06d45ed6faad24d4f523f1263474557e68bc36
+ size 126879130
bnb-8.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2012bc265e11a8fce96cbeded25165467f489ab30ccf3995089ce0f85e088e69
+ size 164347802
bnb-nf4.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d79894f1fe69972458d10591ba4acf90dc70ddd35baed120c043955e498e95a
+ size 126894826
config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "resid_pdrop": 0.1,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "vocab_size": 50257
+ }
ggml_models/gpt2-hf.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:051959f78a781d69aa46be8730e01960adb7b346817788a2ef7f7cffabd2263b
+ size 548141490
gpt2.ggml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:295fbc748b5ccb10d2ede7936729cff2f5b1243df48f05a274c22c8e503399ba
+ size 177668768
main1.py ADDED
@@ -0,0 +1,114 @@
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import re
+ import transformers
+ import torch
+ from tqdm import tqdm
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+ import warnings
+ warnings.filterwarnings("ignore")
+ device = "cuda"
+ model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
+ tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
+
+ from datasets import load_dataset
+
+ test = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")
+ # print(len(test))
+ encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")
+ import time
+ import gc
+ def run_experiment(model):
+     print(f'Memory usage of model alone = {model.get_memory_footprint()/10**6}')
+     max_length = model.config.n_positions
+     stride = 512
+     seq_len = encodings.input_ids.size(1)
+
+     nlls = []
+     start_time = time.time()
+     prev_end_loc = 0
+     for begin_loc in tqdm(range(0, seq_len, stride)):
+         end_loc = min(begin_loc + max_length, seq_len)
+         trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
+         input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
+         target_ids = input_ids.clone()
+         target_ids[:, :-trg_len] = -100
+
+         with torch.no_grad():
+             outputs = model(input_ids, labels=target_ids)
+
+             # loss is calculated using CrossEntropyLoss which averages over valid labels
+             neg_log_likelihood = outputs.loss
+
+         if begin_loc == 0:
+             print(f'Memory usage at forward pass = {torch.cuda.memory_allocated(0)/10**6}')
+         nlls.append(neg_log_likelihood)
+
+         prev_end_loc = end_loc
+         if end_loc == seq_len:
+             break
+
+     ppl = torch.exp(torch.stack(nlls).mean())  # perplexity, printed below under the label "Loss"
+     print(f'Loss = {ppl.item()}')
+     print(f'Time taken: {time.time() - start_time}')
+ from quant import perform_quantization
+
+ model_type = 0
+
+
+ if model_type == 0:
+     ## Normal
+     print('Normal model')
+     run_experiment(model)
+     print()
+
+
+ ## Full model quant (including lm_head)
+ if model_type == 0:
+     print('Full model quant')
+     perform_quantization(model)
+     torch.save(model, 'q1-full-quant.pt')
+     # print(model)
+     run_experiment(model)
+     print()
+
+ # Without lm_head
+ if model_type == 0:
+     print('Full model without lm_head')
+     model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
+     perform_quantization(model, regex=r"transformer\.h\.\d+\.[a-zA-Z]+")
+     # print(model)
+     run_experiment(model)
+     print()
+
+ # Only lm_head
+ if model_type == 0:
+     print('Only LM head')
+     model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
+     perform_quantization(model, regex=r"[\w.]*lm_head[\w.]*")
+     # print(gc.collect())
+     # print(model)
+     run_experiment(model)
+     print()
+
+ # Last 4 layers
+ if model_type == 0:
+     print('Last 4 attention layers')
+     model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
+     perform_quantization(model, regex=r"transformer\.h\.(8|9|10|11)\.[a-zA-Z]+")
+     # print(gc.collect())
+     # print(model)
+     run_experiment(model)
+     print()
+
+ # Only q,k,v
+ if model_type == 0:
+     print('Only q,k,v')
+     model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
+     perform_quantization(model, regex=r"[\w.]*attn[\w.]*")
+     # print(model)
+     run_experiment(model)
+     print()
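The experiments above differ only in which submodules the regex passed to `perform_quantization` selects. A small sanity-check sketch (not part of `main1.py`; the names in the comment are examples) for previewing what a pattern will match before quantizing:

```python
# Illustrative sketch: list the GPT-2 submodule names a regex would select,
# mirroring the prefix matching used in quant.perform_quantization.
import re
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
pattern = re.compile(r"transformer\.h\.(8|9|10|11)\.[a-zA-Z]+")  # last 4 blocks
matched = [name for name, _ in model.named_modules() if pattern.match(name)]
print(matched[:5])  # e.g. ['transformer.h.8.ln_1', 'transformer.h.8.attn', ...]
```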
main2.py ADDED
@@ -0,0 +1,97 @@
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import re
+ import transformers
+ import torch
+ from tqdm import tqdm
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+ import warnings
+ warnings.filterwarnings("ignore")
+ device = "cuda"
+
+ tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
+
+ from datasets import load_dataset
+
+ test = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")
+ # print(len(test))
+ encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")
+ import time
+ import gc
+ def run_experiment(model):
+     print(f'Memory usage of model alone = {model.get_memory_footprint()/10**6}')
+     max_length = model.config.n_positions
+     stride = 512
+     seq_len = encodings.input_ids.size(1)
+
+     nlls = []
+     start_time = time.time()
+     prev_end_loc = 0
+     for begin_loc in tqdm(range(0, seq_len, stride)):
+         end_loc = min(begin_loc + max_length, seq_len)
+         trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
+         input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
+         target_ids = input_ids.clone()
+         target_ids[:, :-trg_len] = -100
+
+         with torch.no_grad():
+             outputs = model(input_ids, labels=target_ids)
+
+             # loss is calculated using CrossEntropyLoss which averages over valid labels
+             neg_log_likelihood = outputs.loss
+
+         if begin_loc == 0:
+             print(f'Memory usage at forward pass = {torch.cuda.memory_allocated(0)/10**6}')
+         nlls.append(neg_log_likelihood)
+
+         prev_end_loc = end_loc
+         if end_loc == seq_len:
+             break
+
+     ppl = torch.exp(torch.stack(nlls).mean())  # perplexity, printed below under the label "Loss"
+     print(f'Loss = {ppl.item()}')
+     print(f'Time taken: {time.time() - start_time}')
+
+
+ from transformers import BitsAndBytesConfig
+
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+ )
+ model = AutoModelForCausalLM.from_pretrained("gpt2", quantization_config=bnb_config)
+
+ ## 4bit
+ print('4 bit model')
+ run_experiment(model)
+
+ torch.save(model, 'bnb-4.pth')
+ print()
+
+ ## 8bit
+ bnb_config = BitsAndBytesConfig(
+     load_in_8bit=True,
+ )
+ model = AutoModelForCausalLM.from_pretrained("gpt2", quantization_config=bnb_config)
+ print('8 bit model')
+ run_experiment(model)
+ torch.save(model, 'bnb-8.pth')
+ print()
+
+
+ ## nf4 bit
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+ )
+ model = AutoModelForCausalLM.from_pretrained("gpt2", quantization_config=bnb_config)
+ print('4 bit nf4 model')
+ run_experiment(model)
+ torch.save(model, 'bnb-nf4.pth')
+ print()
+
+
+
+
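`main2.py` uses the default compute settings for the bitsandbytes configs. For reference, `BitsAndBytesConfig` also exposes a compute dtype and double quantization for the 4-bit path; a sketch of such a variant (not used in this repo's runs):

```python
# Sketch only: an nf4 config with an explicit compute dtype and double quantization.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # run matmuls in fp16
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
)
model = AutoModelForCausalLM.from_pretrained("gpt2", quantization_config=bnb_config)
```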
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e7f8cf8154c29d4014a82a86ebc8ad4eeef1525e78262257dabccd2b6d3065cf
+ size 548143050
q1-full-quant.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee9a5b08bb9e8875f406079201e4d77ba4f9db2361ba216b7a2e955af6a7055f
+ size 294906758
q3.ipynb ADDED
@@ -0,0 +1,309 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/kyrylo/Sem-7/Anlp/Grokking/Minimal/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+ "from huggingface_hub import hf_hub_download, upload_folder\n",
+ "from pathlib import Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'tokenizer.json'"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "\n",
+ "model_name = \"gpt2\" # Replace with the Hugging Face model name you want to convert\n",
+ "local_dir = \"./\" # to store the GGML model\n",
+ "\n",
+ "model_path = hf_hub_download(repo_id='openai-community/gpt2', filename=\"pytorch_model.bin\", local_dir=local_dir) \n",
+ "model_path = hf_hub_download(repo_id='openai-community/gpt2', filename=\"config.json\", local_dir=local_dir) \n",
+ "model_path = hf_hub_download(repo_id='openai-community/gpt2', filename=\"tokenizer.json\", local_dir=local_dir) \n",
+ "model_path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_96016/408161957.py:1: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+ " model =torch.load(f'./pytorch_model.bin')\n"
+ ]
+ }
+ ],
+ "source": [
+ "model =torch.load(f'./pytorch_model.bin')\n",
+ "torch.save(model, './pytorch_model.bin', _use_new_zipfile_serialization=True)\n",
+ "# ! rm pytorch_model.bin"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO:hf-to-gguf:Loading model: \n",
+ "INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only\n",
+ "INFO:hf-to-gguf:Exporting model...\n",
+ "INFO:hf-to-gguf:gguf: loading model part 'pytorch_model.bin'\n",
+ "INFO:hf-to-gguf:token_embd.weight, torch.float32 --> Q8_0, shape = {768, 50257}\n",
+ "INFO:hf-to-gguf:output.weight, torch.float32 --> Q8_0, shape = {768, 50257}\n",
+ "INFO:hf-to-gguf:position_embd.weight, torch.float32 --> F32, shape = {768, 1024}\n",
+ "INFO:hf-to-gguf:blk.0.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.0.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.0.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.0.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.0.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.0.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.0.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.0.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.0.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.0.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.0.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.0.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.1.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.1.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.1.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.1.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.1.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.1.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.1.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.1.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.1.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.1.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.1.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.1.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.2.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.2.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.2.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.2.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.2.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.2.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.2.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.2.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.2.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.2.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.2.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.2.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.3.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.3.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.3.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.3.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.3.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.3.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.3.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.3.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.3.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.3.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.3.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.3.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.4.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.4.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.4.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.4.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.4.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.4.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.4.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.4.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.4.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.4.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.4.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.4.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.5.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.5.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.5.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.5.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.5.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.5.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.5.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.5.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.5.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.5.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.5.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.5.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.6.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.6.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.6.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.6.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.6.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.6.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.6.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.6.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.6.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.6.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.6.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.6.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.7.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.7.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.7.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.7.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.7.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.7.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.7.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.7.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.7.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.7.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.7.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.7.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.8.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.8.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.8.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.8.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.8.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.8.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.8.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.8.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.8.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.8.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.8.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.8.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.9.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.9.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.9.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.9.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.9.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.9.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.9.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.9.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.9.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.9.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.9.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.9.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.10.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.10.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.10.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.10.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.10.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.10.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.10.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.10.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.10.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.10.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.10.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.10.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.11.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.11.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.11.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.11.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.11.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.11.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.11.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.11.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.11.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.11.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.11.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.11.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:output_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:output_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:Set meta model\n",
+ "INFO:hf-to-gguf:Set model parameters\n",
+ "INFO:hf-to-gguf:Set model tokenizer\n",
+ "DEBUG:hf-to-gguf:chktok: [198, 220, 628, 220, 628, 198, 220, 197, 220, 197, 197, 220, 197, 198, 220, 220, 198, 220, 220, 220, 198, 220, 220, 220, 220, 198, 220, 220, 220, 220, 220, 198, 8582, 248, 222, 357, 11265, 8, 30325, 114, 447, 235, 8582, 234, 104, 37929, 357, 48101, 795, 13210, 271, 1673, 36686, 515, 8, 14519, 227, 12520, 99, 247, 8582, 99, 247, 513, 4747, 23460, 513, 20370, 23460, 2091, 23460, 20370, 23460, 24840, 23460, 2091, 20370, 513, 13, 18, 513, 492, 18, 513, 986, 18, 28053, 252, 222, 157, 252, 114, 157, 252, 241, 157, 253, 233, 157, 252, 237, 157, 253, 224, 157, 252, 244, 157, 252, 115, 157, 252, 253, 157, 253, 223, 157, 252, 253, 157, 252, 95, 157, 252, 114, 157, 252, 227, 47249, 223, 5633, 22755, 239, 46349, 111, 28839, 101, 18040, 32432, 98, 43291, 1485, 1415, 24309, 25465, 171, 121, 252, 40103, 1421, 18604, 12466, 121, 16843, 141, 231, 15166, 12466, 121, 16142, 12466, 239, 141, 232, 30143, 140, 111, 16142, 21169, 21727, 31583, 18849, 705, 39115, 6, 33153, 15506, 63, 15931, 15931, 16317, 13896, 3228, 9805, 3548, 314, 1053, 587, 705, 44040, 339, 338, 612, 11, 705, 2200, 345, 1654, 30, 705, 44, 407, 1654, 314, 1183, 787, 340, 11, 705, 35, 345, 588, 617, 8887, 30, 775, 6, 26979, 257, 6, 75, 43]\n",
+ "DEBUG:hf-to-gguf:chkhsh: 3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454\n",
+ "DEBUG:hf-to-gguf:tokenizer.ggml.pre: 'gpt-2'\n",
+ "DEBUG:hf-to-gguf:chkhsh: 3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454\n",
+ "INFO:gguf.vocab:Adding 50000 merge(s).\n",
+ "INFO:gguf.vocab:Setting special token type bos to 50256\n",
+ "INFO:gguf.vocab:Setting special token type eos to 50256\n",
+ "INFO:hf-to-gguf:Set model quantization version\n",
+ "INFO:gguf.gguf_writer:Writing the following files:\n",
+ "INFO:gguf.gguf_writer:gpt2.ggml: n_tensors = 149, total_size = 175.9M\n",
+ "Writing: 100%|█████████████████████████████| 176M/176M [00:01<00:00, 129Mbyte/s]\n",
+ "INFO:hf-to-gguf:Model successfully exported to gpt2.ggml\n"
+ ]
+ }
+ ],
+ "source": [
+ "!../llama.cpp/convert_hf_to_gguf.py --outfile {local_dir}/{model_name}.ggml --outtype q8_0 --verbose ./"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from huggingface_hub import create_repo, upload_folder\n",
+ "\n",
+ "\n",
+ "repo_id = \"kyrylokumar/gpt2-quantzed-gguf\" \n",
+ "create_repo(repo_id=repo_id, exist_ok=True) # exist_ok=True avoids errors if the repo already exists\n",
+ "\n",
+ "# Upload the folder\n",
+ "local_dir = \"./\" # Path to the directory you want to upload\n",
+ "upload_folder(\n",
+ " repo_id=repo_id,\n",
+ " folder_path=local_dir,\n",
+ " commit_message=\"Added extra files\", # Optional commit message\n",
+ " ignore_patterns=\".git*\", # Optional: ignore .git files and other patterns\n",
+ ")\n",
+ "\n",
+ "print(f\"Directory '{local_dir}' pushed to: {repo_id}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Minimal",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
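The notebook exports `gpt2.ggml` in GGUF format with Q8_0 weights via llama.cpp's `convert_hf_to_gguf.py` and uploads the folder to `kyrylokumar/gpt2-quantzed-gguf`. One possible way to smoke-test the exported file (an assumption, not shown in the notebook; requires the `llama-cpp-python` package):

```python
# Illustrative sketch: load the exported GGUF file and run a short completion.
from llama_cpp import Llama

llm = Llama(model_path="./gpt2.ggml")
out = llm("The quick brown fox", max_tokens=8)
print(out["choices"][0]["text"])
```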
quant.py ADDED
@@ -0,0 +1,68 @@
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import re
+ import transformers
+
+ class ReplacedLinearLayer(nn.Module):
+     def __init__(self, input_dim, output_dim, if_conv=True):
+         super().__init__()
+
+         self.register_buffer('weights', torch.zeros([output_dim, input_dim], dtype=torch.int8))
+         self.register_buffer('scales', torch.zeros(output_dim, dtype=torch.float32))
+
+         # self.register_buffer("bias", torch.zeros((1, output_dim), dtype = torch.float32))
+         self.bias = None
+         self.if_conv = if_conv
+
+     def forward(self, x):
+         fp32_weights = self.weights.to(x.dtype)  # cast int8 weights to the activation dtype
+         # print(fp32_weights.shape, self.scales.shape, )
+         try:
+             x = F.linear(x, fp32_weights) * self.scales  # rescale each output feature by its per-row scale
+             if self.bias is not None:
+                 x += self.bias
+         except Exception as e:
+             print(e)
+             print(fp32_weights.shape, self.scales.shape, )
+
+             exit()
+         return x
+
+     def do_quantization(self, W, ):
+         if self.if_conv:
+             W32 = W.clone().squeeze().T  # transformers Conv1D stores weights as (in, out); transpose to (out, in)
+         else:
+             W32 = W.clone()
+
+         scales = (torch.max(W32.abs(), dim=-1)[0]/127).to(torch.float32)  # per-row absmax scale
+         self.scales = scales
+         self.weights = torch.round(W32 / scales[:, None]).to(torch.int8)
+
+
+ def perform_quantization(module, regex='.*'):
+     pattern = re.compile(regex)
+     for name, node in module.named_modules():
+         for name2, child in node.named_children():
+             if (isinstance(child, nn.Linear) or isinstance(child, transformers.pytorch_utils.Conv1D)) and pattern.match(f'{name}.{name2}'):
+                 # print(name, name2, node, child)
+                 fp32_weight, fp32_bias = child.weight, child.bias
+
+                 quant_module = ReplacedLinearLayer(child.weight.shape[1], child.weight.shape[0], if_conv=isinstance(child, transformers.pytorch_utils.Conv1D))
+                 setattr(node, name2, quant_module)
+                 # print(getattr(node, name2).custom_weights)
+                 # return
+                 getattr(node, name2).do_quantization(fp32_weight)
+                 if fp32_bias is not None:
+                     getattr(node, name2).bias = fp32_bias
+                 # print(getattr(node, name2).weights)
+
+     # return
+
+
+
+
+
+
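`ReplacedLinearLayer.do_quantization` uses symmetric per-row absmax quantization: each output row is scaled by `max|w| / 127`, stored as int8, and rescaled after the matmul in `forward`. A tiny self-contained demonstration of that scheme (illustrative only, not part of `quant.py`):

```python
# Per-row absmax int8 quantization, as in ReplacedLinearLayer.do_quantization.
import torch

W = torch.randn(4, 8)                                    # fp32 weight, one row per output unit
scales = W.abs().max(dim=-1).values / 127                # per-row scale factors
W_int8 = torch.round(W / scales[:, None]).to(torch.int8) # quantized weights
W_deq = W_int8.to(torch.float32) * scales[:, None]       # dequantized approximation
print((W - W_deq).abs().max())                           # worst-case error is about scales/2
```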
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff