kyrylokumar committed on
Commit 35e23cc · verified · 1 Parent(s): 13542a6

Added extra files

Files changed (15)
  1. .gitattributes +1 -0
  2. README.md +69 -0
  3. bnb-4.pt +3 -0
  4. bnb-8.pt +3 -0
  5. bnb-nf4.pt +3 -0
  6. config.json +31 -0
  7. ggml_models/gpt2-hf.pt +3 -0
  8. gpt2.ggml +3 -0
  9. main1.py +114 -0
  10. main2.py +97 -0
  11. pytorch_model.bin +3 -0
  12. q1-full-quant.pt +3 -0
  13. q3.ipynb +309 -0
  14. quant.py +68 -0
  15. tokenizer.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ gpt2.ggml filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,69 @@
+ ## Part 1
+
+ Normal model
+ Memory usage of model alone = 510.342192
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 838.783488
+ 100%|█████████████████████████████████████▊| 489/491 [00:25<00:00, 18.97it/s]
+ Loss = 26.38488006591797
+ Time taken: 25.795103549957275
+
+ Full model quant
+ Memory usage of model alone = 294.250369
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 1465.776128
+ 100%|█████████████████████████████████████▊| 489/491 [00:21<00:00, 22.39it/s]
+ Loss = 26.954803466796875
+ Time taken: 21.855380058288574
+
+ Full model without lm_head
+ Memory usage of model alone = 255.602736
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 1269.30176
+ 100%|█████████████████████████████████████▊| 489/491 [00:21<00:00, 22.68it/s]
+ Loss = 26.41402816772461
+ Time taken: 21.578929662704468
+
+ Only LM head
+ Memory usage of model alone = 548.989825
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 1036.319744
+ 100%|█████████████████████████████████████▊| 489/491 [00:20<00:00, 23.39it/s]
+ Loss = 26.924053192138672
+ Time taken: 20.919220209121704
+
+ Last 4 attention layers
+ Memory usage of model alone = 425.42904
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 983.949824
+ 100%|█████████████████████████████████████▊| 489/491 [00:20<00:00, 23.40it/s]
+ Loss = 26.39584732055664
+ Time taken: 20.912957668304443
+
+ Only q,k,v
+ Memory usage of model alone = 425.425968
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 989.827584
+ 100%|█████████████████████████████████████▊| 489/491 [00:21<00:00, 23.11it/s]
+ Loss = 26.396583557128906
+ Time taken: 21.17274236679077
+
+
+ ## Part 2
+ 4 bit model
+ Memory usage of model alone = 134.060568
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 308.803072
+ 100%|█████████████████████████████████████▊| 489/491 [00:16<00:00, 29.78it/s]
+ Loss = 31.296875
+ Time taken: 16.42749333381653
+
+ `low_cpu_mem_usage` was None, now set to True since model is quantized.
+ 8 bit model
+ Memory usage of model alone = 176.527896
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 494.142976
+ 100%|█████████████████████████████████████▊| 489/491 [00:29<00:00, 16.70it/s]
+ Loss = 26.5625
+ Time taken: 29.27569341659546
+
+ `low_cpu_mem_usage` was None, now set to True since model is quantized.
+ 4 bit nf4 model
+ Memory usage of model alone = 134.060568
+ 0%| | 0/491 [00:00<?, ?it/s]Memory usage at forward pass = 494.85824
+ 100%|█████████████████████████████████████▊| 489/491 [00:15<00:00, 30.64it/s]
+ Loss = 28.375
+ Time taken: 15.961309671401978
+
+
bnb-4.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9145e9bbbad9f4b20ab918ff7d06d45ed6faad24d4f523f1263474557e68bc36
+ size 126879130
bnb-8.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2012bc265e11a8fce96cbeded25165467f489ab30ccf3995089ce0f85e088e69
+ size 164347802
bnb-nf4.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d79894f1fe69972458d10591ba4acf90dc70ddd35baed120c043955e498e95a
+ size 126894826
config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "resid_pdrop": 0.1,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "vocab_size": 50257
+ }
ggml_models/gpt2-hf.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:051959f78a781d69aa46be8730e01960adb7b346817788a2ef7f7cffabd2263b
+ size 548141490
gpt2.ggml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:295fbc748b5ccb10d2ede7936729cff2f5b1243df48f05a274c22c8e503399ba
+ size 177668768
main1.py ADDED
@@ -0,0 +1,114 @@
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import re
+ import transformers
+ import torch
+ from tqdm import tqdm
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+ import warnings
+ warnings.filterwarnings("ignore")
+ device = "cuda"
+ model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
+ tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
+
+ from datasets import load_dataset
+
+ test = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")
+ # print(len(test))
+ encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")
+ import time
+ import gc
+ def run_experiment(model):
+     print(f'Memory usage of model alone = {model.get_memory_footprint()/10**6}')
+     max_length = model.config.n_positions
+     stride = 512
+     seq_len = encodings.input_ids.size(1)
+
+     nlls = []
+     start_time = time.time()
+     prev_end_loc = 0
+     for begin_loc in tqdm(range(0, seq_len, stride)):
+         end_loc = min(begin_loc + max_length, seq_len)
+         trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
+         input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
+         target_ids = input_ids.clone()
+         target_ids[:, :-trg_len] = -100
+
+         with torch.no_grad():
+             outputs = model(input_ids, labels=target_ids)
+
+             # loss is calculated using CrossEntropyLoss which averages over valid labels
+             neg_log_likelihood = outputs.loss
+
+         if begin_loc == 0:
+             print(f'Memory usage at forward pass = {torch.cuda.memory_allocated(0)/10**6}')
+         nlls.append(neg_log_likelihood)
+
+         prev_end_loc = end_loc
+         if end_loc == seq_len:
+             break
+
+     ppl = torch.exp(torch.stack(nlls).mean())  # perplexity, printed below under the label "Loss"
+     print(f'Loss = {ppl.item()}')
+     print(f'Time taken: {time.time() - start_time}')
+ from quant import perform_quantization
+
+ model_type = 0
+
+
+ if model_type == 0:
+     ## Normal
+     print('Normal model')
+     run_experiment(model)
+     print()
+
+
+ ## Full model quant (including lm_head)
+ if model_type == 0:
+     print('Full model quant')
+     perform_quantization(model)
+     torch.save(model, 'q1-full-quant.pt')
+     # print(model)
+     run_experiment(model)
+     print()
+
+ # Without lm_head
+ if model_type == 0:
+     print('Full model without lm_head')
+     model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
+     perform_quantization(model, regex=r"transformer\.h\.\d+\.[a-zA-Z]+")
+     # print(model)
+     run_experiment(model)
+     print()
+
+ # Only lm_head
+ if model_type == 0:
+     print('Only LM head')
+     model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
+     perform_quantization(model, regex=r"[\w.]*lm_head[\w.]*")
+     # print(gc.collect())
+     # print(model)
+     run_experiment(model)
+     print()
+
+ # Last 4 layers
+ if model_type == 0:
+     print('Last 4 attention layers')
+     model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
+     perform_quantization(model, regex=r"transformer\.h\.(8|9|10|11)\.[a-zA-Z]+")
+     # print(gc.collect())
+     # print(model)
+     run_experiment(model)
+     print()
+
+ # Only q,k,v
+ if model_type == 0:
+     print('Only q,k,v')
+     model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
+     perform_quantization(model, regex=r"[\w.]*attn[\w.]*")
+     # print(model)
+     run_experiment(model)
+     print()
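The experiments above differ only in which submodules the regex passed to `perform_quantization` selects. A small sanity-check sketch (not part of `main1.py`; the names in the comment are examples) for previewing what a pattern will match before quantizing:

```python
# Illustrative sketch: list the GPT-2 submodule names a regex would select,
# mirroring the prefix matching used in quant.perform_quantization.
import re
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
pattern = re.compile(r"transformer\.h\.(8|9|10|11)\.[a-zA-Z]+")  # last 4 blocks
matched = [name for name, _ in model.named_modules() if pattern.match(name)]
print(matched[:5])  # e.g. ['transformer.h.8.ln_1', 'transformer.h.8.attn', ...]
```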
main2.py ADDED
@@ -0,0 +1,97 @@
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import re
+ import transformers
+ import torch
+ from tqdm import tqdm
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+ import warnings
+ warnings.filterwarnings("ignore")
+ device = "cuda"
+
+ tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
+
+ from datasets import load_dataset
+
+ test = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")
+ # print(len(test))
+ encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")
+ import time
+ import gc
+ def run_experiment(model):
+     print(f'Memory usage of model alone = {model.get_memory_footprint()/10**6}')
+     max_length = model.config.n_positions
+     stride = 512
+     seq_len = encodings.input_ids.size(1)
+
+     nlls = []
+     start_time = time.time()
+     prev_end_loc = 0
+     for begin_loc in tqdm(range(0, seq_len, stride)):
+         end_loc = min(begin_loc + max_length, seq_len)
+         trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
+         input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
+         target_ids = input_ids.clone()
+         target_ids[:, :-trg_len] = -100
+
+         with torch.no_grad():
+             outputs = model(input_ids, labels=target_ids)
+
+             # loss is calculated using CrossEntropyLoss which averages over valid labels
+             neg_log_likelihood = outputs.loss
+
+         if begin_loc == 0:
+             print(f'Memory usage at forward pass = {torch.cuda.memory_allocated(0)/10**6}')
+         nlls.append(neg_log_likelihood)
+
+         prev_end_loc = end_loc
+         if end_loc == seq_len:
+             break
+
+     ppl = torch.exp(torch.stack(nlls).mean())  # perplexity, printed below under the label "Loss"
+     print(f'Loss = {ppl.item()}')
+     print(f'Time taken: {time.time() - start_time}')
+
+
+ from transformers import BitsAndBytesConfig
+
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+ )
+ model = AutoModelForCausalLM.from_pretrained("gpt2", quantization_config=bnb_config)
+
+ ## 4bit
+ print('4 bit model')
+ run_experiment(model)
+
+ torch.save(model, 'bnb-4.pth')
+ print()
+
+ ## 8bit
+ bnb_config = BitsAndBytesConfig(
+     load_in_8bit=True,
+ )
+ model = AutoModelForCausalLM.from_pretrained("gpt2", quantization_config=bnb_config)
+ print('8 bit model')
+ run_experiment(model)
+ torch.save(model, 'bnb-8.pth')
+ print()
+
+
+ ## nf4 bit
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+ )
+ model = AutoModelForCausalLM.from_pretrained("gpt2", quantization_config=bnb_config)
+ print('4 bit nf4 model')
+ run_experiment(model)
+ torch.save(model, 'bnb-nf4.pth')
+ print()
+
+
+
+
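`main2.py` uses the default compute settings for the bitsandbytes configs. For reference, `BitsAndBytesConfig` also exposes a compute dtype and double quantization for the 4-bit path; a sketch of such a variant (not used in this repo's runs):

```python
# Sketch only: an nf4 config with an explicit compute dtype and double quantization.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # run matmuls in fp16
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
)
model = AutoModelForCausalLM.from_pretrained("gpt2", quantization_config=bnb_config)
```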
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e7f8cf8154c29d4014a82a86ebc8ad4eeef1525e78262257dabccd2b6d3065cf
+ size 548143050
q1-full-quant.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee9a5b08bb9e8875f406079201e4d77ba4f9db2361ba216b7a2e955af6a7055f
+ size 294906758
q3.ipynb ADDED
@@ -0,0 +1,309 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/kyrylo/Sem-7/Anlp/Grokking/Minimal/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+ "from huggingface_hub import hf_hub_download, upload_folder\n",
+ "from pathlib import Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'tokenizer.json'"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "\n",
+ "model_name = \"gpt2\" # Replace with the Hugging Face model name you want to convert\n",
+ "local_dir = \"./\" # to store the GGML model\n",
+ "\n",
+ "model_path = hf_hub_download(repo_id='openai-community/gpt2', filename=\"pytorch_model.bin\", local_dir=local_dir) \n",
+ "model_path = hf_hub_download(repo_id='openai-community/gpt2', filename=\"config.json\", local_dir=local_dir) \n",
+ "model_path = hf_hub_download(repo_id='openai-community/gpt2', filename=\"tokenizer.json\", local_dir=local_dir) \n",
+ "model_path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_96016/408161957.py:1: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+ " model =torch.load(f'./pytorch_model.bin')\n"
+ ]
+ }
+ ],
+ "source": [
+ "model =torch.load(f'./pytorch_model.bin')\n",
+ "torch.save(model, './pytorch_model.bin', _use_new_zipfile_serialization=True)\n",
+ "# ! rm pytorch_model.bin"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO:hf-to-gguf:Loading model: \n",
+ "INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only\n",
+ "INFO:hf-to-gguf:Exporting model...\n",
+ "INFO:hf-to-gguf:gguf: loading model part 'pytorch_model.bin'\n",
+ "INFO:hf-to-gguf:token_embd.weight, torch.float32 --> Q8_0, shape = {768, 50257}\n",
+ "INFO:hf-to-gguf:output.weight, torch.float32 --> Q8_0, shape = {768, 50257}\n",
+ "INFO:hf-to-gguf:position_embd.weight, torch.float32 --> F32, shape = {768, 1024}\n",
+ "INFO:hf-to-gguf:blk.0.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.0.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.0.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.0.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.0.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.0.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.0.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.0.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.0.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.0.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.0.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.0.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.1.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.1.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.1.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.1.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.1.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.1.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.1.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.1.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.1.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.1.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.1.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.1.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.2.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.2.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.2.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.2.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.2.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.2.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.2.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.2.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.2.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.2.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.2.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.2.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.3.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.3.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.3.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.3.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.3.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.3.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.3.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.3.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.3.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.3.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.3.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.3.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.4.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.4.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.4.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.4.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.4.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.4.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.4.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.4.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.4.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.4.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.4.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.4.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.5.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.5.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.5.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.5.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.5.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.5.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.5.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.5.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.5.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.5.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.5.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.5.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.6.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.6.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.6.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.6.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.6.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.6.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.6.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.6.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.6.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.6.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.6.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.6.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.7.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.7.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.7.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.7.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.7.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.7.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.7.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.7.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.7.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.7.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.7.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.7.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.8.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.8.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.8.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.8.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.8.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.8.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.8.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.8.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.8.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.8.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.8.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.8.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.9.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.9.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.9.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.9.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.9.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.9.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.9.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.9.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.9.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.9.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.9.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.9.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.10.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.10.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.10.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.10.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.10.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.10.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.10.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.10.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.10.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.10.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.10.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.10.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.11.attn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.11.attn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.11.attn_qkv.weight, torch.float32 --> Q8_0, shape = {768, 2304}\n",
+ "INFO:hf-to-gguf:blk.11.attn_qkv.bias, torch.float32 --> F32, shape = {2304}\n",
+ "INFO:hf-to-gguf:blk.11.attn_output.weight, torch.float32 --> Q8_0, shape = {768, 768}\n",
+ "INFO:hf-to-gguf:blk.11.attn_output.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.11.ffn_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.11.ffn_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:blk.11.ffn_up.weight, torch.float32 --> Q8_0, shape = {768, 3072}\n",
+ "INFO:hf-to-gguf:blk.11.ffn_up.bias, torch.float32 --> F32, shape = {3072}\n",
+ "INFO:hf-to-gguf:blk.11.ffn_down.weight, torch.float32 --> Q8_0, shape = {3072, 768}\n",
+ "INFO:hf-to-gguf:blk.11.ffn_down.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:output_norm.weight, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:output_norm.bias, torch.float32 --> F32, shape = {768}\n",
+ "INFO:hf-to-gguf:Set meta model\n",
+ "INFO:hf-to-gguf:Set model parameters\n",
+ "INFO:hf-to-gguf:Set model tokenizer\n",
+ "DEBUG:hf-to-gguf:chktok: [198, 220, 628, 220, 628, 198, 220, 197, 220, 197, 197, 220, 197, 198, 220, 220, 198, 220, 220, 220, 198, 220, 220, 220, 220, 198, 220, 220, 220, 220, 220, 198, 8582, 248, 222, 357, 11265, 8, 30325, 114, 447, 235, 8582, 234, 104, 37929, 357, 48101, 795, 13210, 271, 1673, 36686, 515, 8, 14519, 227, 12520, 99, 247, 8582, 99, 247, 513, 4747, 23460, 513, 20370, 23460, 2091, 23460, 20370, 23460, 24840, 23460, 2091, 20370, 513, 13, 18, 513, 492, 18, 513, 986, 18, 28053, 252, 222, 157, 252, 114, 157, 252, 241, 157, 253, 233, 157, 252, 237, 157, 253, 224, 157, 252, 244, 157, 252, 115, 157, 252, 253, 157, 253, 223, 157, 252, 253, 157, 252, 95, 157, 252, 114, 157, 252, 227, 47249, 223, 5633, 22755, 239, 46349, 111, 28839, 101, 18040, 32432, 98, 43291, 1485, 1415, 24309, 25465, 171, 121, 252, 40103, 1421, 18604, 12466, 121, 16843, 141, 231, 15166, 12466, 121, 16142, 12466, 239, 141, 232, 30143, 140, 111, 16142, 21169, 21727, 31583, 18849, 705, 39115, 6, 33153, 15506, 63, 15931, 15931, 16317, 13896, 3228, 9805, 3548, 314, 1053, 587, 705, 44040, 339, 338, 612, 11, 705, 2200, 345, 1654, 30, 705, 44, 407, 1654, 314, 1183, 787, 340, 11, 705, 35, 345, 588, 617, 8887, 30, 775, 6, 26979, 257, 6, 75, 43]\n",
+ "DEBUG:hf-to-gguf:chkhsh: 3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454\n",
+ "DEBUG:hf-to-gguf:tokenizer.ggml.pre: 'gpt-2'\n",
+ "DEBUG:hf-to-gguf:chkhsh: 3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454\n",
+ "INFO:gguf.vocab:Adding 50000 merge(s).\n",
+ "INFO:gguf.vocab:Setting special token type bos to 50256\n",
+ "INFO:gguf.vocab:Setting special token type eos to 50256\n",
+ "INFO:hf-to-gguf:Set model quantization version\n",
+ "INFO:gguf.gguf_writer:Writing the following files:\n",
+ "INFO:gguf.gguf_writer:gpt2.ggml: n_tensors = 149, total_size = 175.9M\n",
+ "Writing: 100%|█████████████████████████████| 176M/176M [00:01<00:00, 129Mbyte/s]\n",
+ "INFO:hf-to-gguf:Model successfully exported to gpt2.ggml\n"
+ ]
+ }
+ ],
+ "source": [
+ "!../llama.cpp/convert_hf_to_gguf.py --outfile {local_dir}/{model_name}.ggml --outtype q8_0 --verbose ./"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from huggingface_hub import create_repo, upload_folder\n",
+ "\n",
+ "\n",
+ "repo_id = \"kyrylokumar/gpt2-quantzed-gguf\" \n",
+ "create_repo(repo_id=repo_id, exist_ok=True) # exist_ok=True avoids errors if the repo already exists\n",
+ "\n",
+ "# Upload the folder\n",
+ "local_dir = \"./\" # Path to the directory you want to upload\n",
+ "upload_folder(\n",
+ " repo_id=repo_id,\n",
+ " folder_path=local_dir,\n",
+ " commit_message=\"Added extra files\", # Optional commit message\n",
+ " ignore_patterns=\".git*\", # Optional: ignore .git files and other patterns\n",
+ ")\n",
+ "\n",
+ "print(f\"Directory '{local_dir}' pushed to: {repo_id}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Minimal",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
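The notebook exports `gpt2.ggml` in GGUF format with Q8_0 weights via llama.cpp's `convert_hf_to_gguf.py` and uploads the folder to `kyrylokumar/gpt2-quantzed-gguf`. One possible way to smoke-test the exported file (an assumption, not shown in the notebook; requires the `llama-cpp-python` package):

```python
# Illustrative sketch: load the exported GGUF file and run a short completion.
from llama_cpp import Llama

llm = Llama(model_path="./gpt2.ggml")
out = llm("The quick brown fox", max_tokens=8)
print(out["choices"][0]["text"])
```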
quant.py ADDED
@@ -0,0 +1,68 @@
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import re
+ import transformers
+
+ class ReplacedLinearLayer(nn.Module):
+     def __init__(self, input_dim, output_dim, if_conv=True):
+         super().__init__()
+
+         self.register_buffer('weights', torch.zeros([output_dim, input_dim], dtype=torch.int8))
+         self.register_buffer('scales', torch.zeros(output_dim, dtype=torch.float32))
+
+         # self.register_buffer("bias", torch.zeros((1, output_dim), dtype = torch.float32))
+         self.bias = None
+         self.if_conv = if_conv
+
+     def forward(self, x):
+         fp32_weights = self.weights.to(x.dtype)  # cast int8 weights to the activation dtype
+         # print(fp32_weights.shape, self.scales.shape, )
+         try:
+             x = F.linear(x, fp32_weights) * self.scales  # rescale each output feature by its per-row scale
+             if self.bias is not None:
+                 x += self.bias
+         except Exception as e:
+             print(e)
+             print(fp32_weights.shape, self.scales.shape, )
+
+             exit()
+         return x
+
+     def do_quantization(self, W, ):
+         if self.if_conv:
+             W32 = W.clone().squeeze().T  # transformers Conv1D stores weights as (in, out); transpose to (out, in)
+         else:
+             W32 = W.clone()
+
+         scales = (torch.max(W32.abs(), dim=-1)[0]/127).to(torch.float32)  # per-row absmax scale
+         self.scales = scales
+         self.weights = torch.round(W32 / scales[:, None]).to(torch.int8)
+
+
+ def perform_quantization(module, regex='.*'):
+     pattern = re.compile(regex)
+     for name, node in module.named_modules():
+         for name2, child in node.named_children():
+             if (isinstance(child, nn.Linear) or isinstance(child, transformers.pytorch_utils.Conv1D)) and pattern.match(f'{name}.{name2}'):
+                 # print(name, name2, node, child)
+                 fp32_weight, fp32_bias = child.weight, child.bias
+
+                 quant_module = ReplacedLinearLayer(child.weight.shape[1], child.weight.shape[0], if_conv=isinstance(child, transformers.pytorch_utils.Conv1D))
+                 setattr(node, name2, quant_module)
+                 # print(getattr(node, name2).custom_weights)
+                 # return
+                 getattr(node, name2).do_quantization(fp32_weight)
+                 if fp32_bias is not None:
+                     getattr(node, name2).bias = fp32_bias
+                 # print(getattr(node, name2).weights)
+
+     # return
+
+
+
+
+
+
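`ReplacedLinearLayer.do_quantization` uses symmetric per-row absmax quantization: each output row is scaled by `max|w| / 127`, stored as int8, and rescaled after the matmul in `forward`. A tiny self-contained demonstration of that scheme (illustrative only, not part of `quant.py`):

```python
# Per-row absmax int8 quantization, as in ReplacedLinearLayer.do_quantization.
import torch

W = torch.randn(4, 8)                                    # fp32 weight, one row per output unit
scales = W.abs().max(dim=-1).values / 127                # per-row scale factors
W_int8 = torch.round(W / scales[:, None]).to(torch.int8) # quantized weights
W_deq = W_int8.to(torch.float32) * scales[:, None]       # dequantized approximation
print((W - W_deq).abs().max())                           # worst-case error is about scales/2
```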
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff