robertgshaw2 committed on
Commit 05f7150
1 Parent(s): 800abad

added files

config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "_name_or_path": "TheBloke/Llama-2-7B-Chat-GPTQ",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 11008,
+   "max_length": 4096,
+   "max_position_embeddings": 4096,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 32,
+   "pad_token_id": 0,
+   "pretraining_tp": 1,
+   "quantization_config": {
+     "group_size": 128,
+     "quant_method": "marlin"
+   },
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float16",
+   "transformers_version": "4.36.2",
+   "use_cache": true,
+   "vocab_size": 32000
+ }
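
The `quantization_config` block above is what marks this checkpoint as Marlin-packed with 128-element groups; it is the same block that `load.py` below inspects before loading weights. A minimal sketch of reading it (assumes only the `transformers` package; the local path is a hypothetical stand-in for wherever this repo is checked out):

```python
# Sketch: read the quantization metadata that load.py later validates.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(".")  # hypothetical local clone of this repo
print(config.quantization_config)         # {'group_size': 128, 'quant_method': 'marlin'}
```
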
convert.py ADDED
@@ -0,0 +1,161 @@
+ import torch, argparse, copy
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from auto_gptq.nn_modules.qlinear.qlinear_exllama import QuantLinear
+ from marlin import Layer as MarlinLayer
+ import gc
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model-id", type=str)
+ parser.add_argument("--save-path", type=str)
+ parser.add_argument("--do-generation", action="store_true")
+
+ def _validate_compatibility(model):
+     if not hasattr(model.config, "quantization_config"):
+         raise ValueError("Must be a quantized model to convert to Marlin Format")
+     quantization_config = model.config.quantization_config
+     if quantization_config.quant_method != "gptq":
+         raise ValueError(f"Only GPTQ models can be converted to Marlin format. You passed a model with quant_method={quantization_config.quant_method}")
+     if quantization_config.bits != 4:
+         raise ValueError(f"Only 4 bit quantized models can be converted to Marlin format. You passed a model with bits={quantization_config.bits}")
+     if quantization_config.group_size != 128:
+         raise ValueError(f"Only group size 128 models can be converted to Marlin format. You passed a model with group_size={quantization_config.group_size}")
+     if not quantization_config.sym:
+         raise ValueError(f"Only models with symmetric quantization can be converted to Marlin Format. You passed a model with sym={quantization_config.sym}")
+     if quantization_config.desc_act:
+         raise ValueError(f"Models with act order quantization cannot be converted to Marlin Format. You passed a model with desc_act={quantization_config.desc_act}")
+
+ @torch.no_grad()
+ def unpack_4bit_to_32bit_signed(qweight, qzeros):
+     # Unpack 4-bit values and interpret them as signed integers
+     unpacked_weights = torch.zeros((qweight.shape[0]*8, qweight.shape[1]), dtype=torch.int8, device=qweight.device, requires_grad=False)
+     unpacked_zeros = torch.zeros((qzeros.shape[0], qzeros.shape[1]*8), dtype=torch.int8, device=qzeros.device, requires_grad=False)
+
+     for row in range(unpacked_weights.shape[0]):
+         i = row % 8
+         unpacked_weights[row, :] = (qweight[row // 8, :] >> (4 * i)) & 0xF
+
+     for col in range(unpacked_zeros.shape[1]):
+         i = col % 8
+         unpacked_zeros[:, col] = (qzeros[:, col // 8] >> (4 * i)) & 0xF
+
+     return unpacked_weights, unpacked_zeros + 1
+
+ @torch.no_grad()
+ def dequantize_weight(layer):
+     qweight, qzeros, scales = layer.qweight, layer.qzeros, layer.scales
+     unpacked_qweight, unpacked_qzeros = unpack_4bit_to_32bit_signed(qweight, qzeros)
+     group_size = unpacked_qweight.shape[0] // scales.shape[0]
+     scales = scales.repeat_interleave(group_size, dim=0)
+     unpacked_qzeros = unpacked_qzeros.repeat_interleave(group_size, dim=0)
+     unpacked_qweight = (unpacked_qweight - unpacked_qzeros) * scales
+
+     return unpacked_qweight.T
+
+ @torch.no_grad()
+ def convert_model(model, verbose=True):
+     for name, module in model.named_modules():
+         if not isinstance(module, QuantLinear):
+             continue
+
+         if verbose:
+             print(f"--- Converting Module: {name}")
+         parent_name = ".".join(name.split(".")[:-1])
+         layer_name = name[len(parent_name) + 1:]
+
+         # Dequantize the weight.
+         dequantized_weight = dequantize_weight(module).to(torch.float16)
+         linear_module = torch.nn.Linear(
+             in_features=dequantized_weight.shape[1],
+             out_features=dequantized_weight.shape[0],
+             bias=False,
+             dtype=torch.float16,
+             device="cuda")
+         linear_module.weight.data.copy_(dequantized_weight)
+
+         # Create new linear method and copy to model.
+         new_module = MarlinLayer(
+             infeatures=linear_module.in_features,
+             outfeatures=linear_module.out_features,
+             groupsize=model.config.quantization_config.group_size)
+         new_module.pack(linear_module, scales=copy.deepcopy(module.scales.data.t()))
+
+         # Save to parent.
+         parent_module = model.get_submodule(parent_name)
+         setattr(parent_module, layer_name, new_module)
+
+         # Free cuda memory.
+         del dequantized_weight, module
+         torch.cuda.empty_cache()
+         gc.collect()
+
+     return model
+
+ @torch.no_grad()
+ def dequantize_model(model, verbose=True):
+     for name, module in model.named_modules():
+         if not isinstance(module, QuantLinear):
+             continue
+
+         if verbose:
+             print(f"--- Dequantizing Module: {name}")
+         parent_name = ".".join(name.split(".")[:-1])
+         layer_name = name[len(parent_name) + 1:]
+
+         # Dequantize the weight.
+         dequantized_weight = dequantize_weight(module)
+         dequantized_weight_cpu = dequantized_weight.to("cpu")
+
+         # Create new linear method and copy to model.
+         new_module = torch.nn.Linear(
+             in_features=dequantized_weight_cpu.shape[1],
+             out_features=dequantized_weight_cpu.shape[0],
+             bias=False,
+             dtype=torch.float16)
+         new_module.weight.data.copy_(dequantized_weight_cpu)
+         new_module.scales = torch.nn.Parameter(copy.deepcopy(module.scales.data))
+
+         # Save to parent.
+         parent_module = model.get_submodule(parent_name)
+         setattr(parent_module, layer_name, new_module)
+
+         # Free cuda memory.
+         del dequantized_weight, dequantized_weight_cpu, module
+         torch.cuda.empty_cache()
+
+     return model
+
+ if __name__ == "__main__":
+     args = parser.parse_args()
+     model_id = args.model_id
+     save_path = args.save_path
+     do_generation = args.do_generation
+
+     print("Loading gptq model...")
+     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+     tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+     # Validate that this model is compatible with Marlin.
+     print("Validating compatibility...")
+     _validate_compatibility(model)
+
+     # Convert the model to Marlin format.
+     print("Converting model...")
+     model = convert_model(model).to("cpu")
+
+     # Save after updating quantization config.
+     print("Saving marlin model...")
+     model.config.quantization_config = {
+         "group_size": model.config.quantization_config.group_size,
+         "quant_method": "marlin"
+     }
+     model.save_pretrained(save_path)
+     tokenizer.save_pretrained(save_path)
+
+     if do_generation:
+         print("Generating sample text...")
+         model.to("cuda")
+         prompt = "My favorite song is"
+         inputs = tokenizer(prompt, return_tensors="pt")
+         inputs = {k: v.to("cuda") for k, v in inputs.items()}
+         outputs = model.generate(**inputs, max_new_tokens=50, do_sample=False)
+         print(tokenizer.batch_decode(outputs)[0])
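
convert.py is driven by the three argparse flags defined at its top: it loads a GPTQ checkpoint, validates compatibility (4-bit, group size 128, symmetric, no act-order), repacks every `QuantLinear` into a Marlin layer, rewrites `quantization_config`, and saves the result. A hedged usage sketch follows; the `--model-id` value is the GPTQ source named in config.json above, while the output directory name is hypothetical, and CUDA plus the `auto_gptq` and `marlin` packages are assumed:

```python
# Sketch of the intended invocation, based on convert.py's argparse flags:
#
#   python convert.py \
#       --model-id TheBloke/Llama-2-7B-Chat-GPTQ \
#       --save-path ./llama-2-7b-chat-marlin \
#       --do-generation
#
# The same conversion can be driven programmatically with the script's helpers:
from transformers import AutoModelForCausalLM, AutoTokenizer
from convert import _validate_compatibility, convert_model

model_id = "TheBloke/Llama-2-7B-Chat-GPTQ"
save_path = "./llama-2-7b-chat-marlin"  # hypothetical output directory

model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

_validate_compatibility(model)           # GPTQ, 4-bit, group_size=128, sym, no desc_act
model = convert_model(model).to("cpu")   # repack QuantLinear layers into Marlin layers

model.config.quantization_config = {"group_size": 128, "quant_method": "marlin"}
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
```
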
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 0,
+   "transformers_version": "4.36.2"
+ }
load.py ADDED
@@ -0,0 +1,172 @@
+ import torch
+ import numpy as np
+ from huggingface_hub import snapshot_download
+ from safetensors.torch import safe_open
+ from typing import Optional, Tuple, List, Iterator
+ import os, filelock, json, glob
+ from accelerate import init_empty_weights
+ from transformers import AutoModelForCausalLM, AutoConfig
+ import marlin
+
+ # Adapted from https://github.com/vllm-project/vllm/blob/14cc317ba48229d93ee2417822d96ccb8db56abe/vllm/model_executor/weight_utils.py#L191
+
+ def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
+     lock_dir = cache_dir if cache_dir is not None else "/tmp"
+     lock_file_name = model_name_or_path.replace("/", "-") + ".lock"
+     lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name))
+     return lock
+
+ def prepare_hf_model_weights(
+     model_name_or_path: str,
+     cache_dir: Optional[str] = None,
+     load_format: str = "auto",
+     fall_back_to_pt: bool = True,
+     revision: Optional[str] = None,
+ ) -> Tuple[str, List[str], bool]:
+     # Download model weights from huggingface.
+     is_local = os.path.isdir(model_name_or_path)
+     use_safetensors = False
+     # Some quantized models use .pt files for storing the weights.
+     if load_format == "auto":
+         allow_patterns = ["*.safetensors", "*.bin"]
+     elif load_format == "safetensors":
+         use_safetensors = True
+         allow_patterns = ["*.safetensors"]
+     elif load_format == "pt":
+         allow_patterns = ["*.pt"]
+     elif load_format == "npcache":
+         allow_patterns = ["*.bin"]
+     else:
+         raise ValueError(f"Unknown load_format: {load_format}")
+
+     if fall_back_to_pt:
+         allow_patterns += ["*.pt"]
+
+     if not is_local:
+         # Use file lock to prevent multiple processes from
+         # downloading the same model weights at the same time.
+         with get_lock(model_name_or_path, cache_dir):
+             hf_folder = snapshot_download(model_name_or_path,
+                                           allow_patterns=allow_patterns,
+                                           cache_dir=cache_dir,
+                                           revision=revision)
+     else:
+         hf_folder = model_name_or_path
+     hf_weights_files: List[str] = []
+     for pattern in allow_patterns:
+         hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+         if len(hf_weights_files) > 0:
+             if pattern == "*.safetensors":
+                 use_safetensors = True
+             break
+     if not use_safetensors:
+         # Exclude files that are not needed for inference.
+         # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
+         blacklist = [
+             "training_args.bin",
+             "optimizer.bin",
+             "optimizer.pt",
+             "scheduler.pt",
+             "scaler.pt",
+         ]
+         hf_weights_files = [
+             f for f in hf_weights_files
+             if not any(f.endswith(x) for x in blacklist)
+         ]
+
+     if len(hf_weights_files) == 0:
+         raise RuntimeError(
+             f"Cannot find any model weights with `{model_name_or_path}`")
+
+     return hf_folder, hf_weights_files, use_safetensors
+
+ def hf_model_weights_iterator(
+     model_name_or_path: str,
+     cache_dir: Optional[str] = None,
+     load_format: str = "auto",
+     revision: Optional[str] = None,
+     fall_back_to_pt: Optional[bool] = True,
+ ) -> Iterator[Tuple[str, torch.Tensor]]:
+     hf_folder, hf_weights_files, use_safetensors = prepare_hf_model_weights(
+         model_name_or_path,
+         cache_dir=cache_dir,
+         load_format=load_format,
+         fall_back_to_pt=fall_back_to_pt,
+         revision=revision)
+
+     if load_format == "npcache":
+         # Currently np_cache only support *.bin checkpoints
+         assert use_safetensors is False
+
+         # Convert the model weights from torch tensors to numpy arrays for
+         # faster loading.
+         np_folder = os.path.join(hf_folder, "np")
+         os.makedirs(np_folder, exist_ok=True)
+         weight_names_file = os.path.join(np_folder, "weight_names.json")
+         # Use file lock to prevent multiple processes from
+         # dumping the same model weights to numpy at the same time.
+         with get_lock(model_name_or_path, cache_dir):
+             if not os.path.exists(weight_names_file):
+                 weight_names = []
+                 for bin_file in hf_weights_files:
+                     state = torch.load(bin_file, map_location="cpu")
+                     for name, param in state.items():
+                         param_path = os.path.join(np_folder, name)
+                         with open(param_path, "wb") as f:
+                             np.save(f, param.cpu().detach().numpy())
+                         weight_names.append(name)
+                 with open(weight_names_file, "w") as f:
+                     json.dump(weight_names, f)
+
+         with open(weight_names_file, "r") as f:
+             weight_names = json.load(f)
+
+         for name in weight_names:
+             param_path = os.path.join(np_folder, name)
+             with open(param_path, "rb") as f:
+                 param = np.load(f)
+             yield name, torch.from_numpy(param)
+     elif use_safetensors:
+         for st_file in hf_weights_files:
+             with safe_open(st_file, framework="pt") as f:
+                 for name in f.keys():  # noqa: SIM118
+                     param = f.get_tensor(name)
+                     yield name, param
+     else:
+         for bin_file in hf_weights_files:
+             state = torch.load(bin_file, map_location="cpu")
+             for name, param in state.items():
+                 yield name, param
+             del state
+             torch.cuda.empty_cache()
+
+ @torch.no_grad()
+ def load_model(model_path):
+     with init_empty_weights():
+         config = AutoConfig.from_pretrained(model_path)
+
+         if not hasattr(config, "quantization_config"):
+             raise ValueError("Must be a Marlin quantized model, but your config has no quantization config.")
+         if "quant_method" not in config.quantization_config:
+             raise ValueError("Must be a Marlin quantized model, but your quantization config has no quant_method.")
+         if config.quantization_config["quant_method"] != "marlin":
+             raise ValueError(f"Must be a Marlin model, but you passed a model with quant_method = {config.quantization_config['quant_method']}")
+
+         model = AutoModelForCausalLM.from_config(config)
+         marlin.replace_linear(
+             model.model,
+             groupsize=config.quantization_config["group_size"]
+         )
+
+     module_dict = dict(model.named_modules())
+     for name, loaded_weight in hf_model_weights_iterator(model_path):
+         module_name = ".".join(name.split(".")[:-1])
+         param_name = name[len(module_name) + 1:]
+         module = module_dict[module_name]
+
+         if not hasattr(module, param_name):
+             raise ValueError("Key mismatch.")
+
+         setattr(module, param_name, torch.nn.Parameter(loaded_weight, requires_grad=False))
+
+     return model
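
load.py rebuilds the model skeleton under `init_empty_weights`, swaps its linear layers for Marlin layers via `marlin.replace_linear`, and then streams the packed weights in by name from the checkpoint files. A minimal usage sketch, assuming a CUDA device, the `marlin` package, and that the (hypothetical) local path points at this repo or at a directory written by convert.py:

```python
# Minimal sketch of running inference through load.py's load_model.
import torch
from transformers import AutoTokenizer
from load import load_model

save_path = "./llama-2-7b-chat-marlin"  # hypothetical local Marlin checkpoint
model = load_model(save_path).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(save_path)

inputs = tokenizer("My favorite song is", return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=50, do_sample=False)
print(tokenizer.batch_decode(outputs)[0])
```
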
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f790910a5b04ad26d46a403ca2f97b439515efbb890cdeab9c50b750b2c89d82
+ size 3864067088
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render.
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": null,
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }