skyxiaobaibai committed
Update README.md
README.md CHANGED
@@ -1,3 +1,66 @@
----
-license: mit
----
+---
+license: mit
+---
+
+from transformers import AutoTokenizer, TextGenerationPipeline
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+import logging
+
+logging.basicConfig(
+    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
+)
+
+pretrained_model_dir = "Qwen/Qwen1.5-7B-Chat"
+quantized_model_dir = "/gptq_model-4bit-128g"
+
+tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
+examples = [
+    tokenizer(
+        "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
+    )
+]
+
+quantize_config = BaseQuantizeConfig(
+    bits=4,  # quantize model to 4-bit
+    group_size=128,  # it is recommended to set the value to 128
+    desc_act=False,  # setting this to False can significantly speed up inference, at a slight cost in perplexity
+)
+
+# load the un-quantized model; by default the model is loaded into CPU memory
+model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
+
+# quantize the model; the examples should be a list of dicts whose keys are only "input_ids" and "attention_mask"
+model.quantize(examples)
+
+# save the quantized model
+model.save_quantized(quantized_model_dir)
+
+# save the quantized model using safetensors
+model.save_quantized(quantized_model_dir, use_safetensors=True)
+
+# push the quantized model to the Hugging Face Hub.
+# to use use_auth_token=True, log in first via huggingface-cli login,
+# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
+# (uncomment the following three lines to enable this feature)
+# repo_id = f"YourUserName/{quantized_model_dir}"
+# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
+# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
+
+# alternatively, you can save and push at the same time
+# (uncomment the following three lines to enable this feature)
+# repo_id = f"YourUserName/{quantized_model_dir}"
+# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
+# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
+
+# load the quantized model onto the first GPU
+model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")
+
+# download the quantized model from the Hugging Face Hub and load it onto the first GPU
+# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
+
+# inference with model.generate
+print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))
+
+# or you can use the pipeline
+pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
+print(pipeline("auto-gptq is")[0]["generated_text"])
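
Since Qwen/Qwen1.5-7B-Chat is a chat-tuned model, the raw-string prompts in the inference examples above do not use its chat template. Below is a minimal sketch of chat-style inference with the quantized model; it is not part of the committed README, it assumes the same model and tokenizer objects as in the code above, and the prompt text and max_new_tokens value are illustrative.

# hypothetical chat-style inference sketch, assuming `model` and `tokenizer` from the code above
messages = [{"role": "user", "content": "What does GPTQ quantization do?"}]
# build a chat-formatted prompt string using the tokenizer's built-in chat template
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# generate a reply (128 new tokens is an arbitrary illustrative limit)
output_ids = model.generate(**inputs, max_new_tokens=128)
# decode only the newly generated tokens, skipping special tokens
print(tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))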