skyxiaobaibai committed
Commit 9592e99 · verified · 1 Parent(s): c6845a0

Update README.md

Files changed (1)
  1. README.md +66 -3
README.md CHANGED
@@ -1,3 +1,66 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ ---
+
+ from transformers import AutoTokenizer, TextGenerationPipeline
+ from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+ import logging
+
+ logging.basicConfig(
+     format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
+ )
+
+ pretrained_model_dir = "Qwen/Qwen1.5-7B-Chat"
+ quantized_model_dir = "/gptq_model-4bit-128g"
+
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
+ examples = [
+     tokenizer(
+         "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
+     )
+ ]
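+
+ # NOTE (illustrative sketch, not part of the original card): since Qwen/Qwen1.5-7B-Chat is a
+ # chat model, calibration samples formatted with its chat template may better match the inputs
+ # seen at inference time; something like the following could replace the plain-text example above:
+ # chat_text = tokenizer.apply_chat_template(
+ #     [{"role": "user", "content": "Explain GPTQ quantization in one sentence."}],
+ #     tokenize=False,
+ #     add_generation_prompt=True,
+ # )
+ # examples = [tokenizer(chat_text)]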
+
+ quantize_config = BaseQuantizeConfig(
+     bits=4,  # quantize the model to 4-bit
+     group_size=128,  # it is recommended to set this value to 128
+     desc_act=False,  # setting this to False can significantly speed up inference, but perplexity may be slightly worse
+ )
+
+ # load the un-quantized model; by default, the model is always loaded into CPU memory
+ model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
+
+ # quantize the model; examples should be a list of dicts whose only keys are "input_ids" and "attention_mask"
+ model.quantize(examples)
+
+ # save the quantized model
+ model.save_quantized(quantized_model_dir)
+
+ # save the quantized model using safetensors
+ model.save_quantized(quantized_model_dir, use_safetensors=True)
+
+ # push the quantized model to the Hugging Face Hub.
+ # to use use_auth_token=True, log in first via huggingface-cli login,
+ # or pass an explicit token with: use_auth_token="hf_xxxxxxx"
+ # (uncomment the following three lines to enable this feature)
+ # repo_id = f"YourUserName/{quantized_model_dir}"
+ # commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
+ # model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
+
+ # alternatively, you can save and push at the same time
+ # (uncomment the following three lines to enable this feature)
+ # repo_id = f"YourUserName/{quantized_model_dir}"
+ # commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
+ # model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
+
+ # load the quantized model onto the first GPU
+ model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")
+
+ # download the quantized model from the Hugging Face Hub and load it onto the first GPU
+ # model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
+
+ # inference with model.generate
+ print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))
+
+ # or you can also use the pipeline
+ pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
+ print(pipeline("auto-gptq is")[0]["generated_text"])
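+
+ # The chat-style inference below is an illustrative sketch that is not part of the original card;
+ # since Qwen1.5-7B-Chat is instruction-tuned, wrapping the prompt in the chat template usually
+ # gives better answers than raw completion (the prompt text here is just an example).
+ messages = [{"role": "user", "content": "Briefly explain what GPTQ quantization does."}]
+ prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+ output_ids = model.generate(**inputs, max_new_tokens=128)
+ print(tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))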