huzaifa1117
committed on
Commit
•
41d3097
1
Parent(s):
3605ddf
Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## How to run inference:
|
# Inference example: load the AWQ-quantized TinyLlama checkpoint and its
# tokenizer onto the GPU, ready for generation.
from transformers import (
    AutoModelForCausalLM,  # only needed for the unquantized alternative below
    AutoTokenizer,
    HqqConfig,
)
from peft import PeftModel  # NOTE(review): imported but unused here — only needed if attaching LoRA adapters; confirm
from awq import AutoAWQForCausalLM
import torch

device = torch.device("cuda")

model_id = "huzaifa1117/tinyllama_AWQ_4bit"

# All linear layers would use the same quantization config.
# NOTE(review): quant_config is defined but never passed to a loader below —
# confirm whether it should be supplied via quantization_config=... or removed.
quant_config = HqqConfig(nbits=1, group_size=64, quant_zero=False, quant_scale=False, axis=1)

# Load the tokenizer for the same checkpoint (once — the original snippet did this twice).
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the AWQ-quantized model directly onto the GPU.
model = AutoAWQForCausalLM.from_pretrained(
    model_id, low_cpu_mem_usage=True, use_cache=False, device_map='cuda'
)
# Alternative: load the unquantized model instead.
# model = AutoModelForCausalLM.from_pretrained(model_id)

model.to(device)