huzaifa1117 committed on
Commit
41d3097
1 Parent(s): 3605ddf

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +26 -0
README.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
## Inference calling way

Example: load the AWQ-quantized TinyLlama checkpoint and its tokenizer for inference.

```python
from transformers import AutoTokenizer, HqqConfig
from awq import AutoAWQForCausalLM
import torch

# Run on the GPU; AutoAWQ requires CUDA for quantized inference.
device = torch.device("cuda")

model_id = "huzaifa1117/tinyllama_AWQ_4bit"

# All linear layers will use the same quantization config.
# NOTE(review): this config is constructed but never passed to a model
# loader below — pass it via `quantization_config=` (HQQ path) or drop it
# if you only use the AWQ loader.
quant_config = HqqConfig(nbits=1, group_size=64, quant_zero=False,
                         quant_scale=False, axis=1)

# Load the tokenizer that matches the checkpoint (load it once).
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the AWQ-quantized model; `device_map='cuda'` already places the
# weights on the GPU, so no extra `.to(device)` call is needed.
model = AutoAWQForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    use_cache=False,
    device_map="cuda",
)
```