mgoin committed on
Commit
37b6c13
1 Parent(s): 5883b46

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +29 -0
README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ https://github.com/vllm-project/llm-compressor/pull/185
2
+
3
+ ```python
4
+ from transformers import AutoTokenizer
5
+
6
+ from llmcompressor.modifiers.quantization import QuantizationModifier
7
+ from llmcompressor.transformers import oneshot
8
+ from llmcompressor.transformers.sparsification import create_sparse_auto_model_class
9
+
10
+ MODEL_ID = "llava-hf/llava-1.5-7b-hf"
11
+ model_class = create_sparse_auto_model_class("LlavaForConditionalGeneration")
12
+ model = model_class.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
13
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
14
+
15
+ recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
16
+ oneshot(model=model, recipe=recipe)
17
+
18
+ # Confirm generations of the quantized model look sane.
19
+ print("========== SAMPLE GENERATION ==============")
20
+ input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
21
+ output = model.generate(input_ids, max_new_tokens=20)
22
+ print(tokenizer.decode(output[0]))
23
+ print("==========================================")
24
+
25
+ # Save to disk in compressed-tensors format.
26
+ SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
27
+ model.save_pretrained(SAVE_DIR)
28
+ tokenizer.save_pretrained(SAVE_DIR)
29
+ ```