nm-research committed
Commit 667fb88 · verified · 1 Parent(s): d4bd631

Update README.md

Files changed (1)
  1. README.md +71 -34
README.md CHANGED
@@ -1,6 +1,7 @@
 ---
 tags:
- - INT8
+ - w8a8
+ - int8
 - vllm
 license: apache-2.0
 license_link: https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md
@@ -66,45 +67,81 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
 This model was created with [llm-compressor](https://github.com/vllm-project/llm-compressor) by running the code snippet below.
 
 
+ ```bash
+ python quantize.py --model_path ibm-granite/granite-3.1-8b-instruct --quant_path "output_dir/granite-3.1-8b-instruct-quantized.w8a8" --calib_size 3072 --dampening_frac 0.1 --observer mse
+ ```
+
+
 ```python
+ from datasets import load_dataset
+ from transformers import AutoTokenizer
+ from llmcompressor.modifiers.quantization import GPTQModifier
+ from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot, apply
 import argparse
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from llmcompressor.modifiers.quantization import QuantizationModifier
- from llmcompressor.transformers import oneshot
- import os
-
- def main():
-     parser = argparse.ArgumentParser(description='Quantize a transformer model to INT8')
-     parser.add_argument('--model_id', type=str, required=True,
-                         help='The model ID from HuggingFace (e.g., "meta-llama/Meta-Llama-3-8B-Instruct")')
-     parser.add_argument('--save_path', type=str, default='.',
-                         help='Custom path to save the quantized model. If not provided, will use model_name-quantized.w8a8')
-     args = parser.parse_args()
-
-     # Load model
-     model = AutoModelForCausalLM.from_pretrained(
-         args.model_id, device_map="auto", torch_dtype="auto", trust_remote_code=True,
+ from compressed_tensors.quantization import QuantizationScheme, QuantizationArgs, QuantizationType, QuantizationStrategy
+
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model_path', type=str)
+ parser.add_argument('--quant_path', type=str)
+ parser.add_argument('--calib_size', type=int, default=256)
+ parser.add_argument('--dampening_frac', type=float, default=0.1)
+ parser.add_argument('--observer', type=str, default="minmax")
+ args = parser.parse_args()
+
+ model = SparseAutoModelForCausalLM.from_pretrained(
+     args.model_path,
+     device_map="auto",
+     torch_dtype="auto",
+     use_cache=False,
+     trust_remote_code=True,
+ )
+ tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+
+
+ NUM_CALIBRATION_SAMPLES = args.calib_size
+ DATASET_ID = "garage-bAInd/Open-Platypus"
+ DATASET_SPLIT = "train"
+ ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
+ ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+ def preprocess(example):
+     concat_txt = example["instruction"] + "\n" + example["output"]
+     return {"text": concat_txt}
+
+ ds = ds.map(preprocess)
+
+ def tokenize(sample):
+     return tokenizer(
+         sample["text"],
+         padding=False,
+         truncation=False,
+         add_special_tokens=True,
     )
-     tokenizer = AutoTokenizer.from_pretrained(args.model_id)
 
-     # Configure the quantization algorithm and scheme
-     recipe = QuantizationModifier(
-         targets="Linear", scheme="INT8_DYNAMIC", ignore=["lm_head"]
-     )
 
-     # Apply quantization
-     oneshot(model=model, recipe=recipe)
+ ds = ds.map(tokenize, remove_columns=ds.column_names)
 
-     save_path = os.path.join(args.save_path, args.model_id.split("/")[1] + "-quantized.w8a8")
-     os.makedirs(save_path, exist_ok=True)
-
-     # Save to disk in compressed-tensors format
-     model.save_pretrained(save_path)
-     tokenizer.save_pretrained(save_path)
-     print(f"Model and tokenizer saved to: {save_path}")
-
- if __name__ == "__main__":
-     main()
+ recipe = [
+     GPTQModifier(
+         targets=["Linear"],
+         ignore=["lm_head"],
+         scheme="W8A8",
+         dampening_frac=args.dampening_frac,
+         observer=args.observer,
+     )
+ ]
+ oneshot(
+     model=model,
+     dataset=ds,
+     recipe=recipe,
+     num_calibration_samples=args.calib_size,
+     max_seq_length=8192,
+ )
+
+ # Save to disk in compressed-tensors format.
+ model.save_pretrained(args.quant_path, save_compressed=True)
+ tokenizer.save_pretrained(args.quant_path)
 ```
 
 ## Evaluation
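
To sanity-check the checkpoint the updated snippet produces, here is a minimal vLLM offline-inference sketch. The local model path is an assumption (wherever `--quant_path` pointed when `quantize.py` ran), and the prompt is only illustrative:

```python
from vllm import LLM, SamplingParams

# Assumed output location: the --quant_path passed to quantize.py above.
MODEL_PATH = "output_dir/granite-3.1-8b-instruct-quantized.w8a8"

# vLLM reads the compressed-tensors config written by llm-compressor and
# runs the quantized Linear layers with INT8 weights and activations (W8A8).
llm = LLM(model=MODEL_PATH)
params = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(["Briefly explain W8A8 INT8 quantization."], params)
print(outputs[0].outputs[0].text)
```

The same directory should also work with the OpenAI-compatible server mentioned in the hunk header, via `vllm serve <path>`.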