nm-research committed: Update README.md

README.md CHANGED
@@ -76,6 +76,7 @@ python quantize.py --model_path ibm-granite/granite-3.1-8b-instruct --quant_path
 from datasets import load_dataset
 from transformers import AutoTokenizer
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
 from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot, apply
 import argparse
 from compressed_tensors.quantization import QuantizationScheme, QuantizationArgs, QuantizationType, QuantizationStrategy
@@ -98,9 +99,8 @@ model = SparseAutoModelForCausalLM.from_pretrained(
 )
 tokenizer = AutoTokenizer.from_pretrained(args.model_path)
 
-
 NUM_CALIBRATION_SAMPLES = args.calib_size
-DATASET_ID = "
+DATASET_ID = "neuralmagic/LLM_compression_calibration"
 DATASET_SPLIT = "train"
 ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
 ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
@@ -122,7 +122,15 @@ def tokenize(sample):
 
 ds = ds.map(tokenize, remove_columns=ds.column_names)
 
+ignore=["lm_head"]
+mappings=[
+    [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
+    [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"],
+    [["re:.*down_proj"], "re:.*up_proj"]
+]
+
 recipe = [
+    SmoothQuantModifier(smoothing_strength=0.8, ignore=ignore, mappings=mappings),
     GPTQModifier(
         targets=["Linear"],
         ignore=["lm_head"],
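The hunk above prepends a SmoothQuant pass to the existing GPTQ recipe: activation outliers in the mapped projection layers are folded into the weights before GPTQ quantizes them. For reference, a minimal sketch of the assembled recipe follows. The diff truncates the `GPTQModifier` arguments after `ignore`, so the `scheme="W8A8"` below is an assumption inferred from the repo name (`quantized.w8a8`), not part of the commit.

```python
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

ignore = ["lm_head"]
# Each entry maps the layers to smooth (left) to their activation source (right).
mappings = [
    [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
    [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"],
    [["re:.*down_proj"], "re:.*up_proj"],
]

recipe = [
    # Migrate activation outliers into the weights before quantization.
    SmoothQuantModifier(smoothing_strength=0.8, ignore=ignore, mappings=mappings),
    # Quantize every Linear layer except the output head.
    # scheme="W8A8" is assumed from the repo name; the hunk cuts off here.
    GPTQModifier(targets=["Linear"], ignore=["lm_head"], scheme="W8A8"),
]
```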
@@ -136,7 +144,7 @@ oneshot(
     dataset=ds,
     recipe=recipe,
     num_calibration_samples=args.calib_size,
-    max_seq_length=
+    max_seq_length=8196,
 )
 
 # Save to disk compressed.
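With the recipe defined, the script hands everything to `oneshot` for calibration; this hunk only pins `max_seq_length`. A sketch of the surrounding call, assuming the conventional `save_pretrained(..., save_compressed=True)` pattern for the save step the hunk truncates:

```python
# Calibrate and quantize in one pass; argument names follow the hunk.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    num_calibration_samples=args.calib_size,
    max_seq_length=8196,  # value as committed
)

# "Save to disk compressed." -- the hunk ends before the save call, so this
# save_compressed pattern is an assumption, not the commit's exact code.
model.save_pretrained(args.quant_path, save_compressed=True)
tokenizer.save_pretrained(args.quant_path)
```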
@@ -189,16 +197,17 @@ evalplus.evaluate \
 
 | Metric | ibm-granite/granite-3.1-8b-instruct | neuralmagic-ent/granite-3.1-8b-instruct-quantized.w8a8 |
 |-----------------------------------------|:---------------------------------:|:-------------------------------------------:|
-| ARC-Challenge (Acc-Norm, 25-shot) |
-| GSM8K (Strict-Match, 5-shot) |
-| HellaSwag (Acc-Norm, 10-shot) |
-| MMLU (Acc, 5-shot) |
-| TruthfulQA (MC2, 0-shot) |
-| Winogrande (Acc, 5-shot) |
-| **Average Score** |
-| **Recovery** | **100.00** |
+| ARC-Challenge (Acc-Norm, 25-shot) | 66.81 | 66.81 |
+| GSM8K (Strict-Match, 5-shot) | 64.52 | 64.37 |
+| HellaSwag (Acc-Norm, 10-shot) | 84.18 | 83.91 |
+| MMLU (Acc, 5-shot) | 65.52 | 65.00 |
+| TruthfulQA (MC2, 0-shot) | 60.57 | 60.29 |
+| Winogrande (Acc, 5-shot) | 80.19 | 79.87 |
+| **Average Score** | **70.30** | **70.04** |
+| **Recovery** | **100.00** | **99.64** |
 
 #### HumanEval pass@1 scores
-
-
+| Metric | ibm-granite/granite-3.1-8b-instruct | neuralmagic-ent/granite-3.1-8b-instruct-quantized.w8a8 |
+|-----------------------------------------|:---------------------------------:|:-------------------------------------------:|
+| HumanEval Pass@1 | 71.00 | 72.00 |
 
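The **Recovery** row reads as the quantized score expressed as a percentage of the baseline. A quick check against the committed averages (the 0.01 gap to the reported 99.64 presumably comes from averaging unrounded per-task scores):

```python
# Recovery = quantized average / baseline average, in percent.
baseline_avg, quantized_avg = 70.30, 70.04
print(f"{100 * quantized_avg / baseline_avg:.2f}")  # 99.63
```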