nm-research committed
Commit 667fb88 · verified · 1 Parent(s): d4bd631

Update README.md

Files changed (1)
  1. README.md +71 -34
README.md CHANGED
@@ -1,6 +1,7 @@
 ---
 tags:
- - INT8
+ - w8a8
+ - int8
 - vllm
 license: apache-2.0
 license_link: https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md
@@ -66,45 +67,81 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
 This model was created with [llm-compressor](https://github.com/vllm-project/llm-compressor) by running the code snippet below.
 
 
+ ```bash
+ python quantize.py --model_path ibm-granite/granite-3.1-8b-instruct --quant_path "output_dir/granite-3.1-8b-instruct-quantized.w8a8" --calib_size 3072 --dampening_frac 0.1 --observer mse
+ ```
+
+
 ```python
+ from datasets import load_dataset
+ from transformers import AutoTokenizer
+ from llmcompressor.modifiers.quantization import GPTQModifier
+ from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot, apply
 import argparse
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from llmcompressor.modifiers.quantization import QuantizationModifier
- from llmcompressor.transformers import oneshot
- import os
-
- def main():
-     parser = argparse.ArgumentParser(description='Quantize a transformer model to INT8')
-     parser.add_argument('--model_id', type=str, required=True,
-                         help='The model ID from HuggingFace (e.g., "meta-llama/Meta-Llama-3-8B-Instruct")')
-     parser.add_argument('--save_path', type=str, default='.',
-                         help='Custom path to save the quantized model. If not provided, will use model_name-quantized.w8a8')
-     args = parser.parse_args()
-
-     # Load model
-     model = AutoModelForCausalLM.from_pretrained(
-         args.model_id, device_map="auto", torch_dtype="auto", trust_remote_code=True,
+ from compressed_tensors.quantization import QuantizationScheme, QuantizationArgs, QuantizationType, QuantizationStrategy
+
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model_path', type=str)
+ parser.add_argument('--quant_path', type=str)
+ parser.add_argument('--calib_size', type=int, default=256)
+ parser.add_argument('--dampening_frac', type=float, default=0.1)
+ parser.add_argument('--observer', type=str, default="minmax")
+ args = parser.parse_args()
+
+ model = SparseAutoModelForCausalLM.from_pretrained(
+     args.model_path,
+     device_map="auto",
+     torch_dtype="auto",
+     use_cache=False,
+     trust_remote_code=True,
+ )
+ tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+
+
+ NUM_CALIBRATION_SAMPLES = args.calib_size
+ DATASET_ID = "garage-bAInd/Open-Platypus"
+ DATASET_SPLIT = "train"
+ ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
+ ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+ def preprocess(example):
+     concat_txt = example["instruction"] + "\n" + example["output"]
+     return {"text": concat_txt}
+
+ ds = ds.map(preprocess)
+
+ def tokenize(sample):
+     return tokenizer(
+         sample["text"],
+         padding=False,
+         truncation=False,
+         add_special_tokens=True,
     )
-     tokenizer = AutoTokenizer.from_pretrained(args.model_id)
 
-     # Configure the quantization algorithm and scheme
-     recipe = QuantizationModifier(
-         targets="Linear", scheme="INT8_DYNAMIC", ignore=["lm_head"]
-     )
 
-     # Apply quantization
-     oneshot(model=model, recipe=recipe)
+ ds = ds.map(tokenize, remove_columns=ds.column_names)
 
-     save_path = os.path.join(args.save_path, args.model_id.split("/")[1] + "-quantized.w8a8")
-     os.makedirs(save_path, exist_ok=True)
-
-     # Save to disk in compressed-tensors format
-     model.save_pretrained(save_path)
-     tokenizer.save_pretrained(save_path)
-     print(f"Model and tokenizer saved to: {save_path}")
-
- if __name__ == "__main__":
-     main()
+ recipe = [
+     GPTQModifier(
+         targets=["Linear"],
+         ignore=["lm_head"],
+         scheme="W8A8",
+         dampening_frac=args.dampening_frac,
+         observer=args.observer,
+     )
+ ]
+ oneshot(
+     model=model,
+     dataset=ds,
+     recipe=recipe,
+     num_calibration_samples=args.calib_size,
+     max_seq_length=8192,
+ )
+
+ # Save to disk in compressed-tensors format.
+ model.save_pretrained(args.quant_path, save_compressed=True)
+ tokenizer.save_pretrained(args.quant_path)
 ```
 
 ## Evaluation
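
To sanity-check the checkpoint the updated snippet produces, here is a minimal vLLM offline-inference sketch. The local model path is an assumption (wherever `--quant_path` pointed when `quantize.py` ran), and the prompt is only illustrative:

```python
from vllm import LLM, SamplingParams

# Assumed output location: the --quant_path passed to quantize.py above.
MODEL_PATH = "output_dir/granite-3.1-8b-instruct-quantized.w8a8"

# vLLM reads the compressed-tensors config written by llm-compressor and
# runs the quantized Linear layers with INT8 weights and activations (W8A8).
llm = LLM(model=MODEL_PATH)
params = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(["Briefly explain W8A8 INT8 quantization."], params)
print(outputs[0].outputs[0].text)
```

The same directory should also work with the OpenAI-compatible server mentioned in the hunk header, via `vllm serve <path>`.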