Files changed (2)
  1. export_onnx.py +18 -2
  2. requirements.txt +1 -1
export_onnx.py CHANGED
@@ -1,6 +1,10 @@
 import copy
 import argparse
 from optimum.exporters.onnx import onnx_export_from_model
+# quantization imports
+from optimum.onnxruntime import ORTQuantizer
+from optimum.onnxruntime.configuration import AutoQuantizationConfig
+
 from collections import OrderedDict
 from typing import Dict
 from optimum.exporters.onnx.model_configs import XLMRobertaOnnxConfig
@@ -27,7 +31,7 @@ class BGEM3OnnxConfig(XLMRobertaOnnxConfig):
     )
 
 
-def main(output: str, opset: int, device: str, optimize: str, atol: str):
+def main(output: str, opset: int, device: str, optimize: str, atol: str, quantize: bool = False):
     model = BGEM3InferenceModel()
     bgem3_onnx_config = BGEM3OnnxConfig(model.config)
     onnx_export_from_model(
@@ -40,6 +44,12 @@ def main(output: str, opset: int, device: str, optimize: str, atol: str):
         atol=atol,
         device=device,
     )
+    if quantize:
+        quantizer = ORTQuantizer.from_pretrained(output, file_name="model.onnx")
+        qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
+        quantizer.quantize(save_dir=output, quantization_config=qconfig)
+
+
 
 
 if __name__ == "__main__":
@@ -80,6 +90,12 @@ if __name__ == "__main__":
         default=None,
         help="If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used.",
     )
+
+    parser.add_argument(
+        "--quantize",
+        action="store_true",
+        help="If specified, the model will be quantized using ONNX Runtime quantization.",
+    )
     args = parser.parse_args()
 
-    main(args.output, args.opset, args.device, args.optimize, args.atol)
+    main(args.output, args.opset, args.device, args.optimize, args.atol, args.quantize)
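The new --quantize path applies dynamic (weight-only) int8 quantization through Optimum's ORTQuantizer: AutoQuantizationConfig.avx512_vnni targets CPUs with AVX512-VNNI instructions, is_static=False skips the calibration dataset that static quantization would require, and per_channel=True gives each weight channel its own scale. Below is a minimal smoke-test sketch, not part of this change, assuming the quantizer writes model_quantized.onnx next to the original export and that the exported graph keeps the standard XLM-RoBERTa input names; the output directory and probe text are illustrative.

# Sketch: load the quantized export and run one forward pass.
# Assumptions: --output was "onnx_out", the quantized file is named
# model_quantized.onnx, and the graph takes input_ids / attention_mask.
import onnxruntime as ort
from transformers import AutoTokenizer

output_dir = "onnx_out"  # hypothetical --output value
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")
inputs = tokenizer("a quick smoke test", return_tensors="np")

session = ort.InferenceSession(f"{output_dir}/model_quantized.onnx")
outputs = session.run(None, {
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"],
})
print([o.shape for o in outputs])

Since dynamic quantization perturbs the weights, it is also worth comparing a few embeddings against the fp32 model.onnx before adopting the quantized file.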
requirements.txt CHANGED
@@ -1,7 +1,7 @@
 accelerate==0.27.2
 huggingface-hub==0.20.3
 onnx==1.15.0
-onnxruntime==1.17.0
+onnxruntime==1.16.0
 optimum==1.17.0
 torch==2.2.0
 transformers==4.37.2