https://github.com/vllm-project/llm-compressor/pull/185

One-shot FP8 dynamic quantization of a multimodal (LLaVA) model:

```python
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.sparsification import create_sparse_auto_model_class

MODEL_ID = "llava-hf/llava-1.5-7b-hf"

# Wrap the HF LLaVA class so it plugs into llm-compressor's oneshot flow.
model_class = create_sparse_auto_model_class("LlavaForConditionalGeneration")
model = model_class.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# FP8_DYNAMIC: static per-channel FP8 weights and dynamic per-token FP8
# activations, so no calibration data is required. The LM head is kept
# in full precision.
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
oneshot(model=model, recipe=recipe)

# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
```
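The saved directory can then be loaded by vLLM. Below is a minimal text-only smoke test, not part of the PR, assuming a vLLM build that can read compressed-tensors checkpoints; `llava-1.5-7b-hf-FP8-Dynamic` is the `SAVE_DIR` written above, and full multimodal serving may need additional image-input configuration:

```python
from vllm import LLM, SamplingParams

# Sketch: load the FP8 checkpoint saved above and run a text-only prompt
# through vLLM to verify the compressed weights load and generate.
llm = LLM(model="llava-1.5-7b-hf-FP8-Dynamic")
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=20))
print(outputs[0].outputs[0].text)
```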