Llama 3 70B Instruct – OmniQuant

Based on Llama 3 70B Instruct.

Quantized with OmniQuant.

Evaluation

PPL (↓)

wiki
FP 5,33
Quantized 5,90

Accuracy on English Benchmarks, % (↑)

piqa arc_easy arc_challenge boolq hellaswag winogrande mmlu_humanities mmlu_social_sciences mmlu_stem mmlu_other
FP 81,5 86,2 61,9 87,4 63,7 75,8 78,7 84,4 71,1 80,2
Quantized 80,7 85,8 61,4 87,0 62,7 73,0 75,5 81,0 68,6 77,9

Accuracy on Russian Benchmarks, % (↑)

danetqa terra rwsd muserc rucos lidirus parus rcb russe rucola
FP 88,9 88,6 75,5 81,8 82,4 70,7 77,0 35,0 63,1 34,7
Quantized 86,6 81,8 71,6 75,6 69,5 60,3 64,0 26,8 63,1 32,5

Summary

Avg acc diff on Eng, % (↑) Avg acc diff on Rus, % (↑) Occupied disk space, % (↓)
FP 0 0 100
Quantized -1,7 -6,6 28,2

Examples

Imports and Model Loading

Expand
import gc

import auto_gptq.nn_modules.qlinear.qlinear_cuda as qlinear_cuda
import auto_gptq.nn_modules.qlinear.qlinear_triton as qlinear_triton
import torch

from accelerate import (
    init_empty_weights,
    infer_auto_device_map,
    load_checkpoint_in_model,
)
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)


def get_named_linears(model):
    return {
        name: module for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
    }


def set_module(model, name, module):
    parent = model
    levels = name.split('.')

    for i in range(len(levels) - 1):
        cur_name = levels[i]

        if cur_name.isdigit():
            parent = parent[int(cur_name)]
        else:
            parent = getattr(parent, cur_name)

    setattr(parent, levels[-1], module)


def load_model(model_path):
    # Based on: https://github.com/OpenGVLab/OmniQuant/blob/main/runing_quantized_mixtral_7bx8.ipynb

    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

    if not hasattr(config, 'quantization_config'):
        raise AttributeError(
            f'No quantization info found in model config "{model_path}"'
            f' (`quantization_config` section is missing).'
        )

    wbits = config.quantization_config['bits']
    group_size = config.quantization_config['group_size']

    # We are going to init an ordinary model and then manually replace all Linears with QuantLinears
    del config.quantization_config

    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(config=config, torch_dtype=torch.float16, trust_remote_code=True)

    layers = model.model.layers

    for i in tqdm(range(len(layers))):
        layer = layers[i]
        named_linears = get_named_linears(layer)

        for name, module in named_linears.items():
            params = (
                wbits, group_size,
                module.in_features, module.out_features,
                module.bias is not None
            )

            if wbits in [2, 4]:
                q_linear = qlinear_triton.QuantLinear(*params)
            elif wbits == 3:
                q_linear = qlinear_cuda.QuantLinear(*params)
            else:
                raise NotImplementedError("Only 2, 3 and 4 bits are supported.")

            q_linear.to(next(layer.parameters()).device)
            set_module(layer, name, q_linear)

    torch.cuda.empty_cache()
    gc.collect()

    model.tie_weights()
    device_map = infer_auto_device_map(model)

    print("Loading pre-computed quantized weights...")

    load_checkpoint_in_model(
        model, checkpoint=model_path,
        device_map=device_map, offload_state_dict=True,
    )

    print("Model loaded successfully!")

    return model

Inference

model_path = "compressa-ai/Llama-3-70B-Instruct-OmniQuant"

model = load_model(model_path).cuda()
tokenizer = AutoTokenizer.from_pretrained(
    model_path, use_fast=False, trust_remote_code=True
)

# Llama 3 "specifics"
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/discussions/4
terminators = [
    tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

system_message = "You are a friendly chatbot who responds as if you are the Sandy Cheeks squirrel from the SpongeBob SquarePants cartoon."
user_message = "Do squirrels communicate with birds?"
messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.cuda() for k, v in inputs.items()}

outputs = model.generate(
    **inputs, max_new_tokens=512,
    do_sample=True, temperature=0.7, top_p=0.95,
    eos_token_id=terminators,
)

response = tokenizer.decode(outputs[0])
continuation = response.removeprefix(prompt).removesuffix(tokenizer.eos_token)

print(f'Prompt:\n{prompt}')
print(f'Continuation:\n{continuation}\n')

Inference Using Pipeline

pipe = pipeline(
    "text-generation",
    model=model, tokenizer=tokenizer,
    eos_token_id=terminators,
    max_new_tokens=512, do_sample=True,
    temperature=0.7, top_p=0.95,
    device=0,
)

prompt = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(prompt)

response = outputs[0]["generated_text"]
continuation = response.removeprefix(prompt)

print(f'Prompt:\n{prompt}')
print(f'Continuation:\n{continuation}\n')
Downloads last month
18
Safetensors
Model size
11.3B params
Tensor type
FP16
Β·
I32
Β·
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Model tree for compressa-ai/Llama-3-70B-Instruct-OmniQuant

Quantized
(3)
this model