"Open

In [None]:
# @title # 🤖 AutoQuantize

# @markdown 🔮 Created by [@zainulabideen](https://huggingface.co/abideen).

# @markdown Please add your HF token to the Secrets tab in Google Colab before running.

# @markdown Quantization formats supported: `GGUF`, `AWQ`, `EXL2`, `GPTQ`

# @markdown ---


# @markdown ### 🤗 Hugging Face Hub

# Hub repository to quantize, in "owner/name" form.
MODEL_ID = "abideen/Heimer-dpo-TinyLlama-1.1B" # @param {type:"string"}
# Last path component of MODEL_ID. `git clone` below creates a local folder
# with this name, and the later cells use it as the model directory and as
# the stem of the uploaded repository names.
MODEL_NAME = MODEL_ID.split('/')[-1]

# Download model
# The clone lands in ./<MODEL_NAME> relative to the current working directory.
# NOTE(review): re-running this cell while the folder already exists makes
# `git clone` fail; the remaining lines still execute.
!git lfs install
!git clone https://huggingface.co/{MODEL_ID}

# Hub account (user or organisation) that the quantized repos are created under.
username = "abideen" # @param {type:"string"}
# NAME of the Colab secret that stores the Hugging Face token, not the token
# itself: the quantization cells resolve it with `userdata.get(token)`.
# The empty default must be filled in before running those cells.
token = "" # @param {type:"string"}
!pip install -q huggingface_hub
from huggingface_hub import create_repo, HfApi
# NOTE(review): `runtime` is not used by any cell visible here — confirm
# whether it is still needed.
from google.colab import userdata, runtime

In [None]:
# @title # 🛸 GGUF
# @markdown ### ✨ Quantization parameters

QUANTIZATION_FORMAT = "q4_k_m" # @param {type:"string"}
QUANTIZATION_METHODS = QUANTIZATION_FORMAT.replace(" ", "").split(",")
# Install llama.cpp
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make
!pip install -r llama.cpp/requirements.txt

# Convert to fp16
fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin"
!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
 qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
 !./llama.cpp/quantize {fp16} {qtype} {method}

# Defined in the secrets tab in Google Colab
hf_token = userdata.get(token)
api = HfApi()

# Create empty repo
create_repo(
 repo_id = f"{username}/{MODEL_NAME}-GGUF",
 repo_type="model",
 exist_ok=True,
 token=hf_token
)

# Upload gguf files
api.upload_folder(
 folder_path=MODEL_NAME,
 repo_id=f"{username}/{MODEL_NAME}-GGUF",
 allow_patterns=["*.gguf","$.md"],
 token=hf_token
)

In [None]:
# @title # 🏛️ AWQ
# @markdown ### ✨ Quantization parameters

Q_GROUP_SIZE = 128 # @param {type:"integer"}
ZERO_POINT = True # @param {text:"boolean"}
W_BIT = 4 # @param {type:"integer"}
VERSION = "GEMM" # @param {type:"string"}
SAFETENSORS = True # @param {text:"boolean"}

# Install AutoAWQ
!git clone https://github.com/casper-hansen/AutoAWQ
%cd AutoAWQ
!pip install -e .
!pip install git+https://github.com/huggingface/transformers
!pip install zstandard

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer


quant_path = MODEL_NAME + "-awq"
quant_config = { "zero_point": ZERO_POINT, "q_group_size": Q_GROUP_SIZE, "w_bit": W_BIT, "version": VERSION }

# Load model
PATH = "/content/" + MODEL_NAME
model = AutoAWQForCausalLM.from_pretrained(PATH, safetensors=SAFETENSORS)
tokenizer = AutoTokenizer.from_pretrained(PATH, trust_remote_code=True)

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

# Defined in the secrets tab in Google Colab
hf_token = userdata.get(token)
api = HfApi()

# Create empty repo
create_repo(
 repo_id = f"{username}/{MODEL_NAME}-AWQ",
 repo_type="model",
 exist_ok=True,
 token=hf_token
)

# Upload awq files
api.upload_folder(
 folder_path=quant_path,
 repo_id=f"{username}/{MODEL_NAME}-AWQ",
 token=hf_token
)

In [None]:
# @title # 🔬 EXL2
# @markdown ### ✨ Quantization parameters

BPW = 5.0 # @param {type:"number"}

# Install ExLLamaV2
!git clone https://github.com/turboderp/exllamav2
!pip install -e exllamav2

!mv {MODEL_NAME} base_model
!rm base_mode/*.bin

# Download dataset
!wget https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet

# Quantize model
!mkdir quant
!python exllamav2/convert.py \
 -i base_model \
 -o quant \
 -c wikitext-test.parquet \
 -b {BPW}

# Copy files
!rm -rf quant/out_tensor
!rsync -av --exclude='*.safetensors' --exclude='.*' ./base_model/ ./quant/

# Defined in the secrets tab in Google Colab
hf_token = userdata.get(token)
api = HfApi()

# Create empty repo
create_repo(
 repo_id = f"{username}/{MODEL_NAME}-{BPW:.1f}bpw-exl2",
 repo_type="model",
 exist_ok=True,
 token=hf_token
)

# Upload exl2 files
api.upload_folder(
 folder_path=quant,
 repo_id=f"{username}/{MODEL_NAME}-{BPW:.1f}bpw-exl2",
 token=hf_token
)

In [None]:
# @title # 📝 GPTQ
# @markdown ### ✨ Quantization parameters

BITS = 4 # @param {type:"integer"}
GROUP_SIZE = 128 # @param {type:"integer"}
DAMP_PERCENT = 0.01 # @param {type:"number"}

!BUILD_CUDA_EXT=0 pip install -q auto-gptq transformers
import random
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset
import torch
from transformers import AutoTokenizer
out_dir = MODEL_ID + "-GPTQ"

# Load quantize config, model and tokenizer
quantize_config = BaseQuantizeConfig(
 bits=BITS,
 group_size=GROUP_SIZE,
 damp_percent=DAMP_PERCENT,
 desc_act=False,
)
PATH = "/content/" + MODEL_NAME
model = AutoGPTQForCausalLM.from_pretrained(PATH, quantize_config)
tokenizer = AutoTokenizer.from_pretrained(PATH)

# Load data and tokenize examples
n_samples = 1024
data = load_dataset("allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split=f"train[:{n_samples*5}]")
tokenized_data = tokenizer("\n\n".join(data['text']), return_tensors='pt')

# Format tokenized examples
examples_ids = []
for _ in range(n_samples):
 i = random.randint(0, tokenized_data.input_ids.shape[1] - tokenizer.model_max_length - 1)
 j = i + tokenizer.model_max_length
 input_ids = tokenized_data.input_ids[:, i:j]
 attention_mask = torch.ones_like(input_ids)
 examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})

# Quantize with GPTQ
model.quantize(
 examples_ids,
 batch_size=1,
 use_triton=True,
)

# Save model and tokenizer
model.save_quantized(out_dir, use_safetensors=True)
tokenizer.save_pretrained(out_dir)

# Defined in the secrets tab in Google Colab
hf_token = userdata.get(token)
api = HfApi()

# Create empty repo
create_repo(
 repo_id = f"{username}/{MODEL_NAME}-GPTQ",
 repo_type="model",
 exist_ok=True,
 token=hf_token
)

# Upload gptq files
api.upload_folder(
 folder_path=out_dir,
 repo_id=f"{username}/{MODEL_NAME}-GPTQ",
 token=hf_token
)
