{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Ku0ezvyD42ng"
   },
   "source": [
    "# Quantizing Hugging Face models to exl2\n",
    "This version of my exl2 quantize Colab creates a single quantization to download privately.\\\n",
    "To calculate an estimate for VRAM size use: [NyxKrage/LLM-Model-VRAM-Calculator](https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator)\\\n",
    "Not all models and architectures are compatible with exl2.\\\n",
    "Uploading to a private HF repo will be added in a future version."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "id": "G7zSk2LWHtPU"
   },
   "outputs": [],
   "source": [
    "#@title Download and install environment\n",
    "import os\n",
    "\n",
    "# Remember the directory the notebook started in. On a re-run the working\n",
    "# directory is already inside the clone, so a relative path would clone the\n",
    "# repository into itself.\n",
    "if \"BASE_DIR\" not in globals():\n",
    "    BASE_DIR = os.getcwd()\n",
    "REPO_DIR = os.path.join(BASE_DIR, \"exllamav2\")\n",
    "\n",
    "if not os.path.isdir(REPO_DIR):\n",
    "    !git clone https://github.com/turboderp/exllamav2 {REPO_DIR}\n",
    "%cd {REPO_DIR}\n",
    "print(\"Installing pip dependencies\")\n",
    "# %pip installs into the environment of the running kernel.\n",
    "%pip install -q -r requirements.txt\n",
    "%pip install -q huggingface_hub requests tqdm\n",
    "%pip install -q .\n",
    "# -O overwrites the script on a re-run instead of creating download-model.py.1\n",
    "!wget -O download-model.py https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/download-model.py\n",
    "# \"none\" marks that no model has been downloaded yet. Keep the current value on\n",
    "# a re-run so the download cell can still clean up earlier outputs.\n",
    "if \"modeldw\" not in globals():\n",
    "    modeldw = \"none\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "id": "8Hl3fQmRLybp"
   },
   "outputs": [],
   "source": [
    "#@title Login to HF (Required only for gated models)\n",
    "#@markdown From my Colab/Kaggle login script on [Anthonyg5005/hf-scripts](https://huggingface.co/Anthonyg5005/hf-scripts/blob/main/HF%20Login%20Snippet%20Kaggle.py)\n",
    "#import required functions\n",
    "import os\n",
    "from getpass import getpass\n",
    "from huggingface_hub import login, get_token, whoami\n",
    "\n",
    "#get token\n",
    "if os.environ.get('KAGGLE_KERNEL_RUN_TYPE', None) is not None: #check if user in kaggle\n",
    "    from kaggle_secrets import UserSecretsClient\n",
    "    from kaggle_web_client import BackendError\n",
    "    try:\n",
    "        login(UserSecretsClient().get_secret(\"HF_TOKEN\")) #login if token secret found\n",
    "    except BackendError:\n",
    "        print('''\n",
    " When using Kaggle, make sure to use the secret key HF_TOKEN.\n",
    " This will prevent the need to login every time you run the script.\n",
    " Set your secrets with the secrets add-on on the top of the screen.\n",
    " ''')\n",
    "if get_token() is not None:\n",
    "    #if the token is found then log in:\n",
    "    login(get_token())\n",
    "else:\n",
    "    #if the token is not found then prompt user to provide it:\n",
    "    #getpass hides the typed token so it is not stored in the cell output\n",
    "    login(getpass(\"API token not detected. Enter your HuggingFace (WRITE) token: \"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "id": "NI1LUMD7H-Zx"
   },
   "outputs": [],
   "source": [
    "#@title ##Choose HF model to download\n",
    "#@markdown Weights must be stored in safetensors\n",
    "# Remove the outputs of a previous run before switching models. BPW only exists\n",
    "# once the quantize cell has run, so skip the cleanup until then; -f keeps rm\n",
    "# quiet when a file was never created.\n",
    "if modeldw != \"none\" and \"BPW\" in globals():\n",
    "    !rm -f {model}-{BPW}bpw.zip\n",
    "    !rm -rf {model}-exl2-{BPW}bpw\n",
    "User = \"meta-llama\" # @param {type:\"string\"}\n",
    "Repo = \"Llama-2-7b-chat-hf\" # @param {type:\"string\"}\n",
    "# modeldw is the hub id passed to the downloader; model is the folder name\n",
    "# used under models/ by the later cells.\n",
    "modeldw = f\"{User}/{Repo}\"\n",
    "model = f\"{User}_{Repo}\"\n",
    "!python download-model.py {modeldw}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "id": "8anbEbGyNmBI"
   },
   "outputs": [],
   "source": [
    "#@title Quantize the model\n",
    "#@markdown ###Quantization time will last based on model size\n",
    "#@markdown Target bits per weight:\n",
    "BPW = \"4.125\" # @param {type:\"string\"}\n",
    "# Fail fast on a malformed value instead of after the conversion has started.\n",
    "try:\n",
    "    float(BPW)\n",
    "except ValueError:\n",
    "    raise ValueError(f\"BPW must be a number such as 4.125, got {BPW!r}\") from None\n",
    "# -p makes the cell re-runnable when the directories already exist.\n",
    "!mkdir -p {model}-exl2-{BPW}bpw-WD\n",
    "!mkdir -p {model}-exl2-{BPW}bpw\n",
    "!cp models/{model}/config.json {model}-exl2-{BPW}bpw-WD\n",
    "#@markdown Calibrate with custom dataset, not recommended: (not finished on this version)\n",
    "Calibrate = False # @param {type:\"boolean\"}\n",
    "# Custom calibration is not finished in this version, so the form value is\n",
    "# deliberately overridden here.\n",
    "Calibrate = False\n",
    "#@markdown Calibration dataset, check above (must be parquet file):\n",
    "dataset = \"wikitext\" # @param {type:\"string\"}\n",
    "# -o is the working directory, -cf the directory for the finished model.\n",
    "quant = f\"convert.py -i models/{model} -o {model}-exl2-{BPW}bpw-WD -cf {model}-exl2-{BPW}bpw -b {BPW}\"\n",
    "if Calibrate:\n",
    "    quant += f\" -c {dataset}\"\n",
    "!python {quant}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "id": "XORLS2uPrbma"
   },
   "outputs": [],
   "source": [
    "#@title Zip and download the model\n",
    "import os\n",
    "\n",
    "# The working directory and the original weights are no longer needed; remove\n",
    "# them first to free disk space for the archive.\n",
    "!rm -rf {model}-exl2-{BPW}bpw-WD\n",
    "!rm -rf models/{model}\n",
    "print(\"Zipping. May take a few minutes\")\n",
    "!zip -r {model}-{BPW}bpw.zip {model}-exl2-{BPW}bpw\n",
    "archive = f\"{model}-{BPW}bpw.zip\"\n",
    "try:\n",
    "    from google.colab import files\n",
    "except ImportError:\n",
    "    # Outside Colab (e.g. Kaggle) there is no browser download helper.\n",
    "    print(f\"Not running in Colab; the archive is at {os.path.abspath(archive)}\")\n",
    "else:\n",
    "    files.download(archive)\n",
    "    print(\"Colab download speeds very slow so download will take a while\")"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}