{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "Ku0ezvyD42ng"
},
"source": [
"#Quantizing huggingface models to exl2\n",
"This version of my exl2 quantize colab creates a single quantizaion to download privatly.\\\n",
"To calculate an estimate for VRAM size use: [NyxKrage/LLM-Model-VRAM-Calculator](https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator)\\\n",
"Not all models and architectures are compatible with exl2.\\\n",
"Will upload to private hf repo in future."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "G7zSk2LWHtPU"
},
"outputs": [],
"source": [
"#@title Download and install environment\n",
"!git clone https://github.com/turboderp/exllamav2\n",
"%cd exllamav2\n",
"print(\"Installing pip dependencies\")\n",
"!pip install -q -r requirements.txt\n",
"!pip install -q huggingface_hub requests tqdm\n",
"!pip install . -q\n",
"!wget https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/download-model.py\n",
"modeldw = \"none\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "8Hl3fQmRLybp"
},
"outputs": [],
"source": [
"#@title Login to HF (Required only for gated models)\n",
"#@markdown From my Colab/Kaggle login script on [Anthonyg5005/hf-scripts](https://huggingface.co/Anthonyg5005/hf-scripts/blob/main/HF%20Login%20Snippet%20Kaggle.py)\n",
"#import required functions\n",
"import os\n",
"from huggingface_hub import login, get_token, whoami\n",
"\n",
"#get token\n",
"if os.environ.get('KAGGLE_KERNEL_RUN_TYPE', None) is not None: #check if user in kaggle\n",
" from kaggle_secrets import UserSecretsClient\n",
" from kaggle_web_client import BackendError\n",
" try:\n",
" login(UserSecretsClient().get_secret(\"HF_TOKEN\")) #login if token secret found\n",
" except BackendError:\n",
" print('''\n",
" When using Kaggle, make sure to use the secret key HF_TOKEN.\n",
" This will prevent the need to login every time you run the script.\n",
" Set your secrets with the secrets add-on on the top of the screen.\n",
" ''')\n",
"if get_token() is not None:\n",
" #if the token is found then log in:\n",
" login(get_token())\n",
"else:\n",
" #if the token is not found then prompt user to provide it:\n",
" login(input(\"API token not detected. Enter your HuggingFace (WRITE) token: \"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "NI1LUMD7H-Zx"
},
"outputs": [],
"source": [
"#@title ##Choose HF model to download\n",
"#@markdown Weights must be stored in safetensors\n",
"if modeldw != \"none\":\n",
" !rm {model}-{BPW}bpw.zip\n",
" !rm -r {model}-exl2-{BPW}bpw\n",
"User = \"meta-llama\" # @param {type:\"string\"}\n",
"Repo = \"Llama-2-7b-chat-hf\" # @param {type:\"string\"}\n",
"modeldw = f\"{User}/{Repo}\"\n",
"model = f\"{User}_{Repo}\"\n",
"!python download-model.py {modeldw}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "8anbEbGyNmBI"
},
"outputs": [],
"source": [
"#@title Quantize the model\n",
"#@markdown ###Quantization time will last based on model size\n",
"#@markdown Target bits per weight:\n",
"BPW = \"4.125\" # @param {type:\"string\"}\n",
"!mkdir {model}-exl2-{BPW}bpw-WD\n",
"!mkdir {model}-exl2-{BPW}bpw\n",
"!cp models/{model}/config.json {model}-exl2-{BPW}bpw-WD\n",
"#@markdown Calibrate with dataset, may improve model output: (NOT WORKING YET)\n",
"Calibrate = False # @param {type:\"boolean\"}\n",
"#@markdown Calibration dataset, check above (must be parquet file):\n",
"dataset = \"wikitext\" # @param {type:\"string\"}\n",
"if Calibrate == True:\n",
" quant = f\"convert.py -i models/{model} -o {model}-exl2-{BPW}bpw-WD -cf {model}-exl2-{BPW}bpw -c {dataset} -b {BPW}\"\n",
"else:\n",
" quant = f\"convert.py -i models/{model} -o {model}-exl2-{BPW}bpw-WD -cf {model}-exl2-{BPW}bpw -b {BPW}\"\n",
"!python {quant}"
]
},
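{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "TestLoadExl2"
},
"outputs": [],
"source": [
"#@title (Optional) Test-load the quantized model\n",
"#@markdown A minimal sanity check, assuming the exllamav2 example API (class and method names may differ between versions): loads the fresh quant and generates a few tokens.\n",
"from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer\n",
"from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler\n",
"\n",
"config = ExLlamaV2Config()\n",
"config.model_dir = f\"{model}-exl2-{BPW}bpw\" #directory written by -cf above\n",
"config.prepare()\n",
"exl2_model = ExLlamaV2(config) #new name so the 'model' string used by the other cells stays intact\n",
"cache = ExLlamaV2Cache(exl2_model, lazy = True)\n",
"exl2_model.load_autosplit(cache)\n",
"tokenizer = ExLlamaV2Tokenizer(config)\n",
"generator = ExLlamaV2BaseGenerator(exl2_model, cache, tokenizer)\n",
"settings = ExLlamaV2Sampler.Settings()\n",
"print(generator.generate_simple(\"Hello, my name is\", settings, 32))"
]
},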
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "XORLS2uPrbma"
},
"outputs": [],
"source": [
"#@title Zip and download the model\n",
"!rm -r {model}-exl2-{BPW}bpw-WD\n",
"!rm -r models/{model}\n",
"print(\"Zipping. May take a few minutes\")\n",
"!zip -r {model}-{BPW}bpw.zip {model}-exl2-{BPW}bpw\n",
"from google.colab import files\n",
"files.download(f\"{model}-{BPW}bpw.zip\")\n",
"print(\"Colab download speeds very slow so download will take a while\")"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}