{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Ku0ezvyD42ng"
      },
      "source": [
        "#Quantizing huggingface models to exl2\n",
        "This version of my exl2 quantize colab creates a single quantizaion to download privatly.\\\n",
        "To calculate an estimate for VRAM size use: [NyxKrage/LLM-Model-VRAM-Calculator](https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator)\\\n",
        "Not all models and architectures are compatible with exl2.\\\n",
        "Will upload to private hf repo in future."
      ]
    },
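    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "vramEstimate01"
      },
      "outputs": [],
      "source": [
        "#@title (Optional) Rough size estimate for a target BPW\n",
        "#A minimal back-of-envelope sketch, not part of the quantization flow:\n",
        "#quantized weights take roughly parameters * BPW / 8 bytes; KV cache and\n",
        "#activations are extra, so the calculator linked above is more accurate.\n",
        "#Both inputs below are example values, not read from any model.\n",
        "params_billion = 7.0 # @param {type:\"number\"}\n",
        "bpw_estimate = 4.125 # @param {type:\"number\"}\n",
        "est_gib = params_billion * 1e9 * bpw_estimate / 8 / 1024**3\n",
        "print(f\"~{est_gib:.2f} GiB for weights alone\")"
      ]
    },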
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "G7zSk2LWHtPU"
      },
      "outputs": [],
      "source": [
        "#@title Download and install environment\n",
        "!git clone https://github.com/turboderp/exllamav2\n",
        "%cd exllamav2\n",
        "print(\"Installing pip dependencies\")\n",
        "!pip install -q -r requirements.txt\n",
        "!pip install -q huggingface_hub requests tqdm\n",
        "!pip install . -q\n",
        "!wget https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/download-model.py\n",
        "modeldw = \"none\""
      ]
    },
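    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "chkInstall01"
      },
      "outputs": [],
      "source": [
        "#@title (Optional) Verify the exllamav2 install\n",
        "#A minimal sanity check, assuming the install cell above was run:\n",
        "#importlib.metadata is standard library, so this adds no dependency.\n",
        "from importlib.metadata import version, PackageNotFoundError\n",
        "try:\n",
        "    print(f\"exllamav2 {version('exllamav2')} is installed\")\n",
        "except PackageNotFoundError:\n",
        "    print(\"exllamav2 not found, re-run the install cell\")"
      ]
    },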
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "8Hl3fQmRLybp"
      },
      "outputs": [],
      "source": [
        "#@title Login to HF (Required only for gated models)\n",
        "#@markdown From my Colab/Kaggle login script on [Anthonyg5005/hf-scripts](https://huggingface.co/Anthonyg5005/hf-scripts/blob/main/HF%20Login%20Snippet%20Kaggle.py)\n",
        "#import required functions\n",
        "import os\n",
        "from huggingface_hub import login, get_token, whoami\n",
        "\n",
        "#get token\n",
        "if os.environ.get('KAGGLE_KERNEL_RUN_TYPE', None) is not None: #check if user in kaggle\n",
        "    from kaggle_secrets import UserSecretsClient\n",
        "    from kaggle_web_client import BackendError\n",
        "    try:\n",
        "        login(UserSecretsClient().get_secret(\"HF_TOKEN\")) #login if token secret found\n",
        "    except BackendError:\n",
        "        print('''\n",
        "                      When using Kaggle, make sure to use the secret key HF_TOKEN.\n",
        "                   This will prevent the need to login every time you run the script.\n",
        "                   Set your secrets with the secrets add-on on the top of the screen.\n",
        "             ''')\n",
        "if get_token() is not None:\n",
        "    #if the token is found then log in:\n",
        "    login(get_token())\n",
        "else:\n",
        "    #if the token is not found then prompt user to provide it:\n",
        "    login(input(\"API token not detected. Enter your HuggingFace (WRITE) token: \"))"
      ]
    },
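    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "chkLogin01"
      },
      "outputs": [],
      "source": [
        "#@title (Optional) Confirm the active HF account\n",
        "#A minimal check, assuming the login cell above stored a token:\n",
        "#whoami() raises if no valid token is available.\n",
        "from huggingface_hub import whoami\n",
        "try:\n",
        "    print(f\"Logged in as: {whoami()['name']}\")\n",
        "except Exception:\n",
        "    print(\"No valid token found, run the login cell first\")"
      ]
    },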
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "NI1LUMD7H-Zx"
      },
      "outputs": [],
      "source": [
        "#@title ##Choose HF model to download\n",
        "#@markdown Weights must be stored in safetensors\n",
        "if modeldw != \"none\":\n",
        "    !rm {model}-{BPW}bpw.zip\n",
        "    !rm -r {model}-exl2-{BPW}bpw\n",
        "User = \"meta-llama\" # @param {type:\"string\"}\n",
        "Repo = \"Llama-2-7b-chat-hf\" # @param {type:\"string\"}\n",
        "modeldw = f\"{User}/{Repo}\"\n",
        "model = f\"{User}_{Repo}\"\n",
        "!python download-model.py {modeldw}"
      ]
    },
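    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "chkSafetensors01"
      },
      "outputs": [],
      "source": [
        "#@title (Optional) Check the download is in safetensors\n",
        "#A minimal sketch: convert.py expects .safetensors weight shards, so this\n",
        "#just lists what the download cell above fetched for the chosen model.\n",
        "import glob\n",
        "shards = glob.glob(f\"models/{model}/*.safetensors\")\n",
        "if shards:\n",
        "    print(f\"Found {len(shards)} safetensors shard(s)\")\n",
        "else:\n",
        "    print(\"No .safetensors files found, this repo may be incompatible\")"
      ]
    },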
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "8anbEbGyNmBI"
      },
      "outputs": [],
      "source": [
        "#@title Quantize the model\n",
        "#@markdown ###Quantization time will last based on model size\n",
        "#@markdown Target bits per weight:\n",
        "BPW = \"4.125\" # @param {type:\"string\"}\n",
        "!mkdir {model}-exl2-{BPW}bpw-WD\n",
        "!mkdir {model}-exl2-{BPW}bpw\n",
        "!cp models/{model}/config.json {model}-exl2-{BPW}bpw-WD\n",
        "#@markdown Calibrate with dataset, may improve model output: (NOT WORKING YET)\n",
        "Calibrate = False # @param {type:\"boolean\"}\n",
        "#@markdown Calibration dataset, check above (must be parquet file):\n",
        "dataset = \"wikitext\" # @param {type:\"string\"}\n",
        "if Calibrate == True:\n",
        "    quant = f\"convert.py -i models/{model} -o {model}-exl2-{BPW}bpw-WD -cf {model}-exl2-{BPW}bpw -c {dataset} -b {BPW}\"\n",
        "else:\n",
        "    quant = f\"convert.py -i models/{model} -o {model}-exl2-{BPW}bpw-WD -cf {model}-exl2-{BPW}bpw -b {BPW}\"\n",
        "!python {quant}"
      ]
    },
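    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "calibParquet01"
      },
      "outputs": [],
      "source": [
        "#@title (Optional) Fetch a parquet file for calibration\n",
        "#A minimal sketch for when calibration works: convert.py's -c flag takes a\n",
        "#local parquet file, which can be pulled from a HF dataset repo. The repo id\n",
        "#and filename below are hypothetical examples, check the dataset page on HF\n",
        "#for the real parquet path before using this.\n",
        "from huggingface_hub import hf_hub_download\n",
        "parquet_path = hf_hub_download(\n",
        "    repo_id=\"wikitext\", #assumed dataset repo, replace as needed\n",
        "    filename=\"wikitext-2-raw-v1/train-00000-of-00001.parquet\", #hypothetical\n",
        "    repo_type=\"dataset\",\n",
        ")\n",
        "print(parquet_path)"
      ]
    },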
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "XORLS2uPrbma"
      },
      "outputs": [],
      "source": [
        "#@title Zip and download the model\n",
        "!rm -r {model}-exl2-{BPW}bpw-WD\n",
        "!rm -r models/{model}\n",
        "print(\"Zipping. May take a few minutes\")\n",
        "!zip -r {model}-{BPW}bpw.zip {model}-exl2-{BPW}bpw\n",
        "from google.colab import files\n",
        "files.download(f\"{model}-{BPW}bpw.zip\")\n",
        "print(\"Colab download speeds very slow so download will take a while\")"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}