{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# EXL2 auto-quant\n",
        "\n",
        "Run the cells top to bottom:\n",
        "\n",
        "1. **Setup environment** - clones exllamav2, fetches the download/convert helper scripts and installs the Python dependencies.\n",
        "2. **Login to Huggingface** - logs in and checks that the token has the `write` role.\n",
        "3. **Start quant** - downloads the model, converts it to safetensors if needed, quantizes it once per requested BPW and uploads each quant to its own branch of a private `<user>/<model>-exl2` repo."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "7PhL3HkpFeU7"
      },
      "outputs": [],
      "source": [
        "#@title Setup environment\n",
        "#@markdown Takes about 15 minutes to finish\n",
        "# download exllamav2 and the download/convert helper scripts (credit to turboderp and oobabooga)\n",
        "!git clone https://github.com/turboderp/exllamav2\n",
        "!wget https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/convert-to-safetensors.py\n",
        "!wget https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/download-model.py\n",
        "# %pip (rather than !pip) installs into the environment of the running kernel\n",
        "%pip install -r exllamav2/requirements.txt\n",
        "%pip install huggingface-hub transformers accelerate --upgrade\n",
        "%pip install ./exllamav2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "CXbUzOmNHyff"
      },
      "outputs": [],
      "source": [
        "#@title Login to Huggingface - Required\n",
        "#import required functions\n",
        "import os\n",
        "import sys\n",
        "from getpass import getpass\n",
        "from huggingface_hub import login, get_token, whoami\n",
        "\n",
        "#detect the hosting platform once\n",
        "in_kaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', None) is not None\n",
        "in_colab = os.environ.get('COLAB_BACKEND_VERSION', None) is not None\n",
        "\n",
        "#get token\n",
        "if in_kaggle:\n",
        "    from kaggle_secrets import UserSecretsClient # type: ignore\n",
        "    from kaggle_web_client import BackendError # type: ignore\n",
        "    try:\n",
        "        login(UserSecretsClient().get_secret(\"HF_TOKEN\")) #login if token secret found\n",
        "    except BackendError:\n",
        "        print('''\n",
        "        When using Kaggle, make sure to use the secret key HF_TOKEN with a 'WRITE' token.\n",
        "        This will prevent the need to login every time you run the script.\n",
        "        Set your secrets with the secrets add-on on the top of the screen.\n",
        "        ''')\n",
        "\n",
        "token = get_token()\n",
        "if token is not None:\n",
        "    #if the token is found then log in:\n",
        "    login(token)\n",
        "else:\n",
        "    #if the token is not found then prompt for it; getpass keeps the token out of the saved cell output\n",
        "    login(getpass(\"API token not detected. Enter your HuggingFace (WRITE) token: \"))\n",
        "\n",
        "#if the token is read only then prompt user to provide a write token (Only required if user needs a WRITE token, remove if READ is enough):\n",
        "while whoami().get('auth', {}).get('accessToken', {}).get('role', None) != 'write':\n",
        "    if os.environ.get('HF_TOKEN', None) is not None: #if environ finds HF_TOKEN as read-only then display following text and exit:\n",
        "        print('''\n",
        "        You have the environment variable HF_TOKEN set.\n",
        "        You cannot log in.\n",
        "        Either set the environment variable to a 'WRITE' token or remove it.\n",
        "        ''')\n",
        "        sys.exit(\"Exiting...\")\n",
        "    if in_colab:\n",
        "        print('''\n",
        "        Your Colab secret key is read-only\n",
        "        Please switch your key to 'write' or disable notebook access on the left.\n",
        "        ''')\n",
        "        sys.exit(\"Stuck in a loop, exiting...\")\n",
        "    elif in_kaggle:\n",
        "        print('''\n",
        "        Your Kaggle secret key is read-only\n",
        "        Please switch your key to 'write' or unattach from notebook in add-ons at the top.\n",
        "        Having a read-only key attached will require login every time.\n",
        "        ''')\n",
        "    print(\"You do not have write access to this repository. Please use a valid token with (WRITE) access.\")\n",
        "    login(getpass(\"Enter your HuggingFace (WRITE) token: \"))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "dxKEA7obHLoO"
      },
      "outputs": [],
      "source": [
        "#@title Start quant\n",
        "#@markdown ### Using subprocess to execute scripts doesn't output on Colab. If something seems frozen, please wait. Any detected errors will automatically stop Colab\n",
        "#import required modules (whoami is imported here too so this cell does not depend on the login cell's namespace)\n",
        "from huggingface_hub import repo_exists, upload_folder, create_repo, upload_file, create_branch, whoami\n",
        "import os\n",
        "import sys\n",
        "import shutil\n",
        "import subprocess\n",
        "import glob\n",
        "\n",
        "#pick the python executable used to run the helper scripts\n",
        "if os.name == 'nt':\n",
        "    pyt = os.path.join('venv', 'scripts', 'python.exe')\n",
        "elif os.name == 'posix':\n",
        "    pyt = 'python'\n",
        "else:\n",
        "    sys.exit('This script is not compatible with your machine.')\n",
        "\n",
        "#get original model repo url\n",
        "#@markdown Enter unquantized model repository (User/Repo):\n",
        "repo_url = \"mistralai/Mistral-7B-Instruct-v0.2\" # @param {type:\"string\"}\n",
        "\n",
        "#look for repo\n",
        "if not repo_exists(repo_url):\n",
        "    print(f\"Model repo doesn't exist at https://huggingface.co/{repo_url}\")\n",
        "    sys.exit(\"Exiting...\")\n",
        "model = repo_url.replace(\"/\", \"_\")\n",
        "modelname = repo_url.split(\"/\")[1]\n",
        "print(\"\\n\\n\")\n",
        "\n",
        "#ask for number of quants\n",
        "#@markdown Enter the number of quants you want to create:\n",
        "quant_amount = \"5\" # @param {type:\"string\"}\n",
        "qmount = int(quant_amount)\n",
        "\n",
        "#save bpw values\n",
        "#@markdown You will be asked the BPW values after running this section.\n",
        "print(f\"Type the BPW for the following {qmount} quants. Recommend staying over 2.4 BPW. Use the vram calculator to find the best BPW values: https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator\")\n",
        "bpwvalue = []\n",
        "for i in range(1, qmount + 1):\n",
        "    while True: #ask again instead of crashing when the input is not a number\n",
        "        try:\n",
        "            bpwvalue.append(float(input(f\"Enter BPW for quant {i} (2.00-8.00): \"))) #convert input to float for proper sorting\n",
        "            break\n",
        "        except ValueError:\n",
        "            print(\"That is not a number, please try again.\")\n",
        "print(\"\\n\\n\")\n",
        "\n",
        "#sort from smallest to largest and drop duplicates so no BPW is quantized twice\n",
        "bpwvalue = sorted(set(bpwvalue))\n",
        "\n",
        "#local paths used below\n",
        "model_dir = os.path.join(\"models\", model)\n",
        "converted_marker = os.path.join(model_dir, \"converted-st\")\n",
        "measure_dir = f\"{model}-measure\"\n",
        "measurement = os.path.join(measure_dir, \"measurement.json\")\n",
        "\n",
        "if not os.path.exists(converted_marker): #check if model was converted to safetensors, skip download if it was\n",
        "    print(\"Starting download...\")\n",
        "    #argument lists (no shell) so the repo name is never interpreted by a shell\n",
        "    result = subprocess.run([pyt, \"download-model.py\", repo_url]) #download model from hf (Credit to oobabooga for this script)\n",
        "    if result.returncode != 0:\n",
        "        print(\"Download failed.\")\n",
        "        sys.exit(\"Exiting...\")\n",
        "    print(\"Download finished\\n\\n\")\n",
        "\n",
        "if not glob.glob(os.path.join(model_dir, \"*.safetensors\")): #check if safetensors model exists, if not try converting\n",
        "    print(\"Converting weights to safetensors, please wait...\")\n",
        "    st_dir = f\"{model_dir}-st\"\n",
        "    result = subprocess.run([pyt, \"convert-to-safetensors.py\", model_dir, \"--output\", st_dir, \"--max-shard-size\", \"1GB\", \"--bf16\"]) #convert to safetensors (Credit to oobabooga for this script as well)\n",
        "    if result.returncode != 0:\n",
        "        print(\"Converting failed. Please look for a safetensors/bin model.\")\n",
        "        sys.exit(\"Exiting...\")\n",
        "    #replace the original weights with the converted ones; shutil raises if either step fails\n",
        "    shutil.rmtree(model_dir)\n",
        "    shutil.move(st_dir, model_dir)\n",
        "    open(converted_marker, 'w').close() #marker file: lets a later run skip the download\n",
        "    print(\"Finished converting\")\n",
        "    print(\"\\n\\n\")\n",
        "\n",
        "#resolve the target repo once instead of calling whoami() for every message\n",
        "username = whoami().get('name', None)\n",
        "quant_repo = f\"{username}/{modelname}-exl2\"\n",
        "quant_repo_url = f\"https://huggingface.co/{quant_repo}\"\n",
        "\n",
        "#create new repo if one doesn't already exist\n",
        "if not repo_exists(quant_repo):\n",
        "    print(\"Creating model repository...\")\n",
        "    create_repo(quant_repo, private=True)\n",
        "    print(f\"Created repo at {quant_repo_url}\") #notify user of repo creation\n",
        "\n",
        "    #create the markdown file\n",
        "    print(\"Writing model card...\")\n",
        "    with open('./README.md', 'w') as file:\n",
        "        file.write(f\"# Exl2 quants for [{modelname}](https://huggingface.co/{repo_url})\\n\\n\")\n",
        "        file.write(\"## Automatically quantized using the auto quant from [hf-scripts](https://huggingface.co/anthonyg5005/hf-scripts)\\n\\n\")\n",
        "        file.write(f\"Would recommend {username} to change up this README to include more info.\\n\\n\")\n",
        "        file.write(\"### BPW:\\n\\n\")\n",
        "        for bpw in bpwvalue:\n",
        "            file.write(f\"[{bpw}]({quant_repo_url}/tree/{bpw}bpw)\\n\\n\")\n",
        "    print(\"Created README.md\")\n",
        "\n",
        "    upload_file(path_or_fileobj=\"README.md\", path_in_repo=\"README.md\", repo_id=quant_repo, commit_message=\"Add temp README\") #upload md file\n",
        "    print(\"Uploaded README.md to main\")\n",
        "else:\n",
        "    print(f\"WARNING: repo already exists at {quant_repo_url}\")\n",
        "\n",
        "#start converting\n",
        "for bpw in bpwvalue:\n",
        "    work_dir = f\"{model}-exl2-{bpw}bpw-WD\"\n",
        "    out_dir = f\"{model}-exl2-{bpw}bpw\"\n",
        "    have_measurement = os.path.exists(measurement) #reuse the measurement from an earlier quant if there is one\n",
        "    measure_args = [\"-m\", measurement] if have_measurement else []\n",
        "    print(f\"Starting quantization for BPW {bpw}. Please wait, may take hours\")\n",
        "    os.makedirs(work_dir, exist_ok=True) #create working directory\n",
        "    os.makedirs(out_dir, exist_ok=True) #create compile full directory\n",
        "    shutil.copy(os.path.join(model_dir, \"config.json\"), work_dir) #copy config to working directory\n",
        "    #more settings exist in the convert.py script, to view them go to docs/convert.md or https://github.com/turboderp/exllamav2/blob/master/doc/convert.md\n",
        "    result = subprocess.run([pyt, os.path.join(\"exllamav2\", \"convert.py\"), \"-i\", model_dir, \"-o\", work_dir, \"-cf\", out_dir, \"-b\", str(bpw), *measure_args, \"-ss\", \"2048\"]) #run quantization and exit if failed (Credit to turbo for his dedication to exl2)\n",
        "    if result.returncode != 0:\n",
        "        print(\"Quantization failed.\")\n",
        "        sys.exit(\"Exiting...\")\n",
        "    print(f\"Done quantizing BPW {bpw}. Starting upload\")\n",
        "    if not have_measurement:\n",
        "        os.makedirs(measure_dir, exist_ok=True) #create measurement directory\n",
        "        shutil.copy(os.path.join(work_dir, \"measurement.json\"), measure_dir) #copy measurement to measure directory\n",
        "        open(os.path.join(measure_dir, \"Delete folder when no more quants are needed from this model\"), 'w').close()\n",
        "    #exist_ok tolerates an existing branch while still surfacing real errors (auth, network, ...)\n",
        "    create_branch(quant_repo, branch=f\"{bpw}bpw\", exist_ok=True) #create branch\n",
        "    upload_folder(folder_path=out_dir, repo_id=quant_repo, commit_message=f\"Add quant for BPW {bpw}\", revision=f\"{bpw}bpw\") #upload quantized model\n",
        "    shutil.rmtree(work_dir) #remove working directory\n",
        "    shutil.rmtree(out_dir) #remove compile directory\n",
        "\n",
        "#upload measurement.json to main (skipped when no quant was produced, since the file does not exist then)\n",
        "if os.path.exists(measurement):\n",
        "    upload_file(path_or_fileobj=measurement, path_in_repo=\"measurement.json\", repo_id=quant_repo, commit_message=\"Add measurement.json\")\n",
        "\n",
        "print(f\"Quants available at {quant_repo_url}\")\n",
        "print(f\"\\nRepo is private, go to {quant_repo_url}/settings to make public if you'd like.\")\n"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}