yangzzay commited on
Commit
02a7b98
Β·
1 Parent(s): d618f04

Upload folder using huggingface_hub

Browse files
.env ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL_PATH = "FlagAlpha/Llama2-Chinese-7b-Chat"
2
+ # if MODEL_PATH is "", default llama.cpp/gptq models
3
+ # will be downloaded to: ./models
4
+
5
+ # Example ggml path:
6
+ # MODEL_PATH = "./models/llama-2-7b-chat.ggmlv3.q4_0.bin"
7
+
8
+ # options: llama.cpp, gptq, transformers
9
+ BACKEND_TYPE = "transformers"
10
+
11
+ # only for transformers bitsandbytes 8 bit
12
+ LOAD_IN_8BIT = False
13
+
14
+ MAX_MAX_NEW_TOKENS = 2048
15
+ DEFAULT_MAX_NEW_TOKENS = 1024
16
+ MAX_INPUT_TOKEN_LENGTH = 4000
17
+
18
+ DEFAULT_SYSTEM_PROMPT = ""
.github/workflows/branch.yml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Push
2
+ on: [push]
3
+
4
+ jobs:
5
+ test:
6
+ strategy:
7
+ fail-fast: false
8
+ matrix:
9
+ python-version: ['3.10']
10
+ poetry-version: ['1.5.1']
11
+ os: [ubuntu-latest]
12
+ runs-on: ${{ matrix.os }}
13
+ steps:
14
+ - uses: actions/checkout@v3
15
+ - uses: actions/setup-python@v3
16
+ with:
17
+ python-version: ${{ matrix.python-version }}
18
+ - name: Run image
19
+ uses: abatilo/actions-poetry@v2.1.4
20
+ with:
21
+ poetry-version: ${{ matrix.poetry-version }}
22
+ - name: Install dependencies
23
+ run: poetry install
24
+ - name: Run tests
25
+ run: poetry run pytest
26
+ - name: Upload coverage reports to Codecov
27
+ uses: codecov/codecov-action@v3
28
+ env:
29
+ CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
30
+ # - name: Upload coverage to Codecov
31
+ # uses: codecov/codecov-action@v2
32
+ code-quality:
33
+ strategy:
34
+ fail-fast: false
35
+ matrix:
36
+ python-version: ['3.10']
37
+ poetry-version: ['1.5.1']
38
+ os: [ubuntu-latest]
39
+ runs-on: ${{ matrix.os }}
40
+ steps:
41
+ - uses: actions/checkout@v3
42
+ - uses: actions/setup-python@v3
43
+ with:
44
+ python-version: ${{ matrix.python-version }}
45
+ - name: Python Poetry Action
46
+ uses: abatilo/actions-poetry@v2.1.6
47
+ with:
48
+ poetry-version: ${{ matrix.poetry-version }}
49
+ - name: Install dependencies
50
+ run: poetry install
51
+ - name: Run black
52
+ run: poetry run black . --check
53
+ # - name: Run isort
54
+ # run: poetry run isort . --check-only --profile black
55
+ # - name: Run flake8
56
+ # run: poetry run flake8 .
57
+ # - name: Run bandit
58
+ # run: poetry run bandit .
59
+ # - name: Run saftey
60
+ # run: poetry run safety check
.github/workflows/release.yml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Release
2
+ on:
3
+ release:
4
+ types:
5
+ - created
6
+
7
+ jobs:
8
+ publish:
9
+ strategy:
10
+ fail-fast: false
11
+ matrix:
12
+ python-version: ['3.10']
13
+ poetry-version: ['1.5.1']
14
+ os: [ubuntu-latest]
15
+ runs-on: ${{ matrix.os }}
16
+ steps:
17
+ - uses: actions/checkout@v3
18
+ - uses: actions/setup-python@v3
19
+ with:
20
+ python-version: ${{ matrix.python-version }}
21
+ - name: Run image
22
+ uses: abatilo/actions-poetry@v2.1.4
23
+ with:
24
+ poetry-version: ${{ matrix.poetry-version }}
25
+ - name: Publish
26
+ env:
27
+ PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
28
+ run: |
29
+ poetry config pypi-token.pypi $PYPI_TOKEN
30
+ poetry publish --build
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ models
2
+ dist
3
+
4
+ .DS_Store
5
+ .vscode
6
+
7
+ __pycache__
8
+ gradio_cached_examples
9
+
10
+ .pytest_cache
CONTRIBUTING.md ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to [llama2-webui](https://github.com/liltom-eth/llama2-webui)
2
+
3
+ We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's:
4
+
5
+ - Reporting a bug
6
+ - Proposing new features
7
+ - Discussing the current state of the code
8
+ - Updating README.md
9
+ - Submitting a PR
10
+
11
+ ## Using GitHub's [issues](https://github.com/liltom-eth/llama2-webui/issues)
12
+
13
+ We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/liltom-eth/llama2-webui/issues). It's that easy!
14
+
15
+ Thanks to **[jlb1504](https://github.com/jlb1504)** for reporting the [first issue](https://github.com/liltom-eth/llama2-webui/issues/1)!
16
+
17
+ **Great Bug Reports** tend to have:
18
+
19
+ - A quick summary and/or background
20
+ - Steps to reproduce
21
+ - Be specific!
22
+ - Give a sample code if you can.
23
+ - What you expected would happen
24
+ - What actually happens
25
+ - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
26
+
27
+ Proposing new features is also welcome.
28
+
29
+ ## Pull Request
30
+
31
+ All pull requests are welcome. For example, you update the `README.md` to help users to better understand the usage.
32
+
33
+ ### Clone the repository
34
+
35
+ 1. Create a user account on GitHub if you do not already have one.
36
+
37
+ 2. Fork the project [repository](https://github.com/liltom-eth/llama2-webui): click on the *Fork* button near the top of the page. This creates a copy of the code under your account on GitHub.
38
+
39
+ 3. Clone this copy to your local disk:
40
+
41
+ ```
42
+ git clone git@github.com:liltom-eth/llama2-webui.git
43
+ cd llama2-webui
44
+ ```
45
+
46
+ ### Implement your changes
47
+
48
+ 1. Create a branch to hold your changes:
49
+
50
+ ```
51
+ git checkout -b my-feature
52
+ ```
53
+
54
+ and start making changes. Never work on the main branch!
55
+
56
+ 2. Start your work on this branch.
57
+
58
+ 3. When you’re done editing, do:
59
+
60
+ ```
61
+ git add <MODIFIED FILES>
62
+ git commit
63
+ ```
64
+
65
+ to record your changes in [git](https://git-scm.com/).
66
+
67
+ ### Submit your contribution
68
+
69
+ 1. If everything works fine, push your local branch to the remote server with:
70
+
71
+ ```
72
+ git push -u origin my-feature
73
+ ```
74
+
75
+ 2. Go to the web page of your fork and click "Create pull request" to send your changes for review.
76
+
77
+ ```{todo}
78
+ Find more detailed information in [creating a PR]. You might also want to open
79
+ the PR as a draft first and mark it as ready for review after the feedbacks
80
+ from the continuous integration (CI) system or any required fixes.
81
+ ```
82
+
83
+ ## License
84
+
85
+ By contributing, you agree that your contributions will be licensed under its MIT License.
86
+
87
+ ## Questions?
88
+
89
+ Email us at [liltom.eth@gmail.com](mailto:liltom.eth@gmail.com)
90
+
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Tom
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,383 @@
1
  ---
2
- title: HydroxApp T2t
3
- emoji: πŸš€
4
- colorFrom: purple
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 4.5.0
8
  app_file: app.py
9
- pinned: false
 
10
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
1
  ---
2
+ title: HydroxApp_t2t
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 3.37.0
6
  ---
7
+ # llama2-webui
8
+
9
+ Running Llama 2 with gradio web UI on GPU or CPU from anywhere (Linux/Windows/Mac).
10
+ - Supporting all Llama 2 models (7B, 13B, 70B, GPTQ, GGML, GGUF, [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ)) with 8-bit, 4-bit mode.
11
+ - Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb).
12
+ - [Run OpenAI Compatible API](#start-openai-compatible-api) on Llama2 models.
13
+
14
+ ![screenshot](./static/screenshot.png)
15
+
16
+ ![code_llama_playground](https://i.imgur.com/FgMUiT6.gif)
17
+
18
+ ## Features
19
+
20
+ - Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [Llama-2-GGUF](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) ...
21
+ - Supporting model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp)
22
+ - Demos: [Run Llama2 on MacBook Air](https://twitter.com/liltom_eth/status/1682791729207070720?s=20); [Run Llama2 on free Colab T4 GPU](./colab/Llama_2_7b_Chat_GPTQ.ipynb)
23
+ - Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb).
24
+ - [Run OpenAI Compatible API](#start-openai-compatible-api) on Llama2 models.
25
+ - [News](./docs/news.md), [Benchmark](./docs/performance.md), [Issue Solutions](./docs/issues.md)
26
+
27
+ ## Contents
28
+
29
+ - [Install](#install)
30
+ - [Usage](#usage)
31
+ - [Start Chat UI](#start-chat-ui)
32
+ - [Start Code Llama UI](#start-code-llama-ui)
33
+ - [Use llama2-wrapper for Your App](#use-llama2-wrapper-for-your-app)
34
+ - [Start OpenAI Compatible API](#start-openai-compatible-api)
35
+ - [Benchmark](#benchmark)
36
+ - [Download Llama-2 Models](#download-llama-2-models)
37
+ - [Model List](#model-list)
38
+ - [Download Script](#download-script)
39
+ - [Tips](#tips)
40
+ - [Env Examples](#env-examples)
41
+ - [Run on Nvidia GPU](#run-on-nvidia-gpu)
42
+ - [Run bitsandbytes 8 bit](#run-bitsandbytes-8-bit)
43
+ - [Run GPTQ 4 bit](#run-gptq-4-bit)
44
+ - [Run on CPU](#run-on-cpu)
45
+ - [Mac Metal Acceleration](#mac-metal-acceleration)
46
+ - [AMD/Nvidia GPU Acceleration](#amdnvidia-gpu-acceleration)
47
+ - [License](#license)
48
+ - [Contributing](#contributing)
49
+
50
+
51
+
52
+ ## Install
53
+ ### Method 1: From [PyPI](https://pypi.org/project/llama2-wrapper/)
54
+ ```
55
+ pip install llama2-wrapper
56
+ ```
57
+ The newest `llama2-wrapper>=0.1.14` supports llama.cpp's `gguf` models.
58
+
59
+ If you would like to use old `ggml` models, install `llama2-wrapper<=0.1.13` or manually install `llama-cpp-python==0.1.77`.
60
+
61
+ ### Method 2: From Source:
62
+
63
+ ```
64
+ git clone https://github.com/liltom-eth/llama2-webui.git
65
+ cd llama2-webui
66
+ pip install -r requirements.txt
67
+ ```
68
+ ### Install Issues:
69
+ `bitsandbytes >= 0.39` may not work on older NVIDIA GPUs. In that case, to use `LOAD_IN_8BIT`, you may have to downgrade like this:
70
+
71
+ - `pip install bitsandbytes==0.38.1`
72
+
73
+ `bitsandbytes` also need a special install for Windows:
74
+
75
+ ```
76
+ pip uninstall bitsandbytes
77
+ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.0-py3-none-win_amd64.whl
78
+ ```
79
+
80
+ ## Usage
81
+
82
+ ### Start Chat UI
83
+
84
+ Run chatbot simply with web UI:
85
+
86
+ ```bash
87
+ python app.py
88
+ ```
89
+
90
+ `app.py` will load the default config `.env` which uses `llama.cpp` as the backend to run `llama-2-7b-chat.ggmlv3.q4_0.bin` model for inference. The model `llama-2-7b-chat.ggmlv3.q4_0.bin` will be automatically downloaded.
91
+
92
+ ```bash
93
+ Running on backend llama.cpp.
94
+ Use default model path: ./models/llama-2-7b-chat.Q4_0.gguf
95
+ Start downloading model to: ./models/llama-2-7b-chat.Q4_0.gguf
96
+ ```
97
+
98
+ You can also customize your `MODEL_PATH`, `BACKEND_TYPE,` and model configs in `.env` file to run different llama2 models on different backends (llama.cpp, transformers, gptq).
99
+
100
+ ### Start Code Llama UI
101
+
102
+ We provide a code completion / filling UI for Code Llama.
103
+
104
+ Base model **Code Llama** and the extended model **Code Llama — Python** are not fine-tuned to follow instructions. They should be prompted so that the expected answer is the natural continuation of the prompt. That means these two models focus on code filling and code completion.
105
+
106
+ Here is an example run CodeLlama code completion on llama.cpp backend:
107
+
108
+ ```
109
+ python code_completion.py --model_path ./models/codellama-7b.Q4_0.gguf
110
+ ```
111
+
112
+ ![code_llama_playground](https://i.imgur.com/FgMUiT6.gif)
113
+
114
+ `codellama-7b.Q4_0.gguf` can be downloaded from [TheBloke/CodeLlama-7B-GGUF](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/blob/main/codellama-7b.Q4_0.gguf).
115
+
116
+ **Code Llama — Instruct** is trained with "natural language instruction" inputs paired with anticipated outputs. This strategic methodology enhances the model's capacity to grasp human expectations in prompts. That means instruct models can be used in a chatbot-like app.
117
+
118
+ Example run CodeLlama chat on gptq backend:
119
+
120
+ ```
121
+ python app.py --backend_type gptq --model_path ./models/CodeLlama-7B-Instruct-GPTQ/ --share True
122
+ ```
123
+
124
+ ![code_llama_chat](https://i.imgur.com/lQLfemB.gif)
125
+
126
+ `CodeLlama-7B-Instruct-GPTQ` can be downloaded from [TheBloke/CodeLlama-7B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ)
127
+
128
+ ### Use llama2-wrapper for Your App
129
+
130
+ πŸ”₯ For developers, we released `llama2-wrapper` as a llama2 backend wrapper in [PYPI](https://pypi.org/project/llama2-wrapper/).
131
+
132
+ Use `llama2-wrapper` as your local llama2 backend to answer questions and more, [colab example](./colab/ggmlv3_q4_0.ipynb):
133
+
134
+ ```python
135
+ # pip install llama2-wrapper
136
+ from llama2_wrapper import LLAMA2_WRAPPER, get_prompt
137
+ llama2_wrapper = LLAMA2_WRAPPER()
138
+ # Default running on backend llama.cpp.
139
+ # Automatically downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin
140
+ prompt = "Do you know Pytorch"
141
+ answer = llama2_wrapper(get_prompt(prompt), temperature=0.9)
142
+ ```
143
+
144
+ Run gptq llama2 model on Nvidia GPU, [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb):
145
+
146
+ ```python
147
+ from llama2_wrapper import LLAMA2_WRAPPER
148
+ llama2_wrapper = LLAMA2_WRAPPER(backend_type="gptq")
149
+ # Automatically downloading model to: ./models/Llama-2-7b-Chat-GPTQ
150
+ ```
151
+
152
+ Run llama2 7b with bitsandbytes 8 bit with a `model_path`:
153
+
154
+ ```python
155
+ from llama2_wrapper import LLAMA2_WRAPPER
156
+ llama2_wrapper = LLAMA2_WRAPPER(
157
+ model_path = "./models/Llama-2-7b-chat-hf",
158
+ backend_type = "transformers",
159
+ load_in_8bit = True
160
+ )
161
+ ```
162
+ Check [API Document](https://pypi.org/project/llama2-wrapper/) for more usages.
163
+
164
+ ### Start OpenAI Compatible API
165
+
166
+ `llama2-wrapper` offers a web server that acts as a drop-in replacement for the OpenAI API. This allows you to use Llama2 models with any OpenAI compatible clients, libraries or services, etc.
167
+
168
+ Start Fast API:
169
+
170
+ ```
171
+ python -m llama2_wrapper.server
172
+ ```
173
+
174
+ it will use `llama.cpp` as the backend by default to run `llama-2-7b-chat.ggmlv3.q4_0.bin` model.
175
+
176
+ Start Fast API for `gptq` backend:
177
+
178
+ ```
179
+ python -m llama2_wrapper.server --backend_type gptq
180
+ ```
181
+
182
+ Navigate to http://localhost:8000/docs to see the OpenAPI documentation.
183
+
184
+ #### Basic settings
185
+
186
+ | Flag | Description |
187
+ | ---------------- | ------------------------------------------------------------ |
188
+ | `-h`, `--help` | Show this help message. |
189
+ | `--model_path` | The path to the model to use for generating completions. |
190
+ | `--backend_type` | Backend for llama2, options: llama.cpp, gptq, transformers |
191
+ | `--max_tokens` | Maximum context size. |
192
+ | `--load_in_8bit` | Whether to use bitsandbytes to run model in 8 bit mode (only for transformers models). |
193
+ | `--verbose` | Whether to print verbose output to stderr. |
194
+ | `--host` | API address |
195
+ | `--port` | API port |
196
+
197
+ ## Benchmark
198
+
199
+ Run benchmark script to compute performance on your device, `benchmark.py` will load the same `.env` as `app.py`.:
200
+
201
+ ```bash
202
+ python benchmark.py
203
+ ```
204
+
205
+ You can also select the `iter`, `backend_type` and `model_path` the benchmark will be run (overwrite .env args) :
206
+
207
+ ```bash
208
+ python benchmark.py --iter NB_OF_ITERATIONS --backend_type gptq
209
+ ```
210
+
211
+ By default, the number of iterations is 5, but if you want a faster result or a more accurate one
212
+ you can set it to whatever value you want, but please only report results with at least 5 iterations.
213
+
214
+ This [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb) also show you how to benchmark gptq model on free Google Colab T4 GPU.
215
+
216
+ Some benchmark performance:
217
+
218
+ | Model | Precision | Device | RAM / GPU VRAM | Speed (tokens/sec) | load time (s) |
219
+ | --------------------------- | --------- | ------------------ | -------------- | ------------------ | ------------- |
220
+ | Llama-2-7b-chat-hf | 8 bit | NVIDIA RTX 2080 Ti | 7.7 GB VRAM | 3.76 | 641.36 |
221
+ | Llama-2-7b-Chat-GPTQ | 4 bit | NVIDIA RTX 2080 Ti | 5.8 GB VRAM | 18.85 | 192.91 |
222
+ | Llama-2-7b-Chat-GPTQ | 4 bit | Google Colab T4 | 5.8 GB VRAM | 18.19 | 37.44 |
223
+ | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M1 Pro CPU | 5.4 GB RAM | 17.90 | 0.18 |
224
+ | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 CPU | 5.4 GB RAM | 13.70 | 0.13 |
225
+ | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 Metal | 5.4 GB RAM | 12.60 | 0.10 |
226
+ | llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Intel i7-8700 | 4.5 GB RAM | 7.88 | 31.90 |
227
+
228
+ Check/contribute the performance of your device in the full [performance doc](./docs/performance.md).
229
+
230
+ ## Download Llama-2 Models
231
+
232
+ Llama 2 is a collection of pre-trained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters.
233
+
234
+ Llama-2-7b-Chat-GPTQ is the GPTQ model files for [Meta's Llama 2 7b Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). GPTQ 4-bit Llama-2 model require less GPU VRAM to run it.
235
+
236
+ ### Model List
237
+
238
+ | Model Name | set MODEL_PATH in .env | Download URL |
239
+ | ----------------------------------- | ---------------------------------------- | ------------------------------------------------------------ |
240
+ | meta-llama/Llama-2-7b-chat-hf | /path-to/Llama-2-7b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-7b-chat-hf) |
241
+ | meta-llama/Llama-2-13b-chat-hf | /path-to/Llama-2-13b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-13b-chat-hf) |
242
+ | meta-llama/Llama-2-70b-chat-hf | /path-to/Llama-2-70b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-70b-chat-hf) |
243
+ | meta-llama/Llama-2-7b-hf | /path-to/Llama-2-7b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-7b-hf) |
244
+ | meta-llama/Llama-2-13b-hf | /path-to/Llama-2-13b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-13b-hf) |
245
+ | meta-llama/Llama-2-70b-hf | /path-to/Llama-2-70b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-70b-hf) |
246
+ | TheBloke/Llama-2-7b-Chat-GPTQ | /path-to/Llama-2-7b-Chat-GPTQ | [Link](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ) |
247
+ | TheBloke/Llama-2-7b-Chat-GGUF | /path-to/llama-2-7b-chat.Q4_0.gguf | [Link](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_0.gguf) |
248
+ | TheBloke/Llama-2-7B-Chat-GGML | /path-to/llama-2-7b-chat.ggmlv3.q4_0.bin | [Link](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) |
249
+ | TheBloke/CodeLlama-7B-Instruct-GPTQ | TheBloke/CodeLlama-7B-Instruct-GPTQ | [Link](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) |
250
+ | ... | ... | ... |
251
+
252
+ Running 4-bit model `Llama-2-7b-Chat-GPTQ` needs GPU with 6GB VRAM.
253
+
254
+ Running 4-bit model `llama-2-7b-chat.ggmlv3.q4_0.bin` needs CPU with 6GB RAM. There is also a list of other 2, 3, 4, 5, 6, 8-bit GGML models that can be used from [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML).
255
+
256
+ ### Download Script
257
+
258
+ These models can be downloaded through:
259
+
260
+ ```bash
261
+ python -m llama2_wrapper.download --repo_id TheBloke/CodeLlama-7B-Python-GPTQ
262
+
263
+ python -m llama2_wrapper.download --repo_id TheBloke/Llama-2-7b-Chat-GGUF --filename llama-2-7b-chat.Q4_0.gguf --save_dir ./models
264
+ ```
265
+
266
+ Or use CMD like:
267
+
268
+ ```bash
269
+ # Make sure you have git-lfs installed (https://git-lfs.com)
270
+ git lfs install
271
+ git clone git@hf.co:meta-llama/Llama-2-7b-chat-hf
272
+ ```
273
+
274
+ To download Llama 2 models, you need to request access from [https://ai.meta.com/llama/](https://ai.meta.com/llama/) and also enable access on repos like [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main). Requests will be processed in hours.
275
+
276
+ For GPTQ models like [TheBloke/Llama-2-7b-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), you can directly download without requesting access.
277
+
278
+ For GGML models like [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), you can directly download without requesting access.
279
+
280
+ ## Tips
281
+
282
+ ### Env Examples
283
+
284
+ There are some examples in `./env_examples/` folder.
285
+
286
+ | Model Setup | Example .env |
287
+ | ------------------------------------------------------ | --------------------------- |
288
+ | Llama-2-7b-chat-hf 8-bit (transformers backend) | .env.7b_8bit_example |
289
+ | Llama-2-7b-Chat-GPTQ 4-bit (gptq transformers backend) | .env.7b_gptq_example |
290
+ | Llama-2-7B-Chat-GGML 4bit (llama.cpp backend) | .env.7b_ggmlv3_q4_0_example |
291
+ | Llama-2-13b-chat-hf (transformers backend) | .env.13b_example |
292
+ | ... | ... |
293
+
294
+ ### Run on Nvidia GPU
295
+
296
+ The running requires around 14GB of GPU VRAM for Llama-2-7b and 28GB of GPU VRAM for Llama-2-13b.
297
+
298
+ If you are running on multiple GPUs, the model will be loaded automatically on GPUs and split the VRAM usage. That allows you to run Llama-2-7b (requires 14GB of GPU VRAM) on a setup like 2 GPUs (11GB VRAM each).
299
+
300
+ #### Run bitsandbytes 8 bit
301
+
302
+ If you do not have enough memory, you can set up your `LOAD_IN_8BIT` as `True` in `.env`. This can reduce memory usage by around half with slightly degraded model quality. It is compatible with the CPU, GPU, and Metal backend.
303
+
304
+ Llama-2-7b with 8-bit compression can run on a single GPU with 8 GB of VRAM, like an Nvidia RTX 2080Ti, RTX 4080, T4, V100 (16GB).
305
+
306
+ #### Run GPTQ 4 bit
307
+
308
+ If you want to run 4 bit Llama-2 model like `Llama-2-7b-Chat-GPTQ`, you can set up your `BACKEND_TYPE` as `gptq` in `.env` like example `.env.7b_gptq_example`.
309
+
310
+ Make sure you have downloaded the 4-bit model from `Llama-2-7b-Chat-GPTQ` and set the `MODEL_PATH` and arguments in `.env` file.
311
+
312
+ `Llama-2-7b-Chat-GPTQ` can run on a single GPU with 6 GB of VRAM.
313
+
314
+ If you encounter issue like `NameError: name 'autogptq_cuda_256' is not defined`, please refer to [here](https://huggingface.co/TheBloke/open-llama-13b-open-instruct-GPTQ/discussions/1)
315
+ > pip install https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl
316
+
317
+ ### Run on CPU
318
+
319
+ Run Llama-2 model on CPU requires [llama.cpp](https://github.com/ggerganov/llama.cpp) dependency and [llama.cpp Python Bindings](https://github.com/abetlen/llama-cpp-python), which are already installed.
320
+
321
+
322
+ Download GGML models like `llama-2-7b-chat.ggmlv3.q4_0.bin` following [Download Llama-2 Models](#download-llama-2-models) section. `llama-2-7b-chat.ggmlv3.q4_0.bin` model requires at least 6 GB RAM to run on CPU.
323
+
324
+ Set up configs like `.env.7b_ggmlv3_q4_0_example` from `env_examples` as `.env`.
325
+
326
+ Run web UI `python app.py` .
327
+
328
+ #### Mac Metal Acceleration
329
+
330
+ For Mac users, you can also set up Mac Metal for acceleration, try install this dependencies:
331
+
332
+ ```bash
333
+ pip uninstall llama-cpp-python -y
334
+ CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
335
+ pip install 'llama-cpp-python[server]'
336
+ ```
337
+
338
+ or check details:
339
+
340
+ - [MacOS Install with Metal GPU](https://github.com/abetlen/llama-cpp-python/blob/main/docs/install/macos.md)
341
+
342
+ #### AMD/Nvidia GPU Acceleration
343
+
344
+ If you would like to use AMD/Nvidia GPU for acceleration, check this:
345
+
346
+ - [Installation with OpenBLAS / cuBLAS / CLBlast / Metal](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal)
347
+
348
+
349
+
350
+
351
+
352
+ ## License
353
+
354
+ MIT - see [MIT License](LICENSE)
355
+
356
+ This project enables users to adapt it freely for proprietary purposes without any restrictions.
357
+
358
+ ## Contributing
359
+
360
+ Kindly read our [Contributing Guide](CONTRIBUTING.md) to learn and understand our development process.
361
+
362
+ ### All Contributors
363
+
364
+ <a href="https://github.com/liltom-eth/llama2-webui/graphs/contributors">
365
+ <img src="https://contrib.rocks/image?repo=liltom-eth/llama2-webui" />
366
+ </a>
367
+
368
+ ### Review
369
+ <a href='https://github.com/repo-reviews/repo-reviews.github.io/blob/main/create.md' target="_blank"><img alt='Github' src='https://img.shields.io/badge/review-100000?style=flat&logo=Github&logoColor=white&labelColor=888888&color=555555'/></a>
370
+
371
+ ### Star History
372
+
373
+ [![Star History Chart](https://api.star-history.com/svg?repos=liltom-eth/llama2-webui&type=Date)](https://star-history.com/#liltom-eth/llama2-webui&Date)
374
+
375
+ ## Credits
376
 
377
+ - https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
378
+ - https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat
379
+ - https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ
380
+ - [https://github.com/ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp)
381
+ - [https://github.com/TimDettmers/bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
382
+ - [https://github.com/PanQiWei/AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)
383
+ - [https://github.com/abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
app.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ from typing import Iterator
4
+
5
+ import gradio as gr
6
+ from dotenv import load_dotenv
7
+ from distutils.util import strtobool
8
+
9
+ from llama2_wrapper import LLAMA2_WRAPPER
10
+
11
+ import logging
12
+
13
+ from prompts.utils import PromtsContainer
14
+
15
+
16
+ def main():
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument("--model_path", type=str, default="", help="model path")
19
+ parser.add_argument(
20
+ "--backend_type",
21
+ type=str,
22
+ default="",
23
+ help="Backend options: llama.cpp, gptq, transformers",
24
+ )
25
+ parser.add_argument(
26
+ "--load_in_8bit",
27
+ type=bool,
28
+ default=False,
29
+ help="Whether to use bitsandbytes 8 bit.",
30
+ )
31
+ parser.add_argument(
32
+ "--share",
33
+ type=bool,
34
+ default=False,
35
+ help="Whether to share public for gradio.",
36
+ )
37
+ args = parser.parse_args()
38
+
39
+ load_dotenv()
40
+
41
+ DEFAULT_SYSTEM_PROMPT = os.getenv("DEFAULT_SYSTEM_PROMPT", "")
42
+ MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", 2048))
43
+ DEFAULT_MAX_NEW_TOKENS = int(os.getenv("DEFAULT_MAX_NEW_TOKENS", 1024))
44
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", 4000))
45
+
46
+ MODEL_PATH = os.getenv("MODEL_PATH")
47
+ assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}"
48
+ BACKEND_TYPE = os.getenv("BACKEND_TYPE")
49
+ assert BACKEND_TYPE is not None, f"BACKEND_TYPE is required, got: {BACKEND_TYPE}"
50
+
51
+ LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True")))
52
+
53
+ if args.model_path != "":
54
+ MODEL_PATH = args.model_path
55
+ if args.backend_type != "":
56
+ BACKEND_TYPE = args.backend_type
57
+ if args.load_in_8bit:
58
+ LOAD_IN_8BIT = True
59
+
60
+ llama2_wrapper = LLAMA2_WRAPPER(
61
+ model_path=MODEL_PATH,
62
+ backend_type=BACKEND_TYPE,
63
+ max_tokens=MAX_INPUT_TOKEN_LENGTH,
64
+ load_in_8bit=LOAD_IN_8BIT,
65
+ # verbose=True,
66
+ )
67
+
68
# Markdown shown at the top of the web UI.
DESCRIPTION = """
# llama2-webui
"""
# Fix: "tranformers" → "transformers" in the user-visible link label.
DESCRIPTION2 = """
- Supporting models: [Llama-2-7b](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) ...
- Supporting model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp)
"""
75
+
76
def clear_and_save_textbox(message: str) -> tuple[str, str]:
    """Reset the textbox to empty while stashing the submitted message."""
    cleared_value = ""
    return cleared_value, message
78
+
79
def save_textbox_for_prompt(message: str) -> str:
    """Translate a clicked prompt-card summary into its full prompt text."""
    logging.info("start save_textbox_from_prompt")
    return convert_summary_to_prompt(message)
83
+
84
def display_input(
    message: str, history: list[tuple[str, str]]
) -> list[tuple[str, str]]:
    """Append the new user message, with an empty reply slot, to the history."""
    history += [(message, "")]
    return history
89
+
90
def delete_prev_fn(
    history: list[tuple[str, str]]
) -> tuple[list[tuple[str, str]], str]:
    """Drop the most recent exchange and hand back the removed user message."""
    if history:
        last_message, _ = history.pop()
    else:
        last_message = ""
    # Coerce a falsy popped message (None/"") to an empty string.
    return history, last_message or ""
98
+
99
def generate(
    message: str,
    history_with_input: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
) -> Iterator[list[tuple[str, str]]]:
    """Stream the chatbot reply for ``message``.

    Yields the chat history extended with the progressively generated reply
    so Gradio can render partial output.

    Raises:
        ValueError: if ``max_new_tokens`` exceeds MAX_MAX_NEW_TOKENS.
    """
    if max_new_tokens > MAX_MAX_NEW_TOKENS:
        # Fix: the original raised a bare ValueError with no context.
        raise ValueError(
            f"max_new_tokens ({max_new_tokens}) exceeds MAX_MAX_NEW_TOKENS ({MAX_MAX_NEW_TOKENS})"
        )
    try:
        # history_with_input already contains the pending user turn; strip it.
        history = history_with_input[:-1]
        generator = llama2_wrapper.run(
            message,
            history,
            system_prompt,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
        )
        try:
            first_response = next(generator)
            yield history + [(message, first_response)]
        except StopIteration:
            # Model produced no tokens at all; show an empty reply.
            yield history + [(message, "")]
        for response in generator:
            yield history + [(message, response)]
    except Exception as e:
        # Keep the UI alive on backend failures, but record the traceback.
        logging.exception(e)
130
+
131
def check_input_token_length(
    message: str, chat_history: list[tuple[str, str]], system_prompt: str
) -> None:
    """Abort the event chain with a user-visible error when the prompt is too long."""
    token_count = llama2_wrapper.get_input_token_length(
        message, chat_history, system_prompt
    )
    if token_count <= MAX_INPUT_TOKEN_LENGTH:
        return
    raise gr.Error(
        f"The accumulated input is too long ({token_count} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again."
    )
141
+
142
# Load the prompt library once at startup; PromtsContainer (sic — project
# class name) supplies the prompt cards shown in the side panel.
prompts_container = PromtsContainer()
prompts = prompts_container.get_prompts_tab_dict()
# Both collapsible side panels start hidden.
default_prompts_checkbox = False
default_advanced_checkbox = False
146
+
147
def convert_summary_to_prompt(summary):
    """Look up the full prompt text behind a prompt-card summary."""
    full_prompt = prompts_container.get_prompt_by_summary(summary)
    return full_prompt
149
+
150
def two_columns_list(tab_data, chatbot):
    """Lay out prompt cards two per row and wire each card's click chain.

    Each card click mirrors the textbox-submit pipeline: save the prompt,
    show it in the chat, validate its length, then stream a reply.
    Returns the list of created gr.Row components.
    """
    result = []
    # One extra row to hold the trailing card when len(tab_data) is odd.
    for i in range(int(len(tab_data) / 2) + 1):
        row = gr.Row()
        with row:
            for j in range(2):
                index = 2 * i + j
                if index >= len(tab_data):
                    break
                item = tab_data[index]
                with gr.Group():
                    gr.HTML(
                        f'<p style="color: black; font-weight: bold;">{item["act"]}</p>'
                    )
                    prompt_text = gr.Button(
                        label="",
                        value=f"{item['summary']}",
                        size="sm",
                        elem_classes="text-left-aligned",
                    )
                    # Click chain: summary -> full prompt -> display ->
                    # length check -> (on success) streamed generation.
                    # Relies on closure variables from the enclosing UI:
                    # saved_input, system_prompt, max_new_tokens, temperature,
                    # top_p, top_k.
                    prompt_text.click(
                        fn=save_textbox_for_prompt,
                        inputs=prompt_text,
                        outputs=saved_input,
                        api_name=False,
                        queue=True,
                    ).then(
                        fn=display_input,
                        inputs=[saved_input, chatbot],
                        outputs=chatbot,
                        api_name=False,
                        queue=True,
                    ).then(
                        fn=check_input_token_length,
                        inputs=[saved_input, chatbot, system_prompt],
                        api_name=False,
                        queue=False,
                    ).success(
                        fn=generate,
                        inputs=[
                            saved_input,
                            chatbot,
                            system_prompt,
                            max_new_tokens,
                            temperature,
                            top_p,
                            top_k,
                        ],
                        outputs=chatbot,
                        api_name=False,
                    )
        result.append(row)
    return result
203
+
204
+ CSS = """
205
+ .contain { display: flex; flex-direction: column;}
206
+ #component-0 #component-1 #component-2 #component-4 #component-5 { height:71vh !important; }
207
+ #component-0 #component-1 #component-24 > div:nth-child(2) { height:80vh !important; overflow-y:auto }
208
+ .text-left-aligned {text-align: left !important; font-size: 16px;}
209
+ """
210
+ with gr.Blocks(css=CSS) as demo:
211
+ with gr.Row(equal_height=True):
212
+ with gr.Column(scale=2):
213
+ gr.Markdown(DESCRIPTION)
214
+ with gr.Group():
215
+ chatbot = gr.Chatbot(label="Chatbot")
216
+ with gr.Row():
217
+ textbox = gr.Textbox(
218
+ container=False,
219
+ show_label=False,
220
+ placeholder="Type a message...",
221
+ scale=10,
222
+ )
223
+ submit_button = gr.Button(
224
+ "Submit", variant="primary", scale=1, min_width=0
225
+ )
226
+ with gr.Row():
227
+ retry_button = gr.Button("πŸ”„ Retry", variant="secondary")
228
+ undo_button = gr.Button("↩️ Undo", variant="secondary")
229
+ clear_button = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
230
+
231
+ saved_input = gr.State()
232
+ with gr.Row():
233
+ advanced_checkbox = gr.Checkbox(
234
+ label="Advanced",
235
+ value=default_prompts_checkbox,
236
+ container=False,
237
+ elem_classes="min_check",
238
+ )
239
+ prompts_checkbox = gr.Checkbox(
240
+ label="Prompts",
241
+ value=default_prompts_checkbox,
242
+ container=False,
243
+ elem_classes="min_check",
244
+ )
245
+ with gr.Column(visible=default_advanced_checkbox) as advanced_column:
246
+ system_prompt = gr.Textbox(
247
+ label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6
248
+ )
249
+ max_new_tokens = gr.Slider(
250
+ label="Max new tokens",
251
+ minimum=1,
252
+ maximum=MAX_MAX_NEW_TOKENS,
253
+ step=1,
254
+ value=DEFAULT_MAX_NEW_TOKENS,
255
+ )
256
+ temperature = gr.Slider(
257
+ label="Temperature",
258
+ minimum=0.1,
259
+ maximum=4.0,
260
+ step=0.1,
261
+ value=1.0,
262
+ )
263
+ top_p = gr.Slider(
264
+ label="Top-p (nucleus sampling)",
265
+ minimum=0.05,
266
+ maximum=1.0,
267
+ step=0.05,
268
+ value=0.95,
269
+ )
270
+ top_k = gr.Slider(
271
+ label="Top-k",
272
+ minimum=1,
273
+ maximum=1000,
274
+ step=1,
275
+ value=50,
276
+ )
277
+ with gr.Column(scale=1, visible=default_prompts_checkbox) as prompt_column:
278
+ gr.HTML(
279
+ '<p style="color: green; font-weight: bold;font-size: 16px;">\N{four leaf clover} prompts</p>'
280
+ )
281
+ for k, v in prompts.items():
282
+ with gr.Tab(k, scroll_to_output=True):
283
+ lst = two_columns_list(v, chatbot)
284
+ prompts_checkbox.change(
285
+ lambda x: gr.update(visible=x),
286
+ prompts_checkbox,
287
+ prompt_column,
288
+ queue=False,
289
+ )
290
+ advanced_checkbox.change(
291
+ lambda x: gr.update(visible=x),
292
+ advanced_checkbox,
293
+ advanced_column,
294
+ queue=False,
295
+ )
296
+
297
+ textbox.submit(
298
+ fn=clear_and_save_textbox,
299
+ inputs=textbox,
300
+ outputs=[textbox, saved_input],
301
+ api_name=False,
302
+ queue=False,
303
+ ).then(
304
+ fn=display_input,
305
+ inputs=[saved_input, chatbot],
306
+ outputs=chatbot,
307
+ api_name=False,
308
+ queue=False,
309
+ ).then(
310
+ fn=check_input_token_length,
311
+ inputs=[saved_input, chatbot, system_prompt],
312
+ api_name=False,
313
+ queue=False,
314
+ ).success(
315
+ fn=generate,
316
+ inputs=[
317
+ saved_input,
318
+ chatbot,
319
+ system_prompt,
320
+ max_new_tokens,
321
+ temperature,
322
+ top_p,
323
+ top_k,
324
+ ],
325
+ outputs=chatbot,
326
+ api_name=False,
327
+ )
328
+
329
+ button_event_preprocess = (
330
+ submit_button.click(
331
+ fn=clear_and_save_textbox,
332
+ inputs=textbox,
333
+ outputs=[textbox, saved_input],
334
+ api_name=False,
335
+ queue=False,
336
+ )
337
+ .then(
338
+ fn=display_input,
339
+ inputs=[saved_input, chatbot],
340
+ outputs=chatbot,
341
+ api_name=False,
342
+ queue=False,
343
+ )
344
+ .then(
345
+ fn=check_input_token_length,
346
+ inputs=[saved_input, chatbot, system_prompt],
347
+ api_name=False,
348
+ queue=False,
349
+ )
350
+ .success(
351
+ fn=generate,
352
+ inputs=[
353
+ saved_input,
354
+ chatbot,
355
+ system_prompt,
356
+ max_new_tokens,
357
+ temperature,
358
+ top_p,
359
+ top_k,
360
+ ],
361
+ outputs=chatbot,
362
+ api_name=False,
363
+ )
364
+ )
365
+
366
+ retry_button.click(
367
+ fn=delete_prev_fn,
368
+ inputs=chatbot,
369
+ outputs=[chatbot, saved_input],
370
+ api_name=False,
371
+ queue=False,
372
+ ).then(
373
+ fn=display_input,
374
+ inputs=[saved_input, chatbot],
375
+ outputs=chatbot,
376
+ api_name=False,
377
+ queue=False,
378
+ ).then(
379
+ fn=generate,
380
+ inputs=[
381
+ saved_input,
382
+ chatbot,
383
+ system_prompt,
384
+ max_new_tokens,
385
+ temperature,
386
+ top_p,
387
+ top_k,
388
+ ],
389
+ outputs=chatbot,
390
+ api_name=False,
391
+ )
392
+
393
+ undo_button.click(
394
+ fn=delete_prev_fn,
395
+ inputs=chatbot,
396
+ outputs=[chatbot, saved_input],
397
+ api_name=False,
398
+ queue=False,
399
+ ).then(
400
+ fn=lambda x: x,
401
+ inputs=[saved_input],
402
+ outputs=textbox,
403
+ api_name=False,
404
+ queue=False,
405
+ )
406
+
407
+ clear_button.click(
408
+ fn=lambda: ([], ""),
409
+ outputs=[chatbot, saved_input],
410
+ queue=False,
411
+ api_name=False,
412
+ )
413
+
414
+ demo.queue(max_size=20).launch(share=args.share)
415
+
416
+
417
+ if __name__ == "__main__":
418
+ main()
benchmark.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import argparse
4
+
5
+ from dotenv import load_dotenv
6
+ from distutils.util import strtobool
7
+ from memory_profiler import memory_usage
8
+ from tqdm import tqdm
9
+
10
+ from llama2_wrapper import LLAMA2_WRAPPER
11
+
12
+
13
def run_iteration(
    llama2_wrapper, prompt_example, DEFAULT_SYSTEM_PROMPT, DEFAULT_MAX_NEW_TOKENS
):
    """Time one full generation pass and measure its peak memory.

    Returns:
        (generation_time_s, tokens_per_second, peak_mem_mib, model_response)
    """

    def generation():
        generator = llama2_wrapper.run(
            prompt_example,
            [],
            DEFAULT_SYSTEM_PROMPT,
            DEFAULT_MAX_NEW_TOKENS,
            1,
            0.95,
            50,
        )
        model_response = None
        try:
            # Fix: keep the first chunk. The original discarded it into an
            # unused variable, so a generator that yields exactly once left
            # model_response as None and get_token_length(None) crashed.
            model_response = next(generator)
        except StopIteration:
            pass
        # Each yield is the cumulative response so far; keep the last one.
        for model_response in generator:
            pass
        return llama2_wrapper.get_token_length(model_response), model_response

    tic = time.perf_counter()
    # memory_usage samples peak RSS (MiB) while running generation().
    mem_usage, (output_token_length, model_response) = memory_usage(
        (generation,), max_usage=True, retval=True
    )
    toc = time.perf_counter()

    generation_time = toc - tic
    tokens_per_second = output_token_length / generation_time

    return generation_time, tokens_per_second, mem_usage, model_response
45
+
46
+
47
def main():
    """Benchmark llama2_wrapper: init time, tokens/sec, and peak memory."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--iter", type=int, default=5, help="Number of iterations")
    parser.add_argument("--model_path", type=str, default="", help="model path")
    parser.add_argument(
        "--backend_type",
        type=str,
        default="",
        help="Backend options: llama.cpp, gptq, transformers",
    )
    parser.add_argument(
        "--load_in_8bit",
        # Fix: argparse's type=bool treats any non-empty string (even
        # "False") as True; parse the value explicitly.
        type=lambda v: bool(strtobool(v)),
        default=False,
        help="Whether to use bitsandbytes 8 bit.",
    )

    args = parser.parse_args()

    load_dotenv()

    # Generation limits, overridable via .env.
    DEFAULT_SYSTEM_PROMPT = os.getenv("DEFAULT_SYSTEM_PROMPT", "")
    MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", 2048))
    DEFAULT_MAX_NEW_TOKENS = int(os.getenv("DEFAULT_MAX_NEW_TOKENS", 1024))
    MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", 4000))

    MODEL_PATH = os.getenv("MODEL_PATH")
    assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}"
    BACKEND_TYPE = os.getenv("BACKEND_TYPE")
    assert BACKEND_TYPE is not None, f"BACKEND_TYPE is required, got: {BACKEND_TYPE}"

    LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True")))

    # Command-line flags take precedence over .env values.
    if args.model_path != "":
        MODEL_PATH = args.model_path
    if args.backend_type != "":
        BACKEND_TYPE = args.backend_type
    if args.load_in_8bit:
        LOAD_IN_8BIT = True

    # Initialization
    init_tic = time.perf_counter()
    llama2_wrapper = LLAMA2_WRAPPER(
        model_path=MODEL_PATH,
        backend_type=BACKEND_TYPE,
        max_tokens=MAX_INPUT_TOKEN_LENGTH,
        load_in_8bit=LOAD_IN_8BIT,
        # verbose=True,
    )
    init_toc = time.perf_counter()
    initialization_time = init_toc - init_tic

    total_time = 0.0
    total_tokens_per_second = 0.0
    total_memory_gen = 0.0
    completed = 0
    model_response = None

    prompt_example = (
        "Can you explain briefly to me what is the Python programming language?"
    )

    # Cold run (warms caches/JIT; excluded from the averages).
    print("Performing cold run...")
    run_iteration(
        llama2_wrapper, prompt_example, DEFAULT_SYSTEM_PROMPT, DEFAULT_MAX_NEW_TOKENS
    )

    # Timed runs
    print(f"Performing {args.iter} timed runs...")
    for _ in tqdm(range(args.iter)):
        try:
            gen_time, tokens_per_sec, mem_gen, model_response = run_iteration(
                llama2_wrapper,
                prompt_example,
                DEFAULT_SYSTEM_PROMPT,
                DEFAULT_MAX_NEW_TOKENS,
            )
        except Exception as e:
            # Fix: the original bare "except: break" hid the failure reason
            # entirely; report it before stopping early.
            print(f"Benchmark iteration failed, stopping early: {e}")
            break
        total_time += gen_time
        total_tokens_per_second += tokens_per_sec
        total_memory_gen += mem_gen
        completed += 1

    # Fix: with --iter 0 or an immediate failure, the original crashed on
    # unbound loop variables / divided by results it never collected.
    if completed == 0:
        print("No successful iterations; cannot report averages.")
        return

    avg_time = total_time / completed
    avg_tokens_per_second = total_tokens_per_second / completed
    avg_memory_gen = total_memory_gen / completed

    print(f"Last model response: {model_response}")
    print(f"Initialization time: {initialization_time:0.4f} seconds.")
    print(
        f"Average generation time over {completed} iterations: {avg_time:0.4f} seconds."
    )
    print(
        f"Average speed over {completed} iterations: {avg_tokens_per_second:0.4f} tokens/sec."
    )
    print(f"Average memory usage during generation: {avg_memory_gen:.2f} MiB")


if __name__ == "__main__":
    main()
code_completion.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import gradio as gr
4
+ from llama2_wrapper import LLAMA2_WRAPPER
5
+
6
# Fill-in-the-middle (FIM) sentinel tokens for Code Llama's infilling
# prompt format: <PRE> {prefix} <SUF>{suffix} <MID>.
FIM_PREFIX = "<PRE> "
FIM_MIDDLE = " <MID>"
FIM_SUFFIX = " <SUF>"

# Marker the user places in the input to request infilling at that spot.
FIM_INDICATOR = "<FILL_ME>"

# End-of-generation markers that terminate streaming when they appear.
EOS_STRING = "</s>"
EOT_STRING = "<EOT>"
14
+
15
+
16
def main():
    """Launch the Code Llama code-completion Gradio demo."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_path",
        type=str,
        default="./models/codellama-7b-instruct.ggmlv3.Q4_0.bin",
        help="model path",
    )
    parser.add_argument(
        "--backend_type",
        type=str,
        default="llama.cpp",
        help="Backend options: llama.cpp, gptq, transformers",
    )
    parser.add_argument(
        "--max_tokens",
        type=int,
        default=4000,
        help="Maximum context size.",
    )
    parser.add_argument(
        "--load_in_8bit",
        # Fix: argparse's type=bool treats any non-empty string (even
        # "False") as True; parse the value explicitly.
        type=lambda v: v.lower() in ("1", "true", "yes"),
        default=False,
        help="Whether to use bitsandbytes 8 bit.",
    )
    parser.add_argument(
        "--share",
        type=lambda v: v.lower() in ("1", "true", "yes"),
        default=False,
        help="Whether to share public for gradio.",
    )
    args = parser.parse_args()

    llama2_wrapper = LLAMA2_WRAPPER(
        model_path=args.model_path,
        backend_type=args.backend_type,
        max_tokens=args.max_tokens,
        load_in_8bit=args.load_in_8bit,
    )

    def generate(
        prompt,
        temperature=0.9,
        max_new_tokens=256,
        top_p=0.95,
        repetition_penalty=1.0,
    ):
        """Stream a completion for ``prompt``; supports <FILL_ME> infilling.

        Yields the cumulative output text after each streamed chunk.

        Raises:
            ValueError: if the prompt contains more than one FIM indicator.
        """
        temperature = float(temperature)
        if temperature < 1e-2:
            temperature = 1e-2
        top_p = float(top_p)
        fim_mode = False

        generate_kwargs = dict(
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            stream=True,
        )

        if FIM_INDICATOR in prompt:
            fim_mode = True
            try:
                prefix, suffix = prompt.split(FIM_INDICATOR)
            except ValueError:
                # Fix: was a bare "except:"; split yields != 2 parts exactly
                # when more than one indicator is present.
                raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt!")
            prompt = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"

        stream = llama2_wrapper(prompt, **generate_kwargs)

        # In FIM mode the model only generates the middle part, so the
        # displayed output starts from the prefix; otherwise echo the prompt.
        output = prefix if fim_mode else prompt

        for response in stream:
            if any(end_token in response for end_token in (EOS_STRING, EOT_STRING)):
                if fim_mode:
                    # Close the infill by appending the suffix, then stop.
                    # (Fix: the original had an unreachable print after the
                    # return here.)
                    output += suffix
                    yield output
                return output
            output += response
            yield output
        return output

    examples = [
        'def remove_non_ascii(s: str) -> str:\n """ <FILL_ME>\nprint(remove_non_ascii(\'afkdj$$(\'))',
        "X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.1)\n\n# Train a logistic regression model, predict the labels on the test set and compute the accuracy score",
        "// Returns every other value in the array as a new array.\nfunction everyOther(arr) {",
        "Poor English: She no went to the market. Corrected English:",
        "def alternating(list1, list2):\n results = []\n for i in range(min(len(list1), len(list2))):\n results.append(list1[i])\n results.append(list2[i])\n if len(list1) > len(list2):\n <FILL_ME>\n else:\n results.extend(list2[i+1:])\n return results",
    ]

    def process_example(args):
        """Drain the generate() stream and return the final output (for examples)."""
        # Fix: the original left the loop variable unbound (NameError) when
        # the stream produced nothing.
        output = None
        for output in generate(args):
            pass
        return output

    description = """
<div style="text-align: center;">
<h1>Code Llama Playground</h1>

</div>
<div style="text-align: center;">
<p>This is a demo to complete code with Code Llama. For instruction purposes, please use llama2-webui app.py with CodeLlama-Instruct models. </p>
</div>
"""
    with gr.Blocks() as demo:
        with gr.Column():
            gr.Markdown(description)
            with gr.Row():
                with gr.Column():
                    instruction = gr.Textbox(
                        placeholder="Enter your code here",
                        lines=5,
                        label="Input",
                        elem_id="q-input",
                    )
                    submit = gr.Button("Generate", variant="primary")
                    output = gr.Code(elem_id="q-output", lines=30, label="Output")
                    with gr.Row():
                        with gr.Column():
                            with gr.Accordion("Advanced settings", open=False):
                                with gr.Row():
                                    column_1, column_2 = gr.Column(), gr.Column()
                                    with column_1:
                                        temperature = gr.Slider(
                                            label="Temperature",
                                            value=0.1,
                                            minimum=0.0,
                                            maximum=1.0,
                                            step=0.05,
                                            interactive=True,
                                            info="Higher values produce more diverse outputs",
                                        )
                                        max_new_tokens = gr.Slider(
                                            label="Max new tokens",
                                            value=256,
                                            minimum=0,
                                            maximum=8192,
                                            step=64,
                                            interactive=True,
                                            info="The maximum numbers of new tokens",
                                        )
                                    with column_2:
                                        top_p = gr.Slider(
                                            label="Top-p (nucleus sampling)",
                                            value=0.90,
                                            minimum=0.0,
                                            maximum=1,
                                            step=0.05,
                                            interactive=True,
                                            info="Higher values sample more low-probability tokens",
                                        )
                                        repetition_penalty = gr.Slider(
                                            label="Repetition penalty",
                                            value=1.05,
                                            minimum=1.0,
                                            maximum=2.0,
                                            step=0.05,
                                            interactive=True,
                                            info="Penalize repeated tokens",
                                        )

                    gr.Examples(
                        examples=examples,
                        inputs=[instruction],
                        cache_examples=False,
                        fn=process_example,
                        outputs=[output],
                    )

                    submit.click(
                        generate,
                        inputs=[
                            instruction,
                            temperature,
                            max_new_tokens,
                            top_p,
                            repetition_penalty,
                        ],
                        outputs=[output],
                    )
    demo.queue(concurrency_count=16).launch(share=args.share)


if __name__ == "__main__":
    main()
colab/Llama_2_7b_Chat_GPTQ.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
colab/ggmlv3_q4_0.ipynb ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "toc_visible": true,
8
+ "authorship_tag": "ABX9TyM9WbudQYrVFksXUrt4Opt3",
9
+ "include_colab_link": true
10
+ },
11
+ "kernelspec": {
12
+ "name": "python3",
13
+ "display_name": "Python 3"
14
+ },
15
+ "language_info": {
16
+ "name": "python"
17
+ }
18
+ },
19
+ "cells": [
20
+ {
21
+ "cell_type": "markdown",
22
+ "metadata": {
23
+ "id": "view-in-github",
24
+ "colab_type": "text"
25
+ },
26
+ "source": [
27
+ "<a href=\"https://colab.research.google.com/github/liltom-eth/llama2-webui/blob/main/colab/ggmlv3_q4_0.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": null,
33
+ "metadata": {
34
+ "id": "7O5JSosg5-rx"
35
+ },
36
+ "outputs": [],
37
+ "source": [
38
+ "%cd /content\n",
39
+ "!pip install llama2-wrapper\n"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "source": [
45
+ "from llama2_wrapper import LLAMA2_WRAPPER, get_prompt\n",
46
+ "\n",
47
+ "llama2_wrapper = LLAMA2_WRAPPER()"
48
+ ],
49
+ "metadata": {
50
+ "colab": {
51
+ "base_uri": "https://localhost:8080/"
52
+ },
53
+ "id": "8rgb1ckl72wC",
54
+ "outputId": "d9ca2e20-26a5-490b-86f2-1a182e533b20"
55
+ },
56
+ "execution_count": 5,
57
+ "outputs": [
58
+ {
59
+ "output_type": "stream",
60
+ "name": "stdout",
61
+ "text": [
62
+ "Running on backend llama.cpp.\n",
63
+ "Use default model path: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin\n",
64
+ "Start downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin\n"
65
+ ]
66
+ }
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "source": [
72
+ "prompt = get_prompt(\"Hi do you know Pytorch?\")\n",
73
+ "print(llama2_wrapper(prompt))"
74
+ ],
75
+ "metadata": {
76
+ "id": "Qz2xAqozTIf6",
77
+ "colab": {
78
+ "base_uri": "https://localhost:8080/"
79
+ },
80
+ "outputId": "1380fa52-3d4a-4ac5-ed02-7faefe7ec2f6"
81
+ },
82
+ "execution_count": 3,
83
+ "outputs": [
84
+ {
85
+ "output_type": "stream",
86
+ "name": "stdout",
87
+ "text": [
88
+ " Yes, I'm familiar with PyTorch! PyTorch is an open-source deep learning framework that is widely used for building and training neural networks. It was originally developed by Facebook and is now maintained by the PyTorch Foundation.\n",
89
+ "\n",
90
+ "Here are some key features and capabilities of PyTorch:\n",
91
+ "\n",
92
+ "1. **Tensor Computation**: PyTorch provides a powerful tensor computation engine that allows for complex mathematical operations on large datasets.\n",
93
+ "2. **Autograd**: PyTorch's autograd system automatically computes gradients, which can save a lot of time and effort during training.\n",
94
+ "3. **Dynamic Compute**: PyTorch's dynamic compute system allows for more efficient computation by only computing the necessary computations at runtime.\n",
95
+ "4. **Memory-efficient**: PyTorch is designed to be memory-efficient, which is important for training large models that require a lot of memory.\n",
96
+ "5. **Accelerators**: PyTorch supports a wide range of accelerators, including GPUs, TPUs, and FPGAs, which can significantly speed up training times.\n",
97
+ "6. **Modules**: PyTorch provides a wide range of pre-built modules for common tasks, such as convolutional layers, recurrent neural networks, and more.\n",
98
+ "7. **Extensive Community**: PyTorch has a large and active community of developers and users, which can be helpful for getting support and staying up-to-date with the latest developments.\n",
99
+ "8. **Easy Integration**: PyTorch can be easily integrated with other popular deep learning frameworks, such as TensorFlow and Keras.\n",
100
+ "9. **Pythonic**: PyTorch is written in Python, which is a popular and easy-to-learn programming language.\n",
101
+ "10. **Flexible**: PyTorch allows for a wide range of customization options, which can be useful for building and training unique models.\n",
102
+ "\n",
103
+ "Overall, PyTorch is a powerful and flexible deep learning framework that can be used for a wide range of applications, including computer vision, natural language processing, and more.\n"
104
+ ]
105
+ }
106
+ ]
107
+ }
108
+ ]
109
+ }
colab/webui_CodeLlama_7B_Instruct_GPTQ.ipynb ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4",
8
+ "authorship_tag": "ABX9TyOZhPcZe61RhDjhEFQv0vrl",
9
+ "include_colab_link": true
10
+ },
11
+ "kernelspec": {
12
+ "name": "python3",
13
+ "display_name": "Python 3"
14
+ },
15
+ "language_info": {
16
+ "name": "python"
17
+ },
18
+ "accelerator": "GPU"
19
+ },
20
+ "cells": [
21
+ {
22
+ "cell_type": "markdown",
23
+ "metadata": {
24
+ "id": "view-in-github",
25
+ "colab_type": "text"
26
+ },
27
+ "source": [
28
+ "<a href=\"https://colab.research.google.com/github/liltom-eth/llama2-webui/blob/main/colab/webui_CodeLlama_7B_Instruct_GPTQ.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {
35
+ "id": "7O5JSosg5-rx"
36
+ },
37
+ "outputs": [],
38
+ "source": [
39
+ "!pip install -U llama2-wrapper==0.1.12"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "source": [
45
+ "%cd /content\n",
46
+ "!git clone https://github.com/liltom-eth/llama2-webui\n",
47
+ "\n",
48
+ "%cd /content/llama2-webui\n",
49
+ "!python -m llama2_wrapper.download --repo_id TheBloke/CodeLlama-7B-Instruct-GPTQ\n",
50
+ "\n",
51
+ "%cd /content/llama2-webui\n",
52
+ "!python app.py --backend_type gptq --model_path ./models/CodeLlama-7B-Instruct-GPTQ/ --share True"
53
+ ],
54
+ "metadata": {
55
+ "colab": {
56
+ "base_uri": "https://localhost:8080/"
57
+ },
58
+ "id": "Y6A7bJdkmzY8",
59
+ "outputId": "0d702a7d-68ab-4747-f012-246d4dee3718"
60
+ },
61
+ "execution_count": 4,
62
+ "outputs": [
63
+ {
64
+ "output_type": "stream",
65
+ "name": "stdout",
66
+ "text": [
67
+ "/content\n",
68
+ "fatal: destination path 'llama2-webui' already exists and is not an empty directory.\n",
69
+ "/content/llama2-webui\n",
70
+ "Start downloading model TheBloke/CodeLlama-7B-Instruct-GPTQ to: ./models/CodeLlama-7B-Instruct-GPTQ\n",
71
+ "Fetching 15 files: 0% 0/15 [00:00<?, ?it/s]\n",
72
+ "Downloading (…)d0d05/.gitattributes: 100% 1.52k/1.52k [00:00<00:00, 7.94MB/s]\n",
73
+ "Fetching 15 files: 7% 1/15 [00:01<00:16, 1.15s/it]\n",
74
+ "Downloading (…)478d0d05/LICENSE.txt: 100% 7.02k/7.02k [00:00<00:00, 31.6MB/s]\n",
75
+ "\n",
76
+ "Downloading (…)478d0d05/config.json: 100% 1.25k/1.25k [00:00<00:00, 7.95MB/s]\n",
77
+ "\n",
78
+ "Downloading (…)nfiguration_llama.py: 100% 8.56k/8.56k [00:00<00:00, 41.7MB/s]\n",
79
+ "\n",
80
+ "Downloading (…)81b84478d0d05/Notice: 100% 112/112 [00:00<00:00, 750kB/s]\n",
81
+ "\n",
82
+ "Downloading (…)neration_config.json: 100% 132/132 [00:00<00:00, 836kB/s]\n",
83
+ "\n",
84
+ "Downloading (…)8d0d05/USE_POLICY.md: 100% 105/105 [00:00<00:00, 686kB/s]\n",
85
+ "\n",
86
+ "Downloading (…)84478d0d05/README.md: 100% 22.0k/22.0k [00:00<00:00, 59.5MB/s]\n",
87
+ "\n",
88
+ "Downloading (…)05/modeling_llama.py: 100% 45.9k/45.9k [00:00<00:00, 27.5MB/s]\n",
89
+ "\n",
90
+ "Downloading (…)quantize_config.json: 100% 187/187 [00:00<00:00, 1.34MB/s]\n",
91
+ "\n",
92
+ "Downloading (…)cial_tokens_map.json: 100% 411/411 [00:00<00:00, 2.82MB/s]\n",
93
+ "\n",
94
+ "Downloading (…)d0d05/tokenizer.json: 0% 0.00/1.84M [00:00<?, ?B/s]\u001b[A\n",
95
+ "\n",
96
+ "Downloading (…)okenizer_config.json: 100% 824/824 [00:00<00:00, 5.75MB/s]\n",
97
+ "\n",
98
+ "\n",
99
+ "Downloading model.safetensors: 0% 0.00/3.90G [00:00<?, ?B/s]\u001b[A\u001b[A\n",
100
+ "\n",
101
+ "\n",
102
+ "Downloading tokenizer.model: 100% 500k/500k [00:00<00:00, 16.3MB/s]\n",
103
+ "\n",
104
+ "Downloading (…)d0d05/tokenizer.json: 100% 1.84M/1.84M [00:00<00:00, 5.47MB/s]\n",
105
+ "\n",
106
+ "\n",
107
+ "Downloading model.safetensors: 0% 10.5M/3.90G [00:00<01:08, 56.4MB/s]\u001b[A\u001b[A\n",
108
+ "\n",
109
+ "Downloading model.safetensors: 1% 21.0M/3.90G [00:00<00:57, 67.1MB/s]\u001b[A\u001b[A\n",
110
+ "\n",
111
+ "Downloading model.safetensors: 1% 31.5M/3.90G [00:00<00:51, 75.5MB/s]\u001b[A\u001b[A\n",
112
+ "\n",
113
+ "Downloading model.safetensors: 1% 52.4M/3.90G [00:00<00:40, 94.5MB/s]\u001b[A\u001b[A\n",
114
+ "\n",
115
+ "Downloading model.safetensors: 2% 73.4M/3.90G [00:00<00:33, 113MB/s] \u001b[A\u001b[A\n",
116
+ "\n",
117
+ "Downloading model.safetensors: 2% 94.4M/3.90G [00:00<00:28, 133MB/s]\u001b[A\u001b[A\n",
118
+ "\n",
119
+ "Downloading model.safetensors: 3% 115M/3.90G [00:00<00:25, 148MB/s] \u001b[A\u001b[A\n",
120
+ "\n",
121
+ "Downloading model.safetensors: 3% 136M/3.90G [00:01<00:24, 156MB/s]\u001b[A\u001b[A\n",
122
+ "\n",
123
+ "Downloading model.safetensors: 4% 157M/3.90G [00:01<00:22, 167MB/s]\u001b[A\u001b[A\n",
124
+ "\n",
125
+ "Downloading model.safetensors: 5% 178M/3.90G [00:01<00:22, 168MB/s]\u001b[A\u001b[A\n",
126
+ "\n",
127
+ "Downloading model.safetensors: 5% 199M/3.90G [00:01<00:21, 169MB/s]\u001b[A\u001b[A\n",
128
+ "\n",
129
+ "Downloading model.safetensors: 6% 220M/3.90G [00:01<00:21, 170MB/s]\u001b[A\u001b[A\n",
130
+ "\n",
131
+ "Downloading model.safetensors: 6% 241M/3.90G [00:01<00:21, 174MB/s]\u001b[A\u001b[A\n",
132
+ "\n",
133
+ "Downloading model.safetensors: 7% 262M/3.90G [00:01<00:20, 177MB/s]\u001b[A\u001b[A\n",
134
+ "\n",
135
+ "Downloading model.safetensors: 7% 283M/3.90G [00:02<01:08, 52.9MB/s]\u001b[A\u001b[A\n",
136
+ "\n",
137
+ "Downloading model.safetensors: 8% 315M/3.90G [00:02<00:47, 75.6MB/s]\u001b[A\u001b[A\n",
138
+ "\n",
139
+ "Downloading model.safetensors: 9% 346M/3.90G [00:03<00:36, 97.8MB/s]\u001b[A\u001b[A\n",
140
+ "\n",
141
+ "Downloading model.safetensors: 9% 367M/3.90G [00:03<00:31, 111MB/s] \u001b[A\u001b[A\n",
142
+ "\n",
143
+ "Downloading model.safetensors: 10% 388M/3.90G [00:03<00:28, 122MB/s]\u001b[A\u001b[A\n",
144
+ "\n",
145
+ "Downloading model.safetensors: 10% 409M/3.90G [00:03<00:26, 134MB/s]\u001b[A\u001b[A\n",
146
+ "\n",
147
+ "Downloading model.safetensors: 11% 430M/3.90G [00:03<00:24, 141MB/s]\u001b[A\u001b[A\n",
148
+ "\n",
149
+ "Downloading model.safetensors: 12% 461M/3.90G [00:03<00:21, 160MB/s]\u001b[A\u001b[A\n",
150
+ "\n",
151
+ "Downloading model.safetensors: 12% 482M/3.90G [00:03<00:20, 165MB/s]\u001b[A\u001b[A\n",
152
+ "\n",
153
+ "Downloading model.safetensors: 13% 503M/3.90G [00:04<00:20, 166MB/s]\u001b[A\u001b[A\n",
154
+ "\n",
155
+ "Downloading model.safetensors: 13% 524M/3.90G [00:04<00:19, 170MB/s]\u001b[A\u001b[A\n",
156
+ "\n",
157
+ "Downloading model.safetensors: 14% 556M/3.90G [00:04<00:18, 181MB/s]\u001b[A\u001b[A\n",
158
+ "\n",
159
+ "Downloading model.safetensors: 15% 577M/3.90G [00:04<00:18, 182MB/s]\u001b[A\u001b[A\n",
160
+ "\n",
161
+ "Downloading model.safetensors: 15% 598M/3.90G [00:04<00:18, 183MB/s]\u001b[A\u001b[A\n",
162
+ "\n",
163
+ "Downloading model.safetensors: 16% 619M/3.90G [00:04<00:17, 184MB/s]\u001b[A\u001b[A\n",
164
+ "\n",
165
+ "Downloading model.safetensors: 16% 640M/3.90G [00:04<00:17, 184MB/s]\u001b[A\u001b[A\n",
166
+ "\n",
167
+ "Downloading model.safetensors: 17% 661M/3.90G [00:04<00:18, 178MB/s]\u001b[A\u001b[A\n",
168
+ "\n",
169
+ "Downloading model.safetensors: 17% 682M/3.90G [00:04<00:17, 180MB/s]\u001b[A\u001b[A\n",
170
+ "\n",
171
+ "Downloading model.safetensors: 18% 703M/3.90G [00:05<00:17, 180MB/s]\u001b[A\u001b[A\n",
172
+ "\n",
173
+ "Downloading model.safetensors: 19% 724M/3.90G [00:05<00:17, 181MB/s]\u001b[A\u001b[A\n",
174
+ "\n",
175
+ "Downloading model.safetensors: 19% 744M/3.90G [00:05<00:18, 171MB/s]\u001b[A\u001b[A\n",
176
+ "\n",
177
+ "Downloading model.safetensors: 20% 765M/3.90G [00:05<00:18, 173MB/s]\u001b[A\u001b[A\n",
178
+ "\n",
179
+ "Downloading model.safetensors: 20% 786M/3.90G [00:05<00:17, 175MB/s]\u001b[A\u001b[A\n",
180
+ "\n",
181
+ "Downloading model.safetensors: 21% 807M/3.90G [00:05<00:17, 178MB/s]\u001b[A\u001b[A\n",
182
+ "\n",
183
+ "Downloading model.safetensors: 21% 828M/3.90G [00:05<00:17, 180MB/s]\u001b[A\u001b[A\n",
184
+ "\n",
185
+ "Downloading model.safetensors: 22% 849M/3.90G [00:05<00:16, 182MB/s]\u001b[A\u001b[A\n",
186
+ "\n",
187
+ "Downloading model.safetensors: 22% 870M/3.90G [00:07<01:37, 30.9MB/s]\u001b[A\u001b[A\n",
188
+ "\n",
189
+ "Downloading model.safetensors: 23% 891M/3.90G [00:08<01:13, 40.8MB/s]\u001b[A\u001b[A\n",
190
+ "\n",
191
+ "Downloading model.safetensors: 24% 923M/3.90G [00:08<00:50, 59.3MB/s]\u001b[A\u001b[A\n",
192
+ "\n",
193
+ "Downloading model.safetensors: 24% 944M/3.90G [00:08<00:42, 70.2MB/s]\u001b[A\u001b[A\n",
194
+ "\n",
195
+ "Downloading model.safetensors: 25% 975M/3.90G [00:08<00:30, 94.3MB/s]\u001b[A\u001b[A\n",
196
+ "\n",
197
+ "Downloading model.safetensors: 26% 996M/3.90G [00:08<00:27, 107MB/s] \u001b[A\u001b[A\n",
198
+ "\n",
199
+ "Downloading model.safetensors: 26% 1.02G/3.90G [00:08<00:23, 121MB/s]\u001b[A\u001b[A\n",
200
+ "\n",
201
+ "Downloading model.safetensors: 27% 1.04G/3.90G [00:08<00:21, 134MB/s]\u001b[A\u001b[A\n",
202
+ "\n",
203
+ "Downloading model.safetensors: 27% 1.06G/3.90G [00:08<00:20, 141MB/s]\u001b[A\u001b[A\n",
204
+ "\n",
205
+ "Downloading model.safetensors: 28% 1.08G/3.90G [00:09<00:18, 151MB/s]\u001b[A\u001b[A\n",
206
+ "\n",
207
+ "Downloading model.safetensors: 28% 1.10G/3.90G [00:09<00:17, 160MB/s]\u001b[A\u001b[A\n",
208
+ "\n",
209
+ "Downloading model.safetensors: 29% 1.12G/3.90G [00:09<00:16, 166MB/s]\u001b[A\u001b[A\n",
210
+ "\n",
211
+ "Downloading model.safetensors: 29% 1.14G/3.90G [00:09<00:16, 171MB/s]\u001b[A\u001b[A\n",
212
+ "\n",
213
+ "Downloading model.safetensors: 30% 1.16G/3.90G [00:09<00:15, 175MB/s]\u001b[A\u001b[A\n",
214
+ "\n",
215
+ "Downloading model.safetensors: 30% 1.18G/3.90G [00:09<00:15, 178MB/s]\u001b[A\u001b[A\n",
216
+ "\n",
217
+ "Downloading model.safetensors: 31% 1.21G/3.90G [00:09<00:15, 179MB/s]\u001b[A\u001b[A\n",
218
+ "\n",
219
+ "Downloading model.safetensors: 31% 1.23G/3.90G [00:09<00:14, 181MB/s]\u001b[A\u001b[A\n",
220
+ "\n",
221
+ "Downloading model.safetensors: 32% 1.25G/3.90G [00:09<00:14, 182MB/s]\u001b[A\u001b[A\n",
222
+ "\n",
223
+ "Downloading model.safetensors: 33% 1.27G/3.90G [00:10<00:23, 113MB/s]\u001b[A\u001b[A\n",
224
+ "\n",
225
+ "Downloading model.safetensors: 33% 1.29G/3.90G [00:10<00:20, 128MB/s]\u001b[A\u001b[A\n",
226
+ "\n",
227
+ "Downloading model.safetensors: 34% 1.31G/3.90G [00:10<00:18, 139MB/s]\u001b[A\u001b[A\n",
228
+ "\n",
229
+ "Downloading model.safetensors: 34% 1.33G/3.90G [00:10<00:17, 150MB/s]\u001b[A\u001b[A\n",
230
+ "\n",
231
+ "Downloading model.safetensors: 35% 1.35G/3.90G [00:10<00:16, 158MB/s]\u001b[A\u001b[A\n",
232
+ "\n",
233
+ "Downloading model.safetensors: 35% 1.37G/3.90G [00:12<01:24, 29.9MB/s]\u001b[A\u001b[A\n",
234
+ "\n",
235
+ "Downloading model.safetensors: 36% 1.41G/3.90G [00:12<00:55, 45.3MB/s]\u001b[A\u001b[A\n",
236
+ "\n",
237
+ "Downloading model.safetensors: 37% 1.44G/3.90G [00:13<00:39, 63.0MB/s]\u001b[A\u001b[A\n",
238
+ "\n",
239
+ "Downloading model.safetensors: 37% 1.46G/3.90G [00:13<00:33, 72.6MB/s]\u001b[A\u001b[A\n",
240
+ "\n",
241
+ "Downloading model.safetensors: 38% 1.48G/3.90G [00:13<00:29, 82.0MB/s]\u001b[A\u001b[A\n",
242
+ "\n",
243
+ "Downloading model.safetensors: 38% 1.50G/3.90G [00:13<00:24, 98.6MB/s]\u001b[A\u001b[A\n",
244
+ "\n",
245
+ "Downloading model.safetensors: 39% 1.53G/3.90G [00:13<00:19, 124MB/s] \u001b[A\u001b[A\n",
246
+ "\n",
247
+ "Downloading model.safetensors: 40% 1.55G/3.90G [00:13<00:17, 132MB/s]\u001b[A\u001b[A\n",
248
+ "\n",
249
+ "Downloading model.safetensors: 40% 1.57G/3.90G [00:13<00:16, 143MB/s]\u001b[A\u001b[A\n",
250
+ "\n",
251
+ "Downloading model.safetensors: 41% 1.59G/3.90G [00:14<00:15, 153MB/s]\u001b[A\u001b[A\n",
252
+ "\n",
253
+ "Downloading model.safetensors: 41% 1.61G/3.90G [00:14<00:14, 160MB/s]\u001b[A\u001b[A\n",
254
+ "\n",
255
+ "Downloading model.safetensors: 42% 1.64G/3.90G [00:14<00:13, 167MB/s]\u001b[A\u001b[A\n",
256
+ "\n",
257
+ "Downloading model.safetensors: 43% 1.66G/3.90G [00:14<00:13, 171MB/s]\u001b[A\u001b[A\n",
258
+ "\n",
259
+ "Downloading model.safetensors: 43% 1.68G/3.90G [00:14<00:12, 177MB/s]\u001b[A\u001b[A\n",
260
+ "\n",
261
+ "Downloading model.safetensors: 44% 1.70G/3.90G [00:14<00:12, 174MB/s]\u001b[A\u001b[A\n",
262
+ "\n",
263
+ "Downloading model.safetensors: 44% 1.72G/3.90G [00:14<00:12, 173MB/s]\u001b[A\u001b[A\n",
264
+ "\n",
265
+ "Downloading model.safetensors: 45% 1.74G/3.90G [00:14<00:12, 175MB/s]\u001b[A\u001b[A\n",
266
+ "\n",
267
+ "Downloading model.safetensors: 45% 1.76G/3.90G [00:14<00:11, 179MB/s]\u001b[A\u001b[A\n",
268
+ "\n",
269
+ "Downloading model.safetensors: 46% 1.78G/3.90G [00:15<00:12, 172MB/s]\u001b[A\u001b[A\n",
270
+ "\n",
271
+ "Downloading model.safetensors: 46% 1.80G/3.90G [00:15<00:12, 174MB/s]\u001b[A\u001b[A\n",
272
+ "\n",
273
+ "Downloading model.safetensors: 47% 1.82G/3.90G [00:15<00:11, 177MB/s]\u001b[A\u001b[A\n",
274
+ "\n",
275
+ "Downloading model.safetensors: 47% 1.85G/3.90G [00:16<00:28, 71.9MB/s]\u001b[A\u001b[A\n",
276
+ "\n",
277
+ "Downloading model.safetensors: 48% 1.87G/3.90G [00:16<00:23, 87.4MB/s]\u001b[A\u001b[A\n",
278
+ "\n",
279
+ "Downloading model.safetensors: 49% 1.90G/3.90G [00:16<00:16, 118MB/s] \u001b[A\u001b[A\n",
280
+ "\n",
281
+ "Downloading model.safetensors: 49% 1.92G/3.90G [00:16<00:14, 132MB/s]\u001b[A\u001b[A\n",
282
+ "\n",
283
+ "Downloading model.safetensors: 50% 1.94G/3.90G [00:16<00:13, 143MB/s]\u001b[A\u001b[A\n",
284
+ "\n",
285
+ "Downloading model.safetensors: 50% 1.96G/3.90G [00:16<00:12, 152MB/s]\u001b[A\u001b[A\n",
286
+ "\n",
287
+ "Downloading model.safetensors: 51% 1.98G/3.90G [00:16<00:13, 142MB/s]\u001b[A\u001b[A\n",
288
+ "\n",
289
+ "Downloading model.safetensors: 51% 2.00G/3.90G [00:16<00:13, 144MB/s]\u001b[A\u001b[A\n",
290
+ "\n",
291
+ "Downloading model.safetensors: 52% 2.02G/3.90G [00:17<00:12, 144MB/s]\u001b[A\u001b[A\n",
292
+ "\n",
293
+ "Downloading model.safetensors: 52% 2.04G/3.90G [00:17<00:12, 148MB/s]\u001b[A\u001b[A\n",
294
+ "\n",
295
+ "Downloading model.safetensors: 53% 2.07G/3.90G [00:17<00:12, 152MB/s]\u001b[A\u001b[A\n",
296
+ "\n",
297
+ "Downloading model.safetensors: 54% 2.09G/3.90G [00:17<00:22, 81.2MB/s]\u001b[A\u001b[A\n",
298
+ "\n",
299
+ "Downloading model.safetensors: 54% 2.12G/3.90G [00:18<00:16, 107MB/s] \u001b[A\u001b[A\n",
300
+ "\n",
301
+ "Downloading model.safetensors: 55% 2.14G/3.90G [00:18<00:14, 119MB/s]\u001b[A\u001b[A\n",
302
+ "\n",
303
+ "Downloading model.safetensors: 55% 2.16G/3.90G [00:18<00:14, 123MB/s]\u001b[A\u001b[A\n",
304
+ "\n",
305
+ "Downloading model.safetensors: 56% 2.18G/3.90G [00:18<00:13, 131MB/s]\u001b[A\u001b[A\n",
306
+ "\n",
307
+ "Downloading model.safetensors: 57% 2.21G/3.90G [00:18<00:10, 156MB/s]\u001b[A\u001b[A\n",
308
+ "\n",
309
+ "Downloading model.safetensors: 57% 2.23G/3.90G [00:18<00:10, 162MB/s]\u001b[A\u001b[A\n",
310
+ "\n",
311
+ "Downloading model.safetensors: 58% 2.25G/3.90G [00:18<00:10, 160MB/s]\u001b[A\u001b[A\n",
312
+ "\n",
313
+ "Downloading model.safetensors: 59% 2.29G/3.90G [00:18<00:09, 174MB/s]\u001b[A\u001b[A\n",
314
+ "\n",
315
+ "Downloading model.safetensors: 59% 2.31G/3.90G [00:19<00:08, 178MB/s]\u001b[A\u001b[A\n",
316
+ "\n",
317
+ "Downloading model.safetensors: 60% 2.33G/3.90G [00:19<00:08, 180MB/s]\u001b[A\u001b[A\n",
318
+ "\n",
319
+ "Downloading model.safetensors: 60% 2.35G/3.90G [00:19<00:08, 181MB/s]\u001b[A\u001b[A\n",
320
+ "\n",
321
+ "Downloading model.safetensors: 61% 2.37G/3.90G [00:19<00:08, 181MB/s]\u001b[A\u001b[A\n",
322
+ "\n",
323
+ "Downloading model.safetensors: 61% 2.39G/3.90G [00:19<00:08, 181MB/s]\u001b[A\u001b[A\n",
324
+ "\n",
325
+ "Downloading model.safetensors: 62% 2.41G/3.90G [00:19<00:08, 182MB/s]\u001b[A\u001b[A\n",
326
+ "\n",
327
+ "Downloading model.safetensors: 62% 2.43G/3.90G [00:19<00:08, 182MB/s]\u001b[A\u001b[A\n",
328
+ "\n",
329
+ "Downloading model.safetensors: 63% 2.45G/3.90G [00:19<00:08, 177MB/s]\u001b[A\u001b[A\n",
330
+ "\n",
331
+ "Downloading model.safetensors: 64% 2.47G/3.90G [00:20<00:11, 124MB/s]\u001b[A\u001b[A\n",
332
+ "\n",
333
+ "Downloading model.safetensors: 64% 2.51G/3.90G [00:20<00:09, 149MB/s]\u001b[A\u001b[A\n",
334
+ "\n",
335
+ "Downloading model.safetensors: 65% 2.53G/3.90G [00:22<00:40, 34.2MB/s]\u001b[A\u001b[A\n",
336
+ "\n",
337
+ "Downloading model.safetensors: 66% 2.56G/3.90G [00:22<00:26, 50.1MB/s]\u001b[A\u001b[A\n",
338
+ "\n",
339
+ "Downloading model.safetensors: 66% 2.58G/3.90G [00:22<00:21, 60.1MB/s]\u001b[A\u001b[A\n",
340
+ "\n",
341
+ "Downloading model.safetensors: 67% 2.60G/3.90G [00:22<00:18, 69.4MB/s]\u001b[A\u001b[A\n",
342
+ "\n",
343
+ "Downloading model.safetensors: 67% 2.62G/3.90G [00:22<00:15, 84.0MB/s]\u001b[A\u001b[A\n",
344
+ "\n",
345
+ "Downloading model.safetensors: 68% 2.64G/3.90G [00:22<00:12, 99.4MB/s]\u001b[A\u001b[A\n",
346
+ "\n",
347
+ "Downloading model.safetensors: 68% 2.66G/3.90G [00:23<00:12, 96.0MB/s]\u001b[A\u001b[A\n",
348
+ "\n",
349
+ "Downloading model.safetensors: 69% 2.68G/3.90G [00:23<00:12, 95.4MB/s]\u001b[A\u001b[A\n",
350
+ "\n",
351
+ "Downloading model.safetensors: 69% 2.71G/3.90G [00:23<00:14, 84.2MB/s]\u001b[A\u001b[A\n",
352
+ "\n",
353
+ "Downloading model.safetensors: 70% 2.73G/3.90G [00:23<00:14, 82.0MB/s]\u001b[A\u001b[A\n",
354
+ "\n",
355
+ "Downloading model.safetensors: 70% 2.74G/3.90G [00:24<00:14, 80.9MB/s]\u001b[A\u001b[A\n",
356
+ "\n",
357
+ "Downloading model.safetensors: 70% 2.75G/3.90G [00:24<00:15, 75.8MB/s]\u001b[A\u001b[A\n",
358
+ "\n",
359
+ "Downloading model.safetensors: 71% 2.76G/3.90G [00:24<00:15, 75.3MB/s]\u001b[A\u001b[A\n",
360
+ "\n",
361
+ "Downloading model.safetensors: 71% 2.77G/3.90G [00:24<00:15, 72.2MB/s]\u001b[A\u001b[A\n",
362
+ "\n",
363
+ "Downloading model.safetensors: 71% 2.78G/3.90G [00:24<00:14, 74.9MB/s]\u001b[A\u001b[A\n",
364
+ "\n",
365
+ "Downloading model.safetensors: 72% 2.79G/3.90G [00:24<00:14, 74.7MB/s]\u001b[A\u001b[A\n",
366
+ "\n",
367
+ "Downloading model.safetensors: 72% 2.80G/3.90G [00:25<00:15, 69.4MB/s]\u001b[A\u001b[A\n",
368
+ "\n",
369
+ "Downloading model.safetensors: 72% 2.81G/3.90G [00:25<00:15, 71.3MB/s]\u001b[A\u001b[A\n",
370
+ "\n",
371
+ "Downloading model.safetensors: 72% 2.82G/3.90G [00:25<00:13, 77.5MB/s]\u001b[A\u001b[A\n",
372
+ "\n",
373
+ "Downloading model.safetensors: 73% 2.84G/3.90G [00:25<00:12, 84.6MB/s]\u001b[A\u001b[A\n",
374
+ "\n",
375
+ "Downloading model.safetensors: 73% 2.85G/3.90G [00:25<00:12, 83.8MB/s]\u001b[A\u001b[A\n",
376
+ "\n",
377
+ "Downloading model.safetensors: 73% 2.86G/3.90G [00:25<00:12, 81.6MB/s]\u001b[A\u001b[A\n",
378
+ "\n",
379
+ "Downloading model.safetensors: 74% 2.88G/3.90G [00:25<00:10, 97.2MB/s]\u001b[A\u001b[A\n",
380
+ "\n",
381
+ "Downloading model.safetensors: 75% 2.90G/3.90G [00:26<00:08, 118MB/s] \u001b[A\u001b[A\n",
382
+ "\n",
383
+ "Downloading model.safetensors: 75% 2.93G/3.90G [00:26<00:07, 134MB/s]\u001b[A\u001b[A\n",
384
+ "\n",
385
+ "Downloading model.safetensors: 76% 2.95G/3.90G [00:26<00:06, 149MB/s]\u001b[A\u001b[A\n",
386
+ "\n",
387
+ "Downloading model.safetensors: 76% 2.97G/3.90G [00:26<00:05, 159MB/s]\u001b[A\u001b[A\n",
388
+ "\n",
389
+ "Downloading model.safetensors: 77% 2.99G/3.90G [00:27<00:23, 37.9MB/s]\u001b[A\u001b[A\n",
390
+ "\n",
391
+ "Downloading model.safetensors: 77% 3.02G/3.90G [00:27<00:15, 57.4MB/s]\u001b[A\u001b[A\n",
392
+ "\n",
393
+ "Downloading model.safetensors: 78% 3.04G/3.90G [00:28<00:12, 67.9MB/s]\u001b[A\u001b[A\n",
394
+ "\n",
395
+ "Downloading model.safetensors: 79% 3.06G/3.90G [00:28<00:10, 78.8MB/s]\u001b[A\u001b[A\n",
396
+ "\n",
397
+ "Downloading model.safetensors: 79% 3.08G/3.90G [00:28<00:08, 92.9MB/s]\u001b[A\u001b[A\n",
398
+ "\n",
399
+ "Downloading model.safetensors: 80% 3.10G/3.90G [00:28<00:07, 109MB/s] \u001b[A\u001b[A\n",
400
+ "\n",
401
+ "Downloading model.safetensors: 80% 3.14G/3.90G [00:28<00:05, 138MB/s]\u001b[A\u001b[A\n",
402
+ "\n",
403
+ "Downloading model.safetensors: 81% 3.16G/3.90G [00:28<00:05, 146MB/s]\u001b[A\u001b[A\n",
404
+ "\n",
405
+ "Downloading model.safetensors: 82% 3.18G/3.90G [00:28<00:04, 152MB/s]\u001b[A\u001b[A\n",
406
+ "\n",
407
+ "Downloading model.safetensors: 82% 3.20G/3.90G [00:29<00:04, 161MB/s]\u001b[A\u001b[A\n",
408
+ "\n",
409
+ "Downloading model.safetensors: 83% 3.22G/3.90G [00:29<00:03, 170MB/s]\u001b[A\u001b[A\n",
410
+ "\n",
411
+ "Downloading model.safetensors: 83% 3.24G/3.90G [00:29<00:04, 158MB/s]\u001b[A\u001b[A\n",
412
+ "\n",
413
+ "Downloading model.safetensors: 84% 3.26G/3.90G [00:29<00:04, 156MB/s]\u001b[A\u001b[A\n",
414
+ "\n",
415
+ "Downloading model.safetensors: 84% 3.28G/3.90G [00:29<00:03, 160MB/s]\u001b[A\u001b[A\n",
416
+ "\n",
417
+ "Downloading model.safetensors: 85% 3.30G/3.90G [00:29<00:03, 162MB/s]\u001b[A\u001b[A\n",
418
+ "\n",
419
+ "Downloading model.safetensors: 85% 3.32G/3.90G [00:29<00:03, 160MB/s]\u001b[A\u001b[A\n",
420
+ "\n",
421
+ "Downloading model.safetensors: 86% 3.34G/3.90G [00:29<00:03, 171MB/s]\u001b[A\u001b[A\n",
422
+ "\n",
423
+ "Downloading model.safetensors: 87% 3.38G/3.90G [00:30<00:02, 191MB/s]\u001b[A\u001b[A\n",
424
+ "\n",
425
+ "Downloading model.safetensors: 87% 3.40G/3.90G [00:30<00:02, 188MB/s]\u001b[A\u001b[A\n",
426
+ "\n",
427
+ "Downloading model.safetensors: 88% 3.42G/3.90G [00:30<00:02, 187MB/s]\u001b[A\u001b[A\n",
428
+ "\n",
429
+ "Downloading model.safetensors: 88% 3.44G/3.90G [00:30<00:02, 182MB/s]\u001b[A\u001b[A\n",
430
+ "\n",
431
+ "Downloading model.safetensors: 89% 3.46G/3.90G [00:30<00:02, 183MB/s]\u001b[A\u001b[A\n",
432
+ "\n",
433
+ "Downloading model.safetensors: 89% 3.48G/3.90G [00:30<00:02, 183MB/s]\u001b[A\u001b[A\n",
434
+ "\n",
435
+ "Downloading model.safetensors: 90% 3.50G/3.90G [00:30<00:02, 184MB/s]\u001b[A\u001b[A\n",
436
+ "\n",
437
+ "Downloading model.safetensors: 90% 3.52G/3.90G [00:30<00:02, 185MB/s]\u001b[A\u001b[A\n",
438
+ "\n",
439
+ "Downloading model.safetensors: 91% 3.54G/3.90G [00:30<00:01, 183MB/s]\u001b[A\u001b[A\n",
440
+ "\n",
441
+ "Downloading model.safetensors: 91% 3.57G/3.90G [00:31<00:05, 55.5MB/s]\u001b[A\u001b[A\n",
442
+ "\n",
443
+ "Downloading model.safetensors: 92% 3.59G/3.90G [00:32<00:08, 38.3MB/s]\u001b[A\u001b[A\n",
444
+ "\n",
445
+ "Downloading model.safetensors: 93% 3.61G/3.90G [00:32<00:05, 50.7MB/s]\u001b[A\u001b[A\n",
446
+ "\n",
447
+ "Downloading model.safetensors: 93% 3.63G/3.90G [00:33<00:04, 65.0MB/s]\u001b[A\u001b[A\n",
448
+ "\n",
449
+ "Downloading model.safetensors: 94% 3.65G/3.90G [00:33<00:03, 80.3MB/s]\u001b[A\u001b[A\n",
450
+ "\n",
451
+ "Downloading model.safetensors: 94% 3.67G/3.90G [00:33<00:02, 97.3MB/s]\u001b[A\u001b[A\n",
452
+ "\n",
453
+ "Downloading model.safetensors: 95% 3.69G/3.90G [00:33<00:01, 113MB/s] \u001b[A\u001b[A\n",
454
+ "\n",
455
+ "Downloading model.safetensors: 95% 3.71G/3.90G [00:33<00:01, 128MB/s]\u001b[A\u001b[A\n",
456
+ "\n",
457
+ "Downloading model.safetensors: 96% 3.73G/3.90G [00:33<00:01, 139MB/s]\u001b[A\u001b[A\n",
458
+ "\n",
459
+ "Downloading model.safetensors: 96% 3.75G/3.90G [00:33<00:00, 153MB/s]\u001b[A\u001b[A\n",
460
+ "\n",
461
+ "Downloading model.safetensors: 97% 3.77G/3.90G [00:33<00:00, 158MB/s]\u001b[A\u001b[A\n",
462
+ "\n",
463
+ "Downloading model.safetensors: 97% 3.80G/3.90G [00:34<00:00, 165MB/s]\u001b[A\u001b[A\n",
464
+ "\n",
465
+ "Downloading model.safetensors: 98% 3.82G/3.90G [00:34<00:00, 167MB/s]\u001b[A\u001b[A\n",
466
+ "\n",
467
+ "Downloading model.safetensors: 98% 3.84G/3.90G [00:34<00:00, 169MB/s]\u001b[A\u001b[A\n",
468
+ "\n",
469
+ "Downloading model.safetensors: 99% 3.86G/3.90G [00:34<00:00, 174MB/s]\u001b[A\u001b[A\n",
470
+ "\n",
471
+ "Downloading model.safetensors: 100% 3.90G/3.90G [00:34<00:00, 113MB/s]\n",
472
+ "Fetching 15 files: 100% 15/15 [00:36<00:00, 2.41s/it]\n",
473
+ "/content/llama2-webui\n",
474
+ "Running on GPU with backend torch transformers.\n",
475
+ "2023-08-26 07:14:25.222792: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
476
+ "skip module injection for FusedLlamaMLPForQuantizedModel not support integrate without triton yet.\n",
477
+ "Caching examples at: '/content/llama2-webui/gradio_cached_examples/19'\n",
478
+ "Caching example 1/5\n",
479
+ "Caching example 2/5\n",
480
+ "Caching example 3/5\n",
481
+ "Caching example 4/5\n",
482
+ "Caching example 5/5\n",
483
+ "Caching complete\n",
484
+ "\n",
485
+ "Running on local URL: http://127.0.0.1:7860\n",
486
+ "Running on public URL: https://71c3606942c440e7dd.gradio.live\n",
487
+ "\n",
488
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n",
489
+ "Keyboard interruption in main thread... closing server.\n",
490
+ "Traceback (most recent call last):\n",
491
+ " File \"/usr/local/lib/python3.10/dist-packages/gradio/blocks.py\", line 2130, in block_thread\n",
492
+ " time.sleep(0.1)\n",
493
+ "KeyboardInterrupt\n",
494
+ "\n",
495
+ "During handling of the above exception, another exception occurred:\n",
496
+ "\n",
497
+ "Traceback (most recent call last):\n",
498
+ " File \"/content/llama2-webui/app.py\", line 322, in <module>\n",
499
+ " main()\n",
500
+ " File \"/content/llama2-webui/app.py\", line 318, in main\n",
501
+ " demo.queue(max_size=20).launch(share=args.share)\n",
502
+ " File \"/usr/local/lib/python3.10/dist-packages/gradio/blocks.py\", line 2046, in launch\n",
503
+ " self.block_thread()\n",
504
+ " File \"/usr/local/lib/python3.10/dist-packages/gradio/blocks.py\", line 2132, in block_thread\n",
505
+ " print(\"Keyboard interruption in main thread... closing server.\")\n",
506
+ "KeyboardInterrupt\n",
507
+ "Killing tunnel 127.0.0.1:7860 <> https://71c3606942c440e7dd.gradio.live\n",
508
+ "terminate called without an active exception\n"
509
+ ]
510
+ }
511
+ ]
512
+ }
513
+ ]
514
+ }
docs/issues.md ADDED
File without changes
docs/news.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # News
2
+ - [2023/09] The newest `llama2-wrapper>=0.1.14` supports llama.cpp's `gguf` models.
3
+
4
+ - [2023/08] πŸ”₯ For developers, we offer a web server that acts as a drop-in replacement for the OpenAI API.
5
+
6
+ - Usage:
7
+
8
+ ```
9
+ python3 -m llama2_wrapper.server
10
+ ```
11
+
12
+
13
+
14
+ - [2023/08] πŸ”₯ For developers, we released `llama2-wrapper` as a llama2 backend wrapper in [PYPI](https://pypi.org/project/llama2-wrapper/).
15
+
16
+ - Install: `pip install llama2-wrapper`
17
+
18
+ - Usage:
19
+
20
+ ```python
21
+ from llama2_wrapper import LLAMA2_WRAPPER, get_prompt
22
+ llama2_wrapper = LLAMA2_WRAPPER(
23
+ model_path="./models/Llama-2-7B-Chat-GGML/llama-2-7b-chat.ggmlv3.q4_0.bin",
24
+ backend_type="llama.cpp", #options: llama.cpp, transformers, gptq
25
+ )
26
+ prompt = "Do you know Pytorch"
27
+ llama2_prompt = get_prompt(prompt)
28
+ answer = llama2_wrapper(llama2_prompt, temperature=0.9)
29
+ ```
30
+
31
+ - [2023/08] πŸ”₯ We added `benchmark.py` for users to benchmark llama2 models on their local devices.
32
+
33
+ - Check/contribute the performance of your device in the full [performance doc](https://github.com/liltom-eth/llama2-webui/blob/main/docs/performance.md).
34
+
35
+ - [2023/07] We released **[llama2-webui](https://github.com/liltom-eth/llama2-webui)**, a gradio web UI to run Llama 2 on GPU or CPU from anywhere (Linux/Windows/Mac).
36
+
37
+ - Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), all [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), all [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) ...
38
+ - Supporting model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp)
docs/performance.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Benchmark Performance
2
+
3
+ ## Performance on Nvidia GPU
4
+
5
+ | Model | Precision | Device | GPU VRAM | Speed (tokens/sec) | load time (s) |
6
+ | --------------------------------- | --------- | ---------- | ---------------------- | ---------------- | ---------------- |
7
+ | Llama-2-7b-chat-hf | 16 bit | | | | |
8
+ | Llama-2-7b-chat-hf | 8bit | NVIDIA RTX 2080 Ti | 7.7 GB VRAM | 3.76 | 641.36 |
9
+ | Llama-2-7b-Chat-GPTQ | 4bit | NVIDIA RTX 2080 Ti | 5.8 GB VRAM | 18.85 | 192.91 |
10
+ | Llama-2-7b-Chat-GPTQ | 4bit | NVIDIA GTX 1660 Super | 4.8 GB VRAM | 8.5 | 262.74 |
11
+ | Llama-2-7b-Chat-GPTQ | 4 bit | Google Colab T4 | 5.8 GB VRAM | 18.19 | 37.44 |
12
+ | Llama-2-13b-chat-hf | 16 bit | | | | |
13
+ | | | | | | |
14
+
15
+ ## Performance on CPU / OpenBLAS / cuBLAS / CLBlast / Metal
16
+
17
+ | Model | Precision | Device | RAM / GPU VRAM | Speed (tokens/sec) | load time (s) |
18
+ | --------------------------------- | --------- | ---------- | ---------------------- | ---------------- | ---------------- |
19
+ | llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Intel i7-8700 | 4.5 GB RAM | 7.88 | 31.90 |
20
+ | llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Apple M2 CPU | 4.5 GB RAM | 11.10 | 0.10 |
21
+ | llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Apple M2 Metal | 4.5 GB RAM | 12.10 | 0.12 |
22
+ | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel i7-8700 | 5.4 GB RAM | 6.27 | 173.15 |
23
+ | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel i7-9700 | 4.8 GB RAM | 4.2 | 87.9 |
24
+ | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M1 Pro CPU | 5.4 GB RAM | 17.90 | 0.18 |
25
+ | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 CPU | 5.4 GB RAM | 13.70 | 0.13 |
26
+ | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 Metal | 5.4 GB RAM | 12.60 | 0.10 |
27
+ | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | AMD Ryzen 9 5900HS | 4.1 GB RAM | 6.01 | 0.15 |
28
+ | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel vServer 4 threads, eth services | 8 GB RAM | 1.31 | 0.5|
29
+ | llama-2-7b-chat.ggmlv3.q8_0 | 8 bit | Intel i7-8700 | 8.6 GB RAM | 2.63 | 336.57 |
30
+ | llama-2-7b-chat.ggmlv3.q8_0 | 8 bit | Intel i7-9700 | 7.6 GB RAM | 2.05 | 302.9 |
31
+ | | | | | | |
32
+
docs/pypi.md ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # llama2-wrapper
2
+
3
+ - Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb).
4
+
5
+ - [Run OpenAI Compatible API](https://github.com/liltom-eth/llama2-webui#start-openai-compatible-api) on Llama2 models.
6
+
7
+ ## Features
8
+
9
+ - Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ)...
10
+ - Supporting model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp)
11
+ - Demos: [Run Llama2 on MacBook Air](https://twitter.com/liltom_eth/status/1682791729207070720?s=20); [Run Llama2 on Colab T4 GPU](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb)
12
+ - Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb).
13
+ - [Run OpenAI Compatible API](https://github.com/liltom-eth/llama2-webui#start-openai-compatible-api) on Llama2 models.
14
+ - [News](https://github.com/liltom-eth/llama2-webui/blob/main/docs/news.md), [Benchmark](https://github.com/liltom-eth/llama2-webui/blob/main/docs/performance.md), [Issue Solutions](https://github.com/liltom-eth/llama2-webui/blob/main/docs/issues.md)
15
+
16
+ [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) is the backend and part of [llama2-webui](https://github.com/liltom-eth/llama2-webui), which can run any Llama 2 locally with gradio UI on GPU or CPU from anywhere (Linux/Windows/Mac).
17
+
18
+ ## Install
19
+
20
+ ```bash
21
+ pip install llama2-wrapper
22
+ ```
23
+
24
+ ## Start OpenAI Compatible API
25
+
26
+ ```
27
+ python -m llama2_wrapper.server
28
+ ```
29
+
30
+ it will use `llama.cpp` as the backend by default to run `llama-2-7b-chat.ggmlv3.q4_0.bin` model.
31
+
32
+ Start Fast API for `gptq` backend:
33
+
34
+ ```
35
+ python -m llama2_wrapper.server --backend_type gptq
36
+ ```
37
+
38
+ Navigate to http://localhost:8000/docs to see the OpenAPI documentation.
39
+
40
+ ## API Usage
41
+
42
+ ### `__call__`
43
+
44
+ `__call__()` is the function to generate text from a prompt.
45
+
46
+ For example, run ggml llama2 model on CPU, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/ggmlv3_q4_0.ipynb):
47
+
48
+ ```python
49
+ from llama2_wrapper import LLAMA2_WRAPPER, get_prompt
50
+ llama2_wrapper = LLAMA2_WRAPPER()
51
+ # Default running on backend llama.cpp.
52
+ # Automatically downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin
53
+ prompt = "Do you know Pytorch"
54
+ # llama2_wrapper() will run __call__()
55
+ answer = llama2_wrapper(get_prompt(prompt), temperature=0.9)
56
+ ```
57
+
58
+ Run gptq llama2 model on Nvidia GPU, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb):
59
+
60
+ ```python
61
+ from llama2_wrapper import LLAMA2_WRAPPER
62
+ llama2_wrapper = LLAMA2_WRAPPER(backend_type="gptq")
63
+ # Automatically downloading model to: ./models/Llama-2-7b-Chat-GPTQ
64
+ ```
65
+
66
+ Run llama2 7b with bitsandbytes 8 bit with a `model_path`:
67
+
68
+ ```python
69
+ from llama2_wrapper import LLAMA2_WRAPPER
70
+ llama2_wrapper = LLAMA2_WRAPPER(
71
+ model_path = "./models/Llama-2-7b-chat-hf",
72
+ backend_type = "transformers",
73
+ load_in_8bit = True
74
+ )
75
+ ```
76
+
77
+ ### completion
78
+
79
+ `completion()` is the function to generate text from a prompt for OpenAI compatible API `/v1/completions`.
80
+
81
+ ```python
82
+ llama2_wrapper = LLAMA2_WRAPPER()
83
+ prompt = get_prompt("Hi do you know Pytorch?")
84
+ print(llama2_wrapper.completion(prompt))
85
+ ```
86
+
87
+ ### chat_completion
88
+
89
+ `chat_completion()` is the function to generate text from a dialog (chat history) for OpenAI compatible API `/v1/chat/completions`.
90
+
91
+ ```python
92
+ llama2_wrapper = LLAMA2_WRAPPER()
93
+ dialog = [
94
+ {
95
+ "role":"system",
96
+ "content":"You are a helpful, respectful and honest assistant. "
97
+ },{
98
+ "role":"user",
99
+ "content":"Hi do you know Pytorch?",
100
+ },
101
+ ]
102
+ print(llama2_wrapper.chat_completion(dialog))
103
+ ```
104
+
105
+ ### generate
106
+
107
+ `generate()` is the function to create a generator of response from a prompt.
108
+
109
+ This is useful when you want to stream the output like typing in the chatbot.
110
+
111
+ ```python
112
+ llama2_wrapper = LLAMA2_WRAPPER()
113
+ prompt = get_prompt("Hi do you know Pytorch?")
114
+ for response in llama2_wrapper.generate(prompt):
115
+ print(response)
116
+
117
+ ```
118
+
119
+ The response will be like:
120
+
121
+ ```
122
+ Yes,
123
+ Yes, I'm
124
+ Yes, I'm familiar
125
+ Yes, I'm familiar with
126
+ Yes, I'm familiar with PyTorch!
127
+ ...
128
+ ```
129
+
130
+ ### run
131
+
132
+ `run()` is similar to `generate()`, but `run()` can also accept `chat_history` and `system_prompt` from the users.
133
+
134
+ It will process the input message to llama2 prompt template with `chat_history` and `system_prompt` for a chatbot-like app.
135
+
136
+ ### get_prompt
137
+
138
+ `get_prompt()` will process the input message to llama2 prompt with `chat_history` and `system_prompt` for chatbot.
139
+
140
+ By default, `chat_history` and `system_prompt` are empty and `get_prompt()` will add llama2 prompt template to your message:
141
+
142
+ ```python
143
+ prompt = get_prompt("Hi do you know Pytorch?")
144
+ ```
145
+
146
+ prompt will be:
147
+
148
+ ```
149
+ [INST] <<SYS>>
150
+
151
+ <</SYS>>
152
+
153
+ Hi do you know Pytorch? [/INST]
154
+ ```
155
+
156
+ If use `get_prompt("Hi do you know Pytorch?", system_prompt="You are a helpful...")`:
157
+
158
+ ```
159
+ [INST] <<SYS>>
160
+ You are a helpful, respectful and honest assistant.
161
+ <</SYS>>
162
+
163
+ Hi do you know Pytorch? [/INST]
164
+ ```
165
+
166
+ ### get_prompt_for_dialog
167
+
168
+ `get_prompt_for_dialog()` will process dialog (chat history) to llama2 prompt for OpenAI compatible API `/v1/chat/completions`.
169
+
170
+ ```python
171
+ dialog = [
172
+ {
173
+ "role":"system",
174
+ "content":"You are a helpful, respectful and honest assistant. "
175
+ },{
176
+ "role":"user",
177
+ "content":"Hi do you know Pytorch?",
178
+ },
179
+ ]
180
+ prompt = get_prompt_for_dialog(dialog)
181
+ # [INST] <<SYS>>
182
+ # You are a helpful, respectful and honest assistant.
183
+ # <</SYS>>
184
+ #
185
+ # Hi do you know Pytorch? [/INST]
186
+ ```
187
+
env_examples/.env.13b_example ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL_PATH = "./models/Llama-2-13b-chat-hf"
2
+
3
+ # options: llama.cpp, gptq, transformers
4
+ BACKEND_TYPE = "transformers"
5
+
6
+ # only for transformers bitsandbytes 8 bit
7
+ LOAD_IN_8BIT = True
8
+
9
+ MAX_MAX_NEW_TOKENS = 2048
10
+ DEFAULT_MAX_NEW_TOKENS = 1024
11
+ MAX_INPUT_TOKEN_LENGTH = 4000
12
+
13
+ DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
env_examples/.env.7b_8bit_example ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL_PATH = "./models/Llama-2-7b-chat-hf"
2
+
3
+ # options: llama.cpp, gptq, transformers
4
+ BACKEND_TYPE = "transformers"
5
+
6
+ # only for transformers bitsandbytes 8 bit
7
+ LOAD_IN_8BIT = True
8
+
9
+ MAX_MAX_NEW_TOKENS = 2048
10
+ DEFAULT_MAX_NEW_TOKENS = 1024
11
+ MAX_INPUT_TOKEN_LENGTH = 4000
12
+
13
+ DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
env_examples/.env.7b_ggmlv3_q4_0_example ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL_PATH = ""
2
+ # if MODEL_PATH is "", default llama.cpp/gptq models
3
+ # will be downloaded to: ./models
4
+
5
+ # Example ggml path:
6
+ # MODEL_PATH = "./models/llama-2-7b-chat.ggmlv3.q4_0.bin"
7
+
8
+ # options: llama.cpp, gptq, transformers
9
+ BACKEND_TYPE = "llama.cpp"
10
+
11
+ # only for transformers bitsandbytes 8 bit
12
+ LOAD_IN_8BIT = False
13
+
14
+ MAX_MAX_NEW_TOKENS = 2048
15
+ DEFAULT_MAX_NEW_TOKENS = 1024
16
+ MAX_INPUT_TOKEN_LENGTH = 4000
17
+
18
+ DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
env_examples/.env.7b_gptq_example ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ"
2
+ # if MODEL_PATH is "", default llama.cpp/gptq models
3
+ # will be downloaded to: ./models
4
+
5
+ # Example gptq path:
6
+ # MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ"
7
+
8
+ # options: llama.cpp, gptq, transformers
9
+ BACKEND_TYPE = "gptq"
10
+
11
+ # only for transformers bitsandbytes 8 bit
12
+ LOAD_IN_8BIT = False
13
+
14
+ MAX_MAX_NEW_TOKENS = 2048
15
+ DEFAULT_MAX_NEW_TOKENS = 1024
16
+ MAX_INPUT_TOKEN_LENGTH = 4000
17
+
18
+ DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
llama2_wrapper/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .model import LLAMA2_WRAPPER, get_prompt, get_prompt_for_dialog
llama2_wrapper/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (246 Bytes). View file
 
llama2_wrapper/__pycache__/model.cpython-310.pyc ADDED
Binary file (18.6 kB). View file
 
llama2_wrapper/__pycache__/types.cpython-310.pyc ADDED
Binary file (4.05 kB). View file
 
llama2_wrapper/download/__init__.py ADDED
File without changes
llama2_wrapper/download/__main__.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import argparse


def main():
    """CLI entry point: download a model from the Hugging Face Hub.

    With ``--filename`` a single file is fetched into ``--save_dir``;
    otherwise the whole repo is snapshotted into ``save_dir/<repo_name>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--repo_id",
        type=str,
        default="",
        required=True,
        help="Repo ID like 'TheBloke/Llama-2-7B-Chat-GGML' ",
    )
    parser.add_argument(
        "--filename",
        type=str,
        default=None,
        help="Filename like llama-2-7b-chat.ggmlv3.q4_0.bin",
    )
    parser.add_argument(
        "--save_dir", type=str, default="./models", help="Directory to save models"
    )

    args = parser.parse_args()

    repo_id = args.repo_id
    save_dir = args.save_dir

    # exist_ok avoids a race between the check and the mkdir
    os.makedirs(save_dir, exist_ok=True)

    if args.filename:
        filename = args.filename
        from huggingface_hub import hf_hub_download

        # Fix: report the actual filename (the message previously printed a
        # literal "(unknown)" placeholder).
        print(f"Start downloading model {repo_id} {filename} to: {save_dir}")

        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=save_dir,
        )
    else:
        # snapshot the whole repo into save_dir/<repo_name>
        repo_name = repo_id.split("/")[1]
        save_path = os.path.join(save_dir, repo_name)
        os.makedirs(save_path, exist_ok=True)
        print(f"Start downloading model {repo_id} to: {save_path}")

        from huggingface_hub import snapshot_download

        snapshot_download(
            repo_id=repo_id,
            local_dir=save_path,
        )


if __name__ == "__main__":
    main()
llama2_wrapper/model.py ADDED
@@ -0,0 +1,787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import uuid
4
+ from enum import Enum
5
+ from threading import Thread
6
+ from typing import Any, Iterator, Union, List
7
+ from llama2_wrapper.types import (
8
+ Completion,
9
+ CompletionChunk,
10
+ ChatCompletion,
11
+ ChatCompletionChunk,
12
+ # ChatCompletionMessage,
13
+ Message,
14
+ B_INST,
15
+ E_INST,
16
+ B_SYS,
17
+ E_SYS,
18
+ )
19
+
20
+
21
+ class LLAMA2_WRAPPER:
22
+ def __init__(
23
+ self,
24
+ model_path: str = "",
25
+ backend_type: str = "llama.cpp",
26
+ max_tokens: int = 4000,
27
+ load_in_8bit: bool = True,
28
+ verbose: bool = False,
29
+ ):
30
+ """Load a llama2 model from `model_path`.
31
+
32
+ Args:
33
+ model_path: Path to the model.
34
+ backend_type: Backend for llama2, options: llama.cpp, gptq, transformers
35
+ max_tokens: Maximum context size.
36
+ load_in_8bit: Use bitsandbytes to run model in 8 bit mode (only for transformers models).
37
+ verbose: Print verbose output to stderr.
38
+
39
+ Raises:
40
+ ValueError: If the model path does not exist.
41
+
42
+ Returns:
43
+ A LLAMA2_WRAPPER instance.
44
+ """
45
+ self.model_path = model_path
46
+ self.backend_type = BackendType.get_type(backend_type)
47
+ self.max_tokens = max_tokens
48
+ self.load_in_8bit = load_in_8bit
49
+
50
+ self.model = None
51
+ self.tokenizer = None
52
+
53
+ self.verbose = verbose
54
+
55
+ if self.backend_type is BackendType.LLAMA_CPP:
56
+ print("Running on backend llama.cpp.")
57
+ else:
58
+ import torch
59
+
60
+ if torch.cuda.is_available():
61
+ print("Running on GPU with backend torch transformers.")
62
+ else:
63
+ print("GPU CUDA not found.")
64
+
65
+ self.default_llamacpp_path = "./models/llama-2-7b-chat.Q4_0.gguf"
66
+ self.default_gptq_path = "./models/Llama-2-7b-Chat-GPTQ"
67
+ # Download default ggml/gptq model
68
+ if self.model_path == "":
69
+ print("Model path is empty.")
70
+ if self.backend_type is BackendType.LLAMA_CPP:
71
+ print("Use default llama.cpp model path: " + self.default_llamacpp_path)
72
+ if not os.path.exists(self.default_llamacpp_path):
73
+ print("Start downloading model to: " + self.default_llamacpp_path)
74
+ from huggingface_hub import hf_hub_download
75
+
76
+ hf_hub_download(
77
+ repo_id="TheBloke/Llama-2-7b-Chat-GGUF",
78
+ filename="llama-2-7b-chat.Q4_0.gguf",
79
+ local_dir="./models/",
80
+ )
81
+ else:
82
+ print("Model exists in ./models/llama-2-7b-chat.Q4_0.gguf.")
83
+ self.model_path = self.default_llamacpp_path
84
+ elif self.backend_type is BackendType.GPTQ:
85
+ print("Use default gptq model path: " + self.default_gptq_path)
86
+ if not os.path.exists(self.default_gptq_path):
87
+ print("Start downloading model to: " + self.default_gptq_path)
88
+ from huggingface_hub import snapshot_download
89
+
90
+ snapshot_download(
91
+ "TheBloke/Llama-2-7b-Chat-GPTQ",
92
+ local_dir=self.default_gptq_path,
93
+ )
94
+ else:
95
+ print("Model exists in " + self.default_gptq_path)
96
+ self.model_path = self.default_gptq_path
97
+
98
+ self.init_tokenizer()
99
+ self.init_model()
100
+
101
+ def init_model(self):
102
+ if self.model is None:
103
+ self.model = LLAMA2_WRAPPER.create_llama2_model(
104
+ self.model_path,
105
+ self.backend_type,
106
+ self.max_tokens,
107
+ self.load_in_8bit,
108
+ self.verbose,
109
+ )
110
+ if self.backend_type is not BackendType.LLAMA_CPP:
111
+ self.model.eval()
112
+
113
+ def init_tokenizer(self):
114
+ if self.backend_type is not BackendType.LLAMA_CPP:
115
+ if self.tokenizer is None:
116
+ self.tokenizer = LLAMA2_WRAPPER.create_llama2_tokenizer(self.model_path)
117
+
118
+ @classmethod
119
+ def create_llama2_model(
120
+ cls, model_path, backend_type, max_tokens, load_in_8bit, verbose
121
+ ):
122
+ if backend_type is BackendType.LLAMA_CPP:
123
+ from llama_cpp import Llama
124
+
125
+ model = Llama(
126
+ model_path=model_path,
127
+ n_ctx=max_tokens,
128
+ n_batch=max_tokens,
129
+ verbose=verbose,
130
+ )
131
+ elif backend_type is BackendType.GPTQ:
132
+ from auto_gptq import AutoGPTQForCausalLM
133
+
134
+ model = AutoGPTQForCausalLM.from_quantized(
135
+ model_path,
136
+ use_safetensors=True,
137
+ trust_remote_code=True,
138
+ device="cuda:0",
139
+ use_triton=False,
140
+ quantize_config=None,
141
+ )
142
+ elif backend_type is BackendType.TRANSFORMERS:
143
+ import torch
144
+ from transformers import AutoModelForCausalLM
145
+
146
+ model = AutoModelForCausalLM.from_pretrained(
147
+ model_path,
148
+ device_map="auto",
149
+ torch_dtype=torch.float16,
150
+ load_in_8bit=load_in_8bit,
151
+ )
152
+ else:
153
+ print(backend_type + "not implemented.")
154
+ return model
155
+
156
+ @classmethod
157
+ def create_llama2_tokenizer(cls, model_path):
158
+ from transformers import AutoTokenizer
159
+
160
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
161
+ return tokenizer
162
+
163
+ def get_token_length(
164
+ self,
165
+ prompt: str,
166
+ ) -> int:
167
+ if self.backend_type is BackendType.LLAMA_CPP:
168
+ input_ids = self.model.tokenize(bytes(prompt, "utf-8"))
169
+ return len(input_ids)
170
+ else:
171
+ input_ids = self.tokenizer([prompt], return_tensors="np")["input_ids"]
172
+ return input_ids.shape[-1]
173
+
174
+ def get_input_token_length(
175
+ self,
176
+ message: str,
177
+ chat_history: list[tuple[str, str]] = [],
178
+ system_prompt: str = "",
179
+ ) -> int:
180
+ prompt = get_prompt(message, chat_history, system_prompt)
181
+
182
+ return self.get_token_length(prompt)
183
+
184
+ def generate(
185
+ self,
186
+ prompt: str,
187
+ max_new_tokens: int = 1000,
188
+ temperature: float = 0.9,
189
+ top_p: float = 1.0,
190
+ top_k: int = 40,
191
+ repetition_penalty: float = 1.0,
192
+ **kwargs: Any,
193
+ ) -> Iterator[str]:
194
+ """Create a generator of response from a prompt.
195
+
196
+ Examples:
197
+ >>> llama2_wrapper = LLAMA2_WRAPPER()
198
+ >>> prompt = get_prompt("Hi do you know Pytorch?")
199
+ >>> for response in llama2_wrapper.generate(prompt):
200
+ ... print(response)
201
+
202
+ Args:
203
+ prompt: The prompt to generate text from.
204
+ max_new_tokens: The maximum number of tokens to generate.
205
+ temperature: The temperature to use for sampling.
206
+ top_p: The top-p value to use for sampling.
207
+ top_k: The top-k value to use for sampling.
208
+ repetition_penalty: The penalty to apply to repeated tokens.
209
+ kwargs: all other arguments.
210
+
211
+ Yields:
212
+ The generated text.
213
+ """
214
+ if self.backend_type is BackendType.LLAMA_CPP:
215
+ result = self.model(
216
+ prompt=prompt,
217
+ stream=True,
218
+ max_tokens=max_new_tokens,
219
+ top_k=top_k,
220
+ top_p=top_p,
221
+ temperature=temperature,
222
+ repeat_penalty=repetition_penalty,
223
+ **kwargs,
224
+ )
225
+ outputs = []
226
+ for part in result:
227
+ text = part["choices"][0]["text"]
228
+ outputs.append(text)
229
+ yield "".join(outputs)
230
+ else:
231
+ from transformers import TextIteratorStreamer
232
+
233
+ inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda")
234
+
235
+ streamer = TextIteratorStreamer(
236
+ self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
237
+ )
238
+ generate_kwargs = dict(
239
+ inputs,
240
+ streamer=streamer,
241
+ max_new_tokens=max_new_tokens,
242
+ temperature=temperature,
243
+ top_p=top_p,
244
+ top_k=top_k,
245
+ repetition_penalty=repetition_penalty,
246
+ # num_beams=1,
247
+ )
248
+ generate_kwargs = (
249
+ generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs}
250
+ )
251
+ t = Thread(target=self.model.generate, kwargs=generate_kwargs)
252
+ t.start()
253
+
254
+ outputs = []
255
+ for text in streamer:
256
+ outputs.append(text)
257
+ yield "".join(outputs)
258
+
259
+ def run(
260
+ self,
261
+ message: str,
262
+ chat_history: list[tuple[str, str]] = [],
263
+ system_prompt: str = "",
264
+ max_new_tokens: int = 1000,
265
+ temperature: float = 0.9,
266
+ top_p: float = 1.0,
267
+ top_k: int = 40,
268
+ repetition_penalty: float = 1.0,
269
+ ) -> Iterator[str]:
270
+ """Create a generator of response from a chat message.
271
+ Process message to llama2 prompt with chat history
272
+ and system_prompt for chatbot.
273
+
274
+ Args:
275
+ message: The origianl chat message to generate text from.
276
+ chat_history: Chat history list from chatbot.
277
+ system_prompt: System prompt for chatbot.
278
+ max_new_tokens: The maximum number of tokens to generate.
279
+ temperature: The temperature to use for sampling.
280
+ top_p: The top-p value to use for sampling.
281
+ top_k: The top-k value to use for sampling.
282
+ repetition_penalty: The penalty to apply to repeated tokens.
283
+ kwargs: all other arguments.
284
+
285
+ Yields:
286
+ The generated text.
287
+ """
288
+ prompt = get_prompt(message, chat_history, system_prompt)
289
+ return self.generate(
290
+ prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty
291
+ )
292
+
293
+ def __call__(
294
+ self,
295
+ prompt: str,
296
+ stream: bool = False,
297
+ max_new_tokens: int = 1000,
298
+ temperature: float = 0.9,
299
+ top_p: float = 1.0,
300
+ top_k: int = 40,
301
+ repetition_penalty: float = 1.0,
302
+ **kwargs: Any,
303
+ ) -> Union[str, Iterator[str]]:
304
+ """Generate text from a prompt.
305
+
306
+ Examples:
307
+ >>> llama2_wrapper = LLAMA2_WRAPPER()
308
+ >>> prompt = get_prompt("Hi do you know Pytorch?")
309
+ >>> print(llama2_wrapper(prompt))
310
+
311
+ Args:
312
+ prompt: The prompt to generate text from.
313
+ stream: Whether to stream the results.
314
+ max_new_tokens: The maximum number of tokens to generate.
315
+ temperature: The temperature to use for sampling.
316
+ top_p: The top-p value to use for sampling.
317
+ top_k: The top-k value to use for sampling.
318
+ repetition_penalty: The penalty to apply to repeated tokens.
319
+ kwargs: all other arguments.
320
+
321
+ Raises:
322
+ ValueError: If the requested tokens exceed the context window.
323
+ RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
324
+
325
+ Returns:
326
+ Generated text.
327
+ """
328
+ if self.backend_type is BackendType.LLAMA_CPP:
329
+ completion_or_chunks = self.model.__call__(
330
+ prompt,
331
+ stream=stream,
332
+ max_tokens=max_new_tokens,
333
+ temperature=temperature,
334
+ top_p=top_p,
335
+ top_k=top_k,
336
+ repeat_penalty=repetition_penalty,
337
+ **kwargs,
338
+ )
339
+ if stream:
340
+
341
+ def chunk_generator(chunks):
342
+ for part in chunks:
343
+ chunk = part["choices"][0]["text"]
344
+ yield chunk
345
+
346
+ chunks: Iterator[str] = chunk_generator(completion_or_chunks)
347
+ return chunks
348
+ return completion_or_chunks["choices"][0]["text"]
349
+ else:
350
+ inputs = self.tokenizer([prompt], return_tensors="pt").input_ids
351
+ prompt_tokens_len = len(inputs[0])
352
+ inputs = inputs.to("cuda")
353
+ generate_kwargs = dict(
354
+ inputs=inputs,
355
+ max_new_tokens=max_new_tokens,
356
+ temperature=temperature,
357
+ top_p=top_p,
358
+ top_k=top_k,
359
+ repetition_penalty=repetition_penalty,
360
+ # num_beams=1,
361
+ )
362
+ generate_kwargs = (
363
+ generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs}
364
+ )
365
+ if stream:
366
+ from transformers import TextIteratorStreamer
367
+
368
+ streamer = TextIteratorStreamer(
369
+ self.tokenizer,
370
+ timeout=10.0,
371
+ skip_prompt=True,
372
+ skip_special_tokens=True,
373
+ )
374
+ generate_kwargs["streamer"] = streamer
375
+
376
+ t = Thread(target=self.model.generate, kwargs=generate_kwargs)
377
+ t.start()
378
+ return streamer
379
+ else:
380
+ output_ids = self.model.generate(
381
+ **generate_kwargs,
382
+ )
383
+ # skip prompt, skip special tokens
384
+ output = self.tokenizer.decode(
385
+ output_ids[0][prompt_tokens_len:], skip_special_tokens=True
386
+ )
387
+ return output
388
+
389
+ def completion(
390
+ self,
391
+ prompt: str,
392
+ stream: bool = False,
393
+ max_new_tokens: int = 1000,
394
+ temperature: float = 0.9,
395
+ top_p: float = 1.0,
396
+ top_k: int = 40,
397
+ repetition_penalty: float = 1.0,
398
+ **kwargs: Any,
399
+ ) -> Union[Completion, Iterator[CompletionChunk]]:
400
+ """For OpenAI compatible API /v1/completions
401
+ Generate text from a prompt.
402
+
403
+ Examples:
404
+ >>> llama2_wrapper = LLAMA2_WRAPPER()
405
+ >>> prompt = get_prompt("Hi do you know Pytorch?")
406
+ >>> print(llm.completion(prompt))
407
+
408
+ Args:
409
+ prompt: The prompt to generate text from.
410
+ stream: Whether to stream the results.
411
+ max_new_tokens: The maximum number of tokens to generate.
412
+ temperature: The temperature to use for sampling.
413
+ top_p: The top-p value to use for sampling.
414
+ top_k: The top-k value to use for sampling.
415
+ repetition_penalty: The penalty to apply to repeated tokens.
416
+ kwargs: all other arguments.
417
+
418
+ Raises:
419
+ ValueError: If the requested tokens exceed the context window.
420
+ RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
421
+
422
+ Returns:
423
+ Response object containing the generated text.
424
+ """
425
+ completion_id: str = f"cmpl-{str(uuid.uuid4())}"
426
+ created: int = int(time.time())
427
+ model_name: str = (
428
+ self.backend_type + " default model"
429
+ if self.model_path == ""
430
+ else self.model_path
431
+ )
432
+ if self.backend_type is BackendType.LLAMA_CPP:
433
+ completion_or_chunks = self.model.__call__(
434
+ prompt,
435
+ stream=stream,
436
+ max_tokens=max_new_tokens,
437
+ temperature=temperature,
438
+ top_p=top_p,
439
+ top_k=top_k,
440
+ repeat_penalty=repetition_penalty,
441
+ **kwargs,
442
+ )
443
+ if stream:
444
+ chunks: Iterator[CompletionChunk] = completion_or_chunks
445
+ return chunks
446
+ return completion_or_chunks
447
+ else:
448
+ inputs = self.tokenizer([prompt], return_tensors="pt").input_ids
449
+ prompt_tokens_len = len(inputs[0])
450
+ inputs = inputs.to("cuda")
451
+ generate_kwargs = dict(
452
+ inputs=inputs,
453
+ max_new_tokens=max_new_tokens,
454
+ temperature=temperature,
455
+ top_p=top_p,
456
+ top_k=top_k,
457
+ repetition_penalty=repetition_penalty,
458
+ # num_beams=1,
459
+ )
460
+ generate_kwargs = (
461
+ generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs}
462
+ )
463
+ if stream:
464
+ from transformers import TextIteratorStreamer
465
+
466
+ streamer = TextIteratorStreamer(
467
+ self.tokenizer,
468
+ timeout=10.0,
469
+ skip_prompt=True,
470
+ skip_special_tokens=True,
471
+ )
472
+ generate_kwargs["streamer"] = streamer
473
+
474
+ t = Thread(target=self.model.generate, kwargs=generate_kwargs)
475
+ t.start()
476
+
477
+ def chunk_generator(chunks):
478
+ for part in chunks:
479
+ yield {
480
+ "id": completion_id,
481
+ "object": "text_completion",
482
+ "created": created,
483
+ "model": model_name,
484
+ "choices": [
485
+ {
486
+ "text": part,
487
+ "index": 0,
488
+ "logprobs": None,
489
+ "finish_reason": None,
490
+ }
491
+ ],
492
+ }
493
+
494
+ chunks: Iterator[CompletionChunk] = chunk_generator(streamer)
495
+ return chunks
496
+
497
+ else:
498
+ output_ids = self.model.generate(
499
+ **generate_kwargs,
500
+ )
501
+ total_tokens_len = len(output_ids[0])
502
+ output = self.tokenizer.decode(
503
+ output_ids[0][prompt_tokens_len:], skip_special_tokens=True
504
+ )
505
+ completion: Completion = {
506
+ "id": completion_id,
507
+ "object": "text_completion",
508
+ "created": created,
509
+ "model": model_name,
510
+ "choices": [
511
+ {
512
+ "text": output,
513
+ "index": 0,
514
+ "logprobs": None,
515
+ "finish_reason": None,
516
+ }
517
+ ],
518
+ "usage": {
519
+ "prompt_tokens": prompt_tokens_len,
520
+ "completion_tokens": total_tokens_len - prompt_tokens_len,
521
+ "total_tokens": total_tokens_len,
522
+ },
523
+ }
524
+ return completion
525
+
526
+ def chat_completion(
527
+ self,
528
+ messages: List[Message],
529
+ stream: bool = False,
530
+ max_new_tokens: int = 1000,
531
+ temperature: float = 0.9,
532
+ top_p: float = 1.0,
533
+ top_k: int = 40,
534
+ repetition_penalty: float = 1.0,
535
+ **kwargs: Any,
536
+ ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
537
+ """For OpenAI compatible API /v1/chat/completions
538
+ Generate text from a dialog (chat history).
539
+
540
+ Examples:
541
+ >>> llama2_wrapper = LLAMA2_WRAPPER()
542
+ >>> dialog = [
543
+ {
544
+ "role":"system",
545
+ "content":"You are a helpful, respectful and honest assistant. "
546
+ },{
547
+ "role":"user",
548
+ "content":"Hi do you know Pytorch?",
549
+ },
550
+ ]
551
+ >>> print(llm.chat_completion(dialog))
552
+
553
+ Args:
554
+ dialog: The dialog (chat history) to generate text from.
555
+ stream: Whether to stream the results.
556
+ max_new_tokens: The maximum number of tokens to generate.
557
+ temperature: The temperature to use for sampling.
558
+ top_p: The top-p value to use for sampling.
559
+ top_k: The top-k value to use for sampling.
560
+ repetition_penalty: The penalty to apply to repeated tokens.
561
+ kwargs: all other arguments.
562
+
563
+ Raises:
564
+ ValueError: If the requested tokens exceed the context window.
565
+ RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
566
+
567
+ Returns:
568
+ Response object containing the generated text.
569
+ """
570
+ completion_id: str = f"cmpl-{str(uuid.uuid4())}"
571
+ created: int = int(time.time())
572
+ model_name: str = (
573
+ self.backend_type + " default model"
574
+ if self.model_path == ""
575
+ else self.model_path
576
+ )
577
+ if self.backend_type is BackendType.LLAMA_CPP:
578
+ completion_or_chunks = self.model.create_chat_completion(
579
+ messages,
580
+ stream=stream,
581
+ max_tokens=max_new_tokens,
582
+ temperature=temperature,
583
+ top_p=top_p,
584
+ top_k=top_k,
585
+ repeat_penalty=repetition_penalty,
586
+ **kwargs,
587
+ )
588
+ if stream:
589
+ chunks: Iterator[ChatCompletionChunk] = completion_or_chunks
590
+ return chunks
591
+ return completion_or_chunks
592
+ else:
593
+ prompt = get_prompt_for_dialog(messages)
594
+ inputs = self.tokenizer([prompt], return_tensors="pt").input_ids
595
+ prompt_tokens_len = len(inputs[0])
596
+ inputs = inputs.to("cuda")
597
+ generate_kwargs = dict(
598
+ inputs=inputs,
599
+ max_new_tokens=max_new_tokens,
600
+ temperature=temperature,
601
+ top_p=top_p,
602
+ top_k=top_k,
603
+ repetition_penalty=repetition_penalty,
604
+ # num_beams=1,
605
+ )
606
+ generate_kwargs = (
607
+ generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs}
608
+ )
609
+ if stream:
610
+ from transformers import TextIteratorStreamer
611
+
612
+ streamer = TextIteratorStreamer(
613
+ self.tokenizer,
614
+ timeout=10.0,
615
+ skip_prompt=True,
616
+ skip_special_tokens=True,
617
+ )
618
+ generate_kwargs["streamer"] = streamer
619
+ t = Thread(target=self.model.generate, kwargs=generate_kwargs)
620
+ t.start()
621
+
622
+ def chunk_generator(chunks):
623
+ yield {
624
+ "id": "chat" + completion_id,
625
+ "model": model_name,
626
+ "created": created,
627
+ "object": "chat.completion.chunk",
628
+ "choices": [
629
+ {
630
+ "index": 0,
631
+ "delta": {
632
+ "role": "assistant",
633
+ },
634
+ "finish_reason": None,
635
+ }
636
+ ],
637
+ }
638
+ for part in enumerate(chunks):
639
+ yield {
640
+ "id": "chat" + completion_id,
641
+ "model": model_name,
642
+ "created": created,
643
+ "object": "chat.completion.chunk",
644
+ "choices": [
645
+ {
646
+ "index": 0,
647
+ "delta": {
648
+ "content": part,
649
+ },
650
+ "finish_reason": None,
651
+ }
652
+ ],
653
+ }
654
+
655
+ chunks: Iterator[ChatCompletionChunk] = chunk_generator(streamer)
656
+ return chunks
657
+
658
+ else:
659
+ output_ids = self.model.generate(
660
+ **generate_kwargs,
661
+ )
662
+ total_tokens_len = len(output_ids[0])
663
+ output = self.tokenizer.decode(
664
+ output_ids[0][prompt_tokens_len:], skip_special_tokens=True
665
+ )
666
+ chatcompletion: ChatCompletion = {
667
+ "id": "chat" + completion_id,
668
+ "object": "chat.completion",
669
+ "created": created,
670
+ "model": model_name,
671
+ "choices": [
672
+ {
673
+ "index": 0,
674
+ "message": {
675
+ "role": "assistant",
676
+ "content": output,
677
+ },
678
+ "finish_reason": None,
679
+ }
680
+ ],
681
+ "usage": {
682
+ "prompt_tokens": prompt_tokens_len,
683
+ "completion_tokens": total_tokens_len - prompt_tokens_len,
684
+ "total_tokens": total_tokens_len,
685
+ },
686
+ }
687
+ return chatcompletion
688
+
689
+
690
def get_prompt_for_dialog(dialog: List[Message]) -> str:
    """Process dialog (chat history) to llama2 prompt for
    OpenAI compatible API /v1/chat/completions.

    Examples:
        >>> dialog = [
            {
                "role":"system",
                "content":"You are a helpful, respectful and honest assistant. "
            },{
                "role":"user",
                "content":"Hi do you know Pytorch?",
            },
        ]
        >>> prompt = get_prompt_for_dialog(dialog)

    Args:
        dialog: The dialog (chat history) to generate text from. An optional
            leading "system" message, then strictly alternating "user" /
            "assistant" messages, ending with a "user" message.

    Returns:
        prompt string.
    """
    # Fold a leading system message into the first user message by
    # prepending "<<SYS>>\n{system_prompt}\n<</SYS>>\n\n" to its content.
    if dialog[0]["role"] == "system":
        dialog = [
            {
                "role": dialog[1]["role"],
                "content": B_SYS + dialog[0]["content"] + E_SYS + dialog[1]["content"],
            }
        ] + dialog[2:]
    # check roles: even positions must be user turns, odd positions assistant
    assert all([msg["role"] == "user" for msg in dialog[::2]]) and all(
        [msg["role"] == "assistant" for msg in dialog[1::2]]
    ), (
        "model only supports 'system', 'user' and 'assistant' roles, "
        "starting with 'system', then 'user' and alternating (u/a/u/a/u...)"
    )
    # add chat history: each completed (user, assistant) pair renders as
    # "[INST] user [/INST] assistant "
    texts = []
    for prompt, answer in zip(
        dialog[::2],
        dialog[1::2],
    ):
        texts.append(
            f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} "
        )
    # check last message if role is user, then add it to prompt text
    assert (
        dialog[-1]["role"] == "user"
    ), f"Last message must be from user, got {dialog[-1]['role']}"
    texts.append(f"{B_INST} {(dialog[-1]['content']).strip()} {E_INST}")
    return "".join(texts)
742
+
743
+
744
def get_prompt(
    message: str, chat_history: list[tuple[str, str]] = [], system_prompt: str = ""
) -> str:
    """Build a llama2 chat prompt from a message, prior turns and a
    system prompt.

    Examples:
        >>> prompt = get_prompt("Hi do you know Pytorch?")

    Args:
        message: The original chat message to generate text from.
        chat_history: Prior (user_input, response) pairs from the chatbot.
        system_prompt: System prompt for the chatbot.

    Returns:
        prompt string.
    """
    # System block always opens the prompt, even when empty.
    prompt = f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
    # Each past exchange closes its turn with </s><s> before the next [INST].
    for user_input, response in chat_history:
        prompt += f"{user_input.strip()} [/INST] {response.strip()} </s><s> [INST] "
    prompt += f"{message.strip()} [/INST]"
    return prompt
766
+
767
+
768
class BackendType(Enum):
    """Inference backend selector for LLAMA2_WRAPPER."""

    UNKNOWN = 0
    TRANSFORMERS = 1
    GPTQ = 2
    LLAMA_CPP = 3

    @classmethod
    def get_type(cls, backend_name: str):
        """Map a free-form backend name to a BackendType member.

        Matching is case-insensitive and substring-based, so e.g.
        "llama.cpp" and "LLAMA_CPP" both resolve to LLAMA_CPP.

        Raises:
            Exception: If no known backend name is recognized.
        """
        lowered = backend_name.lower()
        if "transformers" in lowered:
            return BackendType.TRANSFORMERS
        if "gptq" in lowered:
            return BackendType.GPTQ
        if "cpp" in lowered:
            return BackendType.LLAMA_CPP
        raise Exception("Unknown backend: " + backend_name)
llama2_wrapper/server/__init__.py ADDED
File without changes
llama2_wrapper/server/__main__.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example FastAPI server for llama2_wrapper.
2
+
3
+ To run this example:
4
+
5
+ ```
6
+ python3 -m llama2_wrapper.server
7
+ ```
8
+
9
+ or
10
+
11
+ ```
12
+ uvicorn llama2_wrapper.server.app:app --reload
13
+ ```
14
+
15
+ Then visit http://localhost:8000/docs to see the interactive API docs.
16
+
17
+ """
18
+ import os
19
+ import argparse
20
+
21
+ import uvicorn
22
+
23
+ from llama2_wrapper.server.app import create_app, Settings
24
+
25
+ if __name__ == "__main__":
26
+ parser = argparse.ArgumentParser()
27
+ for name, field in Settings.model_fields.items():
28
+ description = field.description
29
+ if field.default is not None and description is not None:
30
+ description += f" (default: {field.default})"
31
+ parser.add_argument(
32
+ f"--{name}",
33
+ dest=name,
34
+ type=field.annotation if field.annotation is not None else str,
35
+ help=description,
36
+ )
37
+
38
+ args = parser.parse_args()
39
+ settings = Settings(**{k: v for k, v in vars(args).items() if v is not None})
40
+ app = create_app(settings=settings)
41
+
42
+ uvicorn.run(
43
+ app,
44
+ host=os.getenv("HOST", settings.host),
45
+ port=int(os.getenv("PORT", settings.port)),
46
+ )
llama2_wrapper/server/app.py ADDED
@@ -0,0 +1,526 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import multiprocessing
3
+ from re import compile, Match, Pattern
4
+ from threading import Lock
5
+ from functools import partial
6
+ from typing import Callable, Coroutine, Iterator, List, Optional, Tuple, Union, Dict
7
+ from typing_extensions import TypedDict, Literal
8
+
9
+ import anyio
10
+ from anyio.streams.memory import MemoryObjectSendStream
11
+ from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
12
+ from fastapi import Depends, FastAPI, APIRouter, Request, Response
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+ from fastapi.responses import JSONResponse
15
+ from fastapi.routing import APIRoute
16
+ from pydantic import BaseModel, Field
17
+ from pydantic_settings import BaseSettings
18
+ from sse_starlette.sse import EventSourceResponse
19
+
20
+ from llama2_wrapper.model import LLAMA2_WRAPPER
21
+ from llama2_wrapper.types import (
22
+ Completion,
23
+ CompletionChunk,
24
+ ChatCompletion,
25
+ ChatCompletionChunk,
26
+ )
27
+
28
+
29
class Settings(BaseSettings):
    """Server configuration, loadable from environment variables / .env."""

    model_path: str = Field(
        default="",
        description="The path to the model to use for generating completions.",
    )
    backend_type: str = Field(
        default="llama.cpp",
        description="Backend for llama2, options: llama.cpp, gptq, transformers",
    )
    max_tokens: int = Field(default=4000, ge=1, description="Maximum context size.")
    load_in_8bit: bool = Field(
        default=False,
        # fix: description previously began with a stray backtick
        description="Whether to use bitsandbytes to run model in 8 bit mode (only for transformers models).",
    )
    verbose: bool = Field(
        default=False,
        description="Whether to print verbose output to stderr.",
    )
    host: str = Field(default="localhost", description="API address")
    port: int = Field(default=8000, description="API port")
    interrupt_requests: bool = Field(
        default=True,
        description="Whether to interrupt requests when a new request is received.",
    )
53
+
54
+
55
class ErrorResponse(TypedDict):
    """OpenAI style error response"""

    # Human-readable description of the error.
    message: str
    # Machine-readable error category, e.g. "invalid_request_error".
    type: str
    # Request parameter the error relates to, if any (e.g. "messages").
    param: Optional[str]
    # Stable error code, e.g. "context_length_exceeded" / "model_not_found".
    code: Optional[str]
62
+
63
+
64
class ErrorResponseFormatters:
    """Collection of formatters for error responses.

    Each formatter takes the parsed request body and the regex match
    extracted from the backend's error message, and produces an HTTP
    status code plus an OpenAI-style ErrorResponse.

    Args:
        request (Union[CreateCompletionRequest, CreateChatCompletionRequest]):
            Request body
        match (Match[str]): Match object from regex pattern

    Returns:
        Tuple[int, ErrorResponse]: Status code and error response
    """

    @staticmethod
    def context_length_exceeded(
        request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
        match,  # type: Match[str] # type: ignore
    ) -> Tuple[int, ErrorResponse]:
        """Formatter for context length exceeded error"""

        requested_prompt_tokens = int(match.group(1))
        window_size = int(match.group(2))
        requested_completion_tokens = request.max_new_tokens

        # Chat requests carry a "messages" attribute; text requests do not.
        if hasattr(request, "messages"):
            template = (
                "This model's maximum context length is {} tokens. "
                "However, you requested {} tokens "
                "({} in the messages, {} in the completion). "
                "Please reduce the length of the messages or completion."
            )
        else:
            template = (
                "This model's maximum context length is {} tokens, "
                "however you requested {} tokens "
                "({} in your prompt; {} for the completion). "
                "Please reduce your prompt; or completion length."
            )

        detail = template.format(
            window_size,
            requested_prompt_tokens + requested_completion_tokens,
            requested_prompt_tokens,
            requested_completion_tokens,
        )
        return 400, ErrorResponse(
            message=detail,
            type="invalid_request_error",
            param="messages",
            code="context_length_exceeded",
        )

    @staticmethod
    def model_not_found(
        request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
        match,  # type: Match[str] # type: ignore
    ) -> Tuple[int, ErrorResponse]:
        """Formatter for model_not_found error"""

        missing_path = str(match.group(1))
        return 400, ErrorResponse(
            message=f"The model `{missing_path}` does not exist",
            type="invalid_request_error",
            param=None,
            code="model_not_found",
        )
129
+
130
+
131
class RouteErrorHandler(APIRoute):
    """Custom APIRoute that handles application errors and exceptions."""

    # key: regex pattern for original error message from llama_cpp
    # value: formatter function
    pattern_and_formatters: Dict[
        "Pattern",
        Callable[
            [
                Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
                "Match[str]",
            ],
            Tuple[int, ErrorResponse],
        ],
    ] = {
        compile(
            r"Requested tokens \((\d+)\) exceed context window of (\d+)"
        ): ErrorResponseFormatters.context_length_exceeded,
        compile(
            r"Model path does not exist: (.+)"
        ): ErrorResponseFormatters.model_not_found,
    }

    def error_message_wrapper(
        self,
        error: Exception,
        body: Optional[
            Union[
                "CreateChatCompletionRequest",
                "CreateCompletionRequest",
            ]
        ] = None,
    ) -> Tuple[int, ErrorResponse]:
        """Wraps error message in OpenAI style error response.

        If the request body is a completion request, known backend error
        messages are matched and mapped to a 400 response; everything
        else becomes a generic 500 internal_server_error.
        """
        if body is not None and isinstance(
            body,
            (
                CreateCompletionRequest,
                CreateChatCompletionRequest,
            ),
        ):
            # When text completion or chat completion
            for pattern, callback in self.pattern_and_formatters.items():
                match = pattern.search(str(error))
                if match is not None:
                    return callback(body, match)

        # Wrap other errors as internal server error
        return 500, ErrorResponse(
            message=str(error),
            type="internal_server_error",
            param=None,
            code=None,
        )

    def get_route_handler(
        self,
    ) -> Callable[[Request], Coroutine[None, None, Response]]:
        """Defines custom route handler that catches exceptions and formats
        in OpenAI style error response"""

        original_route_handler = super().get_route_handler()

        async def custom_route_handler(request: Request) -> Response:
            try:
                return await original_route_handler(request)
            except Exception as exc:
                # fix: pre-bind body so it is never unbound when the JSON has
                # neither "messages" nor "prompt", and keep request.json()
                # inside the try so a non-JSON body cannot raise uncaught
                # from within this handler.
                body: Optional[
                    Union[
                        CreateChatCompletionRequest,
                        CreateCompletionRequest,
                    ]
                ] = None
                try:
                    json_body = await request.json()
                    if "messages" in json_body:
                        # Chat completion
                        body = CreateChatCompletionRequest(**json_body)
                    elif "prompt" in json_body:
                        # Text completion
                        body = CreateCompletionRequest(**json_body)
                    # else:
                    #     # Embedding
                    #     body = CreateEmbeddingRequest(**json_body)
                except Exception:
                    # Invalid request body
                    body = None

                # Get proper error message from the exception
                (
                    status_code,
                    error_message,
                ) = self.error_message_wrapper(error=exc, body=body)
                return JSONResponse(
                    {"error": error_message},
                    status_code=status_code,
                )

        return custom_route_handler
230
+
231
+
232
# All routes share the OpenAI-style error-formatting route handler.
router = APIRouter(route_class=RouteErrorHandler)

# Module-level singletons, populated by create_app().
settings: Optional[Settings] = None
llama2: Optional[LLAMA2_WRAPPER] = None
236
+
237
+
238
def create_app(settings: Optional[Settings] = None):
    """Create and configure the FastAPI application.

    Loads the model into the module-level `llama2` singleton, installs
    permissive CORS, and publishes the resolved settings via the
    module-level `settings` global (read by get_settings / the event
    publisher).

    Args:
        settings: Server configuration; defaults to env-derived Settings().

    Returns:
        The configured FastAPI app.
    """
    if settings is None:
        settings = Settings()
    app = FastAPI(
        title="llama2-wrapper Fast API",
        version="0.0.1",
    )
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    app.include_router(router)
    global llama2
    llama2 = LLAMA2_WRAPPER(
        model_path=settings.model_path,
        backend_type=settings.backend_type,
        max_tokens=settings.max_tokens,
        load_in_8bit=settings.load_in_8bit,
        # fix: was settings.load_in_8bit (copy-paste bug)
        verbose=settings.verbose,
    )

    def set_settings(_settings: Settings):
        global settings
        settings = _settings

    set_settings(settings)
    return app
268
+
269
+
270
# Double-lock pair guarding access to the single global llama2 instance.
llama_outer_lock = Lock()
llama_inner_lock = Lock()


def get_llama():
    """FastAPI dependency that yields the shared LLAMA2_WRAPPER instance.

    The inner lock serializes actual model use; the outer lock is held only
    while waiting to enter, so a currently streaming request can observe
    `llama_outer_lock.locked()` (see get_event_publisher) and cancel itself
    in favor of the newcomer.
    """
    # NOTE: This double lock allows the currently streaming llama model to
    # check if any other requests are pending in the same thread and cancel
    # the stream if so.
    llama_outer_lock.acquire()
    release_outer_lock = True
    try:
        llama_inner_lock.acquire()
        try:
            # Hand the outer lock back so the next waiter can signal its
            # presence while we still hold exclusive (inner) model access.
            llama_outer_lock.release()
            release_outer_lock = False
            yield llama2
        finally:
            llama_inner_lock.release()
    finally:
        # Only release the outer lock if we failed before handing it back.
        if release_outer_lock:
            llama_outer_lock.release()
291
+
292
+
293
def get_settings():
    """FastAPI dependency yielding the module-level Settings set by create_app()."""
    yield settings
295
+
296
+
297
async def get_event_publisher(
    request: Request,
    inner_send_chan: MemoryObjectSendStream,
    iterator: Iterator,
):
    """Pump chunks from a blocking generator into an SSE send channel.

    Each chunk is emitted as a JSON `data:` event; the stream is terminated
    with a "[DONE]" event. The stream is cancelled when the client
    disconnects or, if `settings.interrupt_requests` is set, when another
    request is waiting on the model (detected via `llama_outer_lock`).
    """
    async with inner_send_chan:
        try:
            async for chunk in iterate_in_threadpool(iterator):
                await inner_send_chan.send(dict(data=json.dumps(chunk)))
                if await request.is_disconnected():
                    raise anyio.get_cancelled_exc_class()()
                # A waiting request holds the outer lock; end this stream
                # early so it can proceed.
                if settings.interrupt_requests and llama_outer_lock.locked():
                    await inner_send_chan.send(dict(data="[DONE]"))
                    raise anyio.get_cancelled_exc_class()()
            await inner_send_chan.send(dict(data="[DONE]"))
        except anyio.get_cancelled_exc_class() as e:
            print("disconnected")
            # Shielded so the goodbye log still runs while being cancelled.
            with anyio.move_on_after(1, shield=True):
                print(f"Disconnected from client (via refresh/close) {request.client}")
            raise e
317
+
318
+
319
# Shared pydantic Field definitions reused by both request models below,
# keeping defaults/bounds/descriptions consistent across endpoints.
stream_field = Field(
    default=False,
    description="Whether to stream the results as they are generated. Useful for chatbots.",
)
max_new_tokens_field = Field(
    default=1000, ge=1, description="The maximum number of tokens to generate."
)

temperature_field = Field(
    default=0.9,
    ge=0.0,
    le=2.0,
    description="The temperature to use for sampling.",
)

top_p_field = Field(
    default=1.0,
    ge=0.0,
    le=1.0,
    description="The top-p value to use for sampling.",
)
top_k_field = Field(
    default=40,
    ge=0,
    description="The top-k value to use for sampling.",
)
repetition_penalty_field = Field(
    default=1.0,
    ge=0.0,
    description="The penalty to apply to repeated tokens.",
)
# stop_field = Field(
#     default=None,
#     description="A list of tokens at which to stop generation. If None, no stop tokens are used.",
# )
354
+
355
+
356
class CreateCompletionRequest(BaseModel):
    """Request body for POST /v1/completions (OpenAI-style text completion)."""

    prompt: Union[str, List[str]] = Field(
        default="", description="The prompt to generate text from."
    )
    stream: bool = stream_field
    max_new_tokens: int = max_new_tokens_field
    temperature: float = temperature_field
    top_p: float = top_p_field
    top_k: int = top_k_field
    repetition_penalty: float = repetition_penalty_field
    # stop: Optional[Union[str, List[str]]] = stop_field

    # Example payload shown in the OpenAPI/Swagger docs.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
                    # "stop": ["\n", "###"],
                }
            ]
        }
    }
378
+
379
+
380
@router.post(
    "/v1/completions",
)
async def create_completion(
    request: Request,
    body: CreateCompletionRequest,
    llama2: LLAMA2_WRAPPER = Depends(get_llama),
) -> Completion:
    """Create a text completion.

    Runs generation in a worker thread. When `stream` is set the response
    is sent as server-sent events; otherwise a single JSON completion is
    returned.
    """
    if isinstance(body.prompt, list):
        # Only a single prompt is supported; unwrap it (or default to "").
        assert len(body.prompt) <= 1
        body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""

    kwargs = body.model_dump()

    iterator_or_completion: Union[
        Completion, Iterator[CompletionChunk]
    ] = await run_in_threadpool(llama2.completion, **kwargs)

    if isinstance(iterator_or_completion, Iterator):
        # Pull the first chunk eagerly so generation errors surface here
        # (and get formatted by RouteErrorHandler) instead of mid-stream.
        first_response = await run_in_threadpool(next, iterator_or_completion)

        # If no exception was raised from first_response, we can assume that
        # the iterator is valid and we can use it to stream the response.
        def iterator() -> Iterator[CompletionChunk]:
            yield first_response
            yield from iterator_or_completion

        send_chan, recv_chan = anyio.create_memory_object_stream(10)
        return EventSourceResponse(
            recv_chan,
            data_sender_callable=partial(  # type: ignore
                get_event_publisher,
                request=request,
                inner_send_chan=send_chan,
                iterator=iterator(),
            ),
        )
    else:
        return iterator_or_completion
419
+
420
+
421
class ChatCompletionRequestMessage(BaseModel):
    """One chat message within a /v1/chat/completions request."""

    role: Literal["system", "user", "assistant"] = Field(
        default="user", description="The role of the message."
    )
    content: str = Field(default="", description="The content of the message.")
426
+
427
+
428
class CreateChatCompletionRequest(BaseModel):
    """Request body for POST /v1/chat/completions (OpenAI-style chat)."""

    messages: List[ChatCompletionRequestMessage] = Field(
        default=[], description="A list of messages to generate completions for."
    )
    stream: bool = stream_field
    max_new_tokens: int = max_new_tokens_field
    temperature: float = temperature_field
    top_p: float = top_p_field
    top_k: int = top_k_field
    repetition_penalty: float = repetition_penalty_field
    # stop: Optional[List[str]] = stop_field

    # Example payload shown in the OpenAPI/Swagger docs.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "messages": [
                        ChatCompletionRequestMessage(
                            role="system", content="You are a helpful assistant."
                        ).model_dump(),
                        ChatCompletionRequestMessage(
                            role="user", content="What is the capital of France?"
                        ).model_dump(),
                    ]
                }
            ]
        }
    }
456
+
457
+
458
@router.post(
    "/v1/chat/completions",
)
async def create_chat_completion(
    request: Request,
    body: CreateChatCompletionRequest,
    llama2: LLAMA2_WRAPPER = Depends(get_llama),
    settings: Settings = Depends(get_settings),
) -> ChatCompletion:
    """Create a chat completion.

    Runs generation in a worker thread. When `stream` is set the response
    is sent as server-sent events; otherwise a single JSON completion is
    returned.
    """
    kwargs = body.model_dump()

    iterator_or_completion: Union[
        ChatCompletion, Iterator[ChatCompletionChunk]
    ] = await run_in_threadpool(llama2.chat_completion, **kwargs)

    if isinstance(iterator_or_completion, Iterator):
        # Pull the first chunk eagerly so generation errors surface here
        # (and get formatted by RouteErrorHandler) instead of mid-stream.
        first_response = await run_in_threadpool(next, iterator_or_completion)

        # If no exception was raised from first_response, we can assume that
        # the iterator is valid and we can use it to stream the response.
        def iterator() -> Iterator[ChatCompletionChunk]:
            yield first_response
            yield from iterator_or_completion

        send_chan, recv_chan = anyio.create_memory_object_stream(10)
        return EventSourceResponse(
            recv_chan,
            data_sender_callable=partial(  # type: ignore
                get_event_publisher,
                request=request,
                inner_send_chan=send_chan,
                iterator=iterator(),
            ),
        )
    else:
        return iterator_or_completion
494
+
495
+
496
class ModelData(TypedDict):
    """OpenAI-style description of a single available model."""

    id: str
    object: Literal["model"]
    owned_by: str
    permissions: List[str]


class ModelList(TypedDict):
    """Envelope returned by GET /v1/models."""

    object: Literal["list"]
    data: List[ModelData]
506
+
507
+
508
+ @router.get("/v1/models")
509
+ async def get_models(
510
+ settings: Settings = Depends(get_settings),
511
+ ) -> ModelList:
512
+ assert llama2 is not None
513
+
514
+ return {
515
+ "object": "list",
516
+ "data": [
517
+ {
518
+ "id": settings.backend_type + " default model"
519
+ if settings.model_path == ""
520
+ else settings.model_path,
521
+ "object": "model",
522
+ "owned_by": "me",
523
+ "permissions": [],
524
+ }
525
+ ],
526
+ }
llama2_wrapper/types.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, List, Optional, Dict, Union
2
+ from typing_extensions import TypedDict, NotRequired, Literal
3
+
4
# Llama-2 chat prompt delimiters: [INST]...[/INST] wraps each user turn,
# and <<SYS>>...<</SYS>> wraps the system prompt inside the first turn.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"


# Role = Literal["system", "user", "assistant"]
# class Message(TypedDict):
#     role: Role
#     content: str
12
+
13
+
14
class ChatCompletionMessage(TypedDict):
    """A single chat message (llama.cpp-compatible shape)."""

    role: Literal["assistant", "user", "system"]
    content: str
    # Optional end-user identifier; may be omitted.
    user: NotRequired[str]


# transformers: Message; llama.cpp: ChatCompletionMessage
Message = ChatCompletionMessage
Dialog = List[Message]
23
+
24
+
25
class EmbeddingUsage(TypedDict):
    """Token accounting for an embedding request."""

    prompt_tokens: int
    total_tokens: int


class EmbeddingData(TypedDict):
    """One embedding vector within an Embedding response."""

    index: int
    object: str
    embedding: List[float]


class Embedding(TypedDict):
    """OpenAI-style embedding response envelope."""

    object: Literal["list"]
    model: str
    data: List[EmbeddingData]
    usage: EmbeddingUsage
41
+
42
+
43
class CompletionLogprobs(TypedDict):
    """Per-token log-probability details for a completion choice."""

    text_offset: List[int]
    token_logprobs: List[Optional[float]]
    tokens: List[str]
    top_logprobs: List[Optional[Dict[str, float]]]


class CompletionChoice(TypedDict):
    """One generated alternative within a text completion (or chunk)."""

    text: str
    index: int
    logprobs: Optional[CompletionLogprobs]
    finish_reason: Optional[str]


class CompletionUsage(TypedDict):
    """Token accounting for a completion request."""

    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


class CompletionChunk(TypedDict):
    """A streamed slice of a text completion (carries no usage info)."""

    id: str
    object: Literal["text_completion"]
    created: int
    model: str
    choices: List[CompletionChoice]


class Completion(TypedDict):
    """A finished text completion, including token usage."""

    id: str
    object: Literal["text_completion"]
    created: int
    model: str
    choices: List[CompletionChoice]
    usage: CompletionUsage
78
+
79
+
80
class ChatCompletionChoice(TypedDict):
    """One generated alternative within a full chat completion."""

    index: int
    message: ChatCompletionMessage
    finish_reason: Optional[str]


class ChatCompletion(TypedDict):
    """A finished chat completion, including token usage."""

    id: str
    object: Literal["chat.completion"]
    created: int
    model: str
    choices: List[ChatCompletionChoice]
    usage: CompletionUsage


class ChatCompletionChunkDeltaEmpty(TypedDict):
    """Delta variant carrying no fields."""

    pass


class ChatCompletionChunkDelta(TypedDict):
    """Incremental update carried by a streamed chat chunk."""

    role: NotRequired[Literal["assistant"]]
    content: NotRequired[str]


class ChatCompletionChunkChoice(TypedDict):
    """One choice slot within a streamed chat chunk."""

    index: int
    delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty]
    finish_reason: Optional[str]


class ChatCompletionChunk(TypedDict):
    """A streamed slice of a chat completion."""

    id: str
    model: str
    object: Literal["chat.completion.chunk"]
    created: int
    choices: List[ChatCompletionChunkChoice]
nohup.out ADDED
The diff for this file is too large to render. See raw diff
 
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
prompts/__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.25 kB). View file
 
prompts/prompts_en.csv ADDED
The diff for this file is too large to render. See raw diff
 
prompts/utils.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import os
3
+ from hashlib import md5
4
+
5
+
6
def read_csv_to_dict_list(file_path):
    """Load a CSV file and return its rows as dicts keyed by the header row."""
    with open(file_path, mode="r", encoding="utf-8") as csv_file:
        return list(csv.DictReader(csv_file))
+
12
+
13
def split_list_with_key(lst, dict_key):
    """Group a list of dicts by the value found under *dict_key*.

    Rows missing the key are grouped under None (dict.get semantics);
    input order is preserved within each group.
    """
    result = {}
    for row in lst:
        # setdefault avoids the triple row.get() lookup of the naive version.
        result.setdefault(row.get(dict_key), []).append(row)
    return result
20
+
21
+
22
def read_csv_to_type_dict(file_path, type_key):
    """Read a CSV file and bucket its rows by the value of *type_key*."""
    rows = read_csv_to_dict_list(file_path=file_path)
    return split_list_with_key(lst=rows, dict_key=type_key)
25
+
26
+
27
def md5_str(text):
    """Return the hex MD5 digest of *text* encoded as UTF-8.

    Parameter renamed from `str`, which shadowed the builtin; all callers
    in this module pass it positionally.
    """
    return md5(text.encode("utf8")).hexdigest()
29
+
30
+
31
+ current_dir = os.path.dirname(__file__)
32
+
33
+
34
class PromtsContainer(object):
    """Loads the bundled example prompts and indexes them by type and summary."""

    def __init__(self) -> None:
        csv_path = os.path.join(current_dir, "prompts_en.csv")
        # Rows bucketed by their "type" column.
        self.data = read_csv_to_type_dict(csv_path, "type")
        # md5(summary) -> full prompt text, for constant-size lookup keys.
        summary_to_prompt = {}
        for rows in self.data.values():
            for row in rows:
                summary_to_prompt[md5_str(row.get("summary"))] = row.get("prompt")
        self.summary_dict = summary_to_prompt

    def get_prompts_tab_dict(self):
        """Return prompts grouped by their "type" column."""
        return self.data

    def get_prompt_by_summary(self, summary):
        """Return the full prompt for *summary*, or *summary* itself if unknown."""
        return self.summary_dict.get(md5_str(summary), summary)
pyproject.toml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "llama2-wrapper"
3
+ version = "0.1.14"
4
+ description = "Use llama2-wrapper as your local llama2 backend for Generative Agents / Apps"
5
+ authors = ["liltom-eth <liltom.eth@gmail.com>"]
6
+ license = "MIT"
7
+ homepage = "https://github.com/liltom-eth/llama2-webui"
8
+ repository = "https://github.com/liltom-eth/llama2-webui"
9
+ readme = "./docs/pypi.md"
10
+
11
+ packages = [{include = "llama2_wrapper"}]
12
+
13
+ [tool.poetry.dependencies]
14
+ python = ">=3.10,<3.13"
15
+ accelerate = "^0.21.0"
16
+ auto-gptq = "0.3.0"
17
+ gradio = "3.37.0"
18
+ protobuf = "3.20.3"
19
+ scipy = "1.11.1"
20
+ sentencepiece = "0.1.99"
21
+ torch = "2.0.1"
22
+ transformers = "4.31.0"
23
+ tqdm = "4.65.0"
24
+ python-dotenv = "1.0.0"
25
+ llama-cpp-python = "0.2.11"
26
+ bitsandbytes = [
27
+ {platform = 'linux', version = "0.40.2"},
28
+ {platform = 'darwin', version = "0.40.2"},
29
+ ]
30
+ memory-profiler = "0.61.0"
31
+ huggingface-hub = "0.16.4"
32
+ fastapi = "0.100.0"
33
+ uvicorn = "0.23.1"
34
+ sse-starlette = "1.6.5"
35
+ pydantic = "2.2.1"
36
+ pydantic-settings = "2.0.3"
37
+ pytest = "7.4.0"
38
+ black = "23.7.0"
39
+
40
+
41
+ [build-system]
42
+ requires = ["poetry-core"]
43
+ build-backend = "poetry.core.masonry.api"
44
+
45
+ [virtualenvs]
46
+ create = true
47
+ in-project = true
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==0.21.0
2
+ auto-gptq==0.3.0
3
+ bitsandbytes==0.40.2
4
+ gradio==3.37.0
5
+ protobuf==3.20.3
6
+ scipy==1.11.1
7
+ sentencepiece==0.1.99
8
+ torch==2.0.1
9
+ transformers==4.31.0
10
+ tqdm==4.65.0
11
+ python-dotenv==1.0.0
12
+ llama-cpp-python==0.2.11
13
+ memory-profiler==0.61.0
14
+ huggingface-hub==0.16.4
15
+ fastapi==0.100.0
16
+ uvicorn==0.23.1
17
+ sse-starlette==1.6.5
18
+ pydantic==2.2.1
19
+ pydantic-settings==2.0.3
20
+ pytest==7.4.0
21
+ black==23.7.0
static/screenshot.png ADDED
tests/__init__.py ADDED
File without changes
tests/test_get_prompt.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from llama2_wrapper.model import get_prompt_for_dialog
3
+
4
+
5
class TestClassGetPromptForDialog:
    """Tests for get_prompt_for_dialog's Llama-2 chat prompt formatting."""

    from llama2_wrapper.types import Message

    # dialog: a system message followed by one user turn.
    dialog = []
    message1 = Message(
        role="system",
        content="You are a helpful, respectful and honest assistant. ",
    )
    message2 = Message(
        role="user",
        content="Hi do you know Pytorch?",
    )
    dialog.append(message1)
    dialog.append(message2)

    # dialog2: multi-turn — system, user, assistant, user.
    dialog2 = []
    dialog2.append(message1)
    dialog2.append(message2)
    message3 = Message(
        role="assistant",
        content="Yes I know Pytorch. ",
    )
    message4 = Message(
        role="user",
        content="Can you write a CNN in Pytorch?",
    )
    dialog2.append(message3)
    dialog2.append(message4)

    # dialog3: malformed ordering — starts and ends with assistant turns,
    # which get_prompt_for_dialog is expected to reject (see test_dialog3).
    dialog3 = []
    dialog3.append(message3)
    dialog3.append(message4)
    dialog3.append(message3)
    dialog3.append(message4)
    message5 = Message(
        role="assistant",
        content="Yes I can write a CNN in Pytorch.",
    )
    dialog3.append(message5)

    def test_dialog1(self):
        """System + first user turn collapse into one [INST] block with <<SYS>>."""
        prompt = get_prompt_for_dialog(self.dialog)
        # print(prompt)
        result = """[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. \n<</SYS>>\n\nHi do you know Pytorch? [/INST]"""
        assert prompt == result

    def test_dialog2(self):
        """Assistant replies are interleaved between [INST] ... [/INST] turns."""
        prompt = get_prompt_for_dialog(self.dialog2)
        # print(prompt)
        result = """[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. \n<</SYS>>\n\nHi do you know Pytorch? [/INST] Yes I know Pytorch. [INST] Can you write a CNN in Pytorch? [/INST]"""
        assert prompt == result

    def test_dialog3(self):
        """A dialog violating the role ordering raises AssertionError."""
        with pytest.raises(AssertionError):
            prompt = get_prompt_for_dialog(self.dialog3)