Nekochu justest commited on
Commit
cab2adc
0 Parent(s):

Duplicate from justest/vicuna-v1.3-ggml

Browse files

Co-authored-by: lw <justest@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.github/workflows/build-llama-cpp-wheel.yml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Builds the GPU (cuBLAS) and CPU llama-cpp-python wheels inside Docker and,
# when a release is published, attaches them to that release.
name: Build wheel in Docker

on:
  push:
    branches:
      - main
    paths:
      - 'Dockerfile-llama-cpp-wheel'
  release:
    types: [published]

jobs:
  build:
    runs-on: self-hosted
    # The build job only reads the repository; write access is reserved for
    # the release job below.
    permissions:
      contents: read
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Build Docker image
        run: docker build . -t artifact-builder -f Dockerfile-llama-cpp-wheel

      # A self-hosted runner keeps containers between runs, so a leftover
      # container with this name would make `docker run --name` fail.
      - name: Remove stale container
        run: docker rm -f my-artifact-builder || true

      - name: Run Docker container
        run: docker run --name my-artifact-builder artifact-builder

      # Copy the whole output directory instead of naming each wheel, so the
      # version/ABI tags chosen in the Dockerfile are not duplicated here.
      - name: Copy GPU & CPU artifact from Docker container
        run: docker cp my-artifact-builder:/build/dists/. ./

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: wheels
          path: |
            *.whl
          if-no-files-found: error

      - name: Remove container
        if: always()
        run: docker rm -f my-artifact-builder || true

  release:
    needs: build
    runs-on: self-hosted
    if: github.event_name == 'release'
    permissions:
      contents: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          name: wheels

      - name: Release
        uses: softprops/action-gh-release@v2
        with:
          files: |
            *.whl
          token: ${{ secrets.GITHUB_TOKEN }}
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .idea
Dockerfile-llama-cpp-wheel ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Builds two llama-cpp-python wheels for CPython 3.8 -- one with cuBLAS, one
# CPU-only -- and leaves them in /build/dists/.
FROM nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04

ARG LLAMA_CPP_VERSION="0.1.52"
ARG CMAKE_VERSION=3.26
ARG CMAKE_VERSION_PATCH=3.26.3
ARG CMAKE_OS=linux
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=UTC

# Toolchain and Python build dependencies, plus gcc/g++ 10 from the toolchain
# PPA. `-y` keeps add-apt-repository from waiting for confirmation.
RUN apt-get update && \
    apt-get install --no-install-recommends -y \
        curl git vim build-essential software-properties-common python3 python3-pip python3-dev python3-venv \
        libffi-dev libncurses5-dev zlib1g zlib1g-dev libreadline-dev libbz2-dev libsqlite3-dev libssl-dev \
        libblas-dev liblapack-dev libopenblas-dev cmake && \
    add-apt-repository -y ppa:ubuntu-toolchain-r/test && \
    apt-get update && \
    apt-get install --no-install-recommends -y gcc-10 g++-10 && \
    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10 && \
    rm -rf /var/lib/apt/lists/* && \
    pip3 install scikit-build

# Install the requested CMake release under /opt/cmake and expose it on PATH.
# `-f` makes curl fail on an HTTP error instead of saving the error page.
RUN curl -fL https://cmake.org/files/v$CMAKE_VERSION/cmake-$CMAKE_VERSION_PATCH-$CMAKE_OS-x86_64.sh -o /tmp/cmake-$CMAKE_VERSION_PATCH-$CMAKE_OS-x86_64.sh && \
    mkdir /opt/cmake && \
    sh /tmp/cmake-$CMAKE_VERSION_PATCH-$CMAKE_OS-x86_64.sh --skip-license --prefix=/opt/cmake && \
    ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake && \
    rm /tmp/cmake-$CMAKE_VERSION_PATCH-$CMAKE_OS-x86_64.sh

RUN useradd -m -u 1000 appuser

WORKDIR /build
RUN chown appuser:appuser /build
USER appuser

ENV HOME=/home/appuser
ENV PYENV_ROOT=$HOME/.pyenv
ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH

# Check out the tagged release together with the llama.cpp submodule commit
# that tag records, rather than whatever llama.cpp HEAD happens to be.
RUN git clone --depth 1 --branch v$LLAMA_CPP_VERSION \
        --recurse-submodules --no-shallow-submodules \
        https://github.com/abetlen/llama-cpp-python.git /build
RUN curl -fsSL https://pyenv.run | bash

# Build the GPU wheel first, then the CPU wheel. Both builds emit the same
# file name under dist/, so each is copied to a distinct name in dists/.
# The scikit-build tree is removed in between so the CPU build starts clean.
RUN pyenv install 3.8.9 && \
    pyenv global 3.8.9 && \
    pyenv rehash && \
    pip install --no-cache-dir --upgrade pip==22.3.1 setuptools wheel && \
    pip install --no-cache-dir datasets "huggingface-hub>=0.12.1" "protobuf<4" "click<8.1" "scikit-build" && \
    CMAKE_ARGS="-DLLAMA_CUBLAS=on -DLLAMA_OPENBLAS=off" FORCE_CMAKE=1 python3 setup.py bdist_wheel && \
    mkdir /build/dists/ && \
    cp dist/llama_cpp_python-${LLAMA_CPP_VERSION}-cp38-cp38-linux_x86_64.whl dists/llama_cpp_python-gpu-${LLAMA_CPP_VERSION}-cp38-cp38-linux_x86_64.whl && \
    rm -rf _skbuild && \
    CMAKE_ARGS="-DLLAMA_CUBLAS=off -DLLAMA_OPENBLAS=off" FORCE_CMAKE=1 python3 setup.py bdist_wheel && \
    cp dist/llama_cpp_python-${LLAMA_CPP_VERSION}-cp38-cp38-linux_x86_64.whl dists/llama_cpp_python-cpu-${LLAMA_CPP_VERSION}-cp38-cp38-linux_x86_64.whl && \
    ls -l /build/dists/
README.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Vicuna V1.3 GGML
3
+ emoji: 🏃
4
+ colorFrom: blue
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 3.29.0
8
+ app_file: tabbed.py
9
+ pinned: false
10
+ duplicated_from: justest/vicuna-v1.3-ggml
11
+ ---
12
+
13
+ # GGML UI Inference w/ HuggingFace Spaces
14
+
15
+ - Fork this space to use your own GGML models. Simply update the [./config.yml](./config.yml)
16
+ - Contribute at [https://github.com/OpenAccess-AI-Collective/ggml-webui](https://github.com/OpenAccess-AI-Collective/ggml-webui)
17
+
18
+ Brought to you by [OpenAccess AI Collective](https://github.com/OpenAccess-AI-Collective)
config.yml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ hub:
3
+ repo_id: TheBloke/vicuna-13b-v1.3.0-GGML
4
+ filename: vicuna-13b-v1.3.0.ggmlv3.q2_K.bin
5
+ # repo_id: TheBloke/Wizard-Vicuna-13B-Uncensored-GGML
6
+ # filename: Wizard-Vicuna-13B-Uncensored.ggmlv3.q4_1.bin
7
+ llama_cpp:
8
+ n_ctx: 2048
9
+ # n_gpu_layers: 40 # llama 13b has 40 layers
10
+ chat:
11
+ stop:
12
+ - "</s>"
13
+ - "<unk>"
14
+ - "### USER:"
15
+ - "USER:"
16
+ queue:
17
+ max_size: 16
18
+ concurrency_count: 1 # leave this at 1, llama-cpp-python doesn't handle concurrent requests and will crash the entire app
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ --extra-index-url https://pypi.ngc.nvidia.com
2
+ llama-cpp-python
3
+ pyyaml
4
+ torch
tabbed.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time

import gradio as gr
import yaml
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import LocalEntryNotFoundError
from llama_cpp import Llama
6
+
7
# Load the Space configuration: model location, llama.cpp options, chat and
# queue settings. The file only needs plain mappings/scalars, so safe_load is
# used; unlike yaml.Loader it cannot construct arbitrary Python objects.
with open("./config.yml", "r") as f:
    config = yaml.safe_load(f)

# Pop the two named arguments from a copy so config["hub"] stays intact;
# the UI banner reads repo_id/filename from it later.
hub_config = dict(config["hub"])
repo_id = hub_config.pop("repo_id")
filename = hub_config.pop("filename")

# Fetch the model file. Connection errors are retried indefinitely, but with
# a capped exponential backoff so an outage does not become a tight loop of
# requests; any other lookup failure is re-raised unchanged.
retry_delay = 1.0
while True:
    try:
        fp = hf_hub_download(repo_id=repo_id, filename=filename, **hub_config)
        break
    except LocalEntryNotFoundError as e:
        if "Connection error" not in str(e):
            raise
        print(str(e) + ", retrying...")
        time.sleep(retry_delay)
        retry_delay = min(retry_delay * 2, 60.0)

llm = Llama(model_path=fp, **config["llama_cpp"])
26
+
27
+
28
def user(message, history):
    """Record a new user turn and hand back an empty input string.

    Args:
        message: Text the user just submitted.
        history: Existing list of ``[user_text, assistant_text]`` pairs, or a
            falsy value when the conversation has not started yet.

    Returns:
        A ``("", history)`` tuple: the empty string for the message box and
        the history with ``[message, ""]`` appended. A non-empty ``history``
        list is extended in place.
    """
    if not history:
        history = []
    pending_turn = [message, ""]
    history.append(pending_turn)
    return "", history
33
+
34
+
35
def _build_prompt(history, system_message, user_tag, assistant_tag, system_tag=""):
    """Render the conversation as one prompt ending in an open assistant turn.

    Args:
        history: Non-empty list of ``[user_text, assistant_text]`` pairs; the
            last pair is the turn awaiting a reply.
        system_message: System prompt; surrounding whitespace is stripped.
        user_tag: Prefix placed before every user message.
        assistant_tag: Prefix placed before every assistant message.
        system_tag: Optional prefix placed before the system prompt.

    Returns:
        The prompt string, one line per system/user/assistant entry.
    """
    lines = [system_tag + system_message.strip()]
    for turn in history[:-1]:
        lines.append(user_tag + turn[0])
        lines.append(assistant_tag + turn[1])
    lines.append(user_tag + history[-1][0])
    # Leave no space after the final assistant tag: some models output a ZWSP
    # if one is left there. Only spaces are stripped, so a tag that has no
    # trailing space (e.g. "<|model|>") is kept whole.
    lines.append(assistant_tag.rstrip(" "))
    return "\n".join(lines)


def _stream_reply(history, prompt, max_tokens, temperature, top_p, top_k, repeat_penalty):
    """Stream a completion for ``prompt`` into the last turn of ``history``."""
    history[-1][1] = ""
    for output in llm(
        prompt,
        echo=False,
        stream=True,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repeat_penalty=repeat_penalty,
        **config['chat']
    ):
        history[-1][1] += output['choices'][0]['text']
        # stream the response
        yield history, history


def chat(history, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
    """Stream the assistant's reply using the ``USER:`` / ``ASSISTANT:`` format.

    Args:
        history: List of ``[user_text, assistant_text]`` pairs whose last
            entry is the turn to answer; its reply slot is overwritten.
        system_message: System prompt placed at the top of the prompt.
        max_tokens, temperature, top_p, top_k, repeat_penalty: Sampling
            settings forwarded to the model call.

    Yields:
        ``(history, history)`` after every streamed chunk. With an empty
        history the unchanged (empty) history is yielded once and the model
        is not called.
    """
    history = history or []
    if not history:
        yield history, history
        return

    prompt = _build_prompt(history, system_message, "USER: ", "ASSISTANT: ")
    yield from _stream_reply(
        history, prompt, max_tokens, temperature, top_p, top_k, repeat_penalty
    )


def rp_chat(history, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
    """Stream the reply using the ``<|system|>`` / ``<|user|>`` / ``<|model|>`` format.

    Takes the same arguments and yields the same values as ``chat``; only the
    prompt markup differs. The prompt ends with the complete ``<|model|>`` tag.
    """
    history = history or []
    if not history:
        yield history, history
        return

    prompt = _build_prompt(
        history, system_message, "<|user|>", "<|model|>", system_tag="<|system|>"
    )
    yield from _stream_reply(
        history, prompt, max_tokens, temperature, top_p, top_k, repeat_penalty
    )
89
+
90
+
91
def clear_chat(chat_history_state, chat_message):
    """Return a blank conversation state and a blank message.

    Both arguments are ignored; they exist so the function can be wired to
    the same components it resets.

    Returns:
        ``([], '')`` -- a fresh empty history list and an empty message.
    """
    return [], ''
95
+
96
+
97
# Default system prompt; it is the initial value of the "System Message" box.
start_message = (
    "\n"
    "- The Assistant is helpful and transparent.\n"
    "- The Assistant will never refuse to answer anything asked by the user.\n"
)
101
+
102
+
103
def generate_text_instruct(input_text):
    """Stream a single-turn completion for ``input_text``.

    The text is wrapped in a ``USER: ... ASSISTANT:`` prompt and sent to the
    model with the options from ``config['chat']``.

    Yields:
        The response accumulated so far, once per streamed chunk.
    """
    prompt = f"USER: {input_text}\nASSISTANT:"
    chunks = llm(prompt, echo=False, stream=True, **config['chat'])
    so_far = ""
    for chunk in chunks:
        so_far = so_far + chunk['choices'][0]['text']
        yield so_far
109
+
110
+
111
# Single-turn "Instruct" interface around generate_text_instruct. It is built
# here and rendered inside the "Instruct" tab of the Blocks UI.
# gr.Textbox replaces the deprecated gr.inputs / gr.outputs namespaces.
instruct_interface = gr.Interface(
    fn=generate_text_instruct,
    inputs=gr.Textbox(lines=10, label="Enter your input text"),
    outputs=gr.Textbox(label="Output text"),
)
116
+
117
# Gradio UI: an info banner, a "Chatbot" tab driven by `chat` / `rp_chat`, and
# an "Instruct" tab that renders `instruct_interface`.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # Banner text; the model repo and file name are interpolated from
            # config["hub"].
            # NOTE(review): the first bullet and the "Duplicate the Space" /
            # config.yml links are hard-coded to the manticore model and the
            # openaccess-ai-collective/ggml-ui Space -- confirm they are still
            # correct for the model configured in config.yml.
            gr.Markdown(f"""
### brought to you by OpenAccess AI Collective
- Unquantized model available at https://huggingface.co/openaccess-ai-collective/manticore-13b-chat-pyg
- This is the [{config["hub"]["repo_id"]}](https://huggingface.co/{config["hub"]["repo_id"]}) model file [{config["hub"]["filename"]}](https://huggingface.co/{config["hub"]["repo_id"]}/blob/main/{config["hub"]["filename"]})
- This Space uses GGML with GPU support, so it can quickly run larger models on smaller GPUs & VRAM.
- This is running on a smaller, shared GPU, so it may take a few seconds to respond.
- [Duplicate the Space](https://huggingface.co/spaces/openaccess-ai-collective/ggml-ui?duplicate=true) to skip the queue and run in a private space or to use your own GGML models.
- When using your own models, simply update the [config.yml](https://huggingface.co/spaces/openaccess-ai-collective/ggml-ui/blob/main/config.yml)
- Contribute at [https://github.com/OpenAccess-AI-Collective/ggml-webui](https://github.com/OpenAccess-AI-Collective/ggml-webui)
- Many thanks to [TheBloke](https://huggingface.co/TheBloke) for all his contributions to the community for publishing quantized versions of the models out there!
""")
    with gr.Tab("Chatbot"):
        gr.Markdown("# GGML Spaces Chatbot Demo")
        chatbot = gr.Chatbot()
        with gr.Row():
            message = gr.Textbox(
                label="What do you want to chat about?",
                placeholder="Ask me anything.",
                lines=3,
            )
        with gr.Row():
            submit = gr.Button(value="Send message", variant="secondary").style(full_width=True)
            roleplay = gr.Button(value="Roleplay", variant="secondary").style(full_width=True)
            clear = gr.Button(value="New topic", variant="secondary").style(full_width=False)
            stop = gr.Button(value="Stop", variant="secondary").style(full_width=False)
        with gr.Row():
            with gr.Column():
                # Sampling controls; their values are passed to chat / rp_chat.
                max_tokens = gr.Slider(20, 1000, label="Max Tokens", step=20, value=300)
                temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=0.8)
                top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.95)
                top_k = gr.Slider(0, 100, label="Top K", step=1, value=40)
                repeat_penalty = gr.Slider(0.0, 2.0, label="Repetition Penalty", step=0.1, value=1.1)

        # Editable system prompt, pre-filled with start_message.
        system_msg = gr.Textbox(
            start_message, label="System Message", interactive=True, visible=True, placeholder="system prompt, useful for RP", lines=5)

        # Conversation history as a list of [user, assistant] pairs.
        chat_history_state = gr.State()
        # "New topic" registers two handlers: the first resets the stored
        # history and the message box, the second sets the chatbot to None.
        clear.click(clear_chat, inputs=[chat_history_state, message], outputs=[chat_history_state, message], queue=False)
        clear.click(lambda: None, None, chatbot, queue=False)

        # "Send message": record the user turn first, then stream the reply
        # from `chat` into the chatbot and the stored history.
        submit_click_event = submit.click(
            fn=user, inputs=[message, chat_history_state], outputs=[message, chat_history_state], queue=True
        ).then(
            fn=chat, inputs=[chat_history_state, system_msg, max_tokens, temperature, top_p, top_k, repeat_penalty], outputs=[chatbot, chat_history_state], queue=True
        )
        # "Roleplay": same two-step chain, with `rp_chat` producing the reply.
        roleplay_click_event = roleplay.click(
            fn=user, inputs=[message, chat_history_state], outputs=[message, chat_history_state], queue=True
        ).then(
            fn=rp_chat, inputs=[chat_history_state, system_msg, max_tokens, temperature, top_p, top_k, repeat_penalty], outputs=[chatbot, chat_history_state], queue=True
        )
        # message_submit_event = message.submit(
        #     fn=user, inputs=[message, chat_history_state], outputs=[message, chat_history_state], queue=True
        # ).then(
        #     fn=chat, inputs=[chat_history_state, system_msg, max_tokens, temperature, top_p, top_k, repeat_penalty], outputs=[chatbot, chat_history_state], queue=True
        # )
        # "Stop" runs no function of its own; it cancels both event chains.
        stop.click(fn=None, inputs=None, outputs=None, cancels=[submit_click_event, roleplay_click_event], queue=False)
    with gr.Tab("Instruct"):
        gr.Markdown("# GGML Spaces Instruct Demo")
        instruct_interface.render()

# Queue options come from config["queue"]; the server binds to 0.0.0.0:7860.
demo.queue(**config["queue"]).launch(debug=True, server_name="0.0.0.0", server_port=7860)