yibolu committed · Commit 308345c · Parent(s): 34d9abf

add missing files, fix readme and add docker file
Files changed:
- .gitattributes +1 -0
- Dockerfile +11 -0
- README.md +12 -1
- demo.py +6 -4
- faster_chat_glm/glm.cpython-38-x86_64-linux-gnu.so +0 -0
- {faster_chat_glm → lyraChatGLM}/__init__.py +3 -0
- {faster_chat_glm → lyraChatGLM}/__init__.py~ +0 -0
- lyraChatGLM/glm.cpython-38-x86_64-linux-gnu.so +3 -0
- lyraChatGLM/libnvinfer_plugin.so +3 -0
- {faster_chat_glm → lyraChatGLM}/model.py +0 -0
- requirements.txt +4 -0
.gitattributes CHANGED

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 models/glm6b-kv-cache-dy-bs8.ftm filter=lfs diff=lfs merge=lfs -text
 models/glm6b-bs8.ftm filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED

@@ -0,0 +1,11 @@
+FROM nvcr.io/nvidia/pytorch:23.02-py3
+
+WORKDIR /workdir
+
+COPY requirements.txt /workdir/
+
+# since installing icetk will install protobuf 3.18.3, and we need protobuf==3.20.3
+RUN pip install -r requirements.txt && \
+    pip install protobuf==3.20.3
+
+
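The comment in the RUN step is the key detail: installing icetk drags in protobuf 3.18.3, so the Dockerfile pins protobuf==3.20.3 in a second pip call that runs after requirements.txt. A minimal sanity check for the pin, run inside the built image (a hypothetical snippet, not part of this commit):

```python
# Hypothetical check that the two-step install left the intended protobuf.
# Run inside a container built from this Dockerfile; not code from the commit.
from google.protobuf import __version__ as pb_version

assert pb_version == "3.20.3", f"unexpected protobuf version: {pb_version}"
print("protobuf", pb_version)
```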
README.md CHANGED

@@ -35,6 +35,17 @@ Among its main features are:
 
 - **Repository:** [https://huggingface.co/THUDM/chatglm-6b]
 
+## Try Demo in 2 fast steps
+
+``` bash
+#step 1
+git clone https://huggingface.co/TMElyralab/lyraChatGLM
+cd lyraChatGLM
+
+#step 2
+docker run --gpus=1 --rm --net=host -v ${PWD}:/workdir yibolu96/lyra-chatglm-env:0.0.1 python3 /workdir/demo.py
+```
+
 ## Uses
 
 ```python

@@ -86,7 +97,7 @@ print(res)
 ## Citation
 ``` bibtex
 @Misc{lyraChatGLM2023,
-  author = {Kangjian Wu, Zhengtao Wang, Bin Wu},
+  author = {Kangjian Wu, Zhengtao Wang, Yibo Lu, Bin Wu},
   title = {lyraChatGLM: Accelerating ChatGLM by 10x+},
   howpublished = {\url{https://huggingface.co/TMElyralab/lyraChatGLM}},
   year = {2023}
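A note on the flags in the demo's docker run line: `--gpus=1` exposes one GPU to the container, `--rm` discards the container on exit, `-v ${PWD}:/workdir` mounts the freshly cloned repository (models included) where demo.py expects it, and `--net=host` reuses the host network. The `yibolu96/lyra-chatglm-env:0.0.1` image is presumably built from the Dockerfile added above.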
demo.py CHANGED

@@ -1,18 +1,20 @@
 # coding=utf-8
 
 from transformers import AutoTokenizer
-from faster_chat_glm import GLM6B, FasterChatGLM
+from lyraChatGLM import GLM6B, FasterChatGLM
+import os
 
+current_workdir = os.path.dirname(__file__)
 
 MAX_OUT_LEN = 100
-chatglm6b_dir = …
+chatglm6b_dir = os.path.join(current_workdir, "models")
 tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)
 input_str = ["为什么我们需要对深度学习模型加速?", ]
 inputs = tokenizer(input_str, return_tensors="pt", padding=True)
 input_ids = inputs.input_ids.to('cuda:0')
 
-plan_path = './models/glm6b-bs8.ftm'
+plan_path = os.path.join(current_workdir, "models/glm6b-bs8.ftm")
 
 # kernel for chat model.
 kernel = GLM6B(plan_path=plan_path,
                batch_size=1,

@@ -24,7 +26,7 @@ kernel = GLM6B(plan_path=plan_path,
                vocab_size=150528,
                max_seq_len=MAX_OUT_LEN)
 
-chat = FasterChatGLM(model_dir=…
+chat = FasterChatGLM(model_dir=chatglm6b_dir, kernel=kernel).half().cuda()
 
 # generate
 sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)
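The excerpt stops at `generate`; the README's second hunk anchors at `print(res)`, which suggests the demo decodes and prints the output next. A sketch of that decoding step, assuming the standard `transformers` tokenizer API:

```python
# Decode generated token ids back to strings (sketch; mirrors the
# `print(res)` context visible in the README hunk header above).
res = tokenizer.batch_decode(sample_output, skip_special_tokens=True)
print(res)
```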
faster_chat_glm/glm.cpython-38-x86_64-linux-gnu.so DELETED

Binary file (188 kB)
{faster_chat_glm → lyraChatGLM}/__init__.py RENAMED

@@ -1,5 +1,8 @@
 import os
+import ctypes
 
+current_workdir = os.path.dirname(__file__)
+ctypes.cdll.LoadLibrary(os.path.join(current_workdir, "libnvinfer_plugin.so"))
 os.environ["TORCH_USE_RTLD_GLOBAL"]="YES"
 
 import torch
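Besides the rename, this change adjusts load order: `libnvinfer_plugin.so` is opened with `ctypes` before `torch` is imported, so the TensorRT plugin's symbols are already in the process, and `TORCH_USE_RTLD_GLOBAL=YES` makes torch load its own libraries with global symbol visibility. A standalone sketch of the same preload pattern (paths hypothetical, not code from this commit):

```python
import ctypes
import os

# Preload a shared library with globally visible symbols so that extension
# modules imported later (torch, the glm .so) can resolve against it.
lib = os.path.join(os.path.dirname(__file__), "libnvinfer_plugin.so")
ctypes.CDLL(lib, mode=ctypes.RTLD_GLOBAL)

# Ask torch to use RTLD_GLOBAL for its own libraries as well.
os.environ["TORCH_USE_RTLD_GLOBAL"] = "YES"

import torch  # noqa: E402 -- deliberately imported after the preload
```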
{faster_chat_glm → lyraChatGLM}/__init__.py~ RENAMED

File without changes
lyraChatGLM/glm.cpython-38-x86_64-linux-gnu.so ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:feaeb19a7b780cdb669066bb096726d23f0c3ed401fe2f71adf12c66960c0d07
+size 188432
lyraChatGLM/libnvinfer_plugin.so ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a87eb31795009c545422ef978f607d97be5454c68f09cb829352c0529d1ba8b
+size 235256088
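Both new `.so` files are stored as Git LFS pointers (hence the `*.so` rule added to `.gitattributes`); a clone without LFS installed leaves the three-line pointer text in place of the ~235 MB plugin. A hypothetical guard against that failure mode:

```python
import os

# Hypothetical guard: an unfetched LFS file is a ~130-byte text pointer,
# while the real plugin is 235256088 bytes (see the `size` line above).
path = "lyraChatGLM/libnvinfer_plugin.so"
if os.path.getsize(path) < 1024:
    raise RuntimeError(f"{path} looks like an LFS pointer; run `git lfs pull`")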
{faster_chat_glm → lyraChatGLM}/model.py RENAMED

File without changes
requirements.txt ADDED

@@ -0,0 +1,4 @@
+icetk
+torch
+transformers
+