fix chatglm; new feature: add_special_tokens
- app.py +1 -1
- config.py +2 -3
- examples.py +3 -3
- js/onload.js +1 -1
- util.py +5 -1
- utils/compress_rate_util.py +9 -0
- utils/speed_util.py +3 -0
- vocab/chatglm_6b/chatglm_6b/tokenization_chatglm.py +2 -1
- vocab/chatglm_6b/test_chatglm.py +2 -1
- vocab/gpt_35_turbo/README.md +8 -0
- vocab/gpt_35_turbo/__init__.py +15 -4
- vocab/gpt_35_turbo/decode_test.py +8 -1
- vocab/gpt_35_turbo/test_tiktoken.py +3 -1
app.py
CHANGED
@@ -59,7 +59,7 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
     gr.Markdown("## Input Text")
     dropdown_examples = gr.Dropdown(
         # ["空格测试", "标点测试", "符号测试", "数字测试"],
-        ["
+        ["space", "punctuation", "symbol", "number"],
         value="Examples",
         type="index",
         show_label=False,
config.py
CHANGED
@@ -1,3 +1,2 @@
-
-
-USE_REMOTE = False
+USE_REMOTE = False
+ADD_SPECIAL_TOKEN = False
examples.py
CHANGED
@@ -2,9 +2,9 @@ examples = {
     "en": [
         ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"],  # chatglm has blank_n,
         # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
-        ["
-        ["
-        ["
+        ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
+        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
+        ["number: (10086 + 98) = 100184", "baichuan", "llama"]
     ]
     ,
     "zh": [
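The dropdown in app.py is declared with type="index", so the callback only ever sees the position of the selected label; renaming the choices from Chinese to English therefore works as long as their order keeps matching the entries in examples.py. A minimal sketch of that wiring (the callback name, the stand-in example list, and the output widget are hypothetical, not part of this commit):

```
import gradio as gr

# Stand-in texts keyed by position; in the real app these come from examples.py["en"].
en_examples = [
    "spaces: 2spaces  8spaces\t1tab",
    "punctuation: ,.:/?+=\",。!?;【】〔〕〖〗",
    "symbol: 🦙❤❥♋☮✊",
    "number: (10086 + 98) = 100184",
]

def fill_example(idx):
    # With type="index" the callback receives the list position, not the label text,
    # so renaming the labels to English does not change the lookup.
    return en_examples[idx]

with gr.Blocks() as demo:
    dropdown_examples = gr.Dropdown(
        ["space", "punctuation", "symbol", "number"],
        type="index", show_label=False,
    )
    user_input = gr.Textbox(label="Input Text")
    dropdown_examples.change(fill_example, dropdown_examples, user_input)

# demo.launch()
```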
js/onload.js
CHANGED
@@ -3,7 +3,7 @@ function() {
     //$("footer a")["href"] = "https://github.com/xu-song/tokenizer-arena/issues"
     //$("footer a").childNodes[0].textContent ="Send Feedback"

-    document.querySelectorAll("footer a")[0].childNodes[0].textContent ="Send Feedback";
+    document.querySelectorAll("footer a")[0].childNodes[0].textContent ="Send Feedback"; // 🤔Reporting Issues
     document.querySelectorAll("footer a")[0].href = "https://github.com/xu-song/tokenizer-arena/issues";

     // download button
util.py
CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 import json
 import socket
 import pandas as pd
+import config
 from vocab import load_tokener
 from utils.zh_util import iter_vocab
 from utils.log_util import logger
@@ -16,7 +17,10 @@ def tokenize(text, tokenizer_type, color_num=5):
     logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
     pos_tokens = []
     tokenizer = load_tokener(tokenizer_type)
-
+    if config.ADD_SPECIAL_TOKEN:
+        encoding = tokenizer.encode(text, add_special_tokens=True)
+    else:
+        encoding = tokenizer.encode(text, add_special_tokens=False)

     table = []

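Both branches of the new if/else differ only in the boolean they pass, so the same behaviour can be written as a single call. A small sketch of that equivalent form (the helper name encode_for_display is hypothetical; it assumes, as util.py does, that the tokenizer accepts the HF-style add_special_tokens keyword):

```
import config
from vocab import load_tokener

def encode_for_display(text: str, tokenizer_type: str):
    # Equivalent to the if/else added above: feed the config flag straight into the keyword.
    tokenizer = load_tokener(tokenizer_type)
    return tokenizer.encode(text, add_special_tokens=config.ADD_SPECIAL_TOKEN)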
utils/compress_rate_util.py
ADDED
@@ -0,0 +1,9 @@
+"""
+
+
+Chinese data
+English data:
+
+
+
+"""
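The new module so far holds only a placeholder docstring mentioning Chinese and English data. Purely as an assumption about where it is headed, one common definition of compression rate is characters per token; the helper name, metric, and sample texts below are illustrative, not from the commit:

```
from transformers import AutoTokenizer  # assumption: HF tokenizers are the target, as elsewhere in the repo

def chars_per_token(tokenizer, text: str) -> float:
    """Rough compression rate: characters of input per produced token (higher = better compression)."""
    ids = tokenizer.encode(text, add_special_tokens=False)
    return len(text) / max(len(ids), 1)

if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained("gpt2")
    print("en:", chars_per_token(tok, "The quick brown fox jumps over the lazy dog."))
    print("zh:", chars_per_token(tok, "今天天气不错,我们去公园散步吧。"))
```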
utils/speed_util.py
ADDED
@@ -0,0 +1,3 @@
+"""
+Tokenization speed
+"""
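This file is likewise only a docstring ("tokenization speed") for now. A minimal sketch of the kind of measurement it might grow into (the function name and benchmark text are assumptions, not part of the commit):

```
import time
import tiktoken

def tokens_per_second(encode, text: str, rounds: int = 1000) -> float:
    """Very rough throughput measurement for an encode callable."""
    start = time.perf_counter()
    total = 0
    for _ in range(rounds):
        total += len(encode(text))
    return total / (time.perf_counter() - start)

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
print(tokens_per_second(enc.encode, "Hello world, 你好世界。" * 20))
```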
vocab/chatglm_6b/chatglm_6b/tokenization_chatglm.py
CHANGED
@@ -195,6 +195,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         padding_side="left",
         **kwargs
     ) -> None:
+        self.sp_tokenizer = SPTokenizer(vocab_file)
         super().__init__(
             do_lower_case=do_lower_case,
             remove_space=remove_space,
@@ -212,7 +213,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         self.mask_token = mask_token
         self.gMASK_token = gmask_token

-
+

         """ Initialisation """

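The fix moves the SPTokenizer construction ahead of super().__init__(), presumably because recent transformers releases call vocab-related hooks (get_vocab / vocab_size) from inside PreTrainedTokenizer.__init__, which fails if sp_tokenizer does not exist yet. A library-free sketch of that ordering issue (class names hypothetical):

```
class Base:
    def __init__(self):
        # The base constructor calls an overridable hook, much like PreTrainedTokenizer
        # touches the vocab during __init__ in recent transformers releases.
        self.size = self.vocab_size()

class Broken(Base):
    def __init__(self, vocab):
        super().__init__()          # AttributeError: self.vocab does not exist yet
        self.vocab = vocab

    def vocab_size(self):
        return len(self.vocab)

class Fixed(Base):
    def __init__(self, vocab):
        self.vocab = vocab          # assigned first, mirroring the hunk above
        super().__init__()

    def vocab_size(self):
        return len(self.vocab)

print(Fixed(["<unk>", "hello"]).size)   # 2
# Broken(["<unk>", "hello"])            # would raise AttributeError
```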
vocab/chatglm_6b/test_chatglm.py
CHANGED
@@ -33,7 +33,7 @@ from transformers import AutoTokenizer

 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 # tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained("
+tokenizer = AutoTokenizer.from_pretrained("chatglm_6b/", trust_remote_code=True)


 def encode_text(text):
@@ -105,6 +105,7 @@ def test_tokens():


 test_tokens()
+encode_text("good job d的 算法")

 # tokenizer.sp_tokenizer.text_tokenizer.convert_token_to_id(x) + tokenizer.sp_tokenizer.num_image_tokens

vocab/gpt_35_turbo/README.md
CHANGED
@@ -24,6 +24,14 @@ special_token
 {"id": 100276, "token": "<|endofprompt|>", "token_decode": "<|endofprompt|>", "token_len": 15, "zh_count": 0, "space_count": 0, "digit_count": 0, "zh_symbol_count": 0}
 ```

+Chinese characters + symbols
+```
+{"id": 39045, "token": ",请", "token_decode": ",请", "token_len": 2, "zh_count": 1, "space_count": 0, "digit_count": 0, "zh_symbol_count": 0}
+```
+
+
+
+
 ## Vocabulary file


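The new README entry can be spot-checked against tiktoken directly; a quick sketch (the expected output is taken from the documented entry above, not re-verified here):

```
import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
print(enc.decode([39045]))   # expected: ",请" per the entry documented above
```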
vocab/gpt_35_turbo/__init__.py
CHANGED
@@ -1,4 +1,6 @@
-
+"""
+,请
+"""

 import tiktoken
 from tiktoken import Encoding
@@ -22,17 +24,19 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
         decode_str = "null"
     return decode_str

+
 def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
     """
     Why is this method missing?
     """
     try:
-        return
+        return self.decode_tokens_bytes(tokens)
     except:
         # Why return None? See zh_util.py
         # 16 unused ids: 100256, 100261-100275
         return [None for token in tokens]

+
 def get_vocab(self, token_type="str"):
     """Returns vocab as a dict
     :param token_type: ["str", "byte"]
@@ -59,10 +63,17 @@ def get_vocab(self, token_type="str"):
     return vocab


+def encode(self, *args, **kwargs):
+    """
+    add_special_token is accepted for compatibility with hf_tokenizer
+    """
+    kwargs.pop("add_special_token", None)
+    return self._encode(*args, **kwargs)
+

 # tiktoken patch
+Encoding._encode = Encoding.encode
+Encoding.encode = encode
 Encoding.decode = decode
 Encoding.convert_ids_to_tokens = convert_ids_to_tokens
 Encoding.get_vocab = get_vocab
-
-
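One detail worth double-checking in this shim: util.py in the same commit calls tokenizer.encode(text, add_special_tokens=...) with a trailing "s", while the patch pops "add_special_token" without it, so the unexpected keyword may still reach tiktoken's real encode. A hedged variant that swallows both spellings (a suggestion, not part of the commit):

```
def encode(self, *args, **kwargs):
    """add_special_token(s) is accepted only for HF-tokenizer compatibility; tiktoken ignores it."""
    kwargs.pop("add_special_token", None)
    kwargs.pop("add_special_tokens", None)   # the spelling util.py actually passes
    return self._encode(*args, **kwargs)
```

With that in place, a call like tokenizer.encode(text, add_special_tokens=False) from util.py degrades to a plain tiktoken encode for gpt_35_turbo.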
vocab/gpt_35_turbo/decode_test.py
CHANGED
@@ -1,6 +1,13 @@

 from vocab.gpt_35_turbo import tokenizer

-print(tokenizer.decode([100256]))

+text = "你好,请告诉我聚乙烯是什么"
+encoding = tokenizer.encode(text)
+
+
+print(tokenizer.decode([6744]))
+print(tokenizer.convert_ids_to_tokens([6744]))
+
+print(tokenizer.decode([100256]))
 print(tokenizer.convert_ids_to_tokens([100256]))
vocab/gpt_35_turbo/test_tiktoken.py
CHANGED
@@ -12,7 +12,9 @@ import tiktoken


 tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
-
+text = "你好,请告诉我聚乙烯是什么"
+# text = "a bcjik今天天气颗粒剂范大将军发卡卡萨"
+encoding = tokenizer.encode(text)
 decoding_bytes = tokenizer.decode_tokens_bytes(encoding)
 print(encoding)
 print(decoding_bytes)