update
Browse files- app.py +32 -91
- style.css → css/style.css +0 -0
- evaluation.md +5 -0
- examples.py +22 -0
- images/README.md +5 -0
- images/download_button.html +1 -0
- js/onload.js +12 -0
- util.py +67 -27
- utils/_vocab.zh.jsonl +1189 -0
- utils/log_util.py +1 -1
- utils/zh_util.py +4 -2
- vocab/README.md +3 -1
- vocab/__init__.py +13 -8
- vocab/{baichuan_7b → baichuan}/Baichuan-7B/config.json +0 -0
- vocab/{baichuan_7b → baichuan}/Baichuan-7B/configuration_baichuan.py +0 -0
- vocab/{baichuan_7b → baichuan}/Baichuan-7B/special_tokens_map.json +0 -0
- vocab/{baichuan_7b → baichuan}/Baichuan-7B/tokenization_baichuan.py +0 -0
- vocab/{baichuan_7b → baichuan}/Baichuan-7B/tokenizer.model +0 -0
- vocab/{baichuan_7b → baichuan}/Baichuan-7B/tokenizer_config.json +0 -0
- vocab/{baichuan_7b → baichuan}/__init__.py +0 -0
- vocab/{baichuan_7b → baichuan}/demo.py +0 -0
- vocab/baichuan2/__init__.py +10 -0
- vocab/bloom/test_tokenizer.py +2 -0
- vocab/chinese_llama2/__init__.py +7 -0
- vocab/falcon_180b/__init__.py +11 -0
- vocab/falcon_180b/tokenizer/special_tokens_map.json +16 -0
- vocab/falcon_180b/tokenizer/tokenizer.json +0 -0
- vocab/falcon_180b/tokenizer/tokenizer_config.json +8 -0
- vocab/gpt_35_turbo/__init__.py +14 -5
- vocab/gpt_35_turbo/aaa.py +5 -0
- vocab/gpt_4/__init__.py +1 -46
- vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json +2 -0
- vocab/gpt_neox_chinese_v1/mock.py +24 -9
- vocab/gpt_neox_chinese_v1/trouble-shooting.md +22 -0
- vocab/llama/__init__.py +13 -0
- vocab/llama/demo.py +33 -0
app.py
CHANGED
@@ -4,21 +4,19 @@
|
|
4 |
|
5 |
"""
|
6 |
## TODO:
|
7 |
-
- http get方式获取参数,(高优先级)
|
8 |
- i18 国际化 https://blog.csdn.net/qq_26212731/article/details/78457198 request.header中也有language
|
9 |
- iter_vocab 的 warmup
|
10 |
-
-
|
11 |
-
-
|
12 |
-
-
|
|
|
|
|
|
|
|
|
13 |
- 通过 javascript 添加 hover_text
|
14 |
-
- 给方法 + 缓存,避免重复调用
|
15 |
- 英文 utf-8编码
|
16 |
-
-
|
17 |
-
- 中文字词统计,是否要包括 _ G 等字符
|
18 |
- baichuan的单字数量怎么两万多个?
|
19 |
-
- OOV
|
20 |
-
- feedback位置
|
21 |
-
- gpt4, gpt3.5 的overlap tokens 有问题。
|
22 |
- qwen: ValueError: Unclosed image token
|
23 |
|
24 |
plots
|
@@ -39,57 +37,16 @@ table
|
|
39 |
import gradio as gr
|
40 |
from vocab import all_tokenizers
|
41 |
from util import *
|
|
|
42 |
|
43 |
-
# llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
|
44 |
-
examples_zh = [
|
45 |
-
["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
|
46 |
-
["标点测试:,。!?;", "baichuan_7b", "llama"],
|
47 |
-
["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
|
48 |
-
["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
|
49 |
-
["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
|
50 |
-
]
|
51 |
|
52 |
-
examples = [
|
53 |
-
["spaces: 2spaces 8spaces", "llama", "chatglm_6b"], # chatglm 有blank_n,
|
54 |
-
["punctuations: ,./?\",。!?;", "baichuan_7b", "llama"],
|
55 |
-
["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
|
56 |
-
["digits: (10086 + 98) = 100184", "baichuan_7b", "llama"],
|
57 |
-
]
|
58 |
|
59 |
-
|
60 |
-
# jieba.enable_parallel() # flask中没办法parallel
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
def example_fn(example_idx):
|
66 |
-
return examples[example_idx]
|
67 |
-
|
68 |
-
|
69 |
-
"""Replace this text in the input field to see how tokenization works
|
70 |
-
|
71 |
-
|
72 |
-
"""
|
73 |
-
|
74 |
-
default_user_input = """Replace this text in the input field to see how tokenization works
|
75 |
-
华为发布Mate60手机
|
76 |
-
ラグビーワールドカップ2023フランス"""
|
77 |
-
default_tokenizer_type_1 = "llama"
|
78 |
-
default_tokenizer_type_2 = "internlm_chat_7b"
|
79 |
-
default_stats_vocab_size_1, default_stats_zh_token_size_1 = basic_count(default_tokenizer_type_1)
|
80 |
-
default_stats_vocab_size_2, default_stats_zh_token_size_2 = basic_count(default_tokenizer_type_2)
|
81 |
-
default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type_1, default_tokenizer_type_2)[0]
|
82 |
-
default_output_text_1, default_output_table_1, default_output_len_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
|
83 |
-
default_output_text_2, default_output_table_2, default_output_len_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
|
84 |
-
|
85 |
-
with gr.Blocks(css="style.css") as demo:
|
86 |
gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
|
87 |
# links: https://www.coderstool.com/utf8-encoding-decoding
|
88 |
# 功能:输入文本,进行分词
|
89 |
# 分词器:常见的分词器有集中,
|
90 |
# 背景:方便分词、看词粒度、对比
|
91 |
-
#
|
92 |
-
# Byte: 表示分词
|
93 |
|
94 |
with gr.Row():
|
95 |
gr.Markdown("## Input Text")
|
@@ -103,26 +60,18 @@ with gr.Blocks(css="style.css") as demo:
|
|
103 |
scale=0,
|
104 |
elem_classes="example-style"
|
105 |
)
|
106 |
-
|
107 |
user_input = gr.Textbox(
|
108 |
-
value=default_user_input,
|
109 |
label="Input Text",
|
110 |
lines=5,
|
111 |
show_label=False,
|
112 |
-
)
|
113 |
-
# gr.Examples(
|
114 |
-
# examples,
|
115 |
-
# None,
|
116 |
-
# )
|
117 |
-
|
118 |
gr.Markdown("## Tokenization")
|
119 |
-
|
120 |
with gr.Row():
|
121 |
with gr.Column(scale=6):
|
122 |
with gr.Group():
|
123 |
tokenizer_type_1 = gr.Dropdown(
|
124 |
all_tokenizers,
|
125 |
-
value=default_tokenizer_type_1,
|
126 |
label="Tokenizer 1",
|
127 |
)
|
128 |
with gr.Group():
|
@@ -131,19 +80,17 @@ with gr.Blocks(css="style.css") as demo:
|
|
131 |
"""
|
132 |
with gr.Row():
|
133 |
stats_vocab_size_1 = gr.TextArea(
|
134 |
-
value=default_stats_vocab_size_1,
|
135 |
label="VocabSize",
|
136 |
lines=1,
|
137 |
elem_classes="statistics"
|
138 |
)
|
139 |
stats_zh_token_size_1 = gr.TextArea(
|
140 |
-
value=default_stats_zh_token_size_1,
|
141 |
label="ZH char/word",
|
142 |
lines=1,
|
143 |
elem_classes="statistics"
|
144 |
)
|
145 |
stats_overlap_token_size_1 = gr.TextArea(
|
146 |
-
value=default_stats_overlap_token_size,
|
147 |
label="Overlap Tokens",
|
148 |
lines=1,
|
149 |
elem_classes="statistics"
|
@@ -161,19 +108,16 @@ with gr.Blocks(css="style.css") as demo:
|
|
161 |
with gr.Group():
|
162 |
tokenizer_type_2 = gr.Dropdown(
|
163 |
all_tokenizers,
|
164 |
-
value=default_tokenizer_type_2,
|
165 |
label="Tokenizer 2",
|
166 |
)
|
167 |
with gr.Group():
|
168 |
with gr.Row():
|
169 |
stats_vocab_size_2 = gr.TextArea(
|
170 |
-
value=default_stats_vocab_size_2,
|
171 |
label="VocabSize",
|
172 |
lines=1,
|
173 |
elem_classes="statistics"
|
174 |
)
|
175 |
stats_zh_token_size_2 = gr.TextArea(
|
176 |
-
value=default_stats_zh_token_size_2,
|
177 |
label="ZH char/word", # 中文字/词
|
178 |
lines=1,
|
179 |
elem_classes="statistics"
|
@@ -184,7 +128,6 @@ with gr.Blocks(css="style.css") as demo:
|
|
184 |
# elem_classes="statistics"
|
185 |
# )
|
186 |
stats_overlap_token_size_2 = gr.TextArea(
|
187 |
-
value=default_stats_overlap_token_size,
|
188 |
label="Overlap Tokens",
|
189 |
lines=1,
|
190 |
elem_classes="statistics"
|
@@ -194,42 +137,28 @@ with gr.Blocks(css="style.css") as demo:
|
|
194 |
with gr.Row():
|
195 |
with gr.Column():
|
196 |
output_text_1 = gr.Highlightedtext(
|
197 |
-
value=default_output_text_1,
|
198 |
-
label=f"Tokens: {default_output_len_1}",
|
199 |
show_legend=True,
|
200 |
elem_classes="space-show"
|
201 |
)
|
202 |
with gr.Column():
|
203 |
output_text_2 = gr.Highlightedtext(
|
204 |
-
value=default_output_text_2,
|
205 |
-
label=f"Tokens: {default_output_len_2}",
|
206 |
show_legend=True,
|
207 |
elem_classes="space-show"
|
208 |
)
|
209 |
|
210 |
with gr.Row():
|
211 |
-
output_table_1 = gr.Dataframe(
|
212 |
-
|
213 |
-
headers=["TokenID", "Byte", "Text"],
|
214 |
-
datatype=["str", "str", "str"],
|
215 |
-
# elem_classes="space-show", # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
|
216 |
-
)
|
217 |
-
output_table_2 = gr.Dataframe(
|
218 |
-
value=default_output_table_2,
|
219 |
-
headers=["TokenID", "Token", "Text"],
|
220 |
-
datatype=["str", "str", "str"],
|
221 |
-
)
|
222 |
|
223 |
tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
|
224 |
[output_text_1, output_table_1])
|
225 |
-
# 下面两个好像可以合并
|
226 |
tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
|
227 |
tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
|
228 |
[stats_overlap_token_size_1, stats_overlap_token_size_2])
|
229 |
|
230 |
user_input.change(tokenize_pair,
|
231 |
[user_input, tokenizer_type_1, tokenizer_type_2],
|
232 |
-
[output_text_1, output_table_1, output_text_2, output_table_2])
|
233 |
|
234 |
tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2],
|
235 |
[output_text_2, output_table_2])
|
@@ -243,9 +172,21 @@ with gr.Blocks(css="style.css") as demo:
|
|
243 |
[user_input, tokenizer_type_1, tokenizer_type_2]
|
244 |
)
|
245 |
|
246 |
-
|
247 |
-
|
|
|
|
|
|
|
|
|
|
|
248 |
|
249 |
if __name__ == "__main__":
|
250 |
-
|
251 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
"""
|
6 |
## TODO:
|
|
|
7 |
- i18 国际化 https://blog.csdn.net/qq_26212731/article/details/78457198 request.header中也有language
|
8 |
- iter_vocab 的 warmup
|
9 |
+
- 开关
|
10 |
+
- add_special_token 开关
|
11 |
+
- theme 开关 light/dark
|
12 |
+
- token_id/tokens/bytes 开关
|
13 |
+
- 中文字词统计,是否要包括 _ G 等字符
|
14 |
+
- 评测
|
15 |
+
- OOV评测
|
16 |
- 通过 javascript 添加 hover_text
|
|
|
17 |
- 英文 utf-8编码
|
18 |
+
- 词典支持下载,借用image下载的标签,
|
|
|
19 |
- baichuan的单字数量怎么两万多个?
|
|
|
|
|
|
|
20 |
- qwen: ValueError: Unclosed image token
|
21 |
|
22 |
plots
|
|
|
37 |
import gradio as gr
|
38 |
from vocab import all_tokenizers
|
39 |
from util import *
|
40 |
+
from examples import example_fn
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
+
with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
|
46 |
# links: https://www.coderstool.com/utf8-encoding-decoding
|
47 |
# 功能:输入文本,进行分词
|
48 |
# 分词器:常见的分词器有集中,
|
49 |
# 背景:方便分词、看词粒度、对比
|
|
|
|
|
50 |
|
51 |
with gr.Row():
|
52 |
gr.Markdown("## Input Text")
|
|
|
60 |
scale=0,
|
61 |
elem_classes="example-style"
|
62 |
)
|
|
|
63 |
user_input = gr.Textbox(
|
64 |
+
# value=default_user_input,
|
65 |
label="Input Text",
|
66 |
lines=5,
|
67 |
show_label=False,
|
68 |
+
)
|
|
|
|
|
|
|
|
|
|
|
69 |
gr.Markdown("## Tokenization")
|
|
|
70 |
with gr.Row():
|
71 |
with gr.Column(scale=6):
|
72 |
with gr.Group():
|
73 |
tokenizer_type_1 = gr.Dropdown(
|
74 |
all_tokenizers,
|
|
|
75 |
label="Tokenizer 1",
|
76 |
)
|
77 |
with gr.Group():
|
|
|
80 |
"""
|
81 |
with gr.Row():
|
82 |
stats_vocab_size_1 = gr.TextArea(
|
|
|
83 |
label="VocabSize",
|
84 |
lines=1,
|
85 |
elem_classes="statistics"
|
86 |
)
|
87 |
stats_zh_token_size_1 = gr.TextArea(
|
|
|
88 |
label="ZH char/word",
|
89 |
lines=1,
|
90 |
elem_classes="statistics"
|
91 |
)
|
92 |
stats_overlap_token_size_1 = gr.TextArea(
|
93 |
+
# value=default_stats_overlap_token_size,
|
94 |
label="Overlap Tokens",
|
95 |
lines=1,
|
96 |
elem_classes="statistics"
|
|
|
108 |
with gr.Group():
|
109 |
tokenizer_type_2 = gr.Dropdown(
|
110 |
all_tokenizers,
|
|
|
111 |
label="Tokenizer 2",
|
112 |
)
|
113 |
with gr.Group():
|
114 |
with gr.Row():
|
115 |
stats_vocab_size_2 = gr.TextArea(
|
|
|
116 |
label="VocabSize",
|
117 |
lines=1,
|
118 |
elem_classes="statistics"
|
119 |
)
|
120 |
stats_zh_token_size_2 = gr.TextArea(
|
|
|
121 |
label="ZH char/word", # 中文字/词
|
122 |
lines=1,
|
123 |
elem_classes="statistics"
|
|
|
128 |
# elem_classes="statistics"
|
129 |
# )
|
130 |
stats_overlap_token_size_2 = gr.TextArea(
|
|
|
131 |
label="Overlap Tokens",
|
132 |
lines=1,
|
133 |
elem_classes="statistics"
|
|
|
137 |
with gr.Row():
|
138 |
with gr.Column():
|
139 |
output_text_1 = gr.Highlightedtext(
|
|
|
|
|
140 |
show_legend=True,
|
141 |
elem_classes="space-show"
|
142 |
)
|
143 |
with gr.Column():
|
144 |
output_text_2 = gr.Highlightedtext(
|
|
|
|
|
145 |
show_legend=True,
|
146 |
elem_classes="space-show"
|
147 |
)
|
148 |
|
149 |
with gr.Row():
|
150 |
+
output_table_1 = gr.Dataframe()
|
151 |
+
output_table_2 = gr.Dataframe()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
|
153 |
tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
|
154 |
[output_text_1, output_table_1])
|
|
|
155 |
tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
|
156 |
tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
|
157 |
[stats_overlap_token_size_1, stats_overlap_token_size_2])
|
158 |
|
159 |
user_input.change(tokenize_pair,
|
160 |
[user_input, tokenizer_type_1, tokenizer_type_2],
|
161 |
+
[output_text_1, output_table_1, output_text_2, output_table_2]) # , pass_request=1
|
162 |
|
163 |
tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2],
|
164 |
[output_text_2, output_table_2])
|
|
|
172 |
[user_input, tokenizer_type_1, tokenizer_type_2]
|
173 |
)
|
174 |
|
175 |
+
demo.load(_js=open("js/onload.js", "r", encoding="utf-8").read())
|
176 |
+
demo.load(
|
177 |
+
fn=on_load,
|
178 |
+
inputs=None,
|
179 |
+
outputs=[user_input, tokenizer_type_1, tokenizer_type_2],
|
180 |
+
)
|
181 |
+
|
182 |
|
183 |
if __name__ == "__main__":
    # Handy local URLs showing how tokenizers and text can be preselected via
    # query parameters; printed once at start-up for copy/paste.
    sample_urls = (
        "http://127.0.0.1:7860/?tokenizer1=llama&tokenizer2=chinese_llama2&text=fdsjlk",  # llama chinese_llama2
        "http://127.0.0.1:7860/?tokenizer1=chinese_llama&tokenizer2=chinese_llama2&text=fdsjlk",  # llama chinese_llama2
        "http://127.0.0.1:7860/?tokenizer1=baichuan&tokenizer2=baichuan2&text=sss",  # baichuan 1 VS 2
        "http://127.0.0.1:7860/?tokenizer1=bert&tokenizer2=clue&text=sss",  # bert VS clue
        "http://127.0.0.1:7860/?tokenizer1=clue&tokenizer2=kplug&text=sss",  # clue VS kplug
        "http://127.0.0.1:7860/?tokenizer1=baichuan&tokenizer2=baichuan2&text=sss",  #
    )
    for sample_url in sample_urls:
        print(sample_url)
    # demo.queue(max_size=20).launch()
    demo.launch()
|
style.css → css/style.css
RENAMED
File without changes
|
evaluation.md
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
## coverage
|
4 |
+
|
5 |
+
rare characters falling back to utf-8 bytes
|
examples.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Example inputs for the demo, grouped by language code.
# Each row is [input text, tokenizer 1, tokenizer 2].
examples = {
    "en": [
        ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm_6b"],  # chatglm has blank_n
        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
        ["punctuations: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
        ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
        ["digits: (10086 + 98) = 100184", "baichuan", "llama"],
    ],
    # The "zh" rows use the same tokenizer name ("baichuan") as the "en" rows;
    # they previously still referred to the old "baichuan_7b" name.
    "zh": [
        ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n
        ["标点测试:,。!?;", "baichuan", "llama"],
        ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
        ["数字测试:(10086 + 98) = 100184", "baichuan", "llama"],
        ["中文简体:宽带,繁体:樂來", "baichuan", "llama"],
    ],
}


def example_fn(example_idx):
    """
    Return the English example row at position ``example_idx``.

    :param example_idx: index into ``examples["en"]``.
    :return: the row ``[input text, tokenizer 1, tokenizer 2]``.
    :raises IndexError: if ``example_idx`` is out of range.
    """
    return examples["en"][example_idx]
|
images/README.md
CHANGED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
## info
|
3 |
+
|
4 |
+
https://huggingface.co/bert-base-uncased
|
5 |
+
|
images/download_button.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div class="icon-buttons svelte-1btp92j"><a href="data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4NCjwhLS0gU3ZnIFZlY3RvciBJY29ucyA6IGh0dHA6Ly93d3cub25saW5ld2ViZm9udHMuY29tL2ljb24gLS0+DQo8IURPQ1RZUEUgc3ZnIFBVQkxJQyAiLS8vVzNDLy9EVEQgU1ZHIDEuMS8vRU4iICJodHRwOi8vd3d3LnczLm9yZy9HcmFwaGljcy9TVkcvMS4xL0RURC9zdmcxMS5kdGQiPg0KPHN2ZyB2ZXJzaW9uPSIxLjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgeG1sbnM6eGxpbms9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkveGxpbmsiIHg9IjBweCIgeT0iMHB4IiB2aWV3Qm94PSIwIDAgMjU2IDI1NiIgZW5hYmxlLWJhY2tncm91bmQ9Im5ldyAwIDAgMjU2IDI1NiIgeG1sOnNwYWNlPSJwcmVzZXJ2ZSI+DQo8bWV0YWRhdGE+IFN2ZyBWZWN0b3IgSWNvbnMgOiBodHRwOi8vd3d3Lm9ubGluZXdlYmZvbnRzLmNvbS9pY29uIDwvbWV0YWRhdGE+DQo8Zz48Zz48cGF0aCBmaWxsPSIjMDAwMDAwIiBkPSJNMTAwLDIyNC4zYy0wLjIsMS41LTEuMywxLjktMi41LDIuMWMtNS4yLDAuOS05LjQsMy45LTEzLjksNi41Yy0yLjgsMS42LTUuOSwyLjgtOC45LDQuMmMtMC40LDAuMi0xLjEsMC4zLTEuMywwLjFjLTEuNS0yLTMuMi0wLjQtNC41LDAuMWMtMi42LDEtNC45LDIuNy03LjMsNGMtMS4zLDAuNy0yLjYsMS41LTQsMi4xYy0xLjUsMC42LTMuMSwwLjgtNC4zLDIuMmMtMC40LDAuNC0xLjMsMC41LTEuOSwwLjVjLTAuNSwwLTAuOS0wLjUtMS40LTAuN2MwLjItMC40LDAuNC0wLjgsMC44LTFjMS4zLTAuOCwyLjctMS41LDQtMi4yYzItMS4xLDQtMi4zLDYuMS0zLjRjMC4zLTAuMiwwLjYtMC40LDAuOS0wLjVjNC45LTEuMiw5LTQsMTMuMy02LjNjMS42LTAuOSwzLTIuMSw0LjYtMy4yYy0xLjMtMS43LTIuNi0xLjItMy42LTAuNmMtNCwyLTcuOSw0LjEtMTEuOCw2LjJjLTAuNiwwLjMtMS4xLDAuOS0xLjgsMS4xYy0wLjYsMC4yLTEuNiwwLjEtMS44LTAuMmMtMC4zLTAuNS0wLjMtMS41LDAtMS45YzEuOS0yLjUsMy42LTUuMiw3LjItNS42YzEuMS0wLjEsMi4zLTAuNCwxLjQtMmMtMC40LTAuNy0wLjYtMS4zLDAuMS0xLjhjMC44LTAuNiwxLjctMS40LDIuOC0wLjdjMS4xLDAuOCwyLjQsMS4xLDMuNiwwLjVjMS4yLTAuNSwyLjQtMS4xLDEuNS0yLjhjLTAuNi0xLjEsMC4yLTEuNywxLjEtMi4yYzEuOS0wLjksMy44LTEuOSw1LjctMi43YzAuNi0wLjMsMS43LTAuNiwyLTAuM2MxLjksMi4xLDMuMSwwLjIsNC40LTAuN2MwLjUtMC40LDEuMi0wLjYsMS44LTAuOGMxLjIsMi0xLjMsMi0xLjUsMy40YzAuNSwwLjEsMSwwLjQsMS40LDAuM2MyLjctMS4xLDUuMy0yLjQsNy45LTMuNmMwLjItMC4xLDAuMi0wLjcsMC40LTEuMWMtMC4zLTAuMS0wLjctMC4zLTAuOS0wLjJjLTAuOSwwLjQtMS44LDAuOS0zLjEsMWMwLjctMC44LDEuNC0xLjcsMi4zLTIuM2MzLjEtMS45LDYu
My0zLjYsOS41LTUuM2MwLjYtMC4zLDEuNy0wLjYsMi4yLTAuM2MxLjMsMC44LDIuNSwwLjUsMy43LDBjMi40LTEsNC43LTIsNy0zLjFjMC40LTAuMiwwLjUtMC43LDAuOS0xLjJjLTIuMi0xLjUtMy45LDAuMi02LjEsMC43YzAuNC0wLjksMC40LTEuNywwLjgtMS45YzEuNS0wLjksMy0xLjgsNC42LTIuNWMxLjMtMC42LDIuNy0wLjYsMy43LTIuM2MtMC44LTAuMi0xLjQtMC41LTEuOS0wLjRjLTQuNCwwLjMtOC42LDAuOS0xMS42LDQuN2MtMS42LDEuOS00LjEsMi02LjQsMi4yYy0xLjMsMC4xLTEuNy0wLjgtMS40LTEuOWMwLjMtMSwwLjYtMi4yLDEuNC0yLjdjMi4zLTEuNCw0LjQtMy42LDcuNi0yLjdjMC42LDAuMiwxLjQsMC4xLDItMC4xYzMuMi0xLjEsNi40LTIuMyw5LjUtMy42YzEuMS0wLjQsMi43LTAuOSwxLTIuNmMtMC4xLTAuMSwwLjUtMS4yLDEtMS41YzMuOC0yLjYsNy42LTUuMSwxMS40LTcuNmM3LjItNC44LDE1LTguNSwyMi44LTEyLjRjNC44LTIuNCw4LjgtNS44LDEyLjktOS4xYzAuOS0wLjcsMS43LTEuNywyLjQtMi42YzEuNy0yLjMsMS40LTQuMS0xLjItNS40Yy0xLjYtMC44LTMuNS0xLjQtNS4zLTEuN2MtNi40LTEtMTIuOC0xLjctMTkuMS0yLjhjLTUuMS0wLjktOS4zLTMuNy0xMS42LTguM2MtMS4zLTIuNi0xLjgtNi0xLjMtOC44YzEuNC03LjcsNC42LTE0LjcsMTAuMi0yMC40YzUuNi01LjgsMTIuMS05LjksMTkuNy0xMi40YzQuNi0xLjUsOS41LTEsMTQuMiwwLjFjMywwLjcsNS45LDAuNyw4LjktMC42YzIuOS0xLjMsNS44LTIuNSw4LjMtNC41YzAuMi0wLjEsMC40LTAuMywwLjYtMC40YzEtMC4zLDEuOCwxLjcsMi45LDBjMC42LTAuOCwxLjQtMS41LDIuMS0yLjJjMCwwLDAuMiwwLDAuNiwwYzAsMC43LDAuMSwxLjUsMC4xLDIuMmMwLDEuNS0wLjcsMi45LDAuNSw0LjNjMC40LDAuNSwwLDEuNiwwLDIuNGMwLDMuMSwwLjIsNi4zLDAuMSw5LjRjLTAuMiwzLjQtMC43LDYuOC0yLjcsOS43Yy0yLjQsMy41LTUuMyw2LjUtOC44LDguOWMtMSwwLjYtMS44LDAuNy0yLjksMC40Yy0yLjYtMC43LTQuOS0yLTYuOC00LjFjLTIuOS0zLjItNi40LTUuOC0xMC4zLTcuOWMtMi45LTEuNS01LjUtMS4zLTguMiwwLjZjLTMuMywyLjQtNi42LDQuOC04LjcsOC41Yy0yLjEsMy44LTIuNiw3LjgtMC45LDExLjdjMC45LDIsMywzLDUuMSwzLjRjNC40LDAuOCw4LjgsMS41LDEzLjIsMmMzLjgsMC40LDcuOCwwLjIsMTEuNSwxYzUuMiwxLjEsMTAsMy4yLDEyLjYsOC40YzEsMi4xLDIuMyw0LjEsMy4xLDYuM2MwLjksMi43LDEsNS40LTAuNSw4LjFjLTIuMyw0LTUuMiw3LjQtOC45LDEwLjNjLTYsNC42LTEyLjYsOC0xOS4xLDExLjhjLTIuNCwxLjQtNC4zLDMuMS01LjksNS4zYy0xLDEuNC0yLjMsMi43LTMuOCwzLjZjLTcuMyw0LjMtMTQuNiw4LjMtMjIsMTIuNWMtMi40LDEuMy01LDEuMi03LjUsMS4zYy0wLjQsMC0wLjktMC40LTEtMC43czAuMy0wLjksMC43LTEuMWMyLTEsMy45LTIsNi0yLjljMS4xLTAuNSwyLjItMC45LDIuMy0yLjhj
LTEuMywwLjMtMi40LDAuNS0zLjQsMC45Yy00LjIsMS43LTguNCwzLjUtMTIuNyw1LjFjLTIuOCwxLTUuOCwxLjYtOC40LDMuNGMtMC41LDAuMy0xLjQsMC4yLTIsMC4xYy0xLjktMC41LTMuNCwwLTUsMC45Yy0yLjksMS41LTUuOCwzLTguOCw0LjRjLTQuNiwyLjItOS40LDQuMi0xMy43LDcuMWMtMS4yLDAuOC0yLjIsMS43LTMuMywyLjZjLTAuMiwwLjEtMC4yLDAuNy0wLjEsMC45YzAuMSwwLjIsMC42LDAuNiwwLjgsMC41YzEuNi0wLjUsMy4yLTAuOSw0LjYtMS41YzIuMS0xLDQuMS0yLjMsNi4yLTMuM2MyLTEsNC0yLjMsNi42LTEuNWMxLDAuMywyLjMtMC44LDMuNi0xLjFjMC41LTAuMSwxLjIsMC4yLDEuOCwwLjRjLTAuMiwwLjYtMC4zLDEuNS0wLjcsMS43Yy0xLjYsMC45LTMuMywxLjUtNS4xLDIuMkMxMDEuNywyMjMuOSwxMDAuOCwyMjQuMSwxMDAsMjI0LjNjLTAuMS0wLjEtMC4xLTAuNC0wLjItMC42Yy0wLjEtMC4zLTAuMy0wLjUtMC41LTAuN2MwLDAtMC4zLDAuMi0wLjUsMC4zYzAuMiwwLjIsMC4zLDAuNSwwLjYsMC43Qzk5LjUsMjI0LjIsOTkuOCwyMjQuMiwxMDAsMjI0LjN6Ii8+PHBhdGggZmlsbD0iIzAwMDAwMCIgZD0iTTE1MS45LDU5LjZjNC44LTYuNSw4LjQtMTMuNCwxMy42LTE5LjFjMC4zLDAuMiwwLjYsMC40LDAuOCwwLjVjLTEuOCwzLjUtMy41LDcuMS01LjMsMTAuNmMwLjIsMC4xLDAuNSwwLjMsMC43LDAuNGMwLjgtMS4xLDEuNy0yLjIsMi40LTMuNGMxLjYtMi43LDIuOS01LjYsNC42LTguM2M0LjUtNy4yLDguNy0xNC42LDE1LTIwLjZjMC40LTAuNCwwLjgtMC45LDEuMy0xLjJjMS41LTEuMSwyLjctMS4xLDQuMywwLjFjMC4yLDAuMiwwLjQsMC44LDAuMywxLjFjLTAuNCwwLjktMSwxLjktMS42LDIuN2MtMS43LDIuNS0zLjMsNS01LjEsNy41Yy0xLjksMi42LTIuNyw1LjQtMi43LDguNWMwLDEuOS0wLjgsMy4zLTIuOSw0Yy0wLjEtMC44LTAuMS0xLjQtMC4yLTIuNGMtMi42LDMuOC01LjEsNy4zLTcuNSwxMC45Yy0wLjEsMC4xLTAuNCwwLTEsMGMyLjMtNS40LDYuNi05LjcsNy43LTE1LjZjLTMsMS40LTQuMiw0LjMtNS45LDYuNmMtMS43LDIuMy0yLjksNS00LjIsNy42Yy0xLjQsMi43LTIuOCw1LjQtNCw4LjJjLTEuMSwyLjYtMy4yLDQuOS0yLjYsOC40YzMuMS0xLjIsMy41LTQuOSw2LjUtNi4yYzAsMC42LDAuMSwxLDAsMS4zYy0zLjIsNy4xLTYuMSwxNC40LTEwLjgsMjAuOGMtMi42LDMuNS00LjYsNy42LTYuOCwxMS40Yy0xLDEuNy0yLjIsMy4zLTMuMyw0LjljLTAuNCwwLjYtMC44LDEuMS0xLjEsMS44Yy0xLDIuNC0xLjcsNC45LTMsNy4xYy0xLjMsMi4yLTEuMSw0LjctMS44LDdjLTEuNiw1LjUtMy4zLDEwLjktNSwxNi40Yy0yLjMsNy43LTMuOCwxNS43LTUuMSwyMy42Yy0wLjYsMy45LTAuOCw3LjktMC44LDExLjljMCw0LjEtNC40LDcuMS04LjEsNS41Yy0wLjQtMC4yLTAuOC0wLjUtMS0wLjljLTEuOS0zLjUtNC44LTYuNC03LjctOWMtMi41LTIuMy0yLjgtNC44LTItNy42YzAuNi0yLjIsMC40LTQuMS0w
LjEtNi4xYy0wLjgtMy4yLTEuNy02LjMtMi05LjZjLTAuMi0yLjEtMS42LTMuNS0yLjMtNS4zYy0xLjMtMy4xLTIuNS02LjMtMy42LTkuNGMtMS45LTUuMy0zLjctMTAuNi01LjYtMTUuOGMtMC45LTIuNi0yLjItNS4xLTIuOS03LjhjLTAuOC0zLjMtMi44LTUuNy01LjMtNy43Yy0yLjItMS43LTMuMi0zLjktMi45LTYuNGMwLjMtMi45LTAuMS02LDEuNy04LjdjMS4yLTEuNywxLjctMy44LDIuNi01LjhjMC41LTEuMywxLjMtMi4xLDIuNi0xLjJjMS4xLDAuOCwyLDAuNCwyLjktMC4yYzIuMy0xLjcsNC43LTAuNSw3LjEtMC4zYzIuNiwyLjYsNS44LDQuNSw2LjQsOC43YzAuOSw2LjUsMi4yLDEyLjksMy43LDE5LjJjMS4zLDUuNiwzLDExLjEsNC41LDE2LjZjMC4yLDAuOSwwLjMsMS45LDAuNSwyLjhjMC40LDEuNCwwLjcsMi45LDEuMiw0LjNjMC4yLDAuNiwwLjUsMS4xLDAuNywxLjZjMC40LDEtMC4xLDIuNywxLjUsMi43YzAuNiwwLDEuMi0xLjgsMS42LTIuOGMyLjUtNy4zLDQuOC0xNC42LDcuMy0yMS45YzIuMS02LjEsNC40LTEyLjEsNi40LTE4LjJjMS4yLTMuNSwxLjUtNy4zLDMuMy0xMC43YzEuNi0zLDMuMi01LjksNS04LjhjMC41LTAuOSwxLjQtMi4xLDIuOC0wLjhjMC4xLDAuMSwxLTAuMywxLjMtMC43YzAuNy0wLjksMS4zLTEuOSwyLTIuOUMxNTAuNCw1Ny4zLDE1MC42LDU3LjQsMTUxLjksNTkuNnogTTE4NC4yLDIzLjhjLTEuMiwwLjMtMiwwLjItMi4yLDAuNmMtMS42LDItMy4yLDQtNC43LDZjLTAuMSwwLjIsMCwwLjgsMC4yLDAuOWMwLjQsMC4xLDEsMC4yLDEuMiwwQzE4MC44LDI5LjMsMTgyLjgsMjcuMiwxODQuMiwyMy44eiIvPjxwYXRoIGZpbGw9IiMwMDAwMDAiIGQ9Ik0xNzcsMTMuNmMtMS4zLDEuOC0xLjQsNC45LTQuMyw2LjJjLTAuNSwwLjItMS40LDAuMS0xLjktMC4xYy0wLjMtMC4xLTAuMy0xLTAuMi0xLjVjMC40LTEuOCwxLTMuNiwxLjUtNS40YzAuMS0wLjMsMC4yLTAuNywwLjMtMWMwLjgtMS41LDEuOC0yLDMuMS0xLjZDMTc2LjcsMTAuNSwxNzcuMSwxMS4zLDE3NywxMy42eiIvPjwvZz48L2c+DQo8L3N2Zz4=" download="image" target="_blank"><button aria-label="Download" title="Download" class="svelte-1030q2h"> <div class="svelte-1030q2h"><svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 32 32"><path fill="currentColor" d="M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z"></path></svg></div></button></a> </div>
|
js/onload.js
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
function() {
|
2 |
+
// feedback
|
3 |
+
//$("footer a")["href"] = "https://github.com/xu-song/tokenizer-arena/issues"
|
4 |
+
//$("footer a").childNodes[0].textContent ="Send Feedback"
|
5 |
+
|
6 |
+
document.querySelectorAll("footer a")[0].childNodes[0].textContent ="Send Feedback";
|
7 |
+
document.querySelectorAll("footer a")[0].href = "https://github.com/xu-song/tokenizer-arena/issues";
|
8 |
+
|
9 |
+
// download button
|
10 |
+
|
11 |
+
// API
|
12 |
+
}
|
util.py
CHANGED
@@ -5,13 +5,15 @@ import pandas as pd
|
|
5 |
from vocab import load_tokener
|
6 |
from utils.zh_util import iter_vocab
|
7 |
from utils.log_util import logger
|
|
|
|
|
8 |
|
9 |
|
10 |
-
|
|
|
11 |
"""
|
12 |
-
TODO: cache tokenizer
|
13 |
"""
|
14 |
-
logger.info("
|
15 |
pos_tokens = []
|
16 |
tokenizer = load_tokener(tokenizer_type)
|
17 |
encoding = tokenizer.encode(text)
|
@@ -29,16 +31,16 @@ def tokenize(text, tokenizer_type, color_num=5, update=True):
|
|
29 |
token_str = token.decode("utf-8")
|
30 |
except:
|
31 |
token_str = token.decode("utf-8", errors="ignore")
|
32 |
-
logger.
|
33 |
{"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
|
34 |
ensure_ascii=False))
|
35 |
|
36 |
token_bytes = token
|
37 |
-
json_dumps = json.dumps(token_str)
|
38 |
elif isinstance(token, str):
|
39 |
token_str = token
|
40 |
token_bytes = bytes(token_str, "utf-8")
|
41 |
-
json_dumps = json.dumps(token_str)
|
42 |
else:
|
43 |
return
|
44 |
|
@@ -48,31 +50,23 @@ def tokenize(text, tokenizer_type, color_num=5, update=True):
|
|
48 |
"Token": token_str, # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
|
49 |
"Text": decode_text, #
|
50 |
# "Bytes": token_bytes, # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
|
51 |
-
"Bytes": str(token_bytes),
|
52 |
# "Unicode": json_dumps # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
|
53 |
}
|
54 |
)
|
55 |
|
56 |
table_df = pd.DataFrame(table)
|
57 |
-
logger.info(f"
|
58 |
# print(table_df)
|
59 |
|
60 |
-
|
61 |
-
return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
|
62 |
-
else:
|
63 |
-
return pos_tokens, table_df, len(encoding)
|
64 |
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
if headers and 'x-forwarded-for' in headers:
|
72 |
-
x_forwarded_for = headers['x-forwarded-for']
|
73 |
-
client_ip = x_forwarded_for.split(' ')[0] if x_forwarded_for else ""
|
74 |
-
logger.info(f"[client_ip]: {client_ip}, {tokenizer_type_1}, {tokenizer_type_2}")
|
75 |
-
|
76 |
pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
|
77 |
pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
|
78 |
return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
|
@@ -84,21 +78,67 @@ def basic_count(tokenizer_type):
|
|
84 |
return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
|
85 |
|
86 |
|
|
|
87 |
def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
|
88 |
tokenizer1 = load_tokener(tokenizer_type_1)
|
89 |
tokenizer2 = load_tokener(tokenizer_type_2)
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
overlap_token_size = len(overlap_tokens)
|
94 |
-
logger.info(
|
|
|
95 |
return overlap_token_size, overlap_token_size
|
96 |
|
97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
def test_coding():
|
99 |
bytes1 = b'\xe4\xb8\xad'
|
100 |
print(bytes1) # b'\xe4\xb8\xad'
|
101 |
|
102 |
|
103 |
if __name__ == "__main__":
|
104 |
-
print(
|
|
|
|
5 |
from vocab import load_tokener
|
6 |
from utils.zh_util import iter_vocab
|
7 |
from utils.log_util import logger
|
8 |
+
from functools import lru_cache
|
9 |
+
from urllib.parse import urlparse, parse_qs
|
10 |
|
11 |
|
12 |
+
@lru_cache
|
13 |
+
def tokenize(text, tokenizer_type, color_num=5):
|
14 |
"""
|
|
|
15 |
"""
|
16 |
+
logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
|
17 |
pos_tokens = []
|
18 |
tokenizer = load_tokener(tokenizer_type)
|
19 |
encoding = tokenizer.encode(text)
|
|
|
31 |
token_str = token.decode("utf-8")
|
32 |
except:
|
33 |
token_str = token.decode("utf-8", errors="ignore")
|
34 |
+
logger.error("decode_error: " + json.dumps(
|
35 |
{"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
|
36 |
ensure_ascii=False))
|
37 |
|
38 |
token_bytes = token
|
39 |
+
# json_dumps = json.dumps(token_str)
|
40 |
elif isinstance(token, str):
|
41 |
token_str = token
|
42 |
token_bytes = bytes(token_str, "utf-8")
|
43 |
+
# json_dumps = json.dumps(token_str)
|
44 |
else:
|
45 |
return
|
46 |
|
|
|
50 |
"Token": token_str, # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
|
51 |
"Text": decode_text, #
|
52 |
# "Bytes": token_bytes, # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
|
53 |
+
"UTF8 Bytes": str(token_bytes),
|
54 |
# "Unicode": json_dumps # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
|
55 |
}
|
56 |
)
|
57 |
|
58 |
table_df = pd.DataFrame(table)
|
59 |
+
logger.info(f"Tokens={table[:2]}")
|
60 |
# print(table_df)
|
61 |
|
62 |
+
return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
|
|
|
|
|
|
|
63 |
|
64 |
|
65 |
+
@lru_cache
def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
    """
    Tokenize the same text with two tokenizers (handler for input_text.change).

    :return: the two values produced by ``tokenize`` for the first tokenizer,
        followed by the two values produced for the second tokenizer.
    """
    first_tokens, first_table = tokenize(text, tokenizer_type_1)
    second_tokens, second_table = tokenize(text, tokenizer_type_2)
    return first_tokens, first_table, second_tokens, second_table
|
|
|
78 |
return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
|
79 |
|
80 |
|
81 |
+
@lru_cache
def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
    """
    Count the tokens that appear in the vocabularies of both tokenizers.

    When one vocabulary is keyed by ``str`` and the other by ``bytes``, the
    ``str`` keys are encoded as UTF-8 so that both sides are compared as bytes.

    :param tokenizer_type_1: name of the first tokenizer, passed to ``load_tokener``.
    :param tokenizer_type_2: name of the second tokenizer, passed to ``load_tokener``.
    :return: ``(overlap_size, overlap_size)`` - the same count twice, one value
        per "Overlap Tokens" output field.
    """
    tokenizer1 = load_tokener(tokenizer_type_1)
    tokenizer2 = load_tokener(tokenizer_type_2)

    vocab_keys_1 = tokenizer1.get_vocab().keys()
    vocab_keys_2 = tokenizer2.get_vocab().keys()

    # Sample the first key of each vocabulary to learn its key type.
    # The default of None avoids StopIteration on an empty vocabulary.
    token1 = next(iter(vocab_keys_1), None)
    token2 = next(iter(vocab_keys_2), None)

    vocab_set_1 = set(vocab_keys_1)
    vocab_set_2 = set(vocab_keys_2)
    if token1 is not None and token2 is not None and type(token1) is not type(token2):  # bytes vs str
        if isinstance(token1, str):
            vocab_set_1 = {token.encode("utf-8") for token in vocab_set_1}
        if isinstance(token2, str):
            vocab_set_2 = {token.encode("utf-8") for token in vocab_set_2}

    overlap_tokens = vocab_set_1 & vocab_set_2
    overlap_token_size = len(overlap_tokens)
    logger.info(
        f"{overlap_token_size} OverlapTokens of {tokenizer_type_1} {tokenizer_type_2}: {list(overlap_tokens)[:10]}")
    return overlap_token_size, overlap_token_size
|
102 |
|
103 |
|
104 |
+
# Text used by on_load when the URL supplies no "text" query parameter.
default_user_input = """Replace this text in the input field to see how tokenization works
华为发布Mate60手机
ラグビーワールドカップ2023フランス"""
# Tokenizers used by on_load when the URL supplies no "tokenizer1" /
# "tokenizer2" query parameter.
default_tokenizer_type_1 = "llama"
# default_tokenizer_type_2 = "internlm_chat_7b"
default_tokenizer_type_2 = "gpt_35_turbo"
|
110 |
+
|
111 |
+
|
112 |
+
def on_load(request: gr.Request):
    """
    Build the initial UI values when the page loads.

    Reads ``tokenizer1``, ``tokenizer2`` and ``text`` from the query string of
    the request's ``referer`` header, falling back to the module-level defaults
    for any parameter that is missing.

    :param request: the incoming request; when falsy, the defaults are returned.
    :return: ``(text, tokenizer_type_1, tokenizer_type_2)``.
    """
    # Initialised up front so the log line below never hits an unbound name.
    client_ip = None
    query_params = {}
    if request:
        # NOTE(review): guards against request.client being None - confirm
        # whether that can actually happen in this deployment.
        client = request.client
        client_ip = client.host if client else None
        # An earlier variant (previously left here as commented-out code) took
        # the address from the 'x-forwarded-for' header instead.
        if "referer" in request.headers:
            parsed = parse_qs(urlparse(request.headers["referer"]).query)
            # parse_qs maps each key to a list of values; keep the first one.
            query_params = {k: v[0] for k, v in parsed.items() if len(v) > 0}

    # Always resolve against the defaults so callers never receive None.
    tokenizer_type_1 = query_params.get("tokenizer1", default_tokenizer_type_1)
    tokenizer_type_2 = query_params.get("tokenizer2", default_tokenizer_type_2)
    text = query_params.get("text", default_user_input)
    logger.info(f"client_ip: {client_ip}; params: {query_params}")
    return text, tokenizer_type_1, tokenizer_type_2
|
135 |
+
|
136 |
+
|
137 |
def test_coding():
    """Print a three-byte ``bytes`` literal to show how Python renders it."""
    sample = b'\xe4\xb8\xad'
    print(sample)  # b'\xe4\xb8\xad'
|
140 |
|
141 |
|
142 |
if __name__ == "__main__":
    # Manual smoke check: print the overlap count between two tokenizers.
    print(get_overlap_token_size("gpt_35_turbo", "gpt_4"))
    # print(basic_count("internlm_chat_7b"))
|
utils/_vocab.zh.jsonl
ADDED
@@ -0,0 +1,1189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"id": 529, "token": "’", "type": "中文标点"}
|
2 |
+
{"id": 753, "token": "’s", "type": "中文标点"}
|
3 |
+
{"id": 863, "token": "”", "type": "中文标点"}
|
4 |
+
{"id": 1054, "token": " “", "type": "中文标点"}
|
5 |
+
{"id": 1389, "token": " –", "type": "中文标点"}
|
6 |
+
{"id": 1431, "token": "’t", "type": "中文标点"}
|
7 |
+
{"id": 1811, "token": "。", "type": "中文标点"}
|
8 |
+
{"id": 1981, "token": "…", "type": "中文标点"}
|
9 |
+
{"id": 2001, "token": " —", "type": "中文标点"}
|
10 |
+
{"id": 2029, "token": ".”", "type": "中文标点"}
|
11 |
+
{"id": 2118, "token": "“", "type": "中文标点"}
|
12 |
+
{"id": 2345, "token": "—", "type": "中文标点"}
|
13 |
+
{"id": 2476, "token": ",”", "type": "中文标点"}
|
14 |
+
{"id": 2950, "token": ".”\n\n", "type": "中文标点"}
|
15 |
+
{"id": 3207, "token": "’re", "type": "中文标点"}
|
16 |
+
{"id": 3451, "token": " ‘", "type": "中文标点"}
|
17 |
+
{"id": 3490, "token": "。\n\n", "type": "中文标点"}
|
18 |
+
{"id": 3922, "token": ",", "type": "中文标点"}
|
19 |
+
{"id": 4070, "token": "’ve", "type": "中文标点"}
|
20 |
+
{"id": 4235, "token": "–", "type": "中文标点"}
|
21 |
+
{"id": 4344, "token": "’m", "type": "中文标点"}
|
22 |
+
{"id": 4696, "token": " …", "type": "中文标点"}
|
23 |
+
{"id": 4805, "token": "’ll", "type": "中文标点"}
|
24 |
+
{"id": 5232, "token": ":", "type": "中文标点"}
|
25 |
+
{"id": 5486, "token": "、", "type": "中文标点"}
|
26 |
+
{"id": 5551, "token": "…\n\n", "type": "中文标点"}
|
27 |
+
{"id": 6447, "token": "!", "type": "中文标点"}
|
28 |
+
{"id": 7070, "token": "’d", "type": "中文标点"}
|
29 |
+
{"id": 7663, "token": "”\n\n", "type": "中文标点"}
|
30 |
+
{"id": 7705, "token": ")", "type": "中文标点"}
|
31 |
+
{"id": 8107, "token": "年", "type": "中文单字"}
|
32 |
+
{"id": 8713, "token": "——", "type": "中文标点"}
|
33 |
+
{"id": 9039, "token": "数", "type": "中文单字"}
|
34 |
+
{"id": 9080, "token": "日", "type": "中文单字"}
|
35 |
+
{"id": 9174, "token": "。\n", "type": "中文标点"}
|
36 |
+
{"id": 9520, "token": "”,", "type": "中文标点"}
|
37 |
+
{"id": 9554, "token": "的", "type": "中文单字"}
|
38 |
+
{"id": 9787, "token": " ·", "type": "中文标点"}
|
39 |
+
{"id": 9953, "token": "月", "type": "中文单字"}
|
40 |
+
{"id": 10110, "token": "(", "type": "中文标点"}
|
41 |
+
{"id": 10378, "token": "“I", "type": "中文标点"}
|
42 |
+
{"id": 10416, "token": " […", "type": "中文标点"}
|
43 |
+
{"id": 10646, "token": "」", "type": "中文标点"}
|
44 |
+
{"id": 11144, "token": "【", "type": "中文标点"}
|
45 |
+
{"id": 11199, "token": "】", "type": "中文标点"}
|
46 |
+
{"id": 11453, "token": "”.", "type": "中文标点"}
|
47 |
+
{"id": 11571, "token": "?", "type": "中文标点"}
|
48 |
+
{"id": 11883, "token": "用", "type": "中文单字"}
|
49 |
+
{"id": 12291, "token": " …\n\n", "type": "中文标点"}
|
50 |
+
{"id": 12671, "token": "?”", "type": "中文标点"}
|
51 |
+
{"id": 12996, "token": " […]\n\n", "type": "中文标点"}
|
52 |
+
{"id": 13153, "token": "成", "type": "中文单字"}
|
53 |
+
{"id": 13177, "token": "「", "type": "中文标点"}
|
54 |
+
{"id": 13372, "token": "名", "type": "中文单字"}
|
55 |
+
{"id": 13646, "token": "时", "type": "中文单字"}
|
56 |
+
{"id": 14260, "token": "·", "type": "中文标点"}
|
57 |
+
{"id": 14305, "token": "“The", "type": "中文标点"}
|
58 |
+
{"id": 14336, "token": "‘", "type": "中文标点"}
|
59 |
+
{"id": 14382, "token": "……", "type": "中文标点"}
|
60 |
+
{"id": 14558, "token": "件", "type": "中文单字"}
|
61 |
+
{"id": 14639, "token": ".’", "type": "中文标点"}
|
62 |
+
{"id": 15085, "token": "“We", "type": "中文标点"}
|
63 |
+
{"id": 15120, "token": "一", "type": "中文单字"}
|
64 |
+
{"id": 15179, "token": " „", "type": "中文标点"}
|
65 |
+
{"id": 15225, "token": "请", "type": "中文单字"}
|
66 |
+
{"id": 15397, "token": "”.\n\n", "type": "中文标点"}
|
67 |
+
{"id": 16325, "token": "中", "type": "中文单字"}
|
68 |
+
{"id": 16423, "token": "据", "type": "中文单字"}
|
69 |
+
{"id": 16616, "token": "?”\n\n", "type": "中文标点"}
|
70 |
+
{"id": 16620, "token": "————", "type": "中文标点"}
|
71 |
+
{"id": 16882, "token": "码", "type": "中文单字"}
|
72 |
+
{"id": 16937, "token": "不", "type": "中文单字"}
|
73 |
+
{"id": 17039, "token": "新", "type": "中文单字"}
|
74 |
+
{"id": 17161, "token": "文", "type": "中文单字"}
|
75 |
+
{"id": 17223, "token": "—and", "type": "中文标点"}
|
76 |
+
{"id": 17297, "token": "下", "type": "中文单字"}
|
77 |
+
{"id": 17620, "token": "分", "type": "中文单字"}
|
78 |
+
{"id": 17701, "token": "入", "type": "中文单字"}
|
79 |
+
{"id": 17792, "token": "人", "type": "中文单字"}
|
80 |
+
{"id": 17818, "token": "“It", "type": "中文标点"}
|
81 |
+
{"id": 17860, "token": "功", "type": "中文单字"}
|
82 |
+
{"id": 17905, "token": "上", "type": "中文单字"}
|
83 |
+
{"id": 17982, "token": "户", "type": "中文单字"}
|
84 |
+
{"id": 18171, "token": "!\n\n", "type": "中文标点"}
|
85 |
+
{"id": 18184, "token": "为", "type": "中文单字"}
|
86 |
+
{"id": 18217, "token": " ’", "type": "中文标点"}
|
87 |
+
{"id": 18319, "token": "!”", "type": "中文标点"}
|
88 |
+
{"id": 18363, "token": "间", "type": "中文单字"}
|
89 |
+
{"id": 18476, "token": "号", "type": "中文单字"}
|
90 |
+
{"id": 18655, "token": "取", "type": "中文单字"}
|
91 |
+
{"id": 18904, "token": "回", "type": "中文单字"}
|
92 |
+
{"id": 19000, "token": "在", "type": "中文单字"}
|
93 |
+
{"id": 19047, "token": "页", "type": "中文单字"}
|
94 |
+
{"id": 19066, "token": "。\n\n\n\n", "type": "中文标点"}
|
95 |
+
{"id": 19113, "token": "字", "type": "中文单字"}
|
96 |
+
{"id": 19361, "token": "有", "type": "中文单字"}
|
97 |
+
{"id": 19483, "token": "个", "type": "中文单字"}
|
98 |
+
{"id": 19524, "token": " ”", "type": "中文标点"}
|
99 |
+
{"id": 19653, "token": "成功", "type": "中文多字"}
|
100 |
+
{"id": 19967, "token": "作", "type": "中文单字"}
|
101 |
+
{"id": 20145, "token": "】【", "type": "中文标点"}
|
102 |
+
{"id": 20182, "token": "’,", "type": "中文标点"}
|
103 |
+
{"id": 20379, "token": "示", "type": "中文单字"}
|
104 |
+
{"id": 20600, "token": "用户", "type": "中文多字"}
|
105 |
+
{"id": 20675, "token": "数据", "type": "中文多字"}
|
106 |
+
{"id": 20834, "token": "出", "type": "中文单字"}
|
107 |
+
{"id": 21043, "token": "是", "type": "中文单字"}
|
108 |
+
{"id": 21060, "token": "….", "type": "中文标点"}
|
109 |
+
{"id": 21082, "token": "时间", "type": "中文多字"}
|
110 |
+
{"id": 21388, "token": "失", "type": "中文单字"}
|
111 |
+
{"id": 21405, "token": "表", "type": "中文单字"}
|
112 |
+
{"id": 21418, "token": "除", "type": "中文单字"}
|
113 |
+
{"id": 21601, "token": "加", "type": "中文单字"}
|
114 |
+
{"id": 21809, "token": "败", "type": "中文单字"}
|
115 |
+
{"id": 21909, "token": "~", "type": "中文标点"}
|
116 |
+
{"id": 21990, "token": "生", "type": "中文单字"}
|
117 |
+
{"id": 22023, "token": "信", "type": "中文单字"}
|
118 |
+
{"id": 22117, "token": "’est", "type": "中文标点"}
|
119 |
+
{"id": 22238, "token": "类", "type": "中文单字"}
|
120 |
+
{"id": 22324, "token": "置", "type": "中文单字"}
|
121 |
+
{"id": 22416, "token": "—the", "type": "中文标点"}
|
122 |
+
{"id": 22649, "token": "理", "type": "中文单字"}
|
123 |
+
{"id": 22656, "token": "本", "type": "中文单字"}
|
124 |
+
{"id": 22820, "token": "失败", "type": "中文多字"}
|
125 |
+
{"id": 23018, "token": "息", "type": "中文单字"}
|
126 |
+
{"id": 23039, "token": "行", "type": "中文单字"}
|
127 |
+
{"id": 23187, "token": "定", "type": "中文单字"}
|
128 |
+
{"id": 23189, "token": ",’", "type": "中文标点"}
|
129 |
+
{"id": 23226, "token": "改", "type": "中文单字"}
|
130 |
+
{"id": 23249, "token": " ", "type": "中文标点"}
|
131 |
+
{"id": 23530, "token": "市", "type": "中文单字"}
|
132 |
+
{"id": 23538, "token": "期", "type": "中文单字"}
|
133 |
+
{"id": 23897, "token": "以", "type": "中文单字"}
|
134 |
+
{"id": 23951, "token": "修", "type": "中文单字"}
|
135 |
+
{"id": 23954, "token": ")\n", "type": "中文标点"}
|
136 |
+
{"id": 24186, "token": "元", "type": "中文单字"}
|
137 |
+
{"id": 24273, "token": "方", "type": "中文单字"}
|
138 |
+
{"id": 24535, "token": "’.", "type": "中文标点"}
|
139 |
+
{"id": 24580, "token": "录", "type": "中文单字"}
|
140 |
+
{"id": 24775, "token": "区", "type": "中文单字"}
|
141 |
+
{"id": 24946, "token": "单", "type": "中文单字"}
|
142 |
+
{"id": 25010, "token": "�除", "type": "中文多字"}
|
143 |
+
{"id": 25129, "token": "位", "type": "中文单字"}
|
144 |
+
{"id": 25287, "token": "型", "type": "中文单字"}
|
145 |
+
{"id": 25333, "token": "法", "type": "中文单字"}
|
146 |
+
{"id": 25336, "token": "县", "type": "中文单字"}
|
147 |
+
{"id": 25359, "token": "存", "type": "中文单字"}
|
148 |
+
{"id": 25446, "token": "品", "type": "中文单字"}
|
149 |
+
{"id": 25580, "token": "前", "type": "中文单字"}
|
150 |
+
{"id": 25666, "token": "称", "type": "中文单字"}
|
151 |
+
{"id": 25758, "token": "!”\n\n", "type": "中文标点"}
|
152 |
+
{"id": 26016, "token": ";", "type": "中文标点"}
|
153 |
+
{"id": 26062, "token": "�回", "type": "中文多字"}
|
154 |
+
{"id": 26123, "token": "》", "type": "中文标点"}
|
155 |
+
{"id": 26130, "token": "注", "type": "中文单字"}
|
156 |
+
{"id": 26239, "token": "修改", "type": "中文多字"}
|
157 |
+
{"id": 26592, "token": "值", "type": "中文单字"}
|
158 |
+
{"id": 26794, "token": "输", "type": "中文单字"}
|
159 |
+
{"id": 26892, "token": "建", "type": "中文单字"}
|
160 |
+
{"id": 27179, "token": " (“", "type": "中文标点"}
|
161 |
+
{"id": 27327, "token": "能", "type": "中文单字"}
|
162 |
+
{"id": 27384, "token": "大", "type": "中文单字"}
|
163 |
+
{"id": 27452, "token": "例", "type": "中文单字"}
|
164 |
+
{"id": 27479, "token": "度", "type": "中文单字"}
|
165 |
+
{"id": 27704, "token": "始", "type": "中文单字"}
|
166 |
+
{"id": 27948, "token": "?\n\n", "type": "中文标点"}
|
167 |
+
{"id": 27996, "token": "文件", "type": "中文多字"}
|
168 |
+
{"id": 28037, "token": "到", "type": "中文单字"}
|
169 |
+
{"id": 28038, "token": "《", "type": "中文标点"}
|
170 |
+
{"id": 28190, "token": "面", "type": "中文单字"}
|
171 |
+
{"id": 28359, "token": "�数", "type": "中文多字"}
|
172 |
+
{"id": 28466, "token": "载", "type": "中文单字"}
|
173 |
+
{"id": 28469, "token": "信息", "type": "中文多字"}
|
174 |
+
{"id": 28542, "token": "点", "type": "中文单字"}
|
175 |
+
{"id": 28587, "token": "��取", "type": "中文多字"}
|
176 |
+
{"id": 28624, "token": " […]", "type": "中文标点"}
|
177 |
+
{"id": 28741, "token": "密", "type": "中文单字"}
|
178 |
+
{"id": 28833, "token": "动", "type": "中文单字"}
|
179 |
+
{"id": 28873, "token": "果", "type": "中文单字"}
|
180 |
+
{"id": 28918, "token": "、\n\n", "type": "中文标点"}
|
181 |
+
{"id": 28966, "token": ")\n\n", "type": "中文标点"}
|
182 |
+
{"id": 29096, "token": "—a", "type": "中文标点"}
|
183 |
+
{"id": 29129, "token": "图", "type": "中文单字"}
|
184 |
+
{"id": 29172, "token": "提", "type": "中文单字"}
|
185 |
+
{"id": 29391, "token": "发", "type": "中文单字"}
|
186 |
+
{"id": 29411, "token": ":\n", "type": "中文标点"}
|
187 |
+
{"id": 29430, "token": "式", "type": "中文单字"}
|
188 |
+
{"id": 29472, "token": "—\n\n", "type": "中文标点"}
|
189 |
+
{"id": 29504, "token": "国", "type": "中文单字"}
|
190 |
+
{"id": 29681, "token": "」\n\n", "type": "中文标点"}
|
191 |
+
{"id": 29706, "token": "删除", "type": "中文多字"}
|
192 |
+
{"id": 29719, "token": "’un", "type": "中文标点"}
|
193 |
+
{"id": 29741, "token": "登", "type": "中文单字"}
|
194 |
+
{"id": 29826, "token": "错", "type": "中文单字"}
|
195 |
+
{"id": 30019, "token": "。。", "type": "中文标点"}
|
196 |
+
{"id": 30046, "token": "者", "type": "中文单字"}
|
197 |
+
{"id": 30051, "token": "认", "type": "中文单字"}
|
198 |
+
{"id": 30156, "token": "误", "type": "中文单字"}
|
199 |
+
{"id": 30177, "token": "接", "type": "中文单字"}
|
200 |
+
{"id": 30184, "token": "’\n\n", "type": "中文标点"}
|
201 |
+
{"id": 30356, "token": "关", "type": "中文单字"}
|
202 |
+
{"id": 30358, "token": "重", "type": "中文单字"}
|
203 |
+
{"id": 30537, "token": "第", "type": "中文单字"}
|
204 |
+
{"id": 30590, "token": "地", "type": "中文单字"}
|
205 |
+
{"id": 30624, "token": "如", "type": "中文单字"}
|
206 |
+
{"id": 30697, "token": "————————", "type": "中文标点"}
|
207 |
+
{"id": 30735, "token": "设", "type": "中文单字"}
|
208 |
+
{"id": 30832, "token": "目", "type": "中文单字"}
|
209 |
+
{"id": 30867, "token": "开", "type": "中文单字"}
|
210 |
+
{"id": 30926, "token": "事", "type": "中文单字"}
|
211 |
+
{"id": 31041, "token": "�数", "type": "中文多字"}
|
212 |
+
{"id": 31091, "token": "名称", "type": "中文多字"}
|
213 |
+
{"id": 31378, "token": "“This", "type": "中文标点"}
|
214 |
+
{"id": 31472, "token": " :", "type": "中文标点"}
|
215 |
+
{"id": 31540, "token": "可", "type": "中文单字"}
|
216 |
+
{"id": 31634, "token": "要", "type": "中文单字"}
|
217 |
+
{"id": 31640, "token": "代", "type": "中文单字"}
|
218 |
+
{"id": 31809, "token": "小", "type": "中文单字"}
|
219 |
+
{"id": 31867, "token": "选", "type": "中文单字"}
|
220 |
+
{"id": 31944, "token": "标", "type": "中文单字"}
|
221 |
+
{"id": 31958, "token": "明", "type": "中文单字"}
|
222 |
+
{"id": 31968, "token": "编", "type": "中文单字"}
|
223 |
+
{"id": 32018, "token": "求", "type": "中文单字"}
|
224 |
+
{"id": 32218, "token": "列", "type": "中文单字"}
|
225 |
+
{"id": 32239, "token": "网", "type": "中文单字"}
|
226 |
+
{"id": 32296, "token": "输入", "type": "中文多字"}
|
227 |
+
{"id": 32307, "token": "万", "type": "中文单字"}
|
228 |
+
{"id": 32335, "token": "最", "type": "中文单字"}
|
229 |
+
{"id": 32351, "token": "!!", "type": "中文标点"}
|
230 |
+
{"id": 32438, "token": "�建", "type": "中文多字"}
|
231 |
+
{"id": 32626, "token": "返回", "type": "中文多字"}
|
232 |
+
{"id": 32648, "token": "器", "type": "中文单字"}
|
233 |
+
{"id": 32938, "token": "所", "type": "中文单字"}
|
234 |
+
{"id": 32943, "token": "内", "type": "中文单字"}
|
235 |
+
{"id": 33005, "token": "类型", "type": "中文多字"}
|
236 |
+
{"id": 33014, "token": "体", "type": "中文单字"}
|
237 |
+
{"id": 33035, "token": "通", "type": "中文单字"}
|
238 |
+
{"id": 33052, "token": "务", "type": "中文单字"}
|
239 |
+
{"id": 33091, "token": "此", "type": "中文单字"}
|
240 |
+
{"id": 33122, "token": "商", "type": "中文单字"}
|
241 |
+
{"id": 33144, "token": "序", "type": "中文单字"}
|
242 |
+
{"id": 33200, "token": "错误", "type": "中文多字"}
|
243 |
+
{"id": 33208, "token": "化", "type": "中文单字"}
|
244 |
+
{"id": 33420, "token": "消", "type": "中文单字"}
|
245 |
+
{"id": 33476, "token": "否", "type": "中文单字"}
|
246 |
+
{"id": 33563, "token": "保", "type": "中文单字"}
|
247 |
+
{"id": 33611, "token": "”)", "type": "中文标点"}
|
248 |
+
{"id": 33655, "token": "使", "type": "中文单字"}
|
249 |
+
{"id": 33671, "token": "次", "type": "中文单字"}
|
250 |
+
{"id": 33672, "token": "“You", "type": "中文标点"}
|
251 |
+
{"id": 33748, "token": "机", "type": "中文单字"}
|
252 |
+
{"id": 33764, "token": "对", "type": "中文单字"}
|
253 |
+
{"id": 33765, "token": "参数", "type": "中文多字"}
|
254 |
+
{"id": 33777, "token": "’é", "type": "中文标点"}
|
255 |
+
{"id": 33857, "token": "量", "type": "中文单字"}
|
256 |
+
{"id": 33904, "token": "函数", "type": "中文多字"}
|
257 |
+
{"id": 33967, "token": "密码", "type": "中文多字"}
|
258 |
+
{"id": 33976, "token": "查", "type": "中文单字"}
|
259 |
+
{"id": 34045, "token": "。”", "type": "中文标点"}
|
260 |
+
{"id": 34048, "token": "部", "type": "中文单字"}
|
261 |
+
{"id": 34171, "token": "性", "type": "中文单字"}
|
262 |
+
{"id": 34208, "token": "和", "type": "中文单字"}
|
263 |
+
{"id": 34226, "token": "更", "type": "中文单字"}
|
264 |
+
{"id": 34547, "token": "后", "type": "中文单字"}
|
265 |
+
{"id": 34577, "token": "证", "type": "中文单字"}
|
266 |
+
{"id": 34676, "token": " 【", "type": "中文标点"}
|
267 |
+
{"id": 34690, "token": "”,", "type": "中文标点"}
|
268 |
+
{"id": 34972, "token": "题", "type": "中文单字"}
|
269 |
+
{"id": 35056, "token": "确", "type": "中文单字"}
|
270 |
+
{"id": 35083, "token": "格", "type": "中文单字"}
|
271 |
+
{"id": 35147, "token": ".“", "type": "中文标点"}
|
272 |
+
{"id": 35192, "token": ".—", "type": "中文标点"}
|
273 |
+
{"id": 35284, "token": ".”\n\n\n\n", "type": "中文标点"}
|
274 |
+
{"id": 35287, "token": "了", "type": "中文单字"}
|
275 |
+
{"id": 35304, "token": "���", "type": "中文单字"}
|
276 |
+
{"id": 35330, "token": "金", "type": "中文单字"}
|
277 |
+
{"id": 35417, "token": "公", "type": "中文单字"}
|
278 |
+
{"id": 35424, "token": "午", "type": "中文单字"}
|
279 |
+
{"id": 35757, "token": "円", "type": "中文单字"}
|
280 |
+
{"id": 35816, "token": "“There", "type": "中文标点"}
|
281 |
+
{"id": 35818, "token": "片", "type": "中文单字"}
|
282 |
+
{"id": 35894, "token": "空", "type": "中文单字"}
|
283 |
+
{"id": 35959, "token": "请求", "type": "中文多字"}
|
284 |
+
{"id": 36225, "token": "��加", "type": "中文多字"}
|
285 |
+
{"id": 36319, "token": ".’\n\n", "type": "中文标点"}
|
286 |
+
{"id": 36343, "token": "态", "type": "中文单字"}
|
287 |
+
{"id": 36515, "token": "登录", "type": "中文多字"}
|
288 |
+
{"id": 36577, "token": "’une", "type": "中文标点"}
|
289 |
+
{"id": 36651, "token": "管", "type": "中文单字"}
|
290 |
+
{"id": 36668, "token": "主", "type": "中文单字"}
|
291 |
+
{"id": 36761, "token": "』", "type": "中文标点"}
|
292 |
+
{"id": 36827, "token": "天", "type": "中文单字"}
|
293 |
+
{"id": 36896, "token": "、「", "type": "中文标点"}
|
294 |
+
{"id": 37026, "token": "自", "type": "中文单字"}
|
295 |
+
{"id": 37046, "token": "我", "type": "中文单字"}
|
296 |
+
{"id": 37087, "token": "全", "type": "中文单字"}
|
297 |
+
{"id": 37271, "token": "今", "type": "中文单字"}
|
298 |
+
{"id": 37395, "token": "页面", "type": "中文多字"}
|
299 |
+
{"id": 37507, "token": "来", "type": "中文单字"}
|
300 |
+
{"id": 37648, "token": "��作", "type": "中文多字"}
|
301 |
+
{"id": 37656, "token": "正", "type": "中文单字"}
|
302 |
+
{"id": 37687, "token": "说", "type": "中文单字"}
|
303 |
+
{"id": 37689, "token": "意", "type": "中文单字"}
|
304 |
+
{"id": 37705, "token": "送", "type": "中文单字"}
|
305 |
+
{"id": 37729, "token": "容", "type": "中文单字"}
|
306 |
+
{"id": 37767, "token": "已", "type": "中文单字"}
|
307 |
+
{"id": 37985, "token": "结", "type": "中文单字"}
|
308 |
+
{"id": 38087, "token": ":“", "type": "中文标点"}
|
309 |
+
{"id": 38093, "token": "会", "type": "中文单字"}
|
310 |
+
{"id": 38129, "token": "使用", "type": "中文多字"}
|
311 |
+
{"id": 38232, "token": "。</", "type": "中文标点"}
|
312 |
+
{"id": 38365, "token": "。\r\n", "type": "中文标点"}
|
313 |
+
{"id": 38542, "token": "—but", "type": "中文标点"}
|
314 |
+
{"id": 38574, "token": "段", "type": "中文单字"}
|
315 |
+
{"id": 38609, "token": "�认", "type": "中文多字"}
|
316 |
+
{"id": 38684, "token": "“If", "type": "中文标点"}
|
317 |
+
{"id": 38741, "token": "。,", "type": "中文标点"}
|
318 |
+
{"id": 38743, "token": "计", "type": "中文单字"}
|
319 |
+
{"id": 39045, "token": ",请", "type": "中文多字"}
|
320 |
+
{"id": 39084, "token": "源", "type": "中文单字"}
|
321 |
+
{"id": 39135, "token": "色", "type": "中文单字"}
|
322 |
+
{"id": 39177, "token": "時", "type": "中文单字"}
|
323 |
+
{"id": 39209, "token": "交", "type": "中文单字"}
|
324 |
+
{"id": 39276, "token": "系", "type": "中文单字"}
|
325 |
+
{"id": 39282, "token": "过", "type": "中文单字"}
|
326 |
+
{"id": 39312, "token": "电", "type": "中文单字"}
|
327 |
+
{"id": 39365, "token": "询", "type": "中文单字"}
|
328 |
+
{"id": 39404, "token": "符", "type": "中文单字"}
|
329 |
+
{"id": 39425, "token": "…………", "type": "中文标点"}
|
330 |
+
{"id": 39442, "token": "未", "type": "中文单字"}
|
331 |
+
{"id": 39607, "token": "程", "type": "中文单字"}
|
332 |
+
{"id": 40053, "token": "常", "type": "中文单字"}
|
333 |
+
{"id": 40089, "token": "条", "type": "中文单字"}
|
334 |
+
{"id": 40195, "token": "下", "type": "中文单字"}
|
335 |
+
{"id": 40265, "token": "当", "type": "中文单字"}
|
336 |
+
{"id": 40452, "token": "管理", "type": "中文多字"}
|
337 |
+
{"id": 40466, "token": "��态", "type": "中文多字"}
|
338 |
+
{"id": 40474, "token": "情", "type": "中文单字"}
|
339 |
+
{"id": 40526, "token": "口", "type": "中文单字"}
|
340 |
+
{"id": 40565, "token": "“He", "type": "中文标点"}
|
341 |
+
{"id": 40702, "token": "’S", "type": "中文标点"}
|
342 |
+
{"id": 40753, "token": "’a", "type": "中文标点"}
|
343 |
+
{"id": 40862, "token": "合", "type": "中文单字"}
|
344 |
+
{"id": 41007, "token": "方法", "type": "中文多字"}
|
345 |
+
{"id": 41053, "token": "车", "type": "中文单字"}
|
346 |
+
{"id": 41073, "token": "实", "type": "中文单字"}
|
347 |
+
{"id": 41127, "token": "组", "type": "中文单字"}
|
348 |
+
{"id": 41128, "token": "—that", "type": "中文标点"}
|
349 |
+
{"id": 41190, "token": "操作", "type": "中文多字"}
|
350 |
+
{"id": 41354, "token": "’.\n\n", "type": "中文标点"}
|
351 |
+
{"id": 41401, "token": "版", "type": "中文单字"}
|
352 |
+
{"id": 41642, "token": "周", "type": "中文单字"}
|
353 |
+
{"id": 41723, "token": "址", "type": "中文单字"}
|
354 |
+
{"id": 41771, "token": "获取", "type": "中文多字"}
|
355 |
+
{"id": 41827, "token": ":\"", "type": "中文标点"}
|
356 |
+
{"id": 41914, "token": "记", "type": "中文单字"}
|
357 |
+
{"id": 41920, "token": "二", "type": "中文单字"}
|
358 |
+
{"id": 42016, "token": "同", "type": "中文单字"}
|
359 |
+
{"id": 42052, "token": "业", "type": "中文单字"}
|
360 |
+
{"id": 42081, "token": "权", "type": "中文单字"}
|
361 |
+
{"id": 42246, "token": "其", "type": "中文单字"}
|
362 |
+
{"id": 42275, "token": " ,", "type": "中文标点"}
|
363 |
+
{"id": 42399, "token": "进", "type": "中文单字"}
|
364 |
+
{"id": 42421, "token": "试", "type": "中文单字"}
|
365 |
+
{"id": 42462, "token": "验", "type": "中文单字"}
|
366 |
+
{"id": 42506, "token": "料", "type": "中文单字"}
|
367 |
+
{"id": 42553, "token": ",\n", "type": "中文标点"}
|
368 |
+
{"id": 42605, "token": ",“", "type": "中文标点"}
|
369 |
+
{"id": 42783, "token": "传", "type": "中文单字"}
|
370 |
+
{"id": 43032, "token": "述", "type": "中文单字"}
|
371 |
+
{"id": 43167, "token": "集", "type": "中文单字"}
|
372 |
+
{"id": 43240, "token": "多", "type": "中文单字"}
|
373 |
+
{"id": 43292, "token": "无", "type": "中文单字"}
|
374 |
+
{"id": 43323, "token": "员", "type": "中文单字"}
|
375 |
+
{"id": 43378, "token": "报", "type": "中文单字"}
|
376 |
+
{"id": 43444, "token": " (", "type": "中文标点"}
|
377 |
+
{"id": 43511, "token": "他", "type": "中文单字"}
|
378 |
+
{"id": 43568, "token": "無", "type": "中文单字"}
|
379 |
+
{"id": 43741, "token": "‘s", "type": "中文标点"}
|
380 |
+
{"id": 43955, "token": "添加", "type": "中文多字"}
|
381 |
+
{"id": 44130, "token": "“What", "type": "中文标点"}
|
382 |
+
{"id": 44309, "token": "服", "type": "中文单字"}
|
383 |
+
{"id": 44368, "token": "线", "type": "中文单字"}
|
384 |
+
{"id": 44388, "token": "这", "type": "中文单字"}
|
385 |
+
{"id": 44416, "token": "制", "type": "中文单字"}
|
386 |
+
{"id": 44529, "token": " ", "type": "中文标点"}
|
387 |
+
{"id": 44603, "token": "—it", "type": "中文标点"}
|
388 |
+
{"id": 44620, "token": "『", "type": "中文标点"}
|
389 |
+
{"id": 44689, "token": "的", "type": "中文单字"}
|
390 |
+
{"id": 44816, "token": "�始", "type": "中文多字"}
|
391 |
+
{"id": 44820, "token": "�单", "type": "中文多字"}
|
392 |
+
{"id": 44915, "token": "内容", "type": "中文多字"}
|
393 |
+
{"id": 44996, "token": "’il", "type": "中文标点"}
|
394 |
+
{"id": 45018, "token": "设置", "type": "中文多字"}
|
395 |
+
{"id": 45059, "token": "生成", "type": "中文多字"}
|
396 |
+
{"id": 45163, "token": "将", "type": "中文单字"}
|
397 |
+
{"id": 45191, "token": "状态", "type": "中文多字"}
|
398 |
+
{"id": 45221, "token": "=”", "type": "中文标点"}
|
399 |
+
{"id": 45258, "token": "?’", "type": "中文标点"}
|
400 |
+
{"id": 45277, "token": "列表", "type": "中文多字"}
|
401 |
+
{"id": 45390, "token": "处", "type": "中文单字"}
|
402 |
+
{"id": 45460, "token": "】\n\n", "type": "中文标点"}
|
403 |
+
{"id": 45472, "token": "输", "type": "中文单字"}
|
404 |
+
{"id": 45516, "token": "!\");\n", "type": "中文标点"}
|
405 |
+
{"id": 45631, "token": " 「", "type": "中文标点"}
|
406 |
+
{"id": 45736, "token": "高", "type": "中文单字"}
|
407 |
+
{"id": 45829, "token": "子", "type": "中文单字"}
|
408 |
+
{"id": 45893, "token": "道", "type": "中文单字"}
|
409 |
+
{"id": 45934, "token": "�述", "type": "中文多字"}
|
410 |
+
{"id": 46028, "token": "章", "type": "中文单字"}
|
411 |
+
{"id": 46031, "token": "字段", "type": "中文多字"}
|
412 |
+
{"id": 46034, "token": "手", "type": "中文单字"}
|
413 |
+
{"id": 46056, "token": "库", "type": "中文单字"}
|
414 |
+
{"id": 46091, "token": "三", "type": "中文单字"}
|
415 |
+
{"id": 46093, "token": "….\n\n", "type": "中文标点"}
|
416 |
+
{"id": 46233, "token": "“In", "type": "中文标点"}
|
417 |
+
{"id": 46239, "token": "提示", "type": "中文多字"}
|
418 |
+
{"id": 46281, "token": "从", "type": "中文单字"}
|
419 |
+
{"id": 46456, "token": "支", "type": "中文单字"}
|
420 |
+
{"id": 46690, "token": "“They", "type": "中文标点"}
|
421 |
+
{"id": 46729, "token": "家", "type": "中文单字"}
|
422 |
+
{"id": 46885, "token": "日期", "type": "中文多字"}
|
423 |
+
{"id": 46961, "token": "长", "type": "中文单字"}
|
424 |
+
{"id": 47000, "token": "付", "type": "中文单字"}
|
425 |
+
{"id": 47012, "token": "获取", "type": "中文多字"}
|
426 |
+
{"id": 47018, "token": "秒", "type": "中文单字"}
|
427 |
+
{"id": 47030, "token": "图片", "type": "中文多字"}
|
428 |
+
{"id": 47043, "token": "商品", "type": "中文多字"}
|
429 |
+
{"id": 47095, "token": "路", "type": "中文单字"}
|
430 |
+
{"id": 47200, "token": "代码", "type": "中文多字"}
|
431 |
+
{"id": 47406, "token": "完", "type": "中文单字"}
|
432 |
+
{"id": 47436, "token": ":</", "type": "中文标点"}
|
433 |
+
{"id": 47523, "token": "象", "type": "中文单字"}
|
434 |
+
{"id": 47548, "token": "则", "type": "中文单字"}
|
435 |
+
{"id": 47551, "token": "现", "type": "中文单字"}
|
436 |
+
{"id": 47566, "token": "设", "type": "中文单字"}
|
437 |
+
{"id": 47577, "token": "地址", "type": "中文多字"}
|
438 |
+
{"id": 47585, "token": "保存", "type": "中文多字"}
|
439 |
+
{"id": 47653, "token": "京", "type": "中文单字"}
|
440 |
+
{"id": 47770, "token": "转", "type": "中文单字"}
|
441 |
+
{"id": 47896, "token": " –\n\n", "type": "中文标点"}
|
442 |
+
{"id": 47971, "token": "�示", "type": "中文多字"}
|
443 |
+
{"id": 48039, "token": "辑", "type": "中文单字"}
|
444 |
+
{"id": 48044, "token": "一个", "type": "中文多字"}
|
445 |
+
{"id": 48249, "token": "限", "type": "中文单字"}
|
446 |
+
{"id": 48349, "token": "“A", "type": "中文标点"}
|
447 |
+
{"id": 48463, "token": "默认", "type": "中文多字"}
|
448 |
+
{"id": 48634, "token": "力", "type": "中文单字"}
|
449 |
+
{"id": 48706, "token": "存在", "type": "中文多字"}
|
450 |
+
{"id": 48785, "token": "数", "type": "中文单字"}
|
451 |
+
{"id": 48858, "token": "创建", "type": "中文多字"}
|
452 |
+
{"id": 48864, "token": "学", "type": "中文单字"}
|
453 |
+
{"id": 48915, "token": "外", "type": "中文单字"}
|
454 |
+
{"id": 48972, "token": "调", "type": "中文单字"}
|
455 |
+
{"id": 48974, "token": "服务", "type": "中文多字"}
|
456 |
+
{"id": 48982, "token": "项", "type": "中文单字"}
|
457 |
+
{"id": 49055, "token": "请输入", "type": "中文多字"}
|
458 |
+
{"id": 49216, "token": ".”\n", "type": "中文标点"}
|
459 |
+
{"id": 49372, "token": "),", "type": "中文标点"}
|
460 |
+
{"id": 49409, "token": "北", "type": "中文单字"}
|
461 |
+
{"id": 49491, "token": "字符", "type": "中文多字"}
|
462 |
+
{"id": 49525, "token": "—in", "type": "中文标点"}
|
463 |
+
{"id": 49543, "token": ":\n\n", "type": "中文标点"}
|
464 |
+
{"id": 49792, "token": "工", "type": "中文单字"}
|
465 |
+
{"id": 49838, "token": "笑", "type": "中文单字"}
|
466 |
+
{"id": 49928, "token": "监", "type": "中文单字"}
|
467 |
+
{"id": 49977, "token": "“That", "type": "中文标点"}
|
468 |
+
{"id": 49988, "token": "任", "type": "中文单字"}
|
469 |
+
{"id": 50004, "token": "—which", "type": "中文标点"}
|
470 |
+
{"id": 50021, "token": "相", "type": "中文单字"}
|
471 |
+
{"id": 50027, "token": "验证", "type": "中文多字"}
|
472 |
+
{"id": 50034, "token": "微", "type": "中文单字"}
|
473 |
+
{"id": 50126, "token": "册", "type": "中文单字"}
|
474 |
+
{"id": 50182, "token": "联", "type": "中文单字"}
|
475 |
+
{"id": 50211, "token": "平", "type": "中文单字"}
|
476 |
+
{"id": 50285, "token": "增", "type": "中文单字"}
|
477 |
+
{"id": 50287, "token": "听", "type": "中文单字"}
|
478 |
+
{"id": 50338, "token": "解", "type": "中文单字"}
|
479 |
+
{"id": 50617, "token": "—to", "type": "中文标点"}
|
480 |
+
{"id": 50667, "token": "等", "type": "中文单字"}
|
481 |
+
{"id": 50808, "token": "’ai", "type": "中文标点"}
|
482 |
+
{"id": 50928, "token": "得", "type": "中文单字"}
|
483 |
+
{"id": 51107, "token": "更新", "type": "中文多字"}
|
484 |
+
{"id": 51109, "token": "收", "type": "中文单字"}
|
485 |
+
{"id": 51142, "token": "用户", "type": "中文多字"}
|
486 |
+
{"id": 51202, "token": "选�", "type": "中文多字"}
|
487 |
+
{"id": 51279, "token": "…”", "type": "中文标点"}
|
488 |
+
{"id": 51385, "token": "安", "type": "中文单字"}
|
489 |
+
{"id": 51392, "token": "价", "type": "中文单字"}
|
490 |
+
{"id": 51431, "token": "第", "type": "中文单字"}
|
491 |
+
{"id": 51450, "token": "取消", "type": "中文多字"}
|
492 |
+
{"id": 51466, "token": "藏", "type": "中文单字"}
|
493 |
+
{"id": 51477, "token": "创建", "type": "中文多字"}
|
494 |
+
{"id": 51504, "token": "选择", "type": "中文多字"}
|
495 |
+
{"id": 51510, "token": "订单", "type": "中文多字"}
|
496 |
+
{"id": 51609, "token": "命", "type": "中文单字"}
|
497 |
+
{"id": 51611, "token": "应", "type": "中文单字"}
|
498 |
+
{"id": 51747, "token": "为空", "type": "中文多字"}
|
499 |
+
{"id": 51749, "token": "—or", "type": "中文标点"}
|
500 |
+
{"id": 51757, "token": "—I", "type": "中文标点"}
|
501 |
+
{"id": 51786, "token": "“,", "type": "中文标点"}
|
502 |
+
{"id": 51928, "token": "“When", "type": "中文标点"}
|
503 |
+
{"id": 52030, "token": "看", "type": "中文单字"}
|
504 |
+
{"id": 52084, "token": "索", "type": "中文单字"}
|
505 |
+
{"id": 52188, "token": "�始化", "type": "中文多字"}
|
506 |
+
{"id": 52225, "token": "资", "type": "中文单字"}
|
507 |
+
{"id": 52254, "token": "查询", "type": "中文多字"}
|
508 |
+
{"id": 52289, "token": "’en", "type": "中文标点"}
|
509 |
+
{"id": 52332, "token": "产", "type": "中文单字"}
|
510 |
+
{"id": 52563, "token": "表示", "type": "中文多字"}
|
511 |
+
{"id": 52675, "token": "串", "type": "中文单字"}
|
512 |
+
{"id": 52927, "token": "布", "type": "中文单字"}
|
513 |
+
{"id": 53229, "token": "原", "type": "中文单字"}
|
514 |
+
{"id": 53263, "token": "…..", "type": "中文标点"}
|
515 |
+
{"id": 53283, "token": "知", "type": "中文单字"}
|
516 |
+
{"id": 53434, "token": "级", "type": "中文单字"}
|
517 |
+
{"id": 53513, "token": "––", "type": "中文标点"}
|
518 |
+
{"id": 53610, "token": "水", "type": "中文单字"}
|
519 |
+
{"id": 53626, "token": "上传", "type": "中文多字"}
|
520 |
+
{"id": 53676, "token": "…and", "type": "中文标点"}
|
521 |
+
{"id": 53802, "token": "监听", "type": "中文多字"}
|
522 |
+
{"id": 53826, "token": "击", "type": "中文单字"}
|
523 |
+
{"id": 53901, "token": "好", "type": "中文单字"}
|
524 |
+
{"id": 53953, "token": "物", "type": "中文单字"}
|
525 |
+
{"id": 54140, "token": "文", "type": "中文单字"}
|
526 |
+
{"id": 54154, "token": "设置", "type": "中文多字"}
|
527 |
+
{"id": 54253, "token": "不能", "type": "中文多字"}
|
528 |
+
{"id": 54322, "token": "放", "type": "中文单字"}
|
529 |
+
{"id": 54456, "token": "亿", "type": "中文单字"}
|
530 |
+
{"id": 54493, "token": "经", "type": "中文单字"}
|
531 |
+
{"id": 54581, "token": "描述", "type": "中文多字"}
|
532 |
+
{"id": 54689, "token": "。。\n\n", "type": "中文标点"}
|
533 |
+
{"id": 54747, "token": "。“", "type": "中文标点"}
|
534 |
+
{"id": 54872, "token": "模", "type": "中文单字"}
|
535 |
+
{"id": 55030, "token": "之", "type": "中文单字"}
|
536 |
+
{"id": 55038, "token": "台", "type": "中文单字"}
|
537 |
+
{"id": 55080, "token": "…I", "type": "中文标点"}
|
538 |
+
{"id": 55121, "token": "显示", "type": "中文多字"}
|
539 |
+
{"id": 55139, "token": "州", "type": "中文单字"}
|
540 |
+
{"id": 55434, "token": "—is", "type": "中文标点"}
|
541 |
+
{"id": 55487, "token": "配", "type": "中文单字"}
|
542 |
+
{"id": 55642, "token": "处理", "type": "中文多字"}
|
543 |
+
{"id": 55723, "token": "画", "type": "中文单字"}
|
544 |
+
{"id": 55758, "token": "统", "type": "中文单字"}
|
545 |
+
{"id": 55951, "token": "是", "type": "中文单字"}
|
546 |
+
{"id": 55999, "token": "共", "type": "中文单字"}
|
547 |
+
{"id": 56026, "token": "连", "type": "中文单字"}
|
548 |
+
{"id": 56040, "token": "〜", "type": "中文标点"}
|
549 |
+
{"id": 56163, "token": "„", "type": "中文标点"}
|
550 |
+
{"id": 56209, "token": "…\"", "type": "中文标点"}
|
551 |
+
{"id": 56235, "token": "海", "type": "中文单字"}
|
552 |
+
{"id": 56386, "token": "开始", "type": "中文多字"}
|
553 |
+
{"id": 56438, "token": "所有", "type": "中文多字"}
|
554 |
+
{"id": 56602, "token": "节", "type": "中文单字"}
|
555 |
+
{"id": 56716, "token": "返回", "type": "中文多字"}
|
556 |
+
{"id": 56906, "token": "退", "type": "中文单字"}
|
557 |
+
{"id": 56907, "token": "”。", "type": "中文标点"}
|
558 |
+
{"id": 56955, "token": "”),", "type": "中文标点"}
|
559 |
+
{"id": 56965, "token": "間", "type": "中文单字"}
|
560 |
+
{"id": 57106, "token": "比", "type": "中文单字"}
|
561 |
+
{"id": 57107, "token": "问", "type": "中文单字"}
|
562 |
+
{"id": 57237, "token": "至", "type": "中文单字"}
|
563 |
+
{"id": 57287, "token": "’aut", "type": "中文标点"}
|
564 |
+
{"id": 57378, "token": "备", "type": "中文单字"}
|
565 |
+
{"id": 57633, "token": "”:", "type": "中文标点"}
|
566 |
+
{"id": 57668, "token": "你", "type": "中文单字"}
|
567 |
+
{"id": 57752, "token": "黑", "type": "中文单字"}
|
568 |
+
{"id": 57861, "token": "…”\n\n", "type": "中文标点"}
|
569 |
+
{"id": 57892, "token": "’av", "type": "中文标点"}
|
570 |
+
{"id": 58004, "token": "下午", "type": "中文多字"}
|
571 |
+
{"id": 58119, "token": "编辑", "type": "中文多字"}
|
572 |
+
{"id": 58291, "token": "或", "type": "中文单字"}
|
573 |
+
{"id": 58318, "token": "与", "type": "中文单字"}
|
574 |
+
{"id": 58322, "token": "影", "type": "中文单字"}
|
575 |
+
{"id": 58386, "token": "’h", "type": "中文标点"}
|
576 |
+
{"id": 58521, "token": "作者", "type": "中文多字"}
|
577 |
+
{"id": 58543, "token": "话", "type": "中文单字"}
|
578 |
+
{"id": 58552, "token": "视", "type": "中文单字"}
|
579 |
+
{"id": 58653, "token": "读", "type": "中文单字"}
|
580 |
+
{"id": 58655, "token": "告", "type": "中文单字"}
|
581 |
+
{"id": 58666, "token": "美", "type": "中文单字"}
|
582 |
+
{"id": 58721, "token": "事件", "type": "中文多字"}
|
583 |
+
{"id": 58850, "token": "女", "type": "中文单字"}
|
584 |
+
{"id": 58911, "token": "山", "type": "中文单字"}
|
585 |
+
{"id": 59243, "token": "和", "type": "中文单字"}
|
586 |
+
{"id": 59363, "token": "生", "type": "中文单字"}
|
587 |
+
{"id": 59459, "token": "。(", "type": "中文标点"}
|
588 |
+
{"id": 59462, "token": "需", "type": "中文单字"}
|
589 |
+
{"id": 59464, "token": "复", "type": "中文单字"}
|
590 |
+
{"id": 59505, "token": "手机", "type": "中文多字"}
|
591 |
+
{"id": 59563, "token": "南", "type": "中文单字"}
|
592 |
+
{"id": 59614, "token": "必", "type": "中文单字"}
|
593 |
+
{"id": 59622, "token": "�行", "type": "中文多字"}
|
594 |
+
{"id": 59712, "token": "」「", "type": "中文标点"}
|
595 |
+
{"id": 59757, "token": "分", "type": "中文单字"}
|
596 |
+
{"id": 59795, "token": "中国", "type": "中文多字"}
|
597 |
+
{"id": 59892, "token": "闭", "type": "中文单字"}
|
598 |
+
{"id": 59914, "token": "加载", "type": "中文多字"}
|
599 |
+
{"id": 60174, "token": "城", "type": "中文单字"}
|
600 |
+
{"id": 60205, "token": "用户名", "type": "中文多字"}
|
601 |
+
{"id": 60233, "token": " 。", "type": "中文标点"}
|
602 |
+
{"id": 60239, "token": "�性", "type": "中文多字"}
|
603 |
+
{"id": 60251, "token": "结果", "type": "中文多字"}
|
604 |
+
{"id": 60317, "token": ";\n", "type": "中文标点"}
|
605 |
+
{"id": 60358, "token": "近", "type": "中文单字"}
|
606 |
+
{"id": 60455, "token": "效", "type": "中文单字"}
|
607 |
+
{"id": 60632, "token": "利", "type": "中文单字"}
|
608 |
+
{"id": 60634, "token": "移", "type": "中文单字"}
|
609 |
+
{"id": 60654, "token": "—as", "type": "中文标点"}
|
610 |
+
{"id": 60656, "token": "’int", "type": "中文标点"}
|
611 |
+
{"id": 60710, "token": "–\n\n", "type": "中文标点"}
|
612 |
+
{"id": 60843, "token": "总", "type": "中文单字"}
|
613 |
+
{"id": 60979, "token": "按", "type": "中文单字"}
|
614 |
+
{"id": 61056, "token": "排", "type": "中文单字"}
|
615 |
+
{"id": 61075, "token": "首", "type": "中文单字"}
|
616 |
+
{"id": 61131, "token": "’n", "type": "中文标点"}
|
617 |
+
{"id": 61176, "token": "··", "type": "中文标点"}
|
618 |
+
{"id": 61304, "token": "記", "type": "中文单字"}
|
619 |
+
{"id": 61311, "token": "————————————————", "type": "中文标点"}
|
620 |
+
{"id": 61337, "token": "社", "type": "中文单字"}
|
621 |
+
{"id": 61496, "token": "标题", "type": "中文多字"}
|
622 |
+
{"id": 61553, "token": "“As", "type": "中文标点"}
|
623 |
+
{"id": 61559, "token": "“No", "type": "中文标点"}
|
624 |
+
{"id": 61603, "token": "“But", "type": "中文标点"}
|
625 |
+
{"id": 61633, "token": "注意", "type": "中文多字"}
|
626 |
+
{"id": 61648, "token": "完成", "type": "中文多字"}
|
627 |
+
{"id": 61710, "token": "确定", "type": "中文多字"}
|
628 |
+
{"id": 61786, "token": "西", "type": "中文单字"}
|
629 |
+
{"id": 61826, "token": "先", "type": "中文单字"}
|
630 |
+
{"id": 61903, "token": "…\"\n\n", "type": "中文标点"}
|
631 |
+
{"id": 61994, "token": "然", "type": "中文单字"}
|
632 |
+
{"id": 62049, "token": "键", "type": "中文单字"}
|
633 |
+
{"id": 62205, "token": "名", "type": "中文单字"}
|
634 |
+
{"id": 62249, "token": "周期", "type": "中文多字"}
|
635 |
+
{"id": 62291, "token": "额", "type": "中文单字"}
|
636 |
+
{"id": 62543, "token": "写", "type": "中文单字"}
|
637 |
+
{"id": 62597, "token": "“My", "type": "中文标点"}
|
638 |
+
{"id": 62717, "token": "�名", "type": "中文多字"}
|
639 |
+
{"id": 62789, "token": "注册", "type": "中文多字"}
|
640 |
+
{"id": 62855, "token": "签", "type": "中文单字"}
|
641 |
+
{"id": 63091, "token": "自", "type": "中文单字"}
|
642 |
+
{"id": 63093, "token": "。',\n", "type": "中文标点"}
|
643 |
+
{"id": 63212, "token": "因", "type": "中文单字"}
|
644 |
+
{"id": 63289, "token": "下载", "type": "中文多字"}
|
645 |
+
{"id": 63344, "token": "如果", "type": "中文多字"}
|
646 |
+
{"id": 63362, "token": "数据", "type": "中文多字"}
|
647 |
+
{"id": 63397, "token": "命周期", "type": "中文多字"}
|
648 |
+
{"id": 63679, "token": "注", "type": "中文单字"}
|
649 |
+
{"id": 63750, "token": "”—", "type": "中文标点"}
|
650 |
+
{"id": 63938, "token": "—not", "type": "中文标点"}
|
651 |
+
{"id": 63977, "token": " —\n\n", "type": "中文标点"}
|
652 |
+
{"id": 64022, "token": "别", "type": "中文单字"}
|
653 |
+
{"id": 64026, "token": "并", "type": "中文单字"}
|
654 |
+
{"id": 64045, "token": "异", "type": "中文单字"}
|
655 |
+
{"id": 64063, "token": "束", "type": "中文单字"}
|
656 |
+
{"id": 64171, "token": "修改", "type": "中文多字"}
|
657 |
+
{"id": 64173, "token": "删除", "type": "中文多字"}
|
658 |
+
{"id": 64179, "token": "生命周期", "type": "中文多字"}
|
659 |
+
{"id": 64209, "token": "心", "type": "中文单字"}
|
660 |
+
{"id": 64376, "token": "。\",\n", "type": "中文标点"}
|
661 |
+
{"id": 64414, "token": "链", "type": "中文单字"}
|
662 |
+
{"id": 64467, "token": "指", "type": "中文单字"}
|
663 |
+
{"id": 64479, "token": "评", "type": "中文单字"}
|
664 |
+
{"id": 64531, "token": "整", "type": "中文单字"}
|
665 |
+
{"id": 64623, "token": "’in", "type": "中文标点"}
|
666 |
+
{"id": 64803, "token": "四", "type": "中文单字"}
|
667 |
+
{"id": 64889, "token": "断", "type": "中文单字"}
|
668 |
+
{"id": 64936, "token": "角", "type": "中文单字"}
|
669 |
+
{"id": 64960, "token": "生命周期函数", "type": "中文多字"}
|
670 |
+
{"id": 65053, "token": "监听页面", "type": "中文多字"}
|
671 |
+
{"id": 65164, "token": "连接", "type": "中文多字"}
|
672 |
+
{"id": 65218, "token": "上", "type": "中文单字"}
|
673 |
+
{"id": 65305, "token": "消息", "type": "中文多字"}
|
674 |
+
{"id": 65312, "token": "”).", "type": "中文标点"}
|
675 |
+
{"id": 65372, "token": "软", "type": "中文单字"}
|
676 |
+
{"id": 65455, "token": "头", "type": "中文单字"}
|
677 |
+
{"id": 65459, "token": ")、", "type": "中文标点"}
|
678 |
+
{"id": 65529, "token": "对象", "type": "中文多字"}
|
679 |
+
{"id": 65571, "token": "是否", "type": "中文多字"}
|
680 |
+
{"id": 65573, "token": "邮", "type": "中文单字"}
|
681 |
+
{"id": 65659, "token": "义", "type": "中文单字"}
|
682 |
+
{"id": 65743, "token": "司", "type": "中文单字"}
|
683 |
+
{"id": 65782, "token": "步", "type": "中文单字"}
|
684 |
+
{"id": 65789, "token": "门", "type": "中文单字"}
|
685 |
+
{"id": 65820, "token": "导", "type": "中文单字"}
|
686 |
+
{"id": 65854, "token": "客", "type": "中文单字"}
|
687 |
+
{"id": 65884, "token": "不能为空", "type": "中文多字"}
|
688 |
+
{"id": 65917, "token": "右", "type": "中文单字"}
|
689 |
+
{"id": 66052, "token": "频", "type": "中文单字"}
|
690 |
+
{"id": 66101, "token": "\"—", "type": "中文标点"}
|
691 |
+
{"id": 66201, "token": "像", "type": "中文单字"}
|
692 |
+
{"id": 66327, "token": "。「", "type": "中文标点"}
|
693 |
+
{"id": 66378, "token": "特", "type": "中文单字"}
|
694 |
+
{"id": 66383, "token": "」と", "type": "中文标点"}
|
695 |
+
{"id": 66545, "token": "”;", "type": "中文标点"}
|
696 |
+
{"id": 66621, "token": " ….", "type": "中文标点"}
|
697 |
+
{"id": 66625, "token": "“Our", "type": "中文标点"}
|
698 |
+
{"id": 66677, "token": "记录", "type": "中文多字"}
|
699 |
+
{"id": 66679, "token": "…\n\n\n", "type": "中文标点"}
|
700 |
+
{"id": 66776, "token": "非", "type": "中文单字"}
|
701 |
+
{"id": 66850, "token": " “[", "type": "中文标点"}
|
702 |
+
{"id": 66870, "token": "省", "type": "中文单字"}
|
703 |
+
{"id": 67117, "token": "输出", "type": "中文多字"}
|
704 |
+
{"id": 67178, "token": "造", "type": "中文单字"}
|
705 |
+
{"id": 67282, "token": "’ét", "type": "中文标点"}
|
706 |
+
{"id": 67287, "token": "姓名", "type": "中文多字"}
|
707 |
+
{"id": 67494, "token": "说明", "type": "中文多字"}
|
708 |
+
{"id": 67658, "token": "字符串", "type": "中文多字"}
|
709 |
+
{"id": 67669, "token": "径", "type": "中文单字"}
|
710 |
+
{"id": 67735, "token": "�试", "type": "中文多字"}
|
711 |
+
{"id": 67870, "token": "’e", "type": "中文标点"}
|
712 |
+
{"id": 67886, "token": " ”\n\n", "type": "中文标点"}
|
713 |
+
{"id": 67933, "token": "详", "type": "中文单字"}
|
714 |
+
{"id": 67986, "token": "验证码", "type": "中文多字"}
|
715 |
+
{"id": 67998, "token": "。\\", "type": "中文标点"}
|
716 |
+
{"id": 68171, "token": "由", "type": "中文单字"}
|
717 |
+
{"id": 68230, "token": "^", "type": "中文标点"}
|
718 |
+
{"id": 68306, "token": "’on", "type": "中文标点"}
|
719 |
+
{"id": 68379, "token": "包", "type": "中文单字"}
|
720 |
+
{"id": 68438, "token": "通过", "type": "中文多字"}
|
721 |
+
{"id": 68464, "token": "东", "type": "中文单字"}
|
722 |
+
{"id": 68850, "token": ")—", "type": "中文标点"}
|
723 |
+
{"id": 68931, "token": "论", "type": "中文单字"}
|
724 |
+
{"id": 68932, "token": "“And", "type": "中文标点"}
|
725 |
+
{"id": 69049, "token": "当前", "type": "中文多字"}
|
726 |
+
{"id": 69165, "token": "络", "type": "中文单字"}
|
727 |
+
{"id": 69253, "token": "款", "type": "中文单字"}
|
728 |
+
{"id": 69272, "token": "�藏", "type": "中文多字"}
|
729 |
+
{"id": 69362, "token": "支付", "type": "中文多字"}
|
730 |
+
{"id": 69496, "token": "启", "type": "中文单字"}
|
731 |
+
{"id": 69636, "token": "而", "type": "中文单字"}
|
732 |
+
{"id": 69856, "token": "填", "type": "中文单字"}
|
733 |
+
{"id": 69905, "token": "格式", "type": "中文多字"}
|
734 |
+
{"id": 69962, "token": "释", "type": "中文单字"}
|
735 |
+
{"id": 69978, "token": "持", "type": "中文单字"}
|
736 |
+
{"id": 70041, "token": "��索", "type": "中文多字"}
|
737 |
+
{"id": 70090, "token": "北京", "type": "中文多字"}
|
738 |
+
{"id": 70141, "token": "向", "type": "中文单字"}
|
739 |
+
{"id": 70158, "token": "输入", "type": "中文多字"}
|
740 |
+
{"id": 70203, "token": "算", "type": "中文单字"}
|
741 |
+
{"id": 70214, "token": "“So", "type": "中文标点"}
|
742 |
+
{"id": 70262, "token": "对", "type": "中文单字"}
|
743 |
+
{"id": 70277, "token": "江", "type": "中文单字"}
|
744 |
+
{"id": 70284, "token": "不存在", "type": "中文多字"}
|
745 |
+
{"id": 70349, "token": "里", "type": "中文单字"}
|
746 |
+
{"id": 70453, "token": "查", "type": "中文单字"}
|
747 |
+
{"id": 70472, "token": "如", "type": "中文单字"}
|
748 |
+
{"id": 70525, "token": "发", "type": "中文单字"}
|
749 |
+
{"id": 70542, "token": "份", "type": "中文单字"}
|
750 |
+
{"id": 70615, "token": "),", "type": "中文标点"}
|
751 |
+
{"id": 70616, "token": "责", "type": "中文单字"}
|
752 |
+
{"id": 70626, "token": "科", "type": "中文单字"}
|
753 |
+
{"id": 70694, "token": "文件", "type": "中文多字"}
|
754 |
+
{"id": 70774, "token": "类", "type": "中文单字"}
|
755 |
+
{"id": 70821, "token": "民", "type": "中文单字"}
|
756 |
+
{"id": 70924, "token": "数组", "type": "中文多字"}
|
757 |
+
{"id": 71005, "token": "治", "type": "中文单字"}
|
758 |
+
{"id": 71082, "token": "%,", "type": "中文标点"}
|
759 |
+
{"id": 71174, "token": "声", "type": "中文单字"}
|
760 |
+
{"id": 71201, "token": "—they", "type": "中文标点"}
|
761 |
+
{"id": 71208, "token": "男", "type": "中文单字"}
|
762 |
+
{"id": 71270, "token": "“(", "type": "中文标点"}
|
763 |
+
{"id": 71298, "token": "[…", "type": "中文标点"}
|
764 |
+
{"id": 71461, "token": "重新", "type": "中文多字"}
|
765 |
+
{"id": 71480, "token": "—you", "type": "中文标点"}
|
766 |
+
{"id": 71600, "token": "设计", "type": "中文多字"}
|
767 |
+
{"id": 71638, "token": "分类", "type": "中文多字"}
|
768 |
+
{"id": 71668, "token": "输出", "type": "中文多字"}
|
769 |
+
{"id": 71689, "token": "以上", "type": "中文多字"}
|
770 |
+
{"id": 71733, "token": "异常", "type": "中文多字"}
|
771 |
+
{"id": 71869, "token": "族", "type": "中文单字"}
|
772 |
+
{"id": 71890, "token": "站", "type": "中文单字"}
|
773 |
+
{"id": 72027, "token": "没", "type": "中文单字"}
|
774 |
+
{"id": 72069, "token": "参数", "type": "中文多字"}
|
775 |
+
{"id": 72099, "token": "県", "type": "中文单字"}
|
776 |
+
{"id": 72125, "token": "雅", "type": "中文单字"}
|
777 |
+
{"id": 72209, "token": "版本", "type": "中文多字"}
|
778 |
+
{"id": 72234, "token": "换", "type": "中文单字"}
|
779 |
+
{"id": 72237, "token": "核", "type": "中文单字"}
|
780 |
+
{"id": 72238, "token": "素", "type": "中文单字"}
|
781 |
+
{"id": 72318, "token": "—for", "type": "中文标点"}
|
782 |
+
{"id": 72368, "token": "都", "type": "中文单字"}
|
783 |
+
{"id": 72404, "token": "超", "type": "中文单字"}
|
784 |
+
{"id": 72434, "token": "!’", "type": "中文标点"}
|
785 |
+
{"id": 72456, "token": "网络", "type": "中文多字"}
|
786 |
+
{"id": 72516, "token": "店", "type": "中文单字"}
|
787 |
+
{"id": 72718, "token": "起", "type": "中文单字"}
|
788 |
+
{"id": 72794, "token": "隐藏", "type": "中文多字"}
|
789 |
+
{"id": 72843, "token": "享", "type": "中文单字"}
|
790 |
+
{"id": 72873, "token": "方", "type": "中文单字"}
|
791 |
+
{"id": 72917, "token": "进行", "type": "中文多字"}
|
792 |
+
{"id": 73051, "token": "是否", "type": "中文多字"}
|
793 |
+
{"id": 73071, "token": "提交", "type": "中文多字"}
|
794 |
+
{"id": 73117, "token": "发送", "type": "中文多字"}
|
795 |
+
{"id": 73164, "token": "联系", "type": "中文多字"}
|
796 |
+
{"id": 73325, "token": "拉", "type": "中文单字"}
|
797 |
+
{"id": 73329, "token": "…\n\n\n\n", "type": "中文标点"}
|
798 |
+
{"id": 73361, "token": "米", "type": "中文单字"}
|
799 |
+
{"id": 73548, "token": "系统", "type": "中文多字"}
|
800 |
+
{"id": 73686, "token": "引", "type": "中文单字"}
|
801 |
+
{"id": 73740, "token": "编号", "type": "中文多字"}
|
802 |
+
{"id": 73751, "token": "点击", "type": "中文多字"}
|
803 |
+
{"id": 73769, "token": "更", "type": "中文单字"}
|
804 |
+
{"id": 73939, "token": "…)", "type": "中文标点"}
|
805 |
+
{"id": 73958, "token": "中", "type": "中文单字"}
|
806 |
+
{"id": 73981, "token": "语", "type": "中文单字"}
|
807 |
+
{"id": 74022, "token": "”?", "type": "中文标点"}
|
808 |
+
{"id": 74090, "token": "土", "type": "中文单字"}
|
809 |
+
{"id": 74138, "token": "宋", "type": "中文单字"}
|
810 |
+
{"id": 74245, "token": "直", "type": "中文单字"}
|
811 |
+
{"id": 74257, "token": "每", "type": "中文单字"}
|
812 |
+
{"id": 74318, "token": "公司", "type": "中文多字"}
|
813 |
+
{"id": 74396, "token": "箱", "type": "中文单字"}
|
814 |
+
{"id": 74412, "token": "字", "type": "中文单字"}
|
815 |
+
{"id": 74445, "token": "项目", "type": "中文多字"}
|
816 |
+
{"id": 74482, "token": "後", "type": "中文单字"}
|
817 |
+
{"id": 74662, "token": "在", "type": "中文单字"}
|
818 |
+
{"id": 74770, "token": "可以", "type": "中文多字"}
|
819 |
+
{"id": 74843, "token": "参", "type": "中文单字"}
|
820 |
+
{"id": 75140, "token": "变", "type": "中文单字"}
|
821 |
+
{"id": 75146, "token": "基", "type": "中文单字"}
|
822 |
+
{"id": 75259, "token": "页面", "type": "中文多字"}
|
823 |
+
{"id": 75267, "token": "場", "type": "中文单字"}
|
824 |
+
{"id": 75293, "token": "待", "type": "中文单字"}
|
825 |
+
{"id": 75320, "token": "程序", "type": "中文多字"}
|
826 |
+
{"id": 75376, "token": ")。", "type": "中文标点"}
|
827 |
+
{"id": 75486, "token": "规", "type": "中文单字"}
|
828 |
+
{"id": 75493, "token": "数据库", "type": "中文多字"}
|
829 |
+
{"id": 75513, "token": "政", "type": "中文单字"}
|
830 |
+
{"id": 75550, "token": "“For", "type": "中文标点"}
|
831 |
+
{"id": 75630, "token": "雅黑", "type": "中文多字"}
|
832 |
+
{"id": 75631, "token": "软雅黑", "type": "中文多字"}
|
833 |
+
{"id": 75761, "token": "排序", "type": "中文多字"}
|
834 |
+
{"id": 75787, "token": "。\n\n\n\n\n\n", "type": "中文标点"}
|
835 |
+
{"id": 75863, "token": "也", "type": "中文单字"}
|
836 |
+
{"id": 75910, "token": "介", "type": "中文单字"}
|
837 |
+
{"id": 75976, "token": "首页", "type": "中文多字"}
|
838 |
+
{"id": 76070, "token": "—including", "type": "中文标点"}
|
839 |
+
{"id": 76099, "token": "关闭", "type": "中文多字"}
|
840 |
+
{"id": 76148, "token": ",\n\n", "type": "中文标点"}
|
841 |
+
{"id": 76161, "token": "钟", "type": "中文单字"}
|
842 |
+
{"id": 76208, "token": "五", "type": "中文单字"}
|
843 |
+
{"id": 76217, "token": "执行", "type": "中文多字"}
|
844 |
+
{"id": 76323, "token": "审", "type": "中文单字"}
|
845 |
+
{"id": 76417, "token": "单位", "type": "中文多字"}
|
846 |
+
{"id": 76455, "token": "手机号", "type": "中文多字"}
|
847 |
+
{"id": 76502, "token": "日", "type": "中文单字"}
|
848 |
+
{"id": 76505, "token": "木", "type": "中文单字"}
|
849 |
+
{"id": 76537, "token": "打", "type": "中文单字"}
|
850 |
+
{"id": 76706, "token": "活", "type": "中文单字"}
|
851 |
+
{"id": 76718, "token": "微软雅黑", "type": "中文多字"}
|
852 |
+
{"id": 76750, "token": "播", "type": "中文单字"}
|
853 |
+
{"id": 76843, "token": "!!\n\n", "type": "中文标点"}
|
854 |
+
{"id": 76858, "token": "!”", "type": "中文标点"}
|
855 |
+
{"id": 76864, "token": "!」", "type": "中文标点"}
|
856 |
+
{"id": 76868, "token": "方式", "type": "中文多字"}
|
857 |
+
{"id": 76929, "token": "—he", "type": "中文标点"}
|
858 |
+
{"id": 76982, "token": "该", "type": "中文单字"}
|
859 |
+
{"id": 77138, "token": "’am", "type": "中文标点"}
|
860 |
+
{"id": 77158, "token": "…)\n\n", "type": "中文标点"}
|
861 |
+
{"id": 77190, "token": "初始化", "type": "中文多字"}
|
862 |
+
{"id": 77195, "token": "条件", "type": "中文多字"}
|
863 |
+
{"id": 77219, "token": "記事", "type": "中文多字"}
|
864 |
+
{"id": 77284, "token": "“.", "type": "中文标点"}
|
865 |
+
{"id": 77413, "token": "展", "type": "中文单字"}
|
866 |
+
{"id": 77479, "token": ",…\n\n", "type": "中文标点"}
|
867 |
+
{"id": 77748, "token": "钮", "type": "中文单字"}
|
868 |
+
{"id": 77913, "token": "具", "type": "中文单字"}
|
869 |
+
{"id": 77937, "token": "路径", "type": "中文多字"}
|
870 |
+
{"id": 78021, "token": "退出", "type": "中文多字"}
|
871 |
+
{"id": 78111, "token": "宋体", "type": "中文多字"}
|
872 |
+
{"id": 78228, "token": "志", "type": "中文单字"}
|
873 |
+
{"id": 78244, "token": "言", "type": "中文单字"}
|
874 |
+
{"id": 78272, "token": "购", "type": "中文单字"}
|
875 |
+
{"id": 78366, "token": "……………………", "type": "中文标点"}
|
876 |
+
{"id": 78388, "token": "但", "type": "中文单字"}
|
877 |
+
{"id": 78519, "token": "星", "type": "中文单字"}
|
878 |
+
{"id": 78640, "token": "两", "type": "中文单字"}
|
879 |
+
{"id": 78657, "token": "例如", "type": "中文多字"}
|
880 |
+
{"id": 78659, "token": "左", "type": "中文单字"}
|
881 |
+
{"id": 78698, "token": "考", "type": "中文单字"}
|
882 |
+
{"id": 78935, "token": "构", "type": "中文单字"}
|
883 |
+
{"id": 78943, "token": "報", "type": "中文单字"}
|
884 |
+
{"id": 79059, "token": "球", "type": "中文单字"}
|
885 |
+
{"id": 79108, "token": "设计器", "type": "中文多字"}
|
886 |
+
{"id": 79203, "token": "更新", "type": "中文多字"}
|
887 |
+
{"id": 79656, "token": "相关", "type": "中文多字"}
|
888 |
+
{"id": 79785, "token": "音", "type": "中文单字"}
|
889 |
+
{"id": 79908, "token": "动生成", "type": "中文多字"}
|
890 |
+
{"id": 79982, "token": "端", "type": "中文单字"}
|
891 |
+
{"id": 80000, "token": "。”\n\n", "type": "中文标点"}
|
892 |
+
{"id": 80003, "token": ",默认", "type": "中文多字"}
|
893 |
+
{"id": 80019, "token": "新", "type": "中文单字"}
|
894 |
+
{"id": 80073, "token": "搜索", "type": "中文多字"}
|
895 |
+
{"id": 80078, "token": "—even", "type": "中文标点"}
|
896 |
+
{"id": 80172, "token": "投", "type": "中文单字"}
|
897 |
+
{"id": 80195, "token": "立", "type": "中文单字"}
|
898 |
+
{"id": 80356, "token": "属性", "type": "中文多字"}
|
899 |
+
{"id": 80426, "token": "�断", "type": "中文多字"}
|
900 |
+
{"id": 80578, "token": "们", "type": "中文单字"}
|
901 |
+
{"id": 80615, "token": ".…\n\n", "type": "中文标点"}
|
902 |
+
{"id": 80699, "token": "火", "type": "中文单字"}
|
903 |
+
{"id": 80804, "token": "示", "type": "中文单字"}
|
904 |
+
{"id": 80866, "token": "清", "type": "中文单字"}
|
905 |
+
{"id": 81194, "token": "金额", "type": "中文多字"}
|
906 |
+
{"id": 81201, "token": "账", "type": "中文单字"}
|
907 |
+
{"id": 81258, "token": "就", "type": "中文单字"}
|
908 |
+
{"id": 81368, "token": "费", "type": "中文单字"}
|
909 |
+
{"id": 81506, "token": "请选择", "type": "中文多字"}
|
910 |
+
{"id": 81526, "token": "示例", "type": "中文多字"}
|
911 |
+
{"id": 81543, "token": "没有", "type": "中文多字"}
|
912 |
+
{"id": 81546, "token": ":\"+", "type": "中文标点"}
|
913 |
+
{"id": 81628, "token": "查询", "type": "中文多字"}
|
914 |
+
{"id": 81646, "token": "默认", "type": "中文多字"}
|
915 |
+
{"id": 81665, "token": "结束", "type": "中文多字"}
|
916 |
+
{"id": 81742, "token": "案", "type": "中文单字"}
|
917 |
+
{"id": 81902, "token": "—with", "type": "中文标点"}
|
918 |
+
{"id": 81951, "token": "控", "type": "中文单字"}
|
919 |
+
{"id": 81976, "token": "请求", "type": "中文多字"}
|
920 |
+
{"id": 82042, "token": "广", "type": "中文单字"}
|
921 |
+
{"id": 82175, "token": "’app", "type": "中文标点"}
|
922 |
+
{"id": 82267, "token": "确认", "type": "中文多字"}
|
923 |
+
{"id": 82302, "token": "历", "type": "中文单字"}
|
924 |
+
{"id": 82317, "token": "及", "type": "中文单字"}
|
925 |
+
{"id": 82363, "token": "如果", "type": "中文多字"}
|
926 |
+
{"id": 82364, "token": "?”", "type": "中文标点"}
|
927 |
+
{"id": 82420, "token": "計", "type": "中文单字"}
|
928 |
+
{"id": 82530, "token": "、、", "type": "中文标点"}
|
929 |
+
{"id": 82533, "token": "止", "type": "中文单字"}
|
930 |
+
{"id": 82554, "token": "方法", "type": "中文多字"}
|
931 |
+
{"id": 82696, "token": "么", "type": "中文单字"}
|
932 |
+
{"id": 82768, "token": "货", "type": "中文单字"}
|
933 |
+
{"id": 82805, "token": "测试", "type": "中文多字"}
|
934 |
+
{"id": 82900, "token": "数量", "type": "中文多字"}
|
935 |
+
{"id": 82912, "token": "位置", "type": "中文多字"}
|
936 |
+
{"id": 82973, "token": "時間", "type": "中文多字"}
|
937 |
+
{"id": 83042, "token": "�权", "type": "中文多字"}
|
938 |
+
{"id": 83047, "token": "开", "type": "中文单字"}
|
939 |
+
{"id": 83125, "token": "文章", "type": "中文多字"}
|
940 |
+
{"id": 83175, "token": "阳", "type": "中文单字"}
|
941 |
+
{"id": 83266, "token": "队", "type": "中文单字"}
|
942 |
+
{"id": 83301, "token": "技", "type": "中文单字"}
|
943 |
+
{"id": 83324, "token": "场", "type": "中文单字"}
|
944 |
+
{"id": 83337, "token": "链接", "type": "中文多字"}
|
945 |
+
{"id": 83354, "token": ">", "type": "中文标点"}
|
946 |
+
{"id": 83439, "token": "添加", "type": "中文多字"}
|
947 |
+
{"id": 83639, "token": "最", "type": "中文单字"}
|
948 |
+
{"id": 83687, "token": "数字", "type": "中文多字"}
|
949 |
+
{"id": 83741, "token": "声明", "type": "中文多字"}
|
950 |
+
{"id": 83747, "token": "少", "type": "中文单字"}
|
951 |
+
{"id": 83766, "token": "…but", "type": "中文标点"}
|
952 |
+
{"id": 83799, "token": "形", "type": "中文单字"}
|
953 |
+
{"id": 83800, "token": "产品", "type": "中文多字"}
|
954 |
+
{"id": 83872, "token": "—are", "type": "中文标点"}
|
955 |
+
{"id": 83932, "token": "稿", "type": "中文单字"}
|
956 |
+
{"id": 83947, "token": "英", "type": "中文单字"}
|
957 |
+
{"id": 83994, "token": "游", "type": "中文单字"}
|
958 |
+
{"id": 84095, "token": "亿元", "type": "中文多字"}
|
959 |
+
{"id": 84131, "token": "分钟", "type": "中文多字"}
|
960 |
+
{"id": 84341, "token": ".…", "type": "中文标点"}
|
961 |
+
{"id": 84410, "token": "商", "type": "中文单字"}
|
962 |
+
{"id": 84498, "token": "“She", "type": "中文标点"}
|
963 |
+
{"id": 84765, "token": "!\",", "type": "中文标点"}
|
964 |
+
{"id": 84844, "token": "供", "type": "中文单字"}
|
965 |
+
{"id": 84851, "token": "推", "type": "中文单字"}
|
966 |
+
{"id": 84875, "token": "!\n\n\n\n", "type": "中文标点"}
|
967 |
+
{"id": 84941, "token": "—who", "type": "中文标点"}
|
968 |
+
{"id": 85155, "token": "初始化", "type": "中文多字"}
|
969 |
+
{"id": 85188, "token": "税", "type": "中文单字"}
|
970 |
+
{"id": 85284, "token": "按钮", "type": "中文多字"}
|
971 |
+
{"id": 85366, "token": "—an", "type": "中文标点"}
|
972 |
+
{"id": 85663, "token": "無し�", "type": "中文多字"}
|
973 |
+
{"id": 85707, "token": "初", "type": "中文单字"}
|
974 |
+
{"id": 85997, "token": "当", "type": "中文单字"}
|
975 |
+
{"id": 85998, "token": "!');\n", "type": "中文标点"}
|
976 |
+
{"id": 86127, "token": "私", "type": "中文单字"}
|
977 |
+
{"id": 86206, "token": "需要", "type": "中文多字"}
|
978 |
+
{"id": 86222, "token": "解", "type": "中文单字"}
|
979 |
+
{"id": 86319, "token": "—we", "type": "中文标点"}
|
980 |
+
{"id": 86348, "token": "全部", "type": "中文多字"}
|
981 |
+
{"id": 86354, "token": "景", "type": "中文单字"}
|
982 |
+
{"id": 86429, "token": "资源", "type": "中文多字"}
|
983 |
+
{"id": 86436, "token": "去", "type": "中文单字"}
|
984 |
+
{"id": 86461, "token": "华", "type": "中文单字"}
|
985 |
+
{"id": 86508, "token": "“Yes", "type": "中文标点"}
|
986 |
+
{"id": 86601, "token": "’T", "type": "中文标点"}
|
987 |
+
{"id": 86741, "token": "评论", "type": "中文多字"}
|
988 |
+
{"id": 86758, "token": "使用", "type": "中文多字"}
|
989 |
+
{"id": 86846, "token": "’B", "type": "中文标点"}
|
990 |
+
{"id": 86867, "token": "配置", "type": "中文多字"}
|
991 |
+
{"id": 87023, "token": "–and", "type": "中文标点"}
|
992 |
+
{"id": 87109, "token": "不", "type": "中文单字"}
|
993 |
+
{"id": 87177, "token": "話", "type": "中文单字"}
|
994 |
+
{"id": 87217, "token": "番", "type": "中文单字"}
|
995 |
+
{"id": 87219, "token": "问题", "type": "中文多字"}
|
996 |
+
{"id": 87247, "token": "—all", "type": "中文标点"}
|
997 |
+
{"id": 87327, "token": "报道", "type": "中文多字"}
|
998 |
+
{"id": 87412, "token": "环", "type": "中文单字"}
|
999 |
+
{"id": 87441, "token": "张", "type": "中文单字"}
|
1000 |
+
{"id": 87447, "token": "開", "type": "中文单字"}
|
1001 |
+
{"id": 87474, "token": "無しさん", "type": "中文多字"}
|
1002 |
+
{"id": 87502, "token": "种", "type": "中文单字"}
|
1003 |
+
{"id": 87646, "token": "成", "type": "中文单字"}
|
1004 |
+
{"id": 87671, "token": "—one", "type": "中文标点"}
|
1005 |
+
{"id": 87844, "token": "易", "type": "中文单字"}
|
1006 |
+
{"id": 87990, "token": "“Oh", "type": "中文标点"}
|
1007 |
+
{"id": 88108, "token": "……\n\n", "type": "中文标点"}
|
1008 |
+
{"id": 88126, "token": "您", "type": "中文单字"}
|
1009 |
+
{"id": 88161, "token": "’an", "type": "中文标点"}
|
1010 |
+
{"id": 88240, "token": "视频", "type": "中文多字"}
|
1011 |
+
{"id": 88343, "token": "》,", "type": "中文标点"}
|
1012 |
+
{"id": 88348, "token": ".’”\n\n", "type": "中文标点"}
|
1013 |
+
{"id": 88356, "token": "再", "type": "中文单字"}
|
1014 |
+
{"id": 88367, "token": "可能", "type": "中文多字"}
|
1015 |
+
{"id": 88435, "token": "文字", "type": "中文多字"}
|
1016 |
+
{"id": 88631, "token": "板", "type": "中文单字"}
|
1017 |
+
{"id": 88851, "token": "’acc", "type": "中文标点"}
|
1018 |
+
{"id": 88852, "token": "以下", "type": "中文多字"}
|
1019 |
+
{"id": 88905, "token": "电话", "type": "中文多字"}
|
1020 |
+
{"id": 88925, "token": "“Well", "type": "中文标点"}
|
1021 |
+
{"id": 88958, "token": "—from", "type": "中文标点"}
|
1022 |
+
{"id": 89046, "token": "連", "type": "中文单字"}
|
1023 |
+
{"id": 89151, "token": "真", "type": "中文单字"}
|
1024 |
+
{"id": 89186, "token": "有效", "type": "中文多字"}
|
1025 |
+
{"id": 89213, "token": "’:", "type": "中文标点"}
|
1026 |
+
{"id": 89408, "token": "今年", "type": "中文多字"}
|
1027 |
+
{"id": 89575, "token": "€“", "type": "中文标点"}
|
1028 |
+
{"id": 89753, "token": "流", "type": "中文单字"}
|
1029 |
+
{"id": 89783, "token": "余", "type": "中文单字"}
|
1030 |
+
{"id": 89874, "token": "”\n", "type": "中文标点"}
|
1031 |
+
{"id": 89902, "token": "任务", "type": "中文多字"}
|
1032 |
+
{"id": 90070, "token": "见", "type": "中文单字"}
|
1033 |
+
{"id": 90091, "token": "正确", "type": "中文多字"}
|
1034 |
+
{"id": 90112, "token": "给", "type": "中文单字"}
|
1035 |
+
{"id": 90147, "token": "服务器", "type": "中文多字"}
|
1036 |
+
{"id": 90223, "token": "’es", "type": "中文标点"}
|
1037 |
+
{"id": 90261, "token": "来源", "type": "中文多字"}
|
1038 |
+
{"id": 90354, "token": "结", "type": "中文单字"}
|
1039 |
+
{"id": 90493, "token": "。<", "type": "中文标点"}
|
1040 |
+
{"id": 90578, "token": "…\n", "type": "中文标点"}
|
1041 |
+
{"id": 90581, "token": "-", "type": "中文标点"}
|
1042 |
+
{"id": 90756, "token": "详情", "type": "中文多字"}
|
1043 |
+
{"id": 90863, "token": "—if", "type": "中文标点"}
|
1044 |
+
{"id": 91006, "token": "?」", "type": "中文标点"}
|
1045 |
+
{"id": 91077, "token": "局", "type": "中文单字"}
|
1046 |
+
{"id": 91082, "token": "主", "type": "中文单字"}
|
1047 |
+
{"id": 91240, "token": "’à", "type": "中文标点"}
|
1048 |
+
{"id": 91272, "token": "优", "type": "中文单字"}
|
1049 |
+
{"id": 91386, "token": "书", "type": "中文单字"}
|
1050 |
+
{"id": 91417, "token": "’y", "type": "中文标点"}
|
1051 |
+
{"id": 91418, "token": "’util", "type": "中文标点"}
|
1052 |
+
{"id": 91443, "token": "’hui", "type": "中文标点"}
|
1053 |
+
{"id": 91466, "token": "一页", "type": "中文多字"}
|
1054 |
+
{"id": 91495, "token": ",并", "type": "中文多字"}
|
1055 |
+
{"id": 91547, "token": "发布", "type": "中文多字"}
|
1056 |
+
{"id": 91763, "token": "思", "type": "中文单字"}
|
1057 |
+
{"id": 91774, "token": "見", "type": "中文单字"}
|
1058 |
+
{"id": 91837, "token": ":<", "type": "中文标点"}
|
1059 |
+
{"id": 91875, "token": "動", "type": "中文单字"}
|
1060 |
+
{"id": 91940, "token": "运", "type": "中文单字"}
|
1061 |
+
{"id": 91951, "token": "审核", "type": "中文多字"}
|
1062 |
+
{"id": 91967, "token": "图", "type": "中文单字"}
|
1063 |
+
{"id": 91985, "token": "样", "type": "中文单字"}
|
1064 |
+
{"id": 92019, "token": "其中", "type": "中文多字"}
|
1065 |
+
{"id": 92056, "token": "权限", "type": "中文多字"}
|
1066 |
+
{"id": 92099, "token": "删除成功", "type": "中文多字"}
|
1067 |
+
{"id": 92113, "token": " “…", "type": "中文标点"}
|
1068 |
+
{"id": 92150, "token": "�新", "type": "中文多字"}
|
1069 |
+
{"id": 92193, "token": "(笑", "type": "中文多字"}
|
1070 |
+
{"id": 92211, "token": ",《", "type": "中文标点"}
|
1071 |
+
{"id": 92264, "token": ",’”", "type": "中文标点"}
|
1072 |
+
{"id": 92318, "token": "时间", "type": "中文多字"}
|
1073 |
+
{"id": 92366, "token": "】,", "type": "中文标点"}
|
1074 |
+
{"id": 92378, "token": ")\r\n", "type": "中文标点"}
|
1075 |
+
{"id": 92382, "token": "定义", "type": "中文多字"}
|
1076 |
+
{"id": 92517, "token": "关", "type": "中文单字"}
|
1077 |
+
{"id": 92527, "token": "登", "type": "中文单字"}
|
1078 |
+
{"id": 92553, "token": "销", "type": "中文单字"}
|
1079 |
+
{"id": 92555, "token": "万元", "type": "中文多字"}
|
1080 |
+
{"id": 92672, "token": "同时", "type": "中文多字"}
|
1081 |
+
{"id": 92693, "token": "無料", "type": "中文多字"}
|
1082 |
+
{"id": 92748, "token": "’all", "type": "中文标点"}
|
1083 |
+
{"id": 92776, "token": "即", "type": "中文单字"}
|
1084 |
+
{"id": 92780, "token": "只", "type": "中文单字"}
|
1085 |
+
{"id": 92877, "token": "老", "type": "中文单字"}
|
1086 |
+
{"id": 93056, "token": "、“", "type": "中文标点"}
|
1087 |
+
{"id": 93115, "token": "岁", "type": "中文单字"}
|
1088 |
+
{"id": 93126, "token": "’Brien", "type": "中文标点"}
|
1089 |
+
{"id": 93132, "token": "大小", "type": "中文多字"}
|
1090 |
+
{"id": 93233, "token": "找", "type": "中文单字"}
|
1091 |
+
{"id": 93269, "token": "“These", "type": "中文标点"}
|
1092 |
+
{"id": 93393, "token": "实", "type": "中文单字"}
|
1093 |
+
{"id": 93413, "token": "或", "type": "中文单字"}
|
1094 |
+
{"id": 93446, "token": "“\n\n", "type": "中文标点"}
|
1095 |
+
{"id": 93474, "token": "节点", "type": "中文多字"}
|
1096 |
+
{"id": 93598, "token": "若", "type": "中文单字"}
|
1097 |
+
{"id": 93636, "token": "小时", "type": "中文多字"}
|
1098 |
+
{"id": 93673, "token": "“To", "type": "中文标点"}
|
1099 |
+
{"id": 93830, "token": "—\"", "type": "中文标点"}
|
1100 |
+
{"id": 93922, "token": "’autres", "type": "中文标点"}
|
1101 |
+
{"id": 93994, "token": "其他", "type": "中文多字"}
|
1102 |
+
{"id": 94134, "token": "自治", "type": "中文多字"}
|
1103 |
+
{"id": 94249, "token": "分享", "type": "中文多字"}
|
1104 |
+
{"id": 94345, "token": "’ex", "type": "中文标点"}
|
1105 |
+
{"id": 94366, "token": "稍", "type": "中文单字"}
|
1106 |
+
{"id": 94518, "token": "…the", "type": "中文标点"}
|
1107 |
+
{"id": 94537, "token": "�件", "type": "中文多字"}
|
1108 |
+
{"id": 94588, "token": "达", "type": "中文单字"}
|
1109 |
+
{"id": 94668, "token": "邮箱", "type": "中文多字"}
|
1110 |
+
{"id": 94720, "token": "新增", "type": "中文多字"}
|
1111 |
+
{"id": 94785, "token": "提", "type": "中文单字"}
|
1112 |
+
{"id": 94895, "token": ":%", "type": "中文标点"}
|
1113 |
+
{"id": 94923, "token": "院", "type": "中文单字"}
|
1114 |
+
{"id": 94983, "token": "加", "type": "中文单字"}
|
1115 |
+
{"id": 95001, "token": "価", "type": "中文单字"}
|
1116 |
+
{"id": 95221, "token": "気", "type": "中文单字"}
|
1117 |
+
{"id": 95337, "token": "约", "type": "中文单字"}
|
1118 |
+
{"id": 95399, "token": "速", "type": "中文单字"}
|
1119 |
+
{"id": 95475, "token": "停", "type": "中文单字"}
|
1120 |
+
{"id": 95532, "token": "?\n", "type": "中文标点"}
|
1121 |
+
{"id": 95543, "token": "反", "type": "中文单字"}
|
1122 |
+
{"id": 95544, "token": "票", "type": "中文单字"}
|
1123 |
+
{"id": 95598, "token": "十", "type": "中文单字"}
|
1124 |
+
{"id": 96153, "token": ",则", "type": "中文多字"}
|
1125 |
+
{"id": 96197, "token": ",—", "type": "中文标点"}
|
1126 |
+
{"id": 96203, "token": "“At", "type": "中文标点"}
|
1127 |
+
{"id": 96206, "token": "’)", "type": "中文标点"}
|
1128 |
+
{"id": 96332, "token": "[…]", "type": "中文标点"}
|
1129 |
+
{"id": 96356, "token": "身", "type": "中文单字"}
|
1130 |
+
{"id": 96407, "token": "商品", "type": "中文多字"}
|
1131 |
+
{"id": 96412, "token": "含", "type": "中文单字"}
|
1132 |
+
{"id": 96455, "token": "率", "type": "中文单字"}
|
1133 |
+
{"id": 96500, "token": "汽", "type": "中文单字"}
|
1134 |
+
{"id": 96511, "token": "专", "type": "中文单字"}
|
1135 |
+
{"id": 96555, "token": "/", "type": "中文标点"}
|
1136 |
+
{"id": 96557, "token": "管理员", "type": "中文多字"}
|
1137 |
+
{"id": 97049, "token": "歳", "type": "中文单字"}
|
1138 |
+
{"id": 97150, "token": ",在", "type": "中文多字"}
|
1139 |
+
{"id": 97360, "token": ".–", "type": "中文标点"}
|
1140 |
+
{"id": 97432, "token": "”。\n\n", "type": "中文标点"}
|
1141 |
+
{"id": 97518, "token": "関", "type": "中文单字"}
|
1142 |
+
{"id": 97522, "token": "议", "type": "中文单字"}
|
1143 |
+
{"id": 97565, "token": "雷", "type": "中文单字"}
|
1144 |
+
{"id": 97655, "token": "正在", "type": "中文多字"}
|
1145 |
+
{"id": 97908, "token": "�能", "type": "中文多字"}
|
1146 |
+
{"id": 97999, "token": "。(", "type": "中文标点"}
|
1147 |
+
{"id": 98128, "token": "自动生成", "type": "中文多字"}
|
1148 |
+
{"id": 98134, "token": "’elle", "type": "中文标点"}
|
1149 |
+
{"id": 98184, "token": "些", "type": "中文单字"}
|
1150 |
+
{"id": 98220, "token": "界", "type": "中文单字"}
|
1151 |
+
{"id": 98245, "token": "陆", "type": "中文单字"}
|
1152 |
+
{"id": 98261, "token": "注意", "type": "中文多字"}
|
1153 |
+
{"id": 98390, "token": "备注", "type": "中文多字"}
|
1154 |
+
{"id": 98406, "token": "倍", "type": "中文单字"}
|
1155 |
+
{"id": 98458, "token": ",’’", "type": "中文标点"}
|
1156 |
+
{"id": 98476, "token": "“How", "type": "中文标点"}
|
1157 |
+
{"id": 98499, "token": "読", "type": "中文单字"}
|
1158 |
+
{"id": 98580, "token": "价格", "type": "中文多字"}
|
1159 |
+
{"id": 98657, "token": "检", "type": "中文单字"}
|
1160 |
+
{"id": 98711, "token": "我的", "type": "中文多字"}
|
1161 |
+
{"id": 98739, "token": "我们", "type": "中文多字"}
|
1162 |
+
{"id": 98806, "token": "还", "type": "中文单字"}
|
1163 |
+
{"id": 98871, "token": "析", "type": "中文单字"}
|
1164 |
+
{"id": 98897, "token": "企", "type": "中文单字"}
|
1165 |
+
{"id": 98915, "token": "友", "type": "中文单字"}
|
1166 |
+
{"id": 99007, "token": "”的", "type": "中文多字"}
|
1167 |
+
{"id": 99072, "token": "。www", "type": "中文标点"}
|
1168 |
+
{"id": 99083, "token": "“All", "type": "中文标点"}
|
1169 |
+
{"id": 99313, "token": ",…", "type": "中文标点"}
|
1170 |
+
{"id": 99337, "token": "简", "type": "中文单字"}
|
1171 |
+
{"id": 99379, "token": "移到", "type": "中文多字"}
|
1172 |
+
{"id": 99382, "token": ")”", "type": "中文标点"}
|
1173 |
+
{"id": 99397, "token": "問", "type": "中文单字"}
|
1174 |
+
{"id": 99480, "token": "功能", "type": "中文多字"}
|
1175 |
+
{"id": 99496, "token": "若要", "type": "中文多字"}
|
1176 |
+
{"id": 99502, "token": "长度", "type": "中文多字"}
|
1177 |
+
{"id": 99563, "token": "—at", "type": "中文标点"}
|
1178 |
+
{"id": 99643, "token": "】,【", "type": "中文标点"}
|
1179 |
+
{"id": 99741, "token": "装", "type": "中文单字"}
|
1180 |
+
{"id": 99750, "token": "感", "type": "中文单字"}
|
1181 |
+
{"id": 99771, "token": "哈", "type": "中文单字"}
|
1182 |
+
{"id": 99799, "token": "“One", "type": "中文标点"}
|
1183 |
+
{"id": 99849, "token": "何", "type": "中文单字"}
|
1184 |
+
{"id": 99941, "token": "预", "type": "中文单字"}
|
1185 |
+
{"id": 100065, "token": "~\n\n", "type": "中文标点"}
|
1186 |
+
{"id": 100066, "token": "送料", "type": "中文多字"}
|
1187 |
+
{"id": 100067, "token": "…it", "type": "中文标点"}
|
1188 |
+
{"id": 100179, "token": "尔", "type": "中文单字"}
|
1189 |
+
{"id": 100207, "token": "在线", "type": "中文多字"}
|
utils/log_util.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
import logging
|
3 |
|
4 |
logging.basicConfig(
|
5 |
-
format='%(asctime)s
|
6 |
level=logging.INFO,
|
7 |
datefmt="%Y-%m-%d %H:%M:%S",
|
8 |
|
|
|
2 |
import logging
|
3 |
|
4 |
logging.basicConfig(
|
5 |
+
format='[%(asctime)s] [%(levelname)s] [%(process)d:%(thread)d] [%(filename)s:%(lineno)d:%(funcName)s] %(message)s',
|
6 |
level=logging.INFO,
|
7 |
datefmt="%Y-%m-%d %H:%M:%S",
|
8 |
|
utils/zh_util.py
CHANGED
@@ -52,7 +52,7 @@ def iter_vocab(tokenizer, name="", from_cache=True):
|
|
52 |
if has_chinese(decode_str):
|
53 |
# bert词典有 ##开头的
|
54 |
# byteBPE词典有带空格的
|
55 |
-
decode_str = decode_str.strip().replace("#", "")
|
56 |
zh_token_count["total"] += 1
|
57 |
if len(decode_str) > 1:
|
58 |
zh_token_count["中文多字"] += 1
|
@@ -93,4 +93,6 @@ if __name__ == "__main__":
|
|
93 |
# test_coding_length(jd_vocab_tokens, filter=lambda k: not is_chinese(k))
|
94 |
# test_coding_length(zh_punc)
|
95 |
# test_coding_length(zh_iterator())
|
96 |
-
|
|
|
|
|
|
52 |
if has_chinese(decode_str):
|
53 |
# bert词典有 ##开头的
|
54 |
# byteBPE词典有带空格的
|
55 |
+
decode_str = decode_str.strip().replace("#", "") # TODO, 按类型
|
56 |
zh_token_count["total"] += 1
|
57 |
if len(decode_str) > 1:
|
58 |
zh_token_count["中文多字"] += 1
|
|
|
93 |
# test_coding_length(jd_vocab_tokens, filter=lambda k: not is_chinese(k))
|
94 |
# test_coding_length(zh_punc)
|
95 |
# test_coding_length(zh_iterator())
|
96 |
+
|
97 |
+
from vocab.gpt_35_turbo import tokenizer
|
98 |
+
iter_vocab(tokenizer)
|
vocab/README.md
CHANGED
@@ -86,4 +86,6 @@ https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
|
|
86 |
|
87 |
|
88 |
|
89 |
-
##
|
|
|
|
|
|
86 |
|
87 |
|
88 |
|
89 |
+
## reversible and lossless
|
90 |
+
|
91 |
+
It's reversible and lossless, so you can convert tokens back into the original text.
|
vocab/__init__.py
CHANGED
@@ -24,8 +24,12 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c
|
|
24 |
- tiktoken
|
25 |
- icetk
|
26 |
- hf_tokenizer
|
27 |
-
-
|
28 |
-
|
|
|
|
|
|
|
|
|
29 |
- tiktoken
|
30 |
- 特征:空格就是空格,
|
31 |
- 示例:gpt3.5 gpt4
|
@@ -57,8 +61,8 @@ all_tokenizers = [
|
|
57 |
"moss",
|
58 |
#
|
59 |
# ######
|
60 |
-
|
61 |
-
|
62 |
#
|
63 |
# #### bloom 系列
|
64 |
"bloom",
|
@@ -69,7 +73,7 @@ all_tokenizers = [
|
|
69 |
# "gpt_neox_chinese_v1",
|
70 |
#
|
71 |
# ##### glm系列
|
72 |
-
|
73 |
"chatglm_6b",
|
74 |
"chatglm2-6b",
|
75 |
#
|
@@ -80,13 +84,14 @@ all_tokenizers = [
|
|
80 |
# "chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
|
81 |
# "belle_llama_ext_7b",
|
82 |
# "alpaca_7b",
|
83 |
-
"
|
|
|
84 |
"qwen",
|
85 |
"internlm_chat_7b",
|
86 |
-
"
|
|
|
87 |
]
|
88 |
|
89 |
-
|
90 |
class TokenizerType(Enum):
|
91 |
"""
|
92 |
- https://huggingface.co/docs/transformers/tokenizer_summary
|
|
|
24 |
- tiktoken
|
25 |
- icetk
|
26 |
- hf_tokenizer
|
27 |
+
- 特征:
|
28 |
+
- .model 是 tokenizer.models.BPE 类型
|
29 |
+
- 词典有 Ġ "\u0120" 开头
|
30 |
+
- 有1个tokenizer.json(包括 merge vocab),或者分开独立文件
|
31 |
+
- .model.from_file .model.save .model.token_to_id .model.tokenize
|
32 |
+
- 示例:gpt_neox_20b, moss, bloom
|
33 |
- tiktoken
|
34 |
- 特征:空格就是空格,
|
35 |
- 示例:gpt3.5 gpt4
|
|
|
61 |
"moss",
|
62 |
#
|
63 |
# ######
|
64 |
+
"chatyuan_large_v2",
|
65 |
+
"prompt_clue",
|
66 |
#
|
67 |
# #### bloom 系列
|
68 |
"bloom",
|
|
|
73 |
# "gpt_neox_chinese_v1",
|
74 |
#
|
75 |
# ##### glm系列
|
76 |
+
"glm_chinese",
|
77 |
"chatglm_6b",
|
78 |
"chatglm2-6b",
|
79 |
#
|
|
|
84 |
# "chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
|
85 |
# "belle_llama_ext_7b",
|
86 |
# "alpaca_7b",
|
87 |
+
"baichuan",
|
88 |
+
"baichuan2",
|
89 |
"qwen",
|
90 |
"internlm_chat_7b",
|
91 |
+
"falcon_180b",
|
92 |
+
# "goat",
|
93 |
]
|
94 |
|
|
|
95 |
class TokenizerType(Enum):
|
96 |
"""
|
97 |
- https://huggingface.co/docs/transformers/tokenizer_summary
|
vocab/{baichuan_7b → baichuan}/Baichuan-7B/config.json
RENAMED
File without changes
|
vocab/{baichuan_7b → baichuan}/Baichuan-7B/configuration_baichuan.py
RENAMED
File without changes
|
vocab/{baichuan_7b → baichuan}/Baichuan-7B/special_tokens_map.json
RENAMED
File without changes
|
vocab/{baichuan_7b → baichuan}/Baichuan-7B/tokenization_baichuan.py
RENAMED
File without changes
|
vocab/{baichuan_7b → baichuan}/Baichuan-7B/tokenizer.model
RENAMED
File without changes
|
vocab/{baichuan_7b → baichuan}/Baichuan-7B/tokenizer_config.json
RENAMED
File without changes
|
vocab/{baichuan_7b → baichuan}/__init__.py
RENAMED
File without changes
|
vocab/{baichuan_7b → baichuan}/demo.py
RENAMED
File without changes
|
vocab/baichuan2/__init__.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer
|
2 |
+
from vocab import TokenizerType
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan2-7B-Chat", trust_remote_code=True)
|
5 |
+
|
6 |
+
|
7 |
+
# byte-bpe sentencepiece
|
8 |
+
tokenizer.type = TokenizerType.ByteBPE
|
9 |
+
|
10 |
+
tokenizer.comments = "expand the vocabulary size from 64000 in Baichuan1 to 125696"
|
vocab/bloom/test_tokenizer.py
CHANGED
@@ -12,6 +12,8 @@ print("vocab size:", tokenizer.vocab_size)
|
|
12 |
tokens = tokenizer.encode("中")
|
13 |
decode_line = tokenizer.decode(tokens)
|
14 |
|
|
|
|
|
15 |
|
16 |
def id2token(ids):
|
17 |
return tokenizer.convert_ids_to_tokens(ids)
|
|
|
12 |
tokens = tokenizer.encode("中")
|
13 |
decode_line = tokenizer.decode(tokens)
|
14 |
|
15 |
+
tokenizer.save_vocabulary("tmp", "ddd")
|
16 |
+
|
17 |
|
18 |
def id2token(ids):
|
19 |
return tokenizer.convert_ids_to_tokens(ids)
|
vocab/chinese_llama2/__init__.py
CHANGED
@@ -1,3 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from transformers import LlamaTokenizer
|
2 |
|
3 |
tokenizer = LlamaTokenizer.from_pretrained("ziqingyang/chinese-llama-2-7b")
|
|
|
1 |
+
"""
|
2 |
+
## 词典扩容
|
3 |
+
32000 <pad>
|
4 |
+
32001 但
|
5 |
+
|
6 |
+
"""
|
7 |
+
|
8 |
from transformers import LlamaTokenizer
|
9 |
|
10 |
tokenizer = LlamaTokenizer.from_pretrained("ziqingyang/chinese-llama-2-7b")
|
vocab/falcon_180b/__init__.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
|
5 |
+
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
6 |
+
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
|
7 |
+
|
8 |
+
|
9 |
+
|
10 |
+
# tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-180b") # token
|
11 |
+
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
|
vocab/falcon_180b/tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
">>TITLE<<",
|
4 |
+
">>ABSTRACT<<",
|
5 |
+
">>INTRODUCTION<<",
|
6 |
+
">>SUMMARY<<",
|
7 |
+
">>COMMENT<<",
|
8 |
+
">>ANSWER<<",
|
9 |
+
">>QUESTION<<",
|
10 |
+
">>DOMAIN<<",
|
11 |
+
">>PREFIX<<",
|
12 |
+
">>SUFFIX<<",
|
13 |
+
">>MIDDLE<<"
|
14 |
+
],
|
15 |
+
"eos_token": "<|endoftext|>"
|
16 |
+
}
|
vocab/falcon_180b/tokenizer/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
vocab/falcon_180b/tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"eos_token": "<|endoftext|>",
|
4 |
+
"model_max_length": 2048,
|
5 |
+
"name_or_path": "tiiuae/falcon-40b",
|
6 |
+
"special_tokens_map_file": null,
|
7 |
+
"tokenizer_class": "PreTrainedTokenizerFast"
|
8 |
+
}
|
vocab/gpt_35_turbo/__init__.py
CHANGED
@@ -7,6 +7,8 @@ from utils.log_util import logger
|
|
7 |
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
|
8 |
tokenizer.vocab_size = tokenizer.n_vocab
|
9 |
|
|
|
|
|
10 |
|
11 |
|
12 |
def decode(self, tokens, errors="replace"):
|
@@ -20,8 +22,11 @@ def decode(self, tokens, errors="replace"):
|
|
20 |
def convert_ids_to_tokens(self, tokens):
|
21 |
return tokenizer.decode_tokens_bytes(tokens)
|
22 |
|
23 |
-
def get_vocab(self):
|
24 |
-
"""Returns vocab as a dict
|
|
|
|
|
|
|
25 |
vocab = {}
|
26 |
key_error_list = []
|
27 |
unicode_decode_error_list = []
|
@@ -29,11 +34,13 @@ def get_vocab(self):
|
|
29 |
try:
|
30 |
token_byte = self.convert_ids_to_tokens([i])[0]
|
31 |
token_str = token_byte.decode("utf-8")
|
32 |
-
vocab[
|
33 |
-
except KeyError: # 100256 100261-100275
|
34 |
key_error_list.append(i)
|
35 |
-
|
|
|
36 |
unicode_decode_error_list.append((i, str(token_byte)))
|
|
|
37 |
|
38 |
# vocab.update(self.added_tokens_encoder)
|
39 |
logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
|
@@ -41,6 +48,8 @@ def get_vocab(self):
|
|
41 |
return vocab
|
42 |
|
43 |
|
|
|
|
|
44 |
Encoding.decode = decode
|
45 |
Encoding.convert_ids_to_tokens = convert_ids_to_tokens
|
46 |
Encoding.get_vocab = get_vocab
|
|
|
7 |
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
|
8 |
tokenizer.vocab_size = tokenizer.n_vocab
|
9 |
|
10 |
+
tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"
|
11 |
+
tokenizer.reversible = True # It's reversible and lossless, so you can convert tokens back into the original text
|
12 |
|
13 |
|
14 |
def decode(self, tokens, errors="replace"):
|
|
|
22 |
def convert_ids_to_tokens(self, tokens):
|
23 |
return tokenizer.decode_tokens_bytes(tokens)
|
24 |
|
25 |
+
def get_vocab(self, token_type="str"):
|
26 |
+
"""Returns vocab as a dict
|
27 |
+
:param token_type: ["str", "byte"]
|
28 |
+
:return:
|
29 |
+
"""
|
30 |
vocab = {}
|
31 |
key_error_list = []
|
32 |
unicode_decode_error_list = []
|
|
|
34 |
try:
|
35 |
token_byte = self.convert_ids_to_tokens([i])[0]
|
36 |
token_str = token_byte.decode("utf-8")
|
37 |
+
vocab[token_byte] = i
|
38 |
+
except KeyError: # 16 KeyError, 100256 100261-100275
|
39 |
key_error_list.append(i)
|
40 |
+
# vocab[f"[KeyError]-{i}"] = i
|
41 |
+
except UnicodeDecodeError: # 773 UnicodeDecodeError
|
42 |
unicode_decode_error_list.append((i, str(token_byte)))
|
43 |
+
vocab[token_byte] = i
|
44 |
|
45 |
# vocab.update(self.added_tokens_encoder)
|
46 |
logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
|
|
|
48 |
return vocab
|
49 |
|
50 |
|
51 |
+
|
52 |
+
# tiktoken patch
|
53 |
Encoding.decode = decode
|
54 |
Encoding.convert_ids_to_tokens = convert_ids_to_tokens
|
55 |
Encoding.get_vocab = get_vocab
|
vocab/gpt_35_turbo/aaa.py
CHANGED
@@ -17,6 +17,11 @@ import tiktoken
|
|
17 |
|
18 |
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
|
19 |
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
for token_id in [100263, 99834]: # special_tokens: 100257-100260 100276
|
22 |
try:
|
|
|
17 |
|
18 |
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
|
19 |
|
20 |
+
tokens = [100263, 99834]
|
21 |
+
|
22 |
+
tokenizer.decode(tokens)
|
23 |
+
|
24 |
+
tokenizer._core_bpe.decode_bytes(tokens).decode("utf-8", errors="replace")
|
25 |
|
26 |
for token_id in [100263, 99834]: # special_tokens: 100257-100260 100276
|
27 |
try:
|
vocab/gpt_4/__init__.py
CHANGED
@@ -1,48 +1,3 @@
|
|
1 |
|
2 |
-
|
3 |
-
import tiktoken
|
4 |
-
from tiktoken import Encoding
|
5 |
-
from utils.log_util import logger
|
6 |
-
|
7 |
-
tokenizer = tiktoken.encoding_for_model('gpt-4')
|
8 |
-
tokenizer.vocab_size = tokenizer.n_vocab
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
def decode(self, tokens, errors="replace"):
|
13 |
-
# def decode(self, tokens: list[int], errors: str = "replace") -> str:
|
14 |
-
try:
|
15 |
-
decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
|
16 |
-
except:
|
17 |
-
decode_str = "null"
|
18 |
-
return decode_str
|
19 |
-
|
20 |
-
def convert_ids_to_tokens(self, tokens):
|
21 |
-
return tokenizer.decode_tokens_bytes(tokens)
|
22 |
-
|
23 |
-
def get_vocab(self):
|
24 |
-
"""Returns vocab as a dict"""
|
25 |
-
vocab = {}
|
26 |
-
key_error_list = []
|
27 |
-
unicode_decode_error_list = []
|
28 |
-
for i in range(self.vocab_size):
|
29 |
-
try:
|
30 |
-
token_byte = self.convert_ids_to_tokens([i])[0]
|
31 |
-
token_str = token_byte.decode("utf-8")
|
32 |
-
vocab[token_str] = i
|
33 |
-
except KeyError: # 100256 100261-100275
|
34 |
-
key_error_list.append(i)
|
35 |
-
except UnicodeDecodeError: # 特别多
|
36 |
-
unicode_decode_error_list.append((i, str(token_byte)))
|
37 |
-
|
38 |
-
# vocab.update(self.added_tokens_encoder)
|
39 |
-
logger.info(f"gpt-4 {len(key_error_list)} KeyError: {key_error_list}")
|
40 |
-
logger.info(f"gpt-4 {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
|
41 |
-
return vocab
|
42 |
-
|
43 |
-
|
44 |
-
Encoding.decode = decode
|
45 |
-
Encoding.convert_ids_to_tokens = convert_ids_to_tokens
|
46 |
-
Encoding.get_vocab = get_vocab
|
47 |
-
|
48 |
|
|
|
1 |
|
2 |
+
from vocab.gpt_35_turbo import tokenizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json
CHANGED
@@ -255,6 +255,8 @@
|
|
255 |
"end_of_word_suffix": null,
|
256 |
"fuse_unk": false,
|
257 |
"vocab": {
|
|
|
|
|
258 |
"531": 531,
|
259 |
"541": 541,
|
260 |
"566": 566,
|
|
|
255 |
"end_of_word_suffix": null,
|
256 |
"fuse_unk": false,
|
257 |
"vocab": {
|
258 |
+
"<|endoftext|>": 0,
|
259 |
+
"<|padding|>": 1,
|
260 |
"531": 531,
|
261 |
"541": 541,
|
262 |
"566": 566,
|
vocab/gpt_neox_chinese_v1/mock.py
CHANGED
@@ -1,17 +1,32 @@
|
|
1 |
import copy
|
2 |
import json
|
|
|
3 |
|
4 |
-
|
|
|
5 |
|
6 |
-
tokenizer = json.load(open(input_path, "r", encoding="utf-8"))
|
7 |
|
8 |
-
vocab = tokenizer["model"]["vocab"]
|
|
|
9 |
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import copy
|
2 |
import json
|
3 |
+
from tokenizers import Tokenizer
|
4 |
|
5 |
+
def export_mock_tokenizer():
|
6 |
+
input_path = "20B_tokenizer_chinese.json"
|
7 |
|
8 |
+
tokenizer = json.load(open(input_path, "r", encoding="utf-8"))
|
9 |
|
10 |
+
vocab = tokenizer["model"]["vocab"]
|
11 |
+
added_tokens = [token["id"] for token in tokenizer["added_tokens"]]
|
12 |
|
13 |
+
for k, v in copy.deepcopy(vocab).items():
|
14 |
+
if v not in added_tokens:
|
15 |
+
vocab[str(v)] = v
|
16 |
+
vocab.pop(k)
|
17 |
|
18 |
+
out_path = input_path.replace(".json", ".mock.json")
|
19 |
+
with open(out_path, "w", encoding="utf-8") as f_out:
|
20 |
+
f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))
|
21 |
|
22 |
+
|
23 |
+
def mock2():
|
24 |
+
pass
|
25 |
+
|
26 |
+
|
27 |
+
def load_mock_tokenizer():
|
28 |
+
tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
|
29 |
+
print('')
|
30 |
+
|
31 |
+
export_mock_tokenizer()
|
32 |
+
load_mock_tokenizer()
|
vocab/gpt_neox_chinese_v1/trouble-shooting.md
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
## Exception: data did not match any variant of untagged enum ModelWrapper at line 108219 column 3
|
4 |
+
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
## The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !
|
9 |
+
|
10 |
+
|
11 |
+
```
|
12 |
+
The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !
|
13 |
+
The OrderedVocab you are attempting to save contains a hole for index 50255, your vocabulary could be corrupted !
|
14 |
+
The OrderedVocab you are attempting to save contains a hole for index 50256, your vocabulary could be corrupted !
|
15 |
+
```
|
16 |
+
|
17 |
+
|
18 |
+
原因:50254 这些token并未在vocab中定义,只在 `added_tokens` 里定义了。
|
19 |
+
|
20 |
+
## ss
|
21 |
+
|
22 |
+
|
vocab/llama/__init__.py
CHANGED
@@ -1,7 +1,20 @@
|
|
1 |
|
2 |
"""
|
3 |
|
|
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
"""
|
6 |
|
7 |
import os
|
|
|
1 |
|
2 |
"""
|
3 |
|
4 |
+
## 指令 special token
|
5 |
|
6 |
+
{"token_id": 29961, "decode_str": "[", "token": "["}
|
7 |
+
{"token_id": 25580, "decode_str": "INST", "token": "INST"}
|
8 |
+
{"token_id": 29962, "decode_str": "]", "token": "]"}
|
9 |
+
|
10 |
+
{"token_id": 3532, "decode_str": "<<", "token": "▁<<"}
|
11 |
+
{"token_id": 14816, "decode_str": "SY", "token": "SY"}
|
12 |
+
{"token_id": 29903, "decode_str": "S", "token": "S"}
|
13 |
+
{"token_id": 6778, "decode_str": ">>", "token": ">>"}
|
14 |
+
|
15 |
+
{"token_id": 13, "decode_str": "\n", "token": "<0x0A>"}
|
16 |
+
|
17 |
+
疑问:为什么不将 <<SYS>> <</SYS>> [INST] [/INST] 做成1个id?
|
18 |
"""
|
19 |
|
20 |
import os
|
vocab/llama/demo.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
from transformers import LlamaTokenizer
|
5 |
+
|
6 |
+
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
7 |
+
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
|
8 |
+
|
9 |
+
|
10 |
+
|
11 |
+
tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR)
|
12 |
+
|
13 |
+
|
14 |
+
tokens = [ 1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492,
|
15 |
+
526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889,
|
16 |
+
29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641,
|
17 |
+
9109, 29889, 29871, 3575, 6089, 881, 451, 3160, 738, 10311,
|
18 |
+
1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916,
|
19 |
+
391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793,
|
20 |
+
29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443,
|
21 |
+
5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644,
|
22 |
+
263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338,
|
23 |
+
451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012,
|
24 |
+
310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915,
|
25 |
+
29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016,
|
26 |
+
29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816,
|
27 |
+
29903, 6778, 13, 13, 15970, 526, 366, 518, 29914, 25580,
|
28 |
+
29962]
|
29 |
+
|
30 |
+
text = tokenizer.decode(tokens)
|
31 |
+
print(text)
|
32 |
+
for token_id in tokens:
|
33 |
+
print(json.dumps({"token_id": token_id, "decode_str": tokenizer.decode([token_id]), "token": tokenizer.convert_ids_to_tokens([token_id][0])}, ensure_ascii=False))
|