eson committed on
Commit
d10ecd7
1 Parent(s): 1ee0570
Files changed (34)
  1. app.py +76 -119
  2. style.css +32 -0
  3. util.py +94 -0
  4. utils/zh_util.py +9 -4
  5. vocab/{alpaca_7b → Intern_gpt}/README.md +0 -0
  6. vocab/README.md +3 -1
  7. vocab/__init__.py +40 -11
  8. vocab/{bert_en → _alpaca_7b}/README.md +0 -0
  9. vocab/{goat → _goat}/README.md +0 -0
  10. tokenizer.py → vocab/_goat/__init__.py +0 -0
  11. vocab/baichuan_7b/demo.py +3 -0
  12. vocab/bert_base_cased/README.md +0 -0
  13. vocab/bert_base_cased/__init__.py +3 -0
  14. vocab/{bert_chinese → bert_base_chinese}/README.md +0 -0
  15. vocab/{bert_chinese → bert_base_chinese}/__init__.py +0 -0
  16. vocab/{bert_chinese → bert_base_chinese}/test.py +0 -0
  17. vocab/{bert_chinese → bert_base_chinese}/test_zh_coding_len.py +0 -0
  18. vocab/{bert_chinese → bert_base_chinese}/tokenizer/config.json +0 -0
  19. vocab/{bert_chinese → bert_base_chinese}/tokenizer/tokenizer.json +0 -0
  20. vocab/{bert_chinese → bert_base_chinese}/tokenizer/tokenizer_config.json +0 -0
  21. vocab/{bert_chinese → bert_base_chinese}/tokenizer/vocab.txt +0 -0
  22. vocab/{bert_chinese → bert_base_chinese}/vocab.txt +0 -0
  23. vocab/bert_base_uncased/__init__.py +3 -0
  24. vocab/chatglm2_6b/__init__.py +2 -0
  25. vocab/gpt_35_turbo/__init__.py +17 -1
  26. vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py +1 -1
  27. vocab/gpt_neox_chinese_v1/to_v2/test2.py +1 -1
  28. vocab/gpt_nexo_20b/__init__.py +1 -0
  29. vocab/internlm_chat_7b/README.md +0 -0
  30. vocab/internlm_chat_7b/__init__.py +6 -0
  31. vocab/kplug/__init__.py +5 -0
  32. vocab/llama/__init__.py +1 -1
  33. vocab/llama2/__init__.py +0 -0
  34. vocab/moss/test_tokenizer.py +3 -4
app.py CHANGED
@@ -3,6 +3,12 @@
3
  # time: 2022/8/23 16:06
4
 
5
  """
6
 
7
  plots
8
 
@@ -19,21 +25,11 @@ table
19
  [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
20
  """
21
 
22
- import json
23
- import pandas as pd
24
  import gradio as gr
25
 
26
- from vocab import all_tokenizers, load_tokener
27
-
28
- # 显示空格:https://blog.csdn.net/liuxiao723846/article/details/118994673
29
- # 隐藏legend:
30
- css = """
31
- .space-show {white-space: pre-wrap;}
32
- .cell-wrap {white-space: pre-wrap;}
33
- .category-legend {display: none !important}
34
- .statistics textarea {min-width: min(50px,100%) !important; font-size: 20px !important; font-weight: 600 !important; text-align: center !important; border: none !important;}
35
- .statistics label {text-align: center !important;}
36
- """
37
 
38
  example_text = """Replace this text in the input field to see how tokenization works
39
  华为智能音箱发布:华为Sound X"""
@@ -42,81 +38,18 @@ example_text = """Replace this text in the input field to see how tokenization w
42
  examples = [
43
  # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
44
  ["标点测试:,。!?;", "baichuan_7b", "llama"],
45
- ["符号测试:🦙", "baichuan_7b", "llama"],
46
- ["中文测试:🦙", "baichuan_7b", "llama"],
47
  ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
48
  ]
49
 
50
 
 
 
51
 
52
- def tokenize(text, tokenizer_type, color_num=5):
53
- """
54
- TODO: cache tokenizer
55
- """
56
- print(text, tokenizer_type)
57
- pos_tokens = []
58
- tokenizer = load_tokener(tokenizer_type)
59
- encoding = tokenizer.encode(text)
60
-
61
- table = []
62
-
63
- for idx, token_id in enumerate(encoding):
64
- decode_text = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd"
65
- pos_tokens.extend([(decode_text, str(idx % color_num))])
66
-
67
- # token "Byte": # 这是 utf-8编码吧?
68
- token = tokenizer.convert_ids_to_tokens([token_id])[0]
69
- if isinstance(token, bytes):
70
- try:
71
- token_str = token.decode("utf-8")
72
- except:
73
- token_str = token.decode("utf-8", errors="ignore")
74
- print("decode_error", token, token_str)
75
-
76
- token_bytes = token
77
- json_dumps = json.dumps(token_str)
78
- elif isinstance(token, str):
79
- token_str = token
80
- token_bytes = bytes(token_str, "utf-8")
81
- json_dumps = json.dumps(token_str)
82
- else:
83
- return
84
-
85
-
86
- # ⭐
87
- table.append(
88
- {"TokenID": token_id,
89
- "Token": token_str, # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
90
- "Text": decode_text, #
91
- # "Bytes": token_bytes, # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
92
- "Bytes": str(token_bytes),
93
- # "Unicode": json_dumps # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
94
- }
95
- )
96
-
97
- table_df = pd.DataFrame(table)
98
- print(table)
99
- # print(table_df)
100
-
101
- return pos_tokens, table_df, len(encoding)
102
-
103
-
104
- def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
105
- pos_tokens_1, table_df_1, token_size_1 = tokenize(text, tokenizer_type_1)
106
- pos_tokens_2, table_df_2, token_size_2 = tokenize(text, tokenizer_type_2)
107
- return pos_tokens_1, table_df_1, token_size_1, pos_tokens_2, table_df_2, token_size_2
108
 
109
 
110
- def get_vocab_size(tokenizer_type):
111
- tokenizer = load_tokener(tokenizer_type)
112
- return tokenizer.vocab_size
113
-
114
- def test_coding():
115
- bytes1 = b'\xe4\xb8\xad'
116
- print(bytes1) # b'\xe4\xb8\xad'
117
-
118
-
119
- with gr.Blocks(css=css) as demo:
120
  gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
121
  # links: https://www.coderstool.com/utf8-encoding-decoding
122
  # 功能:输入文本,进行分词
@@ -125,16 +58,29 @@ with gr.Blocks(css=css) as demo:
125
  #
126
  # Byte: 表示分词
127
 
 
 
128
 
129
- gr.Markdown("## Input Text")
130
  user_input = gr.Textbox(
131
  value=example_text,
132
  label="Input Text",
133
  lines=5,
134
  show_label=False,
135
  ) # placeholder="Enter sentence here..."
 
 
 
 
136
 
137
- # submitBtn = gr.Button("生成回复", variant="primary")
138
 
139
  gr.Markdown("## Tokenization")
140
 
@@ -156,18 +102,24 @@ with gr.Blocks(css=css) as demo:
156
  lines=1,
157
  elem_classes="statistics"
158
  )
159
- stats_token_size_1 = gr.TextArea(
160
- label="Tokens",
 
161
  lines=1,
162
  elem_classes="statistics"
163
  )
164
- stats_3 = gr.TextArea(
165
- label="Compress Rate",
166
  lines=1,
167
  elem_classes="statistics"
168
  )
 
 
 
 
 
169
  # https://www.onlinewebfonts.com/icon/418591
170
- gr.Image("images/VS.svg", scale=1, show_label=False, show_download_button=False, container=False) # height=10,
171
  with gr.Column(scale=6):
172
  with gr.Group():
173
  tokenizer_type_2 = gr.Dropdown(
@@ -182,19 +134,23 @@ with gr.Blocks(css=css) as demo:
182
  lines=1,
183
  elem_classes="statistics"
184
  )
185
- stats_token_size_2 = gr.TextArea(
186
- label="Tokens",
 
187
  lines=1,
188
  elem_classes="statistics"
189
  )
190
- stats_6 = gr.TextArea(
191
- label="Compress Rate",
 
 
 
 
 
192
  lines=1,
193
  elem_classes="statistics"
194
  )
195
 
196
-
197
-
198
  # TODO: 图 表 压缩率
199
  with gr.Row():
200
  with gr.Column():
@@ -212,41 +168,42 @@ with gr.Blocks(css=css) as demo:
212
 
213
  with gr.Row():
214
  output_table_1 = gr.Dataframe(
215
- # headers=["TokenID", "Byte", "Text"],
216
- # datatype=["str", "str", "str"],
217
  # elem_classes="space-show", # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
218
  )
219
  output_table_2 = gr.Dataframe(
220
- # headers=["TokenID", "Token", "Text"],
221
- # datatype=["str", "str", "str"],
222
  )
223
 
224
- tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1, stats_token_size_1])
225
- tokenizer_type_1.change(get_vocab_size, [tokenizer_type_1], [stats_vocab_size_1])
 
 
 
 
226
 
227
  user_input.change(tokenize_pair,
228
  [user_input, tokenizer_type_1, tokenizer_type_2],
229
- [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2])
230
-
231
- tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2, stats_token_size_2])
232
- tokenizer_type_2.change(get_vocab_size, [tokenizer_type_2], [stats_vocab_size_2])
233
-
234
- gr.Examples(
235
- examples,
236
- [user_input, tokenizer_type_1, tokenizer_type_2],
237
- [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2],
238
- tokenize_pair,
239
- cache_examples=True,
 
240
  )
241
 
242
- # submitBtn.click(tokenize, [user_input, tokenizer_type], outputs,
243
- # show_progress=True)
244
 
245
- # examples=[
246
- # ["What a beautiful morning for a walk!"],
247
- # ["It was the best of times, it was the worst of times."],
248
- # ["多个空格 It ss was the best of times, it was the worst of times."],
249
- # ]
250
 
251
  if __name__ == "__main__":
252
- demo.launch()
 
 
3
  # time: 2022/8/23 16:06
4
 
5
  """
6
+ ## TODO:
7
+ 1. token数,放到 label里
8
+ 2. http get方式获取参数,
9
+ 3. 自启动
10
+ 4.
11
+
12
 
13
  plots
14
 
 
25
  [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
26
  """
27
 
28
+
 
29
  import gradio as gr
30
 
31
+ from vocab import all_tokenizers
32
+ from util import *
 
 
33
 
34
  example_text = """Replace this text in the input field to see how tokenization works
35
  华为智能音箱发布:华为Sound X"""
 
38
  examples = [
39
  # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
40
  ["标点测试:,。!?;", "baichuan_7b", "llama"],
41
+ ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
42
+ ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
43
  ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
44
  ]
45
 
46
 
47
+ def example_fn(example_idx):
48
+ return examples[example_idx]
49
 
 
 
50
 
51
 
52
+ with gr.Blocks(css="style.css") as demo:
 
 
 
 
 
 
 
 
 
53
  gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
54
  # links: https://www.coderstool.com/utf8-encoding-decoding
55
  # 功能:输入文本,进行分词
 
58
  #
59
  # Byte: 表示分词
60
 
61
+ with gr.Row():
62
+ gr.Markdown("## Input Text")
63
+ dropdown_examples = gr.Dropdown(
64
+ ["Example1", "Example2", "Example3"],
65
+ value="Examples",
66
+ type="index",
67
+ show_label=False,
68
+ container=False,
69
+ scale=0,
70
+ elem_classes="example-style"
71
+ )
72
 
 
73
  user_input = gr.Textbox(
74
  value=example_text,
75
  label="Input Text",
76
  lines=5,
77
  show_label=False,
78
  ) # placeholder="Enter sentence here..."
79
+ # gr.Examples(
80
+ # examples,
81
+ # None,
82
+ # )
83
 
 
84
 
85
  gr.Markdown("## Tokenization")
86
 
 
102
  lines=1,
103
  elem_classes="statistics"
104
  )
105
+ stats_zh_token_size_1 = gr.TextArea(
106
+ # value="1252/1455",
107
+ label="ZH char/word",
108
  lines=1,
109
  elem_classes="statistics"
110
  )
111
+ stats_overlap_token_size_1 = gr.TextArea(
112
+ label="Overlap Tokens",
113
  lines=1,
114
  elem_classes="statistics"
115
  )
116
+ # stats_3 = gr.TextArea(
117
+ # label="Compress Rate",
118
+ # lines=1,
119
+ # elem_classes="statistics"
120
+ # )
121
  # https://www.onlinewebfonts.com/icon/418591
122
+ gr.Image("images/VS.svg", scale=1, show_label=False, show_download_button=False, container=False) # height=10,
123
  with gr.Column(scale=6):
124
  with gr.Group():
125
  tokenizer_type_2 = gr.Dropdown(
 
134
  lines=1,
135
  elem_classes="statistics"
136
  )
137
+ stats_zh_token_size_2 = gr.TextArea( # 中文单子数,
138
+ # value="12/45",
139
+ label="ZH char/word",
140
  lines=1,
141
  elem_classes="statistics"
142
  )
143
+ # stats_6 = gr.TextArea(
144
+ # label="Compress Rate",
145
+ # lines=1,
146
+ # elem_classes="statistics"
147
+ # )
148
+ stats_overlap_token_size_2 = gr.TextArea(
149
+ label="Overlap Tokens",
150
  lines=1,
151
  elem_classes="statistics"
152
  )
153
 
 
 
154
  # TODO: 图 表 压缩率
155
  with gr.Row():
156
  with gr.Column():
 
168
 
169
  with gr.Row():
170
  output_table_1 = gr.Dataframe(
171
+ headers=["TokenID", "Byte", "Text"],
172
+ datatype=["str", "str", "str"],
173
  # elem_classes="space-show", # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
174
  )
175
  output_table_2 = gr.Dataframe(
176
+ headers=["TokenID", "Token", "Text"],
177
+ datatype=["str", "str", "str"],
178
  )
179
 
180
+ tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
181
+ [output_text_1, output_table_1])
182
+ # 下面两个好像可以合并
183
+ tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
184
+ tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
185
+ [stats_overlap_token_size_1, stats_overlap_token_size_2])
186
 
187
  user_input.change(tokenize_pair,
188
  [user_input, tokenizer_type_1, tokenizer_type_2],
189
+ [output_text_1, output_table_1, output_text_2, output_table_2])
190
+
191
+ tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2],
192
+ [output_text_2, output_table_2])
193
+ tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
194
+ tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
195
+ [stats_overlap_token_size_1, stats_overlap_token_size_2])
196
+
197
+ dropdown_examples.change(
198
+ example_fn,
199
+ dropdown_examples,
200
+ [user_input, tokenizer_type_1, tokenizer_type_2]
201
  )
202
 
203
+ # start up 初始化
204
+ gr.update(lines=2, visible=True, value="Short story: ")
205
 
 
 
 
 
 
206
 
207
  if __name__ == "__main__":
208
+ demo.queue(max_size=20).launch()
209
+ # demo.launch()
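The example loader added above replaces `gr.Examples` with a plain dropdown plus a callback. A minimal, self-contained sketch of that wiring, assuming a recent Gradio 3.x (the two-example list and component labels here are illustrative):

```python
import gradio as gr

# Preset (text, tokenizer_1, tokenizer_2) triples, as in the examples list above.
examples = [
    ["标点测试:,。!?;", "baichuan_7b", "llama"],
    ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
]

def example_fn(example_idx):
    # type="index" below makes the dropdown pass the selected position, not the label.
    return examples[example_idx]

with gr.Blocks() as demo:
    dropdown_examples = gr.Dropdown(
        [f"Example{i + 1}" for i in range(len(examples))],
        type="index",
        show_label=False,
    )
    user_input = gr.Textbox(lines=5, show_label=False)
    tokenizer_type_1 = gr.Dropdown(["baichuan_7b", "llama"], label="Tokenizer 1")
    tokenizer_type_2 = gr.Dropdown(["baichuan_7b", "llama"], label="Tokenizer 2")

    # Selecting an example fills all three inputs at once, mirroring app.py.
    dropdown_examples.change(
        example_fn,
        dropdown_examples,
        [user_input, tokenizer_type_1, tokenizer_type_2],
    )

if __name__ == "__main__":
    demo.queue(max_size=20).launch()
```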
style.css ADDED
@@ -0,0 +1,32 @@
1
+
2
+ /* 显示空格:https://blog.csdn.net/liuxiao723846/article/details/118994673 */
3
+ .space-show {
4
+ white-space: pre-wrap;
5
+ }
6
+
7
+ .cell-wrap {
8
+ white-space: pre-wrap;
9
+ }
10
+
11
+ /* 隐藏legend */
12
+ .category-legend {
13
+ display: none !important;
14
+ }
15
+
16
+ .statistics textarea {
17
+ min-width: min(50px, 100%) !important;
18
+ font-size: 20px !important;
19
+ font-weight: 600 !important;
20
+ text-align: center !important;
21
+ border: none !important;
22
+ }
23
+
24
+ .statistics label {
25
+ text-align: center !important;
26
+ }
27
+
28
+ /* align-self: flex-end; */
29
+ .example-style {
30
+ max-width: 150px;
31
+ align-self: self-end;
32
+ }
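These rules only take effect because app.py now loads the stylesheet with `gr.Blocks(css="style.css")` and tags components via `elem_classes`. A minimal sketch of that hookup, assuming Gradio >= 3.22 (where `elem_classes` exists); the component names are illustrative:

```python
import gradio as gr

# style.css is attached once at the Blocks level; individual components opt into
# its rules ("statistics", "space-show", ...) through elem_classes.
with gr.Blocks(css="style.css") as demo:
    stats_vocab_size = gr.TextArea(label="VocabSize", lines=1, elem_classes="statistics")
    tokens_view = gr.HighlightedText(label="Tokens", elem_classes="space-show")

if __name__ == "__main__":
    demo.launch()
```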
util.py ADDED
@@ -0,0 +1,94 @@
1
+
2
+
3
+ import gradio as gr
4
+ import json
5
+ import pandas as pd
6
+ from vocab import load_tokener
7
+ from utils.zh_util import iter_vocab
8
+
9
+
10
+
11
+
12
+ def tokenize(text, tokenizer_type, color_num=5):
13
+ """
14
+ TODO: cache tokenizer
15
+ """
16
+ print(f"入参:tokenize, {text}, {tokenizer_type}")
17
+ pos_tokens = []
18
+ tokenizer = load_tokener(tokenizer_type)
19
+ encoding = tokenizer.encode(text)
20
+
21
+ table = []
22
+
23
+ for idx, token_id in enumerate(encoding):
24
+ decode_text = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd"
25
+ pos_tokens.extend([(decode_text, str(idx % color_num))])
26
+
27
+ # token "Byte": # 这是 utf-8编码吧?
28
+ token = tokenizer.convert_ids_to_tokens([token_id])[0]
29
+ if isinstance(token, bytes):
30
+ try:
31
+ token_str = token.decode("utf-8")
32
+ except:
33
+ token_str = token.decode("utf-8", errors="ignore")
34
+ print("decode_error", tokenizer_type, token, token_str)
35
+
36
+ token_bytes = token
37
+ json_dumps = json.dumps(token_str)
38
+ elif isinstance(token, str):
39
+ token_str = token
40
+ token_bytes = bytes(token_str, "utf-8")
41
+ json_dumps = json.dumps(token_str)
42
+ else:
43
+ return
44
+
45
+ # ⭐
46
+ table.append(
47
+ {"TokenID": token_id,
48
+ "Token": token_str, # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
49
+ "Text": decode_text, #
50
+ # "Bytes": token_bytes, # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
51
+ "Bytes": str(token_bytes),
52
+ # "Unicode": json_dumps # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
53
+ }
54
+ )
55
+
56
+ table_df = pd.DataFrame(table)
57
+ print(f"Tokenization[{tokenizer_type}]: {table}")
58
+ # print(table_df)
59
+
60
+ return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
61
+
62
+
63
+ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
64
+ pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
65
+ pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
66
+ return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
67
+
68
+
69
+ def basic_count(tokenizer_type):
70
+ tokenizer = load_tokener(tokenizer_type)
71
+ stats = iter_vocab(tokenizer, tokenizer_type)
72
+ return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
73
+
74
+
75
+ def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
76
+ tokenizer1 = load_tokener(tokenizer_type_1)
77
+ tokenizer2 = load_tokener(tokenizer_type_2)
78
+ vocab1 = tokenizer1.get_vocab()
79
+ vocab2 = tokenizer2.get_vocab()
80
+ overlap_tokens = vocab1.keys() & vocab2.keys()
81
+ overlap_token_size = len(overlap_tokens)
82
+ print(f"OverlapTokens: {tokenizer_type_1}, {tokenizer_type_2} {list(overlap_tokens)[:10]}")
83
+ return overlap_token_size, overlap_token_size
84
+
85
+
86
+
87
+
88
+ def test_coding():
89
+ bytes1 = b'\xe4\xb8\xad'
90
+ print(bytes1) # b'\xe4\xb8\xad'
91
+
92
+
93
+ if __name__ == "__main__":
94
+ print(basic_count("internlm_chat_7b"))
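The `TODO: cache tokenizer` note above is still open; a hypothetical way to close it is to memoize `load_tokener`, so repeated dropdown changes reuse the already-constructed tokenizer instead of re-importing it (the `_cached` name is illustrative, not part of the commit):

```python
from functools import lru_cache

from vocab import load_tokener


@lru_cache(maxsize=None)
def load_tokener_cached(tokenizer_type: str):
    # The cache key is the plain string name from the dropdown, so every call with
    # the same value returns the same tokenizer object.
    return load_tokener(tokenizer_type)


if __name__ == "__main__":
    tok_a = load_tokener_cached("baichuan_7b")   # built once
    tok_b = load_tokener_cached("baichuan_7b")   # served from the cache
    assert tok_a is tok_b
```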
utils/zh_util.py CHANGED
@@ -37,8 +37,12 @@ def get_coding_length(tokenizer, vocab, filter=None):
37
  def has_zh_char(text):
38
  return any(ch in zh_punc for ch in text)
39
 
40
 
41
- def iter_vocab(tokenizer, name=""):
42
  f_out = open(name + "_vocab.zh.jsonl", "w", encoding="utf-8")
43
  zh_token_count = {"total": 0, "中文单字": 0, "中文多字": 0}
44
  all_single_zh_tokens = set()
@@ -72,16 +76,17 @@ def iter_vocab(tokenizer, name=""):
72
  # TODO: 繁体字,简体字
73
  zh_token_count["中文单字-去重后"] = len(all_single_zh_tokens)
74
 
75
- return {
76
  "name": name,
77
  "impl": str(tokenizer.__class__),
78
  "vocab_size": tokenizer.vocab_size,
79
- "中文汉字数": str(zh_token_count),
80
  "中文标点数": zh_symbol_count,
81
  "中文汉字编码长度均值": mean_length,
82
  "中文汉字编码长度分布": json.dumps(dist_length),
83
-
84
  }
 
 
85
 
86
 
87
  if __name__ == "__main__":
 
37
  def has_zh_char(text):
38
  return any(ch in zh_punc for ch in text)
39
 
40
+ cache = {}
41
+
42
+ def iter_vocab(tokenizer, name="", from_cache=True):
43
+ if from_cache and name in cache:
44
+ return cache[name]
45
 
 
46
  f_out = open(name + "_vocab.zh.jsonl", "w", encoding="utf-8")
47
  zh_token_count = {"total": 0, "中文单字": 0, "中文多字": 0}
48
  all_single_zh_tokens = set()
 
76
  # TODO: 繁体字,简体字
77
  zh_token_count["中文单字-去重后"] = len(all_single_zh_tokens)
78
 
79
+ result = {
80
  "name": name,
81
  "impl": str(tokenizer.__class__),
82
  "vocab_size": tokenizer.vocab_size,
83
+ "中文汉字数": zh_token_count,
84
  "中文标点数": zh_symbol_count,
85
  "中文汉字编码长度均值": mean_length,
86
  "中文汉字编码长度分布": json.dumps(dist_length),
 
87
  }
88
+ cache[name] = result
89
+ return result
90
 
91
 
92
  if __name__ == "__main__":
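A usage sketch of the memoized `iter_vocab` (the repo id below is an assumption; any Hugging Face tokenizer works the same way):

```python
from transformers import AutoTokenizer
from utils.zh_util import iter_vocab

tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)

stats = iter_vocab(tokenizer, name="baichuan_7b")        # computed, written to baichuan_7b_vocab.zh.jsonl
stats_again = iter_vocab(tokenizer, name="baichuan_7b")  # second call is served from the module-level cache
assert stats is stats_again
print(stats["vocab_size"], stats["中文汉字数"])
```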
vocab/{alpaca_7b → Intern_gpt}/README.md RENAMED
File without changes
vocab/README.md CHANGED
@@ -11,9 +11,9 @@ gpt-neox词典
11
  ## decode
12
 
13
  bert词典有个特殊字符 #
14
- gpt词典有个特殊字符 G
15
 
16
  gpt-neox词典呢?
 
17
 
18
 
19
  ## 关于分词粒度
@@ -80,6 +80,8 @@ https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
80
  - 功能符号: `<|endoftext|>` 表示换行。tab? 空格?
81
  - 很多数字独立编码,几乎上千个。
82
 
 
 
83
  ## 空格、tab、换行
84
 
85
 
 
11
  ## decode
12
 
13
  bert词典有个特殊字符 #
 
14
 
15
  gpt-neox词典呢?
16
+ - _开头表示空格或句首
17
 
18
 
19
  ## 关于分词粒度
 
80
  - 功能符号: `<|endoftext|>` 表示换行。tab? 空格?
81
  - 很多数字独立编码,几乎上千个。
82
 
83
+ - 类似的还有:moss
84
+
85
  ## 空格、tab、换行
86
 
87
 
vocab/__init__.py CHANGED
@@ -1,16 +1,39 @@
1
  import importlib
2
  from enum import Enum, auto
3
 
4
-
5
- """
6
- Interface:
7
- -
8
 
9
  tokenizer.parent = ""
 
 
10
  tokenizer.type = TokenizerType.ByteBPE.name
11
  tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
 
12
  tokenizer.comments = "split all numbers into individual digits, " \
13
  "and fallback to bytes to decompose unknown UTF-8 characters"
 
14
  """
15
 
16
  Animal = Enum('Animal', 'ANT BEE CAT DOG')
@@ -21,9 +44,13 @@ uniq_tokenizers = [
21
 
22
  all_tokenizers = [
23
  "gpt_35_turbo",
 
24
  "gpt2",
25
  "gpt2_chinese",
26
- "bert_chinese",
 
 
 
27
  "moss",
28
  #
29
  # ######
@@ -31,7 +58,7 @@ all_tokenizers = [
31
  # "prompt_clue",
32
  #
33
  # #### bloom 系列
34
- # "bloom",
35
  # "bloomz_6b4_zh",
36
  # "belle_7b_2m", # 模型和词典都基于bloom
37
  #
@@ -41,19 +68,21 @@ all_tokenizers = [
41
  # ##### glm系列
42
  # "glm_chinese",
43
  "chatglm_6b",
 
44
  #
45
  # #### llama alpaca系列
46
- "llama", # '中文单字': 700, '中文多字': 0
47
  "chinese_llama_lora_7b", #
48
  # "chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
49
  # "belle_llama_ext_7b",
50
  # "alpaca_7b",
51
  "baichuan_7b",
52
- "qwen"
 
 
53
  ]
54
 
55
 
56
-
57
  class TokenizerType(Enum):
58
  """
59
  - https://huggingface.co/docs/transformers/tokenizer_summary
@@ -105,10 +134,10 @@ class TokenizerImpl(Enum):
105
  BertTokenizer = auto() #
106
 
107
 
108
-
109
  def load_tokener(model_name):
110
  tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
111
  return tokenizer
112
 
 
113
  if __name__ == "__main__":
114
- pass
 
1
  import importlib
2
  from enum import Enum, auto
3
 
4
+ """Interface:
5
+ tokenizer.encode
6
+ tokenizer.decode
7
+ tokenizer.convert_ids_to_tokens
8
 
9
  tokenizer.parent = ""
10
+ tokenizer.vocab_size
11
+ tokenizer.get_vocab() # gpt-neox-20b, llama
12
  tokenizer.type = TokenizerType.ByteBPE.name
13
  tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
14
+ - bert
15
+ - 特征
16
+ - 示例:
17
+ - gpt2
18
+ - 特征:
19
+ - sentencepiece:
20
+ - 特征:.sp_model 是SentencePieceProcessor类型,sp_model.id_to_piece,有tokenizer.json tokenizer.model,词典字符有 ▁,
21
+ - 示例:llama,baichuan
22
+ - tiktoken
23
+ - icetk
24
+ - hf_tokenizer
25
+ - 特征:.model 是 tokenizer.models.BPE 类型,词典有 Ġ "\u0120" 开头,有 merge.txt
26
+ - 示例:gpt_neox_20b, moss
27
+ - gpt3.5 gpt4
28
+ - 特征:tiktoken
29
  tokenizer.comments = "split all numbers into individual digits, " \
30
  "and fallback to bytes to decompose unknown UTF-8 characters"
31
+
32
+ tokenizer.all_special_tokens # baichuan
33
+ tokenizer.special_tokens_set # gpt3.5_turbo
34
+ tokenizer.special_tokens_map
35
+
36
+ tokenizer.dependency [sentencepiece, tiktoken, icetk]
37
  """
38
 
39
  Animal = Enum('Animal', 'ANT BEE CAT DOG')
 
44
 
45
  all_tokenizers = [
46
  "gpt_35_turbo",
47
+ "gpt4",
48
  "gpt2",
49
  "gpt2_chinese",
50
+ "bert_base_cased",
51
+ "bert_base_uncased",
52
+ "bert_base_chinese",
53
+ "kplug",
54
  "moss",
55
  #
56
  # ######
 
58
  # "prompt_clue",
59
  #
60
  # #### bloom 系列
61
+ "bloom",
62
  # "bloomz_6b4_zh",
63
  # "belle_7b_2m", # 模型和词典都基于bloom
64
  #
 
68
  # ##### glm系列
69
  # "glm_chinese",
70
  "chatglm_6b",
71
+ "chatglm2-6b",
72
  #
73
  # #### llama alpaca系列
74
+ "llama", # '中文单字': 700, '中文多字': 0
75
  "chinese_llama_lora_7b", #
76
  # "chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
77
  # "belle_llama_ext_7b",
78
  # "alpaca_7b",
79
  "baichuan_7b",
80
+ "qwen",
81
+ "internlm_chat_7b",
82
+ "goat",
83
  ]
84
 
85
 
 
86
  class TokenizerType(Enum):
87
  """
88
  - https://huggingface.co/docs/transformers/tokenizer_summary
 
134
  BertTokenizer = auto() #
135
 
136
 
 
137
  def load_tokener(model_name):
138
  tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
139
  return tokenizer
140
 
141
+
142
  if __name__ == "__main__":
143
+ pass
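A usage sketch of the loader: `load_tokener` resolves `vocab.<name>` with importlib and returns that package's module-level `tokenizer`, so every entry in `all_tokenizers` is assumed to be a subpackage of `vocab/` whose `__init__.py` builds a `tokenizer` exposing `encode` and `vocab_size`:

```python
from vocab import all_tokenizers, load_tokener

for name in all_tokenizers[:3]:          # e.g. "gpt_35_turbo", "gpt4", "gpt2"
    tokenizer = load_tokener(name)
    ids = tokenizer.encode("华为Sound X")
    print(name, tokenizer.vocab_size, len(ids))
```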
vocab/{bert_en → _alpaca_7b}/README.md RENAMED
File without changes
vocab/{goat → _goat}/README.md RENAMED
File without changes
tokenizer.py → vocab/_goat/__init__.py RENAMED
File without changes
vocab/baichuan_7b/demo.py ADDED
@@ -0,0 +1,3 @@
1
+
2
+ from vocab.baichuan_7b import tokenizer
3
+
vocab/bert_base_cased/README.md ADDED
File without changes
vocab/bert_base_cased/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+
2
+ from transformers import BertTokenizer
3
+ tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
vocab/{bert_chinese → bert_base_chinese}/README.md RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/__init__.py RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/test.py RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/test_zh_coding_len.py RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/tokenizer/config.json RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/tokenizer/tokenizer.json RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/tokenizer/tokenizer_config.json RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/tokenizer/vocab.txt RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/vocab.txt RENAMED
File without changes
vocab/bert_base_uncased/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+
2
+ from transformers import BertTokenizer
3
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
vocab/chatglm2_6b/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from transformers import AutoTokenizer
2
+ tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
vocab/gpt_35_turbo/__init__.py CHANGED
@@ -4,10 +4,10 @@ import tiktoken
4
  from tiktoken import Encoding
5
 
6
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
7
-
8
  tokenizer.vocab_size = tokenizer.n_vocab
9
 
10
 
 
11
  def decode(self, tokens, errors="replace"):
12
  # def decode(self, tokens: list[int], errors: str = "replace") -> str:
13
  try:
@@ -19,8 +19,24 @@ def decode(self, tokens, errors="replace"):
19
  def convert_ids_to_tokens(self, tokens):
20
  return tokenizer.decode_tokens_bytes(tokens)
21
 
 
 
23
  Encoding.decode = decode
24
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
 
25
 
26
 
 
4
  from tiktoken import Encoding
5
 
6
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
 
7
  tokenizer.vocab_size = tokenizer.n_vocab
8
 
9
 
10
+
11
  def decode(self, tokens, errors="replace"):
12
  # def decode(self, tokens: list[int], errors: str = "replace") -> str:
13
  try:
 
19
  def convert_ids_to_tokens(self, tokens):
20
  return tokenizer.decode_tokens_bytes(tokens)
21
 
22
+ def get_vocab(self):
23
+ """Returns vocab as a dict"""
24
+ vocab = {}
25
+ for i in range(self.vocab_size):
26
+ try:
27
+ token_byte = self.convert_ids_to_tokens([i])[0]
28
+ token_str = token_byte.decode("utf-8")
29
+ vocab[token_str] = i
30
+ except KeyError:
31
+ print("gpt_35_turbo decode KeyError", i)
32
+ except UnicodeDecodeError:
33
+ print("gpt_35_turbo decode UnicodeDecodeError", i, str(token_byte))
34
+ # vocab.update(self.added_tokens_encoder)
35
+ return vocab
36
+
37
 
38
  Encoding.decode = decode
39
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
40
+ Encoding.get_vocab = get_vocab
41
 
42
 
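A quick check of the monkey-patched tiktoken `Encoding` (a sketch; `get_vocab` is what lets `util.get_overlap_token_size` treat this tokenizer like a Hugging Face one):

```python
from vocab.gpt_35_turbo import tokenizer

ids = tokenizer.encode("华为智能音箱发布:华为Sound X")
print(ids)
print(tokenizer.convert_ids_to_tokens(ids))   # raw token bytes via decode_tokens_bytes
print(len(tokenizer.get_vocab()))             # ids that fail UTF-8 decoding are skipped
```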
vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py CHANGED
@@ -47,7 +47,7 @@ def append_token(word_list, base_tokenizer, output_tokenizer_path, unused_ids=No
47
  data, base_tokenizer = base_tokenizer
48
  vocab = data["model"]["vocab"]
49
  merges = data["model"]["merges"]
50
- vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
51
 
52
  for word in word_list:
53
  encoding = base_tokenizer.encode(word)
 
47
  data, base_tokenizer = base_tokenizer
48
  vocab = data["model"]["vocab"]
49
  merges = data["model"]["merges"]
50
+ vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
51
 
52
  for word in word_list:
53
  encoding = base_tokenizer.encode(word)
vocab/gpt_neox_chinese_v1/to_v2/test2.py CHANGED
@@ -21,7 +21,7 @@ def append_token(word_list, base_tokenizer, unused_ids=None):
21
  data, base_tokenizer = base_tokenizer
22
  vocab = data["model"]["vocab"]
23
  merges = data["model"]["merges"]
24
- vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
25
 
26
  for word in word_list:
27
  encoding = base_tokenizer.encode(word)
 
21
  data, base_tokenizer = base_tokenizer
22
  vocab = data["model"]["vocab"]
23
  merges = data["model"]["merges"]
24
+ vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
25
 
26
  for word in word_list:
27
  encoding = base_tokenizer.encode(word)
vocab/gpt_nexo_20b/__init__.py CHANGED
@@ -21,3 +21,4 @@ tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
21
  # tokenizer.vocab_size = tokenizer.get_vocab_size(with_added_tokens=True)
22
 
23
 
 
 
21
  # tokenizer.vocab_size = tokenizer.get_vocab_size(with_added_tokens=True)
22
 
23
 
24
+
vocab/internlm_chat_7b/README.md ADDED
File without changes
vocab/internlm_chat_7b/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ https://huggingface.co/internlm/internlm-chat-7b
3
+ """
4
+
5
+ from transformers import AutoTokenizer
6
+ tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
vocab/kplug/__init__.py CHANGED
@@ -0,0 +1,5 @@
1
+
2
+ from transformers import BertTokenizer
3
+
4
+ tokenizer = BertTokenizer.from_pretrained("eson/kplug-base-encoder")
5
+ print(tokenizer)
vocab/llama/__init__.py CHANGED
@@ -20,4 +20,4 @@ tokenizer.parent = ""
20
  tokenizer.type = TokenizerType.ByteBPE.name
21
  tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
22
  tokenizer.comments = "split all numbers into individual digits, " \
23
- "and fallback to bytes to decompose unknown UTF-8 characters"
 
20
  tokenizer.type = TokenizerType.ByteBPE.name
21
  tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
22
  tokenizer.comments = "split all numbers into individual digits, " \
23
+ "and fallback to bytes to decompose unknown UTF-8 characters"
vocab/llama2/__init__.py ADDED
File without changes
vocab/moss/test_tokenizer.py CHANGED
@@ -3,6 +3,8 @@
3
  vocab size: 106029
4
 
5
  中文汉字数:54230, 中文标点数: 549
 
 
6
  """
7
 
8
  import json
@@ -21,15 +23,12 @@ for token in tokens:
21
  print(token, tokenizer.decode([token]))
22
 
23
 
24
- def id2token(ids):
25
- return tokenizer.convert_ids_to_tokens(ids)
26
-
27
  def test_token():
28
  for word in "中国解决方法黑白侗,。!?;":
29
  encoding = tokenizer.encode(word)
30
  for token_id in encoding:
31
  decode_str = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd"
32
- token = id2token([token_id])
33
  print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
34
 
35
 
 
3
  vocab size: 106029
4
 
5
  中文汉字数:54230, 中文标点数: 549
6
+
7
+ moss很奇怪,
8
  """
9
 
10
  import json
 
23
  print(token, tokenizer.decode([token]))
24
 
25
 
 
 
 
26
  def test_token():
27
  for word in "中国解决方法黑白侗,。!?;":
28
  encoding = tokenizer.encode(word)
29
  for token_id in encoding:
30
  decode_str = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd"
31
+ token = tokenizer.convert_ids_to_tokens([token_id])
32
  print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
33
 
34