eson committed on
Commit
d10ecd7
1 Parent(s): 1ee0570
Files changed (34)
  1. app.py +76 -119
  2. style.css +32 -0
  3. util.py +94 -0
  4. utils/zh_util.py +9 -4
  5. vocab/{alpaca_7b → Intern_gpt}/README.md +0 -0
  6. vocab/README.md +3 -1
  7. vocab/__init__.py +40 -11
  8. vocab/{bert_en → _alpaca_7b}/README.md +0 -0
  9. vocab/{goat → _goat}/README.md +0 -0
  10. tokenizer.py → vocab/_goat/__init__.py +0 -0
  11. vocab/baichuan_7b/demo.py +3 -0
  12. vocab/bert_base_cased/README.md +0 -0
  13. vocab/bert_base_cased/__init__.py +3 -0
  14. vocab/{bert_chinese → bert_base_chinese}/README.md +0 -0
  15. vocab/{bert_chinese → bert_base_chinese}/__init__.py +0 -0
  16. vocab/{bert_chinese → bert_base_chinese}/test.py +0 -0
  17. vocab/{bert_chinese → bert_base_chinese}/test_zh_coding_len.py +0 -0
  18. vocab/{bert_chinese → bert_base_chinese}/tokenizer/config.json +0 -0
  19. vocab/{bert_chinese → bert_base_chinese}/tokenizer/tokenizer.json +0 -0
  20. vocab/{bert_chinese → bert_base_chinese}/tokenizer/tokenizer_config.json +0 -0
  21. vocab/{bert_chinese → bert_base_chinese}/tokenizer/vocab.txt +0 -0
  22. vocab/{bert_chinese → bert_base_chinese}/vocab.txt +0 -0
  23. vocab/bert_base_uncased/__init__.py +3 -0
  24. vocab/chatglm2_6b/__init__.py +2 -0
  25. vocab/gpt_35_turbo/__init__.py +17 -1
  26. vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py +1 -1
  27. vocab/gpt_neox_chinese_v1/to_v2/test2.py +1 -1
  28. vocab/gpt_nexo_20b/__init__.py +1 -0
  29. vocab/internlm_chat_7b/README.md +0 -0
  30. vocab/internlm_chat_7b/__init__.py +6 -0
  31. vocab/kplug/__init__.py +5 -0
  32. vocab/llama/__init__.py +1 -1
  33. vocab/llama2/__init__.py +0 -0
  34. vocab/moss/test_tokenizer.py +3 -4
app.py CHANGED
@@ -3,6 +3,12 @@
3
  # time: 2022/8/23 16:06
4
 
5
  """
6
 
7
  plots
8
 
@@ -19,21 +25,11 @@ table
19
  [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
20
  """
21
 
22
- import json
23
- import pandas as pd
24
  import gradio as gr
25
 
26
- from vocab import all_tokenizers, load_tokener
27
-
28
- # 显示空格:https://blog.csdn.net/liuxiao723846/article/details/118994673
29
- # 隐藏legend:
30
- css = """
31
- .space-show {white-space: pre-wrap;}
32
- .cell-wrap {white-space: pre-wrap;}
33
- .category-legend {display: none !important}
34
- .statistics textarea {min-width: min(50px,100%) !important; font-size: 20px !important; font-weight: 600 !important; text-align: center !important; border: none !important;}
35
- .statistics label {text-align: center !important;}
36
- """
37
 
38
  example_text = """Replace this text in the input field to see how tokenization works
39
  华为智能音箱发布:华为Sound X"""
@@ -42,81 +38,18 @@ example_text = """Replace this text in the input field to see how tokenization w
42
  examples = [
43
  # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
44
  ["标点测试:,。!?;", "baichuan_7b", "llama"],
45
- ["符号测试:🦙", "baichuan_7b", "llama"],
46
- ["中文测试:🦙", "baichuan_7b", "llama"],
47
  ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
48
  ]
49
 
50
 
 
 
51
 
52
- def tokenize(text, tokenizer_type, color_num=5):
53
- """
54
- TODO: cache tokenizer
55
- """
56
- print(text, tokenizer_type)
57
- pos_tokens = []
58
- tokenizer = load_tokener(tokenizer_type)
59
- encoding = tokenizer.encode(text)
60
-
61
- table = []
62
-
63
- for idx, token_id in enumerate(encoding):
64
- decode_text = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd"
65
- pos_tokens.extend([(decode_text, str(idx % color_num))])
66
-
67
- # token "Byte": # 这是 utf-8编码吧?
68
- token = tokenizer.convert_ids_to_tokens([token_id])[0]
69
- if isinstance(token, bytes):
70
- try:
71
- token_str = token.decode("utf-8")
72
- except:
73
- token_str = token.decode("utf-8", errors="ignore")
74
- print("decode_error", token, token_str)
75
-
76
- token_bytes = token
77
- json_dumps = json.dumps(token_str)
78
- elif isinstance(token, str):
79
- token_str = token
80
- token_bytes = bytes(token_str, "utf-8")
81
- json_dumps = json.dumps(token_str)
82
- else:
83
- return
84
-
85
-
86
- # ⭐
87
- table.append(
88
- {"TokenID": token_id,
89
- "Token": token_str, # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
90
- "Text": decode_text, #
91
- # "Bytes": token_bytes, # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
92
- "Bytes": str(token_bytes),
93
- # "Unicode": json_dumps # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
94
- }
95
- )
96
-
97
- table_df = pd.DataFrame(table)
98
- print(table)
99
- # print(table_df)
100
-
101
- return pos_tokens, table_df, len(encoding)
102
-
103
-
104
- def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
105
- pos_tokens_1, table_df_1, token_size_1 = tokenize(text, tokenizer_type_1)
106
- pos_tokens_2, table_df_2, token_size_2 = tokenize(text, tokenizer_type_2)
107
- return pos_tokens_1, table_df_1, token_size_1, pos_tokens_2, table_df_2, token_size_2
108
 
109
 
110
- def get_vocab_size(tokenizer_type):
111
- tokenizer = load_tokener(tokenizer_type)
112
- return tokenizer.vocab_size
113
-
114
- def test_coding():
115
- bytes1 = b'\xe4\xb8\xad'
116
- print(bytes1) # b'\xe4\xb8\xad'
117
-
118
-
119
- with gr.Blocks(css=css) as demo:
120
  gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
121
  # links: https://www.coderstool.com/utf8-encoding-decoding
122
  # 功能:输入文本,进行分词
@@ -125,16 +58,29 @@ with gr.Blocks(css=css) as demo:
125
  #
126
  # Byte: 表示分词
127
 
 
 
128
 
129
- gr.Markdown("## Input Text")
130
  user_input = gr.Textbox(
131
  value=example_text,
132
  label="Input Text",
133
  lines=5,
134
  show_label=False,
135
  ) # placeholder="Enter sentence here..."
 
 
 
 
136
 
137
- # submitBtn = gr.Button("生成回复", variant="primary")
138
 
139
  gr.Markdown("## Tokenization")
140
 
@@ -156,18 +102,24 @@ with gr.Blocks(css=css) as demo:
156
  lines=1,
157
  elem_classes="statistics"
158
  )
159
- stats_token_size_1 = gr.TextArea(
160
- label="Tokens",
 
161
  lines=1,
162
  elem_classes="statistics"
163
  )
164
- stats_3 = gr.TextArea(
165
- label="Compress Rate",
166
  lines=1,
167
  elem_classes="statistics"
168
  )
 
 
 
 
 
169
  # https://www.onlinewebfonts.com/icon/418591
170
- gr.Image("images/VS.svg", scale=1, show_label=False, show_download_button=False, container=False) # height=10,
171
  with gr.Column(scale=6):
172
  with gr.Group():
173
  tokenizer_type_2 = gr.Dropdown(
@@ -182,19 +134,23 @@ with gr.Blocks(css=css) as demo:
182
  lines=1,
183
  elem_classes="statistics"
184
  )
185
- stats_token_size_2 = gr.TextArea(
186
- label="Tokens",
 
187
  lines=1,
188
  elem_classes="statistics"
189
  )
190
- stats_6 = gr.TextArea(
191
- label="Compress Rate",
 
 
 
 
 
192
  lines=1,
193
  elem_classes="statistics"
194
  )
195
 
196
-
197
-
198
  # TODO: 图 表 压缩率
199
  with gr.Row():
200
  with gr.Column():
@@ -212,41 +168,42 @@ with gr.Blocks(css=css) as demo:
212
 
213
  with gr.Row():
214
  output_table_1 = gr.Dataframe(
215
- # headers=["TokenID", "Byte", "Text"],
216
- # datatype=["str", "str", "str"],
217
  # elem_classes="space-show", # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
218
  )
219
  output_table_2 = gr.Dataframe(
220
- # headers=["TokenID", "Token", "Text"],
221
- # datatype=["str", "str", "str"],
222
  )
223
 
224
- tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1, stats_token_size_1])
225
- tokenizer_type_1.change(get_vocab_size, [tokenizer_type_1], [stats_vocab_size_1])
 
 
 
 
226
 
227
  user_input.change(tokenize_pair,
228
  [user_input, tokenizer_type_1, tokenizer_type_2],
229
- [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2])
230
-
231
- tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2, stats_token_size_2])
232
- tokenizer_type_2.change(get_vocab_size, [tokenizer_type_2], [stats_vocab_size_2])
233
-
234
- gr.Examples(
235
- examples,
236
- [user_input, tokenizer_type_1, tokenizer_type_2],
237
- [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2],
238
- tokenize_pair,
239
- cache_examples=True,
 
240
  )
241
 
242
- # submitBtn.click(tokenize, [user_input, tokenizer_type], outputs,
243
- # show_progress=True)
244
 
245
- # examples=[
246
- # ["What a beautiful morning for a walk!"],
247
- # ["It was the best of times, it was the worst of times."],
248
- # ["多个空格 It ss was the best of times, it was the worst of times."],
249
- # ]
250
 
251
  if __name__ == "__main__":
252
- demo.launch()
 
 
3
  # time: 2022/8/23 16:06
4
 
5
  """
6
+ ## TODO:
7
+ 1. token数,放到 label里
8
+ 2. http get方式获取参数,
9
+ 3. 自启动
10
+ 4.
11
+
12
 
13
  plots
14
 
 
25
  [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
26
  """
27
 
28
+
 
29
  import gradio as gr
30
 
31
+ from vocab import all_tokenizers
32
+ from util import *
 
 
33
 
34
  example_text = """Replace this text in the input field to see how tokenization works
35
  华为智能音箱发布:华为Sound X"""
 
38
  examples = [
39
  # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
40
  ["标点测试:,。!?;", "baichuan_7b", "llama"],
41
+ ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
42
+ ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
43
  ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
44
  ]
45
 
46
 
47
+ def example_fn(example_idx):
48
+ return examples[example_idx]
49
 
 
 
50
 
51
 
52
+ with gr.Blocks(css="style.css") as demo:
 
 
 
 
 
 
 
 
 
53
  gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
54
  # links: https://www.coderstool.com/utf8-encoding-decoding
55
  # 功能:输入文本,进行分词
 
58
  #
59
  # Byte: 表示分词
60
 
61
+ with gr.Row():
62
+ gr.Markdown("## Input Text")
63
+ dropdown_examples = gr.Dropdown(
64
+ ["Example1", "Example2", "Example3"],
65
+ value="Examples",
66
+ type="index",
67
+ show_label=False,
68
+ container=False,
69
+ scale=0,
70
+ elem_classes="example-style"
71
+ )
72
 
 
73
  user_input = gr.Textbox(
74
  value=example_text,
75
  label="Input Text",
76
  lines=5,
77
  show_label=False,
78
  ) # placeholder="Enter sentence here..."
79
+ # gr.Examples(
80
+ # examples,
81
+ # None,
82
+ # )
83
 
 
84
 
85
  gr.Markdown("## Tokenization")
86
 
 
102
  lines=1,
103
  elem_classes="statistics"
104
  )
105
+ stats_zh_token_size_1 = gr.TextArea(
106
+ # value="1252/1455",
107
+ label="ZH char/word",
108
  lines=1,
109
  elem_classes="statistics"
110
  )
111
+ stats_overlap_token_size_1 = gr.TextArea(
112
+ label="Overlap Tokens",
113
  lines=1,
114
  elem_classes="statistics"
115
  )
116
+ # stats_3 = gr.TextArea(
117
+ # label="Compress Rate",
118
+ # lines=1,
119
+ # elem_classes="statistics"
120
+ # )
121
  # https://www.onlinewebfonts.com/icon/418591
122
+ gr.Image("images/VS.svg", scale=1, show_label=False, show_download_button=False, container=False) # height=10,
123
  with gr.Column(scale=6):
124
  with gr.Group():
125
  tokenizer_type_2 = gr.Dropdown(
 
134
  lines=1,
135
  elem_classes="statistics"
136
  )
137
+ stats_zh_token_size_2 = gr.TextArea( # 中文单子数,
138
+ # value="12/45",
139
+ label="ZH char/word",
140
  lines=1,
141
  elem_classes="statistics"
142
  )
143
+ # stats_6 = gr.TextArea(
144
+ # label="Compress Rate",
145
+ # lines=1,
146
+ # elem_classes="statistics"
147
+ # )
148
+ stats_overlap_token_size_2 = gr.TextArea(
149
+ label="Overlap Tokens",
150
  lines=1,
151
  elem_classes="statistics"
152
  )
153
 
 
 
154
  # TODO: 图 表 压缩率
155
  with gr.Row():
156
  with gr.Column():
 
168
 
169
  with gr.Row():
170
  output_table_1 = gr.Dataframe(
171
+ headers=["TokenID", "Byte", "Text"],
172
+ datatype=["str", "str", "str"],
173
  # elem_classes="space-show", # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
174
  )
175
  output_table_2 = gr.Dataframe(
176
+ headers=["TokenID", "Token", "Text"],
177
+ datatype=["str", "str", "str"],
178
  )
179
 
180
+ tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
181
+ [output_text_1, output_table_1])
182
+ # 下面两个好像可以合并
183
+ tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
184
+ tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
185
+ [stats_overlap_token_size_1, stats_overlap_token_size_2])
186
 
187
  user_input.change(tokenize_pair,
188
  [user_input, tokenizer_type_1, tokenizer_type_2],
189
+ [output_text_1, output_table_1, output_text_2, output_table_2])
190
+
191
+ tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2],
192
+ [output_text_2, output_table_2])
193
+ tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
194
+ tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
195
+ [stats_overlap_token_size_1, stats_overlap_token_size_2])
196
+
197
+ dropdown_examples.change(
198
+ example_fn,
199
+ dropdown_examples,
200
+ [user_input, tokenizer_type_1, tokenizer_type_2]
201
  )
202
 
203
+ # start up 初始化
204
+ gr.update(lines=2, visible=True, value="Short story: ")
205
 
 
 
 
 
 
206
 
207
  if __name__ == "__main__":
208
+ demo.queue(max_size=20).launch()
209
+ # demo.launch()
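The example loader added above replaces `gr.Examples` with a plain dropdown plus a callback. A minimal, self-contained sketch of that wiring, assuming a recent Gradio 3.x (the two-example list and component labels here are illustrative):

```python
import gradio as gr

# Preset (text, tokenizer_1, tokenizer_2) triples, as in the examples list above.
examples = [
    ["标点测试:,。!?;", "baichuan_7b", "llama"],
    ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
]

def example_fn(example_idx):
    # type="index" below makes the dropdown pass the selected position, not the label.
    return examples[example_idx]

with gr.Blocks() as demo:
    dropdown_examples = gr.Dropdown(
        [f"Example{i + 1}" for i in range(len(examples))],
        type="index",
        show_label=False,
    )
    user_input = gr.Textbox(lines=5, show_label=False)
    tokenizer_type_1 = gr.Dropdown(["baichuan_7b", "llama"], label="Tokenizer 1")
    tokenizer_type_2 = gr.Dropdown(["baichuan_7b", "llama"], label="Tokenizer 2")

    # Selecting an example fills all three inputs at once, mirroring app.py.
    dropdown_examples.change(
        example_fn,
        dropdown_examples,
        [user_input, tokenizer_type_1, tokenizer_type_2],
    )

if __name__ == "__main__":
    demo.queue(max_size=20).launch()
```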
style.css ADDED
@@ -0,0 +1,32 @@
1
+
2
+ /* 显示空格:https://blog.csdn.net/liuxiao723846/article/details/118994673 */
3
+ .space-show {
4
+ white-space: pre-wrap;
5
+ }
6
+
7
+ .cell-wrap {
8
+ white-space: pre-wrap;
9
+ }
10
+
11
+ /* 隐藏legend */
12
+ .category-legend {
13
+ display: none !important;
14
+ }
15
+
16
+ .statistics textarea {
17
+ min-width: min(50px, 100%) !important;
18
+ font-size: 20px !important;
19
+ font-weight: 600 !important;
20
+ text-align: center !important;
21
+ border: none !important;
22
+ }
23
+
24
+ .statistics label {
25
+ text-align: center !important;
26
+ }
27
+
28
+ /* align-self: flex-end; */
29
+ .example-style {
30
+ max-width: 150px;
31
+ align-self: self-end;
32
+ }
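These rules only take effect because app.py now loads the stylesheet with `gr.Blocks(css="style.css")` and tags components via `elem_classes`. A minimal sketch of that hookup, assuming Gradio >= 3.22 (where `elem_classes` exists); the component names are illustrative:

```python
import gradio as gr

# style.css is attached once at the Blocks level; individual components opt into
# its rules ("statistics", "space-show", ...) through elem_classes.
with gr.Blocks(css="style.css") as demo:
    stats_vocab_size = gr.TextArea(label="VocabSize", lines=1, elem_classes="statistics")
    tokens_view = gr.HighlightedText(label="Tokens", elem_classes="space-show")

if __name__ == "__main__":
    demo.launch()
```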
util.py ADDED
@@ -0,0 +1,94 @@
1
+
2
+
3
+ import gradio as gr
4
+ import json
5
+ import pandas as pd
6
+ from vocab import load_tokener
7
+ from utils.zh_util import iter_vocab
8
+
9
+
10
+
11
+
12
+ def tokenize(text, tokenizer_type, color_num=5):
13
+ """
14
+ TODO: cache tokenizer
15
+ """
16
+ print(f"入参:tokenize, {text}, {tokenizer_type}")
17
+ pos_tokens = []
18
+ tokenizer = load_tokener(tokenizer_type)
19
+ encoding = tokenizer.encode(text)
20
+
21
+ table = []
22
+
23
+ for idx, token_id in enumerate(encoding):
24
+ decode_text = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd"
25
+ pos_tokens.extend([(decode_text, str(idx % color_num))])
26
+
27
+ # token "Byte": # 这是 utf-8编码吧?
28
+ token = tokenizer.convert_ids_to_tokens([token_id])[0]
29
+ if isinstance(token, bytes):
30
+ try:
31
+ token_str = token.decode("utf-8")
32
+ except:
33
+ token_str = token.decode("utf-8", errors="ignore")
34
+ print("decode_error", tokenizer_type, token, token_str)
35
+
36
+ token_bytes = token
37
+ json_dumps = json.dumps(token_str)
38
+ elif isinstance(token, str):
39
+ token_str = token
40
+ token_bytes = bytes(token_str, "utf-8")
41
+ json_dumps = json.dumps(token_str)
42
+ else:
43
+ return
44
+
45
+ # ⭐
46
+ table.append(
47
+ {"TokenID": token_id,
48
+ "Token": token_str, # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
49
+ "Text": decode_text, #
50
+ # "Bytes": token_bytes, # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
51
+ "Bytes": str(token_bytes),
52
+ # "Unicode": json_dumps # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
53
+ }
54
+ )
55
+
56
+ table_df = pd.DataFrame(table)
57
+ print(f"Tokenization[{tokenizer_type}]: {table}")
58
+ # print(table_df)
59
+
60
+ return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
61
+
62
+
63
+ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
64
+ pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
65
+ pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
66
+ return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
67
+
68
+
69
+ def basic_count(tokenizer_type):
70
+ tokenizer = load_tokener(tokenizer_type)
71
+ stats = iter_vocab(tokenizer, tokenizer_type)
72
+ return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
73
+
74
+
75
+ def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
76
+ tokenizer1 = load_tokener(tokenizer_type_1)
77
+ tokenizer2 = load_tokener(tokenizer_type_2)
78
+ vocab1 = tokenizer1.get_vocab()
79
+ vocab2 = tokenizer2.get_vocab()
80
+ overlap_tokens = vocab1.keys() & vocab2.keys()
81
+ overlap_token_size = len(overlap_tokens)
82
+ print(f"OverlapTokens: {tokenizer_type_1}, {tokenizer_type_2} {list(overlap_tokens)[:10]}")
83
+ return overlap_token_size, overlap_token_size
84
+
85
+
86
+
87
+
88
+ def test_coding():
89
+ bytes1 = b'\xe4\xb8\xad'
90
+ print(bytes1) # b'\xe4\xb8\xad'
91
+
92
+
93
+ if __name__ == "__main__":
94
+ print(basic_count("internlm_chat_7b"))
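The `TODO: cache tokenizer` note above is still open; a hypothetical way to close it is to memoize `load_tokener`, so repeated dropdown changes reuse the already-constructed tokenizer instead of re-importing it (the `_cached` name is illustrative, not part of the commit):

```python
from functools import lru_cache

from vocab import load_tokener


@lru_cache(maxsize=None)
def load_tokener_cached(tokenizer_type: str):
    # The cache key is the plain string name from the dropdown, so every call with
    # the same value returns the same tokenizer object.
    return load_tokener(tokenizer_type)


if __name__ == "__main__":
    tok_a = load_tokener_cached("baichuan_7b")   # built once
    tok_b = load_tokener_cached("baichuan_7b")   # served from the cache
    assert tok_a is tok_b
```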
utils/zh_util.py CHANGED
@@ -37,8 +37,12 @@ def get_coding_length(tokenizer, vocab, filter=None):
37
  def has_zh_char(text):
38
  return any(ch in zh_punc for ch in text)
39
 
40
 
41
- def iter_vocab(tokenizer, name=""):
42
  f_out = open(name + "_vocab.zh.jsonl", "w", encoding="utf-8")
43
  zh_token_count = {"total": 0, "中文单字": 0, "中文多字": 0}
44
  all_single_zh_tokens = set()
@@ -72,16 +76,17 @@ def iter_vocab(tokenizer, name=""):
72
  # TODO: 繁体字,简体字
73
  zh_token_count["中文单字-去重后"] = len(all_single_zh_tokens)
74
 
75
- return {
76
  "name": name,
77
  "impl": str(tokenizer.__class__),
78
  "vocab_size": tokenizer.vocab_size,
79
- "中文汉字数": str(zh_token_count),
80
  "中文标点数": zh_symbol_count,
81
  "中文汉字编码长度均值": mean_length,
82
  "中文汉字编码长度分布": json.dumps(dist_length),
83
-
84
  }
 
 
85
 
86
 
87
  if __name__ == "__main__":
 
37
  def has_zh_char(text):
38
  return any(ch in zh_punc for ch in text)
39
 
40
+ cache = {}
41
+
42
+ def iter_vocab(tokenizer, name="", from_cache=True):
43
+ if from_cache and name in cache:
44
+ return cache[name]
45
 
 
46
  f_out = open(name + "_vocab.zh.jsonl", "w", encoding="utf-8")
47
  zh_token_count = {"total": 0, "中文单字": 0, "中文多字": 0}
48
  all_single_zh_tokens = set()
 
76
  # TODO: 繁体字,简体字
77
  zh_token_count["中文单字-去重后"] = len(all_single_zh_tokens)
78
 
79
+ result = {
80
  "name": name,
81
  "impl": str(tokenizer.__class__),
82
  "vocab_size": tokenizer.vocab_size,
83
+ "中文汉字数": zh_token_count,
84
  "中文标点数": zh_symbol_count,
85
  "中文汉字编码长度均值": mean_length,
86
  "中文汉字编码长度分布": json.dumps(dist_length),
 
87
  }
88
+ cache[name] = result
89
+ return result
90
 
91
 
92
  if __name__ == "__main__":
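A usage sketch of the memoized `iter_vocab` (the repo id below is an assumption; any Hugging Face tokenizer works the same way):

```python
from transformers import AutoTokenizer
from utils.zh_util import iter_vocab

tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)

stats = iter_vocab(tokenizer, name="baichuan_7b")        # computed, written to baichuan_7b_vocab.zh.jsonl
stats_again = iter_vocab(tokenizer, name="baichuan_7b")  # second call is served from the module-level cache
assert stats is stats_again
print(stats["vocab_size"], stats["中文汉字数"])
```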
vocab/{alpaca_7b → Intern_gpt}/README.md RENAMED
File without changes
vocab/README.md CHANGED
@@ -11,9 +11,9 @@ gpt-neox词典
11
  ## decode
12
 
13
  bert词典有个特殊字符 #
14
- gpt词典有个特殊字符 G
15
 
16
  gpt-neox词典呢?
 
17
 
18
 
19
  ## 关于分词粒度
@@ -80,6 +80,8 @@ https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
80
  - 功能符号: `<|endoftext|>` 表示换行。tab? 空格?
81
  - 很多数字独立编码,几乎上千个。
82
 
 
 
83
  ## 空格、tab、换行
84
 
85
 
 
11
  ## decode
12
 
13
  bert词典有个特殊字符 #
 
14
 
15
  gpt-neox词典呢?
16
+ - _开头表示空格或句首
17
 
18
 
19
  ## 关于分词粒度
 
80
  - 功能符号: `<|endoftext|>` 表示换行。tab? 空格?
81
  - 很多数字独立编码,几乎上千个。
82
 
83
+ - 类似的还有:moss
84
+
85
  ## 空格、tab、换行
86
 
87
 
vocab/__init__.py CHANGED
@@ -1,16 +1,39 @@
1
  import importlib
2
  from enum import Enum, auto
3
 
4
-
5
- """
6
- Interface:
7
- -
8
 
9
  tokenizer.parent = ""
 
 
10
  tokenizer.type = TokenizerType.ByteBPE.name
11
  tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
 
12
  tokenizer.comments = "split all numbers into individual digits, " \
13
  "and fallback to bytes to decompose unknown UTF-8 characters"
 
14
  """
15
 
16
  Animal = Enum('Animal', 'ANT BEE CAT DOG')
@@ -21,9 +44,13 @@ uniq_tokenizers = [
21
 
22
  all_tokenizers = [
23
  "gpt_35_turbo",
 
24
  "gpt2",
25
  "gpt2_chinese",
26
- "bert_chinese",
 
 
 
27
  "moss",
28
  #
29
  # ######
@@ -31,7 +58,7 @@ all_tokenizers = [
31
  # "prompt_clue",
32
  #
33
  # #### bloom 系列
34
- # "bloom",
35
  # "bloomz_6b4_zh",
36
  # "belle_7b_2m", # 模型和词典都基于bloom
37
  #
@@ -41,19 +68,21 @@ all_tokenizers = [
41
  # ##### glm系列
42
  # "glm_chinese",
43
  "chatglm_6b",
 
44
  #
45
  # #### llama alpaca系列
46
- "llama", # '中文单字': 700, '中文多字': 0
47
  "chinese_llama_lora_7b", #
48
  # "chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
49
  # "belle_llama_ext_7b",
50
  # "alpaca_7b",
51
  "baichuan_7b",
52
- "qwen"
 
 
53
  ]
54
 
55
 
56
-
57
  class TokenizerType(Enum):
58
  """
59
  - https://huggingface.co/docs/transformers/tokenizer_summary
@@ -105,10 +134,10 @@ class TokenizerImpl(Enum):
105
  BertTokenizer = auto() #
106
 
107
 
108
-
109
  def load_tokener(model_name):
110
  tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
111
  return tokenizer
112
 
 
113
  if __name__ == "__main__":
114
- pass
 
1
  import importlib
2
  from enum import Enum, auto
3
 
4
+ """Interface:
5
+ tokenizer.encode
6
+ tokenizer.decode
7
+ tokenizer.convert_ids_to_tokens
8
 
9
  tokenizer.parent = ""
10
+ tokenizer.vocab_size
11
+ tokenizer.get_vocab() # gpt-neox-20b, llama
12
  tokenizer.type = TokenizerType.ByteBPE.name
13
  tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
14
+ - bert
15
+ - 特征
16
+ - 示例:
17
+ - gpt2
18
+ - 特征:
19
+ - sentencepiece:
20
+ - 特征:.sp_model 是SentencePieceProcessor类型,sp_model.id_to_piece,有tokenizer.json tokenizer.model,词典字符有 ▁,
21
+ - 示例:llama,baichuan
22
+ - tiktoken
23
+ - icetk
24
+ - hf_tokenizer
25
+ - 特征:.model 是 tokenizer.models.BPE 类型,词典有 Ġ "\u0120" 开头,有 merge.txt
26
+ - 示例:gpt_neox_20b, moss
27
+ - gpt3.5 gpt4
28
+ - 特征:tiktoken
29
  tokenizer.comments = "split all numbers into individual digits, " \
30
  "and fallback to bytes to decompose unknown UTF-8 characters"
31
+
32
+ tokenizer.all_special_tokens # baichuan
33
+ tokenizer.special_tokens_set # gpt3.5_turbo
34
+ tokenizer.special_tokens_map
35
+
36
+ tokenizer.dependency [sentencepiece, tiktoken, icetk]
37
  """
38
 
39
  Animal = Enum('Animal', 'ANT BEE CAT DOG')
 
44
 
45
  all_tokenizers = [
46
  "gpt_35_turbo",
47
+ "gpt4",
48
  "gpt2",
49
  "gpt2_chinese",
50
+ "bert_base_cased",
51
+ "bert_base_uncased",
52
+ "bert_base_chinese",
53
+ "kplug",
54
  "moss",
55
  #
56
  # ######
 
58
  # "prompt_clue",
59
  #
60
  # #### bloom 系列
61
+ "bloom",
62
  # "bloomz_6b4_zh",
63
  # "belle_7b_2m", # 模型和词典都基于bloom
64
  #
 
68
  # ##### glm系列
69
  # "glm_chinese",
70
  "chatglm_6b",
71
+ "chatglm2-6b",
72
  #
73
  # #### llama alpaca系列
74
+ "llama", # '中文单字': 700, '中文多字': 0
75
  "chinese_llama_lora_7b", #
76
  # "chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
77
  # "belle_llama_ext_7b",
78
  # "alpaca_7b",
79
  "baichuan_7b",
80
+ "qwen",
81
+ "internlm_chat_7b",
82
+ "goat",
83
  ]
84
 
85
 
 
86
  class TokenizerType(Enum):
87
  """
88
  - https://huggingface.co/docs/transformers/tokenizer_summary
 
134
  BertTokenizer = auto() #
135
 
136
 
 
137
  def load_tokener(model_name):
138
  tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
139
  return tokenizer
140
 
141
+
142
  if __name__ == "__main__":
143
+ pass
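A usage sketch of the loader: `load_tokener` resolves `vocab.<name>` with importlib and returns that package's module-level `tokenizer`, so every entry in `all_tokenizers` is assumed to be a subpackage of `vocab/` whose `__init__.py` builds a `tokenizer` exposing `encode` and `vocab_size`:

```python
from vocab import all_tokenizers, load_tokener

for name in all_tokenizers[:3]:          # e.g. "gpt_35_turbo", "gpt4", "gpt2"
    tokenizer = load_tokener(name)
    ids = tokenizer.encode("华为Sound X")
    print(name, tokenizer.vocab_size, len(ids))
```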
vocab/{bert_en → _alpaca_7b}/README.md RENAMED
File without changes
vocab/{goat → _goat}/README.md RENAMED
File without changes
tokenizer.py → vocab/_goat/__init__.py RENAMED
File without changes
vocab/baichuan_7b/demo.py ADDED
@@ -0,0 +1,3 @@
1
+
2
+ from vocab.baichuan_7b import tokenizer
3
+
vocab/bert_base_cased/README.md ADDED
File without changes
vocab/bert_base_cased/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+
2
+ from transformers import BertTokenizer
3
+ tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
vocab/{bert_chinese → bert_base_chinese}/README.md RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/__init__.py RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/test.py RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/test_zh_coding_len.py RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/tokenizer/config.json RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/tokenizer/tokenizer.json RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/tokenizer/tokenizer_config.json RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/tokenizer/vocab.txt RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/vocab.txt RENAMED
File without changes
vocab/bert_base_uncased/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+
2
+ from transformers import BertTokenizer
3
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
vocab/chatglm2_6b/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from transformers import AutoTokenizer
2
+ tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
vocab/gpt_35_turbo/__init__.py CHANGED
@@ -4,10 +4,10 @@ import tiktoken
4
  from tiktoken import Encoding
5
 
6
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
7
-
8
  tokenizer.vocab_size = tokenizer.n_vocab
9
 
10
 
 
11
  def decode(self, tokens, errors="replace"):
12
  # def decode(self, tokens: list[int], errors: str = "replace") -> str:
13
  try:
@@ -19,8 +19,24 @@ def decode(self, tokens, errors="replace"):
19
  def convert_ids_to_tokens(self, tokens):
20
  return tokenizer.decode_tokens_bytes(tokens)
21
 
 
 
23
  Encoding.decode = decode
24
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
 
25
 
26
 
 
4
  from tiktoken import Encoding
5
 
6
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
 
7
  tokenizer.vocab_size = tokenizer.n_vocab
8
 
9
 
10
+
11
  def decode(self, tokens, errors="replace"):
12
  # def decode(self, tokens: list[int], errors: str = "replace") -> str:
13
  try:
 
19
  def convert_ids_to_tokens(self, tokens):
20
  return tokenizer.decode_tokens_bytes(tokens)
21
 
22
+ def get_vocab(self):
23
+ """Returns vocab as a dict"""
24
+ vocab = {}
25
+ for i in range(self.vocab_size):
26
+ try:
27
+ token_byte = self.convert_ids_to_tokens([i])[0]
28
+ token_str = token_byte.decode("utf-8")
29
+ vocab[token_str] = i
30
+ except KeyError:
31
+ print("gpt_35_turbo decode KeyError", i)
32
+ except UnicodeDecodeError:
33
+ print("gpt_35_turbo decode UnicodeDecodeError", i, str(token_byte))
34
+ # vocab.update(self.added_tokens_encoder)
35
+ return vocab
36
+
37
 
38
  Encoding.decode = decode
39
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
40
+ Encoding.get_vocab = get_vocab
41
 
42
 
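A quick check of the monkey-patched tiktoken `Encoding` (a sketch; `get_vocab` is what lets `util.get_overlap_token_size` treat this tokenizer like a Hugging Face one):

```python
from vocab.gpt_35_turbo import tokenizer

ids = tokenizer.encode("华为智能音箱发布:华为Sound X")
print(ids)
print(tokenizer.convert_ids_to_tokens(ids))   # raw token bytes via decode_tokens_bytes
print(len(tokenizer.get_vocab()))             # ids that fail UTF-8 decoding are skipped
```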
vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py CHANGED
@@ -47,7 +47,7 @@ def append_token(word_list, base_tokenizer, output_tokenizer_path, unused_ids=No
47
  data, base_tokenizer = base_tokenizer
48
  vocab = data["model"]["vocab"]
49
  merges = data["model"]["merges"]
50
- vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
51
 
52
  for word in word_list:
53
  encoding = base_tokenizer.encode(word)
 
47
  data, base_tokenizer = base_tokenizer
48
  vocab = data["model"]["vocab"]
49
  merges = data["model"]["merges"]
50
+ vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
51
 
52
  for word in word_list:
53
  encoding = base_tokenizer.encode(word)
vocab/gpt_neox_chinese_v1/to_v2/test2.py CHANGED
@@ -21,7 +21,7 @@ def append_token(word_list, base_tokenizer, unused_ids=None):
21
  data, base_tokenizer = base_tokenizer
22
  vocab = data["model"]["vocab"]
23
  merges = data["model"]["merges"]
24
- vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
25
 
26
  for word in word_list:
27
  encoding = base_tokenizer.encode(word)
 
21
  data, base_tokenizer = base_tokenizer
22
  vocab = data["model"]["vocab"]
23
  merges = data["model"]["merges"]
24
+ vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
25
 
26
  for word in word_list:
27
  encoding = base_tokenizer.encode(word)
vocab/gpt_nexo_20b/__init__.py CHANGED
@@ -21,3 +21,4 @@ tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
21
  # tokenizer.vocab_size = tokenizer.get_vocab_size(with_added_tokens=True)
22
 
23
 
 
 
21
  # tokenizer.vocab_size = tokenizer.get_vocab_size(with_added_tokens=True)
22
 
23
 
24
+
vocab/internlm_chat_7b/README.md ADDED
File without changes
vocab/internlm_chat_7b/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ https://huggingface.co/internlm/internlm-chat-7b
3
+ """
4
+
5
+ from transformers import AutoTokenizer
6
+ tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
vocab/kplug/__init__.py CHANGED
@@ -0,0 +1,5 @@
1
+
2
+ from transformers import BertTokenizer
3
+
4
+ tokenizer = BertTokenizer.from_pretrained("eson/kplug-base-encoder")
5
+ print(tokenizer)
vocab/llama/__init__.py CHANGED
@@ -20,4 +20,4 @@ tokenizer.parent = ""
20
  tokenizer.type = TokenizerType.ByteBPE.name
21
  tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
22
  tokenizer.comments = "split all numbers into individual digits, " \
23
- "and fallback to bytes to decompose unknown UTF-8 characters"
 
20
  tokenizer.type = TokenizerType.ByteBPE.name
21
  tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
22
  tokenizer.comments = "split all numbers into individual digits, " \
23
+ "and fallback to bytes to decompose unknown UTF-8 characters"
vocab/llama2/__init__.py ADDED
File without changes
vocab/moss/test_tokenizer.py CHANGED
@@ -3,6 +3,8 @@
3
  vocab size: 106029
4
 
5
  中文汉字数:54230, 中文标点数: 549
 
 
6
  """
7
 
8
  import json
@@ -21,15 +23,12 @@ for token in tokens:
21
  print(token, tokenizer.decode([token]))
22
 
23
 
24
- def id2token(ids):
25
- return tokenizer.convert_ids_to_tokens(ids)
26
-
27
  def test_token():
28
  for word in "中国解决方法黑白侗,。!?;":
29
  encoding = tokenizer.encode(word)
30
  for token_id in encoding:
31
  decode_str = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd"
32
- token = id2token([token_id])
33
  print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
34
 
35
 
 
3
  vocab size: 106029
4
 
5
  中文汉字数:54230, 中文标点数: 549
6
+
7
+ moss很奇怪,
8
  """
9
 
10
  import json
 
23
  print(token, tokenizer.decode([token]))
24
 
25
 
 
 
 
26
  def test_token():
27
  for word in "中国解决方法黑白侗,。!?;":
28
  encoding = tokenizer.encode(word)
29
  for token_id in encoding:
30
  decode_str = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd"
31
+ token = tokenizer.convert_ids_to_tokens([token_id])
32
  print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
33
 
34