Spaces:

yhavinga
/

dutch-tokenizer-arena

Running

App Files Files Community

yhavinga committed on May 5

Commit

c78da21

•

1 Parent(s): 7d83c88

Add Llama tokenizer creation for Dutch, English, Code, Markdown and TeX.

Browse files

Files changed (6) hide show

app.py +1 -1
app_compression.py +1 -1
config.py +17 -5
stats/compress_rate.json +504 -0
utils/compression_util.py +2 -2
vocab/wizardcoder_15b_v1/__init__.py +4 -4

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from patcher.gr_interface import TabbedInterface
 demo = TabbedInterface(
     [tab_playground, tab_compression],
     [" ⚔️ Playground", "🏆 Compression Leaderboard",],  # 编码速度，解码速度，字符分类(zh、num等，支持正则)，支持的语言，机构，。
-    title='<div align="center">Tokenizer Arena ⚔️</div>',
     css="css/style.css"
 )

 demo = TabbedInterface(
     [tab_playground, tab_compression],
     [" ⚔️ Playground", "🏆 Compression Leaderboard",],  # 编码速度，解码速度，字符分类(zh、num等，支持正则)，支持的语言，机构，。
+    title='Tokenizer Arena ⚔️ (with some Dutch 🇳🇱🇧🇪🇸🇷 hacked in)',
     css="css/style.css"
 )

app_compression.py CHANGED Viewed

@@ -59,7 +59,7 @@ with gr.Blocks() as demo:
         with gr.Row():
             compress_rate_corpus = gr.Dropdown(
                 common_corpuses,  # , "code"
-                value=["cc100-en", "cc100-zh-Hans"],
                 label="corpus",
                 multiselect=True
                 # info=""

         with gr.Row():
             compress_rate_corpus = gr.Dropdown(
                 common_corpuses,  # , "code"
+                value=["cc100-nl", "cc100-en"],
                 label="corpus",
                 multiselect=True
                 # info=""

config.py CHANGED Viewed

@@ -11,10 +11,22 @@ LAZY_IMPORT = True
 # DEBUG: 设置环境变量 RUST_BACKTRACE=full
 #
-default_user_input = """\
-Replace this text in the input field to see how tokenization works.
-Buenos días!
-华为发布Mate60手机。
 ラグビーワールドカップ2023フランス"""
 default_tokenizer_type_1 = "llama3"
-default_tokenizer_type_2 = "gpt_4"

 # DEBUG: 设置环境变量 RUST_BACKTRACE=full
 #
+default_user_input = """“We apologize for any inconvenience and concern this may have caused to our customers and all concerned. We pray for the rest of the souls of those who lost their lives aboard the Japanese Coast Guard's equipment and extend our condolences to the bereaved families,” he said.
+Steenvliegen of oevervliegen[2] (Plecoptera) zijn een kleine orde van gevleugelde insecten. Steenvliegen zijn te herkennen aan hun slanke, langwerpige lichaamsvorm en de doorzichtige vleugels die in rust plat op de rug worden gehouden.
+def load_image_file(file, mode='RGB'):
+    im = PIL.Image.open(file)
+    if mode:
+        im = im.convert(mode)
+    return np.array(im)
+\section{The expected number of intervening \mbox{H\,{\sc i}}
+  absorbers}\label{section:expected_number}
+\begin{equation}\label{equation:expected_number}
+  \mu =  \iint{f(N_{\rm HI},X)\,\mathrm{d}X\,\mathrm{d}N_{\rm HI}},
+\end{equation}
+Eerder noemde De Meij Oud en Nieuw "een soort oorlogsgebied". En hij heeft dan ook geen zin in de nieuwjaarsnacht. "Als je weet dat er collega's gewond gaan raken, kan je niet meer zeggen: het is mooi politiewerk en we gaan naar een spannende nacht. Het zijn gewoon risico's die je niet wil lopen."
+华为发布Mate60手机
 ラグビーワールドカップ2023フランス"""
 default_tokenizer_type_1 = "llama3"
+# default_tokenizer_type_2 = "internlm_chat_7b"
+default_tokenizer_type_2 = "mistral_7b"

stats/compress_rate.json CHANGED Viewed

@@ -4282,5 +4282,509 @@
     "n_bytes": 2633047,
     "n_tokens": 757405,
     "n_chars": 927311
   }
 }

     "n_bytes": 2633047,
     "n_tokens": 757405,
     "n_chars": 927311
+  },
+  "dutch_llama_tokenizer.cc100-en": {
+    "vocab_size": 32000,
+    "n_bytes": 1124813,
+    "n_tokens": 291975,
+    "n_chars": 1121360
+  },
+  "gronlp-gpt2-small-dutch.cc100-en": {
+    "vocab_size": 40000,
+    "n_bytes": 1124813,
+    "n_tokens": 361710,
+    "n_chars": 1121360
+  },
+  "yhavinga-gpt2-medium-dutch.cc100-en": {
+    "vocab_size": 50257,
+    "n_bytes": 1124813,
+    "n_tokens": 361847,
+    "n_chars": 1121360
+  },
+  "yhavinga-ul2-large-en-nl.cc100-en": {
+    "vocab_size": 32128,
+    "n_bytes": 1124813,
+    "n_tokens": 297641,
+    "n_chars": 1121360
+  },
+  "dutch_llama_tokenizer.cc100-zh-Hans": {
+    "vocab_size": 32000,
+    "n_bytes": 2633047,
+    "n_tokens": 2621293,
+    "n_chars": 927311
+  },
+  "gronlp-gpt2-small-dutch.cc100-zh-Hans": {
+    "vocab_size": 40000,
+    "n_bytes": 2633047,
+    "n_tokens": 1350320,
+    "n_chars": 927311
+  },
+  "yhavinga-gpt2-medium-dutch.cc100-zh-Hans": {
+    "vocab_size": 50257,
+    "n_bytes": 2633047,
+    "n_tokens": 2600872,
+    "n_chars": 927311
+  },
+  "yhavinga-ul2-large-en-nl.cc100-zh-Hans": {
+    "vocab_size": 32128,
+    "n_bytes": 2633047,
+    "n_tokens": 2519719,
+    "n_chars": 927311
+  },
+  "aya_101.cc100-nl": {
+    "vocab_size": 250100,
+    "n_bytes": 1513030,
+    "n_tokens": 423616,
+    "n_chars": 1508067
+  },
+  "baichuan.cc100-nl": {
+    "vocab_size": 64000,
+    "n_bytes": 1513030,
+    "n_tokens": 574927,
+    "n_chars": 1508067
+  },
+  "baichuan2.cc100-nl": {
+    "vocab_size": 125696,
+    "n_bytes": 1513030,
+    "n_tokens": 540387,
+    "n_chars": 1508067
+  },
+  "bert_base_cased.cc100-nl": {
+    "vocab_size": 28996,
+    "n_bytes": 1513030,
+    "n_tokens": 630793,
+    "n_chars": 1508067
+  },
+  "bert_base_chinese.cc100-nl": {
+    "vocab_size": 21128,
+    "n_bytes": 1513030,
+    "n_tokens": 626052,
+    "n_chars": 1508067
+  },
+  "bert_base_uncased.cc100-nl": {
+    "vocab_size": 30522,
+    "n_bytes": 1513030,
+    "n_tokens": 574651,
+    "n_chars": 1508067
+  },
+  "bloom.cc100-nl": {
+    "vocab_size": 250680,
+    "n_bytes": 1513030,
+    "n_tokens": 488924,
+    "n_chars": 1508067
+  },
+  "byt5_small.cc100-nl": {
+    "vocab_size": 384,
+    "n_bytes": 1513030,
+    "n_tokens": 1523030,
+    "n_chars": 1508067
+  },
+  "character_glm_6b.cc100-nl": {
+    "vocab_size": 64789,
+    "n_bytes": 1513030,
+    "n_tokens": 559014,
+    "n_chars": 1508067
+  },
+  "chatglm2_6b.cc100-nl": {
+    "vocab_size": 64787,
+    "n_bytes": 1513030,
+    "n_tokens": 559017,
+    "n_chars": 1508067
+  },
+  "chatglm3_6b.cc100-nl": {
+    "vocab_size": 64796,
+    "n_bytes": 1513030,
+    "n_tokens": 559014,
+    "n_chars": 1508067
+  },
+  "chatglm_6b.cc100-nl": {
+    "vocab_size": 150344,
+    "n_bytes": 1513030,
+    "n_tokens": 533174,
+    "n_chars": 1508067
+  },
+  "chatyuan_large_v2.cc100-nl": {
+    "vocab_size": 32128,
+    "n_bytes": 1513030,
+    "n_tokens": 837963,
+    "n_chars": 1508067
+  },
+  "chinese_llama.cc100-nl": {
+    "vocab_size": 49953,
+    "n_bytes": 1513030,
+    "n_tokens": 488766,
+    "n_chars": 1508067
+  },
+  "chinese_llama2.cc100-nl": {
+    "vocab_size": 55296,
+    "n_bytes": 1513030,
+    "n_tokens": 495966,
+    "n_chars": 1508067
+  },
+  "code_davinci_002.cc100-nl": {
+    "vocab_size": 50281,
+    "n_bytes": 1513030,
+    "n_tokens": 559119,
+    "n_chars": 1508067
+  },
+  "crystal_coder.cc100-nl": {
+    "vocab_size": 32022,
+    "n_bytes": 1513030,
+    "n_tokens": 485966,
+    "n_chars": 1508067
+  },
+  "dbrx_instruct.cc100-nl": {
+    "vocab_size": 100280,
+    "n_bytes": 1513030,
+    "n_tokens": 449343,
+    "n_chars": 1508067
+  },
+  "deepseek_coder_33b_instruct.cc100-nl": {
+    "vocab_size": 32022,
+    "n_bytes": 1513030,
+    "n_tokens": 603966,
+    "n_chars": 1508067
+  },
+  "deepseek_llm_7b_base.cc100-nl": {
+    "vocab_size": 100015,
+    "n_bytes": 1513030,
+    "n_tokens": 536746,
+    "n_chars": 1508067
+  },
+  "dutch_llama_tokenizer.cc100-nl": {
+    "vocab_size": 32000,
+    "n_bytes": 1513030,
+    "n_tokens": 366481,
+    "n_chars": 1508067
+  },
+  "falcon_180b.cc100-nl": {
+    "vocab_size": 65024,
+    "n_bytes": 1513030,
+    "n_tokens": 438112,
+    "n_chars": 1508067
+  },
+  "falcon_7b.cc100-nl": {
+    "vocab_size": 65024,
+    "n_bytes": 1513030,
+    "n_tokens": 438112,
+    "n_chars": 1508067
+  },
+  "fastchat_t5_3b.cc100-nl": {
+    "vocab_size": 32110,
+    "n_bytes": 1513030,
+    "n_tokens": 933018,
+    "n_chars": 1508067
+  },
+  "flan_t5_base.cc100-nl": {
+    "vocab_size": 32100,
+    "n_bytes": 1513030,
+    "n_tokens": 696337,
+    "n_chars": 1508067
+  },
+  "gemma_7b.cc100-nl": {
+    "vocab_size": 256000,
+    "n_bytes": 1513030,
+    "n_tokens": 387522,
+    "n_chars": 1508067
+  },
+  "gpt2.cc100-nl": {
+    "vocab_size": 50257,
+    "n_bytes": 1513030,
+    "n_tokens": 559119,
+    "n_chars": 1508067
+  },
+  "gpt2_chinese.cc100-nl": {
+    "vocab_size": 21128,
+    "n_bytes": 1513030,
+    "n_tokens": 676651,
+    "n_chars": 1508067
+  },
+  "gpt_35_turbo.cc100-nl": {
+    "vocab_size": 100277,
+    "n_bytes": 1513030,
+    "n_tokens": 449343,
+    "n_chars": 1508067
+  },
+  "gpt_4.cc100-nl": {
+    "vocab_size": 100277,
+    "n_bytes": 1513030,
+    "n_tokens": 449343,
+    "n_chars": 1508067
+  },
+  "gpt_neox_japanese_2_7b.cc100-nl": {
+    "vocab_size": 32000,
+    "n_bytes": 1513030,
+    "n_tokens": 1509448,
+    "n_chars": 1508067
+  },
+  "gpt_nexo_20b.cc100-nl": {
+    "vocab_size": 50277,
+    "n_bytes": 1513030,
+    "n_tokens": 497728,
+    "n_chars": 1508067
+  },
+  "grok_1.cc100-nl": {
+    "vocab_size": 131072,
+    "n_bytes": 1513030,
+    "n_tokens": 457359,
+    "n_chars": 1508067
+  },
+  "gronlp-gpt2-small-dutch.cc100-nl": {
+    "vocab_size": 40000,
+    "n_bytes": 1513030,
+    "n_tokens": 332376,
+    "n_chars": 1508067
+  },
+  "internlm2_chat_7b.cc100-nl": {
+    "vocab_size": 92544,
+    "n_bytes": 1513030,
+    "n_tokens": 494821,
+    "n_chars": 1508067
+  },
+  "internlm2_math_7b.cc100-nl": {
+    "vocab_size": 92544,
+    "n_bytes": 1513030,
+    "n_tokens": 494821,
+    "n_chars": 1508067
+  },
+  "internlm_chat_7b.cc100-nl": {
+    "vocab_size": 103168,
+    "n_bytes": 1513030,
+    "n_tokens": 494108,
+    "n_chars": 1508067
+  },
+  "internlm_xcomposer_7b.cc100-nl": {
+    "vocab_size": 103168,
+    "n_bytes": 1513030,
+    "n_tokens": 494108,
+    "n_chars": 1508067
+  },
+  "jamba_v0_1.cc100-nl": {
+    "vocab_size": 65536,
+    "n_bytes": 1513030,
+    "n_tokens": 442176,
+    "n_chars": 1508067
+  },
+  "kplug.cc100-nl": {
+    "vocab_size": 10261,
+    "n_bytes": 1513030,
+    "n_tokens": 678131,
+    "n_chars": 1508067
+  },
+  "llama.cc100-nl": {
+    "vocab_size": 32000,
+    "n_bytes": 1513030,
+    "n_tokens": 495966,
+    "n_chars": 1508067
+  },
+  "llama2.cc100-nl": {
+    "vocab_size": 32001,
+    "n_bytes": 1513030,
+    "n_tokens": 495966,
+    "n_chars": 1508067
+  },
+  "llama3.cc100-nl": {
+    "vocab_size": 128256,
+    "n_bytes": 1513030,
+    "n_tokens": 448173,
+    "n_chars": 1508067
+  },
+  "llama_3_chinese_8b.cc100-nl": {
+    "vocab_size": 128256,
+    "n_bytes": 1513030,
+    "n_tokens": 458173,
+    "n_chars": 1508067
+  },
+  "mistral_7b.cc100-nl": {
+    "vocab_size": 32000,
+    "n_bytes": 1513030,
+    "n_tokens": 515884,
+    "n_chars": 1508067
+  },
+  "mixtral_8_7b.cc100-nl": {
+    "vocab_size": 32000,
+    "n_bytes": 1513030,
+    "n_tokens": 515884,
+    "n_chars": 1508067
+  },
+  "mobilebert_uncased.cc100-nl": {
+    "vocab_size": 30522,
+    "n_bytes": 1513030,
+    "n_tokens": 574651,
+    "n_chars": 1508067
+  },
+  "moss.cc100-nl": {
+    "vocab_size": 106072,
+    "n_bytes": 1513030,
+    "n_tokens": 557984,
+    "n_chars": 1508067
+  },
+  "mt5_large.cc100-nl": {
+    "vocab_size": 250100,
+    "n_bytes": 1513030,
+    "n_tokens": 423616,
+    "n_chars": 1508067
+  },
+  "dutch_llama_tokenizer.cc100-es": {
+    "vocab_size": 32000,
+    "n_bytes": 1664455,
+    "n_tokens": 610314,
+    "n_chars": 1630297
+  },
+  "gronlp-gpt2-small-dutch.cc100-es": {
+    "vocab_size": 40000,
+    "n_bytes": 1664455,
+    "n_tokens": 608465,
+    "n_chars": 1630297
+  },
+  "yhavinga-gpt2-medium-dutch.cc100-es": {
+    "vocab_size": 50257,
+    "n_bytes": 1664455,
+    "n_tokens": 605886,
+    "n_chars": 1630297
+  },
+  "yhavinga-ul2-large-en-nl.cc100-es": {
+    "vocab_size": 32128,
+    "n_bytes": 1664455,
+    "n_tokens": 686255,
+    "n_chars": 1630297
+  },
+  "olmo_7b.cc100-nl": {
+    "vocab_size": 50280,
+    "n_bytes": 1513030,
+    "n_tokens": 497728,
+    "n_chars": 1508067
+  },
+  "orion_14b_chat.cc100-nl": {
+    "vocab_size": 84608,
+    "n_bytes": 1513030,
+    "n_tokens": 599429,
+    "n_chars": 1508067
+  },
+  "phi_1.cc100-nl": {
+    "vocab_size": 50295,
+    "n_bytes": 1513030,
+    "n_tokens": 559124,
+    "n_chars": 1508067
+  },
+  "phi_2.cc100-nl": {
+    "vocab_size": 50295,
+    "n_bytes": 1513030,
+    "n_tokens": 559124,
+    "n_chars": 1508067
+  },
+  "phi_3_mini.cc100-nl": {
+    "vocab_size": 32011,
+    "n_bytes": 1513030,
+    "n_tokens": 495966,
+    "n_chars": 1508067
+  },
+  "pko_t5_large.cc100-nl": {
+    "vocab_size": 50358,
+    "n_bytes": 1513030,
+    "n_tokens": 1017288,
+    "n_chars": 1508067
+  },
+  "prompt_clue.cc100-nl": {
+    "vocab_size": 32128,
+    "n_bytes": 1513030,
+    "n_tokens": 837963,
+    "n_chars": 1508067
+  },
+  "qwen1_5_14b_chat.cc100-nl": {
+    "vocab_size": 151646,
+    "n_bytes": 1513030,
+    "n_tokens": 453342,
+    "n_chars": 1508067
+  },
+  "qwen_1_8b_chat.cc100-nl": {
+    "vocab_size": 151851,
+    "n_bytes": 1513030,
+    "n_tokens": 453342,
+    "n_chars": 1508067
+  },
+  "qwen_72b_chat.cc100-nl": {
+    "vocab_size": 151851,
+    "n_bytes": 1513030,
+    "n_tokens": 453342,
+    "n_chars": 1508067
+  },
+  "qwen_7b_chat.cc100-nl": {
+    "vocab_size": 151851,
+    "n_bytes": 1513030,
+    "n_tokens": 453342,
+    "n_chars": 1508067
+  },
+  "roberta_chinese_clue.cc100-nl": {
+    "vocab_size": 8021,
+    "n_bytes": 1513030,
+    "n_tokens": 821246,
+    "n_chars": 1508067
+  },
+  "skywork_13b_base.cc100-nl": {
+    "vocab_size": 65519,
+    "n_bytes": 1513030,
+    "n_tokens": 495958,
+    "n_chars": 1508067
+  },
+  "skywork_13b_math.cc100-nl": {
+    "vocab_size": 65519,
+    "n_bytes": 1513030,
+    "n_tokens": 495958,
+    "n_chars": 1508067
+  },
+  "solar_10_7b.cc100-nl": {
+    "vocab_size": 32000,
+    "n_bytes": 1513030,
+    "n_tokens": 515884,
+    "n_chars": 1508067
+  },
+  "starchat_alpha.cc100-nl": {
+    "vocab_size": 49156,
+    "n_bytes": 1513030,
+    "n_tokens": 532871,
+    "n_chars": 1508067
+  },
+  "switch_c_2048.cc100-nl": {
+    "vocab_size": 32100,
+    "n_bytes": 1513030,
+    "n_tokens": 696333,
+    "n_chars": 1508067
+  },
+  "t5_base.cc100-nl": {
+    "vocab_size": 32100,
+    "n_bytes": 1513030,
+    "n_tokens": 696333,
+    "n_chars": 1508067
+  },
+  "t5_large.cc100-nl": {
+    "vocab_size": 32100,
+    "n_bytes": 1513030,
+    "n_tokens": 696333,
+    "n_chars": 1508067
+  },
+  "t5_small.cc100-nl": {
+    "vocab_size": 32100,
+    "n_bytes": 1513030,
+    "n_tokens": 696333,
+    "n_chars": 1508067
+  },
+  "text_davinci_003.cc100-nl": {
+    "vocab_size": 50281,
+    "n_bytes": 1513030,
+    "n_tokens": 559119,
+    "n_chars": 1508067
+  },
+  "tigerbot_13b_chat_v2.cc100-nl": {
+    "vocab_size": 60515,
+    "n_bytes": 1513030,
+    "n_tokens": 486271,
+    "n_chars": 1508067
+  },
+  "tigerbot_70b_chat_v4_4k.cc100-nl": {
+    "vocab_size": 65110,
+    "n_bytes": 1513030,
+    "n_tokens": 486472,
+    "n_chars": 1508067
   }
 }

utils/compression_util.py CHANGED Viewed

@@ -20,7 +20,7 @@ from typing import List, Optional, Union, Literal
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
-common_corpuses = sorted(["cc100-en", "cc100-zh-Hans", "cc100-es", "cc100-fr", "cc100-de", "cc100-ko",
                           "cc100-fa", "cc100-ar", "cc100-ja"])
 VALID_CODES_CC100 = [
@@ -155,7 +155,7 @@ def tokenize_corpus(
 def get_compression_leaderboard(
-        corpuses: List[str] = ['cc100-en'],
         unit: str = "b_tokens/g_bytes",
         tokenizer_filter: Optional[str] = None,
         return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"

 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
+common_corpuses = sorted(["cc100-nl", "cc100-en", "cc100-es", "cc100-fr", "cc100-de", "cc100-ko",
                           "cc100-fa", "cc100-ar", "cc100-ja"])
 VALID_CODES_CC100 = [
 def get_compression_leaderboard(
+        corpuses: List[str] = ['cc100-nl'],
         unit: str = "b_tokens/g_bytes",
         tokenizer_filter: Optional[str] = None,
         return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"

vocab/wizardcoder_15b_v1/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from transformers import AutoTokenizer
-tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-15B-V1.0", trust_remote_code=True)

+#
+# from transformers import AutoTokenizer
+#
+# tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-15B-V1.0", trust_remote_code=True)