Hack some Dutch tokenizers into it
Browse files
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: Tokenizer Arena
|
3 |
emoji: ⚡
|
4 |
colorFrom: red
|
5 |
colorTo: gray
|
|
|
1 |
---
|
2 |
+
title: Dutch Tokenizer Arena
|
3 |
emoji: ⚡
|
4 |
colorFrom: red
|
5 |
colorTo: gray
|
util.py
CHANGED
@@ -125,6 +125,29 @@ def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
|
|
125 |
return overlap_token_size, overlap_token_size
|
126 |
|
127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
def on_load(url_params, request: gr.Request):
|
130 |
"""
|
|
|
125 |
return overlap_token_size, overlap_token_size
|
126 |
|
127 |
|
128 |
+
default_user_input = """
|
129 |
+
“We apologize for any inconvenience and concern this may have caused to our customers and all concerned. We pray for the rest of the souls of those who lost their lives aboard the Japanese Coast Guard's equipment and extend our condolences to the bereaved families,” he said.
|
130 |
+
|
131 |
+
Steenvliegen of oevervliegen[2] (Plecoptera) zijn een kleine orde van gevleugelde insecten. Steenvliegen zijn te herkennen aan hun slanke, langwerpige lichaamsvorm en de doorzichtige vleugels die in rust plat op de rug worden gehouden.
|
132 |
+
|
133 |
+
def load_image_file(file, mode='RGB'):
|
134 |
+
im = PIL.Image.open(file)
|
135 |
+
if mode:
|
136 |
+
im = im.convert(mode)
|
137 |
+
return np.array(im)
|
138 |
+
|
139 |
+
\section{The expected number of intervening \mbox{H\,{\sc i}}
|
140 |
+
absorbers}\label{section:expected_number}
|
141 |
+
\begin{equation}\label{equation:expected_number}
|
142 |
+
\mu = \iint{f(N_{\rm HI},X)\,\mathrm{d}X\,\mathrm{d}N_{\rm HI}},
|
143 |
+
\end{equation}
|
144 |
+
|
145 |
+
Eerder noemde De Meij Oud en Nieuw "een soort oorlogsgebied". En hij heeft dan ook geen zin in de nieuwjaarsnacht. "Als je weet dat er collega's gewond gaan raken, kan je niet meer zeggen: het is mooi politiewerk en we gaan naar een spannende nacht. Het zijn gewoon risico's die je niet wil lopen."
|
146 |
+
华为发布Mate60手机
|
147 |
+
ラグビーワールドカップ2023フランス"""
|
148 |
+
default_tokenizer_type_1 = "dutch_llama_tokenizer"
|
149 |
+
# default_tokenizer_type_2 = "internlm_chat_7b"
|
150 |
+
default_tokenizer_type_2 = "mistral_7b"
|
151 |
|
152 |
def on_load(url_params, request: gr.Request):
|
153 |
"""
|
vocab/__init__.py
CHANGED
@@ -94,6 +94,11 @@ all_tokenizers = [
|
|
94 |
("qwen1_5_14b_chat", "", "GPT2Tokenizer",), # 15万,速度有点慢
|
95 |
("starchat_alpha", "", "GPT2Tokenizer",),
|
96 |
|
|
|
|
|
|
|
|
|
|
|
97 |
####### google/sentencepiece tokenizer:
|
98 |
# T5 llama internlm
|
99 |
("t5_small", "", "sentencepiece"),
|
|
|
94 |
("qwen1_5_14b_chat", "", "GPT2Tokenizer",), # 15万,速度有点慢
|
95 |
("starchat_alpha", "", "GPT2Tokenizer",),
|
96 |
|
97 |
+
("gronlp-gpt2-small-dutch", "", "GPT2Tokenizer",),
|
98 |
+
("yhavinga-gpt2-medium-dutch", "", "GPT2Tokenizer",),
|
99 |
+
("dutch_llama_tokenizer", ),
|
100 |
+
("yhavinga-ul2-large-en-nl", "", "sentencepiece"),
|
101 |
+
|
102 |
####### google/sentencepiece tokenizer:
|
103 |
# T5 llama internlm
|
104 |
("t5_small", "", "sentencepiece"),
|
vocab/dutch_llama_tokenizer/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
"""Registers the Dutch LLaMA tokenizer for the arena."""

from transformers import AutoTokenizer

# Hugging Face Hub repository that hosts the tokenizer files.
_REPO_ID = "yhavinga/dutch-llama-tokenizer"

# NOTE(review): trust_remote_code=True allows executing code shipped with the
# repo; it only has an effect if the repo defines a custom tokenizer class —
# confirm it is actually required for this repo.
tokenizer = AutoTokenizer.from_pretrained(_REPO_ID, trust_remote_code=True)
|
vocab/gronlp-gpt2-small-dutch/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
"""Registers the GroNLP Dutch GPT-2 (small) tokenizer for the arena."""

from transformers import AutoTokenizer

# Hugging Face Hub repository that hosts the tokenizer files.
_REPO_ID = "GroNLP/gpt2-small-dutch"

# NOTE(review): trust_remote_code=True allows executing code shipped with the
# repo; it only has an effect if the repo defines a custom tokenizer class —
# confirm it is actually required for this repo.
tokenizer = AutoTokenizer.from_pretrained(_REPO_ID, trust_remote_code=True)
|
vocab/yhavinga-gpt2-medium-dutch/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
"""Registers the yhavinga Dutch GPT-2 (medium) tokenizer for the arena."""

from transformers import AutoTokenizer

# Hugging Face Hub repository that hosts the tokenizer files.
_REPO_ID = "yhavinga/gpt2-medium-dutch"

# NOTE(review): trust_remote_code=True allows executing code shipped with the
# repo; it only has an effect if the repo defines a custom tokenizer class —
# confirm it is actually required for this repo.
tokenizer = AutoTokenizer.from_pretrained(_REPO_ID, trust_remote_code=True)
|
vocab/yhavinga-ul2-large-en-nl/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
"""Registers the yhavinga UL2 English-Dutch tokenizer for the arena."""

from transformers import AutoTokenizer

# Hugging Face Hub repository that hosts the tokenizer files.
_REPO_ID = "yhavinga/ul2-large-en-nl"

# use_fast=False forces the slow (sentencepiece-backed) tokenizer class,
# matching the original behavior for this model.
# NOTE(review): trust_remote_code=True allows executing code shipped with the
# repo — confirm it is actually required here.
tokenizer = AutoTokenizer.from_pretrained(
    _REPO_ID, trust_remote_code=True, use_fast=False
)
|