Hack some Dutch tokenizers into it
Browse files
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: Tokenizer Arena
|
3 |
emoji: ⚡
|
4 |
colorFrom: red
|
5 |
colorTo: gray
|
|
|
1 |
---
|
2 |
+
title: Dutch Tokenizer Arena
|
3 |
emoji: ⚡
|
4 |
colorFrom: red
|
5 |
colorTo: gray
|
util.py
CHANGED
@@ -125,6 +125,29 @@ def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
|
|
125 |
return overlap_token_size, overlap_token_size
|
126 |
|
127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
def on_load(url_params, request: gr.Request):
|
130 |
"""
|
|
|
125 |
return overlap_token_size, overlap_token_size
|
126 |
|
127 |
|
128 |
+
default_user_input = """
|
129 |
+
“We apologize for any inconvenience and concern this may have caused to our customers and all concerned. We pray for the rest of the souls of those who lost their lives aboard the Japanese Coast Guard's equipment and extend our condolences to the bereaved families,” he said.
|
130 |
+
|
131 |
+
Steenvliegen of oevervliegen[2] (Plecoptera) zijn een kleine orde van gevleugelde insecten. Steenvliegen zijn te herkennen aan hun slanke, langwerpige lichaamsvorm en de doorzichtige vleugels die in rust plat op de rug worden gehouden.
|
132 |
+
|
133 |
+
def load_image_file(file, mode='RGB'):
|
134 |
+
im = PIL.Image.open(file)
|
135 |
+
if mode:
|
136 |
+
im = im.convert(mode)
|
137 |
+
return np.array(im)
|
138 |
+
|
139 |
+
\section{The expected number of intervening \mbox{H\,{\sc i}}
|
140 |
+
absorbers}\label{section:expected_number}
|
141 |
+
\begin{equation}\label{equation:expected_number}
|
142 |
+
\mu = \iint{f(N_{\rm HI},X)\,\mathrm{d}X\,\mathrm{d}N_{\rm HI}},
|
143 |
+
\end{equation}
|
144 |
+
|
145 |
+
Eerder noemde De Meij Oud en Nieuw "een soort oorlogsgebied". En hij heeft dan ook geen zin in de nieuwjaarsnacht. "Als je weet dat er collega's gewond gaan raken, kan je niet meer zeggen: het is mooi politiewerk en we gaan naar een spannende nacht. Het zijn gewoon risico's die je niet wil lopen."
|
146 |
+
华为发布Mate60手机
|
147 |
+
ラグビーワールドカップ2023フランス"""
|
148 |
+
default_tokenizer_type_1 = "dutch_llama_tokenizer"
|
149 |
+
# default_tokenizer_type_2 = "internlm_chat_7b"
|
150 |
+
default_tokenizer_type_2 = "mistral_7b"
|
151 |
|
152 |
def on_load(url_params, request: gr.Request):
|
153 |
"""
|
vocab/__init__.py
CHANGED
@@ -94,6 +94,11 @@ all_tokenizers = [
|
|
94 |
("qwen1_5_14b_chat", "", "GPT2Tokenizer",), # 15万,速度有点慢
|
95 |
("starchat_alpha", "", "GPT2Tokenizer",),
|
96 |
|
|
|
|
|
|
|
|
|
|
|
97 |
####### google/sentencepiece tokenizer:
|
98 |
# T5 llama internlm
|
99 |
("t5_small", "", "sentencepiece"),
|
|
|
94 |
("qwen1_5_14b_chat", "", "GPT2Tokenizer",), # 15万,速度有点慢
|
95 |
("starchat_alpha", "", "GPT2Tokenizer",),
|
96 |
|
97 |
+
("gronlp-gpt2-small-dutch", "", "GPT2Tokenizer",),
|
98 |
+
("yhavinga-gpt2-medium-dutch", "", "GPT2Tokenizer",),
|
99 |
+
("dutch_llama_tokenizer", ),
|
100 |
+
("yhavinga-ul2-large-en-nl", "", "sentencepiece"),
|
101 |
+
|
102 |
####### google/sentencepiece tokenizer:
|
103 |
# T5 llama internlm
|
104 |
("t5_small", "", "sentencepiece"),
|
vocab/dutch_llama_tokenizer/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
"""Registers the Dutch LLaMA tokenizer for the arena."""

from transformers import AutoTokenizer

# Hugging Face Hub repository that hosts the tokenizer files.
_REPO_ID = "yhavinga/dutch-llama-tokenizer"

# NOTE(review): trust_remote_code=True allows executing code shipped with the
# repo; it only has an effect if the repo defines a custom tokenizer class —
# confirm it is actually required for this repo.
tokenizer = AutoTokenizer.from_pretrained(_REPO_ID, trust_remote_code=True)
|
vocab/gronlp-gpt2-small-dutch/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
"""Registers the GroNLP Dutch GPT-2 (small) tokenizer for the arena."""

from transformers import AutoTokenizer

# Hugging Face Hub repository that hosts the tokenizer files.
_REPO_ID = "GroNLP/gpt2-small-dutch"

# NOTE(review): trust_remote_code=True allows executing code shipped with the
# repo; it only has an effect if the repo defines a custom tokenizer class —
# confirm it is actually required for this repo.
tokenizer = AutoTokenizer.from_pretrained(_REPO_ID, trust_remote_code=True)
|
vocab/yhavinga-gpt2-medium-dutch/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
"""Registers the yhavinga Dutch GPT-2 (medium) tokenizer for the arena."""

from transformers import AutoTokenizer

# Hugging Face Hub repository that hosts the tokenizer files.
_REPO_ID = "yhavinga/gpt2-medium-dutch"

# NOTE(review): trust_remote_code=True allows executing code shipped with the
# repo; it only has an effect if the repo defines a custom tokenizer class —
# confirm it is actually required for this repo.
tokenizer = AutoTokenizer.from_pretrained(_REPO_ID, trust_remote_code=True)
|
vocab/yhavinga-ul2-large-en-nl/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
"""Registers the yhavinga UL2 English-Dutch tokenizer for the arena."""

from transformers import AutoTokenizer

# Hugging Face Hub repository that hosts the tokenizer files.
_REPO_ID = "yhavinga/ul2-large-en-nl"

# use_fast=False forces the slow (sentencepiece-backed) tokenizer class,
# matching the original behavior for this model.
# NOTE(review): trust_remote_code=True allows executing code shipped with the
# repo — confirm it is actually required here.
tokenizer = AutoTokenizer.from_pretrained(
    _REPO_ID, trust_remote_code=True, use_fast=False
)
|