Add Llama tokenizer creation for Dutch, English, Code, Markdown and TeX.
Browse files
- app.py +1 -1
- app_compression.py +1 -1
- config.py +17 -5
- stats/compress_rate.json +504 -0
- utils/compression_util.py +2 -2
- vocab/wizardcoder_15b_v1/__init__.py +4 -4
app.py
CHANGED
@@ -8,7 +8,7 @@ from patcher.gr_interface import TabbedInterface
|
|
8 |
demo = TabbedInterface(
|
9 |
[tab_playground, tab_compression],
|
10 |
[" ⚔️ Playground", "🏆 Compression Leaderboard",], # 编码速度,解码速度,字符分类(zh、num等,支持正则),支持的语言,机构,。
|
11 |
-
title='
|
12 |
css="css/style.css"
|
13 |
)
|
14 |
|
|
|
8 |
demo = TabbedInterface(
|
9 |
[tab_playground, tab_compression],
|
10 |
[" ⚔️ Playground", "🏆 Compression Leaderboard",], # 编码速度,解码速度,字符分类(zh、num等,支持正则),支持的语言,机构,。
|
11 |
+
title='Tokenizer Arena ⚔️ (with some Dutch 🇳🇱🇧🇪🇸🇷 hacked in)',
|
12 |
css="css/style.css"
|
13 |
)
|
14 |
|
app_compression.py
CHANGED
@@ -59,7 +59,7 @@ with gr.Blocks() as demo:
|
|
59 |
with gr.Row():
|
60 |
compress_rate_corpus = gr.Dropdown(
|
61 |
common_corpuses, # , "code"
|
62 |
-
value=["cc100-
|
63 |
label="corpus",
|
64 |
multiselect=True
|
65 |
# info=""
|
|
|
59 |
with gr.Row():
|
60 |
compress_rate_corpus = gr.Dropdown(
|
61 |
common_corpuses, # , "code"
|
62 |
+
value=["cc100-nl", "cc100-en"],
|
63 |
label="corpus",
|
64 |
multiselect=True
|
65 |
# info=""
|
config.py
CHANGED
@@ -11,10 +11,22 @@ LAZY_IMPORT = True
|
|
11 |
# DEBUG: 设置环境变量 RUST_BACKTRACE=full
|
12 |
#
|
13 |
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
ラグビーワールドカップ2023フランス"""
|
19 |
default_tokenizer_type_1 = "llama3"
|
20 |
-
default_tokenizer_type_2 = "
|
|
|
|
11 |
# DEBUG: 设置环境变量 RUST_BACKTRACE=full
|
12 |
#
|
13 |
|
14 |
+
|
15 |
+
default_user_input = """“We apologize for any inconvenience and concern this may have caused to our customers and all concerned. We pray for the rest of the souls of those who lost their lives aboard the Japanese Coast Guard's equipment and extend our condolences to the bereaved families,” he said.
|
16 |
+
Steenvliegen of oevervliegen[2] (Plecoptera) zijn een kleine orde van gevleugelde insecten. Steenvliegen zijn te herkennen aan hun slanke, langwerpige lichaamsvorm en de doorzichtige vleugels die in rust plat op de rug worden gehouden.
|
17 |
+
def load_image_file(file, mode='RGB'):
|
18 |
+
im = PIL.Image.open(file)
|
19 |
+
if mode:
|
20 |
+
im = im.convert(mode)
|
21 |
+
return np.array(im)
|
22 |
+
\section{The expected number of intervening \mbox{H\,{\sc i}}
|
23 |
+
absorbers}\label{section:expected_number}
|
24 |
+
\begin{equation}\label{equation:expected_number}
|
25 |
+
\mu = \iint{f(N_{\rm HI},X)\,\mathrm{d}X\,\mathrm{d}N_{\rm HI}},
|
26 |
+
\end{equation}
|
27 |
+
Eerder noemde De Meij Oud en Nieuw "een soort oorlogsgebied". En hij heeft dan ook geen zin in de nieuwjaarsnacht. "Als je weet dat er collega's gewond gaan raken, kan je niet meer zeggen: het is mooi politiewerk en we gaan naar een spannende nacht. Het zijn gewoon risico's die je niet wil lopen."
|
28 |
+
华为发布Mate60手机
|
29 |
ラグビーワールドカップ2023フランス"""
|
30 |
default_tokenizer_type_1 = "llama3"
|
31 |
+
# default_tokenizer_type_2 = "internlm_chat_7b"
|
32 |
+
default_tokenizer_type_2 = "mistral_7b"
|
stats/compress_rate.json
CHANGED
@@ -4282,5 +4282,509 @@
|
|
4282 |
"n_bytes": 2633047,
|
4283 |
"n_tokens": 757405,
|
4284 |
"n_chars": 927311
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4285 |
}
|
4286 |
}
|
|
|
4282 |
"n_bytes": 2633047,
|
4283 |
"n_tokens": 757405,
|
4284 |
"n_chars": 927311
|
4285 |
+
},
|
4286 |
+
"dutch_llama_tokenizer.cc100-en": {
|
4287 |
+
"vocab_size": 32000,
|
4288 |
+
"n_bytes": 1124813,
|
4289 |
+
"n_tokens": 291975,
|
4290 |
+
"n_chars": 1121360
|
4291 |
+
},
|
4292 |
+
"gronlp-gpt2-small-dutch.cc100-en": {
|
4293 |
+
"vocab_size": 40000,
|
4294 |
+
"n_bytes": 1124813,
|
4295 |
+
"n_tokens": 361710,
|
4296 |
+
"n_chars": 1121360
|
4297 |
+
},
|
4298 |
+
"yhavinga-gpt2-medium-dutch.cc100-en": {
|
4299 |
+
"vocab_size": 50257,
|
4300 |
+
"n_bytes": 1124813,
|
4301 |
+
"n_tokens": 361847,
|
4302 |
+
"n_chars": 1121360
|
4303 |
+
},
|
4304 |
+
"yhavinga-ul2-large-en-nl.cc100-en": {
|
4305 |
+
"vocab_size": 32128,
|
4306 |
+
"n_bytes": 1124813,
|
4307 |
+
"n_tokens": 297641,
|
4308 |
+
"n_chars": 1121360
|
4309 |
+
},
|
4310 |
+
"dutch_llama_tokenizer.cc100-zh-Hans": {
|
4311 |
+
"vocab_size": 32000,
|
4312 |
+
"n_bytes": 2633047,
|
4313 |
+
"n_tokens": 2621293,
|
4314 |
+
"n_chars": 927311
|
4315 |
+
},
|
4316 |
+
"gronlp-gpt2-small-dutch.cc100-zh-Hans": {
|
4317 |
+
"vocab_size": 40000,
|
4318 |
+
"n_bytes": 2633047,
|
4319 |
+
"n_tokens": 1350320,
|
4320 |
+
"n_chars": 927311
|
4321 |
+
},
|
4322 |
+
"yhavinga-gpt2-medium-dutch.cc100-zh-Hans": {
|
4323 |
+
"vocab_size": 50257,
|
4324 |
+
"n_bytes": 2633047,
|
4325 |
+
"n_tokens": 2600872,
|
4326 |
+
"n_chars": 927311
|
4327 |
+
},
|
4328 |
+
"yhavinga-ul2-large-en-nl.cc100-zh-Hans": {
|
4329 |
+
"vocab_size": 32128,
|
4330 |
+
"n_bytes": 2633047,
|
4331 |
+
"n_tokens": 2519719,
|
4332 |
+
"n_chars": 927311
|
4333 |
+
},
|
4334 |
+
"aya_101.cc100-nl": {
|
4335 |
+
"vocab_size": 250100,
|
4336 |
+
"n_bytes": 1513030,
|
4337 |
+
"n_tokens": 423616,
|
4338 |
+
"n_chars": 1508067
|
4339 |
+
},
|
4340 |
+
"baichuan.cc100-nl": {
|
4341 |
+
"vocab_size": 64000,
|
4342 |
+
"n_bytes": 1513030,
|
4343 |
+
"n_tokens": 574927,
|
4344 |
+
"n_chars": 1508067
|
4345 |
+
},
|
4346 |
+
"baichuan2.cc100-nl": {
|
4347 |
+
"vocab_size": 125696,
|
4348 |
+
"n_bytes": 1513030,
|
4349 |
+
"n_tokens": 540387,
|
4350 |
+
"n_chars": 1508067
|
4351 |
+
},
|
4352 |
+
"bert_base_cased.cc100-nl": {
|
4353 |
+
"vocab_size": 28996,
|
4354 |
+
"n_bytes": 1513030,
|
4355 |
+
"n_tokens": 630793,
|
4356 |
+
"n_chars": 1508067
|
4357 |
+
},
|
4358 |
+
"bert_base_chinese.cc100-nl": {
|
4359 |
+
"vocab_size": 21128,
|
4360 |
+
"n_bytes": 1513030,
|
4361 |
+
"n_tokens": 626052,
|
4362 |
+
"n_chars": 1508067
|
4363 |
+
},
|
4364 |
+
"bert_base_uncased.cc100-nl": {
|
4365 |
+
"vocab_size": 30522,
|
4366 |
+
"n_bytes": 1513030,
|
4367 |
+
"n_tokens": 574651,
|
4368 |
+
"n_chars": 1508067
|
4369 |
+
},
|
4370 |
+
"bloom.cc100-nl": {
|
4371 |
+
"vocab_size": 250680,
|
4372 |
+
"n_bytes": 1513030,
|
4373 |
+
"n_tokens": 488924,
|
4374 |
+
"n_chars": 1508067
|
4375 |
+
},
|
4376 |
+
"byt5_small.cc100-nl": {
|
4377 |
+
"vocab_size": 384,
|
4378 |
+
"n_bytes": 1513030,
|
4379 |
+
"n_tokens": 1523030,
|
4380 |
+
"n_chars": 1508067
|
4381 |
+
},
|
4382 |
+
"character_glm_6b.cc100-nl": {
|
4383 |
+
"vocab_size": 64789,
|
4384 |
+
"n_bytes": 1513030,
|
4385 |
+
"n_tokens": 559014,
|
4386 |
+
"n_chars": 1508067
|
4387 |
+
},
|
4388 |
+
"chatglm2_6b.cc100-nl": {
|
4389 |
+
"vocab_size": 64787,
|
4390 |
+
"n_bytes": 1513030,
|
4391 |
+
"n_tokens": 559017,
|
4392 |
+
"n_chars": 1508067
|
4393 |
+
},
|
4394 |
+
"chatglm3_6b.cc100-nl": {
|
4395 |
+
"vocab_size": 64796,
|
4396 |
+
"n_bytes": 1513030,
|
4397 |
+
"n_tokens": 559014,
|
4398 |
+
"n_chars": 1508067
|
4399 |
+
},
|
4400 |
+
"chatglm_6b.cc100-nl": {
|
4401 |
+
"vocab_size": 150344,
|
4402 |
+
"n_bytes": 1513030,
|
4403 |
+
"n_tokens": 533174,
|
4404 |
+
"n_chars": 1508067
|
4405 |
+
},
|
4406 |
+
"chatyuan_large_v2.cc100-nl": {
|
4407 |
+
"vocab_size": 32128,
|
4408 |
+
"n_bytes": 1513030,
|
4409 |
+
"n_tokens": 837963,
|
4410 |
+
"n_chars": 1508067
|
4411 |
+
},
|
4412 |
+
"chinese_llama.cc100-nl": {
|
4413 |
+
"vocab_size": 49953,
|
4414 |
+
"n_bytes": 1513030,
|
4415 |
+
"n_tokens": 488766,
|
4416 |
+
"n_chars": 1508067
|
4417 |
+
},
|
4418 |
+
"chinese_llama2.cc100-nl": {
|
4419 |
+
"vocab_size": 55296,
|
4420 |
+
"n_bytes": 1513030,
|
4421 |
+
"n_tokens": 495966,
|
4422 |
+
"n_chars": 1508067
|
4423 |
+
},
|
4424 |
+
"code_davinci_002.cc100-nl": {
|
4425 |
+
"vocab_size": 50281,
|
4426 |
+
"n_bytes": 1513030,
|
4427 |
+
"n_tokens": 559119,
|
4428 |
+
"n_chars": 1508067
|
4429 |
+
},
|
4430 |
+
"crystal_coder.cc100-nl": {
|
4431 |
+
"vocab_size": 32022,
|
4432 |
+
"n_bytes": 1513030,
|
4433 |
+
"n_tokens": 485966,
|
4434 |
+
"n_chars": 1508067
|
4435 |
+
},
|
4436 |
+
"dbrx_instruct.cc100-nl": {
|
4437 |
+
"vocab_size": 100280,
|
4438 |
+
"n_bytes": 1513030,
|
4439 |
+
"n_tokens": 449343,
|
4440 |
+
"n_chars": 1508067
|
4441 |
+
},
|
4442 |
+
"deepseek_coder_33b_instruct.cc100-nl": {
|
4443 |
+
"vocab_size": 32022,
|
4444 |
+
"n_bytes": 1513030,
|
4445 |
+
"n_tokens": 603966,
|
4446 |
+
"n_chars": 1508067
|
4447 |
+
},
|
4448 |
+
"deepseek_llm_7b_base.cc100-nl": {
|
4449 |
+
"vocab_size": 100015,
|
4450 |
+
"n_bytes": 1513030,
|
4451 |
+
"n_tokens": 536746,
|
4452 |
+
"n_chars": 1508067
|
4453 |
+
},
|
4454 |
+
"dutch_llama_tokenizer.cc100-nl": {
|
4455 |
+
"vocab_size": 32000,
|
4456 |
+
"n_bytes": 1513030,
|
4457 |
+
"n_tokens": 366481,
|
4458 |
+
"n_chars": 1508067
|
4459 |
+
},
|
4460 |
+
"falcon_180b.cc100-nl": {
|
4461 |
+
"vocab_size": 65024,
|
4462 |
+
"n_bytes": 1513030,
|
4463 |
+
"n_tokens": 438112,
|
4464 |
+
"n_chars": 1508067
|
4465 |
+
},
|
4466 |
+
"falcon_7b.cc100-nl": {
|
4467 |
+
"vocab_size": 65024,
|
4468 |
+
"n_bytes": 1513030,
|
4469 |
+
"n_tokens": 438112,
|
4470 |
+
"n_chars": 1508067
|
4471 |
+
},
|
4472 |
+
"fastchat_t5_3b.cc100-nl": {
|
4473 |
+
"vocab_size": 32110,
|
4474 |
+
"n_bytes": 1513030,
|
4475 |
+
"n_tokens": 933018,
|
4476 |
+
"n_chars": 1508067
|
4477 |
+
},
|
4478 |
+
"flan_t5_base.cc100-nl": {
|
4479 |
+
"vocab_size": 32100,
|
4480 |
+
"n_bytes": 1513030,
|
4481 |
+
"n_tokens": 696337,
|
4482 |
+
"n_chars": 1508067
|
4483 |
+
},
|
4484 |
+
"gemma_7b.cc100-nl": {
|
4485 |
+
"vocab_size": 256000,
|
4486 |
+
"n_bytes": 1513030,
|
4487 |
+
"n_tokens": 387522,
|
4488 |
+
"n_chars": 1508067
|
4489 |
+
},
|
4490 |
+
"gpt2.cc100-nl": {
|
4491 |
+
"vocab_size": 50257,
|
4492 |
+
"n_bytes": 1513030,
|
4493 |
+
"n_tokens": 559119,
|
4494 |
+
"n_chars": 1508067
|
4495 |
+
},
|
4496 |
+
"gpt2_chinese.cc100-nl": {
|
4497 |
+
"vocab_size": 21128,
|
4498 |
+
"n_bytes": 1513030,
|
4499 |
+
"n_tokens": 676651,
|
4500 |
+
"n_chars": 1508067
|
4501 |
+
},
|
4502 |
+
"gpt_35_turbo.cc100-nl": {
|
4503 |
+
"vocab_size": 100277,
|
4504 |
+
"n_bytes": 1513030,
|
4505 |
+
"n_tokens": 449343,
|
4506 |
+
"n_chars": 1508067
|
4507 |
+
},
|
4508 |
+
"gpt_4.cc100-nl": {
|
4509 |
+
"vocab_size": 100277,
|
4510 |
+
"n_bytes": 1513030,
|
4511 |
+
"n_tokens": 449343,
|
4512 |
+
"n_chars": 1508067
|
4513 |
+
},
|
4514 |
+
"gpt_neox_japanese_2_7b.cc100-nl": {
|
4515 |
+
"vocab_size": 32000,
|
4516 |
+
"n_bytes": 1513030,
|
4517 |
+
"n_tokens": 1509448,
|
4518 |
+
"n_chars": 1508067
|
4519 |
+
},
|
4520 |
+
"gpt_nexo_20b.cc100-nl": {
|
4521 |
+
"vocab_size": 50277,
|
4522 |
+
"n_bytes": 1513030,
|
4523 |
+
"n_tokens": 497728,
|
4524 |
+
"n_chars": 1508067
|
4525 |
+
},
|
4526 |
+
"grok_1.cc100-nl": {
|
4527 |
+
"vocab_size": 131072,
|
4528 |
+
"n_bytes": 1513030,
|
4529 |
+
"n_tokens": 457359,
|
4530 |
+
"n_chars": 1508067
|
4531 |
+
},
|
4532 |
+
"gronlp-gpt2-small-dutch.cc100-nl": {
|
4533 |
+
"vocab_size": 40000,
|
4534 |
+
"n_bytes": 1513030,
|
4535 |
+
"n_tokens": 332376,
|
4536 |
+
"n_chars": 1508067
|
4537 |
+
},
|
4538 |
+
"internlm2_chat_7b.cc100-nl": {
|
4539 |
+
"vocab_size": 92544,
|
4540 |
+
"n_bytes": 1513030,
|
4541 |
+
"n_tokens": 494821,
|
4542 |
+
"n_chars": 1508067
|
4543 |
+
},
|
4544 |
+
"internlm2_math_7b.cc100-nl": {
|
4545 |
+
"vocab_size": 92544,
|
4546 |
+
"n_bytes": 1513030,
|
4547 |
+
"n_tokens": 494821,
|
4548 |
+
"n_chars": 1508067
|
4549 |
+
},
|
4550 |
+
"internlm_chat_7b.cc100-nl": {
|
4551 |
+
"vocab_size": 103168,
|
4552 |
+
"n_bytes": 1513030,
|
4553 |
+
"n_tokens": 494108,
|
4554 |
+
"n_chars": 1508067
|
4555 |
+
},
|
4556 |
+
"internlm_xcomposer_7b.cc100-nl": {
|
4557 |
+
"vocab_size": 103168,
|
4558 |
+
"n_bytes": 1513030,
|
4559 |
+
"n_tokens": 494108,
|
4560 |
+
"n_chars": 1508067
|
4561 |
+
},
|
4562 |
+
"jamba_v0_1.cc100-nl": {
|
4563 |
+
"vocab_size": 65536,
|
4564 |
+
"n_bytes": 1513030,
|
4565 |
+
"n_tokens": 442176,
|
4566 |
+
"n_chars": 1508067
|
4567 |
+
},
|
4568 |
+
"kplug.cc100-nl": {
|
4569 |
+
"vocab_size": 10261,
|
4570 |
+
"n_bytes": 1513030,
|
4571 |
+
"n_tokens": 678131,
|
4572 |
+
"n_chars": 1508067
|
4573 |
+
},
|
4574 |
+
"llama.cc100-nl": {
|
4575 |
+
"vocab_size": 32000,
|
4576 |
+
"n_bytes": 1513030,
|
4577 |
+
"n_tokens": 495966,
|
4578 |
+
"n_chars": 1508067
|
4579 |
+
},
|
4580 |
+
"llama2.cc100-nl": {
|
4581 |
+
"vocab_size": 32001,
|
4582 |
+
"n_bytes": 1513030,
|
4583 |
+
"n_tokens": 495966,
|
4584 |
+
"n_chars": 1508067
|
4585 |
+
},
|
4586 |
+
"llama3.cc100-nl": {
|
4587 |
+
"vocab_size": 128256,
|
4588 |
+
"n_bytes": 1513030,
|
4589 |
+
"n_tokens": 448173,
|
4590 |
+
"n_chars": 1508067
|
4591 |
+
},
|
4592 |
+
"llama_3_chinese_8b.cc100-nl": {
|
4593 |
+
"vocab_size": 128256,
|
4594 |
+
"n_bytes": 1513030,
|
4595 |
+
"n_tokens": 458173,
|
4596 |
+
"n_chars": 1508067
|
4597 |
+
},
|
4598 |
+
"mistral_7b.cc100-nl": {
|
4599 |
+
"vocab_size": 32000,
|
4600 |
+
"n_bytes": 1513030,
|
4601 |
+
"n_tokens": 515884,
|
4602 |
+
"n_chars": 1508067
|
4603 |
+
},
|
4604 |
+
"mixtral_8_7b.cc100-nl": {
|
4605 |
+
"vocab_size": 32000,
|
4606 |
+
"n_bytes": 1513030,
|
4607 |
+
"n_tokens": 515884,
|
4608 |
+
"n_chars": 1508067
|
4609 |
+
},
|
4610 |
+
"mobilebert_uncased.cc100-nl": {
|
4611 |
+
"vocab_size": 30522,
|
4612 |
+
"n_bytes": 1513030,
|
4613 |
+
"n_tokens": 574651,
|
4614 |
+
"n_chars": 1508067
|
4615 |
+
},
|
4616 |
+
"moss.cc100-nl": {
|
4617 |
+
"vocab_size": 106072,
|
4618 |
+
"n_bytes": 1513030,
|
4619 |
+
"n_tokens": 557984,
|
4620 |
+
"n_chars": 1508067
|
4621 |
+
},
|
4622 |
+
"mt5_large.cc100-nl": {
|
4623 |
+
"vocab_size": 250100,
|
4624 |
+
"n_bytes": 1513030,
|
4625 |
+
"n_tokens": 423616,
|
4626 |
+
"n_chars": 1508067
|
4627 |
+
},
|
4628 |
+
"dutch_llama_tokenizer.cc100-es": {
|
4629 |
+
"vocab_size": 32000,
|
4630 |
+
"n_bytes": 1664455,
|
4631 |
+
"n_tokens": 610314,
|
4632 |
+
"n_chars": 1630297
|
4633 |
+
},
|
4634 |
+
"gronlp-gpt2-small-dutch.cc100-es": {
|
4635 |
+
"vocab_size": 40000,
|
4636 |
+
"n_bytes": 1664455,
|
4637 |
+
"n_tokens": 608465,
|
4638 |
+
"n_chars": 1630297
|
4639 |
+
},
|
4640 |
+
"yhavinga-gpt2-medium-dutch.cc100-es": {
|
4641 |
+
"vocab_size": 50257,
|
4642 |
+
"n_bytes": 1664455,
|
4643 |
+
"n_tokens": 605886,
|
4644 |
+
"n_chars": 1630297
|
4645 |
+
},
|
4646 |
+
"yhavinga-ul2-large-en-nl.cc100-es": {
|
4647 |
+
"vocab_size": 32128,
|
4648 |
+
"n_bytes": 1664455,
|
4649 |
+
"n_tokens": 686255,
|
4650 |
+
"n_chars": 1630297
|
4651 |
+
},
|
4652 |
+
"olmo_7b.cc100-nl": {
|
4653 |
+
"vocab_size": 50280,
|
4654 |
+
"n_bytes": 1513030,
|
4655 |
+
"n_tokens": 497728,
|
4656 |
+
"n_chars": 1508067
|
4657 |
+
},
|
4658 |
+
"orion_14b_chat.cc100-nl": {
|
4659 |
+
"vocab_size": 84608,
|
4660 |
+
"n_bytes": 1513030,
|
4661 |
+
"n_tokens": 599429,
|
4662 |
+
"n_chars": 1508067
|
4663 |
+
},
|
4664 |
+
"phi_1.cc100-nl": {
|
4665 |
+
"vocab_size": 50295,
|
4666 |
+
"n_bytes": 1513030,
|
4667 |
+
"n_tokens": 559124,
|
4668 |
+
"n_chars": 1508067
|
4669 |
+
},
|
4670 |
+
"phi_2.cc100-nl": {
|
4671 |
+
"vocab_size": 50295,
|
4672 |
+
"n_bytes": 1513030,
|
4673 |
+
"n_tokens": 559124,
|
4674 |
+
"n_chars": 1508067
|
4675 |
+
},
|
4676 |
+
"phi_3_mini.cc100-nl": {
|
4677 |
+
"vocab_size": 32011,
|
4678 |
+
"n_bytes": 1513030,
|
4679 |
+
"n_tokens": 495966,
|
4680 |
+
"n_chars": 1508067
|
4681 |
+
},
|
4682 |
+
"pko_t5_large.cc100-nl": {
|
4683 |
+
"vocab_size": 50358,
|
4684 |
+
"n_bytes": 1513030,
|
4685 |
+
"n_tokens": 1017288,
|
4686 |
+
"n_chars": 1508067
|
4687 |
+
},
|
4688 |
+
"prompt_clue.cc100-nl": {
|
4689 |
+
"vocab_size": 32128,
|
4690 |
+
"n_bytes": 1513030,
|
4691 |
+
"n_tokens": 837963,
|
4692 |
+
"n_chars": 1508067
|
4693 |
+
},
|
4694 |
+
"qwen1_5_14b_chat.cc100-nl": {
|
4695 |
+
"vocab_size": 151646,
|
4696 |
+
"n_bytes": 1513030,
|
4697 |
+
"n_tokens": 453342,
|
4698 |
+
"n_chars": 1508067
|
4699 |
+
},
|
4700 |
+
"qwen_1_8b_chat.cc100-nl": {
|
4701 |
+
"vocab_size": 151851,
|
4702 |
+
"n_bytes": 1513030,
|
4703 |
+
"n_tokens": 453342,
|
4704 |
+
"n_chars": 1508067
|
4705 |
+
},
|
4706 |
+
"qwen_72b_chat.cc100-nl": {
|
4707 |
+
"vocab_size": 151851,
|
4708 |
+
"n_bytes": 1513030,
|
4709 |
+
"n_tokens": 453342,
|
4710 |
+
"n_chars": 1508067
|
4711 |
+
},
|
4712 |
+
"qwen_7b_chat.cc100-nl": {
|
4713 |
+
"vocab_size": 151851,
|
4714 |
+
"n_bytes": 1513030,
|
4715 |
+
"n_tokens": 453342,
|
4716 |
+
"n_chars": 1508067
|
4717 |
+
},
|
4718 |
+
"roberta_chinese_clue.cc100-nl": {
|
4719 |
+
"vocab_size": 8021,
|
4720 |
+
"n_bytes": 1513030,
|
4721 |
+
"n_tokens": 821246,
|
4722 |
+
"n_chars": 1508067
|
4723 |
+
},
|
4724 |
+
"skywork_13b_base.cc100-nl": {
|
4725 |
+
"vocab_size": 65519,
|
4726 |
+
"n_bytes": 1513030,
|
4727 |
+
"n_tokens": 495958,
|
4728 |
+
"n_chars": 1508067
|
4729 |
+
},
|
4730 |
+
"skywork_13b_math.cc100-nl": {
|
4731 |
+
"vocab_size": 65519,
|
4732 |
+
"n_bytes": 1513030,
|
4733 |
+
"n_tokens": 495958,
|
4734 |
+
"n_chars": 1508067
|
4735 |
+
},
|
4736 |
+
"solar_10_7b.cc100-nl": {
|
4737 |
+
"vocab_size": 32000,
|
4738 |
+
"n_bytes": 1513030,
|
4739 |
+
"n_tokens": 515884,
|
4740 |
+
"n_chars": 1508067
|
4741 |
+
},
|
4742 |
+
"starchat_alpha.cc100-nl": {
|
4743 |
+
"vocab_size": 49156,
|
4744 |
+
"n_bytes": 1513030,
|
4745 |
+
"n_tokens": 532871,
|
4746 |
+
"n_chars": 1508067
|
4747 |
+
},
|
4748 |
+
"switch_c_2048.cc100-nl": {
|
4749 |
+
"vocab_size": 32100,
|
4750 |
+
"n_bytes": 1513030,
|
4751 |
+
"n_tokens": 696333,
|
4752 |
+
"n_chars": 1508067
|
4753 |
+
},
|
4754 |
+
"t5_base.cc100-nl": {
|
4755 |
+
"vocab_size": 32100,
|
4756 |
+
"n_bytes": 1513030,
|
4757 |
+
"n_tokens": 696333,
|
4758 |
+
"n_chars": 1508067
|
4759 |
+
},
|
4760 |
+
"t5_large.cc100-nl": {
|
4761 |
+
"vocab_size": 32100,
|
4762 |
+
"n_bytes": 1513030,
|
4763 |
+
"n_tokens": 696333,
|
4764 |
+
"n_chars": 1508067
|
4765 |
+
},
|
4766 |
+
"t5_small.cc100-nl": {
|
4767 |
+
"vocab_size": 32100,
|
4768 |
+
"n_bytes": 1513030,
|
4769 |
+
"n_tokens": 696333,
|
4770 |
+
"n_chars": 1508067
|
4771 |
+
},
|
4772 |
+
"text_davinci_003.cc100-nl": {
|
4773 |
+
"vocab_size": 50281,
|
4774 |
+
"n_bytes": 1513030,
|
4775 |
+
"n_tokens": 559119,
|
4776 |
+
"n_chars": 1508067
|
4777 |
+
},
|
4778 |
+
"tigerbot_13b_chat_v2.cc100-nl": {
|
4779 |
+
"vocab_size": 60515,
|
4780 |
+
"n_bytes": 1513030,
|
4781 |
+
"n_tokens": 486271,
|
4782 |
+
"n_chars": 1508067
|
4783 |
+
},
|
4784 |
+
"tigerbot_70b_chat_v4_4k.cc100-nl": {
|
4785 |
+
"vocab_size": 65110,
|
4786 |
+
"n_bytes": 1513030,
|
4787 |
+
"n_tokens": 486472,
|
4788 |
+
"n_chars": 1508067
|
4789 |
}
|
4790 |
}
|
utils/compression_util.py
CHANGED
@@ -20,7 +20,7 @@ from typing import List, Optional, Union, Literal
|
|
20 |
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
21 |
|
22 |
common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
|
23 |
-
common_corpuses = sorted(["cc100-
|
24 |
"cc100-fa", "cc100-ar", "cc100-ja"])
|
25 |
|
26 |
VALID_CODES_CC100 = [
|
@@ -155,7 +155,7 @@ def tokenize_corpus(
|
|
155 |
|
156 |
|
157 |
def get_compression_leaderboard(
|
158 |
-
corpuses: List[str] = ['cc100-
|
159 |
unit: str = "b_tokens/g_bytes",
|
160 |
tokenizer_filter: Optional[str] = None,
|
161 |
return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
|
|
|
20 |
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
21 |
|
22 |
common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
|
23 |
+
common_corpuses = sorted(["cc100-nl", "cc100-en", "cc100-es", "cc100-fr", "cc100-de", "cc100-ko",
|
24 |
"cc100-fa", "cc100-ar", "cc100-ja"])
|
25 |
|
26 |
VALID_CODES_CC100 = [
|
|
|
155 |
|
156 |
|
157 |
def get_compression_leaderboard(
|
158 |
+
corpuses: List[str] = ['cc100-nl'],
|
159 |
unit: str = "b_tokens/g_bytes",
|
160 |
tokenizer_filter: Optional[str] = None,
|
161 |
return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
|
vocab/wizardcoder_15b_v1/__init__.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
|
2 |
-
from transformers import AutoTokenizer
|
3 |
-
|
4 |
-
tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-15B-V1.0", trust_remote_code=True)
|
|
|
1 |
+
#
|
2 |
+
# from transformers import AutoTokenizer
|
3 |
+
#
|
4 |
+
# tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-15B-V1.0", trust_remote_code=True)
|