add tokenizer
Browse files- tokenizer_config.json +1 -1
- vocab.txt +9 -8
tokenizer_config.json
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
"do_basic_tokenize": true,
|
4 |
"do_lower_case": true,
|
5 |
"mask_token": "[MASK]",
|
6 |
-
"name_or_path": "
|
7 |
"never_split": null,
|
8 |
"pad_token": "[PAD]",
|
9 |
"sep_token": "[SEP]",
|
|
|
3 |
"do_basic_tokenize": true,
|
4 |
"do_lower_case": true,
|
5 |
"mask_token": "[MASK]",
|
6 |
+
"name_or_path": "vocab-bart-base-cantonese.txt",
|
7 |
"never_split": null,
|
8 |
"pad_token": "[PAD]",
|
9 |
"sep_token": "[SEP]",
|
vocab.txt
CHANGED
@@ -21,6 +21,7 @@
|
|
21 |
乪
|
22 |
乸
|
23 |
亍
|
|
|
24 |
佮
|
25 |
侲
|
26 |
冚
|
@@ -28,6 +29,7 @@
|
|
28 |
剒
|
29 |
劖
|
30 |
卼
|
|
|
31 |
厹
|
32 |
厾
|
33 |
吔
|
@@ -80,6 +82,7 @@
|
|
80 |
埞
|
81 |
埲
|
82 |
奀
|
|
|
83 |
嬡
|
84 |
嬲
|
85 |
孭
|
@@ -95,9 +98,6 @@
|
|
95 |
忳
|
96 |
愎
|
97 |
愩
|
98 |
-
愾
|
99 |
-
戇
|
100 |
-
戙
|
101 |
[UNK]
|
102 |
[CLS]
|
103 |
[SEP]
|
@@ -12386,12 +12386,16 @@ fishbase
|
|
12386 |
##🔥
|
12387 |
##😂
|
12388 |
##😎
|
|
|
|
|
|
|
12389 |
戥
|
12390 |
戽
|
12391 |
扚
|
12392 |
扠
|
12393 |
扤
|
12394 |
扲
|
|
|
12395 |
扻
|
12396 |
扽
|
12397 |
抆
|
@@ -12439,9 +12443,7 @@ fishbase
|
|
12439 |
攋
|
12440 |
攰
|
12441 |
斲
|
12442 |
-
昅
|
12443 |
曱
|
12444 |
-
柙
|
12445 |
栢
|
12446 |
梘
|
12447 |
棖
|
@@ -12478,6 +12480,7 @@ fishbase
|
|
12478 |
燶
|
12479 |
爨
|
12480 |
猁
|
|
|
12481 |
獌
|
12482 |
瑒
|
12483 |
甖
|
@@ -12536,6 +12539,7 @@ fishbase
|
|
12536 |
蝻
|
12537 |
螆
|
12538 |
蠋
|
|
|
12539 |
裇
|
12540 |
褦
|
12541 |
褸
|
@@ -12567,7 +12571,6 @@ fishbase
|
|
12567 |
鋭
|
12568 |
錔
|
12569 |
錡
|
12570 |
-
鍚
|
12571 |
鍠
|
12572 |
鎅
|
12573 |
鎝
|
@@ -12576,7 +12579,6 @@ fishbase
|
|
12576 |
閂
|
12577 |
閪
|
12578 |
韞
|
12579 |
-
韮
|
12580 |
頇
|
12581 |
餲
|
12582 |
餸
|
@@ -12654,6 +12656,5 @@ fishbase
|
|
12654 |
𨈇
|
12655 |
𨋢
|
12656 |
𨳒
|
12657 |
-
𨶙
|
12658 |
𩓥
|
12659 |
𪘲
|
|
|
21 |
乪
|
22 |
乸
|
23 |
亍
|
24 |
+
仼
|
25 |
佮
|
26 |
侲
|
27 |
冚
|
|
|
29 |
剒
|
30 |
劖
|
31 |
卼
|
32 |
+
厠
|
33 |
厹
|
34 |
厾
|
35 |
吔
|
|
|
82 |
埞
|
83 |
埲
|
84 |
奀
|
85 |
+
妺
|
86 |
嬡
|
87 |
嬲
|
88 |
孭
|
|
|
98 |
忳
|
99 |
愎
|
100 |
愩
|
|
|
|
|
|
|
101 |
[UNK]
|
102 |
[CLS]
|
103 |
[SEP]
|
|
|
12386 |
##🔥
|
12387 |
##😂
|
12388 |
##😎
|
12389 |
+
愾
|
12390 |
+
戇
|
12391 |
+
戙
|
12392 |
戥
|
12393 |
戽
|
12394 |
扚
|
12395 |
扠
|
12396 |
扤
|
12397 |
扲
|
12398 |
+
扺
|
12399 |
扻
|
12400 |
扽
|
12401 |
抆
|
|
|
12443 |
攋
|
12444 |
攰
|
12445 |
斲
|
|
|
12446 |
曱
|
|
|
12447 |
栢
|
12448 |
梘
|
12449 |
棖
|
|
|
12480 |
燶
|
12481 |
爨
|
12482 |
猁
|
12483 |
+
猢
|
12484 |
獌
|
12485 |
瑒
|
12486 |
甖
|
|
|
12539 |
蝻
|
12540 |
螆
|
12541 |
蠋
|
12542 |
+
袓
|
12543 |
裇
|
12544 |
褦
|
12545 |
褸
|
|
|
12571 |
鋭
|
12572 |
錔
|
12573 |
錡
|
|
|
12574 |
鍠
|
12575 |
鎅
|
12576 |
鎝
|
|
|
12579 |
閂
|
12580 |
閪
|
12581 |
韞
|
|
|
12582 |
頇
|
12583 |
餲
|
12584 |
餸
|
|
|
12656 |
𨈇
|
12657 |
𨋢
|
12658 |
𨳒
|
|
|
12659 |
𩓥
|
12660 |
𪘲
|