Upload tokenizer
Browse files- special_tokens_map.json +44 -7
- tokenizer.json +9 -0
- tokenizer_config.json +10 -1
special_tokens_map.json
CHANGED
@@ -51,11 +51,30 @@
|
|
51 |
"ur_PK",
|
52 |
"xh_ZA",
|
53 |
"gl_ES",
|
54 |
-
"sl_SI"
|
|
|
55 |
],
|
56 |
-
"bos_token":
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
"mask_token": {
|
60 |
"content": "<mask>",
|
61 |
"lstrip": true,
|
@@ -63,7 +82,25 @@
|
|
63 |
"rstrip": false,
|
64 |
"single_word": false
|
65 |
},
|
66 |
-
"pad_token":
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
}
|
|
|
51 |
"ur_PK",
|
52 |
"xh_ZA",
|
53 |
"gl_ES",
|
54 |
+
"sl_SI",
|
55 |
+
"bj_ID"
|
56 |
],
|
57 |
+
"bos_token": {
|
58 |
+
"content": "<s>",
|
59 |
+
"lstrip": false,
|
60 |
+
"normalized": false,
|
61 |
+
"rstrip": false,
|
62 |
+
"single_word": false
|
63 |
+
},
|
64 |
+
"cls_token": {
|
65 |
+
"content": "<s>",
|
66 |
+
"lstrip": false,
|
67 |
+
"normalized": false,
|
68 |
+
"rstrip": false,
|
69 |
+
"single_word": false
|
70 |
+
},
|
71 |
+
"eos_token": {
|
72 |
+
"content": "</s>",
|
73 |
+
"lstrip": false,
|
74 |
+
"normalized": false,
|
75 |
+
"rstrip": false,
|
76 |
+
"single_word": false
|
77 |
+
},
|
78 |
"mask_token": {
|
79 |
"content": "<mask>",
|
80 |
"lstrip": true,
|
|
|
82 |
"rstrip": false,
|
83 |
"single_word": false
|
84 |
},
|
85 |
+
"pad_token": {
|
86 |
+
"content": "<pad>",
|
87 |
+
"lstrip": false,
|
88 |
+
"normalized": false,
|
89 |
+
"rstrip": false,
|
90 |
+
"single_word": false
|
91 |
+
},
|
92 |
+
"sep_token": {
|
93 |
+
"content": "</s>",
|
94 |
+
"lstrip": false,
|
95 |
+
"normalized": false,
|
96 |
+
"rstrip": false,
|
97 |
+
"single_word": false
|
98 |
+
},
|
99 |
+
"unk_token": {
|
100 |
+
"content": "<unk>",
|
101 |
+
"lstrip": false,
|
102 |
+
"normalized": false,
|
103 |
+
"rstrip": false,
|
104 |
+
"single_word": false
|
105 |
+
}
|
106 |
}
|
tokenizer.json
CHANGED
@@ -515,6 +515,15 @@
|
|
515 |
"rstrip": false,
|
516 |
"normalized": true,
|
517 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
518 |
}
|
519 |
],
|
520 |
"normalizer": {
|
|
|
515 |
"rstrip": false,
|
516 |
"normalized": true,
|
517 |
"special": true
|
518 |
+
},
|
519 |
+
{
|
520 |
+
"id": 2540,
|
521 |
+
"content": "bj_ID",
|
522 |
+
"single_word": false,
|
523 |
+
"lstrip": false,
|
524 |
+
"rstrip": false,
|
525 |
+
"normalized": false,
|
526 |
+
"special": true
|
527 |
}
|
528 |
],
|
529 |
"normalizer": {
|
tokenizer_config.json
CHANGED
@@ -455,6 +455,14 @@
|
|
455 |
"rstrip": false,
|
456 |
"single_word": false,
|
457 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
458 |
}
|
459 |
},
|
460 |
"additional_special_tokens": [
|
@@ -509,7 +517,8 @@
|
|
509 |
"ur_PK",
|
510 |
"xh_ZA",
|
511 |
"gl_ES",
|
512 |
-
"sl_SI"
|
|
|
513 |
],
|
514 |
"bos_token": "<s>",
|
515 |
"clean_up_tokenization_spaces": true,
|
|
|
455 |
"rstrip": false,
|
456 |
"single_word": false,
|
457 |
"special": true
|
458 |
+
},
|
459 |
+
"2540": {
|
460 |
+
"content": "bj_ID",
|
461 |
+
"lstrip": false,
|
462 |
+
"normalized": false,
|
463 |
+
"rstrip": false,
|
464 |
+
"single_word": false,
|
465 |
+
"special": true
|
466 |
}
|
467 |
},
|
468 |
"additional_special_tokens": [
|
|
|
517 |
"ur_PK",
|
518 |
"xh_ZA",
|
519 |
"gl_ES",
|
520 |
+
"sl_SI",
|
521 |
+
"bj_ID"
|
522 |
],
|
523 |
"bos_token": "<s>",
|
524 |
"clean_up_tokenization_spaces": true,
|