tarsssss committed on
Commit
e664806
·
verified ·
1 Parent(s): ed1bdc4

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +44 -7
  2. tokenizer.json +9 -0
  3. tokenizer_config.json +10 -1
special_tokens_map.json CHANGED
@@ -51,11 +51,30 @@
51
  "ur_PK",
52
  "xh_ZA",
53
  "gl_ES",
54
- "sl_SI"
 
55
  ],
56
- "bos_token": "<s>",
57
- "cls_token": "<s>",
58
- "eos_token": "</s>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  "mask_token": {
60
  "content": "<mask>",
61
  "lstrip": true,
@@ -63,7 +82,25 @@
63
  "rstrip": false,
64
  "single_word": false
65
  },
66
- "pad_token": "<pad>",
67
- "sep_token": "</s>",
68
- "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  }
 
51
  "ur_PK",
52
  "xh_ZA",
53
  "gl_ES",
54
+ "sl_SI",
55
+ "bj_ID"
56
  ],
57
+ "bos_token": {
58
+ "content": "<s>",
59
+ "lstrip": false,
60
+ "normalized": false,
61
+ "rstrip": false,
62
+ "single_word": false
63
+ },
64
+ "cls_token": {
65
+ "content": "<s>",
66
+ "lstrip": false,
67
+ "normalized": false,
68
+ "rstrip": false,
69
+ "single_word": false
70
+ },
71
+ "eos_token": {
72
+ "content": "</s>",
73
+ "lstrip": false,
74
+ "normalized": false,
75
+ "rstrip": false,
76
+ "single_word": false
77
+ },
78
  "mask_token": {
79
  "content": "<mask>",
80
  "lstrip": true,
 
82
  "rstrip": false,
83
  "single_word": false
84
  },
85
+ "pad_token": {
86
+ "content": "<pad>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false
91
+ },
92
+ "sep_token": {
93
+ "content": "</s>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false
98
+ },
99
+ "unk_token": {
100
+ "content": "<unk>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false
105
+ }
106
  }
tokenizer.json CHANGED
@@ -515,6 +515,15 @@
515
  "rstrip": false,
516
  "normalized": true,
517
  "special": true
 
 
 
 
 
 
 
 
 
518
  }
519
  ],
520
  "normalizer": {
 
515
  "rstrip": false,
516
  "normalized": true,
517
  "special": true
518
+ },
519
+ {
520
+ "id": 2540,
521
+ "content": "bj_ID",
522
+ "single_word": false,
523
+ "lstrip": false,
524
+ "rstrip": false,
525
+ "normalized": false,
526
+ "special": true
527
  }
528
  ],
529
  "normalizer": {
tokenizer_config.json CHANGED
@@ -455,6 +455,14 @@
455
  "rstrip": false,
456
  "single_word": false,
457
  "special": true
 
 
 
 
 
 
 
 
458
  }
459
  },
460
  "additional_special_tokens": [
@@ -509,7 +517,8 @@
509
  "ur_PK",
510
  "xh_ZA",
511
  "gl_ES",
512
- "sl_SI"
 
513
  ],
514
  "bos_token": "<s>",
515
  "clean_up_tokenization_spaces": true,
 
455
  "rstrip": false,
456
  "single_word": false,
457
  "special": true
458
+ },
459
+ "2540": {
460
+ "content": "bj_ID",
461
+ "lstrip": false,
462
+ "normalized": false,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": true
466
  }
467
  },
468
  "additional_special_tokens": [
 
517
  "ur_PK",
518
  "xh_ZA",
519
  "gl_ES",
520
+ "sl_SI",
521
+ "bj_ID"
522
  ],
523
  "bos_token": "<s>",
524
  "clean_up_tokenization_spaces": true,