bert-base-test / tokenizer.json
Exqrch's picture
Upload tokenizer
dfb317b
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": true,
"strip_accents": null,
"lowercase": true
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
2
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
3
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "##",
"cleanup": true
},
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"!": 5,
",": 6,
".": 7,
"?": 8,
"a": 9,
"b": 10,
"c": 11,
"d": 12,
"e": 13,
"f": 14,
"g": 15,
"h": 16,
"i": 17,
"k": 18,
"l": 19,
"m": 20,
"n": 21,
"o": 22,
"p": 23,
"q": 24,
"r": 25,
"s": 26,
"t": 27,
"u": 28,
"v": 29,
"w": 30,
"x": 31,
"y": 32,
"z": 33,
"##l": 34,
"##e": 35,
"##p": 36,
"##i": 37,
"##s": 38,
"##t": 39,
"##r": 40,
"##f": 41,
"##w": 42,
"##a": 43,
"##o": 44,
"##u": 45,
"##n": 46,
"##d": 47,
"##g": 48,
"##h": 49,
"##v": 50,
"##k": 51,
"##m": 52,
"##z": 53,
"##y": 54,
"##c": 55,
"##b": 56,
"##x": 57,
"th": 58,
"##ou": 59,
"##re": 60,
"the": 61,
"##nd": 62,
"##is": 63,
"##es": 64,
"##er": 65,
"my": 66,
"##or": 67,
"##ve": 68,
"ha": 69,
"##ll": 70,
"##it": 71,
"to": 72,
"##nt": 73,
"and": 74,
"no": 75,
"##ed": 76,
"mo": 77,
"##st": 78,
"##at": 79,
"in": 80,
"thou": 81,
"##ea": 82,
"##in": 83,
"##me": 84,
"co": 85,
"of": 86,
"##ir": 87,
"wh": 88,
"##el": 89,
"##on": 90,
"not": 91,
"is": 92,
"wi": 93,
"##ee": 94,
"as": 95,
"##ld": 96,
"##th": 97,
"##ra": 98,
"most": 99,
"for": 100,
"##ri": 101,
"will": 102,
"be": 103,
"ca": 104,
"me": 105,
"so": 106,
"sh": 107,
"##la": 108,
"##en": 109,
"##se": 110,
"##ro": 111,
"##ch": 112,
"thy": 113,
"##est": 114,
"have": 115,
"what": 116,
"are": 117,
"do": 118,
"it": 119,
"li": 120,
"sp": 121,
"you": 122,
"##il": 123,
"##ty": 124,
"##ar": 125,
"##ay": 126,
"##ke": 127,
"this": 128,
"thee": 129,
"##ear": 130,
"come": 131,
"##irit": 132,
"spirit": 133,
"ch": 134,
"go": 135,
"ho": 136,
"his": 137,
"lo": 138,
"##le": 139,
"##ly": 140,
"##ut": 141,
"##ith": 142,
"more": 143,
"##ment": 144,
"all": 145,
"ba": 146,
"but": 147,
"de": 148,
"st": 149,
"see": 150,
"we": 151,
"wor": 152,
"with": 153,
"##ic": 154,
"##wn": 155,
"##an": 156,
"##ake": 157,
"##ul": 158,
"##gh": 159,
"##mp": 160,
"##ber": 161,
"that": 162,
"##ould": 163,
"##ist": 164,
"now": 165,
"##ing": 166,
"##eep": 167,
"am": 168,
"ari": 169,
"ex": 170,
"he": 171,
"po": 172,
"per": 173,
"re": 174,
"sou": 175,
"ser": 176,
"sla": 177,
"say": 178,
"tis": 179,
"wa": 180,
"##et": 181,
"##ement": 182,
"##id": 183,
"##for": 184,
"##ful": 185,
"##od": 186,
"##un": 187,
"##ure": 188,
"##nst": 189,
"##ge": 190,
"##ves": 191,
"##ms": 192,
"##med": 193,
"##ct": 194,
"##ous": 195,
"##our": 196,
"they": 197,
"##ish": 198,
"##ess": 199,
"##nter": 200,
"##eas": 201,
"##rom": 202,
"liber": 203,
"char": 204,
"ariel": 205,
"serv": 206,
"slave": 207,
"liberty": 208,
"ad": 209,
"br": 210,
"bo": 211,
"by": 212,
"bes": 213,
"bra": 214,
"bear": 215,
"du": 216,
"en": 217,
"ear": 218,
"fre": 219,
"fir": 220,
"fri": 221,
"fly": 222,
"fet": 223,
"from": 224,
"gre": 225,
"gra": 226,
"hon": 227,
"kn": 228,
"king": 229,
"make": 230,
"mist": 231,
"ne": 232,
"ou": 233,
"on": 234,
"own": 235,
"pl": 236,
"qu": 237,
"rel": 238,
"sl": 239,
"sw": 240,
"sa": 241,
"su": 242,
"sen": 243,
"vi": 244,
"would": 245,
"wish": 246,
"##li": 247,
"##em": 248,
"##ere": 249,
"##end": 250,
"##ip": 251,
"##ion": 252,
"##igh": 253,
"##ss": 254,
"##ses": 255,
"##te": 256,
"##tun": 257,
"##fe": 258,
"##wer": 259,
"##and": 260,
"##all": 261,
"##ant": 262,
"##ow": 263,
"##ore": 264,
"##ue": 265,
"##us": 266,
"##ver": 267,
"##vil": 268,
"##ck": 269,
"##cit": 270,
"##ban": 271,
"##ress": 272,
"them": 273,
"there": 274,
"##nds": 275,
"##ven": 276,
"has": 277,
"hast": 278,
"hath": 279,
"##ease": 280,
"cont": 281,
"comp": 282,
"##elf": 283,
"##one": 284,
"forth": 285,
"canst": 286,
"cali": 287,
"mere": 288,
"ship": 289,
"shall": 290,
"##lain": 291,
"like": 292,
"your": 293,
"good": 294,
"how": 295,
"love": 296,
"##lete": 297,
"bad": 298,
"devil": 299
}
}
}