libri_100 / tokenizer.json
Lakoc's picture
Upload tokenizer
436f6a2 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "([bos])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "([eos])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "([unk])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "([pad])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "([mask])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "([eos])",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "([eos])",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "([eos])",
"type_id": 1
}
}
],
"special_tokens": {
"([bos])": {
"id": "([bos])",
"ids": [
0
],
"tokens": [
"([bos])"
]
},
"([eos])": {
"id": "([eos])",
"ids": [
1
],
"tokens": [
"([eos])"
]
}
}
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"([bos])": 0,
"([eos])": 1,
"([unk])": 2,
"([pad])": 3,
"([mask])": 4,
"'": 5,
"a": 6,
"b": 7,
"c": 8,
"d": 9,
"e": 10,
"f": 11,
"g": 12,
"h": 13,
"i": 14,
"j": 15,
"k": 16,
"l": 17,
"m": 18,
"n": 19,
"o": 20,
"p": 21,
"q": 22,
"r": 23,
"s": 24,
"t": 25,
"u": 26,
"v": 27,
"w": 28,
"x": 29,
"y": 30,
"z": 31,
"Ġ": 32,
"Ġt": 33,
"he": 34,
"Ġa": 35,
"Ġthe": 36,
"in": 37,
"Ġs": 38,
"Ġw": 39,
"Ġo": 40,
"re": 41,
"nd": 42,
"Ġb": 43,
"Ġh": 44,
"er": 45,
"Ġm": 46,
"Ġi": 47,
"ou": 48,
"Ġc": 49,
"Ġf": 50,
"at": 51,
"ed": 52,
"Ġand": 53,
"en": 54,
"Ġto": 55,
"Ġof": 56,
"on": 57,
"is": 58,
"Ġd": 59,
"ing": 60,
"Ġth": 61,
"Ġp": 62,
"Ġhe": 63,
"or": 64,
"Ġl": 65,
"es": 66,
"Ġin": 67,
"ll": 68,
"it": 69,
"ar": 70,
"as": 71,
"an": 72,
"Ġn": 73,
"Ġg": 74,
"om": 75,
"Ġbe": 76,
"Ġha": 77,
"Ġe": 78,
"le": 79,
"ot": 80,
"Ġy": 81,
"ut": 82,
"ow": 83,
"ic": 84,
"Ġwh": 85,
"Ġit": 86,
"ld": 87,
"ve": 88,
"Ġthat": 89,
"ly": 90,
"Ġwas": 91,
"id": 92,
"se": 93,
"st": 94,
"Ġon": 95,
"gh": 96,
"ent": 97,
"Ġre": 98,
"Ġyou": 99
},
"merges": [
"Ġ t",
"h e",
"Ġ a",
"Ġt he",
"i n",
"Ġ s",
"Ġ w",
"Ġ o",
"r e",
"n d",
"Ġ b",
"Ġ h",
"e r",
"Ġ m",
"Ġ i",
"o u",
"Ġ c",
"Ġ f",
"a t",
"e d",
"Ġa nd",
"e n",
"Ġt o",
"Ġo f",
"o n",
"i s",
"Ġ d",
"in g",
"Ġt h",
"Ġ p",
"Ġ he",
"o r",
"Ġ l",
"e s",
"Ġ in",
"l l",
"i t",
"a r",
"a s",
"a n",
"Ġ n",
"Ġ g",
"o m",
"Ġb e",
"Ġh a",
"Ġ e",
"l e",
"o t",
"Ġ y",
"u t",
"o w",
"i c",
"Ġw h",
"Ġi t",
"l d",
"v e",
"Ġth at",
"l y",
"Ġw as",
"i d",
"s e",
"s t",
"Ġo n",
"g h",
"en t",
"Ġ re",
"Ġy ou"
]
}
}