Lakoc
/

libri_100

Inference Endpoints

Model card Files Files and versions Community

libri_100 / tokenizer.json

Lakoc's picture

Upload tokenizer

436f6a2 verified 5 months ago

history blame contribute delete

5.08 kB

	{
	"version": "1.0",
	"truncation": null,
	"padding": null,
	"added_tokens": [
	{
	"id": 0,
	"content": "([bos])",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 1,
	"content": "([eos])",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 2,
	"content": "([unk])",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 3,
	"content": "([pad])",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 4,
	"content": "([mask])",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	}
	],
	"normalizer": null,
	"pre_tokenizer": {
	"type": "ByteLevel",
	"add_prefix_space": true,
	"trim_offsets": true,
	"use_regex": true
	},
	"post_processor": {
	"type": "TemplateProcessing",
	"single": [
	{
	"Sequence": {
	"id": "A",
	"type_id": 0
	}
	},
	{
	"SpecialToken": {
	"id": "([eos])",
	"type_id": 0
	}
	}
	],
	"pair": [
	{
	"Sequence": {
	"id": "A",
	"type_id": 0
	}
	},
	{
	"SpecialToken": {
	"id": "([eos])",
	"type_id": 0
	}
	},
	{
	"Sequence": {
	"id": "B",
	"type_id": 1
	}
	},
	{
	"SpecialToken": {
	"id": "([eos])",
	"type_id": 1
	}
	}
	],
	"special_tokens": {
	"([bos])": {
	"id": "([bos])",
	"ids": [
	0
	],
	"tokens": [
	"([bos])"
	]
	},
	"([eos])": {
	"id": "([eos])",
	"ids": [
	1
	],
	"tokens": [
	"([eos])"
	]
	}
	}
	},
	"decoder": {
	"type": "ByteLevel",
	"add_prefix_space": true,
	"trim_offsets": true,
	"use_regex": true
	},
	"model": {
	"type": "BPE",
	"dropout": null,
	"unk_token": null,
	"continuing_subword_prefix": null,
	"end_of_word_suffix": null,
	"fuse_unk": false,
	"byte_fallback": false,
	"vocab": {
	"([bos])": 0,
	"([eos])": 1,
	"([unk])": 2,
	"([pad])": 3,
	"([mask])": 4,
	"'": 5,
	"a": 6,
	"b": 7,
	"c": 8,
	"d": 9,
	"e": 10,
	"f": 11,
	"g": 12,
	"h": 13,
	"i": 14,
	"j": 15,
	"k": 16,
	"l": 17,
	"m": 18,
	"n": 19,
	"o": 20,
	"p": 21,
	"q": 22,
	"r": 23,
	"s": 24,
	"t": 25,
	"u": 26,
	"v": 27,
	"w": 28,
	"x": 29,
	"y": 30,
	"z": 31,
	"Ġ": 32,
	"Ġt": 33,
	"he": 34,
	"Ġa": 35,
	"Ġthe": 36,
	"in": 37,
	"Ġs": 38,
	"Ġw": 39,
	"Ġo": 40,
	"re": 41,
	"nd": 42,
	"Ġb": 43,
	"Ġh": 44,
	"er": 45,
	"Ġm": 46,
	"Ġi": 47,
	"ou": 48,
	"Ġc": 49,
	"Ġf": 50,
	"at": 51,
	"ed": 52,
	"Ġand": 53,
	"en": 54,
	"Ġto": 55,
	"Ġof": 56,
	"on": 57,
	"is": 58,
	"Ġd": 59,
	"ing": 60,
	"Ġth": 61,
	"Ġp": 62,
	"Ġhe": 63,
	"or": 64,
	"Ġl": 65,
	"es": 66,
	"Ġin": 67,
	"ll": 68,
	"it": 69,
	"ar": 70,
	"as": 71,
	"an": 72,
	"Ġn": 73,
	"Ġg": 74,
	"om": 75,
	"Ġbe": 76,
	"Ġha": 77,
	"Ġe": 78,
	"le": 79,
	"ot": 80,
	"Ġy": 81,
	"ut": 82,
	"ow": 83,
	"ic": 84,
	"Ġwh": 85,
	"Ġit": 86,
	"ld": 87,
	"ve": 88,
	"Ġthat": 89,
	"ly": 90,
	"Ġwas": 91,
	"id": 92,
	"se": 93,
	"st": 94,
	"Ġon": 95,
	"gh": 96,
	"ent": 97,
	"Ġre": 98,
	"Ġyou": 99
	},
	"merges": [
	"Ġ t",
	"h e",
	"Ġ a",
	"Ġt he",
	"i n",
	"Ġ s",
	"Ġ w",
	"Ġ o",
	"r e",
	"n d",
	"Ġ b",
	"Ġ h",
	"e r",
	"Ġ m",
	"Ġ i",
	"o u",
	"Ġ c",
	"Ġ f",
	"a t",
	"e d",
	"Ġa nd",
	"e n",
	"Ġt o",
	"Ġo f",
	"o n",
	"i s",
	"Ġ d",
	"in g",
	"Ġt h",
	"Ġ p",
	"Ġ he",
	"o r",
	"Ġ l",
	"e s",
	"Ġ in",
	"l l",
	"i t",
	"a r",
	"a s",
	"a n",
	"Ġ n",
	"Ġ g",
	"o m",
	"Ġb e",
	"Ġh a",
	"Ġ e",
	"l e",
	"o t",
	"Ġ y",
	"u t",
	"o w",
	"i c",
	"Ġw h",
	"Ġi t",
	"l d",
	"v e",
	"Ġth at",
	"l y",
	"Ġw as",
	"i d",
	"s e",
	"s t",
	"Ġo n",
	"g h",
	"en t",
	"Ġ re",
	"Ġy ou"
	]
	}
	}