fish-speech-1 / tokenizer.json
lengyue233's picture
Upload tokenizer
f26514d verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|begin_of_sequence|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<|end_of_sequence|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<|im_start|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<|im_sep|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "<|im_end|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 5,
"content": "<|semantic|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "<|pad|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<|begin_of_sequence|>": 0,
"<|end_of_sequence|>": 1,
"<|im_start|>": 2,
"<|im_sep|>": 3,
"<|im_end|>": 4,
"<|semantic|>": 5,
"<|pad|>": 6,
"!": 7,
"\"": 8,
"#": 9,
"$": 10,
"%": 11,
"&": 12,
"'": 13,
"(": 14,
")": 15,
"*": 16,
"+": 17,
",": 18,
"-": 19,
".": 20,
"/": 21,
"0": 22,
"1": 23,
"2": 24,
"3": 25,
"4": 26,
"5": 27,
"6": 28,
"7": 29,
"8": 30,
"9": 31,
":": 32,
";": 33,
"<": 34,
"=": 35,
">": 36,
"?": 37,
"@": 38,
"A": 39,
"B": 40,
"C": 41,
"D": 42,
"E": 43,
"F": 44,
"G": 45,
"H": 46,
"I": 47,
"J": 48,
"K": 49,
"L": 50,
"M": 51,
"N": 52,
"O": 53,
"P": 54,
"Q": 55,
"R": 56,
"S": 57,
"T": 58,
"U": 59,
"V": 60,
"W": 61,
"X": 62,
"Y": 63,
"Z": 64,
"[": 65,
"\\": 66,
"]": 67,
"^": 68,
"_": 69,
"`": 70,
"a": 71,
"b": 72,
"c": 73,
"d": 74,
"e": 75,
"f": 76,
"g": 77,
"h": 78,
"i": 79,
"j": 80,
"k": 81,
"l": 82,
"m": 83,
"n": 84,
"o": 85,
"p": 86,
"q": 87,
"r": 88,
"s": 89,
"t": 90,
"u": 91,
"v": 92,
"w": 93,
"x": 94,
"y": 95,
"z": 96,
"{": 97,
"|": 98,
"}": 99,
"~": 100,
"¡": 101,
"¢": 102,
"£": 103,
"¤": 104,
"¥": 105,
"¦": 106,
"§": 107,
"¨": 108,
"©": 109,
"ª": 110,
"«": 111,
"¬": 112,
"®": 113,
"¯": 114,
"°": 115,
"±": 116,
"²": 117,
"³": 118,
"´": 119,
"µ": 120,
"¶": 121,
"·": 122,
"¸": 123,
"¹": 124,
"º": 125,
"»": 126,
"¼": 127,
"½": 128,
"¾": 129,
"¿": 130,
"À": 131,
"Á": 132,
"Â": 133,
"Ã": 134,
"Ä": 135,
"Å": 136,
"Æ": 137,
"Ç": 138,
"È": 139,
"É": 140,
"Ê": 141,
"Ë": 142,
"Ì": 143,
"Í": 144,
"Î": 145,
"Ï": 146,
"Ð": 147,
"Ñ": 148,
"Ò": 149,
"Ó": 150,
"Ô": 151,
"Õ": 152,
"Ö": 153,
"×": 154,
"Ø": 155,
"Ù": 156,
"Ú": 157,
"Û": 158,
"Ü": 159,
"Ý": 160,
"Þ": 161,
"ß": 162,
"à": 163,
"á": 164,
"â": 165,
"ã": 166,
"ä": 167,
"å": 168,
"æ": 169,
"ç": 170,
"è": 171,
"é": 172,
"ê": 173,
"ë": 174,
"ì": 175,
"í": 176,
"î": 177,
"ï": 178,
"ð": 179,
"ñ": 180,
"ò": 181,
"ó": 182,
"ô": 183,
"õ": 184,
"ö": 185,
"÷": 186,
"ø": 187,
"ù": 188,
"ú": 189,
"û": 190,
"ü": 191,
"ý": 192,
"þ": 193,
"ÿ": 194,
"Ā": 195,
"ā": 196,
"Ă": 197,
"ă": 198,
"Ą": 199,
"ą": 200,
"Ć": 201,
"ć": 202,
"Ĉ": 203,
"ĉ": 204,
"Ċ": 205,
"ċ": 206,
"Č": 207,
"č": 208,
"Ď": 209,
"ď": 210,
"Đ": 211,
"đ": 212,
"Ē": 213,
"ē": 214,
"Ĕ": 215,
"ĕ": 216,
"Ė": 217,
"ė": 218,
"Ę": 219,
"ę": 220,
"Ě": 221,
"ě": 222,
"Ĝ": 223,
"ĝ": 224,
"Ğ": 225,
"ğ": 226,
"Ġ": 227,
"ġ": 228,
"Ģ": 229,
"ģ": 230,
"Ĥ": 231,
"ĥ": 232,
"Ħ": 233,
"ħ": 234,
"Ĩ": 235,
"ĩ": 236,
"Ī": 237,
"ī": 238,
"Ĭ": 239,
"ĭ": 240,
"Į": 241,
"į": 242,
"İ": 243,
"ı": 244,
"Ĳ": 245,
"ĳ": 246,
"Ĵ": 247,
"ĵ": 248,
"Ķ": 249,
"ķ": 250,
"ĸ": 251,
"Ĺ": 252,
"ĺ": 253,
"Ļ": 254,
"ļ": 255,
"Ľ": 256,
"ľ": 257,
"Ŀ": 258,
"ŀ": 259,
"Ł": 260,
"ł": 261,
"Ń": 262
},
"merges": []
}
}