CHILDES-phoneme-tokenizer / tokenizer.json
codebyzeb's picture
Upload tokenizer
fb4dfca verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "UNK",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "PAD",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "WORD_BOUNDARY",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "UTT_BOUNDARY",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Strip",
"strip_left": true,
"strip_right": true
}
]
},
"pre_tokenizer": {
"type": "WhitespaceSplit"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "UTT_BOUNDARY",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "UTT_BOUNDARY",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "UTT_BOUNDARY",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"UTT_BOUNDARY": {
"id": "UTT_BOUNDARY",
"ids": [
3
],
"tokens": [
"UTT_BOUNDARY"
]
}
}
},
"decoder": null,
"model": {
"type": "WordLevel",
"vocab": {
"UNK": 0,
"PAD": 1,
"WORD_BOUNDARY": 2,
"UTT_BOUNDARY": 3,
"d̠ʒ": 4,
"ʌ": 5,
"s": 6,
"t": 7,
"l": 8,
"aɪ": 9,
"k": 10,
"j": 11,
"ʊ": 12,
"ɹ": 13,
"b": 14,
"æ": 15,
"h": 16,
"oʊ": 17,
"m": 18,
"iː": 19,
"ð": 20,
"ɛ": 21,
"z": 22,
"f": 23,
"eɪ": 24,
"w": 25,
"ɪ": 26,
"ɡ": 27,
"ɑ": 28,
"ə": 29,
"p": 30,
"uː": 31,
"i": 32,
"θ": 33,
"ŋ": 34,
"ɔ": 35,
"ɔɪ": 36,
"n": 37,
"d": 38,
"aʊ": 39,
"v": 40,
"ɜː": 41,
"t̠ʃ": 42,
"ʃ": 43,
"iə": 44,
"ʒ": 45,
"x": 46,
"tʰ": 47,
"ɑː": 48,
"ɒ": 49,
"e": 50,
"kʰ": 51,
"ɔː": 52,
"əʊ": 53,
"ɪə": 54,
"pʰ": 55,
"ɐ": 56,
"eə": 57,
"ʊə": 58,
"n̩": 59,
"a": 60,
"ɑ̃": 61,
"ʁ": 62,
"o": 63,
"ɛ̃": 64,
"y": 65,
"ɔ̃": 66,
"u": 67,
"œ": 68,
"ø": 69,
"ɲ": 70,
"aː": 71,
"oː": 72,
"øː": 73,
"ɛː": 74,
"yː": 75,
"eː": 76,
"d̺": 77,
"t̺ʰ": 78,
"ç": 79,
"ʀ": 80,
"ʏ": 81,
"ts": 82,
"pf": 83,
"ɾ": 84,
"e̞": 85,
"o̞": 86,
"β": 87,
"ʝ": 88,
"r": 89,
"tl": 90,
"ãː": 91,
"ɦ": 92,
"ɛi": 93,
"ʋ": 94,
"ɣ": 95,
"ʌu": 96,
"œy": 97,
"tʲ": 98,
"ã": 99,
"au": 100,
"ʃ̺": 101,
"ɤ": 102,
"t̠ʃ̺ʰ": 103,
"ɕ": 104,
"tɕ": 105,
"t̠ʃ̺": 106,
"ɹ̪̩": 107,
"tɕʰ": 108,
"ɻ": 109,
"ɥ": 110,
"tsʰ": 111,
"ei": 112,
"ou": 113,
"ɻ̩": 114,
"ai": 115,
"kʲ": 116,
"ɯ": 117,
"ɯː": 118,
"ɡʲ": 119,
"ɸ": 120,
"pʲ": 121,
"ɾʲ": 122,
"bʲ": 123,
"mʲ": 124,
"sʲ": 125,
"æi": 126,
"kː": 127,
"tː": 128,
"mː": 129,
"sː": 130,
"pː": 131,
"æː": 132,
"ɤː": 133,
"lː": 134,
"rː": 135,
"nː": 136,
"tʲː": 137,
"øɪ̯": 138,
"dʲ": 139,
"sʲː": 140,
"ʃː": 141,
"fː": 142,
"dː": 143,
"yi": 144,
"jː": 145,
"t̪": 146,
"d̪": 147,
"t̪s": 148,
"ʎ": 149,
"q": 150,
"oˤ": 151,
"ɑˤː": 152,
"eˤ": 153,
"ɔˤ": 154,
"uˤ": 155,
"iˤ": 156,
"ɒː": 157,
"aˤ": 158,
"ɜ": 159,
"œː": 160,
"ʔ": 161,
"ai̯": 162,
"s̪̻": 163,
"ɟ": 164,
"ei̯": 165,
"t̺s̺": 166,
"oi̯": 167,
"s̺": 168,
"t̪̻s̪̻": 169,
"au̯": 170,
"c": 171,
"eu̯": 172,
"l̪": 173,
"s̻": 174,
"z̻": 175,
"t̪ː": 176,
"n̪": 177,
"t̻s̻": 178,
"r̪": 179,
"ɟʝ": 180,
"s̻ː": 181,
"z̻ː": 182,
"l̪ː": 183,
"ɟʝː": 184,
"n̪ː": 185,
"ɲː": 186,
"r̪ː": 187,
"t̠ʃː": 188,
"bː": 189,
"cç": 190,
"t̻s̻ː": 191,
"d̪ː": 192,
"ɡː": 193,
"d̻z̻": 194,
"vː": 195,
"cçː": 196,
"hː": 197,
"lʲ": 198,
"l̪ˠ": 199,
"z̪": 200,
"s̪": 201,
"a̟": 202,
"t̪ʰ": 203,
"ɢ": 204,
"r̥": 205,
"ä": 206,
"θ̻": 207,
"ɬ": 208,
"ð̺̞": 209,
"n̪̥": 210,
"äu̯": 211,
"ŋ̥": 212,
"cʰ": 213,
"ou̯": 214,
"äi̯": 215,
"ɰ": 216,
"ʏː": 217,
"ɪː": 218,
"m̥": 219,
"ɔi̯": 220,
"ɲ̥": 221,
"ɾ̪ʲ": 222,
"d̪ˠ": 223,
"n̪ˠ": 224,
"ɛ̝": 225,
"ɾ̪ˠ": 226,
"mˠ": 227,
"sˠ": 228,
"bˠ": 229,
"pˠʰ": 230,
"t̪ʲʰ": 231,
"ɔ̝": 232,
"t̪ˠʰ": 233,
"vˠ": 234,
"fˠ": 235,
"l̪ʲ": 236,
"iːə": 237,
"uːe": 238,
"n̪ʲ": 239,
"d̪ʲ": 240,
"pʲʰ": 241,
"ɐɪ": 242,
"i̞": 243,
"fʲ": 244,
"χ": 245,
"vʲ": 246,
"ɔi": 247,
"ʊi": 248,
"əi": 249,
"ɪu": 250,
"ɛu": 251,
"ɤ̞": 252,
"dʑ": 253,
"ɯi": 254,
"t̠ʃʰ": 255,
"ʉ̟": 256,
"ʂ": 257,
"ɵ": 258,
"ɧ": 259,
"o̞ː": 260,
"ʉː": 261,
"ʉ": 262,
"ɒ̝": 263,
"ø̞ː": 264,
"øy": 265,
"æʉ": 266,
"ɔy": 267,
"pʼ": 268,
"tʼ": 269,
"t̠ʃʼ": 270,
"kʼ": 271,
"qʼ": 272,
"n̺": 273,
"z̺": 274,
"ɾ̺": 275,
"r̺": 276,
"u̯": 277,
"ɫ̺": 278,
"ɲ̟": 279,
"ʎ̟": 280,
"ts̺": 281,
"ɐː": 282,
"dz": 283,
"d̠ʒː": 284,
"tsː": 285,
"dzː": 286,
"ɐ̃": 287,
"ɐ̃i": 288,
"ɐ̃u̜": 289,
"ũ": 290,
"au̜": 291,
"eu̜": 292,
"ɐi": 293,
"ɛu̜": 294,
"ĩ": 295,
"ũi": 296,
"õ": 297,
"õi": 298,
"ẽ": 299,
"oi": 300,
"iu̜": 301,
"ui": 302,
"aʊ̯": 303,
"oɪ̯": 304,
"eʊ̯": 305,
"ɐ̃ʊ̯̃": 306,
"eɪ̯": 307,
"ẽɪ̯̃": 308,
"uɪ̯": 309,
"iʊ̯": 310,
"oʊ̯": 311,
"aɪ̯": 312,
"ɔɪ̯": 313,
"ɛɪ̯": 314,
"ɛʊ̯": 315,
"ɪ̯": 316,
"ɾ̪": 317,
"t̠ʃʲ": 318,
"e̯ä": 319,
"ʃʲ": 320,
"o̯ä": 321,
"ɨ": 322,
"uɪ": 323,
"t̪s̪": 324,
"əɪ": 325,
"tsʲ": 326,
"zʲ": 327,
"iɪ": 328,
"nʲ": 329,
"eʊ": 330,
"iʊ": 331,
"eo": 332,
"d̠ʒʲ": 333,
"oɪ": 334,
"t̪̻": 335,
"ʒ̺": 336,
"d̪̻": 337,
"t̻ʃ̻": 338,
"z̪̻": 339,
"d̻ʒ̻": 340,
"ʑ": 341
},
"unk_token": "UNK"
}
}