tabert-500-naamapadam / tokenizer.json
AnanthZeke's picture
Training in progress, step 400
c017f9f
raw
history blame contribute delete
No virus
18 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 500,
"content": ".",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 501,
"content": ",",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 502,
"content": "!",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 503,
"content": "?",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 504,
"content": "-",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 505,
"content": ":",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 506,
"content": ";",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 507,
"content": "/",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 508,
"content": "(",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 509,
"content": ")",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 510,
"content": "'",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 511,
"content": "\"",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 512,
"content": "...",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 513,
"content": "0",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 514,
"content": "1",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 515,
"content": "2",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 516,
"content": "3",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 517,
"content": "4",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 518,
"content": "5",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 519,
"content": "6",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 520,
"content": "7",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 521,
"content": "8",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 522,
"content": "9",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": true,
"strip_accents": null,
"lowercase": false
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
2
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
3
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "##",
"cleanup": true
},
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"஁": 5,
"ஂ": 6,
"ஃ": 7,
"அ": 8,
"ஆ": 9,
"இ": 10,
"ஈ": 11,
"உ": 12,
"ஊ": 13,
"஋": 14,
"எ": 15,
"ஏ": 16,
"ஐ": 17,
"஑": 18,
"ஒ": 19,
"ஓ": 20,
"ஔ": 21,
"க": 22,
"஖": 23,
"஗": 24,
"ங": 25,
"ச": 26,
"ஜ": 27,
"ஞ": 28,
"ட": 29,
"஠": 30,
"஡": 31,
"஢": 32,
"ண": 33,
"த": 34,
"஥": 35,
"஦": 36,
"஧": 37,
"ந": 38,
"ன": 39,
"ப": 40,
"஫": 41,
"஬": 42,
"஭": 43,
"ம": 44,
"ய": 45,
"ர": 46,
"ற": 47,
"ல": 48,
"ள": 49,
"ழ": 50,
"வ": 51,
"ஶ": 52,
"ஷ": 53,
"ஸ": 54,
"ஹ": 55,
"஽": 56,
"ா": 57,
"ி": 58,
"ீ": 59,
"ு": 60,
"ூ": 61,
"௃": 62,
"ெ": 63,
"ே": 64,
"ை": 65,
"௉": 66,
"ொ": 67,
"ோ": 68,
"ௌ": 69,
"்": 70,
"ௐ": 71,
"ௗ": 72,
"௠": 73,
"௦": 74,
"௧": 75,
"௨": 76,
"௩": 77,
"௪": 78,
"௫": 79,
"௬": 80,
"௭": 81,
"௮": 82,
"௯": 83,
"௰": 84,
"௱": 85,
"௲": 86,
"௳": 87,
"௴": 88,
"௵": 89,
"௶": 90,
"௷": 91,
"௸": 92,
"௹": 93,
"௺": 94,
"௼": 95,
"௿": 96,
"##ல": 97,
"##ை": 98,
"##க": 99,
"##்": 100,
"##ு": 101,
"##த": 102,
"##ர": 103,
"##வ": 104,
"##ே": 105,
"##ற": 106,
"##ம": 107,
"##ப": 108,
"##ன": 109,
"##ட": 110,
"##ி": 111,
"##ா": 112,
"##ச": 113,
"##ூ": 114,
"##ழ": 115,
"##ந": 116,
"##ோ": 117,
"##ொ": 118,
"##ெ": 119,
"##ள": 120,
"##ங": 121,
"##ய": 122,
"##ஞ": 123,
"##ண": 124,
"##ஸ": 125,
"##ஜ": 126,
"##ஷ": 127,
"##ீ": 128,
"##ஹ": 129,
"##உ": 130,
"##ஃ": 131,
"##அ": 132,
"##ஓ": 133,
"##எ": 134,
"##ஆ": 135,
"##ஊ": 136,
"##இ": 137,
"##ௌ": 138,
"##ஏ": 139,
"##ஒ": 140,
"##ஐ": 141,
"##௫": 142,
"##ஶ": 143,
"##௯": 144,
"##஑": 145,
"##ஈ": 146,
"##஢": 147,
"##ஔ": 148,
"##௦": 149,
"##௧": 150,
"##௰": 151,
"##௪": 152,
"##ஂ": 153,
"##௱": 154,
"##ௗ": 155,
"##௬": 156,
"##஡": 157,
"##஭": 158,
"##௩": 159,
"##௿": 160,
"##ௐ": 161,
"##௲": 162,
"##௭": 163,
"##஧": 164,
"##௮": 165,
"##௨": 166,
"##௃": 167,
"##௵": 168,
"##஦": 169,
"##஬": 170,
"##௶": 171,
"##஽": 172,
"##௹": 173,
"##௸": 174,
"##஖": 175,
"##௴": 176,
"##௉": 177,
"##௳": 178,
"##஫": 179,
"##௠": 180,
"##஠": 181,
"##௼": 182,
"##஁": 183,
"##஥": 184,
"##்க": 185,
"##்த": 186,
"##ம்": 187,
"##ன்": 188,
"##ல்": 189,
"##க்க": 190,
"##்ட": 191,
"##ப்": 192,
"##த்த": 193,
"##ள்": 194,
"##ும்": 195,
"##ர்": 196,
"##ிய": 197,
"##ப்ப": 198,
"##ரு": 199,
"##ந்த": 200,
"##ட்ட": 201,
"##து": 202,
"##ில்": 203,
"##ங்க": 204,
"##ைய": 205,
"##ற்": 206,
"##ின்": 207,
"##ாக": 208,
"##று": 209,
"##ிர": 210,
"##டு": 211,
"##ிக": 212,
"##ண்ட": 213,
"##்ச": 214,
"##க்கு": 215,
"##ர்க": 216,
"##ிற": 217,
"##ில": 218,
"மு": 219,
"##ான": 220,
"##த்து": 221,
"செ": 222,
"என்": 223,
"##டி": 224,
"வி": 225,
"##லை": 226,
"##ற்ற": 227,
"##ள்ள": 228,
"##ார": 229,
"##தி": 230,
"##ார்": 231,
"##ப்பு": 232,
"##ிரு": 233,
"##வு": 234,
"##ட்டு": 235,
"##ல்ல": 236,
"##ரி": 237,
"##வி": 238,
"##க்": 239,
"கு": 240,
"##ான்": 241,
"##ந்து": 242,
"##ால்": 243,
"##ளை": 244,
"##ய்": 245,
"##ச்ச": 246,
"கொ": 247,
"##த்": 248,
"போ": 249,
"இரு": 250,
"##னை": 251,
"அவ": 252,
"கா": 253,
"##ர்கள்": 254,
"##ங்கள்": 255,
"பெ": 256,
"##ண்": 257,
"##ம்ப": 258,
"##றி": 259,
"##ஸ்": 260,
"##ாத": 261,
"##மி": 262,
"பு": 263,
"##கள்": 264,
"##கு": 265,
"##ாவ": 266,
"##மை": 267,
"##ளு": 268,
"வே": 269,
"ஒரு": 270,
"##க்கும்": 271,
"##ின": 272,
"##ழு": 273,
"பா": 274,
"அத": 275,
"தொ": 276,
"இந்த": 277,
"வெ": 278,
"##ண்டு": 279,
"##ாம்": 280,
"வா": 281,
"##ற்க": 282,
"##த்தில்": 283,
"##டை": 284,
"##ன்ன": 285,
"செய": 286,
"##ன்ற": 287,
"##ழ்": 288,
"##மா": 289,
"##ிக்க": 290,
"##டிய": 291,
"நா": 292,
"மா": 293,
"##ச்": 294,
"##ரை": 295,
"##ரா": 296,
"##வா": 297,
"##ரிய": 298,
"##தை": 299,
"##ையில்": 300,
"##ட்": 301,
"##ளி": 302,
"கூ": 303,
"பொ": 304,
"##வே": 305,
"சு": 306,
"##ால": 307,
"##்த்த": 308,
"தமி": 309,
"மே": 310,
"என": 311,
"##றை": 312,
"தே": 313,
"சொ": 314,
"பிர": 315,
"##ங்கள": 316,
"##வை": 317,
"##ாம": 318,
"சி": 319,
"##ப்பட்ட": 320,
"##ற்ப": 321,
"##ையும்": 322,
"##மாக": 323,
"நி": 324,
"##மு": 325,
"##ண்ண": 326,
"பே": 327,
"##த்தை": 328,
"##கிற": 329,
"##திய": 330,
"##ளுக்கு": 331,
"தெ": 332,
"என்று": 333,
"##ட்ச": 334,
"கோ": 335,
"நீ": 336,
"செய்த": 337,
"##ிகள்": 338,
"##வர்": 339,
"##னி": 340,
"##மான": 341,
"##பு": 342,
"என்ற": 343,
"##வும்": 344,
"##சு": 345,
"##ன்று": 346,
"##டுத்த": 347,
"##னு": 348,
"##கள": 349,
"##டன்": 350,
"மற்ற": 351,
"##லி": 352,
"##்கள்": 353,
"##ர்கள": 354,
"உள்ள": 355,
"##ரும்": 356,
"பகு": 357,
"##சி": 358,
"##ற்று": 359,
"##ப்பட": 360,
"##ாள": 361,
"அர": 362,
"செய்": 363,
"பி": 364,
"இத": 365,
"##வத": 366,
"##ணி": 367,
"##வில்": 368,
"##ின்ற": 369,
"##ழி": 370,
"##ாய": 371,
"கே": 372,
"##க்கிற": 373,
"என்ப": 374,
"##ேன்": 375,
"நட": 376,
"து": 377,
"கி": 378,
"##்கு": 379,
"##சிய": 380,
"##னர்": 381,
"திரு": 382,
"##ஞ்ச": 383,
"மற்றும்": 384,
"##டைய": 385,
"##ண்டும்": 386,
"##ிக்": 387,
"தொட": 388,
"வை": 389,
"##பா": 390,
"முத": 391,
"##கம்": 392,
"##டம்": 393,
"மூ": 394,
"##ங்கு": 395,
"##லா": 396,
"கரு": 397,
"சே": 398,
"##ியா": 399,
"பய": 400,
"செய்ய": 401,
"வீ": 402,
"பல": 403,
"ஆக": 404,
"##மைய": 405,
"வரு": 406,
"##வர": 407,
"##ட்டி": 408,
"மீ": 409,
"##களை": 410,
"##னால்": 411,
"##வ்": 412,
"##ப்பா": 413,
"##ளிய": 414,
"இது": 415,
"எழு": 416,
"இருந்த": 417,
"பகுப்பு": 418,
"அறி": 419,
"தி": 420,
"தமிழ்": 421,
"##ப்பி": 422,
"நில": 423,
"##மே": 424,
"அந்த": 425,
"##ும்ப": 426,
"வர": 427,
"பதி": 428,
"##ப்போ": 429,
"##க்கிய": 430,
"நே": 431,
"தலை": 432,
"தமிழ": 433,
"##கை": 434,
"##ணை": 435,
"##லாம்": 436,
"பார": 437,
"##த்தின்": 438,
"##விய": 439,
"வழ": 440,
"##ிருந்த": 441,
"##டுத்து": 442,
"கொண்ட": 443,
"##ர்க்க": 444,
"##ம்பர்": 445,
"குறி": 446,
"##ையை": 447,
"கட": 448,
"என்ன": 449,
"##ூர்": 450,
"அமை": 451,
"##ற்கு": 452,
"##ரம்": 453,
"##லு": 454,
"##ன்ப": 455,
"##நா": 456,
"##கிறது": 457,
"##ிலும்": 458,
"தீ": 459,
"##ழை": 460,
"##க்கள்": 461,
"##வது": 462,
"##லம்": 463,
"##ங்களை": 464,
"##ார்கள்": 465,
"வெளிய": 466,
"இய": 467,
"##ிகள": 468,
"இர": 469,
"##ற்றி": 470,
"##யர்": 471,
"##ணம்": 472,
"சம": 473,
"##ங்கில": 474,
"சா": 475,
"##த்திய": 476,
"சொல்ல": 477,
"##க்கி": 478,
"அதிக": 479,
"வேண்டும்": 480,
"##ாது": 481,
"##னா": 482,
"பத": 483,
"நான்": 484,
"அல்ல": 485,
"இல்": 486,
"பின்": 487,
"இல": 488,
"##ரோ": 489,
"##பி": 490,
"சிற": 491,
"திர": 492,
"##க்கம்": 493,
"##ஸ்ட": 494,
"வந்த": 495,
"##போ": 496,
"##ிற்கு": 497,
"##டிக்க": 498,
"பிற": 499
}
}
}