char-bert-base-uncased / tokenizer.json
lhy's picture
add tokenizer
25ba562
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": false,
"strip_accents": false,
"lowercase": false
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
1
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
2
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "##",
"cleanup": true
},
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[UNK]": 0,
"[CLS]": 1,
"[SEP]": 2,
"[PAD]": 3,
"[MASK]": 4,
"\u0001": 5,
"\u0002": 6,
"\u0003": 7,
"\u0004": 8,
"\u0005": 9,
"!": 10,
"\"": 11,
"#": 12,
"$": 13,
"%": 14,
"&": 15,
"'": 16,
"(": 17,
")": 18,
"*": 19,
"+": 20,
",": 21,
"-": 22,
".": 23,
"/": 24,
"0": 25,
"1": 26,
"2": 27,
"3": 28,
"4": 29,
"5": 30,
"6": 31,
"7": 32,
"8": 33,
"9": 34,
":": 35,
";": 36,
"<": 37,
"=": 38,
">": 39,
"?": 40,
"@": 41,
"A": 42,
"B": 43,
"C": 44,
"D": 45,
"E": 46,
"F": 47,
"G": 48,
"H": 49,
"I": 50,
"J": 51,
"K": 52,
"L": 53,
"M": 54,
"N": 55,
"O": 56,
"P": 57,
"Q": 58,
"R": 59,
"S": 60,
"T": 61,
"U": 62,
"V": 63,
"W": 64,
"X": 65,
"Y": 66,
"Z": 67,
"[": 68,
"\\": 69,
"]": 70,
"^": 71,
"_": 72,
"`": 73,
"a": 74,
"b": 75,
"c": 76,
"d": 77,
"e": 78,
"f": 79,
"g": 80,
"h": 81,
"i": 82,
"j": 83,
"k": 84,
"l": 85,
"m": 86,
"n": 87,
"o": 88,
"p": 89,
"q": 90,
"r": 91,
"s": 92,
"t": 93,
"u": 94,
"v": 95,
"w": 96,
"x": 97,
"y": 98,
"z": 99,
"{": 100,
"|": 101,
"}": 102,
"~": 103,
"¡": 104,
"¢": 105,
"£": 106,
"¥": 107,
"§": 108,
"¯": 109,
"µ": 110,
"º": 111,
"»": 112,
"¿": 113,
"À": 114,
"Â": 115,
"Ã": 116,
"Ä": 117,
"Å": 118,
"Ç": 119,
"Ë": 120,
"Í": 121,
"Î": 122,
"Ï": 123,
"Ñ": 124,
"Ó": 125,
"Ø": 126,
"Ù": 127,
"Ú": 128,
"Ü": 129,
"ß": 130,
"à": 131,
"á": 132,
"â": 133,
"ã": 134,
"ä": 135,
"å": 136,
"æ": 137,
"ç": 138,
"è": 139,
"é": 140,
"ê": 141,
"ë": 142,
"ì": 143,
"í": 144,
"î": 145,
"ï": 146,
"ñ": 147,
"ò": 148,
"ó": 149,
"ô": 150,
"õ": 151,
"ö": 152,
"ø": 153,
"ù": 154,
"ú": 155,
"û": 156,
"ü": 157,
"ý": 158,
"þ": 159,
"ā": 160,
"ă": 161,
"ą": 162,
"ć": 163,
"č": 164,
"ď": 165,
"đ": 166,
"ē": 167,
"ĕ": 168,
"ė": 169,
"Ę": 170,
"ę": 171,
"ě": 172,
"ġ": 173,
"ģ": 174,
"ĩ": 175,
"ī": 176,
"ĭ": 177,
"İ": 178,
"ı": 179,
"ĵ": 180,
"ķ": 181,
"ĸ": 182,
"ĺ": 183,
"ł": 184,
"ń": 185,
"ň": 186,
"ʼn": 187,
"ŋ": 188,
"ō": 189,
"ŏ": 190,
"ő": 191,
"œ": 192,
"ŕ": 193,
"ŗ": 194,
"Ř": 195,
"ř": 196,
"Ś": 197,
"ś": 198,
"Ş": 199,
"ş": 200,
"š": 201,
"ţ": 202,
"Ť": 203,
"ť": 204,
"ũ": 205,
"ū": 206,
"ŭ": 207,
"ű": 208,
"ų": 209,
"ŵ": 210,
"Ÿ": 211,
"ż": 212,
"ž": 213,
"ƀ": 214,
"Ɓ": 215,
"Ƅ": 216,
"ƅ": 217,
"Ƈ": 218,
"ƒ": 219,
"ƙ": 220,
"ƞ": 221,
"Ƭ": 222,
"Ư": 223,
"Ƴ": 224,
"Ǐ": 225,
"Ƿ": 226,
"ǹ": 227,
"ȋ": 228,
"ș": 229,
"ț": 230,
"ȧ": 231,
"ȯ": 232,
"Ʌ": 233,
"ɑ": 234,
"ɗ": 235,
"ɠ": 236,
"ɡ": 237,
"ɢ": 238,
"ɣ": 239,
"ɩ": 240,
"ɪ": 241,
"ɭ": 242,
"ɯ": 243,
"ɱ": 244,
"ɳ": 245,
"ɴ": 246,
"ɺ": 247,
"ɼ": 248,
"ɾ": 249,
"ʀ": 250,
"ʂ": 251,
"ʄ": 252,
"ʋ": 253,
"ʌ": 254,
"ʍ": 255,
"ʏ": 256,
"ʙ": 257,
"ʜ": 258,
"ʝ": 259,
"ʟ": 260,
"ʨ": 261,
"˄": 262,
"Α": 263,
"Β": 264,
"Ε": 265,
"Ζ": 266,
"Η": 267,
"Ι": 268,
"Κ": 269,
"Μ": 270,
"Ν": 271,
"Ο": 272,
"Ρ": 273,
"Τ": 274,
"Υ": 275,
"Χ": 276,
"ί": 277,
"α": 278,
"β": 279,
"γ": 280,
"η": 281,
"ι": 282,
"κ": 283,
"μ": 284,
"ν": 285,
"ο": 286,
"π": 287,
"ρ": 288,
"σ": 289,
"τ": 290,
"υ": 291,
"χ": 292,
"ω": 293,
"ϲ": 294,
"ϳ": 295,
"Ϲ": 296,
"Ϻ": 297,
"Ѕ": 298,
"Ј": 299,
"А": 300,
"В": 301,
"Е": 302,
"З": 303,
"К": 304,
"М": 305,
"Н": 306,
"О": 307,
"Р": 308,
"С": 309,
"Т": 310,
"У": 311,
"Х": 312,
"Ь": 313,
"а": 314,
"в": 315,
"г": 316,
"д": 317,
"е": 318,
"и": 319,
"к": 320,
"л": 321,
"н": 322,
"о": 323,
"п": 324,
"р": 325,
"с": 326,
"т": 327,
"у": 328,
"х": 329,
"ч": 330,
"ш": 331,
"щ": 332,
"ѐ": 333,
"ё": 334,
"ѕ": 335,
"і": 336,
"ј": 337,
"џ": 338,
"ѡ": 339,
"Ѵ": 340,
"ѵ": 341,
"ҏ": 342,
"қ": 343,
"ҡ": 344,
"ң": 345,
"ҥ": 346,
"Ү": 347,
"ү": 348,
"ҳ": 349,
"һ": 350,
"ҽ": 351,
"ӏ": 352,
"ԁ": 353,
"ԛ": 354,
"Ա": 355,
"Ի": 356,
"Ս": 357,
"Տ": 358,
"Օ": 359,
"ա": 360,
"գ": 361,
"զ": 362,
"ժ": 363,
"հ": 364,
"յ": 365,
"ս": 366,
"օ": 367,
"Ⴍ": 368,
"Ⴓ": 369,
"Ⴝ": 370,
"Ꭰ": 371,
"Ꭲ": 372,
"Ꭵ": 373,
"Ꭺ": 374,
"Ꭻ": 375,
"Ꮃ": 376,
"Ꮇ": 377,
"Ꮋ": 378,
"Ꮐ": 379,
"Ꮓ": 380,
"Ꮢ": 381,
"Ꮩ": 382,
"Ꮪ": 383,
"Ꮮ": 384,
"Ꮯ": 385,
"Ꮲ": 386,
"Ꮶ": 387,
"Ᏼ": 388,
"ᚱ": 389,
"ᛁ": 390,
"ᛒ": 391,
"ᛕ": 392,
"ᛖ": 393,
"ᴄ": 394,
"ᴇ": 395,
"ᴋ": 396,
"ᴍ": 397,
"ᴏ": 398,
"ᴑ": 399,
"ᴜ": 400,
"ᴠ": 401,
"ᴡ": 402,
"ᴦ": 403,
"ᴨ": 404,
"ᴺ": 405,
"ᴼ": 406,
"ᴾ": 407,
"ᴿ": 408,
"ḟ": 409,
"ḱ": 410,
"ḿ": 411,
"ṁ": 412,
"ṅ": 413,
"Ṛ": 414,
"ṡ": 415,
"ẁ": 416,
"ẃ": 417,
"ẇ": 418,
"ἀ": 419,
"ἁ": 420,
"ἇ": 421,
"ἰ": 422,
"ἱ": 423,
"ἳ": 424,
"ὀ": 425,
"ὁ": 426,
"ὶ": 427,
"ί": 428,
"ῤ": 429,
"ῥ": 430,
"―": 431,
"₩": 432,
"€": 433,
"₿": 434,
"ℹ": 435,
"⋃": 436,
"𝘼": 437,
"𝘾": 438,
"𝘿": 439,
"𝙀": 440,
"𝙍": 441,
"𝙏": 442,
"##\u0001": 443,
"##\u0002": 444,
"##\u0003": 445,
"##\u0004": 446,
"##\u0005": 447,
"##!": 448,
"##\"": 449,
"###": 450,
"##$": 451,
"##%": 452,
"##&": 453,
"##'": 454,
"##(": 455,
"##)": 456,
"##*": 457,
"##+": 458,
"##,": 459,
"##-": 460,
"##.": 461,
"##/": 462,
"##0": 463,
"##1": 464,
"##2": 465,
"##3": 466,
"##4": 467,
"##5": 468,
"##6": 469,
"##7": 470,
"##8": 471,
"##9": 472,
"##:": 473,
"##;": 474,
"##<": 475,
"##=": 476,
"##>": 477,
"##?": 478,
"##@": 479,
"##A": 480,
"##B": 481,
"##C": 482,
"##D": 483,
"##E": 484,
"##F": 485,
"##G": 486,
"##H": 487,
"##I": 488,
"##J": 489,
"##K": 490,
"##L": 491,
"##M": 492,
"##N": 493,
"##O": 494,
"##P": 495,
"##Q": 496,
"##R": 497,
"##S": 498,
"##T": 499,
"##U": 500,
"##V": 501,
"##W": 502,
"##X": 503,
"##Y": 504,
"##Z": 505,
"##[": 506,
"##\\": 507,
"##]": 508,
"##^": 509,
"##_": 510,
"##`": 511,
"##a": 512,
"##b": 513,
"##c": 514,
"##d": 515,
"##e": 516,
"##f": 517,
"##g": 518,
"##h": 519,
"##i": 520,
"##j": 521,
"##k": 522,
"##l": 523,
"##m": 524,
"##n": 525,
"##o": 526,
"##p": 527,
"##q": 528,
"##r": 529,
"##s": 530,
"##t": 531,
"##u": 532,
"##v": 533,
"##w": 534,
"##x": 535,
"##y": 536,
"##z": 537,
"##{": 538,
"##|": 539,
"##}": 540,
"##~": 541,
"##¡": 542,
"##¢": 543,
"##£": 544,
"##¥": 545,
"##§": 546,
"##¯": 547,
"##µ": 548,
"##º": 549,
"##»": 550,
"##¿": 551,
"##À": 552,
"##Â": 553,
"##Ã": 554,
"##Ä": 555,
"##Å": 556,
"##Ç": 557,
"##Ë": 558,
"##Í": 559,
"##Î": 560,
"##Ï": 561,
"##Ñ": 562,
"##Ó": 563,
"##Ø": 564,
"##Ù": 565,
"##Ú": 566,
"##Ü": 567,
"##ß": 568,
"##à": 569,
"##á": 570,
"##â": 571,
"##ã": 572,
"##ä": 573,
"##å": 574,
"##æ": 575,
"##ç": 576,
"##è": 577,
"##é": 578,
"##ê": 579,
"##ë": 580,
"##ì": 581,
"##í": 582,
"##î": 583,
"##ï": 584,
"##ñ": 585,
"##ò": 586,
"##ó": 587,
"##ô": 588,
"##õ": 589,
"##ö": 590,
"##ø": 591,
"##ù": 592,
"##ú": 593,
"##û": 594,
"##ü": 595,
"##ý": 596,
"##þ": 597,
"##ā": 598,
"##ă": 599,
"##ą": 600,
"##ć": 601,
"##č": 602,
"##ď": 603,
"##đ": 604,
"##ē": 605,
"##ĕ": 606,
"##ė": 607,
"##Ę": 608,
"##ę": 609,
"##ě": 610,
"##ġ": 611,
"##ģ": 612,
"##ĩ": 613,
"##ī": 614,
"##ĭ": 615,
"##İ": 616,
"##ı": 617,
"##ĵ": 618,
"##ķ": 619,
"##ĸ": 620,
"##ĺ": 621,
"##ł": 622,
"##ń": 623,
"##ň": 624,
"##ʼn": 625,
"##ŋ": 626,
"##ō": 627,
"##ŏ": 628,
"##ő": 629,
"##œ": 630,
"##ŕ": 631,
"##ŗ": 632,
"##Ř": 633,
"##ř": 634,
"##Ś": 635,
"##ś": 636,
"##Ş": 637,
"##ş": 638,
"##š": 639,
"##ţ": 640,
"##Ť": 641,
"##ť": 642,
"##ũ": 643,
"##ū": 644,
"##ŭ": 645,
"##ű": 646,
"##ų": 647,
"##ŵ": 648,
"##Ÿ": 649,
"##ż": 650,
"##ž": 651,
"##ƀ": 652,
"##Ɓ": 653,
"##Ƅ": 654,
"##ƅ": 655,
"##Ƈ": 656,
"##ƒ": 657,
"##ƙ": 658,
"##ƞ": 659,
"##Ƭ": 660,
"##Ư": 661,
"##Ƴ": 662,
"##Ǐ": 663,
"##Ƿ": 664,
"##ǹ": 665,
"##ȋ": 666,
"##ș": 667,
"##ț": 668,
"##ȧ": 669,
"##ȯ": 670,
"##Ʌ": 671,
"##ɑ": 672,
"##ɗ": 673,
"##ɠ": 674,
"##ɡ": 675,
"##ɢ": 676,
"##ɣ": 677,
"##ɩ": 678,
"##ɪ": 679,
"##ɭ": 680,
"##ɯ": 681,
"##ɱ": 682,
"##ɳ": 683,
"##ɴ": 684,
"##ɺ": 685,
"##ɼ": 686,
"##ɾ": 687,
"##ʀ": 688,
"##ʂ": 689,
"##ʄ": 690,
"##ʋ": 691,
"##ʌ": 692,
"##ʍ": 693,
"##ʏ": 694,
"##ʙ": 695,
"##ʜ": 696,
"##ʝ": 697,
"##ʟ": 698,
"##ʨ": 699,
"##˄": 700,
"##Α": 701,
"##Β": 702,
"##Ε": 703,
"##Ζ": 704,
"##Η": 705,
"##Ι": 706,
"##Κ": 707,
"##Μ": 708,
"##Ν": 709,
"##Ο": 710,
"##Ρ": 711,
"##Τ": 712,
"##Υ": 713,
"##Χ": 714,
"##ί": 715,
"##α": 716,
"##β": 717,
"##γ": 718,
"##η": 719,
"##ι": 720,
"##κ": 721,
"##μ": 722,
"##ν": 723,
"##ο": 724,
"##π": 725,
"##ρ": 726,
"##σ": 727,
"##τ": 728,
"##υ": 729,
"##χ": 730,
"##ω": 731,
"##ϲ": 732,
"##ϳ": 733,
"##Ϲ": 734,
"##Ϻ": 735,
"##Ѕ": 736,
"##Ј": 737,
"##А": 738,
"##В": 739,
"##Е": 740,
"##З": 741,
"##К": 742,
"##М": 743,
"##Н": 744,
"##О": 745,
"##Р": 746,
"##С": 747,
"##Т": 748,
"##У": 749,
"##Х": 750,
"##Ь": 751,
"##а": 752,
"##в": 753,
"##г": 754,
"##д": 755,
"##е": 756,
"##и": 757,
"##к": 758,
"##л": 759,
"##н": 760,
"##о": 761,
"##п": 762,
"##р": 763,
"##с": 764,
"##т": 765,
"##у": 766,
"##х": 767,
"##ч": 768,
"##ш": 769,
"##щ": 770,
"##ѐ": 771,
"##ё": 772,
"##ѕ": 773,
"##і": 774,
"##ј": 775,
"##џ": 776,
"##ѡ": 777,
"##Ѵ": 778,
"##ѵ": 779,
"##ҏ": 780,
"##қ": 781,
"##ҡ": 782,
"##ң": 783,
"##ҥ": 784,
"##Ү": 785,
"##ү": 786,
"##ҳ": 787,
"##һ": 788,
"##ҽ": 789,
"##ӏ": 790,
"##ԁ": 791,
"##ԛ": 792,
"##Ա": 793,
"##Ի": 794,
"##Ս": 795,
"##Տ": 796,
"##Օ": 797,
"##ա": 798,
"##գ": 799,
"##զ": 800,
"##ժ": 801,
"##հ": 802,
"##յ": 803,
"##ս": 804,
"##օ": 805,
"##Ⴍ": 806,
"##Ⴓ": 807,
"##Ⴝ": 808,
"##Ꭰ": 809,
"##Ꭲ": 810,
"##Ꭵ": 811,
"##Ꭺ": 812,
"##Ꭻ": 813,
"##Ꮃ": 814,
"##Ꮇ": 815,
"##Ꮋ": 816,
"##Ꮐ": 817,
"##Ꮓ": 818,
"##Ꮢ": 819,
"##Ꮩ": 820,
"##Ꮪ": 821,
"##Ꮮ": 822,
"##Ꮯ": 823,
"##Ꮲ": 824,
"##Ꮶ": 825,
"##Ᏼ": 826,
"##ᚱ": 827,
"##ᛁ": 828,
"##ᛒ": 829,
"##ᛕ": 830,
"##ᛖ": 831,
"##ᴄ": 832,
"##ᴇ": 833,
"##ᴋ": 834,
"##ᴍ": 835,
"##ᴏ": 836,
"##ᴑ": 837,
"##ᴜ": 838,
"##ᴠ": 839,
"##ᴡ": 840,
"##ᴦ": 841,
"##ᴨ": 842,
"##ᴺ": 843,
"##ᴼ": 844,
"##ᴾ": 845,
"##ᴿ": 846,
"##ḟ": 847,
"##ḱ": 848,
"##ḿ": 849,
"##ṁ": 850,
"##ṅ": 851,
"##Ṛ": 852,
"##ṡ": 853,
"##ẁ": 854,
"##ẃ": 855,
"##ẇ": 856,
"##ἀ": 857,
"##ἁ": 858,
"##ἇ": 859,
"##ἰ": 860,
"##ἱ": 861,
"##ἳ": 862,
"##ὀ": 863,
"##ὁ": 864,
"##ὶ": 865,
"##ί": 866,
"##ῤ": 867,
"##ῥ": 868,
"##―": 869,
"##₩": 870,
"##€": 871,
"##₿": 872,
"##ℹ": 873,
"##⋃": 874,
"##𝘼": 875,
"##𝘾": 876,
"##𝘿": 877,
"##𝙀": 878,
"##𝙍": 879,
"##𝙏": 880
}
}
}