low-resource-language-model-adaptation
/
adapted_tokenizers
/added-opt-eng
/opt_500-add_full-eng-opt
/added_tokens.json
{ | |
" !|": 50684, | |
" \"\",": 50713, | |
" (,": 50383, | |
" (;": 50386, | |
" 1864": 50700, | |
" 1871": 50742, | |
" 1876": 50745, | |
" 1878": 50757, | |
" 1881": 50722, | |
" 1882": 50724, | |
" 1883": 50736, | |
" 1884": 50707, | |
" 1885": 50667, | |
" 1887": 50686, | |
" 1891": 50666, | |
" 1892": 50654, | |
" 1894": 50642, | |
" ATTENTION": 50485, | |
" Abb": 50461, | |
" Academ": 50331, | |
" Addition": 50497, | |
" Adela": 50694, | |
" Agricult": 50597, | |
" Alab": 50528, | |
" Alber": 50526, | |
" Amaz": 50672, | |
" Angl": 50645, | |
" Anglican": 50754, | |
" Atlant": 50387, | |
" Banglades": 50530, | |
" Barcel": 50635, | |
" Bey": 50691, | |
" Biography": 50362, | |
" Bla": 50476, | |
" Blo": 50688, | |
" COVID": 50494, | |
" Cinc": 50692, | |
" Commer": 50554, | |
" Comple": 50661, | |
" Demographics": 50543, | |
" Diam": 50717, | |
" Diocese": 50640, | |
" Dougl": 50515, | |
" Eliz": 50413, | |
" Emp": 50360, | |
" Frid": 50562, | |
" Geography": 50404, | |
" Geor": 50300, | |
" Gmina": 50525, | |
" Harv": 50462, | |
" Histor": 50353, | |
" Honours": 50665, | |
" Independ": 50436, | |
" Ira": 50448, | |
" Legit": 50351, | |
" Mey": 50708, | |
" Minn": 50415, | |
" Moham": 50619, | |
" Muham": 50547, | |
" Municipality": 50564, | |
" Notable": 50452, | |
" Oblast": 50551, | |
" Oly": 50310, | |
" Paralymp": 50659, | |
" Phoen": 50689, | |
" Pitts": 50553, | |
" Pla": 50350, | |
" Prad": 50578, | |
" Princip": 50685, | |
" Profess": 50358, | |
" Prote": 50569, | |
" Publ": 50643, | |
" Ral": 50719, | |
" Rang": 50541, | |
" Reception": 50477, | |
" Rol": 50618, | |
" Romanized": 50606, | |
" Sain": 50593, | |
" Serb": 50445, | |
" Singles": 50561, | |
" São": 50718, | |
" Theod": 50682, | |
" Thir": 50752, | |
" Tod": 50418, | |
" Tourn": 50471, | |
" Venez": 50594, | |
" Verm": 50638, | |
" Vinc": 50589, | |
" Voiv": 50460, | |
" Voivodeship": 50467, | |
" Warri": 50750, | |
" Wednes": 50710, | |
" YouT": 50628, | |
" aband": 50449, | |
" adap": 50535, | |
" agricult": 50428, | |
" alcoh": 50625, | |
" anc": 50363, | |
" appoin": 50313, | |
" arran": 50397, | |
" asp": 50427, | |
" attra": 50488, | |
" becom": 50335, | |
" bey": 50508, | |
" bran": 50573, | |
" canc": 50613, | |
" carri": 50479, | |
" catal": 50693, | |
" cerem": 50430, | |
" colla": 50346, | |
" commune": 50495, | |
" compla": 50731, | |
" complet": 50399, | |
" compris": 50492, | |
" concer": 50394, | |
" conne": 50435, | |
" consid": 50303, | |
" consp": 50711, | |
" constitut": 50704, | |
" constru": 50308, | |
" controll": 50760, | |
" cricketer": 50650, | |
" danc": 50675, | |
" dang": 50657, | |
" daugh": 50325, | |
" decla": 50405, | |
" deliv": 50545, | |
" demonst": 50559, | |
" determ": 50365, | |
" dia": 50457, | |
" dif": 50287, | |
" dipl": 50469, | |
" discip": 50663, | |
" displa": 50568, | |
" draf": 50616, | |
" earl": 50336, | |
" eigh": 50762, | |
" emp": 50320, | |
" encoura": 50503, | |
" enfor": 50549, | |
" enj": 50470, | |
" enl": 50557, | |
" ere": 50539, | |
" examp": 50323, | |
" excell": 50651, | |
" exclud": 50744, | |
" expla": 50406, | |
" extens": 50626, | |
" gastropod": 50723, | |
" geomet": 50740, | |
" gradu": 50324, | |
" hap": 50453, | |
" headqu": 50407, | |
" herit": 50581, | |
" ille": 50536, | |
" immigr": 50558, | |
" impr": 50533, | |
" inducted": 50761, | |
" insc": 50592, | |
" inse": 50674, | |
" insp": 50379, | |
" institut": 50429, | |
" instru": 50523, | |
" interp": 50450, | |
" interse": 50518, | |
" journ": 50498, | |
" libr": 50746, | |
" locomot": 50519, | |
" locomotives": 50715, | |
" merg": 50402, | |
" moll": 50664, | |
" negot": 50501, | |
" nickn": 50577, | |
" ninet": 50730, | |
" nomin": 50355, | |
" nov": 50327, | |
" organiz": 50401, | |
" pandemic": 50566, | |
" partn": 50403, | |
" passeng": 50410, | |
" performan": 50322, | |
" pla": 50265, | |
" plann": 50359, | |
" poin": 50296, | |
" predecess": 50649, | |
" premi": 50583, | |
" produc": 50330, | |
" publ": 50271, | |
" purp": 50389, | |
" rebu": 50544, | |
" remov": 50373, | |
" repla": 50314, | |
" ri": 50534, | |
" rif": 50636, | |
" samp": 50506, | |
" schol": 50514, | |
" seg": 50510, | |
" sele": 50339, | |
" sla": 50422, | |
" soci": 50571, | |
" songwriter": 50749, | |
" spok": 50489, | |
" subd": 50624, | |
" subsidi": 50610, | |
" suppor": 50511, | |
" surre": 50614, | |
" swit": 50702, | |
" terminus": 50681, | |
" theore": 50582, | |
" thir": 50473, | |
" tourn": 50334, | |
" trav": 50512, | |
" tribut": 50560, | |
" tributary": 50648, | |
" underst": 50490, | |
" unincorporated": 50605, | |
" unsuccess": 50588, | |
" upgrad": 50620, | |
" urg": 50603, | |
" vess": 50455, | |
" villa": 50289, | |
" wick": 50660, | |
" wides": 50615, | |
" winn": 50368, | |
" worksh": 50690, | |
" Á": 50695, | |
" Ł": 50747, | |
" Š": 50669, | |
" ‘": 50468, | |
" “": 50306, | |
",”": 50585, | |
".”": 50438, | |
"::::\"": 50487, | |
"Amer": 50366, | |
"Billboard": 50507, | |
"CAA": 50437, | |
"EAD": 50584, | |
"TENT": 50482, | |
"TENTION": 50484, | |
"adio": 50311, | |
"agre": 50677, | |
"airman": 50374, | |
"alymp": 50646, | |
"ambig": 50580, | |
"ambiguation": 50595, | |
"amese": 50705, | |
"amics": 50714, | |
"ampionship": 50291, | |
"ancell": 50641, | |
"ancellor": 50696, | |
"anner": 50464, | |
"anning": 50491, | |
"aptist": 50598, | |
"areer": 50276, | |
"arged": 50709, | |
"arked": 50697, | |
"arliam": 50326, | |
"arliament": 50328, | |
"arly": 50270, | |
"arri": 50279, | |
"arriage": 50378, | |
"arried": 50301, | |
"arrier": 50734, | |
"arting": 50671, | |
"arv": 50678, | |
"asan": 50612, | |
"atab": 50662, | |
"auge": 50721, | |
"autiful": 50521, | |
"auty": 50611, | |
"avalry": 50556, | |
"avel": 50629, | |
"celand": 50658, | |
"cement": 50392, | |
"chang": 50609, | |
"chestra": 50465, | |
"coh": 50587, | |
"compass": 50741, | |
"cted": 50269, | |
"ctober": 50286, | |
"ctoral": 50412, | |
"cts": 50480, | |
"died": 50733, | |
"disambiguation": 50599, | |
"duce": 50442, | |
"duced": 50292, | |
"ducing": 50555, | |
"ducted": 50670, | |
"ebru": 50293, | |
"ebruary": 50294, | |
"ecess": 50364, | |
"eck": 50338, | |
"ecting": 50496, | |
"ects": 50375, | |
"ecut": 50312, | |
"ecution": 50516, | |
"ecutive": 50333, | |
"edding": 50579, | |
"eech": 50505, | |
"eep": 50340, | |
"eld": 50451, | |
"eleb": 50349, | |
"eler": 50531, | |
"ellite": 50563, | |
"ely": 50267, | |
"embers": 50295, | |
"emble": 50567, | |
"embly": 50337, | |
"emeter": 50447, | |
"emor": 50344, | |
"emorial": 50398, | |
"eneral": 50309, | |
"enia": 50633, | |
"eo": 50656, | |
"eptember": 50285, | |
"erbai": 50572, | |
"ersey": 50393, | |
"erted": 50673, | |
"esota": 50454, | |
"ething": 50443, | |
"etwork": 50321, | |
"ewhere": 50680, | |
"ferences": 50411, | |
"fers": 50630, | |
"fess": 50281, | |
"fic": 50342, | |
"ficult": 50385, | |
"fielder": 50542, | |
"footballer": 50540, | |
"forman": 50316, | |
"gend": 50423, | |
"gether": 50329, | |
"heme": 50600, | |
"iab": 50499, | |
"ibn": 50729, | |
"icine": 50426, | |
"iec": 50441, | |
"ieces": 50493, | |
"ieuten": 50419, | |
"ieutenant": 50424, | |
"ifically": 50483, | |
"ifican": 50652, | |
"ificance": 50655, | |
"iforn": 50317, | |
"ifornia": 50318, | |
"igade": 50444, | |
"iggest": 50548, | |
"igital": 50390, | |
"ilarly": 50621, | |
"ilding": 50417, | |
"ilm": 50268, | |
"inae": 50644, | |
"incip": 50347, | |
"includ": 50431, | |
"incorporated": 50591, | |
"inental": 50676, | |
"inist": 50627, | |
"innati": 50699, | |
"inum": 50738, | |
"istory": 50275, | |
"ivision": 50297, | |
"ić": 50367, | |
"lac": 50538, | |
"lacier": 50748, | |
"lack": 50298, | |
"lades": 50517, | |
"laimed": 50601, | |
"lant": 50356, | |
"lend": 50596, | |
"leyball": 50604, | |
"licated": 50758, | |
"lications": 50668, | |
"licit": 50703, | |
"loy": 50305, | |
"loyd": 50550, | |
"lywood": 50472, | |
"mina": 50486, | |
"miral": 50602, | |
"mphony": 50639, | |
"mpt": 50631, | |
"nold": 50575, | |
"nowiki": 50753, | |
"ntario": 50434, | |
"née": 50687, | |
"occer": 50432, | |
"ociety": 50315, | |
"odeship": 50463, | |
"olished": 50509, | |
"ollowing": 50348, | |
"ommended": 50433, | |
"omot": 50466, | |
"onald": 50481, | |
"onlyinclude": 50647, | |
"onn": 50369, | |
"onsin": 50459, | |
"ootball": 50274, | |
"ordin": 50475, | |
"orning": 50425, | |
"osm": 50565, | |
"osoph": 50416, | |
"osopher": 50756, | |
"osophy": 50478, | |
"ospitals": 50726, | |
"otan": 50637, | |
"ounc": 50299, | |
"ouncil": 50302, | |
"ouncill": 50706, | |
"ounsel": 50683, | |
"oura": 50408, | |
"overn": 50272, | |
"overnment": 50278, | |
"overnor": 50332, | |
"oviet": 50352, | |
"ović": 50522, | |
"owever": 50280, | |
"oye": 50653, | |
"pective": 50622, | |
"poration": 50400, | |
"pril": 50288, | |
"quir": 50727, | |
"racy": 50537, | |
"rague": 50737, | |
"rapy": 50617, | |
"raska": 50679, | |
"rass": 50720, | |
"reater": 50395, | |
"reland": 50343, | |
"rena": 50590, | |
"retary": 50354, | |
"ricket": 50396, | |
"ricketer": 50546, | |
"ricts": 50446, | |
"ricult": 50380, | |
"rif": 50421, | |
"riptions": 50716, | |
"rizona": 50500, | |
"roke": 50524, | |
"rong": 50574, | |
"ronze": 50439, | |
"ropical": 50414, | |
"ropod": 50634, | |
"ryst": 50520, | |
"rystal": 50698, | |
"seud": 50529, | |
"ska": 50701, | |
"sor": 50732, | |
"stance": 50743, | |
"stances": 50608, | |
"stit": 50277, | |
"stitut": 50361, | |
"stitute": 50319, | |
"stitution": 50388, | |
"stitutional": 50725, | |
"teg": 50371, | |
"tend": 50284, | |
"tended": 50304, | |
"ternational": 50283, | |
"ternet": 50456, | |
"thlet": 50341, | |
"thlete": 50712, | |
"thm": 50623, | |
"ture": 50570, | |
"tym": 50751, | |
"ubl": 50266, | |
"ublican": 50382, | |
"ublin": 50502, | |
"uccess": 50504, | |
"uerto": 50527, | |
"ugby": 50372, | |
"ugust": 50282, | |
"uis": 50739, | |
"ularly": 50345, | |
"umbent": 50632, | |
"unior": 50377, | |
"unnel": 50586, | |
"urday": 50513, | |
"urname": 50381, | |
"usband": 50370, | |
"usic": 50273, | |
"velopment": 50391, | |
"vement": 50409, | |
"vements": 50552, | |
"yal": 50440, | |
"ydro": 50420, | |
"ygen": 50755, | |
"ylvan": 50376, | |
"ylvania": 50384, | |
"ymn": 50474, | |
"zil": 50357, | |
"él": 50759, | |
"ów": 50458, | |
"ław": 50735, | |
"ń": 50532, | |
"ž": 50607, | |
"ș": 50576, | |
"’": 50290, | |
"”": 50307, | |
"”.": 50728 | |
} | |