chaeyeonl33 committed on
Commit
b0d3fff
1 Parent(s): 38d864d

Upload tokenizer

Browse files
Files changed (2) hide show
  1. added_tokens.json +205 -0
  2. tokenizer_config.json +0 -4
added_tokens.json ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<mask>": 256203,
3
+ "ace_Arab": 256001,
4
+ "ace_Latn": 256002,
5
+ "acm_Arab": 256003,
6
+ "acq_Arab": 256004,
7
+ "aeb_Arab": 256005,
8
+ "afr_Latn": 256006,
9
+ "ajp_Arab": 256007,
10
+ "aka_Latn": 256008,
11
+ "als_Latn": 256162,
12
+ "amh_Ethi": 256009,
13
+ "apc_Arab": 256010,
14
+ "arb_Arab": 256011,
15
+ "ars_Arab": 256012,
16
+ "ary_Arab": 256013,
17
+ "arz_Arab": 256014,
18
+ "asm_Beng": 256015,
19
+ "ast_Latn": 256016,
20
+ "awa_Deva": 256017,
21
+ "ayr_Latn": 256018,
22
+ "azb_Arab": 256019,
23
+ "azj_Latn": 256020,
24
+ "bak_Cyrl": 256021,
25
+ "bam_Latn": 256022,
26
+ "ban_Latn": 256023,
27
+ "bel_Cyrl": 256024,
28
+ "bem_Latn": 256025,
29
+ "ben_Beng": 256026,
30
+ "bho_Deva": 256027,
31
+ "bjn_Arab": 256028,
32
+ "bjn_Latn": 256029,
33
+ "bod_Tibt": 256030,
34
+ "bos_Latn": 256031,
35
+ "bug_Latn": 256032,
36
+ "bul_Cyrl": 256033,
37
+ "cat_Latn": 256034,
38
+ "ceb_Latn": 256035,
39
+ "ces_Latn": 256036,
40
+ "cjk_Latn": 256037,
41
+ "ckb_Arab": 256038,
42
+ "crh_Latn": 256039,
43
+ "cym_Latn": 256040,
44
+ "dan_Latn": 256041,
45
+ "deu_Latn": 256042,
46
+ "dik_Latn": 256043,
47
+ "dyu_Latn": 256044,
48
+ "dzo_Tibt": 256045,
49
+ "ell_Grek": 256046,
50
+ "eng_Latn": 256047,
51
+ "epo_Latn": 256048,
52
+ "est_Latn": 256049,
53
+ "eus_Latn": 256050,
54
+ "ewe_Latn": 256051,
55
+ "fao_Latn": 256052,
56
+ "fij_Latn": 256054,
57
+ "fin_Latn": 256055,
58
+ "fon_Latn": 256056,
59
+ "fra_Latn": 256057,
60
+ "fur_Latn": 256058,
61
+ "fuv_Latn": 256059,
62
+ "gaz_Latn": 256135,
63
+ "gla_Latn": 256060,
64
+ "gle_Latn": 256061,
65
+ "glg_Latn": 256062,
66
+ "grn_Latn": 256063,
67
+ "guj_Gujr": 256064,
68
+ "hat_Latn": 256065,
69
+ "hau_Latn": 256066,
70
+ "heb_Hebr": 256067,
71
+ "hin_Deva": 256068,
72
+ "hne_Deva": 256069,
73
+ "hrv_Latn": 256070,
74
+ "hun_Latn": 256071,
75
+ "hye_Armn": 256072,
76
+ "ibo_Latn": 256073,
77
+ "ilo_Latn": 256074,
78
+ "ind_Latn": 256075,
79
+ "isl_Latn": 256076,
80
+ "ita_Latn": 256077,
81
+ "jav_Latn": 256078,
82
+ "jpn_Jpan": 256079,
83
+ "kab_Latn": 256080,
84
+ "kac_Latn": 256081,
85
+ "kam_Latn": 256082,
86
+ "kan_Knda": 256083,
87
+ "kas_Arab": 256084,
88
+ "kas_Deva": 256085,
89
+ "kat_Geor": 256086,
90
+ "kaz_Cyrl": 256089,
91
+ "kbp_Latn": 256090,
92
+ "kea_Latn": 256091,
93
+ "khk_Cyrl": 256122,
94
+ "khm_Khmr": 256092,
95
+ "kik_Latn": 256093,
96
+ "kin_Latn": 256094,
97
+ "kir_Cyrl": 256095,
98
+ "kmb_Latn": 256096,
99
+ "kmr_Latn": 256099,
100
+ "knc_Arab": 256087,
101
+ "knc_Latn": 256088,
102
+ "kon_Latn": 256097,
103
+ "kor_Hang": 256098,
104
+ "lao_Laoo": 256100,
105
+ "lij_Latn": 256102,
106
+ "lim_Latn": 256103,
107
+ "lin_Latn": 256104,
108
+ "lit_Latn": 256105,
109
+ "lmo_Latn": 256106,
110
+ "ltg_Latn": 256107,
111
+ "ltz_Latn": 256108,
112
+ "lua_Latn": 256109,
113
+ "lug_Latn": 256110,
114
+ "luo_Latn": 256111,
115
+ "lus_Latn": 256112,
116
+ "lvs_Latn": 256101,
117
+ "mag_Deva": 256113,
118
+ "mai_Deva": 256114,
119
+ "mal_Mlym": 256115,
120
+ "mar_Deva": 256116,
121
+ "min_Latn": 256117,
122
+ "mkd_Cyrl": 256118,
123
+ "mlt_Latn": 256120,
124
+ "mni_Beng": 256121,
125
+ "mos_Latn": 256123,
126
+ "mri_Latn": 256124,
127
+ "mya_Mymr": 256126,
128
+ "nld_Latn": 256127,
129
+ "nno_Latn": 256128,
130
+ "nob_Latn": 256129,
131
+ "npi_Deva": 256130,
132
+ "nso_Latn": 256131,
133
+ "nus_Latn": 256132,
134
+ "nya_Latn": 256133,
135
+ "oci_Latn": 256134,
136
+ "ory_Orya": 256136,
137
+ "pag_Latn": 256137,
138
+ "pan_Guru": 256138,
139
+ "pap_Latn": 256139,
140
+ "pbt_Arab": 256143,
141
+ "pes_Arab": 256053,
142
+ "plt_Latn": 256119,
143
+ "pol_Latn": 256140,
144
+ "por_Latn": 256141,
145
+ "prs_Arab": 256142,
146
+ "quy_Latn": 256144,
147
+ "ron_Latn": 256145,
148
+ "run_Latn": 256146,
149
+ "rus_Cyrl": 256147,
150
+ "sag_Latn": 256148,
151
+ "san_Deva": 256149,
152
+ "sat_Beng": 256150,
153
+ "scn_Latn": 256151,
154
+ "shn_Mymr": 256152,
155
+ "sin_Sinh": 256153,
156
+ "slk_Latn": 256154,
157
+ "slv_Latn": 256155,
158
+ "smo_Latn": 256156,
159
+ "sna_Latn": 256157,
160
+ "snd_Arab": 256158,
161
+ "som_Latn": 256159,
162
+ "sot_Latn": 256160,
163
+ "spa_Latn": 256161,
164
+ "srd_Latn": 256163,
165
+ "srp_Cyrl": 256164,
166
+ "ssw_Latn": 256165,
167
+ "sun_Latn": 256166,
168
+ "swe_Latn": 256167,
169
+ "swh_Latn": 256168,
170
+ "szl_Latn": 256169,
171
+ "tam_Taml": 256170,
172
+ "taq_Latn": 256177,
173
+ "taq_Tfng": 256178,
174
+ "tat_Cyrl": 256171,
175
+ "tel_Telu": 256172,
176
+ "tgk_Cyrl": 256173,
177
+ "tgl_Latn": 256174,
178
+ "tha_Thai": 256175,
179
+ "tir_Ethi": 256176,
180
+ "tpi_Latn": 256179,
181
+ "tsn_Latn": 256180,
182
+ "tso_Latn": 256181,
183
+ "tuk_Latn": 256182,
184
+ "tum_Latn": 256183,
185
+ "tur_Latn": 256184,
186
+ "twi_Latn": 256185,
187
+ "tzm_Tfng": 256186,
188
+ "uig_Arab": 256187,
189
+ "ukr_Cyrl": 256188,
190
+ "umb_Latn": 256189,
191
+ "urd_Arab": 256190,
192
+ "uzn_Latn": 256191,
193
+ "vec_Latn": 256192,
194
+ "vie_Latn": 256193,
195
+ "war_Latn": 256194,
196
+ "wol_Latn": 256195,
197
+ "xho_Latn": 256196,
198
+ "ydd_Hebr": 256197,
199
+ "yor_Latn": 256198,
200
+ "yue_Hant": 256199,
201
+ "zho_Hans": 256200,
202
+ "zho_Hant": 256201,
203
+ "zsm_Latn": 256125,
204
+ "zul_Latn": 256202
205
+ }
tokenizer_config.json CHANGED
@@ -1867,16 +1867,12 @@
1867
  "eos_token": "</s>",
1868
  "legacy_behaviour": false,
1869
  "mask_token": "<mask>",
1870
- "max_length": 128,
1871
  "model_max_length": 1024,
1872
  "pad_token": "<pad>",
1873
  "sep_token": "</s>",
1874
  "sp_model_kwargs": {},
1875
  "src_lang": "kor-Hang",
1876
- "stride": 0,
1877
  "tgt_lang": "eng-Latn",
1878
  "tokenizer_class": "NllbTokenizer",
1879
- "truncation_side": "right",
1880
- "truncation_strategy": "longest_first",
1881
  "unk_token": "<unk>"
1882
  }
 
1867
  "eos_token": "</s>",
1868
  "legacy_behaviour": false,
1869
  "mask_token": "<mask>",
 
1870
  "model_max_length": 1024,
1871
  "pad_token": "<pad>",
1872
  "sep_token": "</s>",
1873
  "sp_model_kwargs": {},
1874
  "src_lang": "kor-Hang",
 
1875
  "tgt_lang": "eng-Latn",
1876
  "tokenizer_class": "NllbTokenizer",
 
 
1877
  "unk_token": "<unk>"
1878
  }