Saripudin committed on
Commit
9ebf9a0
·
verified ·
1 Parent(s): 35a07fd

Upload indonesia_tokenizer_v1.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. indonesia_tokenizer_v1.json +529 -0
indonesia_tokenizer_v1.json ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[STOP]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[UNK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[SPACE]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": null,
35
+ "pre_tokenizer": {
36
+ "type": "Whitespace"
37
+ },
38
+ "post_processor": null,
39
+ "decoder": null,
40
+ "model": {
41
+ "type": "BPE",
42
+ "dropout": null,
43
+ "unk_token": "[UNK]",
44
+ "continuing_subword_prefix": null,
45
+ "end_of_word_suffix": null,
46
+ "fuse_unk": false,
47
+ "byte_fallback": false,
48
+ "vocab": {
49
+ "[STOP]": 0,
50
+ "[UNK]": 1,
51
+ "[SPACE]": 2,
52
+ "!": 3,
53
+ "'": 4,
54
+ ",": 5,
55
+ "-": 6,
56
+ ".": 7,
57
+ "?": 8,
58
+ "a": 9,
59
+ "b": 10,
60
+ "c": 11,
61
+ "d": 12,
62
+ "e": 13,
63
+ "f": 14,
64
+ "g": 15,
65
+ "h": 16,
66
+ "i": 17,
67
+ "j": 18,
68
+ "k": 19,
69
+ "l": 20,
70
+ "m": 21,
71
+ "n": 22,
72
+ "o": 23,
73
+ "p": 24,
74
+ "q": 25,
75
+ "r": 26,
76
+ "s": 27,
77
+ "t": 28,
78
+ "u": 29,
79
+ "v": 30,
80
+ "w": 31,
81
+ "x": 32,
82
+ "y": 33,
83
+ "z": 34,
84
+ "an": 35,
85
+ "en": 36,
86
+ "er": 37,
87
+ "in": 38,
88
+ "ang": 39,
89
+ "at": 40,
90
+ "ar": 41,
91
+ "am": 42,
92
+ "as": 43,
93
+ "ak": 44,
94
+ "ah": 45,
95
+ "di": 46,
96
+ "em": 47,
97
+ "al": 48,
98
+ "un": 49,
99
+ "ik": 50,
100
+ "uk": 51,
101
+ "men": 52,
102
+ "se": 53,
103
+ "ke": 54,
104
+ "ap": 55,
105
+ "ber": 56,
106
+ "si": 57,
107
+ "yang": 58,
108
+ "eng": 59,
109
+ "mem": 60,
110
+ "it": 61,
111
+ "ya": 62,
112
+ "da": 63,
113
+ "dan": 64,
114
+ "bi": 65,
115
+ "el": 66,
116
+ "ini": 67,
117
+ "asi": 68,
118
+ "ter": 69,
119
+ "per": 70,
120
+ "ti": 71,
121
+ "on": 72,
122
+ "ba": 73,
123
+ "kam": 74,
124
+ "kan": 75,
125
+ "tuk": 76,
126
+ "bu": 77,
127
+ "ing": 78,
128
+ "ai": 79,
129
+ "akan": 80,
130
+ "untuk": 81,
131
+ "or": 82,
132
+ "ikan": 83,
133
+ "et": 84,
134
+ "sa": 85,
135
+ "us": 86,
136
+ "ung": 87,
137
+ "au": 88,
138
+ "tu": 89,
139
+ "deng": 90,
140
+ "pr": 91,
141
+ "ja": 92,
142
+ "dengan": 93,
143
+ "ek": 94,
144
+ "gi": 95,
145
+ "es": 96,
146
+ "ga": 97,
147
+ "alam": 98,
148
+ "ol": 99,
149
+ "il": 100,
150
+ "ari": 101,
151
+ "pen": 102,
152
+ "kami": 103,
153
+ "ih": 104,
154
+ "ur": 105,
155
+ "ta": 106,
156
+ "anan": 107,
157
+ "lu": 108,
158
+ "meng": 109,
159
+ "is": 110,
160
+ "anda": 111,
161
+ "ada": 112,
162
+ "pro": 113,
163
+ "ad": 114,
164
+ "man": 115,
165
+ "ara": 116,
166
+ "bang": 117,
167
+ "ita": 118,
168
+ "berikan": 119,
169
+ "ok": 120,
170
+ "la": 121,
171
+ "peng": 122,
172
+ "kita": 123,
173
+ "memberikan": 124,
174
+ "uh": 125,
175
+ "ma": 126,
176
+ "mu": 127,
177
+ "bar": 128,
178
+ "nya": 129,
179
+ "angan": 130,
180
+ "tem": 131,
181
+ "ban": 132,
182
+ "tr": 133,
183
+ "tan": 134,
184
+ "wa": 135,
185
+ "su": 136,
186
+ "le": 137,
187
+ "pem": 138,
188
+ "atan": 139,
189
+ "apa": 140,
190
+ "ama": 141,
191
+ "ku": 142,
192
+ "kamu": 143,
193
+ "bisa": 144,
194
+ "dalam": 145,
195
+ "atau": 146,
196
+ "dap": 147,
197
+ "ten": 148,
198
+ "ati": 149,
199
+ "bel": 150,
200
+ "ju": 151,
201
+ "pat": 152,
202
+ "asa": 153,
203
+ "sel": 154,
204
+ "ser": 155,
205
+ "ko": 156,
206
+ "dapat": 157,
207
+ "ahan": 158,
208
+ "gu": 159,
209
+ "pan": 160,
210
+ "jadi": 161,
211
+ "akah": 162,
212
+ "kon": 163,
213
+ "ir": 164,
214
+ "banget": 165,
215
+ "de": 166,
216
+ "pel": 167,
217
+ "buat": 168,
218
+ "tang": 169,
219
+ "mo": 170,
220
+ "han": 171,
221
+ "wah": 172,
222
+ "um": 173,
223
+ "bag": 174,
224
+ "kah": 175,
225
+ "cara": 176,
226
+ "kal": 177,
227
+ "har": 178,
228
+ "aman": 179,
229
+ "duk": 180,
230
+ "aan": 181,
231
+ "sem": 182,
232
+ "mana": 183,
233
+ "ker": 184,
234
+ "ram": 185,
235
+ "ken": 186,
236
+ "alah": 187,
237
+ "gan": 188,
238
+ "itas": 189,
239
+ "dah": 190,
240
+ "aran": 191,
241
+ "lebi": 192,
242
+ "meny": 193,
243
+ "sen": 194,
244
+ "lebih": 195,
245
+ "he": 196,
246
+ "gak": 197,
247
+ "hi": 198,
248
+ "pas": 199,
249
+ "ka": 200,
250
+ "asan": 201,
251
+ "pa": 202,
252
+ "ul": 203,
253
+ "ot": 204,
254
+ "ukan": 205,
255
+ "kap": 206,
256
+ "baru": 207,
257
+ "masi": 208,
258
+ "jal": 209,
259
+ "kar": 210,
260
+ "mer": 211,
261
+ "re": 212,
262
+ "luar": 213,
263
+ "ben": 214,
264
+ "dari": 215,
265
+ "du": 216,
266
+ "pe": 217,
267
+ "sini": 218,
268
+ "uka": 219,
269
+ "bik": 220,
270
+ "main": 221,
271
+ "eh": 222,
272
+ "bikin": 223,
273
+ "yanan": 224,
274
+ "ian": 225,
275
+ "for": 226,
276
+ "bat": 227,
277
+ "ut": 228,
278
+ "lan": 229,
279
+ "tentang": 230,
280
+ "bis": 231,
281
+ "gun": 232,
282
+ "co": 233,
283
+ "mas": 234,
284
+ "bagai": 235,
285
+ "seti": 236,
286
+ "amp": 237,
287
+ "aw": 238,
288
+ "mel": 239,
289
+ "bah": 240,
290
+ "bantu": 241,
291
+ "me": 242,
292
+ "sama": 243,
293
+ "ind": 244,
294
+ "baik": 245,
295
+ "pada": 246,
296
+ "li": 247,
297
+ "apakah": 248,
298
+ "angat": 249,
299
+ "adalah": 250,
300
+ "setiap": 251,
301
+ "ci": 252,
302
+ "aku": 253,
303
+ "aduh": 254
304
+ },
305
+ "merges": [
306
+ "a n",
307
+ "e n",
308
+ "e r",
309
+ "i n",
310
+ "an g",
311
+ "a t",
312
+ "a r",
313
+ "a m",
314
+ "a s",
315
+ "a k",
316
+ "a h",
317
+ "d i",
318
+ "e m",
319
+ "a l",
320
+ "u n",
321
+ "i k",
322
+ "u k",
323
+ "m en",
324
+ "s e",
325
+ "k e",
326
+ "a p",
327
+ "b er",
328
+ "s i",
329
+ "y ang",
330
+ "en g",
331
+ "m em",
332
+ "i t",
333
+ "y a",
334
+ "d a",
335
+ "d an",
336
+ "b i",
337
+ "e l",
338
+ "in i",
339
+ "as i",
340
+ "t er",
341
+ "p er",
342
+ "t i",
343
+ "o n",
344
+ "b a",
345
+ "k am",
346
+ "k an",
347
+ "t uk",
348
+ "b u",
349
+ "in g",
350
+ "a i",
351
+ "ak an",
352
+ "un tuk",
353
+ "o r",
354
+ "ik an",
355
+ "e t",
356
+ "s a",
357
+ "u s",
358
+ "un g",
359
+ "a u",
360
+ "t u",
361
+ "d eng",
362
+ "p r",
363
+ "j a",
364
+ "deng an",
365
+ "e k",
366
+ "g i",
367
+ "e s",
368
+ "g a",
369
+ "al am",
370
+ "o l",
371
+ "i l",
372
+ "ar i",
373
+ "p en",
374
+ "kam i",
375
+ "i h",
376
+ "u r",
377
+ "t a",
378
+ "an an",
379
+ "l u",
380
+ "men g",
381
+ "i s",
382
+ "an da",
383
+ "a da",
384
+ "pr o",
385
+ "a d",
386
+ "m an",
387
+ "ar a",
388
+ "b ang",
389
+ "it a",
390
+ "ber ikan",
391
+ "o k",
392
+ "l a",
393
+ "p eng",
394
+ "k ita",
395
+ "mem berikan",
396
+ "u h",
397
+ "m a",
398
+ "m u",
399
+ "b ar",
400
+ "n ya",
401
+ "ang an",
402
+ "t em",
403
+ "b an",
404
+ "t r",
405
+ "t an",
406
+ "w a",
407
+ "s u",
408
+ "l e",
409
+ "p em",
410
+ "at an",
411
+ "ap a",
412
+ "am a",
413
+ "k u",
414
+ "kam u",
415
+ "bi sa",
416
+ "d alam",
417
+ "at au",
418
+ "d ap",
419
+ "t en",
420
+ "at i",
421
+ "b el",
422
+ "j u",
423
+ "p at",
424
+ "as a",
425
+ "se l",
426
+ "s er",
427
+ "k o",
428
+ "dap at",
429
+ "ah an",
430
+ "g u",
431
+ "p an",
432
+ "ja di",
433
+ "ak ah",
434
+ "k on",
435
+ "i r",
436
+ "bang et",
437
+ "d e",
438
+ "p el",
439
+ "bu at",
440
+ "t ang",
441
+ "m o",
442
+ "h an",
443
+ "w ah",
444
+ "u m",
445
+ "ba g",
446
+ "k ah",
447
+ "c ara",
448
+ "k al",
449
+ "h ar",
450
+ "am an",
451
+ "d uk",
452
+ "a an",
453
+ "s em",
454
+ "man a",
455
+ "k er",
456
+ "r am",
457
+ "k en",
458
+ "al ah",
459
+ "g an",
460
+ "it as",
461
+ "d ah",
462
+ "ar an",
463
+ "le bi",
464
+ "men y",
465
+ "s en",
466
+ "lebi h",
467
+ "h e",
468
+ "g ak",
469
+ "h i",
470
+ "p as",
471
+ "k a",
472
+ "as an",
473
+ "p a",
474
+ "u l",
475
+ "o t",
476
+ "uk an",
477
+ "k ap",
478
+ "bar u",
479
+ "m asi",
480
+ "j al",
481
+ "k ar",
482
+ "m er",
483
+ "r e",
484
+ "lu ar",
485
+ "b en",
486
+ "d ari",
487
+ "d u",
488
+ "p e",
489
+ "s ini",
490
+ "uk a",
491
+ "b ik",
492
+ "ma in",
493
+ "e h",
494
+ "bik in",
495
+ "y anan",
496
+ "i an",
497
+ "f or",
498
+ "b at",
499
+ "u t",
500
+ "l an",
501
+ "ten tang",
502
+ "bi s",
503
+ "g un",
504
+ "c o",
505
+ "m as",
506
+ "bag ai",
507
+ "se ti",
508
+ "am p",
509
+ "a w",
510
+ "m el",
511
+ "b ah",
512
+ "ban tu",
513
+ "m e",
514
+ "s ama",
515
+ "in d",
516
+ "ba ik",
517
+ "p ada",
518
+ "l i",
519
+ "ap akah",
520
+ "ang at",
521
+ "ad alah",
522
+ "seti ap",
523
+ "c i",
524
+ "ak u",
525
+ "ad uh"
526
+ ],
527
+ "language": "multi"
528
+ }
529
+ }