arrivederci19 committed on
Commit
b8d92b6
1 Parent(s): b52d66f

added tokenizer

Browse files
Files changed (1) hide show
  1. dutch_vl.json +435 -0
dutch_vl.json ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[STOP]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[UNK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[SPACE]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": null,
35
+ "pre_tokenizer": {
36
+ "type": "Whitespace"
37
+ },
38
+ "post_processor": null,
39
+ "decoder": null,
40
+ "model": {
41
+ "type": "BPE",
42
+ "language": "nl",
43
+ "dropout": null,
44
+ "unk_token": "[UNK]",
45
+ "continuing_subword_prefix": null,
46
+ "end_of_word_suffix": null,
47
+ "fuse_unk": false,
48
+ "byte_fallback": false,
49
+ "vocab": {
50
+ "[STOP]": 0,
51
+ "[UNK]": 1,
52
+ "[SPACE]": 2,
53
+ "!": 3,
54
+ "\"": 4,
55
+ "$": 5,
56
+ "&": 6,
57
+ "'": 7,
58
+ "(": 8,
59
+ ")": 9,
60
+ "*": 10,
61
+ ",": 11,
62
+ "-": 12,
63
+ ".": 13,
64
+ "/": 14,
65
+ "0": 15,
66
+ "1": 16,
67
+ "2": 17,
68
+ "3": 18,
69
+ "4": 19,
70
+ "5": 20,
71
+ "6": 21,
72
+ "7": 22,
73
+ "8": 23,
74
+ "9": 24,
75
+ ":": 25,
76
+ ";": 26,
77
+ "<": 27,
78
+ "=": 28,
79
+ ">": 29,
80
+ "?": 30,
81
+ "A": 31,
82
+ "B": 32,
83
+ "C": 33,
84
+ "D": 34,
85
+ "E": 35,
86
+ "F": 36,
87
+ "G": 37,
88
+ "H": 38,
89
+ "I": 39,
90
+ "J": 40,
91
+ "K": 41,
92
+ "L": 42,
93
+ "M": 43,
94
+ "N": 44,
95
+ "O": 45,
96
+ "P": 46,
97
+ "Q": 47,
98
+ "R": 48,
99
+ "S": 49,
100
+ "T": 50,
101
+ "U": 51,
102
+ "V": 52,
103
+ "W": 53,
104
+ "X": 54,
105
+ "Y": 55,
106
+ "Z": 56,
107
+ "a": 57,
108
+ "b": 58,
109
+ "c": 59,
110
+ "d": 60,
111
+ "e": 61,
112
+ "f": 62,
113
+ "g": 63,
114
+ "h": 64,
115
+ "i": 65,
116
+ "j": 66,
117
+ "k": 67,
118
+ "l": 68,
119
+ "m": 69,
120
+ "n": 70,
121
+ "o": 71,
122
+ "p": 72,
123
+ "q": 73,
124
+ "r": 74,
125
+ "s": 75,
126
+ "t": 76,
127
+ "u": 77,
128
+ "v": 78,
129
+ "w": 79,
130
+ "x": 80,
131
+ "y": 81,
132
+ "z": 82,
133
+ "©": 83,
134
+ "«": 84,
135
+ "°": 85,
136
+ "»": 86,
137
+ "¿": 87,
138
+ "Ó": 88,
139
+ "Ö": 89,
140
+ "Ü": 90,
141
+ "ß": 91,
142
+ "à": 92,
143
+ "á": 93,
144
+ "ä": 94,
145
+ "ç": 95,
146
+ "è": 96,
147
+ "é": 97,
148
+ "ê": 98,
149
+ "ë": 99,
150
+ "í": 100,
151
+ "î": 101,
152
+ "ï": 102,
153
+ "ñ": 103,
154
+ "ò": 104,
155
+ "ó": 105,
156
+ "ô": 106,
157
+ "ö": 107,
158
+ "ú": 108,
159
+ "û": 109,
160
+ "ü": 110,
161
+ "č": 111,
162
+ "ę": 112,
163
+ "ł": 113,
164
+ "œ": 114,
165
+ "ř": 115,
166
+ "ś": 116,
167
+ "ƒ": 117,
168
+ "α": 118,
169
+ "π": 119,
170
+ "–": 120,
171
+ "‘": 121,
172
+ "’": 122,
173
+ "“": 123,
174
+ "”": 124,
175
+ "•": 125,
176
+ "…": 126,
177
+ "Ω": 127,
178
+ "ℵ": 128,
179
+ "en": 129,
180
+ "er": 130,
181
+ "ij": 131,
182
+ "de": 132,
183
+ "et": 133,
184
+ "aa": 134,
185
+ "an": 135,
186
+ "el": 136,
187
+ "in": 137,
188
+ "st": 138,
189
+ "ch": 139,
190
+ "aar": 140,
191
+ "oo": 141,
192
+ "at": 142,
193
+ "een": 143,
194
+ "ge": 144,
195
+ "on": 145,
196
+ "ie": 146,
197
+ "te": 147,
198
+ "het": 148,
199
+ "al": 149,
200
+ "ver": 150,
201
+ "op": 151,
202
+ "ijn": 152,
203
+ "van": 153,
204
+ "ze": 154,
205
+ "gen": 155,
206
+ "oe": 156,
207
+ "wa": 157,
208
+ "ee": 158,
209
+ "it": 159,
210
+ "den": 160,
211
+ "oor": 161,
212
+ "hij": 162,
213
+ "dat": 163,
214
+ "cht": 164,
215
+ "der": 165,
216
+ "is": 166,
217
+ "iet": 167,
218
+ "zijn": 168,
219
+ "he": 169,
220
+ "om": 170,
221
+ "be": 171,
222
+ "aan": 172,
223
+ "je": 173,
224
+ "ou": 174,
225
+ "ken": 175,
226
+ "niet": 176,
227
+ "ik": 177,
228
+ "ar": 178,
229
+ "eer": 179,
230
+ "or": 180,
231
+ "sch": 181,
232
+ "was": 182,
233
+ "le": 183,
234
+ "die": 184,
235
+ "met": 185,
236
+ "ad": 186,
237
+ "ijk": 187,
238
+ "zi": 188,
239
+ "ing": 189,
240
+ "re": 190,
241
+ "ur": 191,
242
+ "uit": 192,
243
+ "we": 193,
244
+ "had": 194,
245
+ "il": 195,
246
+ "to": 196,
247
+ "ig": 197,
248
+ "ven": 198,
249
+ "voor": 199,
250
+ "zei": 200,
251
+ "ol": 201,
252
+ "no": 202,
253
+ "acht": 203,
254
+ "am": 204,
255
+ "maar": 205,
256
+ "ten": 206,
257
+ "als": 207,
258
+ "naar": 208,
259
+ "us": 209,
260
+ "ien": 210,
261
+ "gr": 211,
262
+ "hem": 212,
263
+ "gel": 213,
264
+ "un": 214,
265
+ "af": 215,
266
+ "vr": 216,
267
+ "over": 217,
268
+ "id": 218,
269
+ "haar": 219,
270
+ "of": 220,
271
+ "zo": 221,
272
+ "ste": 222,
273
+ "and": 223,
274
+ "Hij": 224,
275
+ "men": 225,
276
+ "sp": 226,
277
+ "dr": 227,
278
+ "la": 228,
279
+ "waar": 229,
280
+ "arr": 230,
281
+ "Harr": 231,
282
+ "lijk": 232,
283
+ "Harry": 233,
284
+ "zich": 234,
285
+ "ter": 235,
286
+ "ond": 236,
287
+ ".’": 237,
288
+ "aal": 238,
289
+ "ui": 239,
290
+ "wer": 240,
291
+ "ier": 241,
292
+ "nog": 242,
293
+ "door": 243,
294
+ "Ik": 244,
295
+ "dan": 245,
296
+ "ro": 246,
297
+ "ook": 247,
298
+ "aat": 248,
299
+ "heb": 249,
300
+ "ben": 250,
301
+ "bl": 251,
302
+ "ag": 252,
303
+ "bij": 253,
304
+ "ak": 254
305
+ },
306
+ "merges": [
307
+ "e n",
308
+ "e r",
309
+ "i j",
310
+ "d e",
311
+ "e t",
312
+ "a a",
313
+ "a n",
314
+ "e l",
315
+ "i n",
316
+ "s t",
317
+ "c h",
318
+ "aa r",
319
+ "o o",
320
+ "a t",
321
+ "e en",
322
+ "g e",
323
+ "o n",
324
+ "i e",
325
+ "t e",
326
+ "h et",
327
+ "a l",
328
+ "v er",
329
+ "o p",
330
+ "ij n",
331
+ "v an",
332
+ "z e",
333
+ "g en",
334
+ "o e",
335
+ "w a",
336
+ "e e",
337
+ "i t",
338
+ "d en",
339
+ "oo r",
340
+ "h ij",
341
+ "d at",
342
+ "ch t",
343
+ "d er",
344
+ "i s",
345
+ "i et",
346
+ "z ijn",
347
+ "h e",
348
+ "o m",
349
+ "b e",
350
+ "aa n",
351
+ "j e",
352
+ "o u",
353
+ "k en",
354
+ "n iet",
355
+ "i k",
356
+ "a r",
357
+ "e er",
358
+ "o r",
359
+ "s ch",
360
+ "wa s",
361
+ "l e",
362
+ "d ie",
363
+ "m et",
364
+ "a d",
365
+ "ij k",
366
+ "z i",
367
+ "in g",
368
+ "r e",
369
+ "u r",
370
+ "u it",
371
+ "w e",
372
+ "h ad",
373
+ "i l",
374
+ "t o",
375
+ "i g",
376
+ "v en",
377
+ "v oor",
378
+ "ze i",
379
+ "o l",
380
+ "n o",
381
+ "a cht",
382
+ "a m",
383
+ "m aar",
384
+ "t en",
385
+ "al s",
386
+ "n aar",
387
+ "u s",
388
+ "i en",
389
+ "g r",
390
+ "he m",
391
+ "g el",
392
+ "u n",
393
+ "a f",
394
+ "v r",
395
+ "o ver",
396
+ "i d",
397
+ "h aar",
398
+ "o f",
399
+ "z o",
400
+ "st e",
401
+ "an d",
402
+ "H ij",
403
+ "m en",
404
+ "s p",
405
+ "d r",
406
+ "l a",
407
+ "w aar",
408
+ "ar r",
409
+ "H arr",
410
+ "l ijk",
411
+ "Harr y",
412
+ "zi ch",
413
+ "t er",
414
+ "on d",
415
+ ". ’",
416
+ "aa l",
417
+ "u i",
418
+ "w er",
419
+ "i er",
420
+ "no g",
421
+ "d oor",
422
+ "I k",
423
+ "d an",
424
+ "r o",
425
+ "oo k",
426
+ "aa t",
427
+ "he b",
428
+ "b en",
429
+ "b l",
430
+ "a g",
431
+ "b ij",
432
+ "a k"
433
+ ]
434
+ }
435
+ }