Zaid committed on
Commit
f11724f
1 Parent(s): a132208

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +64 -73
  2. vocab.json +1 -1
tokenizer.json CHANGED
@@ -23,7 +23,7 @@
23
  },
24
  {
25
  "id": 2,
26
- "content": "<|vsep|>",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
@@ -32,7 +32,7 @@
32
  },
33
  {
34
  "id": 3,
35
- "content": "<|bsep|>",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
@@ -41,7 +41,7 @@
41
  },
42
  {
43
  "id": 4,
44
- "content": "<|pad|>",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
@@ -50,7 +50,7 @@
50
  },
51
  {
52
  "id": 5,
53
- "content": "<|meter_0|>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
@@ -59,7 +59,7 @@
59
  },
60
  {
61
  "id": 6,
62
- "content": "<|meter_1|>",
63
  "single_word": false,
64
  "lstrip": false,
65
  "rstrip": false,
@@ -68,7 +68,7 @@
68
  },
69
  {
70
  "id": 7,
71
- "content": "<|meter_2|>",
72
  "single_word": false,
73
  "lstrip": false,
74
  "rstrip": false,
@@ -77,7 +77,7 @@
77
  },
78
  {
79
  "id": 8,
80
- "content": "<|meter_3|>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
@@ -86,7 +86,7 @@
86
  },
87
  {
88
  "id": 9,
89
- "content": "<|meter_4|>",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
@@ -95,7 +95,7 @@
95
  },
96
  {
97
  "id": 10,
98
- "content": "<|meter_5|>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
@@ -104,7 +104,7 @@
104
  },
105
  {
106
  "id": 11,
107
- "content": "<|meter_6|>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
@@ -113,7 +113,7 @@
113
  },
114
  {
115
  "id": 12,
116
- "content": "<|meter_7|>",
117
  "single_word": false,
118
  "lstrip": false,
119
  "rstrip": false,
@@ -122,7 +122,7 @@
122
  },
123
  {
124
  "id": 13,
125
- "content": "<|meter_8|>",
126
  "single_word": false,
127
  "lstrip": false,
128
  "rstrip": false,
@@ -131,7 +131,7 @@
131
  },
132
  {
133
  "id": 14,
134
- "content": "<|meter_9|>",
135
  "single_word": false,
136
  "lstrip": false,
137
  "rstrip": false,
@@ -140,7 +140,7 @@
140
  },
141
  {
142
  "id": 15,
143
- "content": "<|meter_10|>",
144
  "single_word": false,
145
  "lstrip": false,
146
  "rstrip": false,
@@ -149,7 +149,7 @@
149
  },
150
  {
151
  "id": 16,
152
- "content": "<|meter_11|>",
153
  "single_word": false,
154
  "lstrip": false,
155
  "rstrip": false,
@@ -158,7 +158,7 @@
158
  },
159
  {
160
  "id": 17,
161
- "content": "<|meter_12|>",
162
  "single_word": false,
163
  "lstrip": false,
164
  "rstrip": false,
@@ -167,7 +167,7 @@
167
  },
168
  {
169
  "id": 18,
170
- "content": "<|meter_13|>",
171
  "single_word": false,
172
  "lstrip": false,
173
  "rstrip": false,
@@ -176,7 +176,7 @@
176
  },
177
  {
178
  "id": 19,
179
- "content": "<|meter_14|>",
180
  "single_word": false,
181
  "lstrip": false,
182
  "rstrip": false,
@@ -185,7 +185,7 @@
185
  },
186
  {
187
  "id": 20,
188
- "content": "<|meter_15|>",
189
  "single_word": false,
190
  "lstrip": false,
191
  "rstrip": false,
@@ -194,7 +194,7 @@
194
  },
195
  {
196
  "id": 21,
197
- "content": "<|res_0|>",
198
  "single_word": false,
199
  "lstrip": false,
200
  "rstrip": false,
@@ -203,7 +203,7 @@
203
  },
204
  {
205
  "id": 22,
206
- "content": "<|res_1|>",
207
  "single_word": false,
208
  "lstrip": false,
209
  "rstrip": false,
@@ -212,7 +212,7 @@
212
  },
213
  {
214
  "id": 23,
215
- "content": "<|res_2|>",
216
  "single_word": false,
217
  "lstrip": false,
218
  "rstrip": false,
@@ -221,7 +221,7 @@
221
  },
222
  {
223
  "id": 24,
224
- "content": "<|res_3|>",
225
  "single_word": false,
226
  "lstrip": false,
227
  "rstrip": false,
@@ -230,7 +230,7 @@
230
  },
231
  {
232
  "id": 25,
233
- "content": "<|res_4|>",
234
  "single_word": false,
235
  "lstrip": false,
236
  "rstrip": false,
@@ -239,7 +239,7 @@
239
  },
240
  {
241
  "id": 26,
242
- "content": "<|res_5|>",
243
  "single_word": false,
244
  "lstrip": false,
245
  "rstrip": false,
@@ -248,7 +248,7 @@
248
  },
249
  {
250
  "id": 27,
251
- "content": "<|res_6|>",
252
  "single_word": false,
253
  "lstrip": false,
254
  "rstrip": false,
@@ -257,7 +257,7 @@
257
  },
258
  {
259
  "id": 28,
260
- "content": "<|res_7|>",
261
  "single_word": false,
262
  "lstrip": false,
263
  "rstrip": false,
@@ -266,7 +266,7 @@
266
  },
267
  {
268
  "id": 29,
269
- "content": "<|res_8|>",
270
  "single_word": false,
271
  "lstrip": false,
272
  "rstrip": false,
@@ -275,7 +275,7 @@
275
  },
276
  {
277
  "id": 30,
278
- "content": "<|res_9|>",
279
  "single_word": false,
280
  "lstrip": false,
281
  "rstrip": false,
@@ -284,7 +284,7 @@
284
  },
285
  {
286
  "id": 31,
287
- "content": "<|res_10|>",
288
  "single_word": false,
289
  "lstrip": false,
290
  "rstrip": false,
@@ -293,16 +293,7 @@
293
  },
294
  {
295
  "id": 32,
296
- "content": "<|res_11|>",
297
- "single_word": false,
298
- "lstrip": false,
299
- "rstrip": false,
300
- "normalized": false,
301
- "special": true
302
- },
303
- {
304
- "id": 33,
305
- "content": "<|res_12|>",
306
  "single_word": false,
307
  "lstrip": false,
308
  "rstrip": false,
@@ -327,39 +318,39 @@
327
  "vocab": {
328
  "<|endoftext|>": 0,
329
  "<|psep|>": 1,
330
- "<|vsep|>": 2,
331
- "<|bsep|>": 3,
332
- "<|pad|>": 4,
333
- "<|meter_0|>": 5,
334
- "<|meter_1|>": 6,
335
- "<|meter_2|>": 7,
336
- "<|meter_3|>": 8,
337
- "<|meter_4|>": 9,
338
- "<|meter_5|>": 10,
339
- "<|meter_6|>": 11,
340
- "<|meter_7|>": 12,
341
- "<|meter_8|>": 13,
342
- "<|meter_9|>": 14,
343
- "<|meter_10|>": 15,
344
- "<|meter_11|>": 16,
345
- "<|meter_12|>": 17,
346
- "<|meter_13|>": 18,
347
- "<|meter_14|>": 19,
348
- "<|meter_15|>": 20,
349
- "<|res_0|>": 21,
350
- "<|res_1|>": 22,
351
- "<|res_2|>": 23,
352
- "<|res_3|>": 24,
353
- "<|res_4|>": 25,
354
- "<|res_5|>": 26,
355
- "<|res_6|>": 27,
356
- "<|res_7|>": 28,
357
- "<|res_8|>": 29,
358
- "<|res_9|>": 30,
359
- "<|res_10|>": 31,
360
- "<|res_11|>": 32,
361
- "<|res_12|>": 33,
362
- " ": 34,
363
  "0": 35,
364
  "1": 36,
365
  "2": 37,
 
23
  },
24
  {
25
  "id": 2,
26
+ "content": "</|psep|>",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
 
32
  },
33
  {
34
  "id": 3,
35
+ "content": "<|vsep|>",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
 
41
  },
42
  {
43
  "id": 4,
44
+ "content": "<|bsep|>",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
 
50
  },
51
  {
52
  "id": 5,
53
+ "content": "</|bsep|>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
 
59
  },
60
  {
61
  "id": 6,
62
+ "content": "<|pad|>",
63
  "single_word": false,
64
  "lstrip": false,
65
  "rstrip": false,
 
68
  },
69
  {
70
  "id": 7,
71
+ "content": "<|meter_0|>",
72
  "single_word": false,
73
  "lstrip": false,
74
  "rstrip": false,
 
77
  },
78
  {
79
  "id": 8,
80
+ "content": "<|meter_1|>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
 
86
  },
87
  {
88
  "id": 9,
89
+ "content": "<|meter_2|>",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
 
95
  },
96
  {
97
  "id": 10,
98
+ "content": "<|meter_3|>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
 
104
  },
105
  {
106
  "id": 11,
107
+ "content": "<|meter_4|>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
 
113
  },
114
  {
115
  "id": 12,
116
+ "content": "<|meter_5|>",
117
  "single_word": false,
118
  "lstrip": false,
119
  "rstrip": false,
 
122
  },
123
  {
124
  "id": 13,
125
+ "content": "<|meter_6|>",
126
  "single_word": false,
127
  "lstrip": false,
128
  "rstrip": false,
 
131
  },
132
  {
133
  "id": 14,
134
+ "content": "<|meter_7|>",
135
  "single_word": false,
136
  "lstrip": false,
137
  "rstrip": false,
 
140
  },
141
  {
142
  "id": 15,
143
+ "content": "<|meter_8|>",
144
  "single_word": false,
145
  "lstrip": false,
146
  "rstrip": false,
 
149
  },
150
  {
151
  "id": 16,
152
+ "content": "<|meter_9|>",
153
  "single_word": false,
154
  "lstrip": false,
155
  "rstrip": false,
 
158
  },
159
  {
160
  "id": 17,
161
+ "content": "<|meter_10|>",
162
  "single_word": false,
163
  "lstrip": false,
164
  "rstrip": false,
 
167
  },
168
  {
169
  "id": 18,
170
+ "content": "<|meter_11|>",
171
  "single_word": false,
172
  "lstrip": false,
173
  "rstrip": false,
 
176
  },
177
  {
178
  "id": 19,
179
+ "content": "<|meter_12|>",
180
  "single_word": false,
181
  "lstrip": false,
182
  "rstrip": false,
 
185
  },
186
  {
187
  "id": 20,
188
+ "content": "<|meter_13|>",
189
  "single_word": false,
190
  "lstrip": false,
191
  "rstrip": false,
 
194
  },
195
  {
196
  "id": 21,
197
+ "content": "<|meter_14|>",
198
  "single_word": false,
199
  "lstrip": false,
200
  "rstrip": false,
 
203
  },
204
  {
205
  "id": 22,
206
+ "content": "<|meter_15|>",
207
  "single_word": false,
208
  "lstrip": false,
209
  "rstrip": false,
 
212
  },
213
  {
214
  "id": 23,
215
+ "content": "<|res_0|>",
216
  "single_word": false,
217
  "lstrip": false,
218
  "rstrip": false,
 
221
  },
222
  {
223
  "id": 24,
224
+ "content": "<|res_1|>",
225
  "single_word": false,
226
  "lstrip": false,
227
  "rstrip": false,
 
230
  },
231
  {
232
  "id": 25,
233
+ "content": "<|res_2|>",
234
  "single_word": false,
235
  "lstrip": false,
236
  "rstrip": false,
 
239
  },
240
  {
241
  "id": 26,
242
+ "content": "<|res_3|>",
243
  "single_word": false,
244
  "lstrip": false,
245
  "rstrip": false,
 
248
  },
249
  {
250
  "id": 27,
251
+ "content": "<|res_4|>",
252
  "single_word": false,
253
  "lstrip": false,
254
  "rstrip": false,
 
257
  },
258
  {
259
  "id": 28,
260
+ "content": "<|res_5|>",
261
  "single_word": false,
262
  "lstrip": false,
263
  "rstrip": false,
 
266
  },
267
  {
268
  "id": 29,
269
+ "content": "<|res_6|>",
270
  "single_word": false,
271
  "lstrip": false,
272
  "rstrip": false,
 
275
  },
276
  {
277
  "id": 30,
278
+ "content": "<|res_7|>",
279
  "single_word": false,
280
  "lstrip": false,
281
  "rstrip": false,
 
284
  },
285
  {
286
  "id": 31,
287
+ "content": "<|res_8|>",
288
  "single_word": false,
289
  "lstrip": false,
290
  "rstrip": false,
 
293
  },
294
  {
295
  "id": 32,
296
+ "content": "<|res_9|>",
 
 
 
 
 
 
 
 
 
297
  "single_word": false,
298
  "lstrip": false,
299
  "rstrip": false,
 
318
  "vocab": {
319
  "<|endoftext|>": 0,
320
  "<|psep|>": 1,
321
+ "</|psep|>": 2,
322
+ "<|vsep|>": 3,
323
+ "<|bsep|>": 4,
324
+ "</|bsep|>": 5,
325
+ "<|pad|>": 6,
326
+ "<|meter_0|>": 7,
327
+ "<|meter_1|>": 8,
328
+ "<|meter_2|>": 9,
329
+ "<|meter_3|>": 10,
330
+ "<|meter_4|>": 11,
331
+ "<|meter_5|>": 12,
332
+ "<|meter_6|>": 13,
333
+ "<|meter_7|>": 14,
334
+ "<|meter_8|>": 15,
335
+ "<|meter_9|>": 16,
336
+ "<|meter_10|>": 17,
337
+ "<|meter_11|>": 18,
338
+ "<|meter_12|>": 19,
339
+ "<|meter_13|>": 20,
340
+ "<|meter_14|>": 21,
341
+ "<|meter_15|>": 22,
342
+ "<|res_0|>": 23,
343
+ "<|res_1|>": 24,
344
+ "<|res_2|>": 25,
345
+ "<|res_3|>": 26,
346
+ "<|res_4|>": 27,
347
+ "<|res_5|>": 28,
348
+ "<|res_6|>": 29,
349
+ "<|res_7|>": 30,
350
+ "<|res_8|>": 31,
351
+ "<|res_9|>": 32,
352
+ " ": 33,
353
+ "/": 34,
354
  "0": 35,
355
  "1": 36,
356
  "2": 37,
vocab.json CHANGED
@@ -1 +1 @@
1
- {"<|endoftext|>":0,"<|psep|>":1,"<|vsep|>":2,"<|bsep|>":3,"<|pad|>":4,"<|meter_0|>":5,"<|meter_1|>":6,"<|meter_2|>":7,"<|meter_3|>":8,"<|meter_4|>":9,"<|meter_5|>":10,"<|meter_6|>":11,"<|meter_7|>":12,"<|meter_8|>":13,"<|meter_9|>":14,"<|meter_10|>":15,"<|meter_11|>":16,"<|meter_12|>":17,"<|meter_13|>":18,"<|meter_14|>":19,"<|meter_15|>":20,"<|res_0|>":21,"<|res_1|>":22,"<|res_2|>":23,"<|res_3|>":24,"<|res_4|>":25,"<|res_5|>":26,"<|res_6|>":27,"<|res_7|>":28,"<|res_8|>":29,"<|res_9|>":30,"<|res_10|>":31,"<|res_11|>":32,"<|res_12|>":33," ":34,"0":35,"1":36,"2":37,"3":38,"4":39,"5":40,"6":41,"7":42,"8":43,"9":44,"<":45,">":46,"_":47,"b":48,"e":49,"m":50,"p":51,"r":52,"s":53,"t":54,"v":55,"|":56,"~":57,"ء":58,"أ":59,"ؤ":60,"ئ":61,"ا":62,"ب":63,"ة":64,"ت":65,"ث":66,"ج":67,"ح":68,"خ":69,"د":70,"ذ":71,"ر":72,"ز":73,"س":74,"ش":75,"ص":76,"ض":77,"ط":78,"ظ":79,"ع":80,"غ":81,"ف":82,"ق":83,"ك":84,"ل":85,"م":86,"ن":87,"ه":88,"و":89,"ى":90,"ي":91,"ً":92,"ٌ":93,"ٍ":94,"َ":95,"ُ":96,"ِ":97,"ّ":98,"ْ":99}
 
1
+ {"<|endoftext|>":0,"<|psep|>":1,"</|psep|>":2,"<|vsep|>":3,"<|bsep|>":4,"</|bsep|>":5,"<|pad|>":6,"<|meter_0|>":7,"<|meter_1|>":8,"<|meter_2|>":9,"<|meter_3|>":10,"<|meter_4|>":11,"<|meter_5|>":12,"<|meter_6|>":13,"<|meter_7|>":14,"<|meter_8|>":15,"<|meter_9|>":16,"<|meter_10|>":17,"<|meter_11|>":18,"<|meter_12|>":19,"<|meter_13|>":20,"<|meter_14|>":21,"<|meter_15|>":22,"<|res_0|>":23,"<|res_1|>":24,"<|res_2|>":25,"<|res_3|>":26,"<|res_4|>":27,"<|res_5|>":28,"<|res_6|>":29,"<|res_7|>":30,"<|res_8|>":31,"<|res_9|>":32," ":33,"/":34,"0":35,"1":36,"2":37,"3":38,"4":39,"5":40,"6":41,"7":42,"8":43,"9":44,"<":45,">":46,"_":47,"b":48,"e":49,"m":50,"p":51,"r":52,"s":53,"t":54,"v":55,"|":56,"~":57,"ء":58,"أ":59,"ؤ":60,"ئ":61,"ا":62,"ب":63,"ة":64,"ت":65,"ث":66,"ج":67,"ح":68,"خ":69,"د":70,"ذ":71,"ر":72,"ز":73,"س":74,"ش":75,"ص":76,"ض":77,"ط":78,"ظ":79,"ع":80,"غ":81,"ف":82,"ق":83,"ك":84,"ل":85,"م":86,"ن":87,"ه":88,"و":89,"ى":90,"ي":91,"ً":92,"ٌ":93,"ٍ":94,"َ":95,"ُ":96,"ِ":97,"ّ":98,"ْ":99}