codingwithlewis committed on
Commit
4bce3a1
1 Parent(s): 469ca88

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +2 -2
  2. tokenizer.json +1 -294
  3. tokenizer_config.json +3 -262
special_tokens_map.json CHANGED
@@ -7,14 +7,14 @@
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "<|im_end|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "pad_token": {
17
- "content": "</s>",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
 
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "</s>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "pad_token": {
17
+ "content": "<unk>",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
tokenizer.json CHANGED
@@ -1,11 +1,6 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 2048,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
  "padding": null,
10
  "added_tokens": [
11
  {
@@ -34,294 +29,6 @@
34
  "rstrip": false,
35
  "normalized": false,
36
  "special": true
37
- },
38
- {
39
- "id": 32000,
40
- "content": "<|im_end|>",
41
- "single_word": false,
42
- "lstrip": false,
43
- "rstrip": false,
44
- "normalized": false,
45
- "special": true
46
- },
47
- {
48
- "id": 32001,
49
- "content": "<|im_start|>",
50
- "single_word": false,
51
- "lstrip": false,
52
- "rstrip": false,
53
- "normalized": false,
54
- "special": false
55
- },
56
- {
57
- "id": 32002,
58
- "content": "<pad2>",
59
- "single_word": false,
60
- "lstrip": false,
61
- "rstrip": false,
62
- "normalized": false,
63
- "special": false
64
- },
65
- {
66
- "id": 32003,
67
- "content": "<pad3>",
68
- "single_word": false,
69
- "lstrip": false,
70
- "rstrip": false,
71
- "normalized": false,
72
- "special": false
73
- },
74
- {
75
- "id": 32004,
76
- "content": "<pad4>",
77
- "single_word": false,
78
- "lstrip": false,
79
- "rstrip": false,
80
- "normalized": false,
81
- "special": false
82
- },
83
- {
84
- "id": 32005,
85
- "content": "<pad5>",
86
- "single_word": false,
87
- "lstrip": false,
88
- "rstrip": false,
89
- "normalized": false,
90
- "special": false
91
- },
92
- {
93
- "id": 32006,
94
- "content": "<pad6>",
95
- "single_word": false,
96
- "lstrip": false,
97
- "rstrip": false,
98
- "normalized": false,
99
- "special": false
100
- },
101
- {
102
- "id": 32007,
103
- "content": "<pad7>",
104
- "single_word": false,
105
- "lstrip": false,
106
- "rstrip": false,
107
- "normalized": false,
108
- "special": false
109
- },
110
- {
111
- "id": 32008,
112
- "content": "<pad8>",
113
- "single_word": false,
114
- "lstrip": false,
115
- "rstrip": false,
116
- "normalized": false,
117
- "special": false
118
- },
119
- {
120
- "id": 32009,
121
- "content": "<pad9>",
122
- "single_word": false,
123
- "lstrip": false,
124
- "rstrip": false,
125
- "normalized": false,
126
- "special": false
127
- },
128
- {
129
- "id": 32010,
130
- "content": "<pad10>",
131
- "single_word": false,
132
- "lstrip": false,
133
- "rstrip": false,
134
- "normalized": false,
135
- "special": false
136
- },
137
- {
138
- "id": 32011,
139
- "content": "<pad11>",
140
- "single_word": false,
141
- "lstrip": false,
142
- "rstrip": false,
143
- "normalized": false,
144
- "special": false
145
- },
146
- {
147
- "id": 32012,
148
- "content": "<pad12>",
149
- "single_word": false,
150
- "lstrip": false,
151
- "rstrip": false,
152
- "normalized": false,
153
- "special": false
154
- },
155
- {
156
- "id": 32013,
157
- "content": "<pad13>",
158
- "single_word": false,
159
- "lstrip": false,
160
- "rstrip": false,
161
- "normalized": false,
162
- "special": false
163
- },
164
- {
165
- "id": 32014,
166
- "content": "<pad14>",
167
- "single_word": false,
168
- "lstrip": false,
169
- "rstrip": false,
170
- "normalized": false,
171
- "special": false
172
- },
173
- {
174
- "id": 32015,
175
- "content": "<pad15>",
176
- "single_word": false,
177
- "lstrip": false,
178
- "rstrip": false,
179
- "normalized": false,
180
- "special": false
181
- },
182
- {
183
- "id": 32016,
184
- "content": "<pad16>",
185
- "single_word": false,
186
- "lstrip": false,
187
- "rstrip": false,
188
- "normalized": false,
189
- "special": false
190
- },
191
- {
192
- "id": 32017,
193
- "content": "<pad17>",
194
- "single_word": false,
195
- "lstrip": false,
196
- "rstrip": false,
197
- "normalized": false,
198
- "special": false
199
- },
200
- {
201
- "id": 32018,
202
- "content": "<pad18>",
203
- "single_word": false,
204
- "lstrip": false,
205
- "rstrip": false,
206
- "normalized": false,
207
- "special": false
208
- },
209
- {
210
- "id": 32019,
211
- "content": "<pad19>",
212
- "single_word": false,
213
- "lstrip": false,
214
- "rstrip": false,
215
- "normalized": false,
216
- "special": false
217
- },
218
- {
219
- "id": 32020,
220
- "content": "<pad20>",
221
- "single_word": false,
222
- "lstrip": false,
223
- "rstrip": false,
224
- "normalized": false,
225
- "special": false
226
- },
227
- {
228
- "id": 32021,
229
- "content": "<pad21>",
230
- "single_word": false,
231
- "lstrip": false,
232
- "rstrip": false,
233
- "normalized": false,
234
- "special": false
235
- },
236
- {
237
- "id": 32022,
238
- "content": "<pad22>",
239
- "single_word": false,
240
- "lstrip": false,
241
- "rstrip": false,
242
- "normalized": false,
243
- "special": false
244
- },
245
- {
246
- "id": 32023,
247
- "content": "<pad23>",
248
- "single_word": false,
249
- "lstrip": false,
250
- "rstrip": false,
251
- "normalized": false,
252
- "special": false
253
- },
254
- {
255
- "id": 32024,
256
- "content": "<pad24>",
257
- "single_word": false,
258
- "lstrip": false,
259
- "rstrip": false,
260
- "normalized": false,
261
- "special": false
262
- },
263
- {
264
- "id": 32025,
265
- "content": "<pad25>",
266
- "single_word": false,
267
- "lstrip": false,
268
- "rstrip": false,
269
- "normalized": false,
270
- "special": false
271
- },
272
- {
273
- "id": 32026,
274
- "content": "<pad26>",
275
- "single_word": false,
276
- "lstrip": false,
277
- "rstrip": false,
278
- "normalized": false,
279
- "special": false
280
- },
281
- {
282
- "id": 32027,
283
- "content": "<pad27>",
284
- "single_word": false,
285
- "lstrip": false,
286
- "rstrip": false,
287
- "normalized": false,
288
- "special": false
289
- },
290
- {
291
- "id": 32028,
292
- "content": "<pad28>",
293
- "single_word": false,
294
- "lstrip": false,
295
- "rstrip": false,
296
- "normalized": false,
297
- "special": false
298
- },
299
- {
300
- "id": 32029,
301
- "content": "<pad29>",
302
- "single_word": false,
303
- "lstrip": false,
304
- "rstrip": false,
305
- "normalized": false,
306
- "special": false
307
- },
308
- {
309
- "id": 32030,
310
- "content": "<pad30>",
311
- "single_word": false,
312
- "lstrip": false,
313
- "rstrip": false,
314
- "normalized": false,
315
- "special": false
316
- },
317
- {
318
- "id": 32031,
319
- "content": "<pad31>",
320
- "single_word": false,
321
- "lstrip": false,
322
- "rstrip": false,
323
- "normalized": false,
324
- "special": false
325
  }
326
  ],
327
  "normalizer": {
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
 
 
 
 
 
4
  "padding": null,
5
  "added_tokens": [
6
  {
 
29
  "rstrip": false,
30
  "normalized": false,
31
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  }
33
  ],
34
  "normalizer": {
tokenizer_config.json CHANGED
@@ -26,277 +26,18 @@
26
  "rstrip": false,
27
  "single_word": false,
28
  "special": true
29
- },
30
- "32000": {
31
- "content": "<|im_end|>",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false,
36
- "special": true
37
- },
38
- "32001": {
39
- "content": "<|im_start|>",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": false,
43
- "single_word": false,
44
- "special": false
45
- },
46
- "32002": {
47
- "content": "<pad2>",
48
- "lstrip": false,
49
- "normalized": false,
50
- "rstrip": false,
51
- "single_word": false,
52
- "special": false
53
- },
54
- "32003": {
55
- "content": "<pad3>",
56
- "lstrip": false,
57
- "normalized": false,
58
- "rstrip": false,
59
- "single_word": false,
60
- "special": false
61
- },
62
- "32004": {
63
- "content": "<pad4>",
64
- "lstrip": false,
65
- "normalized": false,
66
- "rstrip": false,
67
- "single_word": false,
68
- "special": false
69
- },
70
- "32005": {
71
- "content": "<pad5>",
72
- "lstrip": false,
73
- "normalized": false,
74
- "rstrip": false,
75
- "single_word": false,
76
- "special": false
77
- },
78
- "32006": {
79
- "content": "<pad6>",
80
- "lstrip": false,
81
- "normalized": false,
82
- "rstrip": false,
83
- "single_word": false,
84
- "special": false
85
- },
86
- "32007": {
87
- "content": "<pad7>",
88
- "lstrip": false,
89
- "normalized": false,
90
- "rstrip": false,
91
- "single_word": false,
92
- "special": false
93
- },
94
- "32008": {
95
- "content": "<pad8>",
96
- "lstrip": false,
97
- "normalized": false,
98
- "rstrip": false,
99
- "single_word": false,
100
- "special": false
101
- },
102
- "32009": {
103
- "content": "<pad9>",
104
- "lstrip": false,
105
- "normalized": false,
106
- "rstrip": false,
107
- "single_word": false,
108
- "special": false
109
- },
110
- "32010": {
111
- "content": "<pad10>",
112
- "lstrip": false,
113
- "normalized": false,
114
- "rstrip": false,
115
- "single_word": false,
116
- "special": false
117
- },
118
- "32011": {
119
- "content": "<pad11>",
120
- "lstrip": false,
121
- "normalized": false,
122
- "rstrip": false,
123
- "single_word": false,
124
- "special": false
125
- },
126
- "32012": {
127
- "content": "<pad12>",
128
- "lstrip": false,
129
- "normalized": false,
130
- "rstrip": false,
131
- "single_word": false,
132
- "special": false
133
- },
134
- "32013": {
135
- "content": "<pad13>",
136
- "lstrip": false,
137
- "normalized": false,
138
- "rstrip": false,
139
- "single_word": false,
140
- "special": false
141
- },
142
- "32014": {
143
- "content": "<pad14>",
144
- "lstrip": false,
145
- "normalized": false,
146
- "rstrip": false,
147
- "single_word": false,
148
- "special": false
149
- },
150
- "32015": {
151
- "content": "<pad15>",
152
- "lstrip": false,
153
- "normalized": false,
154
- "rstrip": false,
155
- "single_word": false,
156
- "special": false
157
- },
158
- "32016": {
159
- "content": "<pad16>",
160
- "lstrip": false,
161
- "normalized": false,
162
- "rstrip": false,
163
- "single_word": false,
164
- "special": false
165
- },
166
- "32017": {
167
- "content": "<pad17>",
168
- "lstrip": false,
169
- "normalized": false,
170
- "rstrip": false,
171
- "single_word": false,
172
- "special": false
173
- },
174
- "32018": {
175
- "content": "<pad18>",
176
- "lstrip": false,
177
- "normalized": false,
178
- "rstrip": false,
179
- "single_word": false,
180
- "special": false
181
- },
182
- "32019": {
183
- "content": "<pad19>",
184
- "lstrip": false,
185
- "normalized": false,
186
- "rstrip": false,
187
- "single_word": false,
188
- "special": false
189
- },
190
- "32020": {
191
- "content": "<pad20>",
192
- "lstrip": false,
193
- "normalized": false,
194
- "rstrip": false,
195
- "single_word": false,
196
- "special": false
197
- },
198
- "32021": {
199
- "content": "<pad21>",
200
- "lstrip": false,
201
- "normalized": false,
202
- "rstrip": false,
203
- "single_word": false,
204
- "special": false
205
- },
206
- "32022": {
207
- "content": "<pad22>",
208
- "lstrip": false,
209
- "normalized": false,
210
- "rstrip": false,
211
- "single_word": false,
212
- "special": false
213
- },
214
- "32023": {
215
- "content": "<pad23>",
216
- "lstrip": false,
217
- "normalized": false,
218
- "rstrip": false,
219
- "single_word": false,
220
- "special": false
221
- },
222
- "32024": {
223
- "content": "<pad24>",
224
- "lstrip": false,
225
- "normalized": false,
226
- "rstrip": false,
227
- "single_word": false,
228
- "special": false
229
- },
230
- "32025": {
231
- "content": "<pad25>",
232
- "lstrip": false,
233
- "normalized": false,
234
- "rstrip": false,
235
- "single_word": false,
236
- "special": false
237
- },
238
- "32026": {
239
- "content": "<pad26>",
240
- "lstrip": false,
241
- "normalized": false,
242
- "rstrip": false,
243
- "single_word": false,
244
- "special": false
245
- },
246
- "32027": {
247
- "content": "<pad27>",
248
- "lstrip": false,
249
- "normalized": false,
250
- "rstrip": false,
251
- "single_word": false,
252
- "special": false
253
- },
254
- "32028": {
255
- "content": "<pad28>",
256
- "lstrip": false,
257
- "normalized": false,
258
- "rstrip": false,
259
- "single_word": false,
260
- "special": false
261
- },
262
- "32029": {
263
- "content": "<pad29>",
264
- "lstrip": false,
265
- "normalized": false,
266
- "rstrip": false,
267
- "single_word": false,
268
- "special": false
269
- },
270
- "32030": {
271
- "content": "<pad30>",
272
- "lstrip": false,
273
- "normalized": false,
274
- "rstrip": false,
275
- "single_word": false,
276
- "special": false
277
- },
278
- "32031": {
279
- "content": "<pad31>",
280
- "lstrip": false,
281
- "normalized": false,
282
- "rstrip": false,
283
- "single_word": false,
284
- "special": false
285
  }
286
  },
287
- "additional_special_tokens": [],
288
  "bos_token": "<s>",
289
- "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% if messages[1]['role'] == 'user' %}{{ '[INST] ' + messages[0]['content'] + ' ' + messages[1]['content'] + ' [/INST]' }}{% set loop_messages = messages[2:] %}{% else %}{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}{% set loop_messages = messages[1:] %}{% endif %}{% else %}{% set loop_messages = messages %}{% endif %}{% for message in loop_messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
290
  "clean_up_tokenization_spaces": false,
291
- "eos_token": "<|im_end|>",
292
  "legacy": true,
293
  "model_max_length": 32768,
294
- "pad_token": "</s>",
295
  "padding_side": "left",
296
  "sp_model_kwargs": {},
297
  "spaces_between_special_tokens": false,
298
  "tokenizer_class": "LlamaTokenizer",
299
  "unk_token": "<unk>",
300
- "use_default_system_prompt": false,
301
- "use_fast": true
302
  }
 
26
  "rstrip": false,
27
  "single_word": false,
28
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  }
30
  },
 
31
  "bos_token": "<s>",
 
32
  "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
  "legacy": true,
35
  "model_max_length": 32768,
36
+ "pad_token": "<unk>",
37
  "padding_side": "left",
38
  "sp_model_kwargs": {},
39
  "spaces_between_special_tokens": false,
40
  "tokenizer_class": "LlamaTokenizer",
41
  "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
 
43
  }