birgermoell committed on
Commit
258d5ca
1 Parent(s): cc8140c

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +19 -14
  2. tokenizer.json +18 -0
  3. tokenizer_config.json +24 -3
special_tokens_map.json CHANGED
@@ -1,16 +1,21 @@
1
  {
2
- "eos_token": {
3
- "content": "<|endoftext|>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "pad_token": {
10
- "content": "<|padding|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- }
 
 
 
 
 
16
  }
 
1
  {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|im_start|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|im_end|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": "<|im_start|>",
19
+ "eos_token": "<|im_end|>",
20
+ "pad_token": "<|im_end|>"
21
  }
tokenizer.json CHANGED
@@ -254,6 +254,24 @@
254
  "rstrip": false,
255
  "normalized": false,
256
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  }
258
  ],
259
  "normalizer": {
 
254
  "rstrip": false,
255
  "normalized": false,
256
  "special": true
257
+ },
258
+ {
259
+ "id": 50280,
260
+ "content": "<|im_start|>",
261
+ "single_word": false,
262
+ "lstrip": false,
263
+ "rstrip": false,
264
+ "normalized": false,
265
+ "special": true
266
+ },
267
+ {
268
+ "id": 50281,
269
+ "content": "<|im_end|>",
270
+ "single_word": false,
271
+ "lstrip": false,
272
+ "rstrip": false,
273
+ "normalized": false,
274
+ "special": true
275
  }
276
  ],
277
  "normalizer": {
tokenizer_config.json CHANGED
@@ -226,13 +226,34 @@
226
  "rstrip": false,
227
  "single_word": false,
228
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  }
230
  },
231
- "bos_token": null,
 
 
 
 
 
232
  "clean_up_tokenization_spaces": true,
233
- "eos_token": "<|endoftext|>",
234
  "model_max_length": 1000000000000000019884624838656,
235
- "pad_token": "<|padding|>",
236
  "tokenizer_class": "GPTNeoXTokenizer",
237
  "unk_token": null
238
  }
 
226
  "rstrip": false,
227
  "single_word": false,
228
  "special": true
229
+ },
230
+ "50280": {
231
+ "content": "<|im_start|>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "50281": {
239
+ "content": "<|im_end|>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
  }
246
  },
247
+ "additional_special_tokens": [
248
+ "<|im_start|>",
249
+ "<|im_end|>"
250
+ ],
251
+ "bos_token": "<|im_start|>",
252
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
253
  "clean_up_tokenization_spaces": true,
254
+ "eos_token": "<|im_end|>",
255
  "model_max_length": 1000000000000000019884624838656,
256
+ "pad_token": "<|im_end|>",
257
  "tokenizer_class": "GPTNeoXTokenizer",
258
  "unk_token": null
259
  }