alea-institute committed on
Commit
29526e3
1 Parent(s): 8e9541d

Upload tokenizer

Browse files
Files changed (4) hide show
  1. README.md +6 -6
  2. special_tokens_map.json +14 -0
  3. tokenizer.json +1 -8
  4. tokenizer_config.json +12 -3
README.md CHANGED
@@ -1,5 +1,11 @@
1
  ---
 
 
 
 
 
2
  library_name: tokenizers
 
3
  tags:
4
  - kl3m
5
  - kl3m-003
@@ -7,12 +13,6 @@ tags:
7
  - legal
8
  - financial
9
  date: '2024-03-15T00:00:00.000Z'
10
- license: cc-by-4.0
11
- language:
12
- - en
13
- - es
14
- - fr
15
- - de
16
  ---
17
 
18
  # kl3m-003-64k tokenizer
 
1
  ---
2
+ language:
3
+ - en
4
+ - es
5
+ - fr
6
+ - de
7
  library_name: tokenizers
8
+ license: cc-by-4.0
9
  tags:
10
  - kl3m
11
  - kl3m-003
 
13
  - legal
14
  - financial
15
  date: '2024-03-15T00:00:00.000Z'
 
 
 
 
 
 
16
  ---
17
 
18
  # kl3m-003-64k tokenizer
special_tokens_map.json CHANGED
@@ -6,6 +6,13 @@
6
  "rstrip": false,
7
  "single_word": false
8
  },
 
 
 
 
 
 
 
9
  "eos_token": {
10
  "content": "<|end|>",
11
  "lstrip": false,
@@ -27,6 +34,13 @@
27
  "rstrip": false,
28
  "single_word": false
29
  },
 
 
 
 
 
 
 
30
  "unk_token": {
31
  "content": "<|unk|>",
32
  "lstrip": false,
 
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
+ "cls_token": {
10
+ "content": "<|cls|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
  "eos_token": {
17
  "content": "<|end|>",
18
  "lstrip": false,
 
34
  "rstrip": false,
35
  "single_word": false
36
  },
37
+ "sep_token": {
38
+ "content": "<|sep|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
  "unk_token": {
45
  "content": "<|unk|>",
46
  "lstrip": false,
tokenizer.json CHANGED
@@ -1,14 +1,7 @@
1
  {
2
  "version": "1.0",
3
  "truncation": null,
4
- "padding": {
5
- "strategy": "BatchLongest",
6
- "direction": "Left",
7
- "pad_to_multiple_of": null,
8
- "pad_id": 0,
9
- "pad_type_id": 0,
10
- "pad_token": "<|pad|>"
11
- },
12
  "added_tokens": [
13
  {
14
  "id": 0,
 
1
  {
2
  "version": "1.0",
3
  "truncation": null,
4
+ "padding": null,
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
tokenizer_config.json CHANGED
@@ -35709,15 +35709,24 @@
35709
  }
35710
  },
35711
  "bos_token": "<|start|>",
 
35712
  "clean_up_tokenization_spaces": false,
 
 
35713
  "eos_token": "<|end|>",
 
35714
  "mask_token": "<|mask|>",
 
35715
  "max_length": null,
35716
- "model_max_length": 1000000000000000019884624838656,
35717
  "pad_to_multiple_of": null,
35718
  "pad_token": "<|pad|>",
 
35719
  "pad_token_type_id": 0,
35720
  "padding_side": "left",
35721
- "tokenizer_class": "GPTNeoXTokenizer",
35722
- "unk_token": "<|unk|>"
 
 
 
35723
  }
 
35709
  }
35710
  },
35711
  "bos_token": "<|start|>",
35712
+ "bos_token_id": 0,
35713
  "clean_up_tokenization_spaces": false,
35714
+ "cls_token": "<|cls|>",
35715
+ "cls_token_id": 5,
35716
  "eos_token": "<|end|>",
35717
+ "eos_token_id": 1,
35718
  "mask_token": "<|mask|>",
35719
+ "mask_token_id": 6,
35720
  "max_length": null,
35721
+ "model_max_length": 1048576,
35722
  "pad_to_multiple_of": null,
35723
  "pad_token": "<|pad|>",
35724
+ "pad_token_id": 2,
35725
  "pad_token_type_id": 0,
35726
  "padding_side": "left",
35727
+ "sep_token": "<|sep|>",
35728
+ "sep_token_id": 4,
35729
+ "tokenizer_class": "PreTrainedTokenizerFast",
35730
+ "unk_token": "<|unk|>",
35731
+ "unk_token_id": 3
35732
  }