alea-institute
committed on
Commit
•
29526e3
1
Parent(s):
8e9541d
Upload tokenizer
Browse files
- README.md +6 -6
- special_tokens_map.json +14 -0
- tokenizer.json +1 -8
- tokenizer_config.json +12 -3
README.md
CHANGED
@@ -1,5 +1,11 @@
|
|
1 |
---
|
|
|
|
|
|
|
|
|
|
|
2 |
library_name: tokenizers
|
|
|
3 |
tags:
|
4 |
- kl3m
|
5 |
- kl3m-003
|
@@ -7,12 +13,6 @@ tags:
|
|
7 |
- legal
|
8 |
- financial
|
9 |
date: '2024-03-15T00:00:00.000Z'
|
10 |
-
license: cc-by-4.0
|
11 |
-
language:
|
12 |
-
- en
|
13 |
-
- es
|
14 |
-
- fr
|
15 |
-
- de
|
16 |
---
|
17 |
|
18 |
# kl3m-003-64k tokenizer
|
|
|
1 |
---
|
2 |
+
language:
|
3 |
+
- en
|
4 |
+
- es
|
5 |
+
- fr
|
6 |
+
- de
|
7 |
library_name: tokenizers
|
8 |
+
license: cc-by-4.0
|
9 |
tags:
|
10 |
- kl3m
|
11 |
- kl3m-003
|
|
|
13 |
- legal
|
14 |
- financial
|
15 |
date: '2024-03-15T00:00:00.000Z'
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
---
|
17 |
|
18 |
# kl3m-003-64k tokenizer
|
special_tokens_map.json
CHANGED
@@ -6,6 +6,13 @@
|
|
6 |
"rstrip": false,
|
7 |
"single_word": false
|
8 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
"eos_token": {
|
10 |
"content": "<|end|>",
|
11 |
"lstrip": false,
|
@@ -27,6 +34,13 @@
|
|
27 |
"rstrip": false,
|
28 |
"single_word": false
|
29 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
"unk_token": {
|
31 |
"content": "<|unk|>",
|
32 |
"lstrip": false,
|
|
|
6 |
"rstrip": false,
|
7 |
"single_word": false
|
8 |
},
|
9 |
+
"cls_token": {
|
10 |
+
"content": "<|cls|>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
"eos_token": {
|
17 |
"content": "<|end|>",
|
18 |
"lstrip": false,
|
|
|
34 |
"rstrip": false,
|
35 |
"single_word": false
|
36 |
},
|
37 |
+
"sep_token": {
|
38 |
+
"content": "<|sep|>",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": false,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false
|
43 |
+
},
|
44 |
"unk_token": {
|
45 |
"content": "<|unk|>",
|
46 |
"lstrip": false,
|
tokenizer.json
CHANGED
@@ -1,14 +1,7 @@
|
|
1 |
{
|
2 |
"version": "1.0",
|
3 |
"truncation": null,
|
4 |
-
"padding":
|
5 |
-
"strategy": "BatchLongest",
|
6 |
-
"direction": "Left",
|
7 |
-
"pad_to_multiple_of": null,
|
8 |
-
"pad_id": 0,
|
9 |
-
"pad_type_id": 0,
|
10 |
-
"pad_token": "<|pad|>"
|
11 |
-
},
|
12 |
"added_tokens": [
|
13 |
{
|
14 |
"id": 0,
|
|
|
1 |
{
|
2 |
"version": "1.0",
|
3 |
"truncation": null,
|
4 |
+
"padding": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
"added_tokens": [
|
6 |
{
|
7 |
"id": 0,
|
tokenizer_config.json
CHANGED
@@ -35709,15 +35709,24 @@
|
|
35709 |
}
|
35710 |
},
|
35711 |
"bos_token": "<|start|>",
|
|
|
35712 |
"clean_up_tokenization_spaces": false,
|
|
|
|
|
35713 |
"eos_token": "<|end|>",
|
|
|
35714 |
"mask_token": "<|mask|>",
|
|
|
35715 |
"max_length": null,
|
35716 |
-
"model_max_length":
|
35717 |
"pad_to_multiple_of": null,
|
35718 |
"pad_token": "<|pad|>",
|
|
|
35719 |
"pad_token_type_id": 0,
|
35720 |
"padding_side": "left",
|
35721 |
-
"
|
35722 |
-
"
|
|
|
|
|
|
|
35723 |
}
|
|
|
35709 |
}
|
35710 |
},
|
35711 |
"bos_token": "<|start|>",
|
35712 |
+
"bos_token_id": 0,
|
35713 |
"clean_up_tokenization_spaces": false,
|
35714 |
+
"cls_token": "<|cls|>",
|
35715 |
+
"cls_token_id": 5,
|
35716 |
"eos_token": "<|end|>",
|
35717 |
+
"eos_token_id": 1,
|
35718 |
"mask_token": "<|mask|>",
|
35719 |
+
"mask_token_id": 6,
|
35720 |
"max_length": null,
|
35721 |
+
"model_max_length": 1048576,
|
35722 |
"pad_to_multiple_of": null,
|
35723 |
"pad_token": "<|pad|>",
|
35724 |
+
"pad_token_id": 2,
|
35725 |
"pad_token_type_id": 0,
|
35726 |
"padding_side": "left",
|
35727 |
+
"sep_token": "<|sep|>",
|
35728 |
+
"sep_token_id": 4,
|
35729 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
35730 |
+
"unk_token": "<|unk|>",
|
35731 |
+
"unk_token_id": 3
|
35732 |
}
|