andrecornman committed on
Commit
d850aa6
1 Parent(s): 5a7d048

Upload tokenizer

glm_tokenizer.py ADDED
@@ -0,0 +1,48 @@
+ from tokenizers import Tokenizer
+ from tokenizers.models import BPE
+ from transformers import PreTrainedTokenizerFast
+
+
+ class gLM2Tokenizer(PreTrainedTokenizerFast):
+
+     VOCAB = [
+         "<cls>", "<pad>", "<eos>", "<unk>",
+         "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K",
+         "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z",
+         "O", "a", "t", "c", "g", "<+>", "<->", "<mask>", "<sep>",
+     ]
+
+     def __init__(
+         self,
+         unk_token="<unk>",
+         cls_token="<cls>",
+         pad_token="<pad>",
+         mask_token="<mask>",
+         eos_token="<eos>",
+         sep_token="<sep>",
+         pos_token="<+>",
+         neg_token="<->",
+         **kwargs,
+     ):
+         all_tokens = self.VOCAB
+         token_to_id = {tok: ind for ind, tok in enumerate(all_tokens)}
+
+         bpe = BPE(token_to_id, merges=[], unk_token=str(unk_token))
+         tokenizer = Tokenizer(bpe)
+         special_tokens = [cls_token, pad_token,
+                           mask_token, eos_token, sep_token, pos_token, neg_token]
+
+         tokenizer.add_special_tokens(
+             special_tokens,
+         )
+
+         super().__init__(
+             tokenizer_object=tokenizer,
+             unk_token=unk_token,
+             cls_token=cls_token,
+             pad_token=pad_token,
+             mask_token=mask_token,
+             eos_token=eos_token,
+             sep_token=sep_token,
+             **kwargs,
+         )
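
For orientation (not part of the uploaded files): a minimal usage sketch that instantiates the class above directly and tokenizes a made-up mixed sequence. The example string and the direct construction are assumptions for illustration only; this commit does not document the exact gLM2 input format.

# Minimal usage sketch, assuming glm_tokenizer.py (above) is on the import path.
# The example sequence is invented purely to exercise the vocabulary:
# uppercase letters are amino acids, lowercase a/t/c/g are nucleotides,
# and <+> / <-> mark strand orientation.
from glm_tokenizer import gLM2Tokenizer

tok = gLM2Tokenizer()
enc = tok("<+>MALWMRLLPL<->atgcat")

print(enc["input_ids"])
print(tok.convert_ids_to_tokens(enc["input_ids"]))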
special_tokens_map.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "cls_token": "<cls>",
+   "eos_token": "<eos>",
+   "mask_token": "<mask>",
+   "pad_token": "<pad>",
+   "sep_token": "<sep>",
+   "unk_token": "<unk>"
+ }
tokenizer.json ADDED
@@ -0,0 +1,133 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "<cls>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "<pad>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "<eos>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "<unk>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 33,
+       "content": "<+>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 34,
+       "content": "<->",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 35,
+       "content": "<mask>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 36,
+       "content": "<sep>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": null,
+   "post_processor": null,
+   "decoder": null,
+   "model": {
+     "type": "BPE",
+     "dropout": null,
+     "unk_token": "<unk>",
+     "continuing_subword_prefix": null,
+     "end_of_word_suffix": null,
+     "fuse_unk": false,
+     "byte_fallback": false,
+     "ignore_merges": false,
+     "vocab": {
+       "<cls>": 0,
+       "<pad>": 1,
+       "<eos>": 2,
+       "<unk>": 3,
+       "L": 4,
+       "A": 5,
+       "G": 6,
+       "V": 7,
+       "S": 8,
+       "E": 9,
+       "R": 10,
+       "T": 11,
+       "I": 12,
+       "D": 13,
+       "P": 14,
+       "K": 15,
+       "Q": 16,
+       "N": 17,
+       "F": 18,
+       "Y": 19,
+       "M": 20,
+       "H": 21,
+       "W": 22,
+       "C": 23,
+       "X": 24,
+       "B": 25,
+       "U": 26,
+       "Z": 27,
+       "O": 28,
+       "a": 29,
+       "t": 30,
+       "c": 31,
+       "g": 32,
+       "<+>": 33,
+       "<->": 34,
+       "<mask>": 35,
+       "<sep>": 36
+     },
+     "merges": []
+   }
+ }
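
A point worth noting about the serialized model above (again, not part of the commit itself): "merges" is empty and there is no pre_tokenizer, so the BPE model reduces to single-character lookups over this vocabulary, with added tokens such as <mask> matched first. A small sketch using the tokenizers API, assuming tokenizer.json sits in the working directory:

# Sketch: load the serialized tokenizer.json directly with the tokenizers library
# (local file path assumed).
from tokenizers import Tokenizer

tk = Tokenizer.from_file("tokenizer.json")

# Empty "merges" plus no pre_tokenizer means each character maps straight to its
# vocab id; added tokens like <mask> are matched before the BPE model runs.
print(tk.encode("MAL").tokens)        # expected: ['M', 'A', 'L']
print(tk.encode("<mask>atg").tokens)  # expected: ['<mask>', 'a', 't', 'g']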
tokenizer_config.json ADDED
@@ -0,0 +1,83 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<cls>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<eos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "33": {
+       "content": "<+>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "34": {
+       "content": "<->",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "35": {
+       "content": "<mask>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "36": {
+       "content": "<sep>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "auto_map": {
+     "AutoTokenizer": [
+       "glm_tokenizer.gLM2Tokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<cls>",
+   "eos_token": "<eos>",
+   "mask_token": "<mask>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "sep_token": "<sep>",
+   "tokenizer_class": "gLM2Tokenizer",
+   "unk_token": "<unk>"
+ }
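
The "auto_map" entry is what lets AutoTokenizer resolve the custom class from glm_tokenizer.py when the repository is loaded with remote code enabled. A minimal loading sketch; the repo id below is a placeholder, since the repository name is not shown in this commit:

# Loading sketch via auto_map (repo id is a placeholder, not the real repository).
from transformers import AutoTokenizer

# trust_remote_code=True lets transformers import glm_tokenizer.gLM2Tokenizer
# from the repository, as declared in "auto_map" above.
tok = AutoTokenizer.from_pretrained("username/glm2-tokenizer-repo", trust_remote_code=True)
print(type(tok).__name__)  # should be gLM2Tokenizer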