maykcaldas committed
Commit fb57838 · 1 Parent(s): 3852685

add tokenizer

Files changed (3)
  1. special_tokens_map.json +7 -0
  2. tokenizer.json +104 -0
  3. tokenizer_config.json +8 -0
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[bos]",
+   "mask_token": "[mask]",
+   "pad_token": "[nop]",
+   "sep_token": "[eos]",
+   "unk_token": "[unk]"
+ }
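These special tokens only take effect once the raw tokenizer is wrapped by transformers. A minimal sketch of that wiring, assuming the tokenizer.json added below is saved in the working directory; from_pretrained normally applies this mapping automatically, so the explicit keyword arguments here just mirror the file above.

```python
from transformers import PreTrainedTokenizerFast

# Wrap the tokenizers-library file and register the special tokens from
# special_tokens_map.json by hand (from_pretrained would do this for us).
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",   # assumed local path to the file below
    cls_token="[bos]",
    sep_token="[eos]",
    pad_token="[nop]",
    mask_token="[mask]",
    unk_token="[unk]",
)

print(tokenizer.pad_token_id)  # 18, the id of "[nop]" in the vocab below
```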
tokenizer.json ADDED
@@ -0,0 +1,104 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [],
+   "normalizer": null,
+   "pre_tokenizer": {
+     "type": "WhitespaceSplit"
+   },
+   "post_processor": null,
+   "decoder": null,
+   "model": {
+     "type": "WordPiece",
+     "unk_token": "[unk]",
+     "continuing_subword_prefix": "##",
+     "max_input_chars_per_word": 100,
+     "vocab": {
+       "[As+1]": 0,
+       "[=SH0]": 1,
+       "[=SH1]": 2,
+       "[=Ring2]": 3,
+       "[=Ring1]": 4,
+       "[CH1]": 5,
+       "[S]": 6,
+       "[NH2+1]": 7,
+       "[B]": 8,
+       "[C-1]": 9,
+       "[#C]": 10,
+       "[=P]": 11,
+       "[As]": 12,
+       "[B-1]": 13,
+       "[bos]": 14,
+       "[O]": 15,
+       "[OH0]": 16,
+       "[I]": 17,
+       "[nop]": 18,
+       "[Cl]": 19,
+       "[SiH2]": 20,
+       "[Ring1]": 21,
+       "[Fe-4]": 22,
+       "[CH0]": 23,
+       "[Fe]": 24,
+       "[Fe+2]": 25,
+       "[CH1-1]": 26,
+       "[=Branch3]": 27,
+       "[#Branch1]": 28,
+       "[=Branch2]": 29,
+       "[NH0]": 30,
+       "[N-1]": 31,
+       "[C]": 32,
+       "[=NH2+1]": 33,
+       "[NH1-1]": 34,
+       "[#N+1]": 35,
+       "[SeH1]": 36,
+       "[Branch3]": 37,
+       "[SH1]": 38,
+       "[CH2-1]": 39,
+       "[SH0]": 40,
+       "[=Se]": 41,
+       "[NH1+1]": 42,
+       "[K]": 43,
+       "[Ring2]": 44,
+       "[#N]": 45,
+       "[O-1]": 46,
+       "[OH1+1]": 47,
+       "[#Branch2]": 48,
+       "[=C]": 49,
+       "[I+1]": 50,
+       "[Si]": 51,
+       "[F]": 52,
+       "[=N+1]": 53,
+       "[=OH1+1]": 54,
+       "[Branch2]": 55,
+       "[=O+1]": 56,
+       "[#S]": 57,
+       "[Na]": 58,
+       "[C+1]": 59,
+       "[=B]": 60,
+       "[S+1]": 61,
+       "[unk]": 62,
+       "[=Fe]": 63,
+       "[P]": 64,
+       "[=N]": 65,
+       "[SiH1]": 66,
+       "[NH3+1]": 67,
+       "[Fe-3]": 68,
+       "[CH1+1]": 69,
+       "[Branch1]": 70,
+       "[Fe+1]": 71,
+       "[=Branch1]": 72,
+       "[=S]": 73,
+       "[Se]": 74,
+       "[N]": 75,
+       "[=As]": 76,
+       "[#Ring2]": 77,
+       "[Br]": 78,
+       "[=O]": 79,
+       "[P+1]": 80,
+       "[N+1]": 81,
+       "[eos]": 82,
+       "[Se+1]": 83
+     }
+   }
+ }
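The model block above is a plain WordPiece lookup over a SELFIES-style vocabulary, and the WhitespaceSplit pre-tokenizer means inputs are expected as space-separated SELFIES tokens. A short sketch with the tokenizers library, assuming this file is saved locally as tokenizer.json; the example string is made up and only uses tokens from the vocab above.

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")

# WhitespaceSplit: tokens must already be separated by spaces; WordPiece
# then maps each one to its vocab id, falling back to "[unk]" (id 62).
enc = tok.encode("[C] [=C] [Branch1] [Ring1] [O]")
print(enc.tokens)  # ['[C]', '[=C]', '[Branch1]', '[Ring1]', '[O]']
print(enc.ids)     # [32, 49, 70, 21, 15]
```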
tokenizer_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "cls_token": "[bos]",
+   "mask_token": "[mask]",
+   "pad_token": "[nop]",
+   "sep_token": "[eos]",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "[unk]"
+ }
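With tokenizer_class set to PreTrainedTokenizerFast, the three files added in this commit load together through the usual transformers entry point. A hedged usage sketch; "user/repo" is a placeholder for whichever Hub repo this commit belongs to, and a local directory containing the three JSON files works the same way.

```python
from transformers import AutoTokenizer

# Placeholder repo id; a local path holding the three files also works.
tokenizer = AutoTokenizer.from_pretrained("user/repo")

ids = tokenizer("[C] [=C] [Branch1] [Ring1] [O]")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))
print(tokenizer.pad_token, tokenizer.sep_token)  # [nop] [eos]
```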