maykcaldas committed
Commit: fb57838
Parent(s): 3852685

add tokenizer

Files changed:
- special_tokens_map.json +7 -0
- tokenizer.json +104 -0
- tokenizer_config.json +8 -0
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[bos]",
+  "mask_token": "[mask]",
+  "pad_token": "[nop]",
+  "sep_token": "[eos]",
+  "unk_token": "[unk]"
+}
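For context, the keys in this file are the same special-token keyword arguments that transformers' PreTrainedTokenizerFast accepts. A minimal sketch of wiring them up by hand, assuming the tokenizer.json added below is saved alongside this file (the path and setup are illustrative, not part of the commit):

from transformers import PreTrainedTokenizerFast

# Hypothetical local path: the files from this commit are assumed to sit
# in the working directory.
tok = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    cls_token="[bos]",
    mask_token="[mask]",
    pad_token="[nop]",
    sep_token="[eos]",
    unk_token="[unk]",
)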
tokenizer.json
ADDED
@@ -0,0 +1,104 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "WhitespaceSplit"
+  },
+  "post_processor": null,
+  "decoder": null,
+  "model": {
+    "type": "WordPiece",
+    "unk_token": "[unk]",
+    "continuing_subword_prefix": "##",
+    "max_input_chars_per_word": 100,
+    "vocab": {
+      "[As+1]": 0,
+      "[=SH0]": 1,
+      "[=SH1]": 2,
+      "[=Ring2]": 3,
+      "[=Ring1]": 4,
+      "[CH1]": 5,
+      "[S]": 6,
+      "[NH2+1]": 7,
+      "[B]": 8,
+      "[C-1]": 9,
+      "[#C]": 10,
+      "[=P]": 11,
+      "[As]": 12,
+      "[B-1]": 13,
+      "[bos]": 14,
+      "[O]": 15,
+      "[OH0]": 16,
+      "[I]": 17,
+      "[nop]": 18,
+      "[Cl]": 19,
+      "[SiH2]": 20,
+      "[Ring1]": 21,
+      "[Fe-4]": 22,
+      "[CH0]": 23,
+      "[Fe]": 24,
+      "[Fe+2]": 25,
+      "[CH1-1]": 26,
+      "[=Branch3]": 27,
+      "[#Branch1]": 28,
+      "[=Branch2]": 29,
+      "[NH0]": 30,
+      "[N-1]": 31,
+      "[C]": 32,
+      "[=NH2+1]": 33,
+      "[NH1-1]": 34,
+      "[#N+1]": 35,
+      "[SeH1]": 36,
+      "[Branch3]": 37,
+      "[SH1]": 38,
+      "[CH2-1]": 39,
+      "[SH0]": 40,
+      "[=Se]": 41,
+      "[NH1+1]": 42,
+      "[K]": 43,
+      "[Ring2]": 44,
+      "[#N]": 45,
+      "[O-1]": 46,
+      "[OH1+1]": 47,
+      "[#Branch2]": 48,
+      "[=C]": 49,
+      "[I+1]": 50,
+      "[Si]": 51,
+      "[F]": 52,
+      "[=N+1]": 53,
+      "[=OH1+1]": 54,
+      "[Branch2]": 55,
+      "[=O+1]": 56,
+      "[#S]": 57,
+      "[Na]": 58,
+      "[C+1]": 59,
+      "[=B]": 60,
+      "[S+1]": 61,
+      "[unk]": 62,
+      "[=Fe]": 63,
+      "[P]": 64,
+      "[=N]": 65,
+      "[SiH1]": 66,
+      "[NH3+1]": 67,
+      "[Fe-3]": 68,
+      "[CH1+1]": 69,
+      "[Branch1]": 70,
+      "[Fe+1]": 71,
+      "[=Branch1]": 72,
+      "[=S]": 73,
+      "[Se]": 74,
+      "[N]": 75,
+      "[=As]": 76,
+      "[#Ring2]": 77,
+      "[Br]": 78,
+      "[=O]": 79,
+      "[P+1]": 80,
+      "[N+1]": 81,
+      "[eos]": 82,
+      "[Se+1]": 83
+    }
+  }
+}
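The file above is a complete serialized tokenizers pipeline: a WhitespaceSplit pre-tokenizer followed by a WordPiece model over an 84-symbol vocabulary of SELFIES-style tokens. A minimal sketch of exercising it directly with the tokenizers library (the file path and input string are illustrative assumptions):

from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")  # assumed local copy of the file above

# WhitespaceSplit means input symbols must arrive space-separated;
# each bracketed symbol is then looked up whole by the WordPiece model.
enc = tok.encode("[C] [=O] [O-1]")
print(enc.tokens)  # expected: ['[C]', '[=O]', '[O-1]']
print(enc.ids)     # expected: [32, 79, 46]

Since no normalizer, post-processor, or decoder is configured, encoding is a plain vocabulary lookup and no special tokens are appended automatically.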
tokenizer_config.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "cls_token": "[bos]",
+  "mask_token": "[mask]",
+  "pad_token": "[nop]",
+  "sep_token": "[eos]",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "[unk]"
+}
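With all three files from this commit in one place (a local directory or the Hub repo itself), the tokenizer_class entry above means the usual loading path is from_pretrained. A sketch under that assumption, using a hypothetical local path:

from transformers import PreTrainedTokenizerFast

# "./" stands in for wherever the three JSON files from this commit live;
# the Hub repo id would work the same way.
tok = PreTrainedTokenizerFast.from_pretrained("./")
ids = tok("[C] [=O] [O-1]")["input_ids"]
print(tok.convert_ids_to_tokens(ids))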