jogonba2 commited on
Commit
c9f5f6f
1 Parent(s): a5fd687

Upload tokenizer

Browse files
Files changed (5) hide show
  1. merges.txt +191 -0
  2. special_tokens_map.json +30 -0
  3. tokenizer.json +1065 -0
  4. tokenizer_config.json +22 -0
  5. vocab.json +1 -0
merges.txt ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version: 0.2
2
+ Ġ t
3
+ h e
4
+ Ġ a
5
+ o u
6
+ Ġ s
7
+ Ġ m
8
+ i n
9
+ Ġ w
10
+ r e
11
+ h a
12
+ Ġt he
13
+ n d
14
+ Ġ b
15
+ i s
16
+ o r
17
+ Ġ f
18
+ e r
19
+ l l
20
+ i t
21
+ o n
22
+ Ġ d
23
+ Ġ c
24
+ e s
25
+ Ġ l
26
+ e n
27
+ Ġ n
28
+ Ġ y
29
+ a r
30
+ Ġt h
31
+ Ġ h
32
+ Ġ o
33
+ Ġt o
34
+ Ġ p
35
+ Ġy ou
36
+ ha t
37
+ Ġ I
38
+ Ġ he
39
+ o t
40
+ v e
41
+ in g
42
+ Ġo f
43
+ s t
44
+ Ġa nd
45
+ o w
46
+ a n
47
+ o m
48
+ Ġ g
49
+ a t
50
+ Ġb e
51
+ s e
52
+ c e
53
+ Ġm y
54
+ Ġ in
55
+ Ġ ha
56
+ l e
57
+ a y
58
+ l d
59
+ e t
60
+ i r
61
+ e d
62
+ u t
63
+ i m
64
+ it h
65
+ ' s
66
+ Ġm e
67
+ Ġn ot
68
+ Ġt hat
69
+ c h
70
+ g h
71
+ ou r
72
+ Ġ is
73
+ A nd
74
+ Ġ u
75
+ Ġf or
76
+ k e
77
+ Ġw e
78
+ o o
79
+ i ll
80
+ Ġ e
81
+ he r
82
+ Ġw ith
83
+ Ġyou r
84
+ Ġ it
85
+ a d
86
+ en t
87
+ r i
88
+ Ġs t
89
+ Ġth ou
90
+ Ġh is
91
+ ' d
92
+ Ġ k
93
+ or d
94
+ om e
95
+ E N
96
+ gh t
97
+ r a
98
+ T he
99
+ Ġ re
100
+ I N
101
+ Ġh im
102
+ l y
103
+ Ġl i
104
+ Ġha ve
105
+ i d
106
+ a s
107
+ u r
108
+ a l
109
+ Ġth is
110
+ Ġd e
111
+ Ġs o
112
+ Ġ on
113
+ Ġa s
114
+ A R
115
+ r o
116
+ o re
117
+ h i
118
+ ou ld
119
+ oo d
120
+ c k
121
+ v er
122
+ a in
123
+ es t
124
+ es s
125
+ Ġth y
126
+ Ġs ha
127
+ U S
128
+ Ġd o
129
+ e a
130
+ Ġw ill
131
+ Ġn o
132
+ a m
133
+ u s
134
+ Ġb ut
135
+ g e
136
+ a nd
137
+ Ġs e
138
+ i f
139
+ I O
140
+ T h
141
+ i on
142
+ T o
143
+ Ġa ll
144
+ Ġs u
145
+ a ke
146
+ t h
147
+ e ar
148
+ u e
149
+ Ġa n
150
+ t er
151
+ Ġl o
152
+ ar d
153
+ IN G
154
+ ha n
155
+ Ġ our
156
+ Ġhe r
157
+ Ġb y
158
+ Ġs p
159
+ Ġf a
160
+ e ll
161
+ Ġ R
162
+ Ġsha ll
163
+ Ġ C
164
+ Ġthe e
165
+ r om
166
+ h o
167
+ i l
168
+ E S
169
+ c t
170
+ ou s
171
+ O R
172
+ u st
173
+ Ġ v
174
+ Ġn e
175
+ Ġa re
176
+ T hat
177
+ u l
178
+ Ġk n
179
+ i ght
180
+ E R
181
+ Ġw hat
182
+ Ġl ord
183
+ Ġs h
184
+ a st
185
+ at h
186
+ se l
187
+ Ġu p
188
+ ar t
189
+ Ġ E
190
+ L A
191
+ K ING
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
@@ -0,0 +1,1065 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<|endoftext|>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ }
15
+ ],
16
+ "normalizer": null,
17
+ "pre_tokenizer": {
18
+ "type": "ByteLevel",
19
+ "add_prefix_space": false,
20
+ "trim_offsets": true,
21
+ "use_regex": true
22
+ },
23
+ "post_processor": {
24
+ "type": "ByteLevel",
25
+ "add_prefix_space": true,
26
+ "trim_offsets": false,
27
+ "use_regex": true
28
+ },
29
+ "decoder": {
30
+ "type": "ByteLevel",
31
+ "add_prefix_space": true,
32
+ "trim_offsets": true,
33
+ "use_regex": true
34
+ },
35
+ "model": {
36
+ "type": "BPE",
37
+ "dropout": null,
38
+ "unk_token": null,
39
+ "continuing_subword_prefix": "",
40
+ "end_of_word_suffix": "",
41
+ "fuse_unk": false,
42
+ "byte_fallback": false,
43
+ "ignore_merges": false,
44
+ "vocab": {
45
+ "<|endoftext|>": 0,
46
+ "!": 1,
47
+ "$": 2,
48
+ "&": 3,
49
+ "'": 4,
50
+ ",": 5,
51
+ "-": 6,
52
+ ".": 7,
53
+ "3": 8,
54
+ ":": 9,
55
+ ";": 10,
56
+ "?": 11,
57
+ "A": 12,
58
+ "B": 13,
59
+ "C": 14,
60
+ "D": 15,
61
+ "E": 16,
62
+ "F": 17,
63
+ "G": 18,
64
+ "H": 19,
65
+ "I": 20,
66
+ "J": 21,
67
+ "K": 22,
68
+ "L": 23,
69
+ "M": 24,
70
+ "N": 25,
71
+ "O": 26,
72
+ "P": 27,
73
+ "Q": 28,
74
+ "R": 29,
75
+ "S": 30,
76
+ "T": 31,
77
+ "U": 32,
78
+ "V": 33,
79
+ "W": 34,
80
+ "X": 35,
81
+ "Y": 36,
82
+ "Z": 37,
83
+ "a": 38,
84
+ "b": 39,
85
+ "c": 40,
86
+ "d": 41,
87
+ "e": 42,
88
+ "f": 43,
89
+ "g": 44,
90
+ "h": 45,
91
+ "i": 46,
92
+ "j": 47,
93
+ "k": 48,
94
+ "l": 49,
95
+ "m": 50,
96
+ "n": 51,
97
+ "o": 52,
98
+ "p": 53,
99
+ "q": 54,
100
+ "r": 55,
101
+ "s": 56,
102
+ "t": 57,
103
+ "u": 58,
104
+ "v": 59,
105
+ "w": 60,
106
+ "x": 61,
107
+ "y": 62,
108
+ "z": 63,
109
+ "Ċ": 64,
110
+ "Ġ": 65,
111
+ "Ġt": 66,
112
+ "he": 67,
113
+ "Ġa": 68,
114
+ "ou": 69,
115
+ "Ġs": 70,
116
+ "Ġm": 71,
117
+ "in": 72,
118
+ "Ġw": 73,
119
+ "re": 74,
120
+ "ha": 75,
121
+ "Ġthe": 76,
122
+ "nd": 77,
123
+ "Ġb": 78,
124
+ "is": 79,
125
+ "or": 80,
126
+ "Ġf": 81,
127
+ "er": 82,
128
+ "ll": 83,
129
+ "it": 84,
130
+ "on": 85,
131
+ "Ġd": 86,
132
+ "Ġc": 87,
133
+ "es": 88,
134
+ "Ġl": 89,
135
+ "en": 90,
136
+ "Ġn": 91,
137
+ "Ġy": 92,
138
+ "ar": 93,
139
+ "Ġth": 94,
140
+ "Ġh": 95,
141
+ "Ġo": 96,
142
+ "Ġto": 97,
143
+ "Ġp": 98,
144
+ "Ġyou": 99,
145
+ "hat": 100,
146
+ "ĠI": 101,
147
+ "Ġhe": 102,
148
+ "ot": 103,
149
+ "ve": 104,
150
+ "ing": 105,
151
+ "Ġof": 106,
152
+ "st": 107,
153
+ "Ġand": 108,
154
+ "ow": 109,
155
+ "an": 110,
156
+ "om": 111,
157
+ "Ġg": 112,
158
+ "at": 113,
159
+ "Ġbe": 114,
160
+ "se": 115,
161
+ "ce": 116,
162
+ "Ġmy": 117,
163
+ "Ġin": 118,
164
+ "Ġha": 119,
165
+ "le": 120,
166
+ "ay": 121,
167
+ "ld": 122,
168
+ "et": 123,
169
+ "ir": 124,
170
+ "ed": 125,
171
+ "ut": 126,
172
+ "im": 127,
173
+ "ith": 128,
174
+ "'s": 129,
175
+ "Ġme": 130,
176
+ "Ġnot": 131,
177
+ "Ġthat": 132,
178
+ "ch": 133,
179
+ "gh": 134,
180
+ "our": 135,
181
+ "Ġis": 136,
182
+ "And": 137,
183
+ "Ġu": 138,
184
+ "Ġfor": 139,
185
+ "ke": 140,
186
+ "Ġwe": 141,
187
+ "oo": 142,
188
+ "ill": 143,
189
+ "Ġe": 144,
190
+ "her": 145,
191
+ "Ġwith": 146,
192
+ "Ġyour": 147,
193
+ "Ġit": 148,
194
+ "ad": 149,
195
+ "ent": 150,
196
+ "ri": 151,
197
+ "Ġst": 152,
198
+ "Ġthou": 153,
199
+ "Ġhis": 154,
200
+ "'d": 155,
201
+ "Ġk": 156,
202
+ "ord": 157,
203
+ "ome": 158,
204
+ "EN": 159,
205
+ "ght": 160,
206
+ "ra": 161,
207
+ "The": 162,
208
+ "Ġre": 163,
209
+ "IN": 164,
210
+ "Ġhim": 165,
211
+ "ly": 166,
212
+ "Ġli": 167,
213
+ "Ġhave": 168,
214
+ "id": 169,
215
+ "as": 170,
216
+ "ur": 171,
217
+ "al": 172,
218
+ "Ġthis": 173,
219
+ "Ġde": 174,
220
+ "Ġso": 175,
221
+ "Ġon": 176,
222
+ "Ġas": 177,
223
+ "AR": 178,
224
+ "ro": 179,
225
+ "ore": 180,
226
+ "hi": 181,
227
+ "ould": 182,
228
+ "ood": 183,
229
+ "ck": 184,
230
+ "ver": 185,
231
+ "ain": 186,
232
+ "est": 187,
233
+ "ess": 188,
234
+ "Ġthy": 189,
235
+ "Ġsha": 190,
236
+ "US": 191,
237
+ "Ġdo": 192,
238
+ "ea": 193,
239
+ "Ġwill": 194,
240
+ "Ġno": 195,
241
+ "am": 196,
242
+ "us": 197,
243
+ "Ġbut": 198,
244
+ "ge": 199,
245
+ "and": 200,
246
+ "Ġse": 201,
247
+ "if": 202,
248
+ "IO": 203,
249
+ "Th": 204,
250
+ "ion": 205,
251
+ "To": 206,
252
+ "Ġall": 207,
253
+ "Ġsu": 208,
254
+ "ake": 209,
255
+ "th": 210,
256
+ "ear": 211,
257
+ "ue": 212,
258
+ "Ġan": 213,
259
+ "ter": 214,
260
+ "Ġlo": 215,
261
+ "ard": 216,
262
+ "ING": 217,
263
+ "han": 218,
264
+ "Ġour": 219,
265
+ "Ġher": 220,
266
+ "Ġby": 221,
267
+ "Ġsp": 222,
268
+ "Ġfa": 223,
269
+ "ell": 224,
270
+ "ĠR": 225,
271
+ "Ġshall": 226,
272
+ "ĠC": 227,
273
+ "Ġthee": 228,
274
+ "rom": 229,
275
+ "ho": 230,
276
+ "il": 231,
277
+ "ES": 232,
278
+ "ct": 233,
279
+ "ous": 234,
280
+ "OR": 235,
281
+ "ust": 236,
282
+ "Ġv": 237,
283
+ "Ġne": 238,
284
+ "Ġare": 239,
285
+ "That": 240,
286
+ "ul": 241,
287
+ "Ġkn": 242,
288
+ "ight": 243,
289
+ "ER": 244,
290
+ "Ġwhat": 245,
291
+ "Ġlord": 246,
292
+ "Ġsh": 247,
293
+ "ast": 248,
294
+ "ath": 249,
295
+ "sel": 250,
296
+ "Ġup": 251,
297
+ "art": 252,
298
+ "ĠE": 253,
299
+ "LA": 254,
300
+ "KING": 255
301
+ },
302
+ "merges": [
303
+ [
304
+ "Ġ",
305
+ "t"
306
+ ],
307
+ [
308
+ "h",
309
+ "e"
310
+ ],
311
+ [
312
+ "Ġ",
313
+ "a"
314
+ ],
315
+ [
316
+ "o",
317
+ "u"
318
+ ],
319
+ [
320
+ "Ġ",
321
+ "s"
322
+ ],
323
+ [
324
+ "Ġ",
325
+ "m"
326
+ ],
327
+ [
328
+ "i",
329
+ "n"
330
+ ],
331
+ [
332
+ "Ġ",
333
+ "w"
334
+ ],
335
+ [
336
+ "r",
337
+ "e"
338
+ ],
339
+ [
340
+ "h",
341
+ "a"
342
+ ],
343
+ [
344
+ "Ġt",
345
+ "he"
346
+ ],
347
+ [
348
+ "n",
349
+ "d"
350
+ ],
351
+ [
352
+ "Ġ",
353
+ "b"
354
+ ],
355
+ [
356
+ "i",
357
+ "s"
358
+ ],
359
+ [
360
+ "o",
361
+ "r"
362
+ ],
363
+ [
364
+ "Ġ",
365
+ "f"
366
+ ],
367
+ [
368
+ "e",
369
+ "r"
370
+ ],
371
+ [
372
+ "l",
373
+ "l"
374
+ ],
375
+ [
376
+ "i",
377
+ "t"
378
+ ],
379
+ [
380
+ "o",
381
+ "n"
382
+ ],
383
+ [
384
+ "Ġ",
385
+ "d"
386
+ ],
387
+ [
388
+ "Ġ",
389
+ "c"
390
+ ],
391
+ [
392
+ "e",
393
+ "s"
394
+ ],
395
+ [
396
+ "Ġ",
397
+ "l"
398
+ ],
399
+ [
400
+ "e",
401
+ "n"
402
+ ],
403
+ [
404
+ "Ġ",
405
+ "n"
406
+ ],
407
+ [
408
+ "Ġ",
409
+ "y"
410
+ ],
411
+ [
412
+ "a",
413
+ "r"
414
+ ],
415
+ [
416
+ "Ġt",
417
+ "h"
418
+ ],
419
+ [
420
+ "Ġ",
421
+ "h"
422
+ ],
423
+ [
424
+ "Ġ",
425
+ "o"
426
+ ],
427
+ [
428
+ "Ġt",
429
+ "o"
430
+ ],
431
+ [
432
+ "Ġ",
433
+ "p"
434
+ ],
435
+ [
436
+ "Ġy",
437
+ "ou"
438
+ ],
439
+ [
440
+ "ha",
441
+ "t"
442
+ ],
443
+ [
444
+ "Ġ",
445
+ "I"
446
+ ],
447
+ [
448
+ "Ġ",
449
+ "he"
450
+ ],
451
+ [
452
+ "o",
453
+ "t"
454
+ ],
455
+ [
456
+ "v",
457
+ "e"
458
+ ],
459
+ [
460
+ "in",
461
+ "g"
462
+ ],
463
+ [
464
+ "Ġo",
465
+ "f"
466
+ ],
467
+ [
468
+ "s",
469
+ "t"
470
+ ],
471
+ [
472
+ "Ġa",
473
+ "nd"
474
+ ],
475
+ [
476
+ "o",
477
+ "w"
478
+ ],
479
+ [
480
+ "a",
481
+ "n"
482
+ ],
483
+ [
484
+ "o",
485
+ "m"
486
+ ],
487
+ [
488
+ "Ġ",
489
+ "g"
490
+ ],
491
+ [
492
+ "a",
493
+ "t"
494
+ ],
495
+ [
496
+ "Ġb",
497
+ "e"
498
+ ],
499
+ [
500
+ "s",
501
+ "e"
502
+ ],
503
+ [
504
+ "c",
505
+ "e"
506
+ ],
507
+ [
508
+ "Ġm",
509
+ "y"
510
+ ],
511
+ [
512
+ "Ġ",
513
+ "in"
514
+ ],
515
+ [
516
+ "Ġ",
517
+ "ha"
518
+ ],
519
+ [
520
+ "l",
521
+ "e"
522
+ ],
523
+ [
524
+ "a",
525
+ "y"
526
+ ],
527
+ [
528
+ "l",
529
+ "d"
530
+ ],
531
+ [
532
+ "e",
533
+ "t"
534
+ ],
535
+ [
536
+ "i",
537
+ "r"
538
+ ],
539
+ [
540
+ "e",
541
+ "d"
542
+ ],
543
+ [
544
+ "u",
545
+ "t"
546
+ ],
547
+ [
548
+ "i",
549
+ "m"
550
+ ],
551
+ [
552
+ "it",
553
+ "h"
554
+ ],
555
+ [
556
+ "'",
557
+ "s"
558
+ ],
559
+ [
560
+ "Ġm",
561
+ "e"
562
+ ],
563
+ [
564
+ "Ġn",
565
+ "ot"
566
+ ],
567
+ [
568
+ "Ġt",
569
+ "hat"
570
+ ],
571
+ [
572
+ "c",
573
+ "h"
574
+ ],
575
+ [
576
+ "g",
577
+ "h"
578
+ ],
579
+ [
580
+ "ou",
581
+ "r"
582
+ ],
583
+ [
584
+ "Ġ",
585
+ "is"
586
+ ],
587
+ [
588
+ "A",
589
+ "nd"
590
+ ],
591
+ [
592
+ "Ġ",
593
+ "u"
594
+ ],
595
+ [
596
+ "Ġf",
597
+ "or"
598
+ ],
599
+ [
600
+ "k",
601
+ "e"
602
+ ],
603
+ [
604
+ "Ġw",
605
+ "e"
606
+ ],
607
+ [
608
+ "o",
609
+ "o"
610
+ ],
611
+ [
612
+ "i",
613
+ "ll"
614
+ ],
615
+ [
616
+ "Ġ",
617
+ "e"
618
+ ],
619
+ [
620
+ "he",
621
+ "r"
622
+ ],
623
+ [
624
+ "Ġw",
625
+ "ith"
626
+ ],
627
+ [
628
+ "Ġyou",
629
+ "r"
630
+ ],
631
+ [
632
+ "Ġ",
633
+ "it"
634
+ ],
635
+ [
636
+ "a",
637
+ "d"
638
+ ],
639
+ [
640
+ "en",
641
+ "t"
642
+ ],
643
+ [
644
+ "r",
645
+ "i"
646
+ ],
647
+ [
648
+ "Ġs",
649
+ "t"
650
+ ],
651
+ [
652
+ "Ġth",
653
+ "ou"
654
+ ],
655
+ [
656
+ "Ġh",
657
+ "is"
658
+ ],
659
+ [
660
+ "'",
661
+ "d"
662
+ ],
663
+ [
664
+ "Ġ",
665
+ "k"
666
+ ],
667
+ [
668
+ "or",
669
+ "d"
670
+ ],
671
+ [
672
+ "om",
673
+ "e"
674
+ ],
675
+ [
676
+ "E",
677
+ "N"
678
+ ],
679
+ [
680
+ "gh",
681
+ "t"
682
+ ],
683
+ [
684
+ "r",
685
+ "a"
686
+ ],
687
+ [
688
+ "T",
689
+ "he"
690
+ ],
691
+ [
692
+ "Ġ",
693
+ "re"
694
+ ],
695
+ [
696
+ "I",
697
+ "N"
698
+ ],
699
+ [
700
+ "Ġh",
701
+ "im"
702
+ ],
703
+ [
704
+ "l",
705
+ "y"
706
+ ],
707
+ [
708
+ "Ġl",
709
+ "i"
710
+ ],
711
+ [
712
+ "Ġha",
713
+ "ve"
714
+ ],
715
+ [
716
+ "i",
717
+ "d"
718
+ ],
719
+ [
720
+ "a",
721
+ "s"
722
+ ],
723
+ [
724
+ "u",
725
+ "r"
726
+ ],
727
+ [
728
+ "a",
729
+ "l"
730
+ ],
731
+ [
732
+ "Ġth",
733
+ "is"
734
+ ],
735
+ [
736
+ "Ġd",
737
+ "e"
738
+ ],
739
+ [
740
+ "Ġs",
741
+ "o"
742
+ ],
743
+ [
744
+ "Ġ",
745
+ "on"
746
+ ],
747
+ [
748
+ "Ġa",
749
+ "s"
750
+ ],
751
+ [
752
+ "A",
753
+ "R"
754
+ ],
755
+ [
756
+ "r",
757
+ "o"
758
+ ],
759
+ [
760
+ "o",
761
+ "re"
762
+ ],
763
+ [
764
+ "h",
765
+ "i"
766
+ ],
767
+ [
768
+ "ou",
769
+ "ld"
770
+ ],
771
+ [
772
+ "oo",
773
+ "d"
774
+ ],
775
+ [
776
+ "c",
777
+ "k"
778
+ ],
779
+ [
780
+ "v",
781
+ "er"
782
+ ],
783
+ [
784
+ "a",
785
+ "in"
786
+ ],
787
+ [
788
+ "es",
789
+ "t"
790
+ ],
791
+ [
792
+ "es",
793
+ "s"
794
+ ],
795
+ [
796
+ "Ġth",
797
+ "y"
798
+ ],
799
+ [
800
+ "Ġs",
801
+ "ha"
802
+ ],
803
+ [
804
+ "U",
805
+ "S"
806
+ ],
807
+ [
808
+ "Ġd",
809
+ "o"
810
+ ],
811
+ [
812
+ "e",
813
+ "a"
814
+ ],
815
+ [
816
+ "Ġw",
817
+ "ill"
818
+ ],
819
+ [
820
+ "Ġn",
821
+ "o"
822
+ ],
823
+ [
824
+ "a",
825
+ "m"
826
+ ],
827
+ [
828
+ "u",
829
+ "s"
830
+ ],
831
+ [
832
+ "Ġb",
833
+ "ut"
834
+ ],
835
+ [
836
+ "g",
837
+ "e"
838
+ ],
839
+ [
840
+ "a",
841
+ "nd"
842
+ ],
843
+ [
844
+ "Ġs",
845
+ "e"
846
+ ],
847
+ [
848
+ "i",
849
+ "f"
850
+ ],
851
+ [
852
+ "I",
853
+ "O"
854
+ ],
855
+ [
856
+ "T",
857
+ "h"
858
+ ],
859
+ [
860
+ "i",
861
+ "on"
862
+ ],
863
+ [
864
+ "T",
865
+ "o"
866
+ ],
867
+ [
868
+ "Ġa",
869
+ "ll"
870
+ ],
871
+ [
872
+ "Ġs",
873
+ "u"
874
+ ],
875
+ [
876
+ "a",
877
+ "ke"
878
+ ],
879
+ [
880
+ "t",
881
+ "h"
882
+ ],
883
+ [
884
+ "e",
885
+ "ar"
886
+ ],
887
+ [
888
+ "u",
889
+ "e"
890
+ ],
891
+ [
892
+ "Ġa",
893
+ "n"
894
+ ],
895
+ [
896
+ "t",
897
+ "er"
898
+ ],
899
+ [
900
+ "Ġl",
901
+ "o"
902
+ ],
903
+ [
904
+ "ar",
905
+ "d"
906
+ ],
907
+ [
908
+ "IN",
909
+ "G"
910
+ ],
911
+ [
912
+ "ha",
913
+ "n"
914
+ ],
915
+ [
916
+ "Ġ",
917
+ "our"
918
+ ],
919
+ [
920
+ "Ġhe",
921
+ "r"
922
+ ],
923
+ [
924
+ "Ġb",
925
+ "y"
926
+ ],
927
+ [
928
+ "Ġs",
929
+ "p"
930
+ ],
931
+ [
932
+ "Ġf",
933
+ "a"
934
+ ],
935
+ [
936
+ "e",
937
+ "ll"
938
+ ],
939
+ [
940
+ "Ġ",
941
+ "R"
942
+ ],
943
+ [
944
+ "Ġsha",
945
+ "ll"
946
+ ],
947
+ [
948
+ "Ġ",
949
+ "C"
950
+ ],
951
+ [
952
+ "Ġthe",
953
+ "e"
954
+ ],
955
+ [
956
+ "r",
957
+ "om"
958
+ ],
959
+ [
960
+ "h",
961
+ "o"
962
+ ],
963
+ [
964
+ "i",
965
+ "l"
966
+ ],
967
+ [
968
+ "E",
969
+ "S"
970
+ ],
971
+ [
972
+ "c",
973
+ "t"
974
+ ],
975
+ [
976
+ "ou",
977
+ "s"
978
+ ],
979
+ [
980
+ "O",
981
+ "R"
982
+ ],
983
+ [
984
+ "u",
985
+ "st"
986
+ ],
987
+ [
988
+ "Ġ",
989
+ "v"
990
+ ],
991
+ [
992
+ "Ġn",
993
+ "e"
994
+ ],
995
+ [
996
+ "Ġa",
997
+ "re"
998
+ ],
999
+ [
1000
+ "T",
1001
+ "hat"
1002
+ ],
1003
+ [
1004
+ "u",
1005
+ "l"
1006
+ ],
1007
+ [
1008
+ "Ġk",
1009
+ "n"
1010
+ ],
1011
+ [
1012
+ "i",
1013
+ "ght"
1014
+ ],
1015
+ [
1016
+ "E",
1017
+ "R"
1018
+ ],
1019
+ [
1020
+ "Ġw",
1021
+ "hat"
1022
+ ],
1023
+ [
1024
+ "Ġl",
1025
+ "ord"
1026
+ ],
1027
+ [
1028
+ "Ġs",
1029
+ "h"
1030
+ ],
1031
+ [
1032
+ "a",
1033
+ "st"
1034
+ ],
1035
+ [
1036
+ "at",
1037
+ "h"
1038
+ ],
1039
+ [
1040
+ "se",
1041
+ "l"
1042
+ ],
1043
+ [
1044
+ "Ġu",
1045
+ "p"
1046
+ ],
1047
+ [
1048
+ "ar",
1049
+ "t"
1050
+ ],
1051
+ [
1052
+ "Ġ",
1053
+ "E"
1054
+ ],
1055
+ [
1056
+ "L",
1057
+ "A"
1058
+ ],
1059
+ [
1060
+ "K",
1061
+ "ING"
1062
+ ]
1063
+ ]
1064
+ }
1065
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ }
13
+ },
14
+ "bos_token": "<|endoftext|>",
15
+ "clean_up_tokenization_spaces": false,
16
+ "eos_token": "<|endoftext|>",
17
+ "errors": "replace",
18
+ "model_max_length": 1000000000000000019884624838656,
19
+ "pad_token": "<|endoftext|>",
20
+ "tokenizer_class": "MinGRUTokenizer",
21
+ "unk_token": "<|endoftext|>"
22
+ }
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<|endoftext|>":0,"!":1,"$":2,"&":3,"'":4,",":5,"-":6,".":7,"3":8,":":9,";":10,"?":11,"A":12,"B":13,"C":14,"D":15,"E":16,"F":17,"G":18,"H":19,"I":20,"J":21,"K":22,"L":23,"M":24,"N":25,"O":26,"P":27,"Q":28,"R":29,"S":30,"T":31,"U":32,"V":33,"W":34,"X":35,"Y":36,"Z":37,"a":38,"b":39,"c":40,"d":41,"e":42,"f":43,"g":44,"h":45,"i":46,"j":47,"k":48,"l":49,"m":50,"n":51,"o":52,"p":53,"q":54,"r":55,"s":56,"t":57,"u":58,"v":59,"w":60,"x":61,"y":62,"z":63,"Ċ":64,"Ġ":65,"Ġt":66,"he":67,"Ġa":68,"ou":69,"Ġs":70,"Ġm":71,"in":72,"Ġw":73,"re":74,"ha":75,"Ġthe":76,"nd":77,"Ġb":78,"is":79,"or":80,"Ġf":81,"er":82,"ll":83,"it":84,"on":85,"Ġd":86,"Ġc":87,"es":88,"Ġl":89,"en":90,"Ġn":91,"Ġy":92,"ar":93,"Ġth":94,"Ġh":95,"Ġo":96,"Ġto":97,"Ġp":98,"Ġyou":99,"hat":100,"ĠI":101,"Ġhe":102,"ot":103,"ve":104,"ing":105,"Ġof":106,"st":107,"Ġand":108,"ow":109,"an":110,"om":111,"Ġg":112,"at":113,"Ġbe":114,"se":115,"ce":116,"Ġmy":117,"Ġin":118,"Ġha":119,"le":120,"ay":121,"ld":122,"et":123,"ir":124,"ed":125,"ut":126,"im":127,"ith":128,"'s":129,"Ġme":130,"Ġnot":131,"Ġthat":132,"ch":133,"gh":134,"our":135,"Ġis":136,"And":137,"Ġu":138,"Ġfor":139,"ke":140,"Ġwe":141,"oo":142,"ill":143,"Ġe":144,"her":145,"Ġwith":146,"Ġyour":147,"Ġit":148,"ad":149,"ent":150,"ri":151,"Ġst":152,"Ġthou":153,"Ġhis":154,"'d":155,"Ġk":156,"ord":157,"ome":158,"EN":159,"ght":160,"ra":161,"The":162,"Ġre":163,"IN":164,"Ġhim":165,"ly":166,"Ġli":167,"Ġhave":168,"id":169,"as":170,"ur":171,"al":172,"Ġthis":173,"Ġde":174,"Ġso":175,"Ġon":176,"Ġas":177,"AR":178,"ro":179,"ore":180,"hi":181,"ould":182,"ood":183,"ck":184,"ver":185,"ain":186,"est":187,"ess":188,"Ġthy":189,"Ġsha":190,"US":191,"Ġdo":192,"ea":193,"Ġwill":194,"Ġno":195,"am":196,"us":197,"Ġbut":198,"ge":199,"and":200,"Ġse":201,"if":202,"IO":203,"Th":204,"ion":205,"To":206,"Ġall":207,"Ġsu":208,"ake":209,"th":210,"ear":211,"ue":212,"Ġan":213,"ter":214,"Ġlo":215,"ard":216,"ING":217,"han":218,"Ġour":219,"Ġher":220,"Ġby":221,"Ġsp":222,"Ġfa":223,"ell":224,"ĠR":225,"Ġshall":226,"ĠC":227,"Ġthee":228,"rom":229,"ho":230,"il":231,"ES":232,"ct":233,"ous":234,"OR":235,"ust":236,"Ġv":237,"Ġne":238,"Ġare":239,"That":240,"ul":241,"Ġkn":242,"ight":243,"ER":244,"Ġwhat":245,"Ġlord":246,"Ġsh":247,"ast":248,"ath":249,"sel":250,"Ġup":251,"art":252,"ĠE":253,"LA":254,"KING":255}