AIDSC TimeRobber committed on
Commit
45e3df8
0 Parent(s):

Duplicate from bigscience/mt0-xxl

Browse files

Co-authored-by: Thomas Wang <TimeRobber@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.npy filter=lfs diff=lfs merge=lfs -text
14
+ *.npz filter=lfs diff=lfs merge=lfs -text
15
+ *.onnx filter=lfs diff=lfs merge=lfs -text
16
+ *.ot filter=lfs diff=lfs merge=lfs -text
17
+ *.parquet filter=lfs diff=lfs merge=lfs -text
18
+ *.pb filter=lfs diff=lfs merge=lfs -text
19
+ *.pickle filter=lfs diff=lfs merge=lfs -text
20
+ *.pkl filter=lfs diff=lfs merge=lfs -text
21
+ *.pt filter=lfs diff=lfs merge=lfs -text
22
+ *.pth filter=lfs diff=lfs merge=lfs -text
23
+ *.rar filter=lfs diff=lfs merge=lfs -text
24
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
25
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
26
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
27
+ *.tflite filter=lfs diff=lfs merge=lfs -text
28
+ *.tgz filter=lfs diff=lfs merge=lfs -text
29
+ *.wasm filter=lfs diff=lfs merge=lfs -text
30
+ *.xz filter=lfs diff=lfs merge=lfs -text
31
+ *.zip filter=lfs diff=lfs merge=lfs -text
32
+ *.zst filter=lfs diff=lfs merge=lfs -text
33
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,960 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ datasets:
3
+ - bigscience/xP3
4
+ - mc4
5
+ license: apache-2.0
6
+ language:
7
+ - af
8
+ - am
9
+ - ar
10
+ - az
11
+ - be
12
+ - bg
13
+ - bn
14
+ - ca
15
+ - ceb
16
+ - co
17
+ - cs
18
+ - cy
19
+ - da
20
+ - de
21
+ - el
22
+ - en
23
+ - eo
24
+ - es
25
+ - et
26
+ - eu
27
+ - fa
28
+ - fi
29
+ - fil
30
+ - fr
31
+ - fy
32
+ - ga
33
+ - gd
34
+ - gl
35
+ - gu
36
+ - ha
37
+ - haw
38
+ - hi
39
+ - hmn
40
+ - ht
41
+ - hu
42
+ - hy
43
+ - ig
44
+ - is
45
+ - it
46
+ - iw
47
+ - ja
48
+ - jv
49
+ - ka
50
+ - kk
51
+ - km
52
+ - kn
53
+ - ko
54
+ - ku
55
+ - ky
56
+ - la
57
+ - lb
58
+ - lo
59
+ - lt
60
+ - lv
61
+ - mg
62
+ - mi
63
+ - mk
64
+ - ml
65
+ - mn
66
+ - mr
67
+ - ms
68
+ - mt
69
+ - my
70
+ - ne
71
+ - nl
72
+ - 'no'
73
+ - ny
74
+ - pa
75
+ - pl
76
+ - ps
77
+ - pt
78
+ - ro
79
+ - ru
80
+ - sd
81
+ - si
82
+ - sk
83
+ - sl
84
+ - sm
85
+ - sn
86
+ - so
87
+ - sq
88
+ - sr
89
+ - st
90
+ - su
91
+ - sv
92
+ - sw
93
+ - ta
94
+ - te
95
+ - tg
96
+ - th
97
+ - tr
98
+ - uk
99
+ - und
100
+ - ur
101
+ - uz
102
+ - vi
103
+ - xh
104
+ - yi
105
+ - yo
106
+ - zh
107
+ - zu
108
+ tags:
109
+ - text2text-generation
110
+ widget:
111
+ - text: >-
112
+ <table> <tr> <th>Name</th> <th>Explanation</th> <th>Example models</th>
113
+ </tr> <tr> <td><a
114
+ href=https://huggingface.co/datasets/bigscience/xP3>xP3</a></t> <td>Mixture
115
+ of 13 training tasks in 46 languages with English prompts</td> <td><a
116
+ href=https://huggingface.co/bigscience/bloomz>bloomz</a> & <a
117
+ href=https://huggingface.co/bigscience/mt0-xxl>mt0-xxl</a></td> </tr> <tr>
118
+ <td><a href=https://huggingface.co/datasets/bigscience/xP3mt>xP3mt</a></t>
119
+ <td>Mixture of 13 training tasks in 46 languages with prompts in 20
120
+ languages (machine-translated from English)</td> <td><a
121
+ href=https://huggingface.co/bigscience/bloomz-mt>bloomz-mt</a> & <a
122
+ href=https://huggingface.co/bigscience/mt0-xxl-mt>mt0-xxl-mt</a></td> </tr>
123
+ <tr> <td><a
124
+ href=https://huggingface.co/datasets/bigscience/xP3all>xP3all</a></t>
125
+ <td>xP3 + our evaluation datasets adding an additional 3 tasks for a total
126
+ of 16 tasks in 46 languages with English prompts</td> <td></td> </tr> <tr>
127
+ <td><a
128
+ href=https://huggingface.co/datasets/bigscience/xP3megds>xP3megds</a></t>
129
+ <td><a
130
+ href=https://github.com/bigscience-workshop/Megatron-DeepSpeed>Megatron-DeepSpeed</a>
131
+ processed version of xP3</td> <td><a
132
+ href=https://huggingface.co/bigscience/bloomz>bloomz</a></td> </tr> <tr>
133
+ <td><a href=https://huggingface.co/datasets/Muennighoff/P3>P3</a></t>
134
+ <td>Repreprocessed version of the English-only <a
135
+ href=https://huggingface.co/datasets/bigscience/P3>P3</a> with 8 training
136
+ tasks</td> <td><a
137
+ href=https://huggingface.co/bigscience/bloomz-p3>bloomz-p3</a> & <a
138
+ href=https://huggingface.co/bigscience/mt0-xxl-p3>mt0-xxl-p3</a></td> </tr>
139
+ </table> Which dataset has the most tasks?
140
+ example_title: en-en struct-to-text
141
+ - text: Life is beautiful! Translate to Mongolian.
142
+ example_title: mn-en translation
143
+ - text: Le mot japonais «憂鬱» veut dire quoi en Odia?
144
+ example_title: jp-or-fr translation
145
+ - text: >-
146
+ Stell mir eine schwierige Quiz Frage bei der es um Astronomie geht. Bitte
147
+ stell die Frage auf Norwegisch.
148
+ example_title: de-nb quiz
149
+ - text: >-
150
+ We present BLOOMZ & mT0, a family of models capable of following human
151
+ instructions in dozens of languages zero-shot. We finetune BLOOM & mT5
152
+ pretrained multilingual language models on our crosslingual task mixture
153
+ (xP3) and find our resulting models capable of crosslingual generalization
154
+ to unseen tasks & languages. What are the keywords in Chinese?
155
+ example_title: zh-en keywords
156
+ - text: >-
157
+ 一个传奇的开端,一个不灭的神话,这不仅仅是一部电影,而是作为一个走进新时代的标签,永远彪炳史册。Would you rate the previous
158
+ review as positive, neutral or negative?
159
+ example_title: zh-en sentiment
160
+ - text: 一个传奇的开端,一个不灭的神话,这不仅仅是一部电影,而是作为一个走进新时代的标签,永远彪炳史册。你认为这句话的立场是赞扬、中立还是批评?
161
+ example_title: zh-zh sentiment
162
+ - text: Suggest at least five related search terms to "Mạng neural nhân tạo".
163
+ example_title: vi-en query
164
+ - text: >-
165
+ Proposez au moins cinq mots clés concernant «Réseau de neurones
166
+ artificiels».
167
+ example_title: fr-fr query
168
+ - text: Explain in a sentence in Telugu what is backpropagation in neural networks.
169
+ example_title: te-en qa
170
+ - text: Why is the sky blue?
171
+ example_title: en-en qa
172
+ - text: >-
173
+ Write a fairy tale about a troll saving a princess from a dangerous dragon.
174
+ The fairy tale is a masterpiece that has achieved praise worldwide and its
175
+ moral is "Heroes Come in All Shapes and Sizes". Story (in Spanish):
176
+ example_title: es-en fable
177
+ - text: >-
178
+ Write a fable about wood elves living in a forest that is suddenly invaded
179
+ by ogres. The fable is a masterpiece that has achieved praise worldwide and
180
+ its moral is "Violence is the last refuge of the incompetent". Fable (in
181
+ Hindi):
182
+ example_title: hi-en fable
183
+ model-index:
184
+ - name: mt0-xxl
185
+ results:
186
+ - task:
187
+ type: Coreference resolution
188
+ dataset:
189
+ type: winogrande
190
+ name: Winogrande XL (xl)
191
+ config: xl
192
+ split: validation
193
+ revision: a80f460359d1e9a67c006011c94de42a8759430c
194
+ metrics:
195
+ - type: Accuracy
196
+ value: 63.38
197
+ - task:
198
+ type: Coreference resolution
199
+ dataset:
200
+ type: Muennighoff/xwinograd
201
+ name: XWinograd (en)
202
+ config: en
203
+ split: test
204
+ revision: 9dd5ea5505fad86b7bedad667955577815300cee
205
+ metrics:
206
+ - type: Accuracy
207
+ value: 81.29
208
+ - task:
209
+ type: Coreference resolution
210
+ dataset:
211
+ type: Muennighoff/xwinograd
212
+ name: XWinograd (fr)
213
+ config: fr
214
+ split: test
215
+ revision: 9dd5ea5505fad86b7bedad667955577815300cee
216
+ metrics:
217
+ - type: Accuracy
218
+ value: 78.31
219
+ - task:
220
+ type: Coreference resolution
221
+ dataset:
222
+ type: Muennighoff/xwinograd
223
+ name: XWinograd (jp)
224
+ config: jp
225
+ split: test
226
+ revision: 9dd5ea5505fad86b7bedad667955577815300cee
227
+ metrics:
228
+ - type: Accuracy
229
+ value: 78.62
230
+ - task:
231
+ type: Coreference resolution
232
+ dataset:
233
+ type: Muennighoff/xwinograd
234
+ name: XWinograd (pt)
235
+ config: pt
236
+ split: test
237
+ revision: 9dd5ea5505fad86b7bedad667955577815300cee
238
+ metrics:
239
+ - type: Accuracy
240
+ value: 77.95
241
+ - task:
242
+ type: Coreference resolution
243
+ dataset:
244
+ type: Muennighoff/xwinograd
245
+ name: XWinograd (ru)
246
+ config: ru
247
+ split: test
248
+ revision: 9dd5ea5505fad86b7bedad667955577815300cee
249
+ metrics:
250
+ - type: Accuracy
251
+ value: 76.51
252
+ - task:
253
+ type: Coreference resolution
254
+ dataset:
255
+ type: Muennighoff/xwinograd
256
+ name: XWinograd (zh)
257
+ config: zh
258
+ split: test
259
+ revision: 9dd5ea5505fad86b7bedad667955577815300cee
260
+ metrics:
261
+ - type: Accuracy
262
+ value: 77.38
263
+ - task:
264
+ type: Natural language inference
265
+ dataset:
266
+ type: anli
267
+ name: ANLI (r1)
268
+ config: r1
269
+ split: validation
270
+ revision: 9dbd830a06fea8b1c49d6e5ef2004a08d9f45094
271
+ metrics:
272
+ - type: Accuracy
273
+ value: 49.5
274
+ - task:
275
+ type: Natural language inference
276
+ dataset:
277
+ type: anli
278
+ name: ANLI (r2)
279
+ config: r2
280
+ split: validation
281
+ revision: 9dbd830a06fea8b1c49d6e5ef2004a08d9f45094
282
+ metrics:
283
+ - type: Accuracy
284
+ value: 43
285
+ - task:
286
+ type: Natural language inference
287
+ dataset:
288
+ type: anli
289
+ name: ANLI (r3)
290
+ config: r3
291
+ split: validation
292
+ revision: 9dbd830a06fea8b1c49d6e5ef2004a08d9f45094
293
+ metrics:
294
+ - type: Accuracy
295
+ value: 46.08
296
+ - task:
297
+ type: Natural language inference
298
+ dataset:
299
+ type: super_glue
300
+ name: SuperGLUE (cb)
301
+ config: cb
302
+ split: validation
303
+ revision: 9e12063561e7e6c79099feb6d5a493142584e9e2
304
+ metrics:
305
+ - type: Accuracy
306
+ value: 85.71
307
+ - task:
308
+ type: Natural language inference
309
+ dataset:
310
+ type: super_glue
311
+ name: SuperGLUE (rte)
312
+ config: rte
313
+ split: validation
314
+ revision: 9e12063561e7e6c79099feb6d5a493142584e9e2
315
+ metrics:
316
+ - type: Accuracy
317
+ value: 85.56
318
+ - task:
319
+ type: Natural language inference
320
+ dataset:
321
+ type: xnli
322
+ name: XNLI (ar)
323
+ config: ar
324
+ split: validation
325
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
326
+ metrics:
327
+ - type: Accuracy
328
+ value: 57.91
329
+ - task:
330
+ type: Natural language inference
331
+ dataset:
332
+ type: xnli
333
+ name: XNLI (bg)
334
+ config: bg
335
+ split: validation
336
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
337
+ metrics:
338
+ - type: Accuracy
339
+ value: 59.88
340
+ - task:
341
+ type: Natural language inference
342
+ dataset:
343
+ type: xnli
344
+ name: XNLI (de)
345
+ config: de
346
+ split: validation
347
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
348
+ metrics:
349
+ - type: Accuracy
350
+ value: 60.64
351
+ - task:
352
+ type: Natural language inference
353
+ dataset:
354
+ type: xnli
355
+ name: XNLI (el)
356
+ config: el
357
+ split: validation
358
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
359
+ metrics:
360
+ - type: Accuracy
361
+ value: 59
362
+ - task:
363
+ type: Natural language inference
364
+ dataset:
365
+ type: xnli
366
+ name: XNLI (en)
367
+ config: en
368
+ split: validation
369
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
370
+ metrics:
371
+ - type: Accuracy
372
+ value: 62.01
373
+ - task:
374
+ type: Natural language inference
375
+ dataset:
376
+ type: xnli
377
+ name: XNLI (es)
378
+ config: es
379
+ split: validation
380
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
381
+ metrics:
382
+ - type: Accuracy
383
+ value: 60.8
384
+ - task:
385
+ type: Natural language inference
386
+ dataset:
387
+ type: xnli
388
+ name: XNLI (fr)
389
+ config: fr
390
+ split: validation
391
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
392
+ metrics:
393
+ - type: Accuracy
394
+ value: 59.88
395
+ - task:
396
+ type: Natural language inference
397
+ dataset:
398
+ type: xnli
399
+ name: XNLI (hi)
400
+ config: hi
401
+ split: validation
402
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
403
+ metrics:
404
+ - type: Accuracy
405
+ value: 57.23
406
+ - task:
407
+ type: Natural language inference
408
+ dataset:
409
+ type: xnli
410
+ name: XNLI (ru)
411
+ config: ru
412
+ split: validation
413
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
414
+ metrics:
415
+ - type: Accuracy
416
+ value: 58.88
417
+ - task:
418
+ type: Natural language inference
419
+ dataset:
420
+ type: xnli
421
+ name: XNLI (sw)
422
+ config: sw
423
+ split: validation
424
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
425
+ metrics:
426
+ - type: Accuracy
427
+ value: 55.66
428
+ - task:
429
+ type: Natural language inference
430
+ dataset:
431
+ type: xnli
432
+ name: XNLI (th)
433
+ config: th
434
+ split: validation
435
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
436
+ metrics:
437
+ - type: Accuracy
438
+ value: 57.43
439
+ - task:
440
+ type: Natural language inference
441
+ dataset:
442
+ type: xnli
443
+ name: XNLI (tr)
444
+ config: tr
445
+ split: validation
446
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
447
+ metrics:
448
+ - type: Accuracy
449
+ value: 57.59
450
+ - task:
451
+ type: Natural language inference
452
+ dataset:
453
+ type: xnli
454
+ name: XNLI (ur)
455
+ config: ur
456
+ split: validation
457
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
458
+ metrics:
459
+ - type: Accuracy
460
+ value: 55.42
461
+ - task:
462
+ type: Natural language inference
463
+ dataset:
464
+ type: xnli
465
+ name: XNLI (vi)
466
+ config: vi
467
+ split: validation
468
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
469
+ metrics:
470
+ - type: Accuracy
471
+ value: 58.51
472
+ - task:
473
+ type: Natural language inference
474
+ dataset:
475
+ type: xnli
476
+ name: XNLI (zh)
477
+ config: zh
478
+ split: validation
479
+ revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
480
+ metrics:
481
+ - type: Accuracy
482
+ value: 59.12
483
+ - task:
484
+ type: Sentence completion
485
+ dataset:
486
+ type: story_cloze
487
+ name: StoryCloze (2016)
488
+ config: '2016'
489
+ split: validation
490
+ revision: e724c6f8cdf7c7a2fb229d862226e15b023ee4db
491
+ metrics:
492
+ - type: Accuracy
493
+ value: 96.04
494
+ - task:
495
+ type: Sentence completion
496
+ dataset:
497
+ type: super_glue
498
+ name: SuperGLUE (copa)
499
+ config: copa
500
+ split: validation
501
+ revision: 9e12063561e7e6c79099feb6d5a493142584e9e2
502
+ metrics:
503
+ - type: Accuracy
504
+ value: 93
505
+ - task:
506
+ type: Sentence completion
507
+ dataset:
508
+ type: xcopa
509
+ name: XCOPA (et)
510
+ config: et
511
+ split: validation
512
+ revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
513
+ metrics:
514
+ - type: Accuracy
515
+ value: 79
516
+ - task:
517
+ type: Sentence completion
518
+ dataset:
519
+ type: xcopa
520
+ name: XCOPA (ht)
521
+ config: ht
522
+ split: validation
523
+ revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
524
+ metrics:
525
+ - type: Accuracy
526
+ value: 81
527
+ - task:
528
+ type: Sentence completion
529
+ dataset:
530
+ type: xcopa
531
+ name: XCOPA (id)
532
+ config: id
533
+ split: validation
534
+ revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
535
+ metrics:
536
+ - type: Accuracy
537
+ value: 92
538
+ - task:
539
+ type: Sentence completion
540
+ dataset:
541
+ type: xcopa
542
+ name: XCOPA (it)
543
+ config: it
544
+ split: validation
545
+ revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
546
+ metrics:
547
+ - type: Accuracy
548
+ value: 90
549
+ - task:
550
+ type: Sentence completion
551
+ dataset:
552
+ type: xcopa
553
+ name: XCOPA (qu)
554
+ config: qu
555
+ split: validation
556
+ revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
557
+ metrics:
558
+ - type: Accuracy
559
+ value: 59
560
+ - task:
561
+ type: Sentence completion
562
+ dataset:
563
+ type: xcopa
564
+ name: XCOPA (sw)
565
+ config: sw
566
+ split: validation
567
+ revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
568
+ metrics:
569
+ - type: Accuracy
570
+ value: 79
571
+ - task:
572
+ type: Sentence completion
573
+ dataset:
574
+ type: xcopa
575
+ name: XCOPA (ta)
576
+ config: ta
577
+ split: validation
578
+ revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
579
+ metrics:
580
+ - type: Accuracy
581
+ value: 84
582
+ - task:
583
+ type: Sentence completion
584
+ dataset:
585
+ type: xcopa
586
+ name: XCOPA (th)
587
+ config: th
588
+ split: validation
589
+ revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
590
+ metrics:
591
+ - type: Accuracy
592
+ value: 77
593
+ - task:
594
+ type: Sentence completion
595
+ dataset:
596
+ type: xcopa
597
+ name: XCOPA (tr)
598
+ config: tr
599
+ split: validation
600
+ revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
601
+ metrics:
602
+ - type: Accuracy
603
+ value: 79
604
+ - task:
605
+ type: Sentence completion
606
+ dataset:
607
+ type: xcopa
608
+ name: XCOPA (vi)
609
+ config: vi
610
+ split: validation
611
+ revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
612
+ metrics:
613
+ - type: Accuracy
614
+ value: 88
615
+ - task:
616
+ type: Sentence completion
617
+ dataset:
618
+ type: xcopa
619
+ name: XCOPA (zh)
620
+ config: zh
621
+ split: validation
622
+ revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
623
+ metrics:
624
+ - type: Accuracy
625
+ value: 89
626
+ - task:
627
+ type: Sentence completion
628
+ dataset:
629
+ type: Muennighoff/xstory_cloze
630
+ name: XStoryCloze (ar)
631
+ config: ar
632
+ split: validation
633
+ revision: 8bb76e594b68147f1a430e86829d07189622b90d
634
+ metrics:
635
+ - type: Accuracy
636
+ value: 91.07
637
+ - task:
638
+ type: Sentence completion
639
+ dataset:
640
+ type: Muennighoff/xstory_cloze
641
+ name: XStoryCloze (es)
642
+ config: es
643
+ split: validation
644
+ revision: 8bb76e594b68147f1a430e86829d07189622b90d
645
+ metrics:
646
+ - type: Accuracy
647
+ value: 92.52
648
+ - task:
649
+ type: Sentence completion
650
+ dataset:
651
+ type: Muennighoff/xstory_cloze
652
+ name: XStoryCloze (eu)
653
+ config: eu
654
+ split: validation
655
+ revision: 8bb76e594b68147f1a430e86829d07189622b90d
656
+ metrics:
657
+ - type: Accuracy
658
+ value: 90.6
659
+ - task:
660
+ type: Sentence completion
661
+ dataset:
662
+ type: Muennighoff/xstory_cloze
663
+ name: XStoryCloze (hi)
664
+ config: hi
665
+ split: validation
666
+ revision: 8bb76e594b68147f1a430e86829d07189622b90d
667
+ metrics:
668
+ - type: Accuracy
669
+ value: 92.32
670
+ - task:
671
+ type: Sentence completion
672
+ dataset:
673
+ type: Muennighoff/xstory_cloze
674
+ name: XStoryCloze (id)
675
+ config: id
676
+ split: validation
677
+ revision: 8bb76e594b68147f1a430e86829d07189622b90d
678
+ metrics:
679
+ - type: Accuracy
680
+ value: 93.51
681
+ - task:
682
+ type: Sentence completion
683
+ dataset:
684
+ type: Muennighoff/xstory_cloze
685
+ name: XStoryCloze (my)
686
+ config: my
687
+ split: validation
688
+ revision: 8bb76e594b68147f1a430e86829d07189622b90d
689
+ metrics:
690
+ - type: Accuracy
691
+ value: 87.49
692
+ - task:
693
+ type: Sentence completion
694
+ dataset:
695
+ type: Muennighoff/xstory_cloze
696
+ name: XStoryCloze (ru)
697
+ config: ru
698
+ split: validation
699
+ revision: 8bb76e594b68147f1a430e86829d07189622b90d
700
+ metrics:
701
+ - type: Accuracy
702
+ value: 91.4
703
+ - task:
704
+ type: Sentence completion
705
+ dataset:
706
+ type: Muennighoff/xstory_cloze
707
+ name: XStoryCloze (sw)
708
+ config: sw
709
+ split: validation
710
+ revision: 8bb76e594b68147f1a430e86829d07189622b90d
711
+ metrics:
712
+ - type: Accuracy
713
+ value: 89.41
714
+ - task:
715
+ type: Sentence completion
716
+ dataset:
717
+ type: Muennighoff/xstory_cloze
718
+ name: XStoryCloze (te)
719
+ config: te
720
+ split: validation
721
+ revision: 8bb76e594b68147f1a430e86829d07189622b90d
722
+ metrics:
723
+ - type: Accuracy
724
+ value: 90.54
725
+ - task:
726
+ type: Sentence completion
727
+ dataset:
728
+ type: Muennighoff/xstory_cloze
729
+ name: XStoryCloze (zh)
730
+ config: zh
731
+ split: validation
732
+ revision: 8bb76e594b68147f1a430e86829d07189622b90d
733
+ metrics:
734
+ - type: Accuracy
735
+ value: 93.85
736
+ pipeline_tag: text2text-generation
737
+ ---
738
+
739
+ ![xmtf](https://github.com/bigscience-workshop/xmtf/blob/master/xmtf_banner.png?raw=true)
740
+
741
+ # Table of Contents
742
+
743
+ 1. [Model Summary](#model-summary)
744
+ 2. [Use](#use)
745
+ 3. [Limitations](#limitations)
746
+ 4. [Training](#training)
747
+ 5. [Evaluation](#evaluation)
748
+ 6. [Citation](#citation)
749
+
750
+ # Model Summary
751
+
752
+ > We present BLOOMZ & mT0, a family of models capable of following human instructions in dozens of languages zero-shot. We finetune BLOOM & mT5 pretrained multilingual language models on our crosslingual task mixture (xP3) and find our resulting models capable of crosslingual generalization to unseen tasks & languages.
753
+
754
+ - **Repository:** [bigscience-workshop/xmtf](https://github.com/bigscience-workshop/xmtf)
755
+ - **Paper:** [Crosslingual Generalization through Multitask Finetuning](https://arxiv.org/abs/2211.01786)
756
+ - **Point of Contact:** [Niklas Muennighoff](mailto:niklas@hf.co)
757
+ - **Languages:** Refer to [mc4](https://huggingface.co/datasets/mc4) for pretraining & [xP3](https://huggingface.co/datasets/bigscience/xP3) for finetuning language proportions. It understands both pretraining & finetuning languages.
758
+ - **BLOOMZ & mT0 Model Family:**
759
+
760
+ <div class="max-w-full overflow-auto">
761
+ <table>
762
+ <tr>
763
+ <th colspan="12">Multitask finetuned on <a style="font-weight:bold" href=https://huggingface.co/datasets/bigscience/xP3>xP3</a>. Recommended for prompting in English.</th>
764
+ </tr>
765
+ <tr>
766
+ <td>Parameters</td>
767
+ <td>300M</td>
768
+ <td>580M</td>
769
+ <td>1.2B</td>
770
+ <td>3.7B</td>
771
+ <td>13B</td>
772
+ <td>560M</td>
773
+ <td>1.1B</td>
774
+ <td>1.7B</td>
775
+ <td>3B</td>
776
+ <td>7.1B</td>
777
+ <td>176B</td>
778
+ </tr>
779
+ <tr>
780
+ <td>Finetuned Model</td>
781
+ <td><a href=https://huggingface.co/bigscience/mt0-small>mt0-small</a></td>
782
+ <td><a href=https://huggingface.co/bigscience/mt0-base>mt0-base</a></td>
783
+ <td><a href=https://huggingface.co/bigscience/mt0-large>mt0-large</a></td>
784
+ <td><a href=https://huggingface.co/bigscience/mt0-xl>mt0-xl</a></td>
785
+ <td><a href=https://huggingface.co/bigscience/mt0-xxl>mt0-xxl</a></td>
786
+ <td><a href=https://huggingface.co/bigscience/bloomz-560m>bloomz-560m</a></td>
787
+ <td><a href=https://huggingface.co/bigscience/bloomz-1b1>bloomz-1b1</a></td>
788
+ <td><a href=https://huggingface.co/bigscience/bloomz-1b7>bloomz-1b7</a></td>
789
+ <td><a href=https://huggingface.co/bigscience/bloomz-3b>bloomz-3b</a></td>
790
+ <td><a href=https://huggingface.co/bigscience/bloomz-7b1>bloomz-7b1</a></td>
791
+ <td><a href=https://huggingface.co/bigscience/bloomz>bloomz</a></td>
792
+ </tr>
793
+ </tr>
794
+ <tr>
795
+ <th colspan="12">Multitask finetuned on <a style="font-weight:bold" href=https://huggingface.co/datasets/bigscience/xP3mt>xP3mt</a>. Recommended for prompting in non-English.</th>
796
+ </tr>
797
+ <tr>
798
+ <td>Finetuned Model</td>
799
+ <td></td>
800
+ <td></td>
801
+ <td></td>
802
+ <td></td>
803
+ <td><a href=https://huggingface.co/bigscience/mt0-xxl-mt>mt0-xxl-mt</a></td>
804
+ <td></td>
805
+ <td></td>
806
+ <td></td>
807
+ <td></td>
808
+ <td><a href=https://huggingface.co/bigscience/bloomz-7b1-mt>bloomz-7b1-mt</a></td>
809
+ <td><a href=https://huggingface.co/bigscience/bloomz-mt>bloomz-mt</a></td>
810
+ </tr>
811
+ <tr><th colspan="12">Multitask finetuned on <a style="font-weight:bold" href=https://huggingface.co/datasets/Muennighoff/P3>P3</a>. Released for research purposes only. Strictly inferior to above models!</th>
812
+ </tr>
813
+ <tr>
814
+ <td>Finetuned Model</td>
815
+ <td></td>
816
+ <td></td>
817
+ <td></td>
818
+ <td></td>
819
+ <td><a href=https://huggingface.co/bigscience/mt0-xxl-p3>mt0-xxl-p3</a></td>
820
+ <td></td>
821
+ <td></td>
822
+ <td></td>
823
+ <td></td>
824
+ <td><a href=https://huggingface.co/bigscience/bloomz-7b1-p3>bloomz-7b1-p3</a></td>
825
+ <td><a href=https://huggingface.co/bigscience/bloomz-p3>bloomz-p3</a></td>
826
+ </tr>
827
+ <tr><th colspan="12">Original pretrained checkpoints. Not recommended.</th></tr>
828
+ <tr>
829
+ <td>Pretrained Model</td>
830
+ <td><a href=https://huggingface.co/google/mt5-small>mt5-small</a></td>
831
+ <td><a href=https://huggingface.co/google/mt5-base>mt5-base</a></td>
832
+ <td><a href=https://huggingface.co/google/mt5-large>mt5-large</a></td>
833
+ <td><a href=https://huggingface.co/google/mt5-xl>mt5-xl</a></td>
834
+ <td><a href=https://huggingface.co/google/mt5-xxl>mt5-xxl</a></td>
835
+ <td><a href=https://huggingface.co/bigscience/bloom-560m>bloom-560m</a></td>
836
+ <td><a href=https://huggingface.co/bigscience/bloom-1b1>bloom-1b1</a></td>
837
+ <td><a href=https://huggingface.co/bigscience/bloom-1b7>bloom-1b7</a></td>
838
+ <td><a href=https://huggingface.co/bigscience/bloom-3b>bloom-3b</a></td>
839
+ <td><a href=https://huggingface.co/bigscience/bloom-7b1>bloom-7b1</a></td>
840
+ <td><a href=https://huggingface.co/bigscience/bloom>bloom</a></td>
841
+ </tr>
842
+ </table>
843
+ </div>
844
+
845
+
846
+ # Use
847
+
848
+ ## Intended use
849
+
850
+ We recommend using the model to perform tasks expressed in natural language. For example, given the prompt "*Translate to English: Je t’aime.*", the model will most likely answer "*I love you.*". Some prompt ideas from our paper:
851
+ - 一个传奇的开端,一个不灭的神话,这不仅仅是一部电影,而是作为一个走进新时代的标签,永远彪炳史册。你认为这句话的立场是赞扬、中立还是批评?
852
+ - Suggest at least five related search terms to "Mạng neural nhân tạo".
853
+ - Write a fairy tale about a troll saving a princess from a dangerous dragon. The fairy tale is a masterpiece that has achieved praise worldwide and its moral is "Heroes Come in All Shapes and Sizes". Story (in Spanish):
854
+ - Explain in a sentence in Telugu what is backpropagation in neural networks.
855
+
856
+ **Feel free to share your generations in the Community tab!**
857
+
858
+ ## How to use
859
+
860
+ ### CPU
861
+
862
+ <details>
863
+ <summary> Click to expand </summary>
864
+
865
+ ```python
866
+ # pip install -q transformers
867
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
868
+
869
+ checkpoint = "bigscience/mt0-xxl"
870
+
871
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
872
+ model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
873
+
874
+ inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt")
875
+ outputs = model.generate(inputs)
876
+ print(tokenizer.decode(outputs[0]))
877
+ ```
878
+
879
+ </details>
880
+
881
+ ### GPU
882
+
883
+ <details>
884
+ <summary> Click to expand </summary>
885
+
886
+ ```python
887
+ # pip install -q transformers accelerate
888
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
889
+
890
+ checkpoint = "bigscience/mt0-xxl"
891
+
892
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
893
+ model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
894
+
895
+ inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt").to("cuda")
896
+ outputs = model.generate(inputs)
897
+ print(tokenizer.decode(outputs[0]))
898
+ ```
899
+
900
+ </details>
901
+
902
+ ### GPU in 8bit
903
+
904
+ <details>
905
+ <summary> Click to expand </summary>
906
+
907
+ ```python
908
+ # pip install -q transformers accelerate bitsandbytes
909
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
910
+
911
+ checkpoint = "bigscience/mt0-xxl"
912
+
913
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
914
+ model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="auto", load_in_8bit=True)
915
+
916
+ inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt").to("cuda")
917
+ outputs = model.generate(inputs)
918
+ print(tokenizer.decode(outputs[0]))
919
+ ```
920
+
921
+ </details>
922
+
923
+ <!-- Necessary for whitespace -->
924
+ ###
925
+
926
+ # Limitations
927
+
928
+ **Prompt Engineering:** The performance may vary depending on the prompt. For BLOOMZ models, we recommend making it very clear when the input stops to avoid the model trying to continue it. For example, the prompt "*Translate to English: Je t'aime*" without the full stop (.) at the end, may result in the model trying to continue the French sentence. Better prompts are e.g. "*Translate to English: Je t'aime.*", "*Translate to English: Je t'aime. Translation:*", "*What is "Je t'aime." in English?*", where it is clear for the model when it should answer. Further, we recommend providing the model as much context as possible. For example, if you want it to answer in Telugu, then tell the model, e.g. "*Explain in a sentence in Telugu what is backpropagation in neural networks.*".
929
+
930
+ # Training
931
+
932
+ ## Model
933
+
934
+ - **Architecture:** Same as [mt5-xxl](https://huggingface.co/google/mt5-xxl), also refer to the `config.json` file
935
+ - **Finetuning steps:** 7000
936
+ - **Finetuning tokens:** 1.29 billion
937
+ - **Precision:** bfloat16
938
+
939
+ ## Hardware
940
+
941
+ - **TPUs:** TPUv4-256
942
+
943
+ ## Software
944
+
945
+ - **Orchestration:** [T5X](https://github.com/google-research/t5x)
946
+ - **Neural networks:** [Jax](https://github.com/google/jax)
947
+
948
+ # Evaluation
949
+
950
+ We refer to Table 7 from our [paper](https://arxiv.org/abs/2211.01786) & [bigscience/evaluation-results](https://huggingface.co/datasets/bigscience/evaluation-results) for zero-shot results on unseen tasks. The sidebar reports zero-shot performance of the best prompt per dataset config.
951
+
952
+ # Citation
953
+ ```bibtex
954
+ @article{muennighoff2022crosslingual,
955
+ title={Crosslingual generalization through multitask finetuning},
956
+ author={Muennighoff, Niklas and Wang, Thomas and Sutawika, Lintang and Roberts, Adam and Biderman, Stella and Scao, Teven Le and Bari, M Saiful and Shen, Sheng and Yong, Zheng-Xin and Schoelkopf, Hailey and others},
957
+ journal={arXiv preprint arXiv:2211.01786},
958
+ year={2022}
959
+ }
960
+ ```
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/mt5-xxl",
3
+ "architectures": [
4
+ "MT5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 10240,
7
+ "d_kv": 64,
8
+ "d_model": 4096,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "gelu_new",
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "gated-gelu",
14
+ "initializer_factor": 1.0,
15
+ "is_encoder_decoder": true,
16
+ "is_gated_act": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "model_type": "mt5",
19
+ "num_decoder_layers": 24,
20
+ "num_heads": 64,
21
+ "num_layers": 24,
22
+ "output_past": true,
23
+ "pad_token_id": 0,
24
+ "relative_attention_max_distance": 128,
25
+ "relative_attention_num_buckets": 32,
26
+ "tie_word_embeddings": false,
27
+ "tokenizer_class": "T5Tokenizer",
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.23.1",
30
+ "use_cache": true,
31
+ "vocab_size": 250112
32
+ }
model-00001-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d248e71582e351642d5e44e381e9be233ee2b04eb5833a7603d69ed51ac39585
3
+ size 9936568872
model-00002-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eda538fe84abd7cab450a68c69a76588dc7a6c34ddc2418b6ab0e514bcdc0952
3
+ size 9865443384
model-00003-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:881ec52a2544492e0a96949cef398ba8c9b2aa37a33618a0c37f418fa259a94e
3
+ size 9869476808
model-00004-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fee75c951a13df924f0d496087fb477c4c72ac1c6a26f4b860ceaa825014cb87
3
+ size 9999712688
model-00005-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:644f87dc1851d7c073cb46747fb0fb72738174e93ed0e07fd7c8e6aeca002f3d
3
+ size 9999712416
model-00006-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:287e19e2e0b91211df7cf90d8760e60572ca04b1ada82110e582a4581baa9255
3
+ size 6111219176
model.safetensors.index.json ADDED
@@ -0,0 +1,566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 55782064128
4
+ },
5
+ "weight_map": {
6
+ "decoder.block.0.layer.0.SelfAttention.k.weight": "model-00003-of-00006.safetensors",
7
+ "decoder.block.0.layer.0.SelfAttention.o.weight": "model-00003-of-00006.safetensors",
8
+ "decoder.block.0.layer.0.SelfAttention.q.weight": "model-00003-of-00006.safetensors",
9
+ "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00003-of-00006.safetensors",
10
+ "decoder.block.0.layer.0.SelfAttention.v.weight": "model-00003-of-00006.safetensors",
11
+ "decoder.block.0.layer.0.layer_norm.weight": "model-00003-of-00006.safetensors",
12
+ "decoder.block.0.layer.1.EncDecAttention.k.weight": "model-00003-of-00006.safetensors",
13
+ "decoder.block.0.layer.1.EncDecAttention.o.weight": "model-00003-of-00006.safetensors",
14
+ "decoder.block.0.layer.1.EncDecAttention.q.weight": "model-00003-of-00006.safetensors",
15
+ "decoder.block.0.layer.1.EncDecAttention.v.weight": "model-00003-of-00006.safetensors",
16
+ "decoder.block.0.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
17
+ "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
18
+ "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
19
+ "decoder.block.0.layer.2.DenseReluDense.wo.weight": "model-00003-of-00006.safetensors",
20
+ "decoder.block.0.layer.2.layer_norm.weight": "model-00003-of-00006.safetensors",
21
+ "decoder.block.1.layer.0.SelfAttention.k.weight": "model-00003-of-00006.safetensors",
22
+ "decoder.block.1.layer.0.SelfAttention.o.weight": "model-00003-of-00006.safetensors",
23
+ "decoder.block.1.layer.0.SelfAttention.q.weight": "model-00003-of-00006.safetensors",
24
+ "decoder.block.1.layer.0.SelfAttention.v.weight": "model-00003-of-00006.safetensors",
25
+ "decoder.block.1.layer.0.layer_norm.weight": "model-00003-of-00006.safetensors",
26
+ "decoder.block.1.layer.1.EncDecAttention.k.weight": "model-00003-of-00006.safetensors",
27
+ "decoder.block.1.layer.1.EncDecAttention.o.weight": "model-00003-of-00006.safetensors",
28
+ "decoder.block.1.layer.1.EncDecAttention.q.weight": "model-00003-of-00006.safetensors",
29
+ "decoder.block.1.layer.1.EncDecAttention.v.weight": "model-00003-of-00006.safetensors",
30
+ "decoder.block.1.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
31
+ "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
32
+ "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
33
+ "decoder.block.1.layer.2.DenseReluDense.wo.weight": "model-00003-of-00006.safetensors",
34
+ "decoder.block.1.layer.2.layer_norm.weight": "model-00003-of-00006.safetensors",
35
+ "decoder.block.10.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
36
+ "decoder.block.10.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
37
+ "decoder.block.10.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
38
+ "decoder.block.10.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
39
+ "decoder.block.10.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
40
+ "decoder.block.10.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
41
+ "decoder.block.10.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
42
+ "decoder.block.10.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
43
+ "decoder.block.10.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
44
+ "decoder.block.10.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
45
+ "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
46
+ "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
47
+ "decoder.block.10.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
48
+ "decoder.block.10.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
49
+ "decoder.block.11.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
50
+ "decoder.block.11.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
51
+ "decoder.block.11.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
52
+ "decoder.block.11.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
53
+ "decoder.block.11.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
54
+ "decoder.block.11.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
55
+ "decoder.block.11.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
56
+ "decoder.block.11.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
57
+ "decoder.block.11.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
58
+ "decoder.block.11.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
59
+ "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
60
+ "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
61
+ "decoder.block.11.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
62
+ "decoder.block.11.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
63
+ "decoder.block.12.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
64
+ "decoder.block.12.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
65
+ "decoder.block.12.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
66
+ "decoder.block.12.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
67
+ "decoder.block.12.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
68
+ "decoder.block.12.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
69
+ "decoder.block.12.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
70
+ "decoder.block.12.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
71
+ "decoder.block.12.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
72
+ "decoder.block.12.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
73
+ "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
74
+ "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
75
+ "decoder.block.12.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
76
+ "decoder.block.12.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
77
+ "decoder.block.13.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
78
+ "decoder.block.13.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
79
+ "decoder.block.13.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
80
+ "decoder.block.13.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
81
+ "decoder.block.13.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
82
+ "decoder.block.13.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
83
+ "decoder.block.13.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
84
+ "decoder.block.13.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
85
+ "decoder.block.13.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
86
+ "decoder.block.13.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
87
+ "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
88
+ "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
89
+ "decoder.block.13.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
90
+ "decoder.block.13.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
91
+ "decoder.block.14.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
92
+ "decoder.block.14.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
93
+ "decoder.block.14.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
94
+ "decoder.block.14.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
95
+ "decoder.block.14.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
96
+ "decoder.block.14.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
97
+ "decoder.block.14.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
98
+ "decoder.block.14.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
99
+ "decoder.block.14.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
100
+ "decoder.block.14.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
101
+ "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
102
+ "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
103
+ "decoder.block.14.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
104
+ "decoder.block.14.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
105
+ "decoder.block.15.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
106
+ "decoder.block.15.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
107
+ "decoder.block.15.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
108
+ "decoder.block.15.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
109
+ "decoder.block.15.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
110
+ "decoder.block.15.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
111
+ "decoder.block.15.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
112
+ "decoder.block.15.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
113
+ "decoder.block.15.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
114
+ "decoder.block.15.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
115
+ "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
116
+ "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
117
+ "decoder.block.15.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
118
+ "decoder.block.15.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
119
+ "decoder.block.16.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
120
+ "decoder.block.16.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
121
+ "decoder.block.16.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
122
+ "decoder.block.16.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
123
+ "decoder.block.16.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
124
+ "decoder.block.16.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
125
+ "decoder.block.16.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
126
+ "decoder.block.16.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
127
+ "decoder.block.16.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
128
+ "decoder.block.16.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
129
+ "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
130
+ "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
131
+ "decoder.block.16.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
132
+ "decoder.block.16.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
133
+ "decoder.block.17.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
134
+ "decoder.block.17.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
135
+ "decoder.block.17.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
136
+ "decoder.block.17.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
137
+ "decoder.block.17.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
138
+ "decoder.block.17.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
139
+ "decoder.block.17.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
140
+ "decoder.block.17.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
141
+ "decoder.block.17.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
142
+ "decoder.block.17.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
143
+ "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
144
+ "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
145
+ "decoder.block.17.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
146
+ "decoder.block.17.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
147
+ "decoder.block.18.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
148
+ "decoder.block.18.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
149
+ "decoder.block.18.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
150
+ "decoder.block.18.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
151
+ "decoder.block.18.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
152
+ "decoder.block.18.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
153
+ "decoder.block.18.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
154
+ "decoder.block.18.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
155
+ "decoder.block.18.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
156
+ "decoder.block.18.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
157
+ "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
158
+ "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
159
+ "decoder.block.18.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
160
+ "decoder.block.18.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
161
+ "decoder.block.19.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
162
+ "decoder.block.19.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
163
+ "decoder.block.19.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
164
+ "decoder.block.19.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
165
+ "decoder.block.19.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
166
+ "decoder.block.19.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
167
+ "decoder.block.19.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
168
+ "decoder.block.19.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
169
+ "decoder.block.19.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
170
+ "decoder.block.19.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
171
+ "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
172
+ "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
173
+ "decoder.block.19.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
174
+ "decoder.block.19.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
175
+ "decoder.block.2.layer.0.SelfAttention.k.weight": "model-00003-of-00006.safetensors",
176
+ "decoder.block.2.layer.0.SelfAttention.o.weight": "model-00003-of-00006.safetensors",
177
+ "decoder.block.2.layer.0.SelfAttention.q.weight": "model-00003-of-00006.safetensors",
178
+ "decoder.block.2.layer.0.SelfAttention.v.weight": "model-00003-of-00006.safetensors",
179
+ "decoder.block.2.layer.0.layer_norm.weight": "model-00003-of-00006.safetensors",
180
+ "decoder.block.2.layer.1.EncDecAttention.k.weight": "model-00003-of-00006.safetensors",
181
+ "decoder.block.2.layer.1.EncDecAttention.o.weight": "model-00003-of-00006.safetensors",
182
+ "decoder.block.2.layer.1.EncDecAttention.q.weight": "model-00003-of-00006.safetensors",
183
+ "decoder.block.2.layer.1.EncDecAttention.v.weight": "model-00003-of-00006.safetensors",
184
+ "decoder.block.2.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
185
+ "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
186
+ "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
187
+ "decoder.block.2.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
188
+ "decoder.block.2.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
189
+ "decoder.block.20.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
190
+ "decoder.block.20.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
191
+ "decoder.block.20.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
192
+ "decoder.block.20.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
193
+ "decoder.block.20.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
194
+ "decoder.block.20.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
195
+ "decoder.block.20.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
196
+ "decoder.block.20.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
197
+ "decoder.block.20.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
198
+ "decoder.block.20.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
199
+ "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
200
+ "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
201
+ "decoder.block.20.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
202
+ "decoder.block.20.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
203
+ "decoder.block.21.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
204
+ "decoder.block.21.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
205
+ "decoder.block.21.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
206
+ "decoder.block.21.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
207
+ "decoder.block.21.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
208
+ "decoder.block.21.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
209
+ "decoder.block.21.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
210
+ "decoder.block.21.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
211
+ "decoder.block.21.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
212
+ "decoder.block.21.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
213
+ "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
214
+ "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
215
+ "decoder.block.21.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
216
+ "decoder.block.21.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
217
+ "decoder.block.22.layer.0.SelfAttention.k.weight": "model-00006-of-00006.safetensors",
218
+ "decoder.block.22.layer.0.SelfAttention.o.weight": "model-00006-of-00006.safetensors",
219
+ "decoder.block.22.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
220
+ "decoder.block.22.layer.0.SelfAttention.v.weight": "model-00006-of-00006.safetensors",
221
+ "decoder.block.22.layer.0.layer_norm.weight": "model-00006-of-00006.safetensors",
222
+ "decoder.block.22.layer.1.EncDecAttention.k.weight": "model-00006-of-00006.safetensors",
223
+ "decoder.block.22.layer.1.EncDecAttention.o.weight": "model-00006-of-00006.safetensors",
224
+ "decoder.block.22.layer.1.EncDecAttention.q.weight": "model-00006-of-00006.safetensors",
225
+ "decoder.block.22.layer.1.EncDecAttention.v.weight": "model-00006-of-00006.safetensors",
226
+ "decoder.block.22.layer.1.layer_norm.weight": "model-00006-of-00006.safetensors",
227
+ "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "model-00006-of-00006.safetensors",
228
+ "decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "model-00006-of-00006.safetensors",
229
+ "decoder.block.22.layer.2.DenseReluDense.wo.weight": "model-00006-of-00006.safetensors",
230
+ "decoder.block.22.layer.2.layer_norm.weight": "model-00006-of-00006.safetensors",
231
+ "decoder.block.23.layer.0.SelfAttention.k.weight": "model-00006-of-00006.safetensors",
232
+ "decoder.block.23.layer.0.SelfAttention.o.weight": "model-00006-of-00006.safetensors",
233
+ "decoder.block.23.layer.0.SelfAttention.q.weight": "model-00006-of-00006.safetensors",
234
+ "decoder.block.23.layer.0.SelfAttention.v.weight": "model-00006-of-00006.safetensors",
235
+ "decoder.block.23.layer.0.layer_norm.weight": "model-00006-of-00006.safetensors",
236
+ "decoder.block.23.layer.1.EncDecAttention.k.weight": "model-00006-of-00006.safetensors",
237
+ "decoder.block.23.layer.1.EncDecAttention.o.weight": "model-00006-of-00006.safetensors",
238
+ "decoder.block.23.layer.1.EncDecAttention.q.weight": "model-00006-of-00006.safetensors",
239
+ "decoder.block.23.layer.1.EncDecAttention.v.weight": "model-00006-of-00006.safetensors",
240
+ "decoder.block.23.layer.1.layer_norm.weight": "model-00006-of-00006.safetensors",
241
+ "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "model-00006-of-00006.safetensors",
242
+ "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "model-00006-of-00006.safetensors",
243
+ "decoder.block.23.layer.2.DenseReluDense.wo.weight": "model-00006-of-00006.safetensors",
244
+ "decoder.block.23.layer.2.layer_norm.weight": "model-00006-of-00006.safetensors",
245
+ "decoder.block.3.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
246
+ "decoder.block.3.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
247
+ "decoder.block.3.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
248
+ "decoder.block.3.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
249
+ "decoder.block.3.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
250
+ "decoder.block.3.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
251
+ "decoder.block.3.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
252
+ "decoder.block.3.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
253
+ "decoder.block.3.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
254
+ "decoder.block.3.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
255
+ "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
256
+ "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
257
+ "decoder.block.3.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
258
+ "decoder.block.3.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
259
+ "decoder.block.4.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
260
+ "decoder.block.4.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
261
+ "decoder.block.4.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
262
+ "decoder.block.4.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
263
+ "decoder.block.4.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
264
+ "decoder.block.4.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
265
+ "decoder.block.4.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
266
+ "decoder.block.4.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
267
+ "decoder.block.4.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
268
+ "decoder.block.4.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
269
+ "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
270
+ "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
271
+ "decoder.block.4.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
272
+ "decoder.block.4.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
273
+ "decoder.block.5.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
274
+ "decoder.block.5.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
275
+ "decoder.block.5.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
276
+ "decoder.block.5.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
277
+ "decoder.block.5.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
278
+ "decoder.block.5.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
279
+ "decoder.block.5.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
280
+ "decoder.block.5.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
281
+ "decoder.block.5.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
282
+ "decoder.block.5.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
283
+ "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
284
+ "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
285
+ "decoder.block.5.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
286
+ "decoder.block.5.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
287
+ "decoder.block.6.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
288
+ "decoder.block.6.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
289
+ "decoder.block.6.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
290
+ "decoder.block.6.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
291
+ "decoder.block.6.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
292
+ "decoder.block.6.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
293
+ "decoder.block.6.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
294
+ "decoder.block.6.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
295
+ "decoder.block.6.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
296
+ "decoder.block.6.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
297
+ "decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
298
+ "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
299
+ "decoder.block.6.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
300
+ "decoder.block.6.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
301
+ "decoder.block.7.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
302
+ "decoder.block.7.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
303
+ "decoder.block.7.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
304
+ "decoder.block.7.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
305
+ "decoder.block.7.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
306
+ "decoder.block.7.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
307
+ "decoder.block.7.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
308
+ "decoder.block.7.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
309
+ "decoder.block.7.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
310
+ "decoder.block.7.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
311
+ "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
312
+ "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
313
+ "decoder.block.7.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
314
+ "decoder.block.7.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
315
+ "decoder.block.8.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
316
+ "decoder.block.8.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
317
+ "decoder.block.8.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
318
+ "decoder.block.8.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
319
+ "decoder.block.8.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
320
+ "decoder.block.8.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
321
+ "decoder.block.8.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
322
+ "decoder.block.8.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
323
+ "decoder.block.8.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
324
+ "decoder.block.8.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
325
+ "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
326
+ "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
327
+ "decoder.block.8.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
328
+ "decoder.block.8.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
329
+ "decoder.block.9.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
330
+ "decoder.block.9.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
331
+ "decoder.block.9.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
332
+ "decoder.block.9.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
333
+ "decoder.block.9.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
334
+ "decoder.block.9.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
335
+ "decoder.block.9.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
336
+ "decoder.block.9.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
337
+ "decoder.block.9.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
338
+ "decoder.block.9.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
339
+ "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
340
+ "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
341
+ "decoder.block.9.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
342
+ "decoder.block.9.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
343
+ "decoder.embed_tokens.weight": "model-00003-of-00006.safetensors",
344
+ "decoder.final_layer_norm.weight": "model-00006-of-00006.safetensors",
345
+ "encoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
346
+ "encoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
347
+ "encoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
348
+ "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00006.safetensors",
349
+ "encoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
350
+ "encoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
351
+ "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
352
+ "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
353
+ "encoder.block.0.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
354
+ "encoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
355
+ "encoder.block.1.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
356
+ "encoder.block.1.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
357
+ "encoder.block.1.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
358
+ "encoder.block.1.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
359
+ "encoder.block.1.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
360
+ "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
361
+ "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
362
+ "encoder.block.1.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
363
+ "encoder.block.1.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
364
+ "encoder.block.10.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
365
+ "encoder.block.10.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
366
+ "encoder.block.10.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
367
+ "encoder.block.10.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
368
+ "encoder.block.10.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
369
+ "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
370
+ "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
371
+ "encoder.block.10.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
372
+ "encoder.block.10.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
373
+ "encoder.block.11.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
374
+ "encoder.block.11.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
375
+ "encoder.block.11.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
376
+ "encoder.block.11.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
377
+ "encoder.block.11.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
378
+ "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
379
+ "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
380
+ "encoder.block.11.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
381
+ "encoder.block.11.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
382
+ "encoder.block.12.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
383
+ "encoder.block.12.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
384
+ "encoder.block.12.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
385
+ "encoder.block.12.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
386
+ "encoder.block.12.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
387
+ "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
388
+ "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
389
+ "encoder.block.12.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
390
+ "encoder.block.12.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
391
+ "encoder.block.13.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
392
+ "encoder.block.13.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
393
+ "encoder.block.13.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
394
+ "encoder.block.13.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
395
+ "encoder.block.13.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
396
+ "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
397
+ "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
398
+ "encoder.block.13.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
399
+ "encoder.block.13.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
400
+ "encoder.block.14.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
401
+ "encoder.block.14.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
402
+ "encoder.block.14.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
403
+ "encoder.block.14.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
404
+ "encoder.block.14.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
405
+ "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
406
+ "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
407
+ "encoder.block.14.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
408
+ "encoder.block.14.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
409
+ "encoder.block.15.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
410
+ "encoder.block.15.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
411
+ "encoder.block.15.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
412
+ "encoder.block.15.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
413
+ "encoder.block.15.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
414
+ "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
415
+ "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
416
+ "encoder.block.15.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
417
+ "encoder.block.15.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
418
+ "encoder.block.16.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
419
+ "encoder.block.16.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
420
+ "encoder.block.16.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
421
+ "encoder.block.16.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
422
+ "encoder.block.16.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
423
+ "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
424
+ "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
425
+ "encoder.block.16.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
426
+ "encoder.block.16.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
427
+ "encoder.block.17.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
428
+ "encoder.block.17.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
429
+ "encoder.block.17.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
430
+ "encoder.block.17.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
431
+ "encoder.block.17.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
432
+ "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
433
+ "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
434
+ "encoder.block.17.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
435
+ "encoder.block.17.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
436
+ "encoder.block.18.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
437
+ "encoder.block.18.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
438
+ "encoder.block.18.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
439
+ "encoder.block.18.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
440
+ "encoder.block.18.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
441
+ "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
442
+ "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
443
+ "encoder.block.18.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
444
+ "encoder.block.18.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
445
+ "encoder.block.19.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
446
+ "encoder.block.19.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
447
+ "encoder.block.19.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
448
+ "encoder.block.19.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
449
+ "encoder.block.19.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
450
+ "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
451
+ "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
452
+ "encoder.block.19.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
453
+ "encoder.block.19.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
454
+ "encoder.block.2.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
455
+ "encoder.block.2.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
456
+ "encoder.block.2.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
457
+ "encoder.block.2.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
458
+ "encoder.block.2.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
459
+ "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
460
+ "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
461
+ "encoder.block.2.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
462
+ "encoder.block.2.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
463
+ "encoder.block.20.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
464
+ "encoder.block.20.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
465
+ "encoder.block.20.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
466
+ "encoder.block.20.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
467
+ "encoder.block.20.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
468
+ "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
469
+ "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
470
+ "encoder.block.20.layer.1.DenseReluDense.wo.weight": "model-00003-of-00006.safetensors",
471
+ "encoder.block.20.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
472
+ "encoder.block.21.layer.0.SelfAttention.k.weight": "model-00003-of-00006.safetensors",
473
+ "encoder.block.21.layer.0.SelfAttention.o.weight": "model-00003-of-00006.safetensors",
474
+ "encoder.block.21.layer.0.SelfAttention.q.weight": "model-00003-of-00006.safetensors",
475
+ "encoder.block.21.layer.0.SelfAttention.v.weight": "model-00003-of-00006.safetensors",
476
+ "encoder.block.21.layer.0.layer_norm.weight": "model-00003-of-00006.safetensors",
477
+ "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
478
+ "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
479
+ "encoder.block.21.layer.1.DenseReluDense.wo.weight": "model-00003-of-00006.safetensors",
480
+ "encoder.block.21.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
481
+ "encoder.block.22.layer.0.SelfAttention.k.weight": "model-00003-of-00006.safetensors",
482
+ "encoder.block.22.layer.0.SelfAttention.o.weight": "model-00003-of-00006.safetensors",
483
+ "encoder.block.22.layer.0.SelfAttention.q.weight": "model-00003-of-00006.safetensors",
484
+ "encoder.block.22.layer.0.SelfAttention.v.weight": "model-00003-of-00006.safetensors",
485
+ "encoder.block.22.layer.0.layer_norm.weight": "model-00003-of-00006.safetensors",
486
+ "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
487
+ "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
488
+ "encoder.block.22.layer.1.DenseReluDense.wo.weight": "model-00003-of-00006.safetensors",
489
+ "encoder.block.22.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
490
+ "encoder.block.23.layer.0.SelfAttention.k.weight": "model-00003-of-00006.safetensors",
491
+ "encoder.block.23.layer.0.SelfAttention.o.weight": "model-00003-of-00006.safetensors",
492
+ "encoder.block.23.layer.0.SelfAttention.q.weight": "model-00003-of-00006.safetensors",
493
+ "encoder.block.23.layer.0.SelfAttention.v.weight": "model-00003-of-00006.safetensors",
494
+ "encoder.block.23.layer.0.layer_norm.weight": "model-00003-of-00006.safetensors",
495
+ "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
496
+ "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
497
+ "encoder.block.23.layer.1.DenseReluDense.wo.weight": "model-00003-of-00006.safetensors",
498
+ "encoder.block.23.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
499
+ "encoder.block.3.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
500
+ "encoder.block.3.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
501
+ "encoder.block.3.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
502
+ "encoder.block.3.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
503
+ "encoder.block.3.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
504
+ "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
505
+ "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
506
+ "encoder.block.3.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
507
+ "encoder.block.3.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
508
+ "encoder.block.4.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
509
+ "encoder.block.4.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
510
+ "encoder.block.4.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
511
+ "encoder.block.4.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
512
+ "encoder.block.4.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
513
+ "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
514
+ "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
515
+ "encoder.block.4.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
516
+ "encoder.block.4.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
517
+ "encoder.block.5.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
518
+ "encoder.block.5.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
519
+ "encoder.block.5.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
520
+ "encoder.block.5.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
521
+ "encoder.block.5.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
522
+ "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
523
+ "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
524
+ "encoder.block.5.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
525
+ "encoder.block.5.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
526
+ "encoder.block.6.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
527
+ "encoder.block.6.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
528
+ "encoder.block.6.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
529
+ "encoder.block.6.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
530
+ "encoder.block.6.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
531
+ "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
532
+ "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
533
+ "encoder.block.6.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
534
+ "encoder.block.6.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
535
+ "encoder.block.7.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
536
+ "encoder.block.7.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
537
+ "encoder.block.7.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
538
+ "encoder.block.7.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
539
+ "encoder.block.7.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
540
+ "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
541
+ "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
542
+ "encoder.block.7.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
543
+ "encoder.block.7.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
544
+ "encoder.block.8.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
545
+ "encoder.block.8.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
546
+ "encoder.block.8.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
547
+ "encoder.block.8.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
548
+ "encoder.block.8.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
549
+ "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
550
+ "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
551
+ "encoder.block.8.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
552
+ "encoder.block.8.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
553
+ "encoder.block.9.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
554
+ "encoder.block.9.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
555
+ "encoder.block.9.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
556
+ "encoder.block.9.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
557
+ "encoder.block.9.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
558
+ "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
559
+ "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
560
+ "encoder.block.9.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
561
+ "encoder.block.9.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
562
+ "encoder.final_layer_norm.weight": "model-00003-of-00006.safetensors",
563
+ "lm_head.weight": "model-00006-of-00006.safetensors",
564
+ "shared.weight": "model-00001-of-00006.safetensors"
565
+ }
566
+ }
pytorch_model-00001-of-00006.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:295a276775d79359cfd243bd93c9e2c408a8e33718e5bee1d05625f026af6175
3
+ size 9936583612
pytorch_model-00002-of-00006.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c21533a6182886bec48cd0190952b3c5e71224873234135c2754f7c81d02ac82
3
+ size 9865466989
pytorch_model-00003-of-00006.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62cc874eb7f5cfa6fcbde4a19bab7de1f7bf8b47f0f01c45713927115c85a153
3
+ size 9869491791
pytorch_model-00004-of-00006.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36b2a5945f7c037b99eaf5ed891fc158b23a791b92042861a5298b0c8ec224be
3
+ size 9999740653
pytorch_model-00005-of-00006.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f769732a1c4ba3a9cbd9ea1c2701ade3cdf2a35f73e75ac77d0c26788a5d88f
3
+ size 9999739675
pytorch_model-00006-of-00006.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92679f99746d0e1082d7407091cb7f2a588d49b9bf13724f706e8912f86c5786
3
+ size 6111224758
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 55782064128
4
+ },
5
+ "weight_map": {
6
+ "decoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00006.bin",
7
+ "decoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00006.bin",
8
+ "decoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00006.bin",
9
+ "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00003-of-00006.bin",
10
+ "decoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00006.bin",
11
+ "decoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
12
+ "decoder.block.0.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00006.bin",
13
+ "decoder.block.0.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00006.bin",
14
+ "decoder.block.0.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00006.bin",
15
+ "decoder.block.0.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00006.bin",
16
+ "decoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
17
+ "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
18
+ "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
19
+ "decoder.block.0.layer.2.DenseReluDense.wo.weight": "pytorch_model-00003-of-00006.bin",
20
+ "decoder.block.0.layer.2.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
21
+ "decoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00006.bin",
22
+ "decoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00006.bin",
23
+ "decoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00006.bin",
24
+ "decoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00006.bin",
25
+ "decoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
26
+ "decoder.block.1.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00006.bin",
27
+ "decoder.block.1.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00006.bin",
28
+ "decoder.block.1.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00006.bin",
29
+ "decoder.block.1.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00006.bin",
30
+ "decoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
31
+ "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
32
+ "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
33
+ "decoder.block.1.layer.2.DenseReluDense.wo.weight": "pytorch_model-00003-of-00006.bin",
34
+ "decoder.block.1.layer.2.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
35
+ "decoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
36
+ "decoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
37
+ "decoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
38
+ "decoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
39
+ "decoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
40
+ "decoder.block.10.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
41
+ "decoder.block.10.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
42
+ "decoder.block.10.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
43
+ "decoder.block.10.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
44
+ "decoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
45
+ "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
46
+ "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
47
+ "decoder.block.10.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
48
+ "decoder.block.10.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
49
+ "decoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
50
+ "decoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
51
+ "decoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
52
+ "decoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
53
+ "decoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
54
+ "decoder.block.11.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
55
+ "decoder.block.11.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
56
+ "decoder.block.11.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
57
+ "decoder.block.11.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
58
+ "decoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
59
+ "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
60
+ "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
61
+ "decoder.block.11.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
62
+ "decoder.block.11.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
63
+ "decoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
64
+ "decoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
65
+ "decoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
66
+ "decoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
67
+ "decoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
68
+ "decoder.block.12.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
69
+ "decoder.block.12.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
70
+ "decoder.block.12.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
71
+ "decoder.block.12.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
72
+ "decoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
73
+ "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
74
+ "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
75
+ "decoder.block.12.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
76
+ "decoder.block.12.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
77
+ "decoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
78
+ "decoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
79
+ "decoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
80
+ "decoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
81
+ "decoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
82
+ "decoder.block.13.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
83
+ "decoder.block.13.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
84
+ "decoder.block.13.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
85
+ "decoder.block.13.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
86
+ "decoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
87
+ "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
88
+ "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
89
+ "decoder.block.13.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
90
+ "decoder.block.13.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
91
+ "decoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
92
+ "decoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
93
+ "decoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
94
+ "decoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
95
+ "decoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
96
+ "decoder.block.14.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
97
+ "decoder.block.14.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
98
+ "decoder.block.14.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
99
+ "decoder.block.14.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
100
+ "decoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
101
+ "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
102
+ "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
103
+ "decoder.block.14.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
104
+ "decoder.block.14.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
105
+ "decoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
106
+ "decoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
107
+ "decoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
108
+ "decoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
109
+ "decoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
110
+ "decoder.block.15.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
111
+ "decoder.block.15.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
112
+ "decoder.block.15.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
113
+ "decoder.block.15.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
114
+ "decoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
115
+ "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
116
+ "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
117
+ "decoder.block.15.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
118
+ "decoder.block.15.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
119
+ "decoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
120
+ "decoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
121
+ "decoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
122
+ "decoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
123
+ "decoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
124
+ "decoder.block.16.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
125
+ "decoder.block.16.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
126
+ "decoder.block.16.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
127
+ "decoder.block.16.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
128
+ "decoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
129
+ "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
130
+ "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
131
+ "decoder.block.16.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
132
+ "decoder.block.16.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
133
+ "decoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
134
+ "decoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
135
+ "decoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
136
+ "decoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
137
+ "decoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
138
+ "decoder.block.17.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
139
+ "decoder.block.17.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
140
+ "decoder.block.17.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
141
+ "decoder.block.17.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
142
+ "decoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
143
+ "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
144
+ "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
145
+ "decoder.block.17.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
146
+ "decoder.block.17.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
147
+ "decoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
148
+ "decoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
149
+ "decoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
150
+ "decoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
151
+ "decoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
152
+ "decoder.block.18.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
153
+ "decoder.block.18.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
154
+ "decoder.block.18.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
155
+ "decoder.block.18.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
156
+ "decoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
157
+ "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
158
+ "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
159
+ "decoder.block.18.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
160
+ "decoder.block.18.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
161
+ "decoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
162
+ "decoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
163
+ "decoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
164
+ "decoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
165
+ "decoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
166
+ "decoder.block.19.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
167
+ "decoder.block.19.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
168
+ "decoder.block.19.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
169
+ "decoder.block.19.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
170
+ "decoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
171
+ "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
172
+ "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
173
+ "decoder.block.19.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
174
+ "decoder.block.19.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
175
+ "decoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00006.bin",
176
+ "decoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00006.bin",
177
+ "decoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00006.bin",
178
+ "decoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00006.bin",
179
+ "decoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
180
+ "decoder.block.2.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00006.bin",
181
+ "decoder.block.2.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00006.bin",
182
+ "decoder.block.2.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00006.bin",
183
+ "decoder.block.2.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00006.bin",
184
+ "decoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
185
+ "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
186
+ "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
187
+ "decoder.block.2.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
188
+ "decoder.block.2.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
189
+ "decoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
190
+ "decoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
191
+ "decoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
192
+ "decoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
193
+ "decoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
194
+ "decoder.block.20.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
195
+ "decoder.block.20.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
196
+ "decoder.block.20.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
197
+ "decoder.block.20.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
198
+ "decoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
199
+ "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
200
+ "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
201
+ "decoder.block.20.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
202
+ "decoder.block.20.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
203
+ "decoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
204
+ "decoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
205
+ "decoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
206
+ "decoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
207
+ "decoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
208
+ "decoder.block.21.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
209
+ "decoder.block.21.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
210
+ "decoder.block.21.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
211
+ "decoder.block.21.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
212
+ "decoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
213
+ "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
214
+ "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
215
+ "decoder.block.21.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
216
+ "decoder.block.21.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
217
+ "decoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00006-of-00006.bin",
218
+ "decoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00006-of-00006.bin",
219
+ "decoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
220
+ "decoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00006-of-00006.bin",
221
+ "decoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00006-of-00006.bin",
222
+ "decoder.block.22.layer.1.EncDecAttention.k.weight": "pytorch_model-00006-of-00006.bin",
223
+ "decoder.block.22.layer.1.EncDecAttention.o.weight": "pytorch_model-00006-of-00006.bin",
224
+ "decoder.block.22.layer.1.EncDecAttention.q.weight": "pytorch_model-00006-of-00006.bin",
225
+ "decoder.block.22.layer.1.EncDecAttention.v.weight": "pytorch_model-00006-of-00006.bin",
226
+ "decoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00006-of-00006.bin",
227
+ "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00006-of-00006.bin",
228
+ "decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00006-of-00006.bin",
229
+ "decoder.block.22.layer.2.DenseReluDense.wo.weight": "pytorch_model-00006-of-00006.bin",
230
+ "decoder.block.22.layer.2.layer_norm.weight": "pytorch_model-00006-of-00006.bin",
231
+ "decoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00006-of-00006.bin",
232
+ "decoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00006-of-00006.bin",
233
+ "decoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00006-of-00006.bin",
234
+ "decoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00006-of-00006.bin",
235
+ "decoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00006-of-00006.bin",
236
+ "decoder.block.23.layer.1.EncDecAttention.k.weight": "pytorch_model-00006-of-00006.bin",
237
+ "decoder.block.23.layer.1.EncDecAttention.o.weight": "pytorch_model-00006-of-00006.bin",
238
+ "decoder.block.23.layer.1.EncDecAttention.q.weight": "pytorch_model-00006-of-00006.bin",
239
+ "decoder.block.23.layer.1.EncDecAttention.v.weight": "pytorch_model-00006-of-00006.bin",
240
+ "decoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00006-of-00006.bin",
241
+ "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00006-of-00006.bin",
242
+ "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00006-of-00006.bin",
243
+ "decoder.block.23.layer.2.DenseReluDense.wo.weight": "pytorch_model-00006-of-00006.bin",
244
+ "decoder.block.23.layer.2.layer_norm.weight": "pytorch_model-00006-of-00006.bin",
245
+ "decoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
246
+ "decoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
247
+ "decoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
248
+ "decoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
249
+ "decoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
250
+ "decoder.block.3.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
251
+ "decoder.block.3.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
252
+ "decoder.block.3.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
253
+ "decoder.block.3.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
254
+ "decoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
255
+ "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
256
+ "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
257
+ "decoder.block.3.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
258
+ "decoder.block.3.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
259
+ "decoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
260
+ "decoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
261
+ "decoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
262
+ "decoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
263
+ "decoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
264
+ "decoder.block.4.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
265
+ "decoder.block.4.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
266
+ "decoder.block.4.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
267
+ "decoder.block.4.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
268
+ "decoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
269
+ "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
270
+ "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
271
+ "decoder.block.4.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
272
+ "decoder.block.4.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
273
+ "decoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
274
+ "decoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
275
+ "decoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
276
+ "decoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
277
+ "decoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
278
+ "decoder.block.5.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
279
+ "decoder.block.5.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
280
+ "decoder.block.5.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
281
+ "decoder.block.5.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
282
+ "decoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
283
+ "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
284
+ "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
285
+ "decoder.block.5.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
286
+ "decoder.block.5.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
287
+ "decoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
288
+ "decoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
289
+ "decoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
290
+ "decoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
291
+ "decoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
292
+ "decoder.block.6.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
293
+ "decoder.block.6.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
294
+ "decoder.block.6.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
295
+ "decoder.block.6.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
296
+ "decoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
297
+ "decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
298
+ "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
299
+ "decoder.block.6.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
300
+ "decoder.block.6.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
301
+ "decoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
302
+ "decoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
303
+ "decoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
304
+ "decoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
305
+ "decoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
306
+ "decoder.block.7.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
307
+ "decoder.block.7.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
308
+ "decoder.block.7.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
309
+ "decoder.block.7.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
310
+ "decoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
311
+ "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
312
+ "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
313
+ "decoder.block.7.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
314
+ "decoder.block.7.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
315
+ "decoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
316
+ "decoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
317
+ "decoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
318
+ "decoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
319
+ "decoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
320
+ "decoder.block.8.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
321
+ "decoder.block.8.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
322
+ "decoder.block.8.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
323
+ "decoder.block.8.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
324
+ "decoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
325
+ "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
326
+ "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
327
+ "decoder.block.8.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
328
+ "decoder.block.8.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
329
+ "decoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
330
+ "decoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
331
+ "decoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
332
+ "decoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
333
+ "decoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
334
+ "decoder.block.9.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
335
+ "decoder.block.9.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
336
+ "decoder.block.9.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
337
+ "decoder.block.9.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
338
+ "decoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
339
+ "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
340
+ "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
341
+ "decoder.block.9.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
342
+ "decoder.block.9.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
343
+ "decoder.embed_tokens.weight": "pytorch_model-00003-of-00006.bin",
344
+ "decoder.final_layer_norm.weight": "pytorch_model-00006-of-00006.bin",
345
+ "encoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
346
+ "encoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
347
+ "encoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
348
+ "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00006.bin",
349
+ "encoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
350
+ "encoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
351
+ "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
352
+ "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
353
+ "encoder.block.0.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
354
+ "encoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
355
+ "encoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
356
+ "encoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
357
+ "encoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
358
+ "encoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
359
+ "encoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
360
+ "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
361
+ "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
362
+ "encoder.block.1.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
363
+ "encoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
364
+ "encoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
365
+ "encoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
366
+ "encoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
367
+ "encoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
368
+ "encoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
369
+ "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
370
+ "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
371
+ "encoder.block.10.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
372
+ "encoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
373
+ "encoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
374
+ "encoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
375
+ "encoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
376
+ "encoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
377
+ "encoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
378
+ "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
379
+ "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
380
+ "encoder.block.11.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
381
+ "encoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
382
+ "encoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
383
+ "encoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
384
+ "encoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
385
+ "encoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
386
+ "encoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
387
+ "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
388
+ "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
389
+ "encoder.block.12.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
390
+ "encoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
391
+ "encoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
392
+ "encoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
393
+ "encoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
394
+ "encoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
395
+ "encoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
396
+ "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
397
+ "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
398
+ "encoder.block.13.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
399
+ "encoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
400
+ "encoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
401
+ "encoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
402
+ "encoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
403
+ "encoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
404
+ "encoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
405
+ "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
406
+ "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
407
+ "encoder.block.14.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
408
+ "encoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
409
+ "encoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
410
+ "encoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
411
+ "encoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
412
+ "encoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
413
+ "encoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
414
+ "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
415
+ "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
416
+ "encoder.block.15.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
417
+ "encoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
418
+ "encoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
419
+ "encoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
420
+ "encoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
421
+ "encoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
422
+ "encoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
423
+ "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
424
+ "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
425
+ "encoder.block.16.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
426
+ "encoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
427
+ "encoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
428
+ "encoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
429
+ "encoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
430
+ "encoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
431
+ "encoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
432
+ "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
433
+ "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
434
+ "encoder.block.17.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
435
+ "encoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
436
+ "encoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
437
+ "encoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
438
+ "encoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
439
+ "encoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
440
+ "encoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
441
+ "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
442
+ "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
443
+ "encoder.block.18.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
444
+ "encoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
445
+ "encoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
446
+ "encoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
447
+ "encoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
448
+ "encoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
449
+ "encoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
450
+ "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
451
+ "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
452
+ "encoder.block.19.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
453
+ "encoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
454
+ "encoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
455
+ "encoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
456
+ "encoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
457
+ "encoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
458
+ "encoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
459
+ "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
460
+ "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
461
+ "encoder.block.2.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
462
+ "encoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
463
+ "encoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
464
+ "encoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
465
+ "encoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
466
+ "encoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
467
+ "encoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
468
+ "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
469
+ "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
470
+ "encoder.block.20.layer.1.DenseReluDense.wo.weight": "pytorch_model-00003-of-00006.bin",
471
+ "encoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
472
+ "encoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00006.bin",
473
+ "encoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00006.bin",
474
+ "encoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00006.bin",
475
+ "encoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00006.bin",
476
+ "encoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
477
+ "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
478
+ "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
479
+ "encoder.block.21.layer.1.DenseReluDense.wo.weight": "pytorch_model-00003-of-00006.bin",
480
+ "encoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
481
+ "encoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00006.bin",
482
+ "encoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00006.bin",
483
+ "encoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00006.bin",
484
+ "encoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00006.bin",
485
+ "encoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
486
+ "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
487
+ "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
488
+ "encoder.block.22.layer.1.DenseReluDense.wo.weight": "pytorch_model-00003-of-00006.bin",
489
+ "encoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
490
+ "encoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00006.bin",
491
+ "encoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00006.bin",
492
+ "encoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00006.bin",
493
+ "encoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00006.bin",
494
+ "encoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
495
+ "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
496
+ "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
497
+ "encoder.block.23.layer.1.DenseReluDense.wo.weight": "pytorch_model-00003-of-00006.bin",
498
+ "encoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
499
+ "encoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
500
+ "encoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
501
+ "encoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
502
+ "encoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
503
+ "encoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
504
+ "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
505
+ "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
506
+ "encoder.block.3.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
507
+ "encoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
508
+ "encoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
509
+ "encoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
510
+ "encoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
511
+ "encoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
512
+ "encoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
513
+ "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
514
+ "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
515
+ "encoder.block.4.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
516
+ "encoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
517
+ "encoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
518
+ "encoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
519
+ "encoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
520
+ "encoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
521
+ "encoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
522
+ "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
523
+ "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
524
+ "encoder.block.5.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
525
+ "encoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
526
+ "encoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
527
+ "encoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
528
+ "encoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
529
+ "encoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
530
+ "encoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
531
+ "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
532
+ "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
533
+ "encoder.block.6.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
534
+ "encoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
535
+ "encoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
536
+ "encoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
537
+ "encoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
538
+ "encoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
539
+ "encoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
540
+ "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
541
+ "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
542
+ "encoder.block.7.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
543
+ "encoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
544
+ "encoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
545
+ "encoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
546
+ "encoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
547
+ "encoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
548
+ "encoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
549
+ "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
550
+ "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
551
+ "encoder.block.8.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
552
+ "encoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
553
+ "encoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
554
+ "encoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
555
+ "encoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
556
+ "encoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
557
+ "encoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
558
+ "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
559
+ "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
560
+ "encoder.block.9.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
561
+ "encoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
562
+ "encoder.final_layer_norm.weight": "pytorch_model-00003-of-00006.bin",
563
+ "lm_head.weight": "pytorch_model-00006-of-00006.bin",
564
+ "shared.weight": "pytorch_model-00001-of-00006.bin"
565
+ }
566
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
3
+ size 4309802
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93c3578052e1605d8332eb961bc08d72e246071974e4cc54aa6991826b802aa5
3
+ size 16330369
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "eos_token": "</s>",
4
+ "extra_ids": 0,
5
+ "name_or_path": "google/mt5-xxl",
6
+ "pad_token": "<pad>",
7
+ "sp_model_kwargs": {},
8
+ "special_tokens_map_file": "/home/patrick/.cache/torch/transformers/685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276",
9
+ "tokenizer_class": "T5Tokenizer",
10
+ "unk_token": "<unk>"
11
+ }