CharlieFRuan commited on
Commit
49b7cec
1 Parent(s): 569c8cf

Initial commit

Browse files
logs.txt ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
mlc-chat-config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "gpt2",
3
+ "quantization": "q0f16",
4
+ "model_config": {
5
+ "vocab_size": 50257,
6
+ "n_embd": 768,
7
+ "n_layer": 12,
8
+ "n_head": 12,
9
+ "layer_norm_epsilon": 1e-05,
10
+ "n_inner": 3072,
11
+ "context_window_size": 1024,
12
+ "prefill_chunk_size": 1024,
13
+ "scale_attn_by_inverse_layer_idx": false,
14
+ "tensor_parallel_shards": 1
15
+ },
16
+ "vocab_size": 50257,
17
+ "context_window_size": 1024,
18
+ "sliding_window_size": -1,
19
+ "prefill_chunk_size": 1024,
20
+ "attention_sink_size": -1,
21
+ "tensor_parallel_shards": 1,
22
+ "mean_gen_len": 128,
23
+ "max_gen_len": 512,
24
+ "shift_fill_factor": 0.3,
25
+ "temperature": 0.7,
26
+ "repetition_penalty": 1.0,
27
+ "top_p": 0.95,
28
+ "conv_template": "gpt2",
29
+ "pad_token_id": 0,
30
+ "bos_token_id": 50256,
31
+ "eos_token_id": 50256,
32
+ "tokenizer_files": [
33
+ "tokenizer.json",
34
+ "vocab.json",
35
+ "merges.txt"
36
+ ],
37
+ "version": "0.1.0"
38
+ }
ndarray-cache.json ADDED
@@ -0,0 +1,1614 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "ParamSize": 149,
4
+ "ParamBytes": 326074368.0,
5
+ "BitsPerParam": 16.0
6
+ },
7
+ "records": [
8
+ {
9
+ "dataPath": "params_shard_0.bin",
10
+ "format": "raw-shard",
11
+ "nbytes": 77194752,
12
+ "records": [
13
+ {
14
+ "name": "lm_head.weight",
15
+ "shape": [
16
+ 50257,
17
+ 768
18
+ ],
19
+ "dtype": "float16",
20
+ "format": "f32-to-bf16",
21
+ "nbytes": 77194752,
22
+ "byteOffset": 0
23
+ }
24
+ ],
25
+ "md5sum": "8aa629e8739ff337f4983e485d0a145e"
26
+ },
27
+ {
28
+ "dataPath": "params_shard_1.bin",
29
+ "format": "raw-shard",
30
+ "nbytes": 77194752,
31
+ "records": [
32
+ {
33
+ "name": "transformer.wte.weight",
34
+ "shape": [
35
+ 50257,
36
+ 768
37
+ ],
38
+ "dtype": "float16",
39
+ "format": "f32-to-bf16",
40
+ "nbytes": 77194752,
41
+ "byteOffset": 0
42
+ }
43
+ ],
44
+ "md5sum": "8aa629e8739ff337f4983e485d0a145e"
45
+ },
46
+ {
47
+ "dataPath": "params_shard_2.bin",
48
+ "format": "raw-shard",
49
+ "nbytes": 33470976,
50
+ "records": [
51
+ {
52
+ "name": "transformer.wpe.weight",
53
+ "shape": [
54
+ 1024,
55
+ 768
56
+ ],
57
+ "dtype": "float16",
58
+ "format": "f32-to-bf16",
59
+ "nbytes": 1572864,
60
+ "byteOffset": 0
61
+ },
62
+ {
63
+ "name": "transformer.h.0.ln_1.weight",
64
+ "shape": [
65
+ 768
66
+ ],
67
+ "dtype": "float16",
68
+ "format": "f32-to-bf16",
69
+ "nbytes": 1536,
70
+ "byteOffset": 1572864
71
+ },
72
+ {
73
+ "name": "transformer.h.0.ln_1.bias",
74
+ "shape": [
75
+ 768
76
+ ],
77
+ "dtype": "float16",
78
+ "format": "f32-to-bf16",
79
+ "nbytes": 1536,
80
+ "byteOffset": 1574400
81
+ },
82
+ {
83
+ "name": "transformer.h.0.attn.c_attn.weight",
84
+ "shape": [
85
+ 2304,
86
+ 768
87
+ ],
88
+ "dtype": "float16",
89
+ "format": "f32-to-bf16",
90
+ "nbytes": 3538944,
91
+ "byteOffset": 1575936
92
+ },
93
+ {
94
+ "name": "transformer.h.0.attn.c_attn.bias",
95
+ "shape": [
96
+ 2304
97
+ ],
98
+ "dtype": "float16",
99
+ "format": "f32-to-bf16",
100
+ "nbytes": 4608,
101
+ "byteOffset": 5114880
102
+ },
103
+ {
104
+ "name": "transformer.h.0.attn.c_proj.weight",
105
+ "shape": [
106
+ 768,
107
+ 768
108
+ ],
109
+ "dtype": "float16",
110
+ "format": "f32-to-bf16",
111
+ "nbytes": 1179648,
112
+ "byteOffset": 5119488
113
+ },
114
+ {
115
+ "name": "transformer.h.0.attn.c_proj.bias",
116
+ "shape": [
117
+ 768
118
+ ],
119
+ "dtype": "float16",
120
+ "format": "f32-to-bf16",
121
+ "nbytes": 1536,
122
+ "byteOffset": 6299136
123
+ },
124
+ {
125
+ "name": "transformer.h.0.ln_2.weight",
126
+ "shape": [
127
+ 768
128
+ ],
129
+ "dtype": "float16",
130
+ "format": "f32-to-bf16",
131
+ "nbytes": 1536,
132
+ "byteOffset": 6300672
133
+ },
134
+ {
135
+ "name": "transformer.h.0.ln_2.bias",
136
+ "shape": [
137
+ 768
138
+ ],
139
+ "dtype": "float16",
140
+ "format": "f32-to-bf16",
141
+ "nbytes": 1536,
142
+ "byteOffset": 6302208
143
+ },
144
+ {
145
+ "name": "transformer.h.0.mlp.c_fc.weight",
146
+ "shape": [
147
+ 3072,
148
+ 768
149
+ ],
150
+ "dtype": "float16",
151
+ "format": "f32-to-bf16",
152
+ "nbytes": 4718592,
153
+ "byteOffset": 6303744
154
+ },
155
+ {
156
+ "name": "transformer.h.0.mlp.c_fc.bias",
157
+ "shape": [
158
+ 3072
159
+ ],
160
+ "dtype": "float16",
161
+ "format": "f32-to-bf16",
162
+ "nbytes": 6144,
163
+ "byteOffset": 11022336
164
+ },
165
+ {
166
+ "name": "transformer.h.0.mlp.c_proj.weight",
167
+ "shape": [
168
+ 768,
169
+ 3072
170
+ ],
171
+ "dtype": "float16",
172
+ "format": "f32-to-bf16",
173
+ "nbytes": 4718592,
174
+ "byteOffset": 11028480
175
+ },
176
+ {
177
+ "name": "transformer.h.0.mlp.c_proj.bias",
178
+ "shape": [
179
+ 768
180
+ ],
181
+ "dtype": "float16",
182
+ "format": "f32-to-bf16",
183
+ "nbytes": 1536,
184
+ "byteOffset": 15747072
185
+ },
186
+ {
187
+ "name": "transformer.h.1.ln_1.weight",
188
+ "shape": [
189
+ 768
190
+ ],
191
+ "dtype": "float16",
192
+ "format": "f32-to-bf16",
193
+ "nbytes": 1536,
194
+ "byteOffset": 15748608
195
+ },
196
+ {
197
+ "name": "transformer.h.1.ln_1.bias",
198
+ "shape": [
199
+ 768
200
+ ],
201
+ "dtype": "float16",
202
+ "format": "f32-to-bf16",
203
+ "nbytes": 1536,
204
+ "byteOffset": 15750144
205
+ },
206
+ {
207
+ "name": "transformer.h.1.attn.c_attn.weight",
208
+ "shape": [
209
+ 2304,
210
+ 768
211
+ ],
212
+ "dtype": "float16",
213
+ "format": "f32-to-bf16",
214
+ "nbytes": 3538944,
215
+ "byteOffset": 15751680
216
+ },
217
+ {
218
+ "name": "transformer.h.1.attn.c_attn.bias",
219
+ "shape": [
220
+ 2304
221
+ ],
222
+ "dtype": "float16",
223
+ "format": "f32-to-bf16",
224
+ "nbytes": 4608,
225
+ "byteOffset": 19290624
226
+ },
227
+ {
228
+ "name": "transformer.h.1.attn.c_proj.weight",
229
+ "shape": [
230
+ 768,
231
+ 768
232
+ ],
233
+ "dtype": "float16",
234
+ "format": "f32-to-bf16",
235
+ "nbytes": 1179648,
236
+ "byteOffset": 19295232
237
+ },
238
+ {
239
+ "name": "transformer.h.1.attn.c_proj.bias",
240
+ "shape": [
241
+ 768
242
+ ],
243
+ "dtype": "float16",
244
+ "format": "f32-to-bf16",
245
+ "nbytes": 1536,
246
+ "byteOffset": 20474880
247
+ },
248
+ {
249
+ "name": "transformer.h.1.ln_2.weight",
250
+ "shape": [
251
+ 768
252
+ ],
253
+ "dtype": "float16",
254
+ "format": "f32-to-bf16",
255
+ "nbytes": 1536,
256
+ "byteOffset": 20476416
257
+ },
258
+ {
259
+ "name": "transformer.h.1.ln_2.bias",
260
+ "shape": [
261
+ 768
262
+ ],
263
+ "dtype": "float16",
264
+ "format": "f32-to-bf16",
265
+ "nbytes": 1536,
266
+ "byteOffset": 20477952
267
+ },
268
+ {
269
+ "name": "transformer.h.1.mlp.c_fc.weight",
270
+ "shape": [
271
+ 3072,
272
+ 768
273
+ ],
274
+ "dtype": "float16",
275
+ "format": "f32-to-bf16",
276
+ "nbytes": 4718592,
277
+ "byteOffset": 20479488
278
+ },
279
+ {
280
+ "name": "transformer.h.1.mlp.c_fc.bias",
281
+ "shape": [
282
+ 3072
283
+ ],
284
+ "dtype": "float16",
285
+ "format": "f32-to-bf16",
286
+ "nbytes": 6144,
287
+ "byteOffset": 25198080
288
+ },
289
+ {
290
+ "name": "transformer.h.1.mlp.c_proj.weight",
291
+ "shape": [
292
+ 768,
293
+ 3072
294
+ ],
295
+ "dtype": "float16",
296
+ "format": "f32-to-bf16",
297
+ "nbytes": 4718592,
298
+ "byteOffset": 25204224
299
+ },
300
+ {
301
+ "name": "transformer.h.1.mlp.c_proj.bias",
302
+ "shape": [
303
+ 768
304
+ ],
305
+ "dtype": "float16",
306
+ "format": "f32-to-bf16",
307
+ "nbytes": 1536,
308
+ "byteOffset": 29922816
309
+ },
310
+ {
311
+ "name": "transformer.h.2.ln_1.weight",
312
+ "shape": [
313
+ 768
314
+ ],
315
+ "dtype": "float16",
316
+ "format": "f32-to-bf16",
317
+ "nbytes": 1536,
318
+ "byteOffset": 29924352
319
+ },
320
+ {
321
+ "name": "transformer.h.2.ln_1.bias",
322
+ "shape": [
323
+ 768
324
+ ],
325
+ "dtype": "float16",
326
+ "format": "f32-to-bf16",
327
+ "nbytes": 1536,
328
+ "byteOffset": 29925888
329
+ },
330
+ {
331
+ "name": "transformer.h.2.attn.c_attn.weight",
332
+ "shape": [
333
+ 2304,
334
+ 768
335
+ ],
336
+ "dtype": "float16",
337
+ "format": "f32-to-bf16",
338
+ "nbytes": 3538944,
339
+ "byteOffset": 29927424
340
+ },
341
+ {
342
+ "name": "transformer.h.2.attn.c_attn.bias",
343
+ "shape": [
344
+ 2304
345
+ ],
346
+ "dtype": "float16",
347
+ "format": "f32-to-bf16",
348
+ "nbytes": 4608,
349
+ "byteOffset": 33466368
350
+ }
351
+ ],
352
+ "md5sum": "748d75aca928d18aac58d8f9301c3bcb"
353
+ },
354
+ {
355
+ "dataPath": "params_shard_3.bin",
356
+ "format": "raw-shard",
357
+ "nbytes": 29535744,
358
+ "records": [
359
+ {
360
+ "name": "transformer.h.2.attn.c_proj.weight",
361
+ "shape": [
362
+ 768,
363
+ 768
364
+ ],
365
+ "dtype": "float16",
366
+ "format": "f32-to-bf16",
367
+ "nbytes": 1179648,
368
+ "byteOffset": 0
369
+ },
370
+ {
371
+ "name": "transformer.h.2.attn.c_proj.bias",
372
+ "shape": [
373
+ 768
374
+ ],
375
+ "dtype": "float16",
376
+ "format": "f32-to-bf16",
377
+ "nbytes": 1536,
378
+ "byteOffset": 1179648
379
+ },
380
+ {
381
+ "name": "transformer.h.2.ln_2.weight",
382
+ "shape": [
383
+ 768
384
+ ],
385
+ "dtype": "float16",
386
+ "format": "f32-to-bf16",
387
+ "nbytes": 1536,
388
+ "byteOffset": 1181184
389
+ },
390
+ {
391
+ "name": "transformer.h.2.ln_2.bias",
392
+ "shape": [
393
+ 768
394
+ ],
395
+ "dtype": "float16",
396
+ "format": "f32-to-bf16",
397
+ "nbytes": 1536,
398
+ "byteOffset": 1182720
399
+ },
400
+ {
401
+ "name": "transformer.h.2.mlp.c_fc.weight",
402
+ "shape": [
403
+ 3072,
404
+ 768
405
+ ],
406
+ "dtype": "float16",
407
+ "format": "f32-to-bf16",
408
+ "nbytes": 4718592,
409
+ "byteOffset": 1184256
410
+ },
411
+ {
412
+ "name": "transformer.h.2.mlp.c_fc.bias",
413
+ "shape": [
414
+ 3072
415
+ ],
416
+ "dtype": "float16",
417
+ "format": "f32-to-bf16",
418
+ "nbytes": 6144,
419
+ "byteOffset": 5902848
420
+ },
421
+ {
422
+ "name": "transformer.h.2.mlp.c_proj.weight",
423
+ "shape": [
424
+ 768,
425
+ 3072
426
+ ],
427
+ "dtype": "float16",
428
+ "format": "f32-to-bf16",
429
+ "nbytes": 4718592,
430
+ "byteOffset": 5908992
431
+ },
432
+ {
433
+ "name": "transformer.h.2.mlp.c_proj.bias",
434
+ "shape": [
435
+ 768
436
+ ],
437
+ "dtype": "float16",
438
+ "format": "f32-to-bf16",
439
+ "nbytes": 1536,
440
+ "byteOffset": 10627584
441
+ },
442
+ {
443
+ "name": "transformer.h.3.ln_1.weight",
444
+ "shape": [
445
+ 768
446
+ ],
447
+ "dtype": "float16",
448
+ "format": "f32-to-bf16",
449
+ "nbytes": 1536,
450
+ "byteOffset": 10629120
451
+ },
452
+ {
453
+ "name": "transformer.h.3.ln_1.bias",
454
+ "shape": [
455
+ 768
456
+ ],
457
+ "dtype": "float16",
458
+ "format": "f32-to-bf16",
459
+ "nbytes": 1536,
460
+ "byteOffset": 10630656
461
+ },
462
+ {
463
+ "name": "transformer.h.3.attn.c_attn.weight",
464
+ "shape": [
465
+ 2304,
466
+ 768
467
+ ],
468
+ "dtype": "float16",
469
+ "format": "f32-to-bf16",
470
+ "nbytes": 3538944,
471
+ "byteOffset": 10632192
472
+ },
473
+ {
474
+ "name": "transformer.h.3.attn.c_attn.bias",
475
+ "shape": [
476
+ 2304
477
+ ],
478
+ "dtype": "float16",
479
+ "format": "f32-to-bf16",
480
+ "nbytes": 4608,
481
+ "byteOffset": 14171136
482
+ },
483
+ {
484
+ "name": "transformer.h.3.attn.c_proj.weight",
485
+ "shape": [
486
+ 768,
487
+ 768
488
+ ],
489
+ "dtype": "float16",
490
+ "format": "f32-to-bf16",
491
+ "nbytes": 1179648,
492
+ "byteOffset": 14175744
493
+ },
494
+ {
495
+ "name": "transformer.h.3.attn.c_proj.bias",
496
+ "shape": [
497
+ 768
498
+ ],
499
+ "dtype": "float16",
500
+ "format": "f32-to-bf16",
501
+ "nbytes": 1536,
502
+ "byteOffset": 15355392
503
+ },
504
+ {
505
+ "name": "transformer.h.3.ln_2.weight",
506
+ "shape": [
507
+ 768
508
+ ],
509
+ "dtype": "float16",
510
+ "format": "f32-to-bf16",
511
+ "nbytes": 1536,
512
+ "byteOffset": 15356928
513
+ },
514
+ {
515
+ "name": "transformer.h.3.ln_2.bias",
516
+ "shape": [
517
+ 768
518
+ ],
519
+ "dtype": "float16",
520
+ "format": "f32-to-bf16",
521
+ "nbytes": 1536,
522
+ "byteOffset": 15358464
523
+ },
524
+ {
525
+ "name": "transformer.h.3.mlp.c_fc.weight",
526
+ "shape": [
527
+ 3072,
528
+ 768
529
+ ],
530
+ "dtype": "float16",
531
+ "format": "f32-to-bf16",
532
+ "nbytes": 4718592,
533
+ "byteOffset": 15360000
534
+ },
535
+ {
536
+ "name": "transformer.h.3.mlp.c_fc.bias",
537
+ "shape": [
538
+ 3072
539
+ ],
540
+ "dtype": "float16",
541
+ "format": "f32-to-bf16",
542
+ "nbytes": 6144,
543
+ "byteOffset": 20078592
544
+ },
545
+ {
546
+ "name": "transformer.h.3.mlp.c_proj.weight",
547
+ "shape": [
548
+ 768,
549
+ 3072
550
+ ],
551
+ "dtype": "float16",
552
+ "format": "f32-to-bf16",
553
+ "nbytes": 4718592,
554
+ "byteOffset": 20084736
555
+ },
556
+ {
557
+ "name": "transformer.h.3.mlp.c_proj.bias",
558
+ "shape": [
559
+ 768
560
+ ],
561
+ "dtype": "float16",
562
+ "format": "f32-to-bf16",
563
+ "nbytes": 1536,
564
+ "byteOffset": 24803328
565
+ },
566
+ {
567
+ "name": "transformer.h.4.ln_1.weight",
568
+ "shape": [
569
+ 768
570
+ ],
571
+ "dtype": "float16",
572
+ "format": "f32-to-bf16",
573
+ "nbytes": 1536,
574
+ "byteOffset": 24804864
575
+ },
576
+ {
577
+ "name": "transformer.h.4.ln_1.bias",
578
+ "shape": [
579
+ 768
580
+ ],
581
+ "dtype": "float16",
582
+ "format": "f32-to-bf16",
583
+ "nbytes": 1536,
584
+ "byteOffset": 24806400
585
+ },
586
+ {
587
+ "name": "transformer.h.4.attn.c_attn.weight",
588
+ "shape": [
589
+ 2304,
590
+ 768
591
+ ],
592
+ "dtype": "float16",
593
+ "format": "f32-to-bf16",
594
+ "nbytes": 3538944,
595
+ "byteOffset": 24807936
596
+ },
597
+ {
598
+ "name": "transformer.h.4.attn.c_attn.bias",
599
+ "shape": [
600
+ 2304
601
+ ],
602
+ "dtype": "float16",
603
+ "format": "f32-to-bf16",
604
+ "nbytes": 4608,
605
+ "byteOffset": 28346880
606
+ },
607
+ {
608
+ "name": "transformer.h.4.attn.c_proj.weight",
609
+ "shape": [
610
+ 768,
611
+ 768
612
+ ],
613
+ "dtype": "float16",
614
+ "format": "f32-to-bf16",
615
+ "nbytes": 1179648,
616
+ "byteOffset": 28351488
617
+ },
618
+ {
619
+ "name": "transformer.h.4.attn.c_proj.bias",
620
+ "shape": [
621
+ 768
622
+ ],
623
+ "dtype": "float16",
624
+ "format": "f32-to-bf16",
625
+ "nbytes": 1536,
626
+ "byteOffset": 29531136
627
+ },
628
+ {
629
+ "name": "transformer.h.4.ln_2.weight",
630
+ "shape": [
631
+ 768
632
+ ],
633
+ "dtype": "float16",
634
+ "format": "f32-to-bf16",
635
+ "nbytes": 1536,
636
+ "byteOffset": 29532672
637
+ },
638
+ {
639
+ "name": "transformer.h.4.ln_2.bias",
640
+ "shape": [
641
+ 768
642
+ ],
643
+ "dtype": "float16",
644
+ "format": "f32-to-bf16",
645
+ "nbytes": 1536,
646
+ "byteOffset": 29534208
647
+ }
648
+ ],
649
+ "md5sum": "94d1cea00ea6a1492cb7bb23eb5d64b5"
650
+ },
651
+ {
652
+ "dataPath": "params_shard_4.bin",
653
+ "format": "raw-shard",
654
+ "nbytes": 33076224,
655
+ "records": [
656
+ {
657
+ "name": "transformer.h.4.mlp.c_fc.weight",
658
+ "shape": [
659
+ 3072,
660
+ 768
661
+ ],
662
+ "dtype": "float16",
663
+ "format": "f32-to-bf16",
664
+ "nbytes": 4718592,
665
+ "byteOffset": 0
666
+ },
667
+ {
668
+ "name": "transformer.h.4.mlp.c_fc.bias",
669
+ "shape": [
670
+ 3072
671
+ ],
672
+ "dtype": "float16",
673
+ "format": "f32-to-bf16",
674
+ "nbytes": 6144,
675
+ "byteOffset": 4718592
676
+ },
677
+ {
678
+ "name": "transformer.h.4.mlp.c_proj.weight",
679
+ "shape": [
680
+ 768,
681
+ 3072
682
+ ],
683
+ "dtype": "float16",
684
+ "format": "f32-to-bf16",
685
+ "nbytes": 4718592,
686
+ "byteOffset": 4724736
687
+ },
688
+ {
689
+ "name": "transformer.h.4.mlp.c_proj.bias",
690
+ "shape": [
691
+ 768
692
+ ],
693
+ "dtype": "float16",
694
+ "format": "f32-to-bf16",
695
+ "nbytes": 1536,
696
+ "byteOffset": 9443328
697
+ },
698
+ {
699
+ "name": "transformer.h.5.ln_1.weight",
700
+ "shape": [
701
+ 768
702
+ ],
703
+ "dtype": "float16",
704
+ "format": "f32-to-bf16",
705
+ "nbytes": 1536,
706
+ "byteOffset": 9444864
707
+ },
708
+ {
709
+ "name": "transformer.h.5.ln_1.bias",
710
+ "shape": [
711
+ 768
712
+ ],
713
+ "dtype": "float16",
714
+ "format": "f32-to-bf16",
715
+ "nbytes": 1536,
716
+ "byteOffset": 9446400
717
+ },
718
+ {
719
+ "name": "transformer.h.5.attn.c_attn.weight",
720
+ "shape": [
721
+ 2304,
722
+ 768
723
+ ],
724
+ "dtype": "float16",
725
+ "format": "f32-to-bf16",
726
+ "nbytes": 3538944,
727
+ "byteOffset": 9447936
728
+ },
729
+ {
730
+ "name": "transformer.h.5.attn.c_attn.bias",
731
+ "shape": [
732
+ 2304
733
+ ],
734
+ "dtype": "float16",
735
+ "format": "f32-to-bf16",
736
+ "nbytes": 4608,
737
+ "byteOffset": 12986880
738
+ },
739
+ {
740
+ "name": "transformer.h.5.attn.c_proj.weight",
741
+ "shape": [
742
+ 768,
743
+ 768
744
+ ],
745
+ "dtype": "float16",
746
+ "format": "f32-to-bf16",
747
+ "nbytes": 1179648,
748
+ "byteOffset": 12991488
749
+ },
750
+ {
751
+ "name": "transformer.h.5.attn.c_proj.bias",
752
+ "shape": [
753
+ 768
754
+ ],
755
+ "dtype": "float16",
756
+ "format": "f32-to-bf16",
757
+ "nbytes": 1536,
758
+ "byteOffset": 14171136
759
+ },
760
+ {
761
+ "name": "transformer.h.5.ln_2.weight",
762
+ "shape": [
763
+ 768
764
+ ],
765
+ "dtype": "float16",
766
+ "format": "f32-to-bf16",
767
+ "nbytes": 1536,
768
+ "byteOffset": 14172672
769
+ },
770
+ {
771
+ "name": "transformer.h.5.ln_2.bias",
772
+ "shape": [
773
+ 768
774
+ ],
775
+ "dtype": "float16",
776
+ "format": "f32-to-bf16",
777
+ "nbytes": 1536,
778
+ "byteOffset": 14174208
779
+ },
780
+ {
781
+ "name": "transformer.h.5.mlp.c_fc.weight",
782
+ "shape": [
783
+ 3072,
784
+ 768
785
+ ],
786
+ "dtype": "float16",
787
+ "format": "f32-to-bf16",
788
+ "nbytes": 4718592,
789
+ "byteOffset": 14175744
790
+ },
791
+ {
792
+ "name": "transformer.h.5.mlp.c_fc.bias",
793
+ "shape": [
794
+ 3072
795
+ ],
796
+ "dtype": "float16",
797
+ "format": "f32-to-bf16",
798
+ "nbytes": 6144,
799
+ "byteOffset": 18894336
800
+ },
801
+ {
802
+ "name": "transformer.h.5.mlp.c_proj.weight",
803
+ "shape": [
804
+ 768,
805
+ 3072
806
+ ],
807
+ "dtype": "float16",
808
+ "format": "f32-to-bf16",
809
+ "nbytes": 4718592,
810
+ "byteOffset": 18900480
811
+ },
812
+ {
813
+ "name": "transformer.h.5.mlp.c_proj.bias",
814
+ "shape": [
815
+ 768
816
+ ],
817
+ "dtype": "float16",
818
+ "format": "f32-to-bf16",
819
+ "nbytes": 1536,
820
+ "byteOffset": 23619072
821
+ },
822
+ {
823
+ "name": "transformer.h.6.ln_1.weight",
824
+ "shape": [
825
+ 768
826
+ ],
827
+ "dtype": "float16",
828
+ "format": "f32-to-bf16",
829
+ "nbytes": 1536,
830
+ "byteOffset": 23620608
831
+ },
832
+ {
833
+ "name": "transformer.h.6.ln_1.bias",
834
+ "shape": [
835
+ 768
836
+ ],
837
+ "dtype": "float16",
838
+ "format": "f32-to-bf16",
839
+ "nbytes": 1536,
840
+ "byteOffset": 23622144
841
+ },
842
+ {
843
+ "name": "transformer.h.6.attn.c_attn.weight",
844
+ "shape": [
845
+ 2304,
846
+ 768
847
+ ],
848
+ "dtype": "float16",
849
+ "format": "f32-to-bf16",
850
+ "nbytes": 3538944,
851
+ "byteOffset": 23623680
852
+ },
853
+ {
854
+ "name": "transformer.h.6.attn.c_attn.bias",
855
+ "shape": [
856
+ 2304
857
+ ],
858
+ "dtype": "float16",
859
+ "format": "f32-to-bf16",
860
+ "nbytes": 4608,
861
+ "byteOffset": 27162624
862
+ },
863
+ {
864
+ "name": "transformer.h.6.attn.c_proj.weight",
865
+ "shape": [
866
+ 768,
867
+ 768
868
+ ],
869
+ "dtype": "float16",
870
+ "format": "f32-to-bf16",
871
+ "nbytes": 1179648,
872
+ "byteOffset": 27167232
873
+ },
874
+ {
875
+ "name": "transformer.h.6.attn.c_proj.bias",
876
+ "shape": [
877
+ 768
878
+ ],
879
+ "dtype": "float16",
880
+ "format": "f32-to-bf16",
881
+ "nbytes": 1536,
882
+ "byteOffset": 28346880
883
+ },
884
+ {
885
+ "name": "transformer.h.6.ln_2.weight",
886
+ "shape": [
887
+ 768
888
+ ],
889
+ "dtype": "float16",
890
+ "format": "f32-to-bf16",
891
+ "nbytes": 1536,
892
+ "byteOffset": 28348416
893
+ },
894
+ {
895
+ "name": "transformer.h.6.ln_2.bias",
896
+ "shape": [
897
+ 768
898
+ ],
899
+ "dtype": "float16",
900
+ "format": "f32-to-bf16",
901
+ "nbytes": 1536,
902
+ "byteOffset": 28349952
903
+ },
904
+ {
905
+ "name": "transformer.h.6.mlp.c_fc.weight",
906
+ "shape": [
907
+ 3072,
908
+ 768
909
+ ],
910
+ "dtype": "float16",
911
+ "format": "f32-to-bf16",
912
+ "nbytes": 4718592,
913
+ "byteOffset": 28351488
914
+ },
915
+ {
916
+ "name": "transformer.h.6.mlp.c_fc.bias",
917
+ "shape": [
918
+ 3072
919
+ ],
920
+ "dtype": "float16",
921
+ "format": "f32-to-bf16",
922
+ "nbytes": 6144,
923
+ "byteOffset": 33070080
924
+ }
925
+ ],
926
+ "md5sum": "43957cf0e0da3bc016aa35099e8a4e53"
927
+ },
928
+ {
929
+ "dataPath": "params_shard_5.bin",
930
+ "format": "raw-shard",
931
+ "nbytes": 33074688,
932
+ "records": [
933
+ {
934
+ "name": "transformer.h.6.mlp.c_proj.weight",
935
+ "shape": [
936
+ 768,
937
+ 3072
938
+ ],
939
+ "dtype": "float16",
940
+ "format": "f32-to-bf16",
941
+ "nbytes": 4718592,
942
+ "byteOffset": 0
943
+ },
944
+ {
945
+ "name": "transformer.h.6.mlp.c_proj.bias",
946
+ "shape": [
947
+ 768
948
+ ],
949
+ "dtype": "float16",
950
+ "format": "f32-to-bf16",
951
+ "nbytes": 1536,
952
+ "byteOffset": 4718592
953
+ },
954
+ {
955
+ "name": "transformer.h.7.ln_1.weight",
956
+ "shape": [
957
+ 768
958
+ ],
959
+ "dtype": "float16",
960
+ "format": "f32-to-bf16",
961
+ "nbytes": 1536,
962
+ "byteOffset": 4720128
963
+ },
964
+ {
965
+ "name": "transformer.h.7.ln_1.bias",
966
+ "shape": [
967
+ 768
968
+ ],
969
+ "dtype": "float16",
970
+ "format": "f32-to-bf16",
971
+ "nbytes": 1536,
972
+ "byteOffset": 4721664
973
+ },
974
+ {
975
+ "name": "transformer.h.7.attn.c_attn.weight",
976
+ "shape": [
977
+ 2304,
978
+ 768
979
+ ],
980
+ "dtype": "float16",
981
+ "format": "f32-to-bf16",
982
+ "nbytes": 3538944,
983
+ "byteOffset": 4723200
984
+ },
985
+ {
986
+ "name": "transformer.h.7.attn.c_attn.bias",
987
+ "shape": [
988
+ 2304
989
+ ],
990
+ "dtype": "float16",
991
+ "format": "f32-to-bf16",
992
+ "nbytes": 4608,
993
+ "byteOffset": 8262144
994
+ },
995
+ {
996
+ "name": "transformer.h.7.attn.c_proj.weight",
997
+ "shape": [
998
+ 768,
999
+ 768
1000
+ ],
1001
+ "dtype": "float16",
1002
+ "format": "f32-to-bf16",
1003
+ "nbytes": 1179648,
1004
+ "byteOffset": 8266752
1005
+ },
1006
+ {
1007
+ "name": "transformer.h.7.attn.c_proj.bias",
1008
+ "shape": [
1009
+ 768
1010
+ ],
1011
+ "dtype": "float16",
1012
+ "format": "f32-to-bf16",
1013
+ "nbytes": 1536,
1014
+ "byteOffset": 9446400
1015
+ },
1016
+ {
1017
+ "name": "transformer.h.7.ln_2.weight",
1018
+ "shape": [
1019
+ 768
1020
+ ],
1021
+ "dtype": "float16",
1022
+ "format": "f32-to-bf16",
1023
+ "nbytes": 1536,
1024
+ "byteOffset": 9447936
1025
+ },
1026
+ {
1027
+ "name": "transformer.h.7.ln_2.bias",
1028
+ "shape": [
1029
+ 768
1030
+ ],
1031
+ "dtype": "float16",
1032
+ "format": "f32-to-bf16",
1033
+ "nbytes": 1536,
1034
+ "byteOffset": 9449472
1035
+ },
1036
+ {
1037
+ "name": "transformer.h.7.mlp.c_fc.weight",
1038
+ "shape": [
1039
+ 3072,
1040
+ 768
1041
+ ],
1042
+ "dtype": "float16",
1043
+ "format": "f32-to-bf16",
1044
+ "nbytes": 4718592,
1045
+ "byteOffset": 9451008
1046
+ },
1047
+ {
1048
+ "name": "transformer.h.7.mlp.c_fc.bias",
1049
+ "shape": [
1050
+ 3072
1051
+ ],
1052
+ "dtype": "float16",
1053
+ "format": "f32-to-bf16",
1054
+ "nbytes": 6144,
1055
+ "byteOffset": 14169600
1056
+ },
1057
+ {
1058
+ "name": "transformer.h.7.mlp.c_proj.weight",
1059
+ "shape": [
1060
+ 768,
1061
+ 3072
1062
+ ],
1063
+ "dtype": "float16",
1064
+ "format": "f32-to-bf16",
1065
+ "nbytes": 4718592,
1066
+ "byteOffset": 14175744
1067
+ },
1068
+ {
1069
+ "name": "transformer.h.7.mlp.c_proj.bias",
1070
+ "shape": [
1071
+ 768
1072
+ ],
1073
+ "dtype": "float16",
1074
+ "format": "f32-to-bf16",
1075
+ "nbytes": 1536,
1076
+ "byteOffset": 18894336
1077
+ },
1078
+ {
1079
+ "name": "transformer.h.8.ln_1.weight",
1080
+ "shape": [
1081
+ 768
1082
+ ],
1083
+ "dtype": "float16",
1084
+ "format": "f32-to-bf16",
1085
+ "nbytes": 1536,
1086
+ "byteOffset": 18895872
1087
+ },
1088
+ {
1089
+ "name": "transformer.h.8.ln_1.bias",
1090
+ "shape": [
1091
+ 768
1092
+ ],
1093
+ "dtype": "float16",
1094
+ "format": "f32-to-bf16",
1095
+ "nbytes": 1536,
1096
+ "byteOffset": 18897408
1097
+ },
1098
+ {
1099
+ "name": "transformer.h.8.attn.c_attn.weight",
1100
+ "shape": [
1101
+ 2304,
1102
+ 768
1103
+ ],
1104
+ "dtype": "float16",
1105
+ "format": "f32-to-bf16",
1106
+ "nbytes": 3538944,
1107
+ "byteOffset": 18898944
1108
+ },
1109
+ {
1110
+ "name": "transformer.h.8.attn.c_attn.bias",
1111
+ "shape": [
1112
+ 2304
1113
+ ],
1114
+ "dtype": "float16",
1115
+ "format": "f32-to-bf16",
1116
+ "nbytes": 4608,
1117
+ "byteOffset": 22437888
1118
+ },
1119
+ {
1120
+ "name": "transformer.h.8.attn.c_proj.weight",
1121
+ "shape": [
1122
+ 768,
1123
+ 768
1124
+ ],
1125
+ "dtype": "float16",
1126
+ "format": "f32-to-bf16",
1127
+ "nbytes": 1179648,
1128
+ "byteOffset": 22442496
1129
+ },
1130
+ {
1131
+ "name": "transformer.h.8.attn.c_proj.bias",
1132
+ "shape": [
1133
+ 768
1134
+ ],
1135
+ "dtype": "float16",
1136
+ "format": "f32-to-bf16",
1137
+ "nbytes": 1536,
1138
+ "byteOffset": 23622144
1139
+ },
1140
+ {
1141
+ "name": "transformer.h.8.ln_2.weight",
1142
+ "shape": [
1143
+ 768
1144
+ ],
1145
+ "dtype": "float16",
1146
+ "format": "f32-to-bf16",
1147
+ "nbytes": 1536,
1148
+ "byteOffset": 23623680
1149
+ },
1150
+ {
1151
+ "name": "transformer.h.8.ln_2.bias",
1152
+ "shape": [
1153
+ 768
1154
+ ],
1155
+ "dtype": "float16",
1156
+ "format": "f32-to-bf16",
1157
+ "nbytes": 1536,
1158
+ "byteOffset": 23625216
1159
+ },
1160
+ {
1161
+ "name": "transformer.h.8.mlp.c_fc.weight",
1162
+ "shape": [
1163
+ 3072,
1164
+ 768
1165
+ ],
1166
+ "dtype": "float16",
1167
+ "format": "f32-to-bf16",
1168
+ "nbytes": 4718592,
1169
+ "byteOffset": 23626752
1170
+ },
1171
+ {
1172
+ "name": "transformer.h.8.mlp.c_fc.bias",
1173
+ "shape": [
1174
+ 3072
1175
+ ],
1176
+ "dtype": "float16",
1177
+ "format": "f32-to-bf16",
1178
+ "nbytes": 6144,
1179
+ "byteOffset": 28345344
1180
+ },
1181
+ {
1182
+ "name": "transformer.h.8.mlp.c_proj.weight",
1183
+ "shape": [
1184
+ 768,
1185
+ 3072
1186
+ ],
1187
+ "dtype": "float16",
1188
+ "format": "f32-to-bf16",
1189
+ "nbytes": 4718592,
1190
+ "byteOffset": 28351488
1191
+ },
1192
+ {
1193
+ "name": "transformer.h.8.mlp.c_proj.bias",
1194
+ "shape": [
1195
+ 768
1196
+ ],
1197
+ "dtype": "float16",
1198
+ "format": "f32-to-bf16",
1199
+ "nbytes": 1536,
1200
+ "byteOffset": 33070080
1201
+ },
1202
+ {
1203
+ "name": "transformer.h.9.ln_1.weight",
1204
+ "shape": [
1205
+ 768
1206
+ ],
1207
+ "dtype": "float16",
1208
+ "format": "f32-to-bf16",
1209
+ "nbytes": 1536,
1210
+ "byteOffset": 33071616
1211
+ },
1212
+ {
1213
+ "name": "transformer.h.9.ln_1.bias",
1214
+ "shape": [
1215
+ 768
1216
+ ],
1217
+ "dtype": "float16",
1218
+ "format": "f32-to-bf16",
1219
+ "nbytes": 1536,
1220
+ "byteOffset": 33073152
1221
+ }
1222
+ ],
1223
+ "md5sum": "4cbbe2cce00b37c72cd95d22b3aa03d3"
1224
+ },
1225
+ {
1226
+ "dataPath": "params_shard_6.bin",
1227
+ "format": "raw-shard",
1228
+ "nbytes": 33079296,
1229
+ "records": [
1230
+ {
1231
+ "name": "transformer.h.9.attn.c_attn.weight",
1232
+ "shape": [
1233
+ 2304,
1234
+ 768
1235
+ ],
1236
+ "dtype": "float16",
1237
+ "format": "f32-to-bf16",
1238
+ "nbytes": 3538944,
1239
+ "byteOffset": 0
1240
+ },
1241
+ {
1242
+ "name": "transformer.h.9.attn.c_attn.bias",
1243
+ "shape": [
1244
+ 2304
1245
+ ],
1246
+ "dtype": "float16",
1247
+ "format": "f32-to-bf16",
1248
+ "nbytes": 4608,
1249
+ "byteOffset": 3538944
1250
+ },
1251
+ {
1252
+ "name": "transformer.h.9.attn.c_proj.weight",
1253
+ "shape": [
1254
+ 768,
1255
+ 768
1256
+ ],
1257
+ "dtype": "float16",
1258
+ "format": "f32-to-bf16",
1259
+ "nbytes": 1179648,
1260
+ "byteOffset": 3543552
1261
+ },
1262
+ {
1263
+ "name": "transformer.h.9.attn.c_proj.bias",
1264
+ "shape": [
1265
+ 768
1266
+ ],
1267
+ "dtype": "float16",
1268
+ "format": "f32-to-bf16",
1269
+ "nbytes": 1536,
1270
+ "byteOffset": 4723200
1271
+ },
1272
+ {
1273
+ "name": "transformer.h.9.ln_2.weight",
1274
+ "shape": [
1275
+ 768
1276
+ ],
1277
+ "dtype": "float16",
1278
+ "format": "f32-to-bf16",
1279
+ "nbytes": 1536,
1280
+ "byteOffset": 4724736
1281
+ },
1282
+ {
1283
+ "name": "transformer.h.9.ln_2.bias",
1284
+ "shape": [
1285
+ 768
1286
+ ],
1287
+ "dtype": "float16",
1288
+ "format": "f32-to-bf16",
1289
+ "nbytes": 1536,
1290
+ "byteOffset": 4726272
1291
+ },
1292
+ {
1293
+ "name": "transformer.h.9.mlp.c_fc.weight",
1294
+ "shape": [
1295
+ 3072,
1296
+ 768
1297
+ ],
1298
+ "dtype": "float16",
1299
+ "format": "f32-to-bf16",
1300
+ "nbytes": 4718592,
1301
+ "byteOffset": 4727808
1302
+ },
1303
+ {
1304
+ "name": "transformer.h.9.mlp.c_fc.bias",
1305
+ "shape": [
1306
+ 3072
1307
+ ],
1308
+ "dtype": "float16",
1309
+ "format": "f32-to-bf16",
1310
+ "nbytes": 6144,
1311
+ "byteOffset": 9446400
1312
+ },
1313
+ {
1314
+ "name": "transformer.h.9.mlp.c_proj.weight",
1315
+ "shape": [
1316
+ 768,
1317
+ 3072
1318
+ ],
1319
+ "dtype": "float16",
1320
+ "format": "f32-to-bf16",
1321
+ "nbytes": 4718592,
1322
+ "byteOffset": 9452544
1323
+ },
1324
+ {
1325
+ "name": "transformer.h.9.mlp.c_proj.bias",
1326
+ "shape": [
1327
+ 768
1328
+ ],
1329
+ "dtype": "float16",
1330
+ "format": "f32-to-bf16",
1331
+ "nbytes": 1536,
1332
+ "byteOffset": 14171136
1333
+ },
1334
+ {
1335
+ "name": "transformer.h.10.ln_1.weight",
1336
+ "shape": [
1337
+ 768
1338
+ ],
1339
+ "dtype": "float16",
1340
+ "format": "f32-to-bf16",
1341
+ "nbytes": 1536,
1342
+ "byteOffset": 14172672
1343
+ },
1344
+ {
1345
+ "name": "transformer.h.10.ln_1.bias",
1346
+ "shape": [
1347
+ 768
1348
+ ],
1349
+ "dtype": "float16",
1350
+ "format": "f32-to-bf16",
1351
+ "nbytes": 1536,
1352
+ "byteOffset": 14174208
1353
+ },
1354
+ {
1355
+ "name": "transformer.h.10.attn.c_attn.weight",
1356
+ "shape": [
1357
+ 2304,
1358
+ 768
1359
+ ],
1360
+ "dtype": "float16",
1361
+ "format": "f32-to-bf16",
1362
+ "nbytes": 3538944,
1363
+ "byteOffset": 14175744
1364
+ },
1365
+ {
1366
+ "name": "transformer.h.10.attn.c_attn.bias",
1367
+ "shape": [
1368
+ 2304
1369
+ ],
1370
+ "dtype": "float16",
1371
+ "format": "f32-to-bf16",
1372
+ "nbytes": 4608,
1373
+ "byteOffset": 17714688
1374
+ },
1375
+ {
1376
+ "name": "transformer.h.10.attn.c_proj.weight",
1377
+ "shape": [
1378
+ 768,
1379
+ 768
1380
+ ],
1381
+ "dtype": "float16",
1382
+ "format": "f32-to-bf16",
1383
+ "nbytes": 1179648,
1384
+ "byteOffset": 17719296
1385
+ },
1386
+ {
1387
+ "name": "transformer.h.10.attn.c_proj.bias",
1388
+ "shape": [
1389
+ 768
1390
+ ],
1391
+ "dtype": "float16",
1392
+ "format": "f32-to-bf16",
1393
+ "nbytes": 1536,
1394
+ "byteOffset": 18898944
1395
+ },
1396
+ {
1397
+ "name": "transformer.h.10.ln_2.weight",
1398
+ "shape": [
1399
+ 768
1400
+ ],
1401
+ "dtype": "float16",
1402
+ "format": "f32-to-bf16",
1403
+ "nbytes": 1536,
1404
+ "byteOffset": 18900480
1405
+ },
1406
+ {
1407
+ "name": "transformer.h.10.ln_2.bias",
1408
+ "shape": [
1409
+ 768
1410
+ ],
1411
+ "dtype": "float16",
1412
+ "format": "f32-to-bf16",
1413
+ "nbytes": 1536,
1414
+ "byteOffset": 18902016
1415
+ },
1416
+ {
1417
+ "name": "transformer.h.10.mlp.c_fc.weight",
1418
+ "shape": [
1419
+ 3072,
1420
+ 768
1421
+ ],
1422
+ "dtype": "float16",
1423
+ "format": "f32-to-bf16",
1424
+ "nbytes": 4718592,
1425
+ "byteOffset": 18903552
1426
+ },
1427
+ {
1428
+ "name": "transformer.h.10.mlp.c_fc.bias",
1429
+ "shape": [
1430
+ 3072
1431
+ ],
1432
+ "dtype": "float16",
1433
+ "format": "f32-to-bf16",
1434
+ "nbytes": 6144,
1435
+ "byteOffset": 23622144
1436
+ },
1437
+ {
1438
+ "name": "transformer.h.10.mlp.c_proj.weight",
1439
+ "shape": [
1440
+ 768,
1441
+ 3072
1442
+ ],
1443
+ "dtype": "float16",
1444
+ "format": "f32-to-bf16",
1445
+ "nbytes": 4718592,
1446
+ "byteOffset": 23628288
1447
+ },
1448
+ {
1449
+ "name": "transformer.h.10.mlp.c_proj.bias",
1450
+ "shape": [
1451
+ 768
1452
+ ],
1453
+ "dtype": "float16",
1454
+ "format": "f32-to-bf16",
1455
+ "nbytes": 1536,
1456
+ "byteOffset": 28346880
1457
+ },
1458
+ {
1459
+ "name": "transformer.h.11.ln_1.weight",
1460
+ "shape": [
1461
+ 768
1462
+ ],
1463
+ "dtype": "float16",
1464
+ "format": "f32-to-bf16",
1465
+ "nbytes": 1536,
1466
+ "byteOffset": 28348416
1467
+ },
1468
+ {
1469
+ "name": "transformer.h.11.ln_1.bias",
1470
+ "shape": [
1471
+ 768
1472
+ ],
1473
+ "dtype": "float16",
1474
+ "format": "f32-to-bf16",
1475
+ "nbytes": 1536,
1476
+ "byteOffset": 28349952
1477
+ },
1478
+ {
1479
+ "name": "transformer.h.11.attn.c_attn.weight",
1480
+ "shape": [
1481
+ 2304,
1482
+ 768
1483
+ ],
1484
+ "dtype": "float16",
1485
+ "format": "f32-to-bf16",
1486
+ "nbytes": 3538944,
1487
+ "byteOffset": 28351488
1488
+ },
1489
+ {
1490
+ "name": "transformer.h.11.attn.c_attn.bias",
1491
+ "shape": [
1492
+ 2304
1493
+ ],
1494
+ "dtype": "float16",
1495
+ "format": "f32-to-bf16",
1496
+ "nbytes": 4608,
1497
+ "byteOffset": 31890432
1498
+ },
1499
+ {
1500
+ "name": "transformer.h.11.attn.c_proj.weight",
1501
+ "shape": [
1502
+ 768,
1503
+ 768
1504
+ ],
1505
+ "dtype": "float16",
1506
+ "format": "f32-to-bf16",
1507
+ "nbytes": 1179648,
1508
+ "byteOffset": 31895040
1509
+ },
1510
+ {
1511
+ "name": "transformer.h.11.attn.c_proj.bias",
1512
+ "shape": [
1513
+ 768
1514
+ ],
1515
+ "dtype": "float16",
1516
+ "format": "f32-to-bf16",
1517
+ "nbytes": 1536,
1518
+ "byteOffset": 33074688
1519
+ },
1520
+ {
1521
+ "name": "transformer.h.11.ln_2.weight",
1522
+ "shape": [
1523
+ 768
1524
+ ],
1525
+ "dtype": "float16",
1526
+ "format": "f32-to-bf16",
1527
+ "nbytes": 1536,
1528
+ "byteOffset": 33076224
1529
+ },
1530
+ {
1531
+ "name": "transformer.h.11.ln_2.bias",
1532
+ "shape": [
1533
+ 768
1534
+ ],
1535
+ "dtype": "float16",
1536
+ "format": "f32-to-bf16",
1537
+ "nbytes": 1536,
1538
+ "byteOffset": 33077760
1539
+ }
1540
+ ],
1541
+ "md5sum": "4f1aa298f8b0af477effc227d22f5ef1"
1542
+ },
1543
+ {
1544
+ "dataPath": "params_shard_7.bin",
1545
+ "format": "raw-shard",
1546
+ "nbytes": 9447936,
1547
+ "records": [
1548
+ {
1549
+ "name": "transformer.h.11.mlp.c_fc.weight",
1550
+ "shape": [
1551
+ 3072,
1552
+ 768
1553
+ ],
1554
+ "dtype": "float16",
1555
+ "format": "f32-to-bf16",
1556
+ "nbytes": 4718592,
1557
+ "byteOffset": 0
1558
+ },
1559
+ {
1560
+ "name": "transformer.h.11.mlp.c_fc.bias",
1561
+ "shape": [
1562
+ 3072
1563
+ ],
1564
+ "dtype": "float16",
1565
+ "format": "f32-to-bf16",
1566
+ "nbytes": 6144,
1567
+ "byteOffset": 4718592
1568
+ },
1569
+ {
1570
+ "name": "transformer.h.11.mlp.c_proj.weight",
1571
+ "shape": [
1572
+ 768,
1573
+ 3072
1574
+ ],
1575
+ "dtype": "float16",
1576
+ "format": "f32-to-bf16",
1577
+ "nbytes": 4718592,
1578
+ "byteOffset": 4724736
1579
+ },
1580
+ {
1581
+ "name": "transformer.h.11.mlp.c_proj.bias",
1582
+ "shape": [
1583
+ 768
1584
+ ],
1585
+ "dtype": "float16",
1586
+ "format": "f32-to-bf16",
1587
+ "nbytes": 1536,
1588
+ "byteOffset": 9443328
1589
+ },
1590
+ {
1591
+ "name": "transformer.ln_f.weight",
1592
+ "shape": [
1593
+ 768
1594
+ ],
1595
+ "dtype": "float16",
1596
+ "format": "f32-to-bf16",
1597
+ "nbytes": 1536,
1598
+ "byteOffset": 9444864
1599
+ },
1600
+ {
1601
+ "name": "transformer.ln_f.bias",
1602
+ "shape": [
1603
+ 768
1604
+ ],
1605
+ "dtype": "float16",
1606
+ "format": "f32-to-bf16",
1607
+ "nbytes": 1536,
1608
+ "byteOffset": 9446400
1609
+ }
1610
+ ],
1611
+ "md5sum": "1532478e2b064b31091ff718a8c67188"
1612
+ }
1613
+ ]
1614
+ }
params_shard_0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea802f0a275727502b40e6b46d6464ec8ccb17366d57ece5e9b1b199ad783ea8
3
+ size 77194752
params_shard_1.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea802f0a275727502b40e6b46d6464ec8ccb17366d57ece5e9b1b199ad783ea8
3
+ size 77194752
params_shard_2.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c29b9276f5fb71ad0dd00dedc7defbfe79106324dca669b728daf886f55036e2
3
+ size 33470976
params_shard_3.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3888bac08b451fe974a3dd18b7dd5e3ac7b107eed807e6aded3b1f9b714cb563
3
+ size 29535744
params_shard_4.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64a98fa039c46b6f18388611e321988d9c6ac7030a2967d60bc7b33f81f28abe
3
+ size 33076224
params_shard_5.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef4b35db263c4a98b61f6a6cadd880bfcf86521f3700727e795d6ab38e03d809
3
+ size 33074688
params_shard_6.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59adb5499274cc05166e1c1e03d21dfa26934c1d840a58c4914ce128cdf37f55
3
+ size 33079296
params_shard_7.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fc8c44ee6c025faf66e1b45c1e7d7bda64c2698aef331e1b945009b7f45e4c5
3
+ size 9447936
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff