photonmz commited on
Commit
06e4304
·
1 Parent(s): 6c13533
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ images/
softmax0-15m-2023_08_22_14_44_50/ckpt.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:702532f11d9611cfca7bb19fa9071774f4772db44057a90e3e573d06f243cda3
3
+ size 182363350
softmax0-15m-2023_08_22_14_44_50/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "out_dir": "out",
3
+ "eval_interval": 2000,
4
+ "log_interval": 1,
5
+ "eval_iters": 100,
6
+ "eval_only": false,
7
+ "always_save_checkpoint": true,
8
+ "init_from": "scratch",
9
+ "wandb_log": true,
10
+ "wandb_project": "softmax1-tinystories",
11
+ "wandb_run_name": "softmax0-15m-2023_08_22_14_44_50",
12
+ "batch_size": 96,
13
+ "max_seq_len": 256,
14
+ "vocab_source": "llama2",
15
+ "vocab_size": 32000,
16
+ "dim": 288,
17
+ "n_layers": 6,
18
+ "n_heads": 6,
19
+ "n_kv_heads": 6,
20
+ "multiple_of": 32,
21
+ "dropout": 0.0,
22
+ "gradient_accumulation_steps": 4,
23
+ "learning_rate": 0.0005,
24
+ "max_iters": 100000,
25
+ "weight_decay": 0.1,
26
+ "beta1": 0.9,
27
+ "beta2": 0.95,
28
+ "grad_clip": 1.0,
29
+ "decay_lr": true,
30
+ "warmup_iters": 1000,
31
+ "device": "cuda",
32
+ "dtype": "float16",
33
+ "compile": true,
34
+ "softmax1": false
35
+ }
softmax0-15m-2023_08_22_14_44_50/model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6f0df381bcc8058c8ff6ec48864df2c9eb61b3b926f22e87569c9badec0e0a9
3
+ size 60816028
softmax0-15m-2023_08_22_14_44_50/weights.json ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tok_embeddings.weight": {
3
+ "mean": -0.0016343051102012396,
4
+ "var": 0.0038931926246732473,
5
+ "std": 0.062395453453063965,
6
+ "skews": -2.7351033687591553,
7
+ "kurtosis": 29.507484436035156,
8
+ "outliers": 23467,
9
+ "outlier_percent": 0.002546332465277778
10
+ },
11
+ "layers.0.attention.wq.weight": {
12
+ "mean": -0.00015833647921681404,
13
+ "var": 0.005146522540599108,
14
+ "std": 0.07173927128314972,
15
+ "skews": -0.0028434402775019407,
16
+ "kurtosis": 1.6188158988952637,
17
+ "outliers": 23,
18
+ "outlier_percent": 0.000277295524691358
19
+ },
20
+ "layers.0.attention.wk.weight": {
21
+ "mean": 6.526386641780846e-06,
22
+ "var": 0.004242290742695332,
23
+ "std": 0.06513287127017975,
24
+ "skews": 0.021585451439023018,
25
+ "kurtosis": 1.1567010879516602,
26
+ "outliers": 9,
27
+ "outlier_percent": 0.00010850694444444444
28
+ },
29
+ "layers.0.attention.wv.weight": {
30
+ "mean": -4.457616887520999e-05,
31
+ "var": 0.00026332412380725145,
32
+ "std": 0.01622726395726204,
33
+ "skews": -0.013842478394508362,
34
+ "kurtosis": 0.297649621963501,
35
+ "outliers": 0,
36
+ "outlier_percent": 0.0
37
+ },
38
+ "layers.0.attention.wo.weight": {
39
+ "mean": -7.100501306922524e-07,
40
+ "var": 0.00024368301092181355,
41
+ "std": 0.015610349364578724,
42
+ "skews": 0.10141679644584656,
43
+ "kurtosis": 11.977744102478027,
44
+ "outliers": 16,
45
+ "outlier_percent": 0.00019290123456790122
46
+ },
47
+ "layers.0.feed_forward.w1.weight": {
48
+ "mean": -2.4933683562267106e-06,
49
+ "var": 0.0010707820765674114,
50
+ "std": 0.03272280842065811,
51
+ "skews": 0.006910220254212618,
52
+ "kurtosis": -0.0008280277252197266,
53
+ "outliers": 0,
54
+ "outlier_percent": 0.0
55
+ },
56
+ "layers.0.feed_forward.w2.weight": {
57
+ "mean": -4.803382034879178e-05,
58
+ "var": 0.0010309461504220963,
59
+ "std": 0.03210835158824921,
60
+ "skews": 0.03226399049162865,
61
+ "kurtosis": 1.2938251495361328,
62
+ "outliers": 47,
63
+ "outlier_percent": 0.0002124927662037037
64
+ },
65
+ "layers.0.feed_forward.w3.weight": {
66
+ "mean": -1.1704322787409183e-05,
67
+ "var": 0.0010558386566117406,
68
+ "std": 0.03249366953969002,
69
+ "skews": -0.0073517016135156155,
70
+ "kurtosis": 0.0046155452728271484,
71
+ "outliers": 0,
72
+ "outlier_percent": 0.0
73
+ },
74
+ "layers.0.attention_norm.weight": {
75
+ "mean": 0.5683733820915222,
76
+ "var": 0.014715575613081455,
77
+ "std": 0.12130777537822723,
78
+ "skews": 2.2385318279266357,
79
+ "kurtosis": 8.328707695007324,
80
+ "outliers": 1,
81
+ "outlier_percent": 0.003472222222222222
82
+ },
83
+ "layers.0.ffn_norm.weight": {
84
+ "mean": 0.7457764148712158,
85
+ "var": 0.003284212201833725,
86
+ "std": 0.057308048009872437,
87
+ "skews": -4.060823917388916,
88
+ "kurtosis": 42.8275260925293,
89
+ "outliers": 1,
90
+ "outlier_percent": 0.003472222222222222
91
+ },
92
+ "layers.1.attention.wq.weight": {
93
+ "mean": -4.265182724338956e-05,
94
+ "var": 0.0027510877698659897,
95
+ "std": 0.052450813353061676,
96
+ "skews": 0.00352106848731637,
97
+ "kurtosis": 0.6108942031860352,
98
+ "outliers": 4,
99
+ "outlier_percent": 4.8225308641975306e-05
100
+ },
101
+ "layers.1.attention.wk.weight": {
102
+ "mean": 5.963511284790002e-05,
103
+ "var": 0.002435738919302821,
104
+ "std": 0.04935320466756821,
105
+ "skews": -0.07012113928794861,
106
+ "kurtosis": 1.8377490043640137,
107
+ "outliers": 8,
108
+ "outlier_percent": 9.645061728395061e-05
109
+ },
110
+ "layers.1.attention.wv.weight": {
111
+ "mean": 2.9321125111891888e-05,
112
+ "var": 0.0004078703641425818,
113
+ "std": 0.020195800811052322,
114
+ "skews": -0.011546331457793713,
115
+ "kurtosis": 0.0876009464263916,
116
+ "outliers": 0,
117
+ "outlier_percent": 0.0
118
+ },
119
+ "layers.1.attention.wo.weight": {
120
+ "mean": 4.54976589026046e-06,
121
+ "var": 0.0003835293173324317,
122
+ "std": 0.01958390511572361,
123
+ "skews": 0.05651681870222092,
124
+ "kurtosis": 0.7702975273132324,
125
+ "outliers": 14,
126
+ "outlier_percent": 0.00016878858024691357
127
+ },
128
+ "layers.1.feed_forward.w1.weight": {
129
+ "mean": -0.0002082372084259987,
130
+ "var": 0.001096143270842731,
131
+ "std": 0.03310805559158325,
132
+ "skews": 0.005317448638379574,
133
+ "kurtosis": 1.2885775566101074,
134
+ "outliers": 53,
135
+ "outlier_percent": 0.00023961950231481482
136
+ },
137
+ "layers.1.feed_forward.w2.weight": {
138
+ "mean": 1.3776403648080304e-05,
139
+ "var": 0.001043531228788197,
140
+ "std": 0.03230373561382294,
141
+ "skews": -0.010474198497831821,
142
+ "kurtosis": 0.4791874885559082,
143
+ "outliers": 18,
144
+ "outlier_percent": 8.138020833333333e-05
145
+ },
146
+ "layers.1.feed_forward.w3.weight": {
147
+ "mean": -4.969057499693008e-06,
148
+ "var": 0.00107937294524163,
149
+ "std": 0.03285381197929382,
150
+ "skews": 0.0024307132698595524,
151
+ "kurtosis": -0.002953767776489258,
152
+ "outliers": 0,
153
+ "outlier_percent": 0.0
154
+ },
155
+ "layers.1.attention_norm.weight": {
156
+ "mean": 0.9582674503326416,
157
+ "var": 0.003743925830349326,
158
+ "std": 0.06118762865662575,
159
+ "skews": 0.16415190696716309,
160
+ "kurtosis": 2.457639694213867,
161
+ "outliers": 0,
162
+ "outlier_percent": 0.0
163
+ },
164
+ "layers.1.ffn_norm.weight": {
165
+ "mean": 0.82928866147995,
166
+ "var": 0.0031697505619376898,
167
+ "std": 0.05630053952336311,
168
+ "skews": -4.1606831550598145,
169
+ "kurtosis": 34.9595947265625,
170
+ "outliers": 1,
171
+ "outlier_percent": 0.003472222222222222
172
+ },
173
+ "layers.2.attention.wq.weight": {
174
+ "mean": -3.796460077865049e-05,
175
+ "var": 0.002584402682259679,
176
+ "std": 0.05083702132105827,
177
+ "skews": 0.0032156507950276136,
178
+ "kurtosis": 0.6036031246185303,
179
+ "outliers": 5,
180
+ "outlier_percent": 6.028163580246913e-05
181
+ },
182
+ "layers.2.attention.wk.weight": {
183
+ "mean": -3.2622701837681234e-05,
184
+ "var": 0.0023682292085140944,
185
+ "std": 0.048664454370737076,
186
+ "skews": -0.044531866908073425,
187
+ "kurtosis": 1.5378341674804688,
188
+ "outliers": 4,
189
+ "outlier_percent": 4.8225308641975306e-05
190
+ },
191
+ "layers.2.attention.wv.weight": {
192
+ "mean": -6.883700552862138e-05,
193
+ "var": 0.0005191161762923002,
194
+ "std": 0.022784121334552765,
195
+ "skews": -0.015684092417359352,
196
+ "kurtosis": 0.05707192420959473,
197
+ "outliers": 0,
198
+ "outlier_percent": 0.0
199
+ },
200
+ "layers.2.attention.wo.weight": {
201
+ "mean": 0.00011038936645491049,
202
+ "var": 0.00049241678789258,
203
+ "std": 0.02219046652317047,
204
+ "skews": -0.01890524849295616,
205
+ "kurtosis": 0.3118934631347656,
206
+ "outliers": 4,
207
+ "outlier_percent": 4.8225308641975306e-05
208
+ },
209
+ "layers.2.feed_forward.w1.weight": {
210
+ "mean": -0.00011393482418498024,
211
+ "var": 0.0010629615280777216,
212
+ "std": 0.03260309249162674,
213
+ "skews": 0.012040580622851849,
214
+ "kurtosis": 0.15316557884216309,
215
+ "outliers": 6,
216
+ "outlier_percent": 2.712673611111111e-05
217
+ },
218
+ "layers.2.feed_forward.w2.weight": {
219
+ "mean": -3.49236506735906e-05,
220
+ "var": 0.0010735696414485574,
221
+ "std": 0.03276537358760834,
222
+ "skews": 0.0018738987855613232,
223
+ "kurtosis": 0.18375515937805176,
224
+ "outliers": 5,
225
+ "outlier_percent": 2.2605613425925925e-05
226
+ },
227
+ "layers.2.feed_forward.w3.weight": {
228
+ "mean": -6.427209154935554e-05,
229
+ "var": 0.0011157323606312275,
230
+ "std": 0.03340258076786995,
231
+ "skews": 0.005643382202833891,
232
+ "kurtosis": 0.014159917831420898,
233
+ "outliers": 1,
234
+ "outlier_percent": 4.521122685185185e-06
235
+ },
236
+ "layers.2.attention_norm.weight": {
237
+ "mean": 1.0052924156188965,
238
+ "var": 0.0030417025554925203,
239
+ "std": 0.05515163391828537,
240
+ "skews": 0.2681165039539337,
241
+ "kurtosis": 1.2076926231384277,
242
+ "outliers": 0,
243
+ "outlier_percent": 0.0
244
+ },
245
+ "layers.2.ffn_norm.weight": {
246
+ "mean": 0.9500963687896729,
247
+ "var": 0.0031354704406112432,
248
+ "std": 0.05599527060985565,
249
+ "skews": -3.9257915019989014,
250
+ "kurtosis": 35.144386291503906,
251
+ "outliers": 1,
252
+ "outlier_percent": 0.003472222222222222
253
+ },
254
+ "layers.3.attention.wq.weight": {
255
+ "mean": -0.0003145245718769729,
256
+ "var": 0.0020177229307591915,
257
+ "std": 0.04491906985640526,
258
+ "skews": 0.03000479005277157,
259
+ "kurtosis": 0.6789233684539795,
260
+ "outliers": 6,
261
+ "outlier_percent": 7.233796296296296e-05
262
+ },
263
+ "layers.3.attention.wk.weight": {
264
+ "mean": 0.0001325952762272209,
265
+ "var": 0.0018331342143937945,
266
+ "std": 0.04281511530280113,
267
+ "skews": -0.033595889806747437,
268
+ "kurtosis": 1.3161125183105469,
269
+ "outliers": 14,
270
+ "outlier_percent": 0.00016878858024691357
271
+ },
272
+ "layers.3.attention.wv.weight": {
273
+ "mean": -4.994049959350377e-05,
274
+ "var": 0.0007408911478705704,
275
+ "std": 0.027219315990805626,
276
+ "skews": 0.013677406124770641,
277
+ "kurtosis": 0.11978745460510254,
278
+ "outliers": 0,
279
+ "outlier_percent": 0.0
280
+ },
281
+ "layers.3.attention.wo.weight": {
282
+ "mean": 6.271307597671694e-07,
283
+ "var": 0.0007163779227994382,
284
+ "std": 0.026765236631035805,
285
+ "skews": 0.015188642777502537,
286
+ "kurtosis": 0.39975571632385254,
287
+ "outliers": 4,
288
+ "outlier_percent": 4.8225308641975306e-05
289
+ },
290
+ "layers.3.feed_forward.w1.weight": {
291
+ "mean": 1.6025409422582015e-05,
292
+ "var": 0.0010597744258120656,
293
+ "std": 0.032554175704717636,
294
+ "skews": 0.001135399448685348,
295
+ "kurtosis": 0.12209343910217285,
296
+ "outliers": 1,
297
+ "outlier_percent": 4.521122685185185e-06
298
+ },
299
+ "layers.3.feed_forward.w2.weight": {
300
+ "mean": 3.5237095289630815e-05,
301
+ "var": 0.0011318204924464226,
302
+ "std": 0.033642541617155075,
303
+ "skews": -0.008913942612707615,
304
+ "kurtosis": 0.3426053524017334,
305
+ "outliers": 6,
306
+ "outlier_percent": 2.712673611111111e-05
307
+ },
308
+ "layers.3.feed_forward.w3.weight": {
309
+ "mean": 1.8510358131607063e-05,
310
+ "var": 0.0011854986660182476,
311
+ "std": 0.03443107008934021,
312
+ "skews": 0.0032921642996370792,
313
+ "kurtosis": 0.02032184600830078,
314
+ "outliers": 0,
315
+ "outlier_percent": 0.0
316
+ },
317
+ "layers.3.attention_norm.weight": {
318
+ "mean": 1.1416233777999878,
319
+ "var": 0.002959593664854765,
320
+ "std": 0.054402146488428116,
321
+ "skews": 0.22000834345817566,
322
+ "kurtosis": 2.438382625579834,
323
+ "outliers": 0,
324
+ "outlier_percent": 0.0
325
+ },
326
+ "layers.3.ffn_norm.weight": {
327
+ "mean": 1.0585883855819702,
328
+ "var": 0.0033200366888195276,
329
+ "std": 0.05761975795030594,
330
+ "skews": -5.2314934730529785,
331
+ "kurtosis": 54.92455291748047,
332
+ "outliers": 1,
333
+ "outlier_percent": 0.003472222222222222
334
+ },
335
+ "layers.4.attention.wq.weight": {
336
+ "mean": -0.00025949961855076253,
337
+ "var": 0.0013903075596317649,
338
+ "std": 0.03728682920336723,
339
+ "skews": -0.010436750948429108,
340
+ "kurtosis": 0.7299864292144775,
341
+ "outliers": 4,
342
+ "outlier_percent": 4.8225308641975306e-05
343
+ },
344
+ "layers.4.attention.wk.weight": {
345
+ "mean": 0.00014466408174484968,
346
+ "var": 0.0013631607871502638,
347
+ "std": 0.03692100569605827,
348
+ "skews": -0.011070731095969677,
349
+ "kurtosis": 1.4433116912841797,
350
+ "outliers": 9,
351
+ "outlier_percent": 0.00010850694444444444
352
+ },
353
+ "layers.4.attention.wv.weight": {
354
+ "mean": -7.966956763993949e-05,
355
+ "var": 0.0011041760444641113,
356
+ "std": 0.033229146152734756,
357
+ "skews": -0.011746554635465145,
358
+ "kurtosis": 0.19126129150390625,
359
+ "outliers": 0,
360
+ "outlier_percent": 0.0
361
+ },
362
+ "layers.4.attention.wo.weight": {
363
+ "mean": 1.5019092643342447e-05,
364
+ "var": 0.0011383997043594718,
365
+ "std": 0.03374017775058746,
366
+ "skews": -0.004771851468831301,
367
+ "kurtosis": 0.26372838020324707,
368
+ "outliers": 0,
369
+ "outlier_percent": 0.0
370
+ },
371
+ "layers.4.feed_forward.w1.weight": {
372
+ "mean": 0.00011195617844350636,
373
+ "var": 0.0010532090673223138,
374
+ "std": 0.032453183084726334,
375
+ "skews": 0.00047589949099346995,
376
+ "kurtosis": 0.5533373355865479,
377
+ "outliers": 32,
378
+ "outlier_percent": 0.00014467592592592592
379
+ },
380
+ "layers.4.feed_forward.w2.weight": {
381
+ "mean": 5.1059960242128e-05,
382
+ "var": 0.001199280726723373,
383
+ "std": 0.034630633890628815,
384
+ "skews": 0.0022110706195235252,
385
+ "kurtosis": 0.08130240440368652,
386
+ "outliers": 3,
387
+ "outlier_percent": 1.3563368055555555e-05
388
+ },
389
+ "layers.4.feed_forward.w3.weight": {
390
+ "mean": -5.813660391140729e-05,
391
+ "var": 0.0012630914570763707,
392
+ "std": 0.03553999960422516,
393
+ "skews": -0.004379538353532553,
394
+ "kurtosis": 0.0008075237274169922,
395
+ "outliers": 0,
396
+ "outlier_percent": 0.0
397
+ },
398
+ "layers.4.attention_norm.weight": {
399
+ "mean": 1.4452831745147705,
400
+ "var": 0.0042164139449596405,
401
+ "std": 0.06493391841650009,
402
+ "skews": -1.056495189666748,
403
+ "kurtosis": 5.870485305786133,
404
+ "outliers": 1,
405
+ "outlier_percent": 0.003472222222222222
406
+ },
407
+ "layers.4.ffn_norm.weight": {
408
+ "mean": 1.2039875984191895,
409
+ "var": 0.003980363253504038,
410
+ "std": 0.06309012323617935,
411
+ "skews": -4.837477207183838,
412
+ "kurtosis": 49.78935241699219,
413
+ "outliers": 1,
414
+ "outlier_percent": 0.003472222222222222
415
+ },
416
+ "layers.5.attention.wq.weight": {
417
+ "mean": 8.131389040499926e-05,
418
+ "var": 0.001353550935164094,
419
+ "std": 0.0367906354367733,
420
+ "skews": -0.017600344493985176,
421
+ "kurtosis": 0.852405309677124,
422
+ "outliers": 7,
423
+ "outlier_percent": 8.439429012345679e-05
424
+ },
425
+ "layers.5.attention.wk.weight": {
426
+ "mean": 3.1251300242729485e-05,
427
+ "var": 0.0013346867635846138,
428
+ "std": 0.03653336688876152,
429
+ "skews": -0.03897131234407425,
430
+ "kurtosis": 1.5126566886901855,
431
+ "outliers": 7,
432
+ "outlier_percent": 8.439429012345679e-05
433
+ },
434
+ "layers.5.attention.wv.weight": {
435
+ "mean": -5.62375171284657e-05,
436
+ "var": 0.0012745895655825734,
437
+ "std": 0.03570139408111572,
438
+ "skews": 0.0008129411144182086,
439
+ "kurtosis": 0.4274141788482666,
440
+ "outliers": 0,
441
+ "outlier_percent": 0.0
442
+ },
443
+ "layers.5.attention.wo.weight": {
444
+ "mean": -1.0276995453750715e-05,
445
+ "var": 0.0012555326102301478,
446
+ "std": 0.03543349727988243,
447
+ "skews": 0.0017857198836281896,
448
+ "kurtosis": 0.5696825981140137,
449
+ "outliers": 2,
450
+ "outlier_percent": 2.4112654320987653e-05
451
+ },
452
+ "layers.5.feed_forward.w1.weight": {
453
+ "mean": 0.00023112430062610656,
454
+ "var": 0.0010687459725886583,
455
+ "std": 0.032691679894924164,
456
+ "skews": -0.004319785162806511,
457
+ "kurtosis": 0.11712980270385742,
458
+ "outliers": 2,
459
+ "outlier_percent": 9.04224537037037e-06
460
+ },
461
+ "layers.5.feed_forward.w2.weight": {
462
+ "mean": 6.056282927602297e-06,
463
+ "var": 0.0011791514698415995,
464
+ "std": 0.034338776022195816,
465
+ "skews": 0.0026711937971413136,
466
+ "kurtosis": 0.3927266597747803,
467
+ "outliers": 17,
468
+ "outlier_percent": 7.685908564814815e-05
469
+ },
470
+ "layers.5.feed_forward.w3.weight": {
471
+ "mean": -4.802293187822215e-05,
472
+ "var": 0.0012868741760030389,
473
+ "std": 0.03587302938103676,
474
+ "skews": -0.004913518205285072,
475
+ "kurtosis": 0.09062385559082031,
476
+ "outliers": 1,
477
+ "outlier_percent": 4.521122685185185e-06
478
+ },
479
+ "layers.5.attention_norm.weight": {
480
+ "mean": 1.4747096300125122,
481
+ "var": 0.004917146638035774,
482
+ "std": 0.0701223686337471,
483
+ "skews": -0.6923183798789978,
484
+ "kurtosis": 2.5739073753356934,
485
+ "outliers": 0,
486
+ "outlier_percent": 0.0
487
+ },
488
+ "layers.5.ffn_norm.weight": {
489
+ "mean": 1.330501914024353,
490
+ "var": 0.0026364095974713564,
491
+ "std": 0.05134597793221474,
492
+ "skews": -1.7850385904312134,
493
+ "kurtosis": 12.851001739501953,
494
+ "outliers": 1,
495
+ "outlier_percent": 0.003472222222222222
496
+ },
497
+ "norm.weight": {
498
+ "mean": 4.583681106567383,
499
+ "var": 0.10091519355773926,
500
+ "std": 0.3176715075969696,
501
+ "skews": 0.33317092061042786,
502
+ "kurtosis": 7.595536231994629,
503
+ "outliers": 0,
504
+ "outlier_percent": 0.0
505
+ }
506
+ }
softmax1-15m-2023_08_22_03_16_17/ckpt.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff2c0188edf23095abc4142b46e5ab62a4438156bd58837f1aad284338db7124
3
+ size 60784783
softmax1-15m-2023_08_22_03_16_17/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "out_dir": "out",
3
+ "eval_interval": 2000,
4
+ "log_interval": 1,
5
+ "eval_iters": 100,
6
+ "eval_only": false,
7
+ "always_save_checkpoint": true,
8
+ "init_from": "scratch",
9
+ "wandb_log": true,
10
+ "wandb_project": "softmax1-tinystories",
11
+ "wandb_run_name": "run2023_08_22_03_16_17",
12
+ "batch_size": 96,
13
+ "max_seq_len": 256,
14
+ "vocab_source": "llama2",
15
+ "vocab_size": 32000,
16
+ "dim": 288,
17
+ "n_layers": 6,
18
+ "n_heads": 6,
19
+ "n_kv_heads": 6,
20
+ "multiple_of": 32,
21
+ "dropout": 0.0,
22
+ "gradient_accumulation_steps": 4,
23
+ "learning_rate": 0.0005,
24
+ "max_iters": 100000,
25
+ "weight_decay": 0.1,
26
+ "beta1": 0.9,
27
+ "beta2": 0.95,
28
+ "grad_clip": 1.0,
29
+ "decay_lr": true,
30
+ "warmup_iters": 1000,
31
+ "device": "cuda",
32
+ "dtype": "float16",
33
+ "compile": true,
34
+ "softmax1": true
35
+ }
softmax1-15m-2023_08_22_03_16_17/model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b8fdf4e3ebeb36f1f4ad74d2c3b63c328a59acbff314271100b013b1e6b4b67
3
+ size 60816028
softmax1-15m-2023_08_22_03_16_17/weights.json ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tok_embeddings.weight": {
3
+ "mean": -0.0030575105920434,
4
+ "var": 0.008043774403631687,
5
+ "std": 0.08968709409236908,
6
+ "skews": -0.8338078260421753,
7
+ "kurtosis": 18.851457595825195,
8
+ "outliers": 44748,
9
+ "outlier_percent": 0.00485546875
10
+ },
11
+ "layers.0.attention.wq.weight": {
12
+ "mean": 0.00035618283436633646,
13
+ "var": 0.004454279784113169,
14
+ "std": 0.06674039363861084,
15
+ "skews": 0.010199151001870632,
16
+ "kurtosis": 1.0952239036560059,
17
+ "outliers": 9,
18
+ "outlier_percent": 0.00010850694444444444
19
+ },
20
+ "layers.0.attention.wk.weight": {
21
+ "mean": 0.0002400689263595268,
22
+ "var": 0.0038942936807870865,
23
+ "std": 0.06240427494049072,
24
+ "skews": 0.04288599640130997,
25
+ "kurtosis": 1.2222018241882324,
26
+ "outliers": 8,
27
+ "outlier_percent": 9.645061728395061e-05
28
+ },
29
+ "layers.0.attention.wv.weight": {
30
+ "mean": 4.090589936822653e-05,
31
+ "var": 0.00035867199767380953,
32
+ "std": 0.018938638269901276,
33
+ "skews": -0.006655456963926554,
34
+ "kurtosis": 0.36399269104003906,
35
+ "outliers": 0,
36
+ "outlier_percent": 0.0
37
+ },
38
+ "layers.0.attention.wo.weight": {
39
+ "mean": -6.33133968221955e-05,
40
+ "var": 0.00037953953142277896,
41
+ "std": 0.019481774419546127,
42
+ "skews": -0.009102443233132362,
43
+ "kurtosis": 13.020711898803711,
44
+ "outliers": 18,
45
+ "outlier_percent": 0.00021701388888888888
46
+ },
47
+ "layers.0.feed_forward.w1.weight": {
48
+ "mean": -8.492724009556696e-05,
49
+ "var": 0.0016241124831140041,
50
+ "std": 0.04030027985572815,
51
+ "skews": 0.005218073725700378,
52
+ "kurtosis": -0.021692514419555664,
53
+ "outliers": 0,
54
+ "outlier_percent": 0.0
55
+ },
56
+ "layers.0.feed_forward.w2.weight": {
57
+ "mean": -6.97666619089432e-05,
58
+ "var": 0.001631619525142014,
59
+ "std": 0.04039331153035164,
60
+ "skews": 0.004510404542088509,
61
+ "kurtosis": 1.0991830825805664,
62
+ "outliers": 50,
63
+ "outlier_percent": 0.00022605613425925926
64
+ },
65
+ "layers.0.feed_forward.w3.weight": {
66
+ "mean": -6.370133633026853e-05,
67
+ "var": 0.001632346655242145,
68
+ "std": 0.04040231183171272,
69
+ "skews": -0.0011075332295149565,
70
+ "kurtosis": 0.010745048522949219,
71
+ "outliers": 0,
72
+ "outlier_percent": 0.0
73
+ },
74
+ "layers.0.attention_norm.weight": {
75
+ "mean": 0.5556919574737549,
76
+ "var": 0.012906110845506191,
77
+ "std": 0.1136050671339035,
78
+ "skews": 1.3937958478927612,
79
+ "kurtosis": 2.516613483428955,
80
+ "outliers": 0,
81
+ "outlier_percent": 0.0
82
+ },
83
+ "layers.0.ffn_norm.weight": {
84
+ "mean": 0.6750139594078064,
85
+ "var": 0.0032933931797742844,
86
+ "std": 0.05738809332251549,
87
+ "skews": -4.392051696777344,
88
+ "kurtosis": 42.291107177734375,
89
+ "outliers": 1,
90
+ "outlier_percent": 0.003472222222222222
91
+ },
92
+ "layers.1.attention.wq.weight": {
93
+ "mean": -5.454247457237216e-06,
94
+ "var": 0.0035072374157607555,
95
+ "std": 0.05922193452715874,
96
+ "skews": -0.014804880134761333,
97
+ "kurtosis": 0.5731871128082275,
98
+ "outliers": 3,
99
+ "outlier_percent": 3.616898148148148e-05
100
+ },
101
+ "layers.1.attention.wk.weight": {
102
+ "mean": -0.00011433610052336007,
103
+ "var": 0.0029967124573886395,
104
+ "std": 0.0547422356903553,
105
+ "skews": -0.06437145173549652,
106
+ "kurtosis": 1.6788911819458008,
107
+ "outliers": 13,
108
+ "outlier_percent": 0.00015673225308641974
109
+ },
110
+ "layers.1.attention.wv.weight": {
111
+ "mean": 0.00011750552948797122,
112
+ "var": 0.0007435075822286308,
113
+ "std": 0.0272673349827528,
114
+ "skews": 0.012133199721574783,
115
+ "kurtosis": 0.07190084457397461,
116
+ "outliers": 0,
117
+ "outlier_percent": 0.0
118
+ },
119
+ "layers.1.attention.wo.weight": {
120
+ "mean": 2.586756818345748e-05,
121
+ "var": 0.0007432072889059782,
122
+ "std": 0.027261829003691673,
123
+ "skews": 0.00605379045009613,
124
+ "kurtosis": 0.41370391845703125,
125
+ "outliers": 8,
126
+ "outlier_percent": 9.645061728395061e-05
127
+ },
128
+ "layers.1.feed_forward.w1.weight": {
129
+ "mean": -3.618240953073837e-05,
130
+ "var": 0.001606133533641696,
131
+ "std": 0.040076594799757004,
132
+ "skews": -0.0033451258204877377,
133
+ "kurtosis": 0.13593506813049316,
134
+ "outliers": 5,
135
+ "outlier_percent": 2.2605613425925925e-05
136
+ },
137
+ "layers.1.feed_forward.w2.weight": {
138
+ "mean": -5.651494939229451e-05,
139
+ "var": 0.0016629855381324887,
140
+ "std": 0.0407797209918499,
141
+ "skews": -0.0027265246026217937,
142
+ "kurtosis": 0.3058323860168457,
143
+ "outliers": 13,
144
+ "outlier_percent": 5.8774594907407404e-05
145
+ },
146
+ "layers.1.feed_forward.w3.weight": {
147
+ "mean": 5.0682170694926754e-05,
148
+ "var": 0.0016612057806923985,
149
+ "std": 0.04075789079070091,
150
+ "skews": 0.003567819483578205,
151
+ "kurtosis": -0.02021002769470215,
152
+ "outliers": 0,
153
+ "outlier_percent": 0.0
154
+ },
155
+ "layers.1.attention_norm.weight": {
156
+ "mean": 0.8679685592651367,
157
+ "var": 0.002409706823527813,
158
+ "std": 0.04908876493573189,
159
+ "skews": -0.0007233686046674848,
160
+ "kurtosis": 0.4187917709350586,
161
+ "outliers": 0,
162
+ "outlier_percent": 0.0
163
+ },
164
+ "layers.1.ffn_norm.weight": {
165
+ "mean": 0.7828017473220825,
166
+ "var": 0.0038013842422515154,
167
+ "std": 0.06165536493062973,
168
+ "skews": -4.157523155212402,
169
+ "kurtosis": 32.78773498535156,
170
+ "outliers": 1,
171
+ "outlier_percent": 0.003472222222222222
172
+ },
173
+ "layers.2.attention.wq.weight": {
174
+ "mean": 5.771218638983555e-05,
175
+ "var": 0.0031918222084641457,
176
+ "std": 0.056496214121580124,
177
+ "skews": 0.006333012133836746,
178
+ "kurtosis": 0.5441603660583496,
179
+ "outliers": 3,
180
+ "outlier_percent": 3.616898148148148e-05
181
+ },
182
+ "layers.2.attention.wk.weight": {
183
+ "mean": -0.00023540656547993422,
184
+ "var": 0.002815589774399996,
185
+ "std": 0.05306212976574898,
186
+ "skews": -0.049821335822343826,
187
+ "kurtosis": 1.6562185287475586,
188
+ "outliers": 9,
189
+ "outlier_percent": 0.00010850694444444444
190
+ },
191
+ "layers.2.attention.wv.weight": {
192
+ "mean": 2.4973463951027952e-05,
193
+ "var": 0.0009142293711192906,
194
+ "std": 0.030236225575208664,
195
+ "skews": -0.007988173514604568,
196
+ "kurtosis": 0.07765412330627441,
197
+ "outliers": 0,
198
+ "outlier_percent": 0.0
199
+ },
200
+ "layers.2.attention.wo.weight": {
201
+ "mean": -6.22320658294484e-06,
202
+ "var": 0.0009230568539351225,
203
+ "std": 0.03038185089826584,
204
+ "skews": 0.000765857519581914,
205
+ "kurtosis": 0.08483433723449707,
206
+ "outliers": 0,
207
+ "outlier_percent": 0.0
208
+ },
209
+ "layers.2.feed_forward.w1.weight": {
210
+ "mean": -6.062598549760878e-05,
211
+ "var": 0.0015908096684142947,
212
+ "std": 0.03988495469093323,
213
+ "skews": 0.008102444000542164,
214
+ "kurtosis": 0.0035903453826904297,
215
+ "outliers": 0,
216
+ "outlier_percent": 0.0
217
+ },
218
+ "layers.2.feed_forward.w2.weight": {
219
+ "mean": 4.470473504625261e-05,
220
+ "var": 0.0017242009053006768,
221
+ "std": 0.04152349755167961,
222
+ "skews": 0.0011632241075858474,
223
+ "kurtosis": 0.06214785575866699,
224
+ "outliers": 1,
225
+ "outlier_percent": 4.521122685185185e-06
226
+ },
227
+ "layers.2.feed_forward.w3.weight": {
228
+ "mean": -2.1270183424348943e-05,
229
+ "var": 0.0017243118491023779,
230
+ "std": 0.04152483493089676,
231
+ "skews": -0.005408828612416983,
232
+ "kurtosis": 0.012958765029907227,
233
+ "outliers": 0,
234
+ "outlier_percent": 0.0
235
+ },
236
+ "layers.2.attention_norm.weight": {
237
+ "mean": 0.9104444980621338,
238
+ "var": 0.0019502700306475163,
239
+ "std": 0.04416185989975929,
240
+ "skews": 0.6367732286453247,
241
+ "kurtosis": 0.9952096939086914,
242
+ "outliers": 0,
243
+ "outlier_percent": 0.0
244
+ },
245
+ "layers.2.ffn_norm.weight": {
246
+ "mean": 0.88186115026474,
247
+ "var": 0.0035577910020947456,
248
+ "std": 0.059647221118211746,
249
+ "skews": -3.8068487644195557,
250
+ "kurtosis": 32.94072723388672,
251
+ "outliers": 1,
252
+ "outlier_percent": 0.003472222222222222
253
+ },
254
+ "layers.3.attention.wq.weight": {
255
+ "mean": -3.916208697773982e-06,
256
+ "var": 0.002655748976394534,
257
+ "std": 0.051533959805965424,
258
+ "skews": 0.0036733371671289206,
259
+ "kurtosis": 0.4783966541290283,
260
+ "outliers": 3,
261
+ "outlier_percent": 3.616898148148148e-05
262
+ },
263
+ "layers.3.attention.wk.weight": {
264
+ "mean": 0.00031962458160705864,
265
+ "var": 0.0023793901782482862,
266
+ "std": 0.048778992146253586,
267
+ "skews": -0.014764788560569286,
268
+ "kurtosis": 1.5564613342285156,
269
+ "outliers": 13,
270
+ "outlier_percent": 0.00015673225308641974
271
+ },
272
+ "layers.3.attention.wv.weight": {
273
+ "mean": -4.206255835015327e-05,
274
+ "var": 0.0012175313895568252,
275
+ "std": 0.03489314392209053,
276
+ "skews": -0.004365447908639908,
277
+ "kurtosis": 0.09970426559448242,
278
+ "outliers": 0,
279
+ "outlier_percent": 0.0
280
+ },
281
+ "layers.3.attention.wo.weight": {
282
+ "mean": 0.000249701552093029,
283
+ "var": 0.0012641714420169592,
284
+ "std": 0.03555518761277199,
285
+ "skews": 0.00975084025412798,
286
+ "kurtosis": 0.09047365188598633,
287
+ "outliers": 0,
288
+ "outlier_percent": 0.0
289
+ },
290
+ "layers.3.feed_forward.w1.weight": {
291
+ "mean": 0.00019631479517556727,
292
+ "var": 0.0015764598501846194,
293
+ "std": 0.03970465809106827,
294
+ "skews": -0.00013909985136706382,
295
+ "kurtosis": 0.029094934463500977,
296
+ "outliers": 0,
297
+ "outlier_percent": 0.0
298
+ },
299
+ "layers.3.feed_forward.w2.weight": {
300
+ "mean": -7.194675708888099e-05,
301
+ "var": 0.0017914645140990615,
302
+ "std": 0.0423256941139698,
303
+ "skews": -0.010688533075153828,
304
+ "kurtosis": 0.10126662254333496,
305
+ "outliers": 2,
306
+ "outlier_percent": 9.04224537037037e-06
307
+ },
308
+ "layers.3.feed_forward.w3.weight": {
309
+ "mean": -7.225230547192041e-06,
310
+ "var": 0.0017912057228386402,
311
+ "std": 0.042322639375925064,
312
+ "skews": 0.0029504415579140186,
313
+ "kurtosis": 0.02350449562072754,
314
+ "outliers": 0,
315
+ "outlier_percent": 0.0
316
+ },
317
+ "layers.3.attention_norm.weight": {
318
+ "mean": 0.9992889761924744,
319
+ "var": 0.001841739285737276,
320
+ "std": 0.04291548952460289,
321
+ "skews": 0.28296077251434326,
322
+ "kurtosis": 0.21933650970458984,
323
+ "outliers": 0,
324
+ "outlier_percent": 0.0
325
+ },
326
+ "layers.3.ffn_norm.weight": {
327
+ "mean": 0.956415057182312,
328
+ "var": 0.002891228999942541,
329
+ "std": 0.05377015098929405,
330
+ "skews": -4.3508172035217285,
331
+ "kurtosis": 41.48064041137695,
332
+ "outliers": 1,
333
+ "outlier_percent": 0.003472222222222222
334
+ },
335
+ "layers.4.attention.wq.weight": {
336
+ "mean": -0.0002995177055709064,
337
+ "var": 0.001933008199557662,
338
+ "std": 0.043965987861156464,
339
+ "skews": -0.013897748664021492,
340
+ "kurtosis": 0.6071853637695312,
341
+ "outliers": 3,
342
+ "outlier_percent": 3.616898148148148e-05
343
+ },
344
+ "layers.4.attention.wk.weight": {
345
+ "mean": 0.0002864231646526605,
346
+ "var": 0.0018339046509936452,
347
+ "std": 0.04282411187887192,
348
+ "skews": -0.010061254724860191,
349
+ "kurtosis": 1.4288434982299805,
350
+ "outliers": 11,
351
+ "outlier_percent": 0.0001326195987654321
352
+ },
353
+ "layers.4.attention.wv.weight": {
354
+ "mean": 0.00010028992255683988,
355
+ "var": 0.0018044327152892947,
356
+ "std": 0.042478613555431366,
357
+ "skews": 0.004125483334064484,
358
+ "kurtosis": 0.32813596725463867,
359
+ "outliers": 0,
360
+ "outlier_percent": 0.0
361
+ },
362
+ "layers.4.attention.wo.weight": {
363
+ "mean": 1.004967725748429e-05,
364
+ "var": 0.0020177457481622696,
365
+ "std": 0.044919323176145554,
366
+ "skews": -0.0013239302206784487,
367
+ "kurtosis": 0.3486945629119873,
368
+ "outliers": 0,
369
+ "outlier_percent": 0.0
370
+ },
371
+ "layers.4.feed_forward.w1.weight": {
372
+ "mean": 2.9631637517013587e-05,
373
+ "var": 0.0015774235362187028,
374
+ "std": 0.03971679136157036,
375
+ "skews": 0.004260245710611343,
376
+ "kurtosis": 0.011400461196899414,
377
+ "outliers": 0,
378
+ "outlier_percent": 0.0
379
+ },
380
+ "layers.4.feed_forward.w2.weight": {
381
+ "mean": -5.6489670896553434e-06,
382
+ "var": 0.0018962562317028642,
383
+ "std": 0.04354602470993996,
384
+ "skews": 0.00325192348100245,
385
+ "kurtosis": 0.03762507438659668,
386
+ "outliers": 0,
387
+ "outlier_percent": 0.0
388
+ },
389
+ "layers.4.feed_forward.w3.weight": {
390
+ "mean": -6.117334123700857e-05,
391
+ "var": 0.0019177846843376756,
392
+ "std": 0.043792519718408585,
393
+ "skews": 0.0020223369356244802,
394
+ "kurtosis": 3.62396240234375e-05,
395
+ "outliers": 0,
396
+ "outlier_percent": 0.0
397
+ },
398
+ "layers.4.attention_norm.weight": {
399
+ "mean": 1.2174360752105713,
400
+ "var": 0.0019354402320459485,
401
+ "std": 0.04399363696575165,
402
+ "skews": -0.6009857654571533,
403
+ "kurtosis": 1.8265156745910645,
404
+ "outliers": 0,
405
+ "outlier_percent": 0.0
406
+ },
407
+ "layers.4.ffn_norm.weight": {
408
+ "mean": 1.0664386749267578,
409
+ "var": 0.0028373233508318663,
410
+ "std": 0.053266532719135284,
411
+ "skews": -3.981571674346924,
412
+ "kurtosis": 35.60630416870117,
413
+ "outliers": 1,
414
+ "outlier_percent": 0.003472222222222222
415
+ },
416
+ "layers.5.attention.wq.weight": {
417
+ "mean": 0.0002450960164424032,
418
+ "var": 0.0019403173355385661,
419
+ "std": 0.04404903203248978,
420
+ "skews": -0.0036919021513313055,
421
+ "kurtosis": 0.747020959854126,
422
+ "outliers": 3,
423
+ "outlier_percent": 3.616898148148148e-05
424
+ },
425
+ "layers.5.attention.wk.weight": {
426
+ "mean": 0.0002591797092463821,
427
+ "var": 0.0018460049759596586,
428
+ "std": 0.04296515882015228,
429
+ "skews": -0.02168109081685543,
430
+ "kurtosis": 1.5742673873901367,
431
+ "outliers": 13,
432
+ "outlier_percent": 0.00015673225308641974
433
+ },
434
+ "layers.5.attention.wv.weight": {
435
+ "mean": -6.999686593189836e-05,
436
+ "var": 0.0019269806798547506,
437
+ "std": 0.04389738664031029,
438
+ "skews": 0.011553842574357986,
439
+ "kurtosis": 0.36922311782836914,
440
+ "outliers": 1,
441
+ "outlier_percent": 1.2056327160493826e-05
442
+ },
443
+ "layers.5.attention.wo.weight": {
444
+ "mean": 7.272008224390447e-05,
445
+ "var": 0.00202196859754622,
446
+ "std": 0.044966306537389755,
447
+ "skews": 0.008191176690161228,
448
+ "kurtosis": 0.34751224517822266,
449
+ "outliers": 0,
450
+ "outlier_percent": 0.0
451
+ },
452
+ "layers.5.feed_forward.w1.weight": {
453
+ "mean": 0.0001724090107018128,
454
+ "var": 0.0015984694473445415,
455
+ "std": 0.03998086228966713,
456
+ "skews": -0.004892218858003616,
457
+ "kurtosis": 0.030857563018798828,
458
+ "outliers": 0,
459
+ "outlier_percent": 0.0
460
+ },
461
+ "layers.5.feed_forward.w2.weight": {
462
+ "mean": -5.098358542454662e-06,
463
+ "var": 0.0018277923809364438,
464
+ "std": 0.04275268688797951,
465
+ "skews": -0.00011612088565016165,
466
+ "kurtosis": 0.23499655723571777,
467
+ "outliers": 5,
468
+ "outlier_percent": 2.2605613425925925e-05
469
+ },
470
+ "layers.5.feed_forward.w3.weight": {
471
+ "mean": 1.0925466085609514e-05,
472
+ "var": 0.0018982859328389168,
473
+ "std": 0.04356932267546654,
474
+ "skews": 0.0032867027912288904,
475
+ "kurtosis": 0.05734658241271973,
476
+ "outliers": 0,
477
+ "outlier_percent": 0.0
478
+ },
479
+ "layers.5.attention_norm.weight": {
480
+ "mean": 1.2220523357391357,
481
+ "var": 0.002029956318438053,
482
+ "std": 0.04505503550171852,
483
+ "skews": 0.05655462294816971,
484
+ "kurtosis": 0.417694091796875,
485
+ "outliers": 0,
486
+ "outlier_percent": 0.0
487
+ },
488
+ "layers.5.ffn_norm.weight": {
489
+ "mean": 1.159726619720459,
490
+ "var": 0.0014754422008991241,
491
+ "std": 0.03841148689389229,
492
+ "skews": -1.1254332065582275,
493
+ "kurtosis": 5.061956405639648,
494
+ "outliers": 1,
495
+ "outlier_percent": 0.003472222222222222
496
+ },
497
+ "norm.weight": {
498
+ "mean": 2.7584409713745117,
499
+ "var": 0.02132660150527954,
500
+ "std": 0.146036297082901,
501
+ "skews": -1.0050147771835327,
502
+ "kurtosis": 3.220973491668701,
503
+ "outliers": 0,
504
+ "outlier_percent": 0.0
505
+ }
506
+ }