15M model
Browse files- .gitignore +1 -0
- softmax0-15m-2023_08_22_14_44_50/ckpt.pt +3 -0
- softmax0-15m-2023_08_22_14_44_50/config.json +35 -0
- softmax0-15m-2023_08_22_14_44_50/model.bin +3 -0
- softmax0-15m-2023_08_22_14_44_50/weights.json +506 -0
- softmax1-15m-2023_08_22_03_16_17/ckpt.pt +3 -0
- softmax1-15m-2023_08_22_03_16_17/config.json +35 -0
- softmax1-15m-2023_08_22_03_16_17/model.bin +3 -0
- softmax1-15m-2023_08_22_03_16_17/weights.json +506 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
images/
|
softmax0-15m-2023_08_22_14_44_50/ckpt.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:702532f11d9611cfca7bb19fa9071774f4772db44057a90e3e573d06f243cda3
|
3 |
+
size 182363350
|
softmax0-15m-2023_08_22_14_44_50/config.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"out_dir": "out",
|
3 |
+
"eval_interval": 2000,
|
4 |
+
"log_interval": 1,
|
5 |
+
"eval_iters": 100,
|
6 |
+
"eval_only": false,
|
7 |
+
"always_save_checkpoint": true,
|
8 |
+
"init_from": "scratch",
|
9 |
+
"wandb_log": true,
|
10 |
+
"wandb_project": "softmax1-tinystories",
|
11 |
+
"wandb_run_name": "softmax0-15m-2023_08_22_14_44_50",
|
12 |
+
"batch_size": 96,
|
13 |
+
"max_seq_len": 256,
|
14 |
+
"vocab_source": "llama2",
|
15 |
+
"vocab_size": 32000,
|
16 |
+
"dim": 288,
|
17 |
+
"n_layers": 6,
|
18 |
+
"n_heads": 6,
|
19 |
+
"n_kv_heads": 6,
|
20 |
+
"multiple_of": 32,
|
21 |
+
"dropout": 0.0,
|
22 |
+
"gradient_accumulation_steps": 4,
|
23 |
+
"learning_rate": 0.0005,
|
24 |
+
"max_iters": 100000,
|
25 |
+
"weight_decay": 0.1,
|
26 |
+
"beta1": 0.9,
|
27 |
+
"beta2": 0.95,
|
28 |
+
"grad_clip": 1.0,
|
29 |
+
"decay_lr": true,
|
30 |
+
"warmup_iters": 1000,
|
31 |
+
"device": "cuda",
|
32 |
+
"dtype": "float16",
|
33 |
+
"compile": true,
|
34 |
+
"softmax1": false
|
35 |
+
}
|
softmax0-15m-2023_08_22_14_44_50/model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c6f0df381bcc8058c8ff6ec48864df2c9eb61b3b926f22e87569c9badec0e0a9
|
3 |
+
size 60816028
|
softmax0-15m-2023_08_22_14_44_50/weights.json
ADDED
@@ -0,0 +1,506 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"tok_embeddings.weight": {
|
3 |
+
"mean": -0.0016343051102012396,
|
4 |
+
"var": 0.0038931926246732473,
|
5 |
+
"std": 0.062395453453063965,
|
6 |
+
"skews": -2.7351033687591553,
|
7 |
+
"kurtosis": 29.507484436035156,
|
8 |
+
"outliers": 23467,
|
9 |
+
"outlier_percent": 0.002546332465277778
|
10 |
+
},
|
11 |
+
"layers.0.attention.wq.weight": {
|
12 |
+
"mean": -0.00015833647921681404,
|
13 |
+
"var": 0.005146522540599108,
|
14 |
+
"std": 0.07173927128314972,
|
15 |
+
"skews": -0.0028434402775019407,
|
16 |
+
"kurtosis": 1.6188158988952637,
|
17 |
+
"outliers": 23,
|
18 |
+
"outlier_percent": 0.000277295524691358
|
19 |
+
},
|
20 |
+
"layers.0.attention.wk.weight": {
|
21 |
+
"mean": 6.526386641780846e-06,
|
22 |
+
"var": 0.004242290742695332,
|
23 |
+
"std": 0.06513287127017975,
|
24 |
+
"skews": 0.021585451439023018,
|
25 |
+
"kurtosis": 1.1567010879516602,
|
26 |
+
"outliers": 9,
|
27 |
+
"outlier_percent": 0.00010850694444444444
|
28 |
+
},
|
29 |
+
"layers.0.attention.wv.weight": {
|
30 |
+
"mean": -4.457616887520999e-05,
|
31 |
+
"var": 0.00026332412380725145,
|
32 |
+
"std": 0.01622726395726204,
|
33 |
+
"skews": -0.013842478394508362,
|
34 |
+
"kurtosis": 0.297649621963501,
|
35 |
+
"outliers": 0,
|
36 |
+
"outlier_percent": 0.0
|
37 |
+
},
|
38 |
+
"layers.0.attention.wo.weight": {
|
39 |
+
"mean": -7.100501306922524e-07,
|
40 |
+
"var": 0.00024368301092181355,
|
41 |
+
"std": 0.015610349364578724,
|
42 |
+
"skews": 0.10141679644584656,
|
43 |
+
"kurtosis": 11.977744102478027,
|
44 |
+
"outliers": 16,
|
45 |
+
"outlier_percent": 0.00019290123456790122
|
46 |
+
},
|
47 |
+
"layers.0.feed_forward.w1.weight": {
|
48 |
+
"mean": -2.4933683562267106e-06,
|
49 |
+
"var": 0.0010707820765674114,
|
50 |
+
"std": 0.03272280842065811,
|
51 |
+
"skews": 0.006910220254212618,
|
52 |
+
"kurtosis": -0.0008280277252197266,
|
53 |
+
"outliers": 0,
|
54 |
+
"outlier_percent": 0.0
|
55 |
+
},
|
56 |
+
"layers.0.feed_forward.w2.weight": {
|
57 |
+
"mean": -4.803382034879178e-05,
|
58 |
+
"var": 0.0010309461504220963,
|
59 |
+
"std": 0.03210835158824921,
|
60 |
+
"skews": 0.03226399049162865,
|
61 |
+
"kurtosis": 1.2938251495361328,
|
62 |
+
"outliers": 47,
|
63 |
+
"outlier_percent": 0.0002124927662037037
|
64 |
+
},
|
65 |
+
"layers.0.feed_forward.w3.weight": {
|
66 |
+
"mean": -1.1704322787409183e-05,
|
67 |
+
"var": 0.0010558386566117406,
|
68 |
+
"std": 0.03249366953969002,
|
69 |
+
"skews": -0.0073517016135156155,
|
70 |
+
"kurtosis": 0.0046155452728271484,
|
71 |
+
"outliers": 0,
|
72 |
+
"outlier_percent": 0.0
|
73 |
+
},
|
74 |
+
"layers.0.attention_norm.weight": {
|
75 |
+
"mean": 0.5683733820915222,
|
76 |
+
"var": 0.014715575613081455,
|
77 |
+
"std": 0.12130777537822723,
|
78 |
+
"skews": 2.2385318279266357,
|
79 |
+
"kurtosis": 8.328707695007324,
|
80 |
+
"outliers": 1,
|
81 |
+
"outlier_percent": 0.003472222222222222
|
82 |
+
},
|
83 |
+
"layers.0.ffn_norm.weight": {
|
84 |
+
"mean": 0.7457764148712158,
|
85 |
+
"var": 0.003284212201833725,
|
86 |
+
"std": 0.057308048009872437,
|
87 |
+
"skews": -4.060823917388916,
|
88 |
+
"kurtosis": 42.8275260925293,
|
89 |
+
"outliers": 1,
|
90 |
+
"outlier_percent": 0.003472222222222222
|
91 |
+
},
|
92 |
+
"layers.1.attention.wq.weight": {
|
93 |
+
"mean": -4.265182724338956e-05,
|
94 |
+
"var": 0.0027510877698659897,
|
95 |
+
"std": 0.052450813353061676,
|
96 |
+
"skews": 0.00352106848731637,
|
97 |
+
"kurtosis": 0.6108942031860352,
|
98 |
+
"outliers": 4,
|
99 |
+
"outlier_percent": 4.8225308641975306e-05
|
100 |
+
},
|
101 |
+
"layers.1.attention.wk.weight": {
|
102 |
+
"mean": 5.963511284790002e-05,
|
103 |
+
"var": 0.002435738919302821,
|
104 |
+
"std": 0.04935320466756821,
|
105 |
+
"skews": -0.07012113928794861,
|
106 |
+
"kurtosis": 1.8377490043640137,
|
107 |
+
"outliers": 8,
|
108 |
+
"outlier_percent": 9.645061728395061e-05
|
109 |
+
},
|
110 |
+
"layers.1.attention.wv.weight": {
|
111 |
+
"mean": 2.9321125111891888e-05,
|
112 |
+
"var": 0.0004078703641425818,
|
113 |
+
"std": 0.020195800811052322,
|
114 |
+
"skews": -0.011546331457793713,
|
115 |
+
"kurtosis": 0.0876009464263916,
|
116 |
+
"outliers": 0,
|
117 |
+
"outlier_percent": 0.0
|
118 |
+
},
|
119 |
+
"layers.1.attention.wo.weight": {
|
120 |
+
"mean": 4.54976589026046e-06,
|
121 |
+
"var": 0.0003835293173324317,
|
122 |
+
"std": 0.01958390511572361,
|
123 |
+
"skews": 0.05651681870222092,
|
124 |
+
"kurtosis": 0.7702975273132324,
|
125 |
+
"outliers": 14,
|
126 |
+
"outlier_percent": 0.00016878858024691357
|
127 |
+
},
|
128 |
+
"layers.1.feed_forward.w1.weight": {
|
129 |
+
"mean": -0.0002082372084259987,
|
130 |
+
"var": 0.001096143270842731,
|
131 |
+
"std": 0.03310805559158325,
|
132 |
+
"skews": 0.005317448638379574,
|
133 |
+
"kurtosis": 1.2885775566101074,
|
134 |
+
"outliers": 53,
|
135 |
+
"outlier_percent": 0.00023961950231481482
|
136 |
+
},
|
137 |
+
"layers.1.feed_forward.w2.weight": {
|
138 |
+
"mean": 1.3776403648080304e-05,
|
139 |
+
"var": 0.001043531228788197,
|
140 |
+
"std": 0.03230373561382294,
|
141 |
+
"skews": -0.010474198497831821,
|
142 |
+
"kurtosis": 0.4791874885559082,
|
143 |
+
"outliers": 18,
|
144 |
+
"outlier_percent": 8.138020833333333e-05
|
145 |
+
},
|
146 |
+
"layers.1.feed_forward.w3.weight": {
|
147 |
+
"mean": -4.969057499693008e-06,
|
148 |
+
"var": 0.00107937294524163,
|
149 |
+
"std": 0.03285381197929382,
|
150 |
+
"skews": 0.0024307132698595524,
|
151 |
+
"kurtosis": -0.002953767776489258,
|
152 |
+
"outliers": 0,
|
153 |
+
"outlier_percent": 0.0
|
154 |
+
},
|
155 |
+
"layers.1.attention_norm.weight": {
|
156 |
+
"mean": 0.9582674503326416,
|
157 |
+
"var": 0.003743925830349326,
|
158 |
+
"std": 0.06118762865662575,
|
159 |
+
"skews": 0.16415190696716309,
|
160 |
+
"kurtosis": 2.457639694213867,
|
161 |
+
"outliers": 0,
|
162 |
+
"outlier_percent": 0.0
|
163 |
+
},
|
164 |
+
"layers.1.ffn_norm.weight": {
|
165 |
+
"mean": 0.82928866147995,
|
166 |
+
"var": 0.0031697505619376898,
|
167 |
+
"std": 0.05630053952336311,
|
168 |
+
"skews": -4.1606831550598145,
|
169 |
+
"kurtosis": 34.9595947265625,
|
170 |
+
"outliers": 1,
|
171 |
+
"outlier_percent": 0.003472222222222222
|
172 |
+
},
|
173 |
+
"layers.2.attention.wq.weight": {
|
174 |
+
"mean": -3.796460077865049e-05,
|
175 |
+
"var": 0.002584402682259679,
|
176 |
+
"std": 0.05083702132105827,
|
177 |
+
"skews": 0.0032156507950276136,
|
178 |
+
"kurtosis": 0.6036031246185303,
|
179 |
+
"outliers": 5,
|
180 |
+
"outlier_percent": 6.028163580246913e-05
|
181 |
+
},
|
182 |
+
"layers.2.attention.wk.weight": {
|
183 |
+
"mean": -3.2622701837681234e-05,
|
184 |
+
"var": 0.0023682292085140944,
|
185 |
+
"std": 0.048664454370737076,
|
186 |
+
"skews": -0.044531866908073425,
|
187 |
+
"kurtosis": 1.5378341674804688,
|
188 |
+
"outliers": 4,
|
189 |
+
"outlier_percent": 4.8225308641975306e-05
|
190 |
+
},
|
191 |
+
"layers.2.attention.wv.weight": {
|
192 |
+
"mean": -6.883700552862138e-05,
|
193 |
+
"var": 0.0005191161762923002,
|
194 |
+
"std": 0.022784121334552765,
|
195 |
+
"skews": -0.015684092417359352,
|
196 |
+
"kurtosis": 0.05707192420959473,
|
197 |
+
"outliers": 0,
|
198 |
+
"outlier_percent": 0.0
|
199 |
+
},
|
200 |
+
"layers.2.attention.wo.weight": {
|
201 |
+
"mean": 0.00011038936645491049,
|
202 |
+
"var": 0.00049241678789258,
|
203 |
+
"std": 0.02219046652317047,
|
204 |
+
"skews": -0.01890524849295616,
|
205 |
+
"kurtosis": 0.3118934631347656,
|
206 |
+
"outliers": 4,
|
207 |
+
"outlier_percent": 4.8225308641975306e-05
|
208 |
+
},
|
209 |
+
"layers.2.feed_forward.w1.weight": {
|
210 |
+
"mean": -0.00011393482418498024,
|
211 |
+
"var": 0.0010629615280777216,
|
212 |
+
"std": 0.03260309249162674,
|
213 |
+
"skews": 0.012040580622851849,
|
214 |
+
"kurtosis": 0.15316557884216309,
|
215 |
+
"outliers": 6,
|
216 |
+
"outlier_percent": 2.712673611111111e-05
|
217 |
+
},
|
218 |
+
"layers.2.feed_forward.w2.weight": {
|
219 |
+
"mean": -3.49236506735906e-05,
|
220 |
+
"var": 0.0010735696414485574,
|
221 |
+
"std": 0.03276537358760834,
|
222 |
+
"skews": 0.0018738987855613232,
|
223 |
+
"kurtosis": 0.18375515937805176,
|
224 |
+
"outliers": 5,
|
225 |
+
"outlier_percent": 2.2605613425925925e-05
|
226 |
+
},
|
227 |
+
"layers.2.feed_forward.w3.weight": {
|
228 |
+
"mean": -6.427209154935554e-05,
|
229 |
+
"var": 0.0011157323606312275,
|
230 |
+
"std": 0.03340258076786995,
|
231 |
+
"skews": 0.005643382202833891,
|
232 |
+
"kurtosis": 0.014159917831420898,
|
233 |
+
"outliers": 1,
|
234 |
+
"outlier_percent": 4.521122685185185e-06
|
235 |
+
},
|
236 |
+
"layers.2.attention_norm.weight": {
|
237 |
+
"mean": 1.0052924156188965,
|
238 |
+
"var": 0.0030417025554925203,
|
239 |
+
"std": 0.05515163391828537,
|
240 |
+
"skews": 0.2681165039539337,
|
241 |
+
"kurtosis": 1.2076926231384277,
|
242 |
+
"outliers": 0,
|
243 |
+
"outlier_percent": 0.0
|
244 |
+
},
|
245 |
+
"layers.2.ffn_norm.weight": {
|
246 |
+
"mean": 0.9500963687896729,
|
247 |
+
"var": 0.0031354704406112432,
|
248 |
+
"std": 0.05599527060985565,
|
249 |
+
"skews": -3.9257915019989014,
|
250 |
+
"kurtosis": 35.144386291503906,
|
251 |
+
"outliers": 1,
|
252 |
+
"outlier_percent": 0.003472222222222222
|
253 |
+
},
|
254 |
+
"layers.3.attention.wq.weight": {
|
255 |
+
"mean": -0.0003145245718769729,
|
256 |
+
"var": 0.0020177229307591915,
|
257 |
+
"std": 0.04491906985640526,
|
258 |
+
"skews": 0.03000479005277157,
|
259 |
+
"kurtosis": 0.6789233684539795,
|
260 |
+
"outliers": 6,
|
261 |
+
"outlier_percent": 7.233796296296296e-05
|
262 |
+
},
|
263 |
+
"layers.3.attention.wk.weight": {
|
264 |
+
"mean": 0.0001325952762272209,
|
265 |
+
"var": 0.0018331342143937945,
|
266 |
+
"std": 0.04281511530280113,
|
267 |
+
"skews": -0.033595889806747437,
|
268 |
+
"kurtosis": 1.3161125183105469,
|
269 |
+
"outliers": 14,
|
270 |
+
"outlier_percent": 0.00016878858024691357
|
271 |
+
},
|
272 |
+
"layers.3.attention.wv.weight": {
|
273 |
+
"mean": -4.994049959350377e-05,
|
274 |
+
"var": 0.0007408911478705704,
|
275 |
+
"std": 0.027219315990805626,
|
276 |
+
"skews": 0.013677406124770641,
|
277 |
+
"kurtosis": 0.11978745460510254,
|
278 |
+
"outliers": 0,
|
279 |
+
"outlier_percent": 0.0
|
280 |
+
},
|
281 |
+
"layers.3.attention.wo.weight": {
|
282 |
+
"mean": 6.271307597671694e-07,
|
283 |
+
"var": 0.0007163779227994382,
|
284 |
+
"std": 0.026765236631035805,
|
285 |
+
"skews": 0.015188642777502537,
|
286 |
+
"kurtosis": 0.39975571632385254,
|
287 |
+
"outliers": 4,
|
288 |
+
"outlier_percent": 4.8225308641975306e-05
|
289 |
+
},
|
290 |
+
"layers.3.feed_forward.w1.weight": {
|
291 |
+
"mean": 1.6025409422582015e-05,
|
292 |
+
"var": 0.0010597744258120656,
|
293 |
+
"std": 0.032554175704717636,
|
294 |
+
"skews": 0.001135399448685348,
|
295 |
+
"kurtosis": 0.12209343910217285,
|
296 |
+
"outliers": 1,
|
297 |
+
"outlier_percent": 4.521122685185185e-06
|
298 |
+
},
|
299 |
+
"layers.3.feed_forward.w2.weight": {
|
300 |
+
"mean": 3.5237095289630815e-05,
|
301 |
+
"var": 0.0011318204924464226,
|
302 |
+
"std": 0.033642541617155075,
|
303 |
+
"skews": -0.008913942612707615,
|
304 |
+
"kurtosis": 0.3426053524017334,
|
305 |
+
"outliers": 6,
|
306 |
+
"outlier_percent": 2.712673611111111e-05
|
307 |
+
},
|
308 |
+
"layers.3.feed_forward.w3.weight": {
|
309 |
+
"mean": 1.8510358131607063e-05,
|
310 |
+
"var": 0.0011854986660182476,
|
311 |
+
"std": 0.03443107008934021,
|
312 |
+
"skews": 0.0032921642996370792,
|
313 |
+
"kurtosis": 0.02032184600830078,
|
314 |
+
"outliers": 0,
|
315 |
+
"outlier_percent": 0.0
|
316 |
+
},
|
317 |
+
"layers.3.attention_norm.weight": {
|
318 |
+
"mean": 1.1416233777999878,
|
319 |
+
"var": 0.002959593664854765,
|
320 |
+
"std": 0.054402146488428116,
|
321 |
+
"skews": 0.22000834345817566,
|
322 |
+
"kurtosis": 2.438382625579834,
|
323 |
+
"outliers": 0,
|
324 |
+
"outlier_percent": 0.0
|
325 |
+
},
|
326 |
+
"layers.3.ffn_norm.weight": {
|
327 |
+
"mean": 1.0585883855819702,
|
328 |
+
"var": 0.0033200366888195276,
|
329 |
+
"std": 0.05761975795030594,
|
330 |
+
"skews": -5.2314934730529785,
|
331 |
+
"kurtosis": 54.92455291748047,
|
332 |
+
"outliers": 1,
|
333 |
+
"outlier_percent": 0.003472222222222222
|
334 |
+
},
|
335 |
+
"layers.4.attention.wq.weight": {
|
336 |
+
"mean": -0.00025949961855076253,
|
337 |
+
"var": 0.0013903075596317649,
|
338 |
+
"std": 0.03728682920336723,
|
339 |
+
"skews": -0.010436750948429108,
|
340 |
+
"kurtosis": 0.7299864292144775,
|
341 |
+
"outliers": 4,
|
342 |
+
"outlier_percent": 4.8225308641975306e-05
|
343 |
+
},
|
344 |
+
"layers.4.attention.wk.weight": {
|
345 |
+
"mean": 0.00014466408174484968,
|
346 |
+
"var": 0.0013631607871502638,
|
347 |
+
"std": 0.03692100569605827,
|
348 |
+
"skews": -0.011070731095969677,
|
349 |
+
"kurtosis": 1.4433116912841797,
|
350 |
+
"outliers": 9,
|
351 |
+
"outlier_percent": 0.00010850694444444444
|
352 |
+
},
|
353 |
+
"layers.4.attention.wv.weight": {
|
354 |
+
"mean": -7.966956763993949e-05,
|
355 |
+
"var": 0.0011041760444641113,
|
356 |
+
"std": 0.033229146152734756,
|
357 |
+
"skews": -0.011746554635465145,
|
358 |
+
"kurtosis": 0.19126129150390625,
|
359 |
+
"outliers": 0,
|
360 |
+
"outlier_percent": 0.0
|
361 |
+
},
|
362 |
+
"layers.4.attention.wo.weight": {
|
363 |
+
"mean": 1.5019092643342447e-05,
|
364 |
+
"var": 0.0011383997043594718,
|
365 |
+
"std": 0.03374017775058746,
|
366 |
+
"skews": -0.004771851468831301,
|
367 |
+
"kurtosis": 0.26372838020324707,
|
368 |
+
"outliers": 0,
|
369 |
+
"outlier_percent": 0.0
|
370 |
+
},
|
371 |
+
"layers.4.feed_forward.w1.weight": {
|
372 |
+
"mean": 0.00011195617844350636,
|
373 |
+
"var": 0.0010532090673223138,
|
374 |
+
"std": 0.032453183084726334,
|
375 |
+
"skews": 0.00047589949099346995,
|
376 |
+
"kurtosis": 0.5533373355865479,
|
377 |
+
"outliers": 32,
|
378 |
+
"outlier_percent": 0.00014467592592592592
|
379 |
+
},
|
380 |
+
"layers.4.feed_forward.w2.weight": {
|
381 |
+
"mean": 5.1059960242128e-05,
|
382 |
+
"var": 0.001199280726723373,
|
383 |
+
"std": 0.034630633890628815,
|
384 |
+
"skews": 0.0022110706195235252,
|
385 |
+
"kurtosis": 0.08130240440368652,
|
386 |
+
"outliers": 3,
|
387 |
+
"outlier_percent": 1.3563368055555555e-05
|
388 |
+
},
|
389 |
+
"layers.4.feed_forward.w3.weight": {
|
390 |
+
"mean": -5.813660391140729e-05,
|
391 |
+
"var": 0.0012630914570763707,
|
392 |
+
"std": 0.03553999960422516,
|
393 |
+
"skews": -0.004379538353532553,
|
394 |
+
"kurtosis": 0.0008075237274169922,
|
395 |
+
"outliers": 0,
|
396 |
+
"outlier_percent": 0.0
|
397 |
+
},
|
398 |
+
"layers.4.attention_norm.weight": {
|
399 |
+
"mean": 1.4452831745147705,
|
400 |
+
"var": 0.0042164139449596405,
|
401 |
+
"std": 0.06493391841650009,
|
402 |
+
"skews": -1.056495189666748,
|
403 |
+
"kurtosis": 5.870485305786133,
|
404 |
+
"outliers": 1,
|
405 |
+
"outlier_percent": 0.003472222222222222
|
406 |
+
},
|
407 |
+
"layers.4.ffn_norm.weight": {
|
408 |
+
"mean": 1.2039875984191895,
|
409 |
+
"var": 0.003980363253504038,
|
410 |
+
"std": 0.06309012323617935,
|
411 |
+
"skews": -4.837477207183838,
|
412 |
+
"kurtosis": 49.78935241699219,
|
413 |
+
"outliers": 1,
|
414 |
+
"outlier_percent": 0.003472222222222222
|
415 |
+
},
|
416 |
+
"layers.5.attention.wq.weight": {
|
417 |
+
"mean": 8.131389040499926e-05,
|
418 |
+
"var": 0.001353550935164094,
|
419 |
+
"std": 0.0367906354367733,
|
420 |
+
"skews": -0.017600344493985176,
|
421 |
+
"kurtosis": 0.852405309677124,
|
422 |
+
"outliers": 7,
|
423 |
+
"outlier_percent": 8.439429012345679e-05
|
424 |
+
},
|
425 |
+
"layers.5.attention.wk.weight": {
|
426 |
+
"mean": 3.1251300242729485e-05,
|
427 |
+
"var": 0.0013346867635846138,
|
428 |
+
"std": 0.03653336688876152,
|
429 |
+
"skews": -0.03897131234407425,
|
430 |
+
"kurtosis": 1.5126566886901855,
|
431 |
+
"outliers": 7,
|
432 |
+
"outlier_percent": 8.439429012345679e-05
|
433 |
+
},
|
434 |
+
"layers.5.attention.wv.weight": {
|
435 |
+
"mean": -5.62375171284657e-05,
|
436 |
+
"var": 0.0012745895655825734,
|
437 |
+
"std": 0.03570139408111572,
|
438 |
+
"skews": 0.0008129411144182086,
|
439 |
+
"kurtosis": 0.4274141788482666,
|
440 |
+
"outliers": 0,
|
441 |
+
"outlier_percent": 0.0
|
442 |
+
},
|
443 |
+
"layers.5.attention.wo.weight": {
|
444 |
+
"mean": -1.0276995453750715e-05,
|
445 |
+
"var": 0.0012555326102301478,
|
446 |
+
"std": 0.03543349727988243,
|
447 |
+
"skews": 0.0017857198836281896,
|
448 |
+
"kurtosis": 0.5696825981140137,
|
449 |
+
"outliers": 2,
|
450 |
+
"outlier_percent": 2.4112654320987653e-05
|
451 |
+
},
|
452 |
+
"layers.5.feed_forward.w1.weight": {
|
453 |
+
"mean": 0.00023112430062610656,
|
454 |
+
"var": 0.0010687459725886583,
|
455 |
+
"std": 0.032691679894924164,
|
456 |
+
"skews": -0.004319785162806511,
|
457 |
+
"kurtosis": 0.11712980270385742,
|
458 |
+
"outliers": 2,
|
459 |
+
"outlier_percent": 9.04224537037037e-06
|
460 |
+
},
|
461 |
+
"layers.5.feed_forward.w2.weight": {
|
462 |
+
"mean": 6.056282927602297e-06,
|
463 |
+
"var": 0.0011791514698415995,
|
464 |
+
"std": 0.034338776022195816,
|
465 |
+
"skews": 0.0026711937971413136,
|
466 |
+
"kurtosis": 0.3927266597747803,
|
467 |
+
"outliers": 17,
|
468 |
+
"outlier_percent": 7.685908564814815e-05
|
469 |
+
},
|
470 |
+
"layers.5.feed_forward.w3.weight": {
|
471 |
+
"mean": -4.802293187822215e-05,
|
472 |
+
"var": 0.0012868741760030389,
|
473 |
+
"std": 0.03587302938103676,
|
474 |
+
"skews": -0.004913518205285072,
|
475 |
+
"kurtosis": 0.09062385559082031,
|
476 |
+
"outliers": 1,
|
477 |
+
"outlier_percent": 4.521122685185185e-06
|
478 |
+
},
|
479 |
+
"layers.5.attention_norm.weight": {
|
480 |
+
"mean": 1.4747096300125122,
|
481 |
+
"var": 0.004917146638035774,
|
482 |
+
"std": 0.0701223686337471,
|
483 |
+
"skews": -0.6923183798789978,
|
484 |
+
"kurtosis": 2.5739073753356934,
|
485 |
+
"outliers": 0,
|
486 |
+
"outlier_percent": 0.0
|
487 |
+
},
|
488 |
+
"layers.5.ffn_norm.weight": {
|
489 |
+
"mean": 1.330501914024353,
|
490 |
+
"var": 0.0026364095974713564,
|
491 |
+
"std": 0.05134597793221474,
|
492 |
+
"skews": -1.7850385904312134,
|
493 |
+
"kurtosis": 12.851001739501953,
|
494 |
+
"outliers": 1,
|
495 |
+
"outlier_percent": 0.003472222222222222
|
496 |
+
},
|
497 |
+
"norm.weight": {
|
498 |
+
"mean": 4.583681106567383,
|
499 |
+
"var": 0.10091519355773926,
|
500 |
+
"std": 0.3176715075969696,
|
501 |
+
"skews": 0.33317092061042786,
|
502 |
+
"kurtosis": 7.595536231994629,
|
503 |
+
"outliers": 0,
|
504 |
+
"outlier_percent": 0.0
|
505 |
+
}
|
506 |
+
}
|
softmax1-15m-2023_08_22_03_16_17/ckpt.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ff2c0188edf23095abc4142b46e5ab62a4438156bd58837f1aad284338db7124
|
3 |
+
size 60784783
|
softmax1-15m-2023_08_22_03_16_17/config.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"out_dir": "out",
|
3 |
+
"eval_interval": 2000,
|
4 |
+
"log_interval": 1,
|
5 |
+
"eval_iters": 100,
|
6 |
+
"eval_only": false,
|
7 |
+
"always_save_checkpoint": true,
|
8 |
+
"init_from": "scratch",
|
9 |
+
"wandb_log": true,
|
10 |
+
"wandb_project": "softmax1-tinystories",
|
11 |
+
"wandb_run_name": "run2023_08_22_03_16_17",
|
12 |
+
"batch_size": 96,
|
13 |
+
"max_seq_len": 256,
|
14 |
+
"vocab_source": "llama2",
|
15 |
+
"vocab_size": 32000,
|
16 |
+
"dim": 288,
|
17 |
+
"n_layers": 6,
|
18 |
+
"n_heads": 6,
|
19 |
+
"n_kv_heads": 6,
|
20 |
+
"multiple_of": 32,
|
21 |
+
"dropout": 0.0,
|
22 |
+
"gradient_accumulation_steps": 4,
|
23 |
+
"learning_rate": 0.0005,
|
24 |
+
"max_iters": 100000,
|
25 |
+
"weight_decay": 0.1,
|
26 |
+
"beta1": 0.9,
|
27 |
+
"beta2": 0.95,
|
28 |
+
"grad_clip": 1.0,
|
29 |
+
"decay_lr": true,
|
30 |
+
"warmup_iters": 1000,
|
31 |
+
"device": "cuda",
|
32 |
+
"dtype": "float16",
|
33 |
+
"compile": true,
|
34 |
+
"softmax1": true
|
35 |
+
}
|
softmax1-15m-2023_08_22_03_16_17/model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8b8fdf4e3ebeb36f1f4ad74d2c3b63c328a59acbff314271100b013b1e6b4b67
|
3 |
+
size 60816028
|
softmax1-15m-2023_08_22_03_16_17/weights.json
ADDED
@@ -0,0 +1,506 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"tok_embeddings.weight": {
|
3 |
+
"mean": -0.0030575105920434,
|
4 |
+
"var": 0.008043774403631687,
|
5 |
+
"std": 0.08968709409236908,
|
6 |
+
"skews": -0.8338078260421753,
|
7 |
+
"kurtosis": 18.851457595825195,
|
8 |
+
"outliers": 44748,
|
9 |
+
"outlier_percent": 0.00485546875
|
10 |
+
},
|
11 |
+
"layers.0.attention.wq.weight": {
|
12 |
+
"mean": 0.00035618283436633646,
|
13 |
+
"var": 0.004454279784113169,
|
14 |
+
"std": 0.06674039363861084,
|
15 |
+
"skews": 0.010199151001870632,
|
16 |
+
"kurtosis": 1.0952239036560059,
|
17 |
+
"outliers": 9,
|
18 |
+
"outlier_percent": 0.00010850694444444444
|
19 |
+
},
|
20 |
+
"layers.0.attention.wk.weight": {
|
21 |
+
"mean": 0.0002400689263595268,
|
22 |
+
"var": 0.0038942936807870865,
|
23 |
+
"std": 0.06240427494049072,
|
24 |
+
"skews": 0.04288599640130997,
|
25 |
+
"kurtosis": 1.2222018241882324,
|
26 |
+
"outliers": 8,
|
27 |
+
"outlier_percent": 9.645061728395061e-05
|
28 |
+
},
|
29 |
+
"layers.0.attention.wv.weight": {
|
30 |
+
"mean": 4.090589936822653e-05,
|
31 |
+
"var": 0.00035867199767380953,
|
32 |
+
"std": 0.018938638269901276,
|
33 |
+
"skews": -0.006655456963926554,
|
34 |
+
"kurtosis": 0.36399269104003906,
|
35 |
+
"outliers": 0,
|
36 |
+
"outlier_percent": 0.0
|
37 |
+
},
|
38 |
+
"layers.0.attention.wo.weight": {
|
39 |
+
"mean": -6.33133968221955e-05,
|
40 |
+
"var": 0.00037953953142277896,
|
41 |
+
"std": 0.019481774419546127,
|
42 |
+
"skews": -0.009102443233132362,
|
43 |
+
"kurtosis": 13.020711898803711,
|
44 |
+
"outliers": 18,
|
45 |
+
"outlier_percent": 0.00021701388888888888
|
46 |
+
},
|
47 |
+
"layers.0.feed_forward.w1.weight": {
|
48 |
+
"mean": -8.492724009556696e-05,
|
49 |
+
"var": 0.0016241124831140041,
|
50 |
+
"std": 0.04030027985572815,
|
51 |
+
"skews": 0.005218073725700378,
|
52 |
+
"kurtosis": -0.021692514419555664,
|
53 |
+
"outliers": 0,
|
54 |
+
"outlier_percent": 0.0
|
55 |
+
},
|
56 |
+
"layers.0.feed_forward.w2.weight": {
|
57 |
+
"mean": -6.97666619089432e-05,
|
58 |
+
"var": 0.001631619525142014,
|
59 |
+
"std": 0.04039331153035164,
|
60 |
+
"skews": 0.004510404542088509,
|
61 |
+
"kurtosis": 1.0991830825805664,
|
62 |
+
"outliers": 50,
|
63 |
+
"outlier_percent": 0.00022605613425925926
|
64 |
+
},
|
65 |
+
"layers.0.feed_forward.w3.weight": {
|
66 |
+
"mean": -6.370133633026853e-05,
|
67 |
+
"var": 0.001632346655242145,
|
68 |
+
"std": 0.04040231183171272,
|
69 |
+
"skews": -0.0011075332295149565,
|
70 |
+
"kurtosis": 0.010745048522949219,
|
71 |
+
"outliers": 0,
|
72 |
+
"outlier_percent": 0.0
|
73 |
+
},
|
74 |
+
"layers.0.attention_norm.weight": {
|
75 |
+
"mean": 0.5556919574737549,
|
76 |
+
"var": 0.012906110845506191,
|
77 |
+
"std": 0.1136050671339035,
|
78 |
+
"skews": 1.3937958478927612,
|
79 |
+
"kurtosis": 2.516613483428955,
|
80 |
+
"outliers": 0,
|
81 |
+
"outlier_percent": 0.0
|
82 |
+
},
|
83 |
+
"layers.0.ffn_norm.weight": {
|
84 |
+
"mean": 0.6750139594078064,
|
85 |
+
"var": 0.0032933931797742844,
|
86 |
+
"std": 0.05738809332251549,
|
87 |
+
"skews": -4.392051696777344,
|
88 |
+
"kurtosis": 42.291107177734375,
|
89 |
+
"outliers": 1,
|
90 |
+
"outlier_percent": 0.003472222222222222
|
91 |
+
},
|
92 |
+
"layers.1.attention.wq.weight": {
|
93 |
+
"mean": -5.454247457237216e-06,
|
94 |
+
"var": 0.0035072374157607555,
|
95 |
+
"std": 0.05922193452715874,
|
96 |
+
"skews": -0.014804880134761333,
|
97 |
+
"kurtosis": 0.5731871128082275,
|
98 |
+
"outliers": 3,
|
99 |
+
"outlier_percent": 3.616898148148148e-05
|
100 |
+
},
|
101 |
+
"layers.1.attention.wk.weight": {
|
102 |
+
"mean": -0.00011433610052336007,
|
103 |
+
"var": 0.0029967124573886395,
|
104 |
+
"std": 0.0547422356903553,
|
105 |
+
"skews": -0.06437145173549652,
|
106 |
+
"kurtosis": 1.6788911819458008,
|
107 |
+
"outliers": 13,
|
108 |
+
"outlier_percent": 0.00015673225308641974
|
109 |
+
},
|
110 |
+
"layers.1.attention.wv.weight": {
|
111 |
+
"mean": 0.00011750552948797122,
|
112 |
+
"var": 0.0007435075822286308,
|
113 |
+
"std": 0.0272673349827528,
|
114 |
+
"skews": 0.012133199721574783,
|
115 |
+
"kurtosis": 0.07190084457397461,
|
116 |
+
"outliers": 0,
|
117 |
+
"outlier_percent": 0.0
|
118 |
+
},
|
119 |
+
"layers.1.attention.wo.weight": {
|
120 |
+
"mean": 2.586756818345748e-05,
|
121 |
+
"var": 0.0007432072889059782,
|
122 |
+
"std": 0.027261829003691673,
|
123 |
+
"skews": 0.00605379045009613,
|
124 |
+
"kurtosis": 0.41370391845703125,
|
125 |
+
"outliers": 8,
|
126 |
+
"outlier_percent": 9.645061728395061e-05
|
127 |
+
},
|
128 |
+
"layers.1.feed_forward.w1.weight": {
|
129 |
+
"mean": -3.618240953073837e-05,
|
130 |
+
"var": 0.001606133533641696,
|
131 |
+
"std": 0.040076594799757004,
|
132 |
+
"skews": -0.0033451258204877377,
|
133 |
+
"kurtosis": 0.13593506813049316,
|
134 |
+
"outliers": 5,
|
135 |
+
"outlier_percent": 2.2605613425925925e-05
|
136 |
+
},
|
137 |
+
"layers.1.feed_forward.w2.weight": {
|
138 |
+
"mean": -5.651494939229451e-05,
|
139 |
+
"var": 0.0016629855381324887,
|
140 |
+
"std": 0.0407797209918499,
|
141 |
+
"skews": -0.0027265246026217937,
|
142 |
+
"kurtosis": 0.3058323860168457,
|
143 |
+
"outliers": 13,
|
144 |
+
"outlier_percent": 5.8774594907407404e-05
|
145 |
+
},
|
146 |
+
"layers.1.feed_forward.w3.weight": {
|
147 |
+
"mean": 5.0682170694926754e-05,
|
148 |
+
"var": 0.0016612057806923985,
|
149 |
+
"std": 0.04075789079070091,
|
150 |
+
"skews": 0.003567819483578205,
|
151 |
+
"kurtosis": -0.02021002769470215,
|
152 |
+
"outliers": 0,
|
153 |
+
"outlier_percent": 0.0
|
154 |
+
},
|
155 |
+
"layers.1.attention_norm.weight": {
|
156 |
+
"mean": 0.8679685592651367,
|
157 |
+
"var": 0.002409706823527813,
|
158 |
+
"std": 0.04908876493573189,
|
159 |
+
"skews": -0.0007233686046674848,
|
160 |
+
"kurtosis": 0.4187917709350586,
|
161 |
+
"outliers": 0,
|
162 |
+
"outlier_percent": 0.0
|
163 |
+
},
|
164 |
+
"layers.1.ffn_norm.weight": {
|
165 |
+
"mean": 0.7828017473220825,
|
166 |
+
"var": 0.0038013842422515154,
|
167 |
+
"std": 0.06165536493062973,
|
168 |
+
"skews": -4.157523155212402,
|
169 |
+
"kurtosis": 32.78773498535156,
|
170 |
+
"outliers": 1,
|
171 |
+
"outlier_percent": 0.003472222222222222
|
172 |
+
},
|
173 |
+
"layers.2.attention.wq.weight": {
|
174 |
+
"mean": 5.771218638983555e-05,
|
175 |
+
"var": 0.0031918222084641457,
|
176 |
+
"std": 0.056496214121580124,
|
177 |
+
"skews": 0.006333012133836746,
|
178 |
+
"kurtosis": 0.5441603660583496,
|
179 |
+
"outliers": 3,
|
180 |
+
"outlier_percent": 3.616898148148148e-05
|
181 |
+
},
|
182 |
+
"layers.2.attention.wk.weight": {
|
183 |
+
"mean": -0.00023540656547993422,
|
184 |
+
"var": 0.002815589774399996,
|
185 |
+
"std": 0.05306212976574898,
|
186 |
+
"skews": -0.049821335822343826,
|
187 |
+
"kurtosis": 1.6562185287475586,
|
188 |
+
"outliers": 9,
|
189 |
+
"outlier_percent": 0.00010850694444444444
|
190 |
+
},
|
191 |
+
"layers.2.attention.wv.weight": {
|
192 |
+
"mean": 2.4973463951027952e-05,
|
193 |
+
"var": 0.0009142293711192906,
|
194 |
+
"std": 0.030236225575208664,
|
195 |
+
"skews": -0.007988173514604568,
|
196 |
+
"kurtosis": 0.07765412330627441,
|
197 |
+
"outliers": 0,
|
198 |
+
"outlier_percent": 0.0
|
199 |
+
},
|
200 |
+
"layers.2.attention.wo.weight": {
|
201 |
+
"mean": -6.22320658294484e-06,
|
202 |
+
"var": 0.0009230568539351225,
|
203 |
+
"std": 0.03038185089826584,
|
204 |
+
"skews": 0.000765857519581914,
|
205 |
+
"kurtosis": 0.08483433723449707,
|
206 |
+
"outliers": 0,
|
207 |
+
"outlier_percent": 0.0
|
208 |
+
},
|
209 |
+
"layers.2.feed_forward.w1.weight": {
|
210 |
+
"mean": -6.062598549760878e-05,
|
211 |
+
"var": 0.0015908096684142947,
|
212 |
+
"std": 0.03988495469093323,
|
213 |
+
"skews": 0.008102444000542164,
|
214 |
+
"kurtosis": 0.0035903453826904297,
|
215 |
+
"outliers": 0,
|
216 |
+
"outlier_percent": 0.0
|
217 |
+
},
|
218 |
+
"layers.2.feed_forward.w2.weight": {
|
219 |
+
"mean": 4.470473504625261e-05,
|
220 |
+
"var": 0.0017242009053006768,
|
221 |
+
"std": 0.04152349755167961,
|
222 |
+
"skews": 0.0011632241075858474,
|
223 |
+
"kurtosis": 0.06214785575866699,
|
224 |
+
"outliers": 1,
|
225 |
+
"outlier_percent": 4.521122685185185e-06
|
226 |
+
},
|
227 |
+
"layers.2.feed_forward.w3.weight": {
|
228 |
+
"mean": -2.1270183424348943e-05,
|
229 |
+
"var": 0.0017243118491023779,
|
230 |
+
"std": 0.04152483493089676,
|
231 |
+
"skews": -0.005408828612416983,
|
232 |
+
"kurtosis": 0.012958765029907227,
|
233 |
+
"outliers": 0,
|
234 |
+
"outlier_percent": 0.0
|
235 |
+
},
|
236 |
+
"layers.2.attention_norm.weight": {
|
237 |
+
"mean": 0.9104444980621338,
|
238 |
+
"var": 0.0019502700306475163,
|
239 |
+
"std": 0.04416185989975929,
|
240 |
+
"skews": 0.6367732286453247,
|
241 |
+
"kurtosis": 0.9952096939086914,
|
242 |
+
"outliers": 0,
|
243 |
+
"outlier_percent": 0.0
|
244 |
+
},
|
245 |
+
"layers.2.ffn_norm.weight": {
|
246 |
+
"mean": 0.88186115026474,
|
247 |
+
"var": 0.0035577910020947456,
|
248 |
+
"std": 0.059647221118211746,
|
249 |
+
"skews": -3.8068487644195557,
|
250 |
+
"kurtosis": 32.94072723388672,
|
251 |
+
"outliers": 1,
|
252 |
+
"outlier_percent": 0.003472222222222222
|
253 |
+
},
|
254 |
+
"layers.3.attention.wq.weight": {
|
255 |
+
"mean": -3.916208697773982e-06,
|
256 |
+
"var": 0.002655748976394534,
|
257 |
+
"std": 0.051533959805965424,
|
258 |
+
"skews": 0.0036733371671289206,
|
259 |
+
"kurtosis": 0.4783966541290283,
|
260 |
+
"outliers": 3,
|
261 |
+
"outlier_percent": 3.616898148148148e-05
|
262 |
+
},
|
263 |
+
"layers.3.attention.wk.weight": {
|
264 |
+
"mean": 0.00031962458160705864,
|
265 |
+
"var": 0.0023793901782482862,
|
266 |
+
"std": 0.048778992146253586,
|
267 |
+
"skews": -0.014764788560569286,
|
268 |
+
"kurtosis": 1.5564613342285156,
|
269 |
+
"outliers": 13,
|
270 |
+
"outlier_percent": 0.00015673225308641974
|
271 |
+
},
|
272 |
+
"layers.3.attention.wv.weight": {
|
273 |
+
"mean": -4.206255835015327e-05,
|
274 |
+
"var": 0.0012175313895568252,
|
275 |
+
"std": 0.03489314392209053,
|
276 |
+
"skews": -0.004365447908639908,
|
277 |
+
"kurtosis": 0.09970426559448242,
|
278 |
+
"outliers": 0,
|
279 |
+
"outlier_percent": 0.0
|
280 |
+
},
|
281 |
+
"layers.3.attention.wo.weight": {
|
282 |
+
"mean": 0.000249701552093029,
|
283 |
+
"var": 0.0012641714420169592,
|
284 |
+
"std": 0.03555518761277199,
|
285 |
+
"skews": 0.00975084025412798,
|
286 |
+
"kurtosis": 0.09047365188598633,
|
287 |
+
"outliers": 0,
|
288 |
+
"outlier_percent": 0.0
|
289 |
+
},
|
290 |
+
"layers.3.feed_forward.w1.weight": {
|
291 |
+
"mean": 0.00019631479517556727,
|
292 |
+
"var": 0.0015764598501846194,
|
293 |
+
"std": 0.03970465809106827,
|
294 |
+
"skews": -0.00013909985136706382,
|
295 |
+
"kurtosis": 0.029094934463500977,
|
296 |
+
"outliers": 0,
|
297 |
+
"outlier_percent": 0.0
|
298 |
+
},
|
299 |
+
"layers.3.feed_forward.w2.weight": {
|
300 |
+
"mean": -7.194675708888099e-05,
|
301 |
+
"var": 0.0017914645140990615,
|
302 |
+
"std": 0.0423256941139698,
|
303 |
+
"skews": -0.010688533075153828,
|
304 |
+
"kurtosis": 0.10126662254333496,
|
305 |
+
"outliers": 2,
|
306 |
+
"outlier_percent": 9.04224537037037e-06
|
307 |
+
},
|
308 |
+
"layers.3.feed_forward.w3.weight": {
|
309 |
+
"mean": -7.225230547192041e-06,
|
310 |
+
"var": 0.0017912057228386402,
|
311 |
+
"std": 0.042322639375925064,
|
312 |
+
"skews": 0.0029504415579140186,
|
313 |
+
"kurtosis": 0.02350449562072754,
|
314 |
+
"outliers": 0,
|
315 |
+
"outlier_percent": 0.0
|
316 |
+
},
|
317 |
+
"layers.3.attention_norm.weight": {
|
318 |
+
"mean": 0.9992889761924744,
|
319 |
+
"var": 0.001841739285737276,
|
320 |
+
"std": 0.04291548952460289,
|
321 |
+
"skews": 0.28296077251434326,
|
322 |
+
"kurtosis": 0.21933650970458984,
|
323 |
+
"outliers": 0,
|
324 |
+
"outlier_percent": 0.0
|
325 |
+
},
|
326 |
+
"layers.3.ffn_norm.weight": {
|
327 |
+
"mean": 0.956415057182312,
|
328 |
+
"var": 0.002891228999942541,
|
329 |
+
"std": 0.05377015098929405,
|
330 |
+
"skews": -4.3508172035217285,
|
331 |
+
"kurtosis": 41.48064041137695,
|
332 |
+
"outliers": 1,
|
333 |
+
"outlier_percent": 0.003472222222222222
|
334 |
+
},
|
335 |
+
"layers.4.attention.wq.weight": {
|
336 |
+
"mean": -0.0002995177055709064,
|
337 |
+
"var": 0.001933008199557662,
|
338 |
+
"std": 0.043965987861156464,
|
339 |
+
"skews": -0.013897748664021492,
|
340 |
+
"kurtosis": 0.6071853637695312,
|
341 |
+
"outliers": 3,
|
342 |
+
"outlier_percent": 3.616898148148148e-05
|
343 |
+
},
|
344 |
+
"layers.4.attention.wk.weight": {
|
345 |
+
"mean": 0.0002864231646526605,
|
346 |
+
"var": 0.0018339046509936452,
|
347 |
+
"std": 0.04282411187887192,
|
348 |
+
"skews": -0.010061254724860191,
|
349 |
+
"kurtosis": 1.4288434982299805,
|
350 |
+
"outliers": 11,
|
351 |
+
"outlier_percent": 0.0001326195987654321
|
352 |
+
},
|
353 |
+
"layers.4.attention.wv.weight": {
|
354 |
+
"mean": 0.00010028992255683988,
|
355 |
+
"var": 0.0018044327152892947,
|
356 |
+
"std": 0.042478613555431366,
|
357 |
+
"skews": 0.004125483334064484,
|
358 |
+
"kurtosis": 0.32813596725463867,
|
359 |
+
"outliers": 0,
|
360 |
+
"outlier_percent": 0.0
|
361 |
+
},
|
362 |
+
"layers.4.attention.wo.weight": {
|
363 |
+
"mean": 1.004967725748429e-05,
|
364 |
+
"var": 0.0020177457481622696,
|
365 |
+
"std": 0.044919323176145554,
|
366 |
+
"skews": -0.0013239302206784487,
|
367 |
+
"kurtosis": 0.3486945629119873,
|
368 |
+
"outliers": 0,
|
369 |
+
"outlier_percent": 0.0
|
370 |
+
},
|
371 |
+
"layers.4.feed_forward.w1.weight": {
|
372 |
+
"mean": 2.9631637517013587e-05,
|
373 |
+
"var": 0.0015774235362187028,
|
374 |
+
"std": 0.03971679136157036,
|
375 |
+
"skews": 0.004260245710611343,
|
376 |
+
"kurtosis": 0.011400461196899414,
|
377 |
+
"outliers": 0,
|
378 |
+
"outlier_percent": 0.0
|
379 |
+
},
|
380 |
+
"layers.4.feed_forward.w2.weight": {
|
381 |
+
"mean": -5.6489670896553434e-06,
|
382 |
+
"var": 0.0018962562317028642,
|
383 |
+
"std": 0.04354602470993996,
|
384 |
+
"skews": 0.00325192348100245,
|
385 |
+
"kurtosis": 0.03762507438659668,
|
386 |
+
"outliers": 0,
|
387 |
+
"outlier_percent": 0.0
|
388 |
+
},
|
389 |
+
"layers.4.feed_forward.w3.weight": {
|
390 |
+
"mean": -6.117334123700857e-05,
|
391 |
+
"var": 0.0019177846843376756,
|
392 |
+
"std": 0.043792519718408585,
|
393 |
+
"skews": 0.0020223369356244802,
|
394 |
+
"kurtosis": 3.62396240234375e-05,
|
395 |
+
"outliers": 0,
|
396 |
+
"outlier_percent": 0.0
|
397 |
+
},
|
398 |
+
"layers.4.attention_norm.weight": {
|
399 |
+
"mean": 1.2174360752105713,
|
400 |
+
"var": 0.0019354402320459485,
|
401 |
+
"std": 0.04399363696575165,
|
402 |
+
"skews": -0.6009857654571533,
|
403 |
+
"kurtosis": 1.8265156745910645,
|
404 |
+
"outliers": 0,
|
405 |
+
"outlier_percent": 0.0
|
406 |
+
},
|
407 |
+
"layers.4.ffn_norm.weight": {
|
408 |
+
"mean": 1.0664386749267578,
|
409 |
+
"var": 0.0028373233508318663,
|
410 |
+
"std": 0.053266532719135284,
|
411 |
+
"skews": -3.981571674346924,
|
412 |
+
"kurtosis": 35.60630416870117,
|
413 |
+
"outliers": 1,
|
414 |
+
"outlier_percent": 0.003472222222222222
|
415 |
+
},
|
416 |
+
"layers.5.attention.wq.weight": {
|
417 |
+
"mean": 0.0002450960164424032,
|
418 |
+
"var": 0.0019403173355385661,
|
419 |
+
"std": 0.04404903203248978,
|
420 |
+
"skews": -0.0036919021513313055,
|
421 |
+
"kurtosis": 0.747020959854126,
|
422 |
+
"outliers": 3,
|
423 |
+
"outlier_percent": 3.616898148148148e-05
|
424 |
+
},
|
425 |
+
"layers.5.attention.wk.weight": {
|
426 |
+
"mean": 0.0002591797092463821,
|
427 |
+
"var": 0.0018460049759596586,
|
428 |
+
"std": 0.04296515882015228,
|
429 |
+
"skews": -0.02168109081685543,
|
430 |
+
"kurtosis": 1.5742673873901367,
|
431 |
+
"outliers": 13,
|
432 |
+
"outlier_percent": 0.00015673225308641974
|
433 |
+
},
|
434 |
+
"layers.5.attention.wv.weight": {
|
435 |
+
"mean": -6.999686593189836e-05,
|
436 |
+
"var": 0.0019269806798547506,
|
437 |
+
"std": 0.04389738664031029,
|
438 |
+
"skews": 0.011553842574357986,
|
439 |
+
"kurtosis": 0.36922311782836914,
|
440 |
+
"outliers": 1,
|
441 |
+
"outlier_percent": 1.2056327160493826e-05
|
442 |
+
},
|
443 |
+
"layers.5.attention.wo.weight": {
|
444 |
+
"mean": 7.272008224390447e-05,
|
445 |
+
"var": 0.00202196859754622,
|
446 |
+
"std": 0.044966306537389755,
|
447 |
+
"skews": 0.008191176690161228,
|
448 |
+
"kurtosis": 0.34751224517822266,
|
449 |
+
"outliers": 0,
|
450 |
+
"outlier_percent": 0.0
|
451 |
+
},
|
452 |
+
"layers.5.feed_forward.w1.weight": {
|
453 |
+
"mean": 0.0001724090107018128,
|
454 |
+
"var": 0.0015984694473445415,
|
455 |
+
"std": 0.03998086228966713,
|
456 |
+
"skews": -0.004892218858003616,
|
457 |
+
"kurtosis": 0.030857563018798828,
|
458 |
+
"outliers": 0,
|
459 |
+
"outlier_percent": 0.0
|
460 |
+
},
|
461 |
+
"layers.5.feed_forward.w2.weight": {
|
462 |
+
"mean": -5.098358542454662e-06,
|
463 |
+
"var": 0.0018277923809364438,
|
464 |
+
"std": 0.04275268688797951,
|
465 |
+
"skews": -0.00011612088565016165,
|
466 |
+
"kurtosis": 0.23499655723571777,
|
467 |
+
"outliers": 5,
|
468 |
+
"outlier_percent": 2.2605613425925925e-05
|
469 |
+
},
|
470 |
+
"layers.5.feed_forward.w3.weight": {
|
471 |
+
"mean": 1.0925466085609514e-05,
|
472 |
+
"var": 0.0018982859328389168,
|
473 |
+
"std": 0.04356932267546654,
|
474 |
+
"skews": 0.0032867027912288904,
|
475 |
+
"kurtosis": 0.05734658241271973,
|
476 |
+
"outliers": 0,
|
477 |
+
"outlier_percent": 0.0
|
478 |
+
},
|
479 |
+
"layers.5.attention_norm.weight": {
|
480 |
+
"mean": 1.2220523357391357,
|
481 |
+
"var": 0.002029956318438053,
|
482 |
+
"std": 0.04505503550171852,
|
483 |
+
"skews": 0.05655462294816971,
|
484 |
+
"kurtosis": 0.417694091796875,
|
485 |
+
"outliers": 0,
|
486 |
+
"outlier_percent": 0.0
|
487 |
+
},
|
488 |
+
"layers.5.ffn_norm.weight": {
|
489 |
+
"mean": 1.159726619720459,
|
490 |
+
"var": 0.0014754422008991241,
|
491 |
+
"std": 0.03841148689389229,
|
492 |
+
"skews": -1.1254332065582275,
|
493 |
+
"kurtosis": 5.061956405639648,
|
494 |
+
"outliers": 1,
|
495 |
+
"outlier_percent": 0.003472222222222222
|
496 |
+
},
|
497 |
+
"norm.weight": {
|
498 |
+
"mean": 2.7584409713745117,
|
499 |
+
"var": 0.02132660150527954,
|
500 |
+
"std": 0.146036297082901,
|
501 |
+
"skews": -1.0050147771835327,
|
502 |
+
"kurtosis": 3.220973491668701,
|
503 |
+
"outliers": 0,
|
504 |
+
"outlier_percent": 0.0
|
505 |
+
}
|
506 |
+
}
|