{'dim': 8192, 'n_layers': 80, 'n_heads': 64, 'n_kv_heads': 8, 'vocab_size': 128256, 'ffn_dim_multiplier': 1.3, 'multiple_of': 4096, 'norm_eps': 1e-05, 'rope_theta': 500000.0, 'use_scaled_rope': True}
000: original/consolidated.00.pth
001: original/consolidated.01.pth
002: original/consolidated.02.pth
003: original/consolidated.03.pth
004: original/consolidated.04.pth
005: original/consolidated.05.pth
006: original/consolidated.06.pth
007: original/consolidated.07.pth
-----------------------------------------------------------------------------
0 params in total.
0 bytes in total.
-----------------------------------------------------------------------------
[1/8]: Loading original/consolidated.00.pth
0 : 131334144 : tok_embeddings.weight : [16032, 8192] : torch.bfloat16
1 : 8388608 : layers.0.attention.wq.weight : [1024, 8192] : torch.bfloat16
2 : 1048576 : layers.0.attention.wk.weight : [128, 8192] : torch.bfloat16
3 : 1048576 : layers.0.attention.wv.weight : [128, 8192] : torch.bfloat16
4 : 8388608 : layers.0.attention.wo.weight : [8192, 1024] : torch.bfloat16
5 : 29360128 : layers.0.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
6 : 29360128 : layers.0.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
7 : 29360128 : layers.0.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
8 : 8192 : layers.0.attention_norm.weight : [8192] : torch.bfloat16
9 : 8192 : layers.0.ffn_norm.weight : [8192] : torch.bfloat16
10 : 8388608 : layers.1.attention.wq.weight : [1024, 8192] : torch.bfloat16
11 : 1048576 : layers.1.attention.wk.weight : [128, 8192] : torch.bfloat16
12 : 1048576 : layers.1.attention.wv.weight : [128, 8192] : torch.bfloat16
13 : 8388608 : layers.1.attention.wo.weight : [8192, 1024] : torch.bfloat16
14 : 29360128 : layers.1.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
15 : 29360128 : layers.1.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
16 : 29360128 : layers.1.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
17 : 8192 : layers.1.attention_norm.weight : [8192] : torch.bfloat16
18 : 8192 : layers.1.ffn_norm.weight : [8192] : torch.bfloat16
19 : 8388608 : layers.2.attention.wq.weight : [1024, 8192] : torch.bfloat16
20 : 1048576 : layers.2.attention.wk.weight : [128, 8192] : torch.bfloat16
21 : 1048576 : layers.2.attention.wv.weight : [128, 8192] : torch.bfloat16
22 : 8388608 : layers.2.attention.wo.weight : [8192, 1024] : torch.bfloat16
23 : 29360128 : layers.2.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
24 : 29360128 : layers.2.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
25 : 29360128 : layers.2.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
26 : 8192 : layers.2.attention_norm.weight : [8192] : torch.bfloat16
27 : 8192 : layers.2.ffn_norm.weight : [8192] : torch.bfloat16
28 : 8388608 : layers.3.attention.wq.weight : [1024, 8192] : torch.bfloat16
29 : 1048576 : layers.3.attention.wk.weight : [128, 8192] : torch.bfloat16
30 : 1048576 : layers.3.attention.wv.weight : [128, 8192] : torch.bfloat16
31 : 8388608 : layers.3.attention.wo.weight : [8192, 1024] : torch.bfloat16
32 : 29360128 : layers.3.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
33 : 29360128 : layers.3.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
34 : 29360128 : layers.3.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
35 : 8192 : layers.3.attention_norm.weight : [8192] : torch.bfloat16
36 : 8192 : layers.3.ffn_norm.weight : [8192] : torch.bfloat16
37 : 8388608 : layers.4.attention.wq.weight : [1024, 8192] : torch.bfloat16
38 : 1048576 : layers.4.attention.wk.weight : [128, 8192] : torch.bfloat16
39 : 1048576 : layers.4.attention.wv.weight : [128, 8192] : torch.bfloat16
40 : 8388608 : layers.4.attention.wo.weight : [8192, 1024] : torch.bfloat16
41 : 29360128 : layers.4.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
42 : 29360128 : layers.4.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
43 : 29360128 : layers.4.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
44 : 8192 : layers.4.attention_norm.weight : [8192] : torch.bfloat16
45 : 8192 : layers.4.ffn_norm.weight : [8192] : torch.bfloat16
46 : 8388608 : layers.5.attention.wq.weight : [1024, 8192] : torch.bfloat16
47 : 1048576 : layers.5.attention.wk.weight : [128, 8192] : torch.bfloat16
48 : 1048576 : layers.5.attention.wv.weight : [128, 8192] : torch.bfloat16
49 : 8388608 : layers.5.attention.wo.weight : [8192, 1024] : torch.bfloat16
50 : 29360128 : layers.5.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
51 : 29360128 : layers.5.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
52 : 29360128 : layers.5.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
53 : 8192 : layers.5.attention_norm.weight : [8192] : torch.bfloat16
54 : 8192 : layers.5.ffn_norm.weight : [8192] : torch.bfloat16
55 : 8388608 : layers.6.attention.wq.weight : [1024, 8192] : torch.bfloat16
56 : 1048576 : layers.6.attention.wk.weight : [128, 8192] : torch.bfloat16
57 : 1048576 : layers.6.attention.wv.weight : [128, 8192] : torch.bfloat16
58 : 8388608 : layers.6.attention.wo.weight : [8192, 1024] : torch.bfloat16
59 : 29360128 : layers.6.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
60 : 29360128 : layers.6.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
61 : 29360128 : layers.6.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
62 : 8192 : layers.6.attention_norm.weight : [8192] : torch.bfloat16
63 : 8192 : layers.6.ffn_norm.weight : [8192] : torch.bfloat16
64 : 8388608 : layers.7.attention.wq.weight : [1024, 8192] : torch.bfloat16
65 : 1048576 : layers.7.attention.wk.weight : [128, 8192] : torch.bfloat16
66 : 1048576 : layers.7.attention.wv.weight : [128, 8192] : torch.bfloat16
67 : 8388608 : layers.7.attention.wo.weight : [8192, 1024] : torch.bfloat16
68 : 29360128 : layers.7.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
69 : 29360128 : layers.7.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
70 : 29360128 : layers.7.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
71 : 8192 : layers.7.attention_norm.weight : [8192] : torch.bfloat16
72 : 8192 : layers.7.ffn_norm.weight : [8192] : torch.bfloat16
73 : 8388608 : layers.8.attention.wq.weight : [1024, 8192] : torch.bfloat16
74 : 1048576 : layers.8.attention.wk.weight : [128, 8192] : torch.bfloat16
75 : 1048576 : layers.8.attention.wv.weight : [128, 8192] : torch.bfloat16
76 : 8388608 : layers.8.attention.wo.weight : [8192, 1024] : torch.bfloat16
77 : 29360128 : layers.8.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
78 : 29360128 : layers.8.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
79 : 29360128 : layers.8.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
80 : 8192 : layers.8.attention_norm.weight : [8192] : torch.bfloat16
81 : 8192 : layers.8.ffn_norm.weight : [8192] : torch.bfloat16
82 : 8388608 : layers.9.attention.wq.weight : [1024, 8192] : torch.bfloat16
83 : 1048576 : layers.9.attention.wk.weight : [128, 8192] : torch.bfloat16
84 : 1048576 : layers.9.attention.wv.weight : [128, 8192] : torch.bfloat16
85 : 8388608 : layers.9.attention.wo.weight : [8192, 1024] : torch.bfloat16
86 : 29360128 : layers.9.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
87 : 29360128 : layers.9.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
88 : 29360128 : layers.9.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
89 : 8192 : layers.9.attention_norm.weight : [8192] : torch.bfloat16
90 : 8192 : layers.9.ffn_norm.weight : [8192] : torch.bfloat16
91 : 8388608 : layers.10.attention.wq.weight : [1024, 8192] : torch.bfloat16
92 : 1048576 : layers.10.attention.wk.weight : [128, 8192] : torch.bfloat16
93 : 1048576 : layers.10.attention.wv.weight : [128, 8192] : torch.bfloat16
94 : 8388608 : layers.10.attention.wo.weight : [8192, 1024] : torch.bfloat16
95 : 29360128 : layers.10.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
96 : 29360128 : layers.10.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
97 : 29360128 : layers.10.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
98 : 8192 : layers.10.attention_norm.weight : [8192] : torch.bfloat16
99 : 8192 : layers.10.ffn_norm.weight : [8192] : torch.bfloat16
100 : 8388608 : layers.11.attention.wq.weight : [1024, 8192] : torch.bfloat16
101 : 1048576 : layers.11.attention.wk.weight : [128, 8192] : torch.bfloat16
102 : 1048576 : layers.11.attention.wv.weight : [128, 8192] : torch.bfloat16
103 : 8388608 : layers.11.attention.wo.weight : [8192, 1024] : torch.bfloat16
104 : 29360128 : layers.11.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
105 : 29360128 : layers.11.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
106 : 29360128 : layers.11.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
107 : 8192 : layers.11.attention_norm.weight : [8192] : torch.bfloat16
108 : 8192 : layers.11.ffn_norm.weight : [8192] : torch.bfloat16
109 : 8388608 : layers.12.attention.wq.weight : [1024, 8192] : torch.bfloat16
110 : 1048576 : layers.12.attention.wk.weight : [128, 8192] : torch.bfloat16
111 : 1048576 : layers.12.attention.wv.weight : [128, 8192] : torch.bfloat16
112 : 8388608 : layers.12.attention.wo.weight : [8192, 1024] : torch.bfloat16
113 : 29360128 : layers.12.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
114 : 29360128 : layers.12.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
115 : 29360128 : layers.12.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
116 : 8192 : layers.12.attention_norm.weight : [8192] : torch.bfloat16
117 : 8192 : layers.12.ffn_norm.weight : [8192] : torch.bfloat16
118 : 8388608 : layers.13.attention.wq.weight : [1024, 8192] : torch.bfloat16
119 : 1048576 : layers.13.attention.wk.weight : [128, 8192] : torch.bfloat16
120 : 1048576 : layers.13.attention.wv.weight : [128, 8192] : torch.bfloat16
121 : 8388608 : layers.13.attention.wo.weight : [8192, 1024] : torch.bfloat16
122 : 29360128 : layers.13.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
123 : 29360128 : layers.13.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
124 : 29360128 : layers.13.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
125 : 8192 : layers.13.attention_norm.weight : [8192] : torch.bfloat16
126 : 8192 : layers.13.ffn_norm.weight : [8192] : torch.bfloat16
127 : 8388608 : layers.14.attention.wq.weight : [1024, 8192] : torch.bfloat16
128 : 1048576 : layers.14.attention.wk.weight : [128, 8192] : torch.bfloat16
129 : 1048576 : layers.14.attention.wv.weight : [128, 8192] : torch.bfloat16
130 : 8388608 : layers.14.attention.wo.weight : [8192, 1024] : torch.bfloat16
131 : 29360128 : layers.14.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
132 : 29360128 : layers.14.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
133 : 29360128 : layers.14.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
134 : 8192 : layers.14.attention_norm.weight : [8192] : torch.bfloat16
135 : 8192 : layers.14.ffn_norm.weight : [8192] : torch.bfloat16
136 : 8388608 : layers.15.attention.wq.weight : [1024, 8192] : torch.bfloat16
137 : 1048576 : layers.15.attention.wk.weight : [128, 8192] : torch.bfloat16
138 : 1048576 : layers.15.attention.wv.weight : [128, 8192] : torch.bfloat16
139 : 8388608 : layers.15.attention.wo.weight : [8192, 1024] : torch.bfloat16
140 : 29360128 : layers.15.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
141 : 29360128 : layers.15.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
142 : 29360128 : layers.15.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
143 : 8192 : layers.15.attention_norm.weight : [8192] : torch.bfloat16
144 : 8192 : layers.15.ffn_norm.weight : [8192] : torch.bfloat16
145 : 8388608 : layers.16.attention.wq.weight : [1024, 8192] : torch.bfloat16
146 : 1048576 : layers.16.attention.wk.weight : [128, 8192] : torch.bfloat16
147 : 1048576 : layers.16.attention.wv.weight : [128, 8192] : torch.bfloat16
148 : 8388608 : layers.16.attention.wo.weight : [8192, 1024] : torch.bfloat16
149 : 29360128 : layers.16.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
150 : 29360128 : layers.16.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
151 : 29360128 : layers.16.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
152 : 8192 : layers.16.attention_norm.weight : [8192] : torch.bfloat16
153 : 8192 : layers.16.ffn_norm.weight : [8192] : torch.bfloat16
154 : 8388608 : layers.17.attention.wq.weight : [1024, 8192] : torch.bfloat16
155 : 1048576 : layers.17.attention.wk.weight : [128, 8192] : torch.bfloat16
156 : 1048576 : layers.17.attention.wv.weight : [128, 8192] : torch.bfloat16
157 : 8388608 : layers.17.attention.wo.weight : [8192, 1024] : torch.bfloat16
158 : 29360128 : layers.17.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
159 : 29360128 : layers.17.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
160 : 29360128 : layers.17.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
161 : 8192 : layers.17.attention_norm.weight : [8192] : torch.bfloat16
162 : 8192 : layers.17.ffn_norm.weight : [8192] : torch.bfloat16
163 : 8388608 : layers.18.attention.wq.weight : [1024, 8192] : torch.bfloat16
164 : 1048576 : layers.18.attention.wk.weight : [128, 8192] : torch.bfloat16
165 : 1048576 : layers.18.attention.wv.weight : [128, 8192] : torch.bfloat16
166 : 8388608 : layers.18.attention.wo.weight : [8192, 1024] : torch.bfloat16
167 : 29360128 : layers.18.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
168 : 29360128 : layers.18.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
169 : 29360128 : layers.18.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
170 : 8192 : layers.18.attention_norm.weight : [8192] : torch.bfloat16
171 : 8192 : layers.18.ffn_norm.weight : [8192] : torch.bfloat16
172 : 8388608 : layers.19.attention.wq.weight : [1024, 8192] : torch.bfloat16
173 : 1048576 : layers.19.attention.wk.weight : [128, 8192] : torch.bfloat16
174 : 1048576 : layers.19.attention.wv.weight : [128, 8192] : torch.bfloat16
175 : 8388608 : layers.19.attention.wo.weight : [8192, 1024] : torch.bfloat16
176 : 29360128 : layers.19.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
177 : 29360128 : layers.19.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
178 : 29360128 : layers.19.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
179 : 8192 : layers.19.attention_norm.weight : [8192] : torch.bfloat16
180 : 8192 : layers.19.ffn_norm.weight : [8192] : torch.bfloat16
181 : 8388608 : layers.20.attention.wq.weight : [1024, 8192] : torch.bfloat16
182 : 1048576 : layers.20.attention.wk.weight : [128, 8192] : torch.bfloat16
183 : 1048576 : layers.20.attention.wv.weight : [128, 8192] : torch.bfloat16
184 : 8388608 : layers.20.attention.wo.weight : [8192, 1024] : torch.bfloat16
185 : 29360128 : layers.20.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
186 : 29360128 : layers.20.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
187 : 29360128 : layers.20.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
188 : 8192 : layers.20.attention_norm.weight : [8192] : torch.bfloat16
189 : 8192 : layers.20.ffn_norm.weight : [8192] : torch.bfloat16
190 : 8388608 : layers.21.attention.wq.weight : [1024, 8192] : torch.bfloat16
191 : 1048576 : layers.21.attention.wk.weight : [128, 8192] : torch.bfloat16
192 : 1048576 : layers.21.attention.wv.weight : [128, 8192] : torch.bfloat16
193 : 8388608 : layers.21.attention.wo.weight : [8192, 1024] : torch.bfloat16
194 : 29360128 : layers.21.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
195 : 29360128 : layers.21.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
196 : 29360128 : layers.21.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
197 : 8192 : layers.21.attention_norm.weight : [8192] : torch.bfloat16
198 : 8192 : layers.21.ffn_norm.weight : [8192] : torch.bfloat16
199 : 8388608 : layers.22.attention.wq.weight : [1024, 8192] : torch.bfloat16
200 : 1048576 : layers.22.attention.wk.weight : [128, 8192] : torch.bfloat16
201 : 1048576 : layers.22.attention.wv.weight : [128, 8192] : torch.bfloat16
202 : 8388608 : layers.22.attention.wo.weight : [8192, 1024] : torch.bfloat16
203 : 29360128 : layers.22.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
204 : 29360128 : layers.22.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
205 : 29360128 : layers.22.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
206 : 8192 : layers.22.attention_norm.weight : [8192] : torch.bfloat16
207 : 8192 : layers.22.ffn_norm.weight : [8192] : torch.bfloat16
208 : 8388608 : layers.23.attention.wq.weight : [1024, 8192] : torch.bfloat16
209 : 1048576 : layers.23.attention.wk.weight : [128, 8192] : torch.bfloat16
210 : 1048576 : layers.23.attention.wv.weight : [128, 8192] : torch.bfloat16
211 : 8388608 : layers.23.attention.wo.weight : [8192, 1024] : torch.bfloat16
212 : 29360128 : layers.23.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
213 : 29360128 : layers.23.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
214 : 29360128 : layers.23.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
215 : 8192 : layers.23.attention_norm.weight : [8192] : torch.bfloat16
216 : 8192 : layers.23.ffn_norm.weight : [8192] : torch.bfloat16
217 : 8388608 : layers.24.attention.wq.weight : [1024, 8192] : torch.bfloat16
218 : 1048576 : layers.24.attention.wk.weight : [128, 8192] : torch.bfloat16
219 : 1048576 : layers.24.attention.wv.weight : [128, 8192] : torch.bfloat16
220 : 8388608 : layers.24.attention.wo.weight : [8192, 1024] : torch.bfloat16
221 : 29360128 : layers.24.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
222 : 29360128 : layers.24.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
223 : 29360128 : layers.24.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
224 : 8192 : layers.24.attention_norm.weight : [8192] : torch.bfloat16
225 : 8192 : layers.24.ffn_norm.weight : [8192] : torch.bfloat16
226 : 8388608 : layers.25.attention.wq.weight : [1024, 8192] : torch.bfloat16
227 : 1048576 : layers.25.attention.wk.weight : [128, 8192] : torch.bfloat16
228 : 1048576 : layers.25.attention.wv.weight : [128, 8192] : torch.bfloat16
229 : 8388608 : layers.25.attention.wo.weight : [8192, 1024] : torch.bfloat16
230 : 29360128 : layers.25.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
231 : 29360128 : layers.25.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
232 : 29360128 : layers.25.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
233 : 8192 : layers.25.attention_norm.weight : [8192] : torch.bfloat16
234 : 8192 : layers.25.ffn_norm.weight : [8192] : torch.bfloat16
235 : 8388608 : layers.26.attention.wq.weight : [1024, 8192] : torch.bfloat16
236 : 1048576 : layers.26.attention.wk.weight : [128, 8192] : torch.bfloat16
237 : 1048576 : layers.26.attention.wv.weight : [128, 8192] : torch.bfloat16
238 : 8388608 : layers.26.attention.wo.weight : [8192, 1024] : torch.bfloat16
239 : 29360128 : layers.26.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
240 : 29360128 : layers.26.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
241 : 29360128 : layers.26.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
242 : 8192 : layers.26.attention_norm.weight : [8192] : torch.bfloat16
243 : 8192 : layers.26.ffn_norm.weight : [8192] : torch.bfloat16
244 : 8388608 : layers.27.attention.wq.weight : [1024, 8192] : torch.bfloat16
245 : 1048576 : layers.27.attention.wk.weight : [128, 8192] : torch.bfloat16
246 : 1048576 : layers.27.attention.wv.weight : [128, 8192] : torch.bfloat16
247 : 8388608 : layers.27.attention.wo.weight : [8192, 1024] : torch.bfloat16
248 : 29360128 : layers.27.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
249 : 29360128 : layers.27.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
250 : 29360128 : layers.27.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
251 : 8192 : layers.27.attention_norm.weight : [8192] : torch.bfloat16
252 : 8192 : layers.27.ffn_norm.weight : [8192] : torch.bfloat16
253 : 8388608 : layers.28.attention.wq.weight : [1024, 8192] : torch.bfloat16
254 : 1048576 : layers.28.attention.wk.weight : [128, 8192] : torch.bfloat16
255 : 1048576 : layers.28.attention.wv.weight : [128, 8192] : torch.bfloat16
256 : 8388608 : layers.28.attention.wo.weight : [8192, 1024] : torch.bfloat16
257 : 29360128 : layers.28.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
258 : 29360128 : layers.28.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
259 : 29360128 : layers.28.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
260 : 8192 : layers.28.attention_norm.weight : [8192] : torch.bfloat16
261 : 8192 : layers.28.ffn_norm.weight : [8192] : torch.bfloat16
262 : 8388608 : layers.29.attention.wq.weight : [1024, 8192] : torch.bfloat16
263 : 1048576 : layers.29.attention.wk.weight : [128, 8192] : torch.bfloat16
264 : 1048576 : layers.29.attention.wv.weight : [128, 8192] : torch.bfloat16
265 : 8388608 : layers.29.attention.wo.weight : [8192, 1024] : torch.bfloat16
266 : 29360128 : layers.29.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
267 : 29360128 : layers.29.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
268 : 29360128 : layers.29.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
269 : 8192 : layers.29.attention_norm.weight : [8192] : torch.bfloat16
270 : 8192 : layers.29.ffn_norm.weight : [8192] : torch.bfloat16
271 : 8388608 : layers.30.attention.wq.weight : [1024, 8192] : torch.bfloat16
272 : 1048576 : layers.30.attention.wk.weight : [128, 8192] : torch.bfloat16
273 : 1048576 : layers.30.attention.wv.weight : [128, 8192] : torch.bfloat16
274 : 8388608 : layers.30.attention.wo.weight : [8192, 1024] : torch.bfloat16
275 : 29360128 : layers.30.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
276 : 29360128 : layers.30.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
277 : 29360128 : layers.30.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
278 : 8192 : layers.30.attention_norm.weight : [8192] : torch.bfloat16
279 : 8192 : layers.30.ffn_norm.weight : [8192] : torch.bfloat16
280 : 8388608 : layers.31.attention.wq.weight : [1024, 8192] : torch.bfloat16
281 : 1048576 : layers.31.attention.wk.weight : [128, 8192] : torch.bfloat16
282 : 1048576 : layers.31.attention.wv.weight : [128, 8192] : torch.bfloat16
283 : 8388608 : layers.31.attention.wo.weight : [8192, 1024] : torch.bfloat16
284 : 29360128 : layers.31.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
285 : 29360128 : layers.31.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
286 : 29360128 : layers.31.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
287 : 8192 : layers.31.attention_norm.weight : [8192] : torch.bfloat16
288 : 8192 : layers.31.ffn_norm.weight : [8192] : torch.bfloat16
289 : 8388608 : layers.32.attention.wq.weight : [1024, 8192] : torch.bfloat16
290 : 1048576 : layers.32.attention.wk.weight : [128, 8192] : torch.bfloat16
291 : 1048576 : layers.32.attention.wv.weight : [128, 8192] : torch.bfloat16
292 : 8388608 : layers.32.attention.wo.weight : [8192, 1024] : torch.bfloat16
293 : 29360128 : layers.32.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
294 : 29360128 : layers.32.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
295 : 29360128 : layers.32.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
296 : 8192 : layers.32.attention_norm.weight : [8192] : torch.bfloat16
297 : 8192 : layers.32.ffn_norm.weight : [8192] : torch.bfloat16
298 : 8388608 : layers.33.attention.wq.weight : [1024, 8192] : torch.bfloat16
299 : 1048576 : layers.33.attention.wk.weight : [128, 8192] : torch.bfloat16
300 : 1048576 : layers.33.attention.wv.weight : [128, 8192] : torch.bfloat16
301 : 8388608 : layers.33.attention.wo.weight : [8192, 1024] : torch.bfloat16
302 : 29360128 : layers.33.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
303 : 29360128 : layers.33.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
304 : 29360128 : layers.33.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
305 : 8192 : layers.33.attention_norm.weight : [8192] : torch.bfloat16
306 : 8192 : layers.33.ffn_norm.weight : [8192] : torch.bfloat16
307 : 8388608 : layers.34.attention.wq.weight : [1024, 8192] : torch.bfloat16
308 : 1048576 : layers.34.attention.wk.weight : [128, 8192] : torch.bfloat16
309 : 1048576 : layers.34.attention.wv.weight : [128, 8192] : torch.bfloat16
310 : 8388608 : layers.34.attention.wo.weight : [8192, 1024] : torch.bfloat16
311 : 29360128 : layers.34.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
312 : 29360128 : layers.34.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
313 : 29360128 : layers.34.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
314 : 8192 : layers.34.attention_norm.weight : [8192] : torch.bfloat16
315 : 8192 : layers.34.ffn_norm.weight : [8192] : torch.bfloat16
316 : 8388608 : layers.35.attention.wq.weight : [1024, 8192] : torch.bfloat16
317 : 1048576 : layers.35.attention.wk.weight : [128, 8192] : torch.bfloat16
318 : 1048576 : layers.35.attention.wv.weight : [128, 8192] : torch.bfloat16
319 : 8388608 : layers.35.attention.wo.weight : [8192, 1024] : torch.bfloat16
320 : 29360128 : layers.35.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
321 : 29360128 : layers.35.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
322 : 29360128 : layers.35.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
323 : 8192 : layers.35.attention_norm.weight : [8192] : torch.bfloat16
324 : 8192 : layers.35.ffn_norm.weight : [8192] : torch.bfloat16
325 : 8388608 : layers.36.attention.wq.weight : [1024, 8192] : torch.bfloat16
326 : 1048576 : layers.36.attention.wk.weight : [128, 8192] : torch.bfloat16
327 : 1048576 : layers.36.attention.wv.weight : [128, 8192] : torch.bfloat16
328 : 8388608 : layers.36.attention.wo.weight : [8192, 1024] : torch.bfloat16
329 : 29360128 : layers.36.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
330 : 29360128 : layers.36.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
331 : 29360128 : layers.36.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
332 : 8192 : layers.36.attention_norm.weight : [8192] : torch.bfloat16
333 : 8192 : layers.36.ffn_norm.weight : [8192] : torch.bfloat16
334 : 8388608 : layers.37.attention.wq.weight : [1024, 8192] : torch.bfloat16
335 : 1048576 : layers.37.attention.wk.weight : [128, 8192] : torch.bfloat16
336 : 1048576 : layers.37.attention.wv.weight : [128, 8192] : torch.bfloat16
337 : 8388608 : layers.37.attention.wo.weight : [8192, 1024] : torch.bfloat16
338 : 29360128 : layers.37.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
339 : 29360128 : layers.37.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
340 : 29360128 : layers.37.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
341 : 8192 : layers.37.attention_norm.weight : [8192] : torch.bfloat16
342 : 8192 : layers.37.ffn_norm.weight : [8192] : torch.bfloat16
343 : 8388608 : layers.38.attention.wq.weight : [1024, 8192] : torch.bfloat16
344 : 1048576 : layers.38.attention.wk.weight : [128, 8192] : torch.bfloat16
345 : 1048576 : layers.38.attention.wv.weight : [128, 8192] : torch.bfloat16
346 : 8388608 : layers.38.attention.wo.weight : [8192, 1024] : torch.bfloat16
347 : 29360128 : layers.38.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
348 : 29360128 : layers.38.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
349 : 29360128 : layers.38.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
350 : 8192 : layers.38.attention_norm.weight : [8192] : torch.bfloat16
351 : 8192 : layers.38.ffn_norm.weight : [8192] : torch.bfloat16
352 : 8388608 : layers.39.attention.wq.weight : [1024, 8192] : torch.bfloat16
353 : 1048576 : layers.39.attention.wk.weight : [128, 8192] : torch.bfloat16
354 : 1048576 : layers.39.attention.wv.weight : [128, 8192] : torch.bfloat16
355 : 8388608 : layers.39.attention.wo.weight : [8192, 1024] : torch.bfloat16
356 : 29360128 : layers.39.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
357 : 29360128 : layers.39.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
358 : 29360128 : layers.39.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
359 : 8192 : layers.39.attention_norm.weight : [8192] : torch.bfloat16
360 : 8192 : layers.39.ffn_norm.weight : [8192] : torch.bfloat16
361 : 8388608 : layers.40.attention.wq.weight : [1024, 8192] : torch.bfloat16
362 : 1048576 : layers.40.attention.wk.weight : [128, 8192] : torch.bfloat16
363 : 1048576 : layers.40.attention.wv.weight : [128, 8192] : torch.bfloat16
364 : 8388608 : layers.40.attention.wo.weight : [8192, 1024] : torch.bfloat16
365 : 29360128 : layers.40.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
366 : 29360128 : layers.40.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
367 : 29360128 : layers.40.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
368 : 8192 : layers.40.attention_norm.weight : [8192] : torch.bfloat16
369 : 8192 : layers.40.ffn_norm.weight : [8192] : torch.bfloat16
370 : 8388608 : layers.41.attention.wq.weight : [1024, 8192] : torch.bfloat16
371 : 1048576 : layers.41.attention.wk.weight : [128, 8192] : torch.bfloat16
372 : 1048576 : layers.41.attention.wv.weight : [128, 8192] : torch.bfloat16
373 : 8388608 : layers.41.attention.wo.weight : [8192, 1024] : torch.bfloat16
374 : 29360128 : layers.41.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
375 : 29360128 : layers.41.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
376 : 29360128 : layers.41.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
377 : 8192 : layers.41.attention_norm.weight : [8192] : torch.bfloat16
378 : 8192 : layers.41.ffn_norm.weight : [8192] : torch.bfloat16
379 : 8388608 : layers.42.attention.wq.weight : [1024, 8192] : torch.bfloat16
380 : 1048576 : layers.42.attention.wk.weight : [128, 8192] : torch.bfloat16
381 : 1048576 : layers.42.attention.wv.weight : [128, 8192] : torch.bfloat16
382 : 8388608 : layers.42.attention.wo.weight : [8192, 1024] : torch.bfloat16
383 : 29360128 : layers.42.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
384 : 29360128 : layers.42.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
385 : 29360128 : layers.42.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
386 : 8192 : layers.42.attention_norm.weight : [8192] : torch.bfloat16
387 : 8192 : layers.42.ffn_norm.weight : [8192] : torch.bfloat16
388 : 8388608 : layers.43.attention.wq.weight : [1024, 8192] : torch.bfloat16
389 : 1048576 : layers.43.attention.wk.weight : [128, 8192] : torch.bfloat16
390 : 1048576 : layers.43.attention.wv.weight : [128, 8192] : torch.bfloat16
391 : 8388608 : layers.43.attention.wo.weight : [8192, 1024] : torch.bfloat16
392 : 29360128 : layers.43.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
393 : 29360128 : layers.43.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
394 : 29360128 : layers.43.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
395 : 8192 : layers.43.attention_norm.weight : [8192] : torch.bfloat16
396 : 8192 : layers.43.ffn_norm.weight : [8192] : torch.bfloat16
397 : 8388608 : layers.44.attention.wq.weight : [1024, 8192] : torch.bfloat16
398 : 1048576 : layers.44.attention.wk.weight : [128, 8192] : torch.bfloat16
399 : 1048576 : layers.44.attention.wv.weight : [128, 8192] : torch.bfloat16
400 : 8388608 : layers.44.attention.wo.weight : [8192, 1024] : torch.bfloat16
401 : 29360128 : layers.44.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
402 : 29360128 : layers.44.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
403 : 29360128 : layers.44.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
404 : 8192 : layers.44.attention_norm.weight : [8192] : torch.bfloat16
405 : 8192 : layers.44.ffn_norm.weight : [8192] : torch.bfloat16
406 : 8388608 : layers.45.attention.wq.weight : [1024, 8192] : torch.bfloat16
407 : 1048576 : layers.45.attention.wk.weight : [128, 8192] : torch.bfloat16
408 : 1048576 : layers.45.attention.wv.weight : [128, 8192] : torch.bfloat16
409 : 8388608 : layers.45.attention.wo.weight : [8192, 1024] : torch.bfloat16
410 : 29360128 : layers.45.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
411 : 29360128 : layers.45.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
412 : 29360128 : layers.45.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
413 : 8192 : layers.45.attention_norm.weight : [8192] : torch.bfloat16
414 : 8192 : layers.45.ffn_norm.weight : [8192] : torch.bfloat16
415 : 8388608 : layers.46.attention.wq.weight : [1024, 8192] : torch.bfloat16
416 : 1048576 : layers.46.attention.wk.weight : [128, 8192] : torch.bfloat16
417 : 1048576 : layers.46.attention.wv.weight : [128, 8192] : torch.bfloat16
418 : 8388608 : layers.46.attention.wo.weight : [8192, 1024] : torch.bfloat16
419 : 29360128 : layers.46.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
420 : 29360128 : layers.46.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
421 : 29360128 : layers.46.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
422 : 8192 : layers.46.attention_norm.weight : [8192] : torch.bfloat16
423 : 8192 : layers.46.ffn_norm.weight : [8192] : torch.bfloat16
424 : 8388608 : layers.47.attention.wq.weight : [1024, 8192] : torch.bfloat16
425 : 1048576 : layers.47.attention.wk.weight : [128, 8192] : torch.bfloat16
426 : 1048576 : layers.47.attention.wv.weight : [128, 8192] : torch.bfloat16
427 : 8388608 : layers.47.attention.wo.weight : [8192, 1024] : torch.bfloat16
428 : 29360128 : layers.47.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
429 : 29360128 : layers.47.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
430 : 29360128 : layers.47.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
431 : 8192 : layers.47.attention_norm.weight : [8192] : torch.bfloat16
432 : 8192 : layers.47.ffn_norm.weight : [8192] : torch.bfloat16
433 : 8388608 : layers.48.attention.wq.weight : [1024, 8192] : torch.bfloat16
434 : 1048576 : layers.48.attention.wk.weight : [128, 8192] : torch.bfloat16
435 : 1048576 : layers.48.attention.wv.weight : [128, 8192] : torch.bfloat16
436 : 8388608 : layers.48.attention.wo.weight : [8192, 1024] : torch.bfloat16
437 : 29360128 : layers.48.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
438 : 29360128 : layers.48.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
439 : 29360128 : layers.48.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
440 : 8192 : layers.48.attention_norm.weight : [8192] : torch.bfloat16
441 : 8192 : layers.48.ffn_norm.weight : [8192] : torch.bfloat16
442 : 8388608 : layers.49.attention.wq.weight : [1024, 8192] : torch.bfloat16
443 : 1048576 : layers.49.attention.wk.weight : [128, 8192] : torch.bfloat16
444 : 1048576 : layers.49.attention.wv.weight : [128, 8192] : torch.bfloat16
445 : 8388608 : layers.49.attention.wo.weight : [8192, 1024] : torch.bfloat16
446 : 29360128 : layers.49.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
447 : 29360128 : layers.49.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
448 : 29360128 : layers.49.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
449 : 8192 : layers.49.attention_norm.weight : [8192] : torch.bfloat16
450 : 8192 : layers.49.ffn_norm.weight : [8192] : torch.bfloat16
451 : 8388608 : layers.50.attention.wq.weight : [1024, 8192] : torch.bfloat16
452 : 1048576 : layers.50.attention.wk.weight : [128, 8192] : torch.bfloat16
453 : 1048576 : layers.50.attention.wv.weight : [128, 8192] : torch.bfloat16
454 : 8388608 : layers.50.attention.wo.weight : [8192, 1024] : torch.bfloat16
455 : 29360128 : layers.50.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
456 : 29360128 : layers.50.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
457 : 29360128 : layers.50.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
458 : 8192 : layers.50.attention_norm.weight : [8192] : torch.bfloat16
459 : 8192 : layers.50.ffn_norm.weight : [8192] : torch.bfloat16
460 : 8388608 : layers.51.attention.wq.weight : [1024, 8192] : torch.bfloat16
461 : 1048576 : layers.51.attention.wk.weight : [128, 8192] : torch.bfloat16
462 : 1048576 : layers.51.attention.wv.weight : [128, 8192] : torch.bfloat16
463 : 8388608 : layers.51.attention.wo.weight : [8192, 1024] : torch.bfloat16
464 : 29360128 : layers.51.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
465 : 29360128 : layers.51.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
466 : 29360128 : layers.51.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
467 : 8192 : layers.51.attention_norm.weight : [8192] : torch.bfloat16
468 : 8192 : layers.51.ffn_norm.weight : [8192] : torch.bfloat16
469 : 8388608 : layers.52.attention.wq.weight : [1024, 8192] : torch.bfloat16
470 : 1048576 : layers.52.attention.wk.weight : [128, 8192] : torch.bfloat16
471 : 1048576 : layers.52.attention.wv.weight : [128, 8192] : torch.bfloat16
472 : 8388608 : layers.52.attention.wo.weight : [8192, 1024] : torch.bfloat16
473 : 29360128 : layers.52.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
474 : 29360128 : layers.52.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
475 : 29360128 : layers.52.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
476 : 8192 : layers.52.attention_norm.weight : [8192] : torch.bfloat16
477 : 8192 : layers.52.ffn_norm.weight : [8192] : torch.bfloat16
478 : 8388608 : layers.53.attention.wq.weight : [1024, 8192] : torch.bfloat16
479 : 1048576 : layers.53.attention.wk.weight : [128, 8192] : torch.bfloat16
480 : 1048576 : layers.53.attention.wv.weight : [128, 8192] : torch.bfloat16
481 : 8388608 : layers.53.attention.wo.weight : [8192, 1024] : torch.bfloat16
482 : 29360128 : layers.53.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
483 : 29360128 : layers.53.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
484 : 29360128 : layers.53.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
485 : 8192 : layers.53.attention_norm.weight : [8192] : torch.bfloat16
486 : 8192 : layers.53.ffn_norm.weight : [8192] : torch.bfloat16
487 : 8388608 : layers.54.attention.wq.weight : [1024, 8192] : torch.bfloat16
488 : 1048576 : layers.54.attention.wk.weight : [128, 8192] : torch.bfloat16
489 : 1048576 : layers.54.attention.wv.weight : [128, 8192] : torch.bfloat16
490 : 8388608 : layers.54.attention.wo.weight : [8192, 1024] : torch.bfloat16
491 : 29360128 : layers.54.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
492 : 29360128 : layers.54.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
493 : 29360128 : layers.54.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
494 : 8192 : layers.54.attention_norm.weight : [8192] : torch.bfloat16
495 : 8192 : layers.54.ffn_norm.weight : [8192] : torch.bfloat16
496 : 8388608 : layers.55.attention.wq.weight : [1024, 8192] : torch.bfloat16
497 : 1048576 : layers.55.attention.wk.weight : [128, 8192] : torch.bfloat16
498 : 1048576 : layers.55.attention.wv.weight : [128, 8192] : torch.bfloat16
499 : 8388608 : layers.55.attention.wo.weight : [8192, 1024] : torch.bfloat16
500 : 29360128 : layers.55.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
501 : 29360128 : layers.55.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
502 : 29360128 : layers.55.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
503 : 8192 : layers.55.attention_norm.weight : [8192] : torch.bfloat16
504 : 8192 : layers.55.ffn_norm.weight : [8192] : torch.bfloat16
505 : 8388608 : layers.56.attention.wq.weight : [1024, 8192] : torch.bfloat16
506 : 1048576 : layers.56.attention.wk.weight : [128, 8192] : torch.bfloat16
507 : 1048576 : layers.56.attention.wv.weight : [128, 8192] : torch.bfloat16
508 : 8388608 : layers.56.attention.wo.weight : [8192, 1024] : torch.bfloat16
509 : 29360128 : layers.56.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
510 : 29360128 : layers.56.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
511 : 29360128 : layers.56.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
512 : 8192 : layers.56.attention_norm.weight : [8192] : torch.bfloat16
513 : 8192 : layers.56.ffn_norm.weight : [8192] : torch.bfloat16
514 : 8388608 : layers.57.attention.wq.weight : [1024, 8192] : torch.bfloat16
515 : 1048576 : layers.57.attention.wk.weight : [128, 8192] : torch.bfloat16
516 : 1048576 : layers.57.attention.wv.weight : [128, 8192] : torch.bfloat16
517 : 8388608 : layers.57.attention.wo.weight : [8192, 1024] : torch.bfloat16
518 : 29360128 : layers.57.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
519 : 29360128 : layers.57.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
520 : 29360128 : layers.57.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
521 : 8192 : layers.57.attention_norm.weight : [8192] : torch.bfloat16
522 : 8192 : layers.57.ffn_norm.weight : [8192] : torch.bfloat16
523 : 8388608 : layers.58.attention.wq.weight : [1024, 8192] : torch.bfloat16
524 : 1048576 : layers.58.attention.wk.weight : [128, 8192] : torch.bfloat16
525 : 1048576 : layers.58.attention.wv.weight : [128, 8192] : torch.bfloat16
526 : 8388608 : layers.58.attention.wo.weight : [8192, 1024] : torch.bfloat16
527 : 29360128 : layers.58.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
528 : 29360128 : layers.58.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
529 : 29360128 : layers.58.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
530 : 8192 : layers.58.attention_norm.weight : [8192] : torch.bfloat16
531 : 8192 : layers.58.ffn_norm.weight : [8192] : torch.bfloat16
532 : 8388608 : layers.59.attention.wq.weight : [1024, 8192] : torch.bfloat16
533 : 1048576 : layers.59.attention.wk.weight : [128, 8192] : torch.bfloat16
534 : 1048576 : layers.59.attention.wv.weight : [128, 8192] : torch.bfloat16
535 : 8388608 : layers.59.attention.wo.weight : [8192, 1024] : torch.bfloat16
536 : 29360128 : layers.59.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
537 : 29360128 : layers.59.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
538 : 29360128 : layers.59.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
539 : 8192 : layers.59.attention_norm.weight : [8192] : torch.bfloat16
540 : 8192 : layers.59.ffn_norm.weight : [8192] : torch.bfloat16
541 : 8388608 : layers.60.attention.wq.weight : [1024, 8192] : torch.bfloat16
542 : 1048576 : layers.60.attention.wk.weight : [128, 8192] : torch.bfloat16
543 : 1048576 : layers.60.attention.wv.weight : [128, 8192] : torch.bfloat16
544 : 8388608 : layers.60.attention.wo.weight : [8192, 1024] : torch.bfloat16
545 : 29360128 : layers.60.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
546 : 29360128 : layers.60.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
547 : 29360128 : layers.60.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
548 : 8192 : layers.60.attention_norm.weight : [8192] : torch.bfloat16
549 : 8192 : layers.60.ffn_norm.weight : [8192] : torch.bfloat16
550 : 8388608 : layers.61.attention.wq.weight : [1024, 8192] : torch.bfloat16
551 : 1048576 : layers.61.attention.wk.weight : [128, 8192] : torch.bfloat16
552 : 1048576 : layers.61.attention.wv.weight : [128, 8192] : torch.bfloat16
553 : 8388608 : layers.61.attention.wo.weight : [8192, 1024] : torch.bfloat16
554 : 29360128 : layers.61.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
555 : 29360128 : layers.61.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
556 : 29360128 : layers.61.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
557 : 8192 : layers.61.attention_norm.weight : [8192] : torch.bfloat16
558 : 8192 : layers.61.ffn_norm.weight : [8192] : torch.bfloat16
559 : 8388608 : layers.62.attention.wq.weight : [1024, 8192] : torch.bfloat16
560 : 1048576 : layers.62.attention.wk.weight : [128, 8192] : torch.bfloat16
561 : 1048576 : layers.62.attention.wv.weight : [128, 8192] : torch.bfloat16
562 : 8388608 : layers.62.attention.wo.weight : [8192, 1024] : torch.bfloat16
563 : 29360128 : layers.62.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
564 : 29360128 : layers.62.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
565 : 29360128 : layers.62.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
566 : 8192 : layers.62.attention_norm.weight : [8192] : torch.bfloat16
567 : 8192 : layers.62.ffn_norm.weight : [8192] : torch.bfloat16
568 : 8388608 : layers.63.attention.wq.weight : [1024, 8192] : torch.bfloat16
569 : 1048576 : layers.63.attention.wk.weight : [128, 8192] : torch.bfloat16
570 : 1048576 : layers.63.attention.wv.weight : [128, 8192] : torch.bfloat16
571 : 8388608 : layers.63.attention.wo.weight : [8192, 1024] : torch.bfloat16
572 : 29360128 : layers.63.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
573 : 29360128 : layers.63.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
574 : 29360128 : layers.63.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
575 : 8192 : layers.63.attention_norm.weight : [8192] : torch.bfloat16
576 : 8192 : layers.63.ffn_norm.weight : [8192] : torch.bfloat16
577 : 8388608 : layers.64.attention.wq.weight : [1024, 8192] : torch.bfloat16
578 : 1048576 : layers.64.attention.wk.weight : [128, 8192] : torch.bfloat16
579 : 1048576 : layers.64.attention.wv.weight : [128, 8192] : torch.bfloat16
580 : 8388608 : layers.64.attention.wo.weight : [8192, 1024] : torch.bfloat16
581 : 29360128 : layers.64.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
582 : 29360128 : layers.64.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
583 : 29360128 : layers.64.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
584 : 8192 : layers.64.attention_norm.weight : [8192] : torch.bfloat16
585 : 8192 : layers.64.ffn_norm.weight : [8192] : torch.bfloat16
586 : 8388608 : layers.65.attention.wq.weight : [1024, 8192] : torch.bfloat16
587 : 1048576 : layers.65.attention.wk.weight : [128, 8192] : torch.bfloat16
588 : 1048576 : layers.65.attention.wv.weight : [128, 8192] : torch.bfloat16
589 : 8388608 : layers.65.attention.wo.weight : [8192, 1024] : torch.bfloat16
590 : 29360128 : layers.65.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
591 : 29360128 : layers.65.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
592 : 29360128 : layers.65.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
593 : 8192 : layers.65.attention_norm.weight : [8192] : torch.bfloat16
594 : 8192 : layers.65.ffn_norm.weight : [8192] : torch.bfloat16
595 : 8388608 : layers.66.attention.wq.weight : [1024, 8192] : torch.bfloat16
596 : 1048576 : layers.66.attention.wk.weight : [128, 8192] : torch.bfloat16
597 : 1048576 : layers.66.attention.wv.weight : [128, 8192] : torch.bfloat16
598 : 8388608 : layers.66.attention.wo.weight : [8192, 1024] : torch.bfloat16
599 : 29360128 : layers.66.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
600 : 29360128 : layers.66.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
601 : 29360128 : layers.66.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
602 : 8192 : layers.66.attention_norm.weight : [8192] : torch.bfloat16
603 : 8192 : layers.66.ffn_norm.weight : [8192] : torch.bfloat16
604 : 8388608 : layers.67.attention.wq.weight : [1024, 8192] : torch.bfloat16
605 : 1048576 : layers.67.attention.wk.weight : [128, 8192] : torch.bfloat16
606 : 1048576 : layers.67.attention.wv.weight : [128, 8192] : torch.bfloat16
607 : 8388608 : layers.67.attention.wo.weight : [8192, 1024] : torch.bfloat16
608 : 29360128 : layers.67.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
609 : 29360128 : layers.67.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
610 : 29360128 : layers.67.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
611 : 8192 : layers.67.attention_norm.weight : [8192] : torch.bfloat16
612 : 8192 : layers.67.ffn_norm.weight : [8192] : torch.bfloat16
613 : 8388608 : layers.68.attention.wq.weight : [1024, 8192] : torch.bfloat16
614 : 1048576 : layers.68.attention.wk.weight : [128, 8192] : torch.bfloat16
615 : 1048576 : layers.68.attention.wv.weight : [128, 8192] : torch.bfloat16
616 : 8388608 : layers.68.attention.wo.weight : [8192, 1024] : torch.bfloat16
617 : 29360128 : layers.68.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
618 : 29360128 : layers.68.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
619 : 29360128 : layers.68.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
620 : 8192 : layers.68.attention_norm.weight : [8192] : torch.bfloat16
621 : 8192 : layers.68.ffn_norm.weight : [8192] : torch.bfloat16
622 : 8388608 : layers.69.attention.wq.weight : [1024, 8192] : torch.bfloat16
623 : 1048576 : layers.69.attention.wk.weight : [128, 8192] : torch.bfloat16
624 : 1048576 : layers.69.attention.wv.weight : [128, 8192] : torch.bfloat16
625 : 8388608 : layers.69.attention.wo.weight : [8192, 1024] : torch.bfloat16
626 : 29360128 : layers.69.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
627 : 29360128 : layers.69.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
628 : 29360128 : layers.69.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
629 : 8192 : layers.69.attention_norm.weight : [8192] : torch.bfloat16
630 : 8192 : layers.69.ffn_norm.weight : [8192] : torch.bfloat16
631 : 8388608 : layers.70.attention.wq.weight : [1024, 8192] : torch.bfloat16
632 : 1048576 : layers.70.attention.wk.weight : [128, 8192] : torch.bfloat16
633 : 1048576 : layers.70.attention.wv.weight : [128, 8192] : torch.bfloat16
634 : 8388608 : layers.70.attention.wo.weight : [8192, 1024] : torch.bfloat16
635 : 29360128 : layers.70.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
636 : 29360128 : layers.70.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
637 : 29360128 : layers.70.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
638 : 8192 : layers.70.attention_norm.weight : [8192] : torch.bfloat16
639 : 8192 : layers.70.ffn_norm.weight : [8192] : torch.bfloat16
640 : 8388608 : layers.71.attention.wq.weight : [1024, 8192] : torch.bfloat16
641 : 1048576 : layers.71.attention.wk.weight : [128, 8192] : torch.bfloat16
642 : 1048576 : layers.71.attention.wv.weight : [128, 8192] : torch.bfloat16
643 : 8388608 : layers.71.attention.wo.weight : [8192, 1024] : torch.bfloat16
644 : 29360128 : layers.71.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
645 : 29360128 : layers.71.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
646 : 29360128 : layers.71.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
647 : 8192 : layers.71.attention_norm.weight : [8192] : torch.bfloat16
648 : 8192 : layers.71.ffn_norm.weight : [8192] : torch.bfloat16
649 : 8388608 : layers.72.attention.wq.weight : [1024, 8192] : torch.bfloat16
650 : 1048576 : layers.72.attention.wk.weight : [128, 8192] : torch.bfloat16
651 : 1048576 : layers.72.attention.wv.weight : [128, 8192] : torch.bfloat16
652 : 8388608 : layers.72.attention.wo.weight : [8192, 1024] : torch.bfloat16
653 : 29360128 : layers.72.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
654 : 29360128 : layers.72.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
655 : 29360128 : layers.72.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
656 : 8192 : layers.72.attention_norm.weight : [8192] : torch.bfloat16
657 : 8192 : layers.72.ffn_norm.weight : [8192] : torch.bfloat16
658 : 8388608 : layers.73.attention.wq.weight : [1024, 8192] : torch.bfloat16
659 : 1048576 : layers.73.attention.wk.weight : [128, 8192] : torch.bfloat16
660 : 1048576 : layers.73.attention.wv.weight : [128, 8192] : torch.bfloat16
661 : 8388608 : layers.73.attention.wo.weight : [8192, 1024] : torch.bfloat16
662 : 29360128 : layers.73.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
663 : 29360128 : layers.73.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
664 : 29360128 : layers.73.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
665 : 8192 : layers.73.attention_norm.weight : [8192] : torch.bfloat16
666 : 8192 : layers.73.ffn_norm.weight : [8192] : torch.bfloat16
667 : 8388608 : layers.74.attention.wq.weight : [1024, 8192] : torch.bfloat16
668 : 1048576 : layers.74.attention.wk.weight : [128, 8192] : torch.bfloat16
669 : 1048576 : layers.74.attention.wv.weight : [128, 8192] : torch.bfloat16
670 : 8388608 : layers.74.attention.wo.weight : [8192, 1024] : torch.bfloat16
671 : 29360128 : layers.74.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
672 : 29360128 : layers.74.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
673 : 29360128 : layers.74.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
674 : 8192 : layers.74.attention_norm.weight : [8192] : torch.bfloat16
675 : 8192 : layers.74.ffn_norm.weight : [8192] : torch.bfloat16
676 : 8388608 : layers.75.attention.wq.weight : [1024, 8192] : torch.bfloat16
677 : 1048576 : layers.75.attention.wk.weight : [128, 8192] : torch.bfloat16
678 : 1048576 : layers.75.attention.wv.weight : [128, 8192] : torch.bfloat16
679 : 8388608 : layers.75.attention.wo.weight : [8192, 1024] : torch.bfloat16
680 : 29360128 : layers.75.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
681 : 29360128 : layers.75.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
682 : 29360128 : layers.75.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
683 : 8192 : layers.75.attention_norm.weight : [8192] : torch.bfloat16
684 : 8192 : layers.75.ffn_norm.weight : [8192] : torch.bfloat16
685 : 8388608 : layers.76.attention.wq.weight : [1024, 8192] : torch.bfloat16
686 : 1048576 : layers.76.attention.wk.weight : [128, 8192] : torch.bfloat16
687 : 1048576 : layers.76.attention.wv.weight : [128, 8192] : torch.bfloat16
688 : 8388608 : layers.76.attention.wo.weight : [8192, 1024] : torch.bfloat16
689 : 29360128 : layers.76.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
690 : 29360128 : layers.76.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
691 : 29360128 : layers.76.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
692 : 8192 : layers.76.attention_norm.weight : [8192] : torch.bfloat16
693 : 8192 : layers.76.ffn_norm.weight : [8192] : torch.bfloat16
694 : 8388608 : layers.77.attention.wq.weight : [1024, 8192] : torch.bfloat16
695 : 1048576 : layers.77.attention.wk.weight : [128, 8192] : torch.bfloat16
696 : 1048576 : layers.77.attention.wv.weight : [128, 8192] : torch.bfloat16
697 : 8388608 : layers.77.attention.wo.weight : [8192, 1024] : torch.bfloat16
698 : 29360128 : layers.77.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
699 : 29360128 : layers.77.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
700 : 29360128 : layers.77.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
701 : 8192 : layers.77.attention_norm.weight : [8192] : torch.bfloat16
702 : 8192 : layers.77.ffn_norm.weight : [8192] : torch.bfloat16
703 : 8388608 : layers.78.attention.wq.weight : [1024, 8192] : torch.bfloat16
704 : 1048576 : layers.78.attention.wk.weight : [128, 8192] : torch.bfloat16
705 : 1048576 : layers.78.attention.wv.weight : [128, 8192] : torch.bfloat16
706 : 8388608 : layers.78.attention.wo.weight : [8192, 1024] : torch.bfloat16
707 : 29360128 : layers.78.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
708 : 29360128 : layers.78.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
709 : 29360128 : layers.78.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
710 : 8192 : layers.78.attention_norm.weight : [8192] : torch.bfloat16
711 : 8192 : layers.78.ffn_norm.weight : [8192] : torch.bfloat16
712 : 8388608 : layers.79.attention.wq.weight : [1024, 8192] : torch.bfloat16
713 : 1048576 : layers.79.attention.wk.weight : [128, 8192] : torch.bfloat16
714 : 1048576 : layers.79.attention.wv.weight : [128, 8192] : torch.bfloat16
715 : 8388608 : layers.79.attention.wo.weight : [8192, 1024] : torch.bfloat16
716 : 29360128 : layers.79.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
717 : 29360128 : layers.79.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
718 : 29360128 : layers.79.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
719 : 8192 : layers.79.attention_norm.weight : [8192] : torch.bfloat16
720 : 8192 : layers.79.ffn_norm.weight : [8192] : torch.bfloat16
721 : 8192 : norm.weight : [8192] : torch.bfloat16
722 : 131334144 : output.weight : [16032, 8192] : torch.bfloat16
-----------------------------------------------------------------------------
8820367360 params in total.
17640734720 bytes in total.
5.89 sec, 5.89 sec, 2858.18 MB/s
-----------------------------------------------------------------------------
[2/8]: Loading original/consolidated.01.pth
0 : 131334144 : tok_embeddings.weight : [16032, 8192] : torch.bfloat16
1 : 8388608 : layers.0.attention.wq.weight : [1024, 8192] : torch.bfloat16
2 : 1048576 : layers.0.attention.wk.weight : [128, 8192] : torch.bfloat16
3 : 1048576 : layers.0.attention.wv.weight : [128, 8192] : torch.bfloat16
4 : 8388608 : layers.0.attention.wo.weight : [8192, 1024] : torch.bfloat16
5 : 29360128 : layers.0.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
6 : 29360128 : layers.0.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
7 : 29360128 : layers.0.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
8 : 8192 : layers.0.attention_norm.weight : [8192] : torch.bfloat16
9 : 8192 : layers.0.ffn_norm.weight : [8192] : torch.bfloat16
10 : 8388608 : layers.1.attention.wq.weight : [1024, 8192] : torch.bfloat16
11 : 1048576 : layers.1.attention.wk.weight : [128, 8192] : torch.bfloat16
12 : 1048576 : layers.1.attention.wv.weight : [128, 8192] : torch.bfloat16
13 : 8388608 : layers.1.attention.wo.weight : [8192, 1024] : torch.bfloat16
14 : 29360128 : layers.1.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
15 : 29360128 : layers.1.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
16 : 29360128 : layers.1.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
17 : 8192 : layers.1.attention_norm.weight : [8192] : torch.bfloat16
18 : 8192 : layers.1.ffn_norm.weight : [8192] : torch.bfloat16
19 : 8388608 : layers.2.attention.wq.weight : [1024, 8192] : torch.bfloat16
20 : 1048576 : layers.2.attention.wk.weight : [128, 8192] : torch.bfloat16
21 : 1048576 : layers.2.attention.wv.weight : [128, 8192] : torch.bfloat16
22 : 8388608 : layers.2.attention.wo.weight : [8192, 1024] : torch.bfloat16
23 : 29360128 : layers.2.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
24 : 29360128 : layers.2.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
25 : 29360128 : layers.2.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
26 : 8192 : layers.2.attention_norm.weight : [8192] : torch.bfloat16
27 : 8192 : layers.2.ffn_norm.weight : [8192] : torch.bfloat16
28 : 8388608 : layers.3.attention.wq.weight : [1024, 8192] : torch.bfloat16
29 : 1048576 : layers.3.attention.wk.weight : [128, 8192] : torch.bfloat16
30 : 1048576 : layers.3.attention.wv.weight : [128, 8192] : torch.bfloat16
31 : 8388608 : layers.3.attention.wo.weight : [8192, 1024] : torch.bfloat16
32 : 29360128 : layers.3.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
33 : 29360128 : layers.3.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
34 : 29360128 : layers.3.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
35 : 8192 : layers.3.attention_norm.weight : [8192] : torch.bfloat16
36 : 8192 : layers.3.ffn_norm.weight : [8192] : torch.bfloat16
37 : 8388608 : layers.4.attention.wq.weight : [1024, 8192] : torch.bfloat16
38 : 1048576 : layers.4.attention.wk.weight : [128, 8192] : torch.bfloat16
39 : 1048576 : layers.4.attention.wv.weight : [128, 8192] : torch.bfloat16
40 : 8388608 : layers.4.attention.wo.weight : [8192, 1024] : torch.bfloat16
41 : 29360128 : layers.4.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
42 : 29360128 : layers.4.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
43 : 29360128 : layers.4.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
44 : 8192 : layers.4.attention_norm.weight : [8192] : torch.bfloat16
45 : 8192 : layers.4.ffn_norm.weight : [8192] : torch.bfloat16
46 : 8388608 : layers.5.attention.wq.weight : [1024, 8192] : torch.bfloat16
47 : 1048576 : layers.5.attention.wk.weight : [128, 8192] : torch.bfloat16
48 : 1048576 : layers.5.attention.wv.weight : [128, 8192] : torch.bfloat16
49 : 8388608 : layers.5.attention.wo.weight : [8192, 1024] : torch.bfloat16
50 : 29360128 : layers.5.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
51 : 29360128 : layers.5.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
52 : 29360128 : layers.5.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
53 : 8192 : layers.5.attention_norm.weight : [8192] : torch.bfloat16
54 : 8192 : layers.5.ffn_norm.weight : [8192] : torch.bfloat16
55 : 8388608 : layers.6.attention.wq.weight : [1024, 8192] : torch.bfloat16
56 : 1048576 : layers.6.attention.wk.weight : [128, 8192] : torch.bfloat16
57 : 1048576 : layers.6.attention.wv.weight : [128, 8192] : torch.bfloat16
58 : 8388608 : layers.6.attention.wo.weight : [8192, 1024] : torch.bfloat16
59 : 29360128 : layers.6.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
60 : 29360128 : layers.6.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
61 : 29360128 : layers.6.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
62 : 8192 : layers.6.attention_norm.weight : [8192] : torch.bfloat16
63 : 8192 : layers.6.ffn_norm.weight : [8192] : torch.bfloat16
64 : 8388608 : layers.7.attention.wq.weight : [1024, 8192] : torch.bfloat16
65 : 1048576 : layers.7.attention.wk.weight : [128, 8192] : torch.bfloat16
66 : 1048576 : layers.7.attention.wv.weight : [128, 8192] : torch.bfloat16
67 : 8388608 : layers.7.attention.wo.weight : [8192, 1024] : torch.bfloat16
68 : 29360128 : layers.7.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
69 : 29360128 : layers.7.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
70 : 29360128 : layers.7.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
71 : 8192 : layers.7.attention_norm.weight : [8192] : torch.bfloat16
72 : 8192 : layers.7.ffn_norm.weight : [8192] : torch.bfloat16
73 : 8388608 : layers.8.attention.wq.weight : [1024, 8192] : torch.bfloat16
74 : 1048576 : layers.8.attention.wk.weight : [128, 8192] : torch.bfloat16
75 : 1048576 : layers.8.attention.wv.weight : [128, 8192] : torch.bfloat16
76 : 8388608 : layers.8.attention.wo.weight : [8192, 1024] : torch.bfloat16
77 : 29360128 : layers.8.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
78 : 29360128 : layers.8.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
79 : 29360128 : layers.8.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
80 : 8192 : layers.8.attention_norm.weight : [8192] : torch.bfloat16
81 : 8192 : layers.8.ffn_norm.weight : [8192] : torch.bfloat16
82 : 8388608 : layers.9.attention.wq.weight : [1024, 8192] : torch.bfloat16
83 : 1048576 : layers.9.attention.wk.weight : [128, 8192] : torch.bfloat16
84 : 1048576 : layers.9.attention.wv.weight : [128, 8192] : torch.bfloat16
85 : 8388608 : layers.9.attention.wo.weight : [8192, 1024] : torch.bfloat16
86 : 29360128 : layers.9.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
87 : 29360128 : layers.9.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
88 : 29360128 : layers.9.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
89 : 8192 : layers.9.attention_norm.weight : [8192] : torch.bfloat16
90 : 8192 : layers.9.ffn_norm.weight : [8192] : torch.bfloat16
91 : 8388608 : layers.10.attention.wq.weight : [1024, 8192] : torch.bfloat16
92 : 1048576 : layers.10.attention.wk.weight : [128, 8192] : torch.bfloat16
93 : 1048576 : layers.10.attention.wv.weight : [128, 8192] : torch.bfloat16
94 : 8388608 : layers.10.attention.wo.weight : [8192, 1024] : torch.bfloat16
95 : 29360128 : layers.10.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
96 : 29360128 : layers.10.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
97 : 29360128 : layers.10.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
98 : 8192 : layers.10.attention_norm.weight : [8192] : torch.bfloat16
99 : 8192 : layers.10.ffn_norm.weight : [8192] : torch.bfloat16
100 : 8388608 : layers.11.attention.wq.weight : [1024, 8192] : torch.bfloat16
101 : 1048576 : layers.11.attention.wk.weight : [128, 8192] : torch.bfloat16
102 : 1048576 : layers.11.attention.wv.weight : [128, 8192] : torch.bfloat16
103 : 8388608 : layers.11.attention.wo.weight : [8192, 1024] : torch.bfloat16
104 : 29360128 : layers.11.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
105 : 29360128 : layers.11.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
106 : 29360128 : layers.11.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
107 : 8192 : layers.11.attention_norm.weight : [8192] : torch.bfloat16
108 : 8192 : layers.11.ffn_norm.weight : [8192] : torch.bfloat16
109 : 8388608 : layers.12.attention.wq.weight : [1024, 8192] : torch.bfloat16
110 : 1048576 : layers.12.attention.wk.weight : [128, 8192] : torch.bfloat16
111 : 1048576 : layers.12.attention.wv.weight : [128, 8192] : torch.bfloat16
112 : 8388608 : layers.12.attention.wo.weight : [8192, 1024] : torch.bfloat16
113 : 29360128 : layers.12.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
114 : 29360128 : layers.12.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
115 : 29360128 : layers.12.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
116 : 8192 : layers.12.attention_norm.weight : [8192] : torch.bfloat16
117 : 8192 : layers.12.ffn_norm.weight : [8192] : torch.bfloat16
118 : 8388608 : layers.13.attention.wq.weight : [1024, 8192] : torch.bfloat16
119 : 1048576 : layers.13.attention.wk.weight : [128, 8192] : torch.bfloat16
120 : 1048576 : layers.13.attention.wv.weight : [128, 8192] : torch.bfloat16
121 : 8388608 : layers.13.attention.wo.weight : [8192, 1024] : torch.bfloat16
122 : 29360128 : layers.13.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
123 : 29360128 : layers.13.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
124 : 29360128 : layers.13.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
125 : 8192 : layers.13.attention_norm.weight : [8192] : torch.bfloat16
126 : 8192 : layers.13.ffn_norm.weight : [8192] : torch.bfloat16
127 : 8388608 : layers.14.attention.wq.weight : [1024, 8192] : torch.bfloat16
128 : 1048576 : layers.14.attention.wk.weight : [128, 8192] : torch.bfloat16
129 : 1048576 : layers.14.attention.wv.weight : [128, 8192] : torch.bfloat16
130 : 8388608 : layers.14.attention.wo.weight : [8192, 1024] : torch.bfloat16
131 : 29360128 : layers.14.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
132 : 29360128 : layers.14.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
133 : 29360128 : layers.14.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
134 : 8192 : layers.14.attention_norm.weight : [8192] : torch.bfloat16
135 : 8192 : layers.14.ffn_norm.weight : [8192] : torch.bfloat16
136 : 8388608 : layers.15.attention.wq.weight : [1024, 8192] : torch.bfloat16
137 : 1048576 : layers.15.attention.wk.weight : [128, 8192] : torch.bfloat16
138 : 1048576 : layers.15.attention.wv.weight : [128, 8192] : torch.bfloat16
139 : 8388608 : layers.15.attention.wo.weight : [8192, 1024] : torch.bfloat16
140 : 29360128 : layers.15.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
141 : 29360128 : layers.15.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
142 : 29360128 : layers.15.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
143 : 8192 : layers.15.attention_norm.weight : [8192] : torch.bfloat16
144 : 8192 : layers.15.ffn_norm.weight : [8192] : torch.bfloat16
145 : 8388608 : layers.16.attention.wq.weight : [1024, 8192] : torch.bfloat16
146 : 1048576 : layers.16.attention.wk.weight : [128, 8192] : torch.bfloat16
147 : 1048576 : layers.16.attention.wv.weight : [128, 8192] : torch.bfloat16
148 : 8388608 : layers.16.attention.wo.weight : [8192, 1024] : torch.bfloat16
149 : 29360128 : layers.16.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
150 : 29360128 : layers.16.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
151 : 29360128 : layers.16.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
152 : 8192 : layers.16.attention_norm.weight : [8192] : torch.bfloat16
153 : 8192 : layers.16.ffn_norm.weight : [8192] : torch.bfloat16
154 : 8388608 : layers.17.attention.wq.weight : [1024, 8192] : torch.bfloat16
155 : 1048576 : layers.17.attention.wk.weight : [128, 8192] : torch.bfloat16
156 : 1048576 : layers.17.attention.wv.weight : [128, 8192] : torch.bfloat16
157 : 8388608 : layers.17.attention.wo.weight : [8192, 1024] : torch.bfloat16
158 : 29360128 : layers.17.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
159 : 29360128 : layers.17.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
160 : 29360128 : layers.17.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
161 : 8192 : layers.17.attention_norm.weight : [8192] : torch.bfloat16
162 : 8192 : layers.17.ffn_norm.weight : [8192] : torch.bfloat16
163 : 8388608 : layers.18.attention.wq.weight : [1024, 8192] : torch.bfloat16
164 : 1048576 : layers.18.attention.wk.weight : [128, 8192] : torch.bfloat16
165 : 1048576 : layers.18.attention.wv.weight : [128, 8192] : torch.bfloat16
166 : 8388608 : layers.18.attention.wo.weight : [8192, 1024] : torch.bfloat16
167 : 29360128 : layers.18.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
168 : 29360128 : layers.18.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
169 : 29360128 : layers.18.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
170 : 8192 : layers.18.attention_norm.weight : [8192] : torch.bfloat16
171 : 8192 : layers.18.ffn_norm.weight : [8192] : torch.bfloat16
172 : 8388608 : layers.19.attention.wq.weight : [1024, 8192] : torch.bfloat16
173 : 1048576 : layers.19.attention.wk.weight : [128, 8192] : torch.bfloat16
174 : 1048576 : layers.19.attention.wv.weight : [128, 8192] : torch.bfloat16
175 : 8388608 : layers.19.attention.wo.weight : [8192, 1024] : torch.bfloat16
176 : 29360128 : layers.19.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
177 : 29360128 : layers.19.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
178 : 29360128 : layers.19.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
179 : 8192 : layers.19.attention_norm.weight : [8192] : torch.bfloat16
180 : 8192 : layers.19.ffn_norm.weight : [8192] : torch.bfloat16
181 : 8388608 : layers.20.attention.wq.weight : [1024, 8192] : torch.bfloat16
182 : 1048576 : layers.20.attention.wk.weight : [128, 8192] : torch.bfloat16
183 : 1048576 : layers.20.attention.wv.weight : [128, 8192] : torch.bfloat16
184 : 8388608 : layers.20.attention.wo.weight : [8192, 1024] : torch.bfloat16
185 : 29360128 : layers.20.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
186 : 29360128 : layers.20.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
187 : 29360128 : layers.20.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
188 : 8192 : layers.20.attention_norm.weight : [8192] : torch.bfloat16
189 : 8192 : layers.20.ffn_norm.weight : [8192] : torch.bfloat16
190 : 8388608 : layers.21.attention.wq.weight : [1024, 8192] : torch.bfloat16
191 : 1048576 : layers.21.attention.wk.weight : [128, 8192] : torch.bfloat16
192 : 1048576 : layers.21.attention.wv.weight : [128, 8192] : torch.bfloat16
193 : 8388608 : layers.21.attention.wo.weight : [8192, 1024] : torch.bfloat16
194 : 29360128 : layers.21.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
195 : 29360128 : layers.21.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
196 : 29360128 : layers.21.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
197 : 8192 : layers.21.attention_norm.weight : [8192] : torch.bfloat16
198 : 8192 : layers.21.ffn_norm.weight : [8192] : torch.bfloat16
199 : 8388608 : layers.22.attention.wq.weight : [1024, 8192] : torch.bfloat16
200 : 1048576 : layers.22.attention.wk.weight : [128, 8192] : torch.bfloat16
201 : 1048576 : layers.22.attention.wv.weight : [128, 8192] : torch.bfloat16
202 : 8388608 : layers.22.attention.wo.weight : [8192, 1024] : torch.bfloat16
203 : 29360128 : layers.22.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
204 : 29360128 : layers.22.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
205 : 29360128 : layers.22.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
206 : 8192 : layers.22.attention_norm.weight : [8192] : torch.bfloat16
207 : 8192 : layers.22.ffn_norm.weight : [8192] : torch.bfloat16
208 : 8388608 : layers.23.attention.wq.weight : [1024, 8192] : torch.bfloat16
209 : 1048576 : layers.23.attention.wk.weight : [128, 8192] : torch.bfloat16
210 : 1048576 : layers.23.attention.wv.weight : [128, 8192] : torch.bfloat16
211 : 8388608 : layers.23.attention.wo.weight : [8192, 1024] : torch.bfloat16
212 : 29360128 : layers.23.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
213 : 29360128 : layers.23.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
214 : 29360128 : layers.23.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
215 : 8192 : layers.23.attention_norm.weight : [8192] : torch.bfloat16
216 : 8192 : layers.23.ffn_norm.weight : [8192] : torch.bfloat16
217 : 8388608 : layers.24.attention.wq.weight : [1024, 8192] : torch.bfloat16
218 : 1048576 : layers.24.attention.wk.weight : [128, 8192] : torch.bfloat16
219 : 1048576 : layers.24.attention.wv.weight : [128, 8192] : torch.bfloat16
220 : 8388608 : layers.24.attention.wo.weight : [8192, 1024] : torch.bfloat16
221 : 29360128 : layers.24.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
222 : 29360128 : layers.24.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
223 : 29360128 : layers.24.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
224 : 8192 : layers.24.attention_norm.weight : [8192] : torch.bfloat16
225 : 8192 : layers.24.ffn_norm.weight : [8192] : torch.bfloat16
226 : 8388608 : layers.25.attention.wq.weight : [1024, 8192] : torch.bfloat16
227 : 1048576 : layers.25.attention.wk.weight : [128, 8192] : torch.bfloat16
228 : 1048576 : layers.25.attention.wv.weight : [128, 8192] : torch.bfloat16
229 : 8388608 : layers.25.attention.wo.weight : [8192, 1024] : torch.bfloat16
230 : 29360128 : layers.25.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
231 : 29360128 : layers.25.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
232 : 29360128 : layers.25.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
233 : 8192 : layers.25.attention_norm.weight : [8192] : torch.bfloat16
234 : 8192 : layers.25.ffn_norm.weight : [8192] : torch.bfloat16
235 : 8388608 : layers.26.attention.wq.weight : [1024, 8192] : torch.bfloat16
236 : 1048576 : layers.26.attention.wk.weight : [128, 8192] : torch.bfloat16
237 : 1048576 : layers.26.attention.wv.weight : [128, 8192] : torch.bfloat16
238 : 8388608 : layers.26.attention.wo.weight : [8192, 1024] : torch.bfloat16
239 : 29360128 : layers.26.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
240 : 29360128 : layers.26.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
241 : 29360128 : layers.26.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
242 : 8192 : layers.26.attention_norm.weight : [8192] : torch.bfloat16
243 : 8192 : layers.26.ffn_norm.weight : [8192] : torch.bfloat16
244 : 8388608 : layers.27.attention.wq.weight : [1024, 8192] : torch.bfloat16
245 : 1048576 : layers.27.attention.wk.weight : [128, 8192] : torch.bfloat16
246 : 1048576 : layers.27.attention.wv.weight : [128, 8192] : torch.bfloat16
247 : 8388608 : layers.27.attention.wo.weight : [8192, 1024] : torch.bfloat16
248 : 29360128 : layers.27.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
249 : 29360128 : layers.27.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
250 : 29360128 : layers.27.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
251 : 8192 : layers.27.attention_norm.weight : [8192] : torch.bfloat16
252 : 8192 : layers.27.ffn_norm.weight : [8192] : torch.bfloat16
253 : 8388608 : layers.28.attention.wq.weight : [1024, 8192] : torch.bfloat16
254 : 1048576 : layers.28.attention.wk.weight : [128, 8192] : torch.bfloat16
255 : 1048576 : layers.28.attention.wv.weight : [128, 8192] : torch.bfloat16
256 : 8388608 : layers.28.attention.wo.weight : [8192, 1024] : torch.bfloat16
257 : 29360128 : layers.28.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
258 : 29360128 : layers.28.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
259 : 29360128 : layers.28.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
260 : 8192 : layers.28.attention_norm.weight : [8192] : torch.bfloat16
261 : 8192 : layers.28.ffn_norm.weight : [8192] : torch.bfloat16
262 : 8388608 : layers.29.attention.wq.weight : [1024, 8192] : torch.bfloat16
263 : 1048576 : layers.29.attention.wk.weight : [128, 8192] : torch.bfloat16
264 : 1048576 : layers.29.attention.wv.weight : [128, 8192] : torch.bfloat16
265 : 8388608 : layers.29.attention.wo.weight : [8192, 1024] : torch.bfloat16
266 : 29360128 : layers.29.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
267 : 29360128 : layers.29.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
268 : 29360128 : layers.29.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
269 : 8192 : layers.29.attention_norm.weight : [8192] : torch.bfloat16
270 : 8192 : layers.29.ffn_norm.weight : [8192] : torch.bfloat16
271 : 8388608 : layers.30.attention.wq.weight : [1024, 8192] : torch.bfloat16
272 : 1048576 : layers.30.attention.wk.weight : [128, 8192] : torch.bfloat16
273 : 1048576 : layers.30.attention.wv.weight : [128, 8192] : torch.bfloat16
274 : 8388608 : layers.30.attention.wo.weight : [8192, 1024] : torch.bfloat16
275 : 29360128 : layers.30.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
276 : 29360128 : layers.30.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
277 : 29360128 : layers.30.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
278 : 8192 : layers.30.attention_norm.weight : [8192] : torch.bfloat16
279 : 8192 : layers.30.ffn_norm.weight : [8192] : torch.bfloat16
280 : 8388608 : layers.31.attention.wq.weight : [1024, 8192] : torch.bfloat16
281 : 1048576 : layers.31.attention.wk.weight : [128, 8192] : torch.bfloat16
282 : 1048576 : layers.31.attention.wv.weight : [128, 8192] : torch.bfloat16
283 : 8388608 : layers.31.attention.wo.weight : [8192, 1024] : torch.bfloat16
284 : 29360128 : layers.31.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
285 : 29360128 : layers.31.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
286 : 29360128 : layers.31.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
287 : 8192 : layers.31.attention_norm.weight : [8192] : torch.bfloat16
288 : 8192 : layers.31.ffn_norm.weight : [8192] : torch.bfloat16
289 : 8388608 : layers.32.attention.wq.weight : [1024, 8192] : torch.bfloat16
290 : 1048576 : layers.32.attention.wk.weight : [128, 8192] : torch.bfloat16
291 : 1048576 : layers.32.attention.wv.weight : [128, 8192] : torch.bfloat16
292 : 8388608 : layers.32.attention.wo.weight : [8192, 1024] : torch.bfloat16
293 : 29360128 : layers.32.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
294 : 29360128 : layers.32.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
295 : 29360128 : layers.32.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
296 : 8192 : layers.32.attention_norm.weight : [8192] : torch.bfloat16
297 : 8192 : layers.32.ffn_norm.weight : [8192] : torch.bfloat16
298 : 8388608 : layers.33.attention.wq.weight : [1024, 8192] : torch.bfloat16
299 : 1048576 : layers.33.attention.wk.weight : [128, 8192] : torch.bfloat16
300 : 1048576 : layers.33.attention.wv.weight : [128, 8192] : torch.bfloat16
301 : 8388608 : layers.33.attention.wo.weight : [8192, 1024] : torch.bfloat16
302 : 29360128 : layers.33.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
303 : 29360128 : layers.33.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
304 : 29360128 : layers.33.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
305 : 8192 : layers.33.attention_norm.weight : [8192] : torch.bfloat16
306 : 8192 : layers.33.ffn_norm.weight : [8192] : torch.bfloat16
307 : 8388608 : layers.34.attention.wq.weight : [1024, 8192] : torch.bfloat16
308 : 1048576 : layers.34.attention.wk.weight : [128, 8192] : torch.bfloat16
309 : 1048576 : layers.34.attention.wv.weight : [128, 8192] : torch.bfloat16
310 : 8388608 : layers.34.attention.wo.weight : [8192, 1024] : torch.bfloat16
311 : 29360128 : layers.34.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
312 : 29360128 : layers.34.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
313 : 29360128 : layers.34.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
314 : 8192 : layers.34.attention_norm.weight : [8192] : torch.bfloat16
315 : 8192 : layers.34.ffn_norm.weight : [8192] : torch.bfloat16
316 : 8388608 : layers.35.attention.wq.weight : [1024, 8192] : torch.bfloat16
317 : 1048576 : layers.35.attention.wk.weight : [128, 8192] : torch.bfloat16
318 : 1048576 : layers.35.attention.wv.weight : [128, 8192] : torch.bfloat16
319 : 8388608 : layers.35.attention.wo.weight : [8192, 1024] : torch.bfloat16
320 : 29360128 : layers.35.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
321 : 29360128 : layers.35.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
322 : 29360128 : layers.35.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
323 : 8192 : layers.35.attention_norm.weight : [8192] : torch.bfloat16
324 : 8192 : layers.35.ffn_norm.weight : [8192] : torch.bfloat16
325 : 8388608 : layers.36.attention.wq.weight : [1024, 8192] : torch.bfloat16
326 : 1048576 : layers.36.attention.wk.weight : [128, 8192] : torch.bfloat16
327 : 1048576 : layers.36.attention.wv.weight : [128, 8192] : torch.bfloat16
328 : 8388608 : layers.36.attention.wo.weight : [8192, 1024] : torch.bfloat16
329 : 29360128 : layers.36.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
330 : 29360128 : layers.36.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
331 : 29360128 : layers.36.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
332 : 8192 : layers.36.attention_norm.weight : [8192] : torch.bfloat16
333 : 8192 : layers.36.ffn_norm.weight : [8192] : torch.bfloat16
334 : 8388608 : layers.37.attention.wq.weight : [1024, 8192] : torch.bfloat16
335 : 1048576 : layers.37.attention.wk.weight : [128, 8192] : torch.bfloat16
336 : 1048576 : layers.37.attention.wv.weight : [128, 8192] : torch.bfloat16
337 : 8388608 : layers.37.attention.wo.weight : [8192, 1024] : torch.bfloat16
338 : 29360128 : layers.37.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
339 : 29360128 : layers.37.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
340 : 29360128 : layers.37.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
341 : 8192 : layers.37.attention_norm.weight : [8192] : torch.bfloat16
342 : 8192 : layers.37.ffn_norm.weight : [8192] : torch.bfloat16
343 : 8388608 : layers.38.attention.wq.weight : [1024, 8192] : torch.bfloat16
344 : 1048576 : layers.38.attention.wk.weight : [128, 8192] : torch.bfloat16
345 : 1048576 : layers.38.attention.wv.weight : [128, 8192] : torch.bfloat16
346 : 8388608 : layers.38.attention.wo.weight : [8192, 1024] : torch.bfloat16
347 : 29360128 : layers.38.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
348 : 29360128 : layers.38.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
349 : 29360128 : layers.38.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
350 : 8192 : layers.38.attention_norm.weight : [8192] : torch.bfloat16
351 : 8192 : layers.38.ffn_norm.weight : [8192] : torch.bfloat16
352 : 8388608 : layers.39.attention.wq.weight : [1024, 8192] : torch.bfloat16
353 : 1048576 : layers.39.attention.wk.weight : [128, 8192] : torch.bfloat16
354 : 1048576 : layers.39.attention.wv.weight : [128, 8192] : torch.bfloat16
355 : 8388608 : layers.39.attention.wo.weight : [8192, 1024] : torch.bfloat16
356 : 29360128 : layers.39.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
357 : 29360128 : layers.39.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
358 : 29360128 : layers.39.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
359 : 8192 : layers.39.attention_norm.weight : [8192] : torch.bfloat16
360 : 8192 : layers.39.ffn_norm.weight : [8192] : torch.bfloat16
361 : 8388608 : layers.40.attention.wq.weight : [1024, 8192] : torch.bfloat16
362 : 1048576 : layers.40.attention.wk.weight : [128, 8192] : torch.bfloat16
363 : 1048576 : layers.40.attention.wv.weight : [128, 8192] : torch.bfloat16
364 : 8388608 : layers.40.attention.wo.weight : [8192, 1024] : torch.bfloat16
365 : 29360128 : layers.40.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
366 : 29360128 : layers.40.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
367 : 29360128 : layers.40.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
368 : 8192 : layers.40.attention_norm.weight : [8192] : torch.bfloat16
369 : 8192 : layers.40.ffn_norm.weight : [8192] : torch.bfloat16
370 : 8388608 : layers.41.attention.wq.weight : [1024, 8192] : torch.bfloat16
371 : 1048576 : layers.41.attention.wk.weight : [128, 8192] : torch.bfloat16
372 : 1048576 : layers.41.attention.wv.weight : [128, 8192] : torch.bfloat16
373 : 8388608 : layers.41.attention.wo.weight : [8192, 1024] : torch.bfloat16
374 : 29360128 : layers.41.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
375 : 29360128 : layers.41.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
376 : 29360128 : layers.41.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
377 : 8192 : layers.41.attention_norm.weight : [8192] : torch.bfloat16
378 : 8192 : layers.41.ffn_norm.weight : [8192] : torch.bfloat16
379 : 8388608 : layers.42.attention.wq.weight : [1024, 8192] : torch.bfloat16
380 : 1048576 : layers.42.attention.wk.weight : [128, 8192] : torch.bfloat16
381 : 1048576 : layers.42.attention.wv.weight : [128, 8192] : torch.bfloat16
382 : 8388608 : layers.42.attention.wo.weight : [8192, 1024] : torch.bfloat16
383 : 29360128 : layers.42.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
384 : 29360128 : layers.42.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
385 : 29360128 : layers.42.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
386 : 8192 : layers.42.attention_norm.weight : [8192] : torch.bfloat16
387 : 8192 : layers.42.ffn_norm.weight : [8192] : torch.bfloat16
388 : 8388608 : layers.43.attention.wq.weight : [1024, 8192] : torch.bfloat16
389 : 1048576 : layers.43.attention.wk.weight : [128, 8192] : torch.bfloat16
390 : 1048576 : layers.43.attention.wv.weight : [128, 8192] : torch.bfloat16
391 : 8388608 : layers.43.attention.wo.weight : [8192, 1024] : torch.bfloat16
392 : 29360128 : layers.43.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
393 : 29360128 : layers.43.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
394 : 29360128 : layers.43.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
395 : 8192 : layers.43.attention_norm.weight : [8192] : torch.bfloat16
396 : 8192 : layers.43.ffn_norm.weight : [8192] : torch.bfloat16
397 : 8388608 : layers.44.attention.wq.weight : [1024, 8192] : torch.bfloat16
398 : 1048576 : layers.44.attention.wk.weight : [128, 8192] : torch.bfloat16
399 : 1048576 : layers.44.attention.wv.weight : [128, 8192] : torch.bfloat16
400 : 8388608 : layers.44.attention.wo.weight : [8192, 1024] : torch.bfloat16
401 : 29360128 : layers.44.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
402 : 29360128 : layers.44.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
403 : 29360128 : layers.44.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
404 : 8192 : layers.44.attention_norm.weight : [8192] : torch.bfloat16
405 : 8192 : layers.44.ffn_norm.weight : [8192] : torch.bfloat16
406 : 8388608 : layers.45.attention.wq.weight : [1024, 8192] : torch.bfloat16
407 : 1048576 : layers.45.attention.wk.weight : [128, 8192] : torch.bfloat16
408 : 1048576 : layers.45.attention.wv.weight : [128, 8192] : torch.bfloat16
409 : 8388608 : layers.45.attention.wo.weight : [8192, 1024] : torch.bfloat16
410 : 29360128 : layers.45.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
411 : 29360128 : layers.45.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
412 : 29360128 : layers.45.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
413 : 8192 : layers.45.attention_norm.weight : [8192] : torch.bfloat16
414 : 8192 : layers.45.ffn_norm.weight : [8192] : torch.bfloat16
415 : 8388608 : layers.46.attention.wq.weight : [1024, 8192] : torch.bfloat16
416 : 1048576 : layers.46.attention.wk.weight : [128, 8192] : torch.bfloat16
417 : 1048576 : layers.46.attention.wv.weight : [128, 8192] : torch.bfloat16
418 : 8388608 : layers.46.attention.wo.weight : [8192, 1024] : torch.bfloat16
419 : 29360128 : layers.46.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
420 : 29360128 : layers.46.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
421 : 29360128 : layers.46.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
422 : 8192 : layers.46.attention_norm.weight : [8192] : torch.bfloat16
423 : 8192 : layers.46.ffn_norm.weight : [8192] : torch.bfloat16
424 : 8388608 : layers.47.attention.wq.weight : [1024, 8192] : torch.bfloat16
425 : 1048576 : layers.47.attention.wk.weight : [128, 8192] : torch.bfloat16
426 : 1048576 : layers.47.attention.wv.weight : [128, 8192] : torch.bfloat16
427 : 8388608 : layers.47.attention.wo.weight : [8192, 1024] : torch.bfloat16
428 : 29360128 : layers.47.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
429 : 29360128 : layers.47.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
430 : 29360128 : layers.47.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
431 : 8192 : layers.47.attention_norm.weight : [8192] : torch.bfloat16
432 : 8192 : layers.47.ffn_norm.weight : [8192] : torch.bfloat16
433 : 8388608 : layers.48.attention.wq.weight : [1024, 8192] : torch.bfloat16
434 : 1048576 : layers.48.attention.wk.weight : [128, 8192] : torch.bfloat16
435 : 1048576 : layers.48.attention.wv.weight : [128, 8192] : torch.bfloat16
436 : 8388608 : layers.48.attention.wo.weight : [8192, 1024] : torch.bfloat16
437 : 29360128 : layers.48.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
438 : 29360128 : layers.48.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
439 : 29360128 : layers.48.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
440 : 8192 : layers.48.attention_norm.weight : [8192] : torch.bfloat16
441 : 8192 : layers.48.ffn_norm.weight : [8192] : torch.bfloat16
442 : 8388608 : layers.49.attention.wq.weight : [1024, 8192] : torch.bfloat16
443 : 1048576 : layers.49.attention.wk.weight : [128, 8192] : torch.bfloat16
444 : 1048576 : layers.49.attention.wv.weight : [128, 8192] : torch.bfloat16
445 : 8388608 : layers.49.attention.wo.weight : [8192, 1024] : torch.bfloat16
446 : 29360128 : layers.49.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
447 : 29360128 : layers.49.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
448 : 29360128 : layers.49.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
449 : 8192 : layers.49.attention_norm.weight : [8192] : torch.bfloat16
450 : 8192 : layers.49.ffn_norm.weight : [8192] : torch.bfloat16
451 : 8388608 : layers.50.attention.wq.weight : [1024, 8192] : torch.bfloat16
452 : 1048576 : layers.50.attention.wk.weight : [128, 8192] : torch.bfloat16
453 : 1048576 : layers.50.attention.wv.weight : [128, 8192] : torch.bfloat16
454 : 8388608 : layers.50.attention.wo.weight : [8192, 1024] : torch.bfloat16
455 : 29360128 : layers.50.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
456 : 29360128 : layers.50.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
457 : 29360128 : layers.50.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
458 : 8192 : layers.50.attention_norm.weight : [8192] : torch.bfloat16
459 : 8192 : layers.50.ffn_norm.weight : [8192] : torch.bfloat16
460 : 8388608 : layers.51.attention.wq.weight : [1024, 8192] : torch.bfloat16
461 : 1048576 : layers.51.attention.wk.weight : [128, 8192] : torch.bfloat16
462 : 1048576 : layers.51.attention.wv.weight : [128, 8192] : torch.bfloat16
463 : 8388608 : layers.51.attention.wo.weight : [8192, 1024] : torch.bfloat16
464 : 29360128 : layers.51.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
465 : 29360128 : layers.51.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
466 : 29360128 : layers.51.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
467 : 8192 : layers.51.attention_norm.weight : [8192] : torch.bfloat16
468 : 8192 : layers.51.ffn_norm.weight : [8192] : torch.bfloat16
469 : 8388608 : layers.52.attention.wq.weight : [1024, 8192] : torch.bfloat16
470 : 1048576 : layers.52.attention.wk.weight : [128, 8192] : torch.bfloat16
471 : 1048576 : layers.52.attention.wv.weight : [128, 8192] : torch.bfloat16
472 : 8388608 : layers.52.attention.wo.weight : [8192, 1024] : torch.bfloat16
473 : 29360128 : layers.52.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
474 : 29360128 : layers.52.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
475 : 29360128 : layers.52.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
476 : 8192 : layers.52.attention_norm.weight : [8192] : torch.bfloat16
477 : 8192 : layers.52.ffn_norm.weight : [8192] : torch.bfloat16
478 : 8388608 : layers.53.attention.wq.weight : [1024, 8192] : torch.bfloat16
479 : 1048576 : layers.53.attention.wk.weight : [128, 8192] : torch.bfloat16
480 : 1048576 : layers.53.attention.wv.weight : [128, 8192] : torch.bfloat16
481 : 8388608 : layers.53.attention.wo.weight : [8192, 1024] : torch.bfloat16
482 : 29360128 : layers.53.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
483 : 29360128 : layers.53.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
484 : 29360128 : layers.53.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
485 : 8192 : layers.53.attention_norm.weight : [8192] : torch.bfloat16
486 : 8192 : layers.53.ffn_norm.weight : [8192] : torch.bfloat16
487 : 8388608 : layers.54.attention.wq.weight : [1024, 8192] : torch.bfloat16
488 : 1048576 : layers.54.attention.wk.weight : [128, 8192] : torch.bfloat16
489 : 1048576 : layers.54.attention.wv.weight : [128, 8192] : torch.bfloat16
490 : 8388608 : layers.54.attention.wo.weight : [8192, 1024] : torch.bfloat16
491 : 29360128 : layers.54.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
492 : 29360128 : layers.54.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
493 : 29360128 : layers.54.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
494 : 8192 : layers.54.attention_norm.weight : [8192] : torch.bfloat16
495 : 8192 : layers.54.ffn_norm.weight : [8192] : torch.bfloat16
496 : 8388608 : layers.55.attention.wq.weight : [1024, 8192] : torch.bfloat16
497 : 1048576 : layers.55.attention.wk.weight : [128, 8192] : torch.bfloat16
498 : 1048576 : layers.55.attention.wv.weight : [128, 8192] : torch.bfloat16
499 : 8388608 : layers.55.attention.wo.weight : [8192, 1024] : torch.bfloat16
500 : 29360128 : layers.55.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
501 : 29360128 : layers.55.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
502 : 29360128 : layers.55.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
503 : 8192 : layers.55.attention_norm.weight : [8192] : torch.bfloat16
504 : 8192 : layers.55.ffn_norm.weight : [8192] : torch.bfloat16
505 : 8388608 : layers.56.attention.wq.weight : [1024, 8192] : torch.bfloat16
506 : 1048576 : layers.56.attention.wk.weight : [128, 8192] : torch.bfloat16
507 : 1048576 : layers.56.attention.wv.weight : [128, 8192] : torch.bfloat16
508 : 8388608 : layers.56.attention.wo.weight : [8192, 1024] : torch.bfloat16
509 : 29360128 : layers.56.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
510 : 29360128 : layers.56.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
511 : 29360128 : layers.56.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
512 : 8192 : layers.56.attention_norm.weight : [8192] : torch.bfloat16
513 : 8192 : layers.56.ffn_norm.weight : [8192] : torch.bfloat16
514 : 8388608 : layers.57.attention.wq.weight : [1024, 8192] : torch.bfloat16
515 : 1048576 : layers.57.attention.wk.weight : [128, 8192] : torch.bfloat16
516 : 1048576 : layers.57.attention.wv.weight : [128, 8192] : torch.bfloat16
517 : 8388608 : layers.57.attention.wo.weight : [8192, 1024] : torch.bfloat16
518 : 29360128 : layers.57.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
519 : 29360128 : layers.57.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
520 : 29360128 : layers.57.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
521 : 8192 : layers.57.attention_norm.weight : [8192] : torch.bfloat16
522 : 8192 : layers.57.ffn_norm.weight : [8192] : torch.bfloat16
523 : 8388608 : layers.58.attention.wq.weight : [1024, 8192] : torch.bfloat16
524 : 1048576 : layers.58.attention.wk.weight : [128, 8192] : torch.bfloat16
525 : 1048576 : layers.58.attention.wv.weight : [128, 8192] : torch.bfloat16
526 : 8388608 : layers.58.attention.wo.weight : [8192, 1024] : torch.bfloat16
527 : 29360128 : layers.58.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
528 : 29360128 : layers.58.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
529 : 29360128 : layers.58.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
530 : 8192 : layers.58.attention_norm.weight : [8192] : torch.bfloat16
531 : 8192 : layers.58.ffn_norm.weight : [8192] : torch.bfloat16
532 : 8388608 : layers.59.attention.wq.weight : [1024, 8192] : torch.bfloat16
533 : 1048576 : layers.59.attention.wk.weight : [128, 8192] : torch.bfloat16
534 : 1048576 : layers.59.attention.wv.weight : [128, 8192] : torch.bfloat16
535 : 8388608 : layers.59.attention.wo.weight : [8192, 1024] : torch.bfloat16
536 : 29360128 : layers.59.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
537 : 29360128 : layers.59.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
538 : 29360128 : layers.59.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
539 : 8192 : layers.59.attention_norm.weight : [8192] : torch.bfloat16
540 : 8192 : layers.59.ffn_norm.weight : [8192] : torch.bfloat16
541 : 8388608 : layers.60.attention.wq.weight : [1024, 8192] : torch.bfloat16
542 : 1048576 : layers.60.attention.wk.weight : [128, 8192] : torch.bfloat16
543 : 1048576 : layers.60.attention.wv.weight : [128, 8192] : torch.bfloat16
544 : 8388608 : layers.60.attention.wo.weight : [8192, 1024] : torch.bfloat16
545 : 29360128 : layers.60.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
546 : 29360128 : layers.60.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
547 : 29360128 : layers.60.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
548 : 8192 : layers.60.attention_norm.weight : [8192] : torch.bfloat16
549 : 8192 : layers.60.ffn_norm.weight : [8192] : torch.bfloat16
550 : 8388608 : layers.61.attention.wq.weight : [1024, 8192] : torch.bfloat16
551 : 1048576 : layers.61.attention.wk.weight : [128, 8192] : torch.bfloat16
552 : 1048576 : layers.61.attention.wv.weight : [128, 8192] : torch.bfloat16
553 : 8388608 : layers.61.attention.wo.weight : [8192, 1024] : torch.bfloat16
554 : 29360128 : layers.61.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
555 : 29360128 : layers.61.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
556 : 29360128 : layers.61.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
557 : 8192 : layers.61.attention_norm.weight : [8192] : torch.bfloat16
558 : 8192 : layers.61.ffn_norm.weight : [8192] : torch.bfloat16
559 : 8388608 : layers.62.attention.wq.weight : [1024, 8192] : torch.bfloat16
560 : 1048576 : layers.62.attention.wk.weight : [128, 8192] : torch.bfloat16
561 : 1048576 : layers.62.attention.wv.weight : [128, 8192] : torch.bfloat16
562 : 8388608 : layers.62.attention.wo.weight : [8192, 1024] : torch.bfloat16
563 : 29360128 : layers.62.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
564 : 29360128 : layers.62.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
565 : 29360128 : layers.62.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
566 : 8192 : layers.62.attention_norm.weight : [8192] : torch.bfloat16
567 : 8192 : layers.62.ffn_norm.weight : [8192] : torch.bfloat16
568 : 8388608 : layers.63.attention.wq.weight : [1024, 8192] : torch.bfloat16
569 : 1048576 : layers.63.attention.wk.weight : [128, 8192] : torch.bfloat16
570 : 1048576 : layers.63.attention.wv.weight : [128, 8192] : torch.bfloat16
571 : 8388608 : layers.63.attention.wo.weight : [8192, 1024] : torch.bfloat16
572 : 29360128 : layers.63.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
573 : 29360128 : layers.63.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
574 : 29360128 : layers.63.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
575 : 8192 : layers.63.attention_norm.weight : [8192] : torch.bfloat16
576 : 8192 : layers.63.ffn_norm.weight : [8192] : torch.bfloat16
577 : 8388608 : layers.64.attention.wq.weight : [1024, 8192] : torch.bfloat16
578 : 1048576 : layers.64.attention.wk.weight : [128, 8192] : torch.bfloat16
579 : 1048576 : layers.64.attention.wv.weight : [128, 8192] : torch.bfloat16
580 : 8388608 : layers.64.attention.wo.weight : [8192, 1024] : torch.bfloat16
581 : 29360128 : layers.64.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
582 : 29360128 : layers.64.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
583 : 29360128 : layers.64.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
584 : 8192 : layers.64.attention_norm.weight : [8192] : torch.bfloat16
585 : 8192 : layers.64.ffn_norm.weight : [8192] : torch.bfloat16
586 : 8388608 : layers.65.attention.wq.weight : [1024, 8192] : torch.bfloat16
587 : 1048576 : layers.65.attention.wk.weight : [128, 8192] : torch.bfloat16
588 : 1048576 : layers.65.attention.wv.weight : [128, 8192] : torch.bfloat16
589 : 8388608 : layers.65.attention.wo.weight : [8192, 1024] : torch.bfloat16
590 : 29360128 : layers.65.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
591 : 29360128 : layers.65.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
592 : 29360128 : layers.65.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
593 : 8192 : layers.65.attention_norm.weight : [8192] : torch.bfloat16
594 : 8192 : layers.65.ffn_norm.weight : [8192] : torch.bfloat16
595 : 8388608 : layers.66.attention.wq.weight : [1024, 8192] : torch.bfloat16
596 : 1048576 : layers.66.attention.wk.weight : [128, 8192] : torch.bfloat16
597 : 1048576 : layers.66.attention.wv.weight : [128, 8192] : torch.bfloat16
598 : 8388608 : layers.66.attention.wo.weight : [8192, 1024] : torch.bfloat16
599 : 29360128 : layers.66.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
600 : 29360128 : layers.66.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
601 : 29360128 : layers.66.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
602 : 8192 : layers.66.attention_norm.weight : [8192] : torch.bfloat16
603 : 8192 : layers.66.ffn_norm.weight : [8192] : torch.bfloat16
604 : 8388608 : layers.67.attention.wq.weight : [1024, 8192] : torch.bfloat16
605 : 1048576 : layers.67.attention.wk.weight : [128, 8192] : torch.bfloat16
606 : 1048576 : layers.67.attention.wv.weight : [128, 8192] : torch.bfloat16
607 : 8388608 : layers.67.attention.wo.weight : [8192, 1024] : torch.bfloat16
608 : 29360128 : layers.67.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
609 : 29360128 : layers.67.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
610 : 29360128 : layers.67.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
611 : 8192 : layers.67.attention_norm.weight : [8192] : torch.bfloat16
612 : 8192 : layers.67.ffn_norm.weight : [8192] : torch.bfloat16
613 : 8388608 : layers.68.attention.wq.weight : [1024, 8192] : torch.bfloat16
614 : 1048576 : layers.68.attention.wk.weight : [128, 8192] : torch.bfloat16
615 : 1048576 : layers.68.attention.wv.weight : [128, 8192] : torch.bfloat16
616 : 8388608 : layers.68.attention.wo.weight : [8192, 1024] : torch.bfloat16
617 : 29360128 : layers.68.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
618 : 29360128 : layers.68.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
619 : 29360128 : layers.68.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
620 : 8192 : layers.68.attention_norm.weight : [8192] : torch.bfloat16
621 : 8192 : layers.68.ffn_norm.weight : [8192] : torch.bfloat16
622 : 8388608 : layers.69.attention.wq.weight : [1024, 8192] : torch.bfloat16
623 : 1048576 : layers.69.attention.wk.weight : [128, 8192] : torch.bfloat16
624 : 1048576 : layers.69.attention.wv.weight : [128, 8192] : torch.bfloat16
625 : 8388608 : layers.69.attention.wo.weight : [8192, 1024] : torch.bfloat16
626 : 29360128 : layers.69.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
627 : 29360128 : layers.69.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
628 : 29360128 : layers.69.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
629 : 8192 : layers.69.attention_norm.weight : [8192] : torch.bfloat16
630 : 8192 : layers.69.ffn_norm.weight : [8192] : torch.bfloat16
631 : 8388608 : layers.70.attention.wq.weight : [1024, 8192] : torch.bfloat16
632 : 1048576 : layers.70.attention.wk.weight : [128, 8192] : torch.bfloat16
633 : 1048576 : layers.70.attention.wv.weight : [128, 8192] : torch.bfloat16
634 : 8388608 : layers.70.attention.wo.weight : [8192, 1024] : torch.bfloat16
635 : 29360128 : layers.70.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
636 : 29360128 : layers.70.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
637 : 29360128 : layers.70.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
638 : 8192 : layers.70.attention_norm.weight : [8192] : torch.bfloat16
639 : 8192 : layers.70.ffn_norm.weight : [8192] : torch.bfloat16
640 : 8388608 : layers.71.attention.wq.weight : [1024, 8192] : torch.bfloat16
641 : 1048576 : layers.71.attention.wk.weight : [128, 8192] : torch.bfloat16
642 : 1048576 : layers.71.attention.wv.weight : [128, 8192] : torch.bfloat16
643 : 8388608 : layers.71.attention.wo.weight : [8192, 1024] : torch.bfloat16
644 : 29360128 : layers.71.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
645 : 29360128 : layers.71.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
646 : 29360128 : layers.71.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
647 : 8192 : layers.71.attention_norm.weight : [8192] : torch.bfloat16
648 : 8192 : layers.71.ffn_norm.weight : [8192] : torch.bfloat16
649 : 8388608 : layers.72.attention.wq.weight : [1024, 8192] : torch.bfloat16
650 : 1048576 : layers.72.attention.wk.weight : [128, 8192] : torch.bfloat16
651 : 1048576 : layers.72.attention.wv.weight : [128, 8192] : torch.bfloat16
652 : 8388608 : layers.72.attention.wo.weight : [8192, 1024] : torch.bfloat16
653 : 29360128 : layers.72.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
654 : 29360128 : layers.72.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
655 : 29360128 : layers.72.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
656 : 8192 : layers.72.attention_norm.weight : [8192] : torch.bfloat16
657 : 8192 : layers.72.ffn_norm.weight : [8192] : torch.bfloat16
658 : 8388608 : layers.73.attention.wq.weight : [1024, 8192] : torch.bfloat16
659 : 1048576 : layers.73.attention.wk.weight : [128, 8192] : torch.bfloat16
660 : 1048576 : layers.73.attention.wv.weight : [128, 8192] : torch.bfloat16
661 : 8388608 : layers.73.attention.wo.weight : [8192, 1024] : torch.bfloat16
662 : 29360128 : layers.73.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
663 : 29360128 : layers.73.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
664 : 29360128 : layers.73.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
665 : 8192 : layers.73.attention_norm.weight : [8192] : torch.bfloat16
666 : 8192 : layers.73.ffn_norm.weight : [8192] : torch.bfloat16
667 : 8388608 : layers.74.attention.wq.weight : [1024, 8192] : torch.bfloat16
668 : 1048576 : layers.74.attention.wk.weight : [128, 8192] : torch.bfloat16
669 : 1048576 : layers.74.attention.wv.weight : [128, 8192] : torch.bfloat16
670 : 8388608 : layers.74.attention.wo.weight : [8192, 1024] : torch.bfloat16
671 : 29360128 : layers.74.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
672 : 29360128 : layers.74.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
673 : 29360128 : layers.74.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
674 : 8192 : layers.74.attention_norm.weight : [8192] : torch.bfloat16
675 : 8192 : layers.74.ffn_norm.weight : [8192] : torch.bfloat16
676 : 8388608 : layers.75.attention.wq.weight : [1024, 8192] : torch.bfloat16
677 : 1048576 : layers.75.attention.wk.weight : [128, 8192] : torch.bfloat16
678 : 1048576 : layers.75.attention.wv.weight : [128, 8192] : torch.bfloat16
679 : 8388608 : layers.75.attention.wo.weight : [8192, 1024] : torch.bfloat16
680 : 29360128 : layers.75.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
681 : 29360128 : layers.75.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
682 : 29360128 : layers.75.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
683 : 8192 : layers.75.attention_norm.weight : [8192] : torch.bfloat16
684 : 8192 : layers.75.ffn_norm.weight : [8192] : torch.bfloat16
685 : 8388608 : layers.76.attention.wq.weight : [1024, 8192] : torch.bfloat16
686 : 1048576 : layers.76.attention.wk.weight : [128, 8192] : torch.bfloat16
687 : 1048576 : layers.76.attention.wv.weight : [128, 8192] : torch.bfloat16
688 : 8388608 : layers.76.attention.wo.weight : [8192, 1024] : torch.bfloat16
689 : 29360128 : layers.76.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
690 : 29360128 : layers.76.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
691 : 29360128 : layers.76.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
692 : 8192 : layers.76.attention_norm.weight : [8192] : torch.bfloat16
693 : 8192 : layers.76.ffn_norm.weight : [8192] : torch.bfloat16
694 : 8388608 : layers.77.attention.wq.weight : [1024, 8192] : torch.bfloat16
695 : 1048576 : layers.77.attention.wk.weight : [128, 8192] : torch.bfloat16
696 : 1048576 : layers.77.attention.wv.weight : [128, 8192] : torch.bfloat16
697 : 8388608 : layers.77.attention.wo.weight : [8192, 1024] : torch.bfloat16
698 : 29360128 : layers.77.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
699 : 29360128 : layers.77.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
700 : 29360128 : layers.77.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
701 : 8192 : layers.77.attention_norm.weight : [8192] : torch.bfloat16
702 : 8192 : layers.77.ffn_norm.weight : [8192] : torch.bfloat16
703 : 8388608 : layers.78.attention.wq.weight : [1024, 8192] : torch.bfloat16
704 : 1048576 : layers.78.attention.wk.weight : [128, 8192] : torch.bfloat16
705 : 1048576 : layers.78.attention.wv.weight : [128, 8192] : torch.bfloat16
706 : 8388608 : layers.78.attention.wo.weight : [8192, 1024] : torch.bfloat16
707 : 29360128 : layers.78.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
708 : 29360128 : layers.78.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
709 : 29360128 : layers.78.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
710 : 8192 : layers.78.attention_norm.weight : [8192] : torch.bfloat16
711 : 8192 : layers.78.ffn_norm.weight : [8192] : torch.bfloat16
712 : 8388608 : layers.79.attention.wq.weight : [1024, 8192] : torch.bfloat16
713 : 1048576 : layers.79.attention.wk.weight : [128, 8192] : torch.bfloat16
714 : 1048576 : layers.79.attention.wv.weight : [128, 8192] : torch.bfloat16
715 : 8388608 : layers.79.attention.wo.weight : [8192, 1024] : torch.bfloat16
716 : 29360128 : layers.79.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
717 : 29360128 : layers.79.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
718 : 29360128 : layers.79.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
719 : 8192 : layers.79.attention_norm.weight : [8192] : torch.bfloat16
720 : 8192 : layers.79.ffn_norm.weight : [8192] : torch.bfloat16
721 : 8192 : norm.weight : [8192] : torch.bfloat16
722 : 131334144 : output.weight : [16032, 8192] : torch.bfloat16
-----------------------------------------------------------------------------
17640734720 params in total.
35281469440 bytes in total.
12.15 sec, 6.27 sec, 2768.56 MB/s
-----------------------------------------------------------------------------
[3/8]: Loading original/consolidated.02.pth
0 : 131334144 : tok_embeddings.weight : [16032, 8192] : torch.bfloat16
1 : 8388608 : layers.0.attention.wq.weight : [1024, 8192] : torch.bfloat16
2 : 1048576 : layers.0.attention.wk.weight : [128, 8192] : torch.bfloat16
3 : 1048576 : layers.0.attention.wv.weight : [128, 8192] : torch.bfloat16
4 : 8388608 : layers.0.attention.wo.weight : [8192, 1024] : torch.bfloat16
5 : 29360128 : layers.0.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
6 : 29360128 : layers.0.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
7 : 29360128 : layers.0.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
8 : 8192 : layers.0.attention_norm.weight : [8192] : torch.bfloat16
9 : 8192 : layers.0.ffn_norm.weight : [8192] : torch.bfloat16
10 : 8388608 : layers.1.attention.wq.weight : [1024, 8192] : torch.bfloat16
11 : 1048576 : layers.1.attention.wk.weight : [128, 8192] : torch.bfloat16
12 : 1048576 : layers.1.attention.wv.weight : [128, 8192] : torch.bfloat16
13 : 8388608 : layers.1.attention.wo.weight : [8192, 1024] : torch.bfloat16
14 : 29360128 : layers.1.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
15 : 29360128 : layers.1.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
16 : 29360128 : layers.1.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
17 : 8192 : layers.1.attention_norm.weight : [8192] : torch.bfloat16
18 : 8192 : layers.1.ffn_norm.weight : [8192] : torch.bfloat16
19 : 8388608 : layers.2.attention.wq.weight : [1024, 8192] : torch.bfloat16
20 : 1048576 : layers.2.attention.wk.weight : [128, 8192] : torch.bfloat16
21 : 1048576 : layers.2.attention.wv.weight : [128, 8192] : torch.bfloat16
22 : 8388608 : layers.2.attention.wo.weight : [8192, 1024] : torch.bfloat16
23 : 29360128 : layers.2.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
24 : 29360128 : layers.2.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
25 : 29360128 : layers.2.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
26 : 8192 : layers.2.attention_norm.weight : [8192] : torch.bfloat16
27 : 8192 : layers.2.ffn_norm.weight : [8192] : torch.bfloat16
28 : 8388608 : layers.3.attention.wq.weight : [1024, 8192] : torch.bfloat16
29 : 1048576 : layers.3.attention.wk.weight : [128, 8192] : torch.bfloat16
30 : 1048576 : layers.3.attention.wv.weight : [128, 8192] : torch.bfloat16
31 : 8388608 : layers.3.attention.wo.weight : [8192, 1024] : torch.bfloat16
32 : 29360128 : layers.3.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
33 : 29360128 : layers.3.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
34 : 29360128 : layers.3.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
35 : 8192 : layers.3.attention_norm.weight : [8192] : torch.bfloat16
36 : 8192 : layers.3.ffn_norm.weight : [8192] : torch.bfloat16
37 : 8388608 : layers.4.attention.wq.weight : [1024, 8192] : torch.bfloat16
38 : 1048576 : layers.4.attention.wk.weight : [128, 8192] : torch.bfloat16
39 : 1048576 : layers.4.attention.wv.weight : [128, 8192] : torch.bfloat16
40 : 8388608 : layers.4.attention.wo.weight : [8192, 1024] : torch.bfloat16
41 : 29360128 : layers.4.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
42 : 29360128 : layers.4.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
43 : 29360128 : layers.4.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
44 : 8192 : layers.4.attention_norm.weight : [8192] : torch.bfloat16
45 : 8192 : layers.4.ffn_norm.weight : [8192] : torch.bfloat16
46 : 8388608 : layers.5.attention.wq.weight : [1024, 8192] : torch.bfloat16
47 : 1048576 : layers.5.attention.wk.weight : [128, 8192] : torch.bfloat16
48 : 1048576 : layers.5.attention.wv.weight : [128, 8192] : torch.bfloat16
49 : 8388608 : layers.5.attention.wo.weight : [8192, 1024] : torch.bfloat16
50 : 29360128 : layers.5.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
51 : 29360128 : layers.5.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
52 : 29360128 : layers.5.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
53 : 8192 : layers.5.attention_norm.weight : [8192] : torch.bfloat16
54 : 8192 : layers.5.ffn_norm.weight : [8192] : torch.bfloat16
55 : 8388608 : layers.6.attention.wq.weight : [1024, 8192] : torch.bfloat16
56 : 1048576 : layers.6.attention.wk.weight : [128, 8192] : torch.bfloat16
57 : 1048576 : layers.6.attention.wv.weight : [128, 8192] : torch.bfloat16
58 : 8388608 : layers.6.attention.wo.weight : [8192, 1024] : torch.bfloat16
59 : 29360128 : layers.6.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
60 : 29360128 : layers.6.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
61 : 29360128 : layers.6.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
62 : 8192 : layers.6.attention_norm.weight : [8192] : torch.bfloat16
63 : 8192 : layers.6.ffn_norm.weight : [8192] : torch.bfloat16
64 : 8388608 : layers.7.attention.wq.weight : [1024, 8192] : torch.bfloat16
65 : 1048576 : layers.7.attention.wk.weight : [128, 8192] : torch.bfloat16
66 : 1048576 : layers.7.attention.wv.weight : [128, 8192] : torch.bfloat16
67 : 8388608 : layers.7.attention.wo.weight : [8192, 1024] : torch.bfloat16
68 : 29360128 : layers.7.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
69 : 29360128 : layers.7.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
70 : 29360128 : layers.7.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
71 : 8192 : layers.7.attention_norm.weight : [8192] : torch.bfloat16
72 : 8192 : layers.7.ffn_norm.weight : [8192] : torch.bfloat16
73 : 8388608 : layers.8.attention.wq.weight : [1024, 8192] : torch.bfloat16
74 : 1048576 : layers.8.attention.wk.weight : [128, 8192] : torch.bfloat16
75 : 1048576 : layers.8.attention.wv.weight : [128, 8192] : torch.bfloat16
76 : 8388608 : layers.8.attention.wo.weight : [8192, 1024] : torch.bfloat16
77 : 29360128 : layers.8.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
78 : 29360128 : layers.8.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
79 : 29360128 : layers.8.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
80 : 8192 : layers.8.attention_norm.weight : [8192] : torch.bfloat16
81 : 8192 : layers.8.ffn_norm.weight : [8192] : torch.bfloat16
82 : 8388608 : layers.9.attention.wq.weight : [1024, 8192] : torch.bfloat16
83 : 1048576 : layers.9.attention.wk.weight : [128, 8192] : torch.bfloat16
84 : 1048576 : layers.9.attention.wv.weight : [128, 8192] : torch.bfloat16
85 : 8388608 : layers.9.attention.wo.weight : [8192, 1024] : torch.bfloat16
86 : 29360128 : layers.9.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
87 : 29360128 : layers.9.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
88 : 29360128 : layers.9.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
89 : 8192 : layers.9.attention_norm.weight : [8192] : torch.bfloat16
90 : 8192 : layers.9.ffn_norm.weight : [8192] : torch.bfloat16
91 : 8388608 : layers.10.attention.wq.weight : [1024, 8192] : torch.bfloat16
92 : 1048576 : layers.10.attention.wk.weight : [128, 8192] : torch.bfloat16
93 : 1048576 : layers.10.attention.wv.weight : [128, 8192] : torch.bfloat16
94 : 8388608 : layers.10.attention.wo.weight : [8192, 1024] : torch.bfloat16
95 : 29360128 : layers.10.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
96 : 29360128 : layers.10.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
97 : 29360128 : layers.10.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
98 : 8192 : layers.10.attention_norm.weight : [8192] : torch.bfloat16
99 : 8192 : layers.10.ffn_norm.weight : [8192] : torch.bfloat16
100 : 8388608 : layers.11.attention.wq.weight : [1024, 8192] : torch.bfloat16
101 : 1048576 : layers.11.attention.wk.weight : [128, 8192] : torch.bfloat16
102 : 1048576 : layers.11.attention.wv.weight : [128, 8192] : torch.bfloat16
103 : 8388608 : layers.11.attention.wo.weight : [8192, 1024] : torch.bfloat16
104 : 29360128 : layers.11.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
105 : 29360128 : layers.11.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
106 : 29360128 : layers.11.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
107 : 8192 : layers.11.attention_norm.weight : [8192] : torch.bfloat16
108 : 8192 : layers.11.ffn_norm.weight : [8192] : torch.bfloat16
109 : 8388608 : layers.12.attention.wq.weight : [1024, 8192] : torch.bfloat16
110 : 1048576 : layers.12.attention.wk.weight : [128, 8192] : torch.bfloat16
111 : 1048576 : layers.12.attention.wv.weight : [128, 8192] : torch.bfloat16
112 : 8388608 : layers.12.attention.wo.weight : [8192, 1024] : torch.bfloat16
113 : 29360128 : layers.12.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
114 : 29360128 : layers.12.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
115 : 29360128 : layers.12.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
116 : 8192 : layers.12.attention_norm.weight : [8192] : torch.bfloat16
117 : 8192 : layers.12.ffn_norm.weight : [8192] : torch.bfloat16
118 : 8388608 : layers.13.attention.wq.weight : [1024, 8192] : torch.bfloat16
119 : 1048576 : layers.13.attention.wk.weight : [128, 8192] : torch.bfloat16
120 : 1048576 : layers.13.attention.wv.weight : [128, 8192] : torch.bfloat16
121 : 8388608 : layers.13.attention.wo.weight : [8192, 1024] : torch.bfloat16
122 : 29360128 : layers.13.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
123 : 29360128 : layers.13.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
124 : 29360128 : layers.13.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
125 : 8192 : layers.13.attention_norm.weight : [8192] : torch.bfloat16
126 : 8192 : layers.13.ffn_norm.weight : [8192] : torch.bfloat16
127 : 8388608 : layers.14.attention.wq.weight : [1024, 8192] : torch.bfloat16
128 : 1048576 : layers.14.attention.wk.weight : [128, 8192] : torch.bfloat16
129 : 1048576 : layers.14.attention.wv.weight : [128, 8192] : torch.bfloat16
130 : 8388608 : layers.14.attention.wo.weight : [8192, 1024] : torch.bfloat16
131 : 29360128 : layers.14.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
132 : 29360128 : layers.14.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
133 : 29360128 : layers.14.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
134 : 8192 : layers.14.attention_norm.weight : [8192] : torch.bfloat16
135 : 8192 : layers.14.ffn_norm.weight : [8192] : torch.bfloat16
136 : 8388608 : layers.15.attention.wq.weight : [1024, 8192] : torch.bfloat16
137 : 1048576 : layers.15.attention.wk.weight : [128, 8192] : torch.bfloat16
138 : 1048576 : layers.15.attention.wv.weight : [128, 8192] : torch.bfloat16
139 : 8388608 : layers.15.attention.wo.weight : [8192, 1024] : torch.bfloat16
140 : 29360128 : layers.15.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
141 : 29360128 : layers.15.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
142 : 29360128 : layers.15.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
143 : 8192 : layers.15.attention_norm.weight : [8192] : torch.bfloat16
144 : 8192 : layers.15.ffn_norm.weight : [8192] : torch.bfloat16
145 : 8388608 : layers.16.attention.wq.weight : [1024, 8192] : torch.bfloat16
146 : 1048576 : layers.16.attention.wk.weight : [128, 8192] : torch.bfloat16
147 : 1048576 : layers.16.attention.wv.weight : [128, 8192] : torch.bfloat16
148 : 8388608 : layers.16.attention.wo.weight : [8192, 1024] : torch.bfloat16
149 : 29360128 : layers.16.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
150 : 29360128 : layers.16.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
151 : 29360128 : layers.16.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
152 : 8192 : layers.16.attention_norm.weight : [8192] : torch.bfloat16
153 : 8192 : layers.16.ffn_norm.weight : [8192] : torch.bfloat16
154 : 8388608 : layers.17.attention.wq.weight : [1024, 8192] : torch.bfloat16
155 : 1048576 : layers.17.attention.wk.weight : [128, 8192] : torch.bfloat16
156 : 1048576 : layers.17.attention.wv.weight : [128, 8192] : torch.bfloat16
157 : 8388608 : layers.17.attention.wo.weight : [8192, 1024] : torch.bfloat16
158 : 29360128 : layers.17.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
159 : 29360128 : layers.17.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
160 : 29360128 : layers.17.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
161 : 8192 : layers.17.attention_norm.weight : [8192] : torch.bfloat16
162 : 8192 : layers.17.ffn_norm.weight : [8192] : torch.bfloat16
163 : 8388608 : layers.18.attention.wq.weight : [1024, 8192] : torch.bfloat16
164 : 1048576 : layers.18.attention.wk.weight : [128, 8192] : torch.bfloat16
165 : 1048576 : layers.18.attention.wv.weight : [128, 8192] : torch.bfloat16
166 : 8388608 : layers.18.attention.wo.weight : [8192, 1024] : torch.bfloat16
167 : 29360128 : layers.18.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
168 : 29360128 : layers.18.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
169 : 29360128 : layers.18.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
170 : 8192 : layers.18.attention_norm.weight : [8192] : torch.bfloat16
171 : 8192 : layers.18.ffn_norm.weight : [8192] : torch.bfloat16
172 : 8388608 : layers.19.attention.wq.weight : [1024, 8192] : torch.bfloat16
173 : 1048576 : layers.19.attention.wk.weight : [128, 8192] : torch.bfloat16
174 : 1048576 : layers.19.attention.wv.weight : [128, 8192] : torch.bfloat16
175 : 8388608 : layers.19.attention.wo.weight : [8192, 1024] : torch.bfloat16
176 : 29360128 : layers.19.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
177 : 29360128 : layers.19.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
178 : 29360128 : layers.19.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
179 : 8192 : layers.19.attention_norm.weight : [8192] : torch.bfloat16
180 : 8192 : layers.19.ffn_norm.weight : [8192] : torch.bfloat16
181 : 8388608 : layers.20.attention.wq.weight : [1024, 8192] : torch.bfloat16
182 : 1048576 : layers.20.attention.wk.weight : [128, 8192] : torch.bfloat16
183 : 1048576 : layers.20.attention.wv.weight : [128, 8192] : torch.bfloat16
184 : 8388608 : layers.20.attention.wo.weight : [8192, 1024] : torch.bfloat16
185 : 29360128 : layers.20.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
186 : 29360128 : layers.20.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
187 : 29360128 : layers.20.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
188 : 8192 : layers.20.attention_norm.weight : [8192] : torch.bfloat16
189 : 8192 : layers.20.ffn_norm.weight : [8192] : torch.bfloat16
190 : 8388608 : layers.21.attention.wq.weight : [1024, 8192] : torch.bfloat16
191 : 1048576 : layers.21.attention.wk.weight : [128, 8192] : torch.bfloat16
192 : 1048576 : layers.21.attention.wv.weight : [128, 8192] : torch.bfloat16
193 : 8388608 : layers.21.attention.wo.weight : [8192, 1024] : torch.bfloat16
194 : 29360128 : layers.21.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
195 : 29360128 : layers.21.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
196 : 29360128 : layers.21.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
197 : 8192 : layers.21.attention_norm.weight : [8192] : torch.bfloat16
198 : 8192 : layers.21.ffn_norm.weight : [8192] : torch.bfloat16
199 : 8388608 : layers.22.attention.wq.weight : [1024, 8192] : torch.bfloat16
200 : 1048576 : layers.22.attention.wk.weight : [128, 8192] : torch.bfloat16
201 : 1048576 : layers.22.attention.wv.weight : [128, 8192] : torch.bfloat16
202 : 8388608 : layers.22.attention.wo.weight : [8192, 1024] : torch.bfloat16
203 : 29360128 : layers.22.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
204 : 29360128 : layers.22.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
205 : 29360128 : layers.22.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
206 : 8192 : layers.22.attention_norm.weight : [8192] : torch.bfloat16
207 : 8192 : layers.22.ffn_norm.weight : [8192] : torch.bfloat16
208 : 8388608 : layers.23.attention.wq.weight : [1024, 8192] : torch.bfloat16
209 : 1048576 : layers.23.attention.wk.weight : [128, 8192] : torch.bfloat16
210 : 1048576 : layers.23.attention.wv.weight : [128, 8192] : torch.bfloat16
211 : 8388608 : layers.23.attention.wo.weight : [8192, 1024] : torch.bfloat16
212 : 29360128 : layers.23.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
213 : 29360128 : layers.23.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
214 : 29360128 : layers.23.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
215 : 8192 : layers.23.attention_norm.weight : [8192] : torch.bfloat16
216 : 8192 : layers.23.ffn_norm.weight : [8192] : torch.bfloat16
217 : 8388608 : layers.24.attention.wq.weight : [1024, 8192] : torch.bfloat16
218 : 1048576 : layers.24.attention.wk.weight : [128, 8192] : torch.bfloat16
219 : 1048576 : layers.24.attention.wv.weight : [128, 8192] : torch.bfloat16
220 : 8388608 : layers.24.attention.wo.weight : [8192, 1024] : torch.bfloat16
221 : 29360128 : layers.24.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
222 : 29360128 : layers.24.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
223 : 29360128 : layers.24.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
224 : 8192 : layers.24.attention_norm.weight : [8192] : torch.bfloat16
225 : 8192 : layers.24.ffn_norm.weight : [8192] : torch.bfloat16
226 : 8388608 : layers.25.attention.wq.weight : [1024, 8192] : torch.bfloat16
227 : 1048576 : layers.25.attention.wk.weight : [128, 8192] : torch.bfloat16
228 : 1048576 : layers.25.attention.wv.weight : [128, 8192] : torch.bfloat16
229 : 8388608 : layers.25.attention.wo.weight : [8192, 1024] : torch.bfloat16
230 : 29360128 : layers.25.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
231 : 29360128 : layers.25.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
232 : 29360128 : layers.25.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
233 : 8192 : layers.25.attention_norm.weight : [8192] : torch.bfloat16
234 : 8192 : layers.25.ffn_norm.weight : [8192] : torch.bfloat16
235 : 8388608 : layers.26.attention.wq.weight : [1024, 8192] : torch.bfloat16
236 : 1048576 : layers.26.attention.wk.weight : [128, 8192] : torch.bfloat16
237 : 1048576 : layers.26.attention.wv.weight : [128, 8192] : torch.bfloat16
238 : 8388608 : layers.26.attention.wo.weight : [8192, 1024] : torch.bfloat16
239 : 29360128 : layers.26.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
240 : 29360128 : layers.26.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
241 : 29360128 : layers.26.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
242 : 8192 : layers.26.attention_norm.weight : [8192] : torch.bfloat16
243 : 8192 : layers.26.ffn_norm.weight : [8192] : torch.bfloat16
244 : 8388608 : layers.27.attention.wq.weight : [1024, 8192] : torch.bfloat16
245 : 1048576 : layers.27.attention.wk.weight : [128, 8192] : torch.bfloat16
246 : 1048576 : layers.27.attention.wv.weight : [128, 8192] : torch.bfloat16
247 : 8388608 : layers.27.attention.wo.weight : [8192, 1024] : torch.bfloat16
248 : 29360128 : layers.27.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
249 : 29360128 : layers.27.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
250 : 29360128 : layers.27.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
251 : 8192 : layers.27.attention_norm.weight : [8192] : torch.bfloat16
252 : 8192 : layers.27.ffn_norm.weight : [8192] : torch.bfloat16
253 : 8388608 : layers.28.attention.wq.weight : [1024, 8192] : torch.bfloat16
254 : 1048576 : layers.28.attention.wk.weight : [128, 8192] : torch.bfloat16
255 : 1048576 : layers.28.attention.wv.weight : [128, 8192] : torch.bfloat16
256 : 8388608 : layers.28.attention.wo.weight : [8192, 1024] : torch.bfloat16
257 : 29360128 : layers.28.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
258 : 29360128 : layers.28.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
259 : 29360128 : layers.28.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
260 : 8192 : layers.28.attention_norm.weight : [8192] : torch.bfloat16
261 : 8192 : layers.28.ffn_norm.weight : [8192] : torch.bfloat16
262 : 8388608 : layers.29.attention.wq.weight : [1024, 8192] : torch.bfloat16
263 : 1048576 : layers.29.attention.wk.weight : [128, 8192] : torch.bfloat16
264 : 1048576 : layers.29.attention.wv.weight : [128, 8192] : torch.bfloat16
265 : 8388608 : layers.29.attention.wo.weight : [8192, 1024] : torch.bfloat16
266 : 29360128 : layers.29.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
267 : 29360128 : layers.29.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
268 : 29360128 : layers.29.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
269 : 8192 : layers.29.attention_norm.weight : [8192] : torch.bfloat16
270 : 8192 : layers.29.ffn_norm.weight : [8192] : torch.bfloat16
271 : 8388608 : layers.30.attention.wq.weight : [1024, 8192] : torch.bfloat16
272 : 1048576 : layers.30.attention.wk.weight : [128, 8192] : torch.bfloat16
273 : 1048576 : layers.30.attention.wv.weight : [128, 8192] : torch.bfloat16
274 : 8388608 : layers.30.attention.wo.weight : [8192, 1024] : torch.bfloat16
275 : 29360128 : layers.30.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
276 : 29360128 : layers.30.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
277 : 29360128 : layers.30.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
278 : 8192 : layers.30.attention_norm.weight : [8192] : torch.bfloat16
279 : 8192 : layers.30.ffn_norm.weight : [8192] : torch.bfloat16
280 : 8388608 : layers.31.attention.wq.weight : [1024, 8192] : torch.bfloat16
281 : 1048576 : layers.31.attention.wk.weight : [128, 8192] : torch.bfloat16
282 : 1048576 : layers.31.attention.wv.weight : [128, 8192] : torch.bfloat16
283 : 8388608 : layers.31.attention.wo.weight : [8192, 1024] : torch.bfloat16
284 : 29360128 : layers.31.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
285 : 29360128 : layers.31.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
286 : 29360128 : layers.31.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
287 : 8192 : layers.31.attention_norm.weight : [8192] : torch.bfloat16
288 : 8192 : layers.31.ffn_norm.weight : [8192] : torch.bfloat16
289 : 8388608 : layers.32.attention.wq.weight : [1024, 8192] : torch.bfloat16
290 : 1048576 : layers.32.attention.wk.weight : [128, 8192] : torch.bfloat16
291 : 1048576 : layers.32.attention.wv.weight : [128, 8192] : torch.bfloat16
292 : 8388608 : layers.32.attention.wo.weight : [8192, 1024] : torch.bfloat16
293 : 29360128 : layers.32.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
294 : 29360128 : layers.32.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
295 : 29360128 : layers.32.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
296 : 8192 : layers.32.attention_norm.weight : [8192] : torch.bfloat16
297 : 8192 : layers.32.ffn_norm.weight : [8192] : torch.bfloat16
298 : 8388608 : layers.33.attention.wq.weight : [1024, 8192] : torch.bfloat16
299 : 1048576 : layers.33.attention.wk.weight : [128, 8192] : torch.bfloat16
300 : 1048576 : layers.33.attention.wv.weight : [128, 8192] : torch.bfloat16
301 : 8388608 : layers.33.attention.wo.weight : [8192, 1024] : torch.bfloat16
302 : 29360128 : layers.33.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
303 : 29360128 : layers.33.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
304 : 29360128 : layers.33.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
305 : 8192 : layers.33.attention_norm.weight : [8192] : torch.bfloat16
306 : 8192 : layers.33.ffn_norm.weight : [8192] : torch.bfloat16
307 : 8388608 : layers.34.attention.wq.weight : [1024, 8192] : torch.bfloat16
308 : 1048576 : layers.34.attention.wk.weight : [128, 8192] : torch.bfloat16
309 : 1048576 : layers.34.attention.wv.weight : [128, 8192] : torch.bfloat16
310 : 8388608 : layers.34.attention.wo.weight : [8192, 1024] : torch.bfloat16
311 : 29360128 : layers.34.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
312 : 29360128 : layers.34.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
313 : 29360128 : layers.34.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
314 : 8192 : layers.34.attention_norm.weight : [8192] : torch.bfloat16
315 : 8192 : layers.34.ffn_norm.weight : [8192] : torch.bfloat16
316 : 8388608 : layers.35.attention.wq.weight : [1024, 8192] : torch.bfloat16
317 : 1048576 : layers.35.attention.wk.weight : [128, 8192] : torch.bfloat16
318 : 1048576 : layers.35.attention.wv.weight : [128, 8192] : torch.bfloat16
319 : 8388608 : layers.35.attention.wo.weight : [8192, 1024] : torch.bfloat16
320 : 29360128 : layers.35.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
321 : 29360128 : layers.35.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
322 : 29360128 : layers.35.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
323 : 8192 : layers.35.attention_norm.weight : [8192] : torch.bfloat16
324 : 8192 : layers.35.ffn_norm.weight : [8192] : torch.bfloat16
325 : 8388608 : layers.36.attention.wq.weight : [1024, 8192] : torch.bfloat16
326 : 1048576 : layers.36.attention.wk.weight : [128, 8192] : torch.bfloat16
327 : 1048576 : layers.36.attention.wv.weight : [128, 8192] : torch.bfloat16
328 : 8388608 : layers.36.attention.wo.weight : [8192, 1024] : torch.bfloat16
329 : 29360128 : layers.36.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
330 : 29360128 : layers.36.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
331 : 29360128 : layers.36.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
332 : 8192 : layers.36.attention_norm.weight : [8192] : torch.bfloat16
333 : 8192 : layers.36.ffn_norm.weight : [8192] : torch.bfloat16
334 : 8388608 : layers.37.attention.wq.weight : [1024, 8192] : torch.bfloat16
335 : 1048576 : layers.37.attention.wk.weight : [128, 8192] : torch.bfloat16
336 : 1048576 : layers.37.attention.wv.weight : [128, 8192] : torch.bfloat16
337 : 8388608 : layers.37.attention.wo.weight : [8192, 1024] : torch.bfloat16
338 : 29360128 : layers.37.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
339 : 29360128 : layers.37.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
340 : 29360128 : layers.37.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
341 : 8192 : layers.37.attention_norm.weight : [8192] : torch.bfloat16
342 : 8192 : layers.37.ffn_norm.weight : [8192] : torch.bfloat16
343 : 8388608 : layers.38.attention.wq.weight : [1024, 8192] : torch.bfloat16
344 : 1048576 : layers.38.attention.wk.weight : [128, 8192] : torch.bfloat16
345 : 1048576 : layers.38.attention.wv.weight : [128, 8192] : torch.bfloat16
346 : 8388608 : layers.38.attention.wo.weight : [8192, 1024] : torch.bfloat16
347 : 29360128 : layers.38.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
348 : 29360128 : layers.38.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
349 : 29360128 : layers.38.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
350 : 8192 : layers.38.attention_norm.weight : [8192] : torch.bfloat16
351 : 8192 : layers.38.ffn_norm.weight : [8192] : torch.bfloat16
352 : 8388608 : layers.39.attention.wq.weight : [1024, 8192] : torch.bfloat16
353 : 1048576 : layers.39.attention.wk.weight : [128, 8192] : torch.bfloat16
354 : 1048576 : layers.39.attention.wv.weight : [128, 8192] : torch.bfloat16
355 : 8388608 : layers.39.attention.wo.weight : [8192, 1024] : torch.bfloat16
356 : 29360128 : layers.39.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
357 : 29360128 : layers.39.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
358 : 29360128 : layers.39.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
359 : 8192 : layers.39.attention_norm.weight : [8192] : torch.bfloat16
360 : 8192 : layers.39.ffn_norm.weight : [8192] : torch.bfloat16
361 : 8388608 : layers.40.attention.wq.weight : [1024, 8192] : torch.bfloat16
362 : 1048576 : layers.40.attention.wk.weight : [128, 8192] : torch.bfloat16
363 : 1048576 : layers.40.attention.wv.weight : [128, 8192] : torch.bfloat16
364 : 8388608 : layers.40.attention.wo.weight : [8192, 1024] : torch.bfloat16
365 : 29360128 : layers.40.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
366 : 29360128 : layers.40.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
367 : 29360128 : layers.40.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
368 : 8192 : layers.40.attention_norm.weight : [8192] : torch.bfloat16
369 : 8192 : layers.40.ffn_norm.weight : [8192] : torch.bfloat16
370 : 8388608 : layers.41.attention.wq.weight : [1024, 8192] : torch.bfloat16
371 : 1048576 : layers.41.attention.wk.weight : [128, 8192] : torch.bfloat16
372 : 1048576 : layers.41.attention.wv.weight : [128, 8192] : torch.bfloat16
373 : 8388608 : layers.41.attention.wo.weight : [8192, 1024] : torch.bfloat16
374 : 29360128 : layers.41.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
375 : 29360128 : layers.41.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
376 : 29360128 : layers.41.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
377 : 8192 : layers.41.attention_norm.weight : [8192] : torch.bfloat16
378 : 8192 : layers.41.ffn_norm.weight : [8192] : torch.bfloat16
379 : 8388608 : layers.42.attention.wq.weight : [1024, 8192] : torch.bfloat16
380 : 1048576 : layers.42.attention.wk.weight : [128, 8192] : torch.bfloat16
381 : 1048576 : layers.42.attention.wv.weight : [128, 8192] : torch.bfloat16
382 : 8388608 : layers.42.attention.wo.weight : [8192, 1024] : torch.bfloat16
383 : 29360128 : layers.42.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
384 : 29360128 : layers.42.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
385 : 29360128 : layers.42.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
386 : 8192 : layers.42.attention_norm.weight : [8192] : torch.bfloat16
387 : 8192 : layers.42.ffn_norm.weight : [8192] : torch.bfloat16
388 : 8388608 : layers.43.attention.wq.weight : [1024, 8192] : torch.bfloat16
389 : 1048576 : layers.43.attention.wk.weight : [128, 8192] : torch.bfloat16
390 : 1048576 : layers.43.attention.wv.weight : [128, 8192] : torch.bfloat16
391 : 8388608 : layers.43.attention.wo.weight : [8192, 1024] : torch.bfloat16
392 : 29360128 : layers.43.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
393 : 29360128 : layers.43.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
394 : 29360128 : layers.43.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
395 : 8192 : layers.43.attention_norm.weight : [8192] : torch.bfloat16
396 : 8192 : layers.43.ffn_norm.weight : [8192] : torch.bfloat16
397 : 8388608 : layers.44.attention.wq.weight : [1024, 8192] : torch.bfloat16
398 : 1048576 : layers.44.attention.wk.weight : [128, 8192] : torch.bfloat16
399 : 1048576 : layers.44.attention.wv.weight : [128, 8192] : torch.bfloat16
400 : 8388608 : layers.44.attention.wo.weight : [8192, 1024] : torch.bfloat16
401 : 29360128 : layers.44.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
402 : 29360128 : layers.44.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
403 : 29360128 : layers.44.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
404 : 8192 : layers.44.attention_norm.weight : [8192] : torch.bfloat16
405 : 8192 : layers.44.ffn_norm.weight : [8192] : torch.bfloat16
406 : 8388608 : layers.45.attention.wq.weight : [1024, 8192] : torch.bfloat16
407 : 1048576 : layers.45.attention.wk.weight : [128, 8192] : torch.bfloat16
408 : 1048576 : layers.45.attention.wv.weight : [128, 8192] : torch.bfloat16
409 : 8388608 : layers.45.attention.wo.weight : [8192, 1024] : torch.bfloat16
410 : 29360128 : layers.45.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
411 : 29360128 : layers.45.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
412 : 29360128 : layers.45.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
413 : 8192 : layers.45.attention_norm.weight : [8192] : torch.bfloat16
414 : 8192 : layers.45.ffn_norm.weight : [8192] : torch.bfloat16
415 : 8388608 : layers.46.attention.wq.weight : [1024, 8192] : torch.bfloat16
416 : 1048576 : layers.46.attention.wk.weight : [128, 8192] : torch.bfloat16
417 : 1048576 : layers.46.attention.wv.weight : [128, 8192] : torch.bfloat16
418 : 8388608 : layers.46.attention.wo.weight : [8192, 1024] : torch.bfloat16
419 : 29360128 : layers.46.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
420 : 29360128 : layers.46.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
421 : 29360128 : layers.46.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
422 : 8192 : layers.46.attention_norm.weight : [8192] : torch.bfloat16
423 : 8192 : layers.46.ffn_norm.weight : [8192] : torch.bfloat16
424 : 8388608 : layers.47.attention.wq.weight : [1024, 8192] : torch.bfloat16
425 : 1048576 : layers.47.attention.wk.weight : [128, 8192] : torch.bfloat16
426 : 1048576 : layers.47.attention.wv.weight : [128, 8192] : torch.bfloat16
427 : 8388608 : layers.47.attention.wo.weight : [8192, 1024] : torch.bfloat16
428 : 29360128 : layers.47.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
429 : 29360128 : layers.47.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
430 : 29360128 : layers.47.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
431 : 8192 : layers.47.attention_norm.weight : [8192] : torch.bfloat16
432 : 8192 : layers.47.ffn_norm.weight : [8192] : torch.bfloat16
433 : 8388608 : layers.48.attention.wq.weight : [1024, 8192] : torch.bfloat16
434 : 1048576 : layers.48.attention.wk.weight : [128, 8192] : torch.bfloat16
435 : 1048576 : layers.48.attention.wv.weight : [128, 8192] : torch.bfloat16
436 : 8388608 : layers.48.attention.wo.weight : [8192, 1024] : torch.bfloat16
437 : 29360128 : layers.48.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
438 : 29360128 : layers.48.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
439 : 29360128 : layers.48.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
440 : 8192 : layers.48.attention_norm.weight : [8192] : torch.bfloat16
441 : 8192 : layers.48.ffn_norm.weight : [8192] : torch.bfloat16
442 : 8388608 : layers.49.attention.wq.weight : [1024, 8192] : torch.bfloat16
443 : 1048576 : layers.49.attention.wk.weight : [128, 8192] : torch.bfloat16
444 : 1048576 : layers.49.attention.wv.weight : [128, 8192] : torch.bfloat16
445 : 8388608 : layers.49.attention.wo.weight : [8192, 1024] : torch.bfloat16
446 : 29360128 : layers.49.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
447 : 29360128 : layers.49.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
448 : 29360128 : layers.49.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
449 : 8192 : layers.49.attention_norm.weight : [8192] : torch.bfloat16
450 : 8192 : layers.49.ffn_norm.weight : [8192] : torch.bfloat16
451 : 8388608 : layers.50.attention.wq.weight : [1024, 8192] : torch.bfloat16
452 : 1048576 : layers.50.attention.wk.weight : [128, 8192] : torch.bfloat16
453 : 1048576 : layers.50.attention.wv.weight : [128, 8192] : torch.bfloat16
454 : 8388608 : layers.50.attention.wo.weight : [8192, 1024] : torch.bfloat16
455 : 29360128 : layers.50.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
456 : 29360128 : layers.50.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
457 : 29360128 : layers.50.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
458 : 8192 : layers.50.attention_norm.weight : [8192] : torch.bfloat16
459 : 8192 : layers.50.ffn_norm.weight : [8192] : torch.bfloat16
460 : 8388608 : layers.51.attention.wq.weight : [1024, 8192] : torch.bfloat16
461 : 1048576 : layers.51.attention.wk.weight : [128, 8192] : torch.bfloat16
462 : 1048576 : layers.51.attention.wv.weight : [128, 8192] : torch.bfloat16
463 : 8388608 : layers.51.attention.wo.weight : [8192, 1024] : torch.bfloat16
464 : 29360128 : layers.51.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
465 : 29360128 : layers.51.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
466 : 29360128 : layers.51.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
467 : 8192 : layers.51.attention_norm.weight : [8192] : torch.bfloat16
468 : 8192 : layers.51.ffn_norm.weight : [8192] : torch.bfloat16
469 : 8388608 : layers.52.attention.wq.weight : [1024, 8192] : torch.bfloat16
470 : 1048576 : layers.52.attention.wk.weight : [128, 8192] : torch.bfloat16
471 : 1048576 : layers.52.attention.wv.weight : [128, 8192] : torch.bfloat16
472 : 8388608 : layers.52.attention.wo.weight : [8192, 1024] : torch.bfloat16
473 : 29360128 : layers.52.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
474 : 29360128 : layers.52.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
475 : 29360128 : layers.52.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
476 : 8192 : layers.52.attention_norm.weight : [8192] : torch.bfloat16
477 : 8192 : layers.52.ffn_norm.weight : [8192] : torch.bfloat16
478 : 8388608 : layers.53.attention.wq.weight : [1024, 8192] : torch.bfloat16
479 : 1048576 : layers.53.attention.wk.weight : [128, 8192] : torch.bfloat16
480 : 1048576 : layers.53.attention.wv.weight : [128, 8192] : torch.bfloat16
481 : 8388608 : layers.53.attention.wo.weight : [8192, 1024] : torch.bfloat16
482 : 29360128 : layers.53.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
483 : 29360128 : layers.53.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
484 : 29360128 : layers.53.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
485 : 8192 : layers.53.attention_norm.weight : [8192] : torch.bfloat16
486 : 8192 : layers.53.ffn_norm.weight : [8192] : torch.bfloat16
487 : 8388608 : layers.54.attention.wq.weight : [1024, 8192] : torch.bfloat16
488 : 1048576 : layers.54.attention.wk.weight : [128, 8192] : torch.bfloat16
489 : 1048576 : layers.54.attention.wv.weight : [128, 8192] : torch.bfloat16
490 : 8388608 : layers.54.attention.wo.weight : [8192, 1024] : torch.bfloat16
491 : 29360128 : layers.54.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
492 : 29360128 : layers.54.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
493 : 29360128 : layers.54.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
494 : 8192 : layers.54.attention_norm.weight : [8192] : torch.bfloat16
495 : 8192 : layers.54.ffn_norm.weight : [8192] : torch.bfloat16
496 : 8388608 : layers.55.attention.wq.weight : [1024, 8192] : torch.bfloat16
497 : 1048576 : layers.55.attention.wk.weight : [128, 8192] : torch.bfloat16
498 : 1048576 : layers.55.attention.wv.weight : [128, 8192] : torch.bfloat16
499 : 8388608 : layers.55.attention.wo.weight : [8192, 1024] : torch.bfloat16
500 : 29360128 : layers.55.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
501 : 29360128 : layers.55.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
502 : 29360128 : layers.55.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
503 : 8192 : layers.55.attention_norm.weight : [8192] : torch.bfloat16
504 : 8192 : layers.55.ffn_norm.weight : [8192] : torch.bfloat16
505 : 8388608 : layers.56.attention.wq.weight : [1024, 8192] : torch.bfloat16
506 : 1048576 : layers.56.attention.wk.weight : [128, 8192] : torch.bfloat16
507 : 1048576 : layers.56.attention.wv.weight : [128, 8192] : torch.bfloat16
508 : 8388608 : layers.56.attention.wo.weight : [8192, 1024] : torch.bfloat16
509 : 29360128 : layers.56.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
510 : 29360128 : layers.56.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
511 : 29360128 : layers.56.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
512 : 8192 : layers.56.attention_norm.weight : [8192] : torch.bfloat16
513 : 8192 : layers.56.ffn_norm.weight : [8192] : torch.bfloat16
514 : 8388608 : layers.57.attention.wq.weight : [1024, 8192] : torch.bfloat16
515 : 1048576 : layers.57.attention.wk.weight : [128, 8192] : torch.bfloat16
516 : 1048576 : layers.57.attention.wv.weight : [128, 8192] : torch.bfloat16
517 : 8388608 : layers.57.attention.wo.weight : [8192, 1024] : torch.bfloat16
518 : 29360128 : layers.57.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
519 : 29360128 : layers.57.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
520 : 29360128 : layers.57.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
521 : 8192 : layers.57.attention_norm.weight : [8192] : torch.bfloat16
522 : 8192 : layers.57.ffn_norm.weight : [8192] : torch.bfloat16
523 : 8388608 : layers.58.attention.wq.weight : [1024, 8192] : torch.bfloat16
524 : 1048576 : layers.58.attention.wk.weight : [128, 8192] : torch.bfloat16
525 : 1048576 : layers.58.attention.wv.weight : [128, 8192] : torch.bfloat16
526 : 8388608 : layers.58.attention.wo.weight : [8192, 1024] : torch.bfloat16
527 : 29360128 : layers.58.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
528 : 29360128 : layers.58.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
529 : 29360128 : layers.58.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
530 : 8192 : layers.58.attention_norm.weight : [8192] : torch.bfloat16
531 : 8192 : layers.58.ffn_norm.weight : [8192] : torch.bfloat16
532 : 8388608 : layers.59.attention.wq.weight : [1024, 8192] : torch.bfloat16
533 : 1048576 : layers.59.attention.wk.weight : [128, 8192] : torch.bfloat16
534 : 1048576 : layers.59.attention.wv.weight : [128, 8192] : torch.bfloat16
535 : 8388608 : layers.59.attention.wo.weight : [8192, 1024] : torch.bfloat16
536 : 29360128 : layers.59.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
537 : 29360128 : layers.59.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
538 : 29360128 : layers.59.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
539 : 8192 : layers.59.attention_norm.weight : [8192] : torch.bfloat16
540 : 8192 : layers.59.ffn_norm.weight : [8192] : torch.bfloat16
541 : 8388608 : layers.60.attention.wq.weight : [1024, 8192] : torch.bfloat16
542 : 1048576 : layers.60.attention.wk.weight : [128, 8192] : torch.bfloat16
543 : 1048576 : layers.60.attention.wv.weight : [128, 8192] : torch.bfloat16
544 : 8388608 : layers.60.attention.wo.weight : [8192, 1024] : torch.bfloat16
545 : 29360128 : layers.60.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
546 : 29360128 : layers.60.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
547 : 29360128 : layers.60.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
548 : 8192 : layers.60.attention_norm.weight : [8192] : torch.bfloat16
549 : 8192 : layers.60.ffn_norm.weight : [8192] : torch.bfloat16
550 : 8388608 : layers.61.attention.wq.weight : [1024, 8192] : torch.bfloat16
551 : 1048576 : layers.61.attention.wk.weight : [128, 8192] : torch.bfloat16
552 : 1048576 : layers.61.attention.wv.weight : [128, 8192] : torch.bfloat16
553 : 8388608 : layers.61.attention.wo.weight : [8192, 1024] : torch.bfloat16
554 : 29360128 : layers.61.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
555 : 29360128 : layers.61.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
556 : 29360128 : layers.61.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
557 : 8192 : layers.61.attention_norm.weight : [8192] : torch.bfloat16
558 : 8192 : layers.61.ffn_norm.weight : [8192] : torch.bfloat16
559 : 8388608 : layers.62.attention.wq.weight : [1024, 8192] : torch.bfloat16
560 : 1048576 : layers.62.attention.wk.weight : [128, 8192] : torch.bfloat16
561 : 1048576 : layers.62.attention.wv.weight : [128, 8192] : torch.bfloat16
562 : 8388608 : layers.62.attention.wo.weight : [8192, 1024] : torch.bfloat16
563 : 29360128 : layers.62.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
564 : 29360128 : layers.62.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
565 : 29360128 : layers.62.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
566 : 8192 : layers.62.attention_norm.weight : [8192] : torch.bfloat16
567 : 8192 : layers.62.ffn_norm.weight : [8192] : torch.bfloat16
568 : 8388608 : layers.63.attention.wq.weight : [1024, 8192] : torch.bfloat16
569 : 1048576 : layers.63.attention.wk.weight : [128, 8192] : torch.bfloat16
570 : 1048576 : layers.63.attention.wv.weight : [128, 8192] : torch.bfloat16
571 : 8388608 : layers.63.attention.wo.weight : [8192, 1024] : torch.bfloat16
572 : 29360128 : layers.63.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
573 : 29360128 : layers.63.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
574 : 29360128 : layers.63.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
575 : 8192 : layers.63.attention_norm.weight : [8192] : torch.bfloat16
576 : 8192 : layers.63.ffn_norm.weight : [8192] : torch.bfloat16
577 : 8388608 : layers.64.attention.wq.weight : [1024, 8192] : torch.bfloat16
578 : 1048576 : layers.64.attention.wk.weight : [128, 8192] : torch.bfloat16
579 : 1048576 : layers.64.attention.wv.weight : [128, 8192] : torch.bfloat16
580 : 8388608 : layers.64.attention.wo.weight : [8192, 1024] : torch.bfloat16
581 : 29360128 : layers.64.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
582 : 29360128 : layers.64.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
583 : 29360128 : layers.64.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
584 : 8192 : layers.64.attention_norm.weight : [8192] : torch.bfloat16
585 : 8192 : layers.64.ffn_norm.weight : [8192] : torch.bfloat16
586 : 8388608 : layers.65.attention.wq.weight : [1024, 8192] : torch.bfloat16
587 : 1048576 : layers.65.attention.wk.weight : [128, 8192] : torch.bfloat16
588 : 1048576 : layers.65.attention.wv.weight : [128, 8192] : torch.bfloat16
589 : 8388608 : layers.65.attention.wo.weight : [8192, 1024] : torch.bfloat16
590 : 29360128 : layers.65.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
591 : 29360128 : layers.65.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
592 : 29360128 : layers.65.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
593 : 8192 : layers.65.attention_norm.weight : [8192] : torch.bfloat16
594 : 8192 : layers.65.ffn_norm.weight : [8192] : torch.bfloat16
595 : 8388608 : layers.66.attention.wq.weight : [1024, 8192] : torch.bfloat16
596 : 1048576 : layers.66.attention.wk.weight : [128, 8192] : torch.bfloat16
597 : 1048576 : layers.66.attention.wv.weight : [128, 8192] : torch.bfloat16
598 : 8388608 : layers.66.attention.wo.weight : [8192, 1024] : torch.bfloat16
599 : 29360128 : layers.66.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
600 : 29360128 : layers.66.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
601 : 29360128 : layers.66.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
602 : 8192 : layers.66.attention_norm.weight : [8192] : torch.bfloat16
603 : 8192 : layers.66.ffn_norm.weight : [8192] : torch.bfloat16
604 : 8388608 : layers.67.attention.wq.weight : [1024, 8192] : torch.bfloat16
605 : 1048576 : layers.67.attention.wk.weight : [128, 8192] : torch.bfloat16
606 : 1048576 : layers.67.attention.wv.weight : [128, 8192] : torch.bfloat16
607 : 8388608 : layers.67.attention.wo.weight : [8192, 1024] : torch.bfloat16
608 : 29360128 : layers.67.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
609 : 29360128 : layers.67.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
610 : 29360128 : layers.67.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
611 : 8192 : layers.67.attention_norm.weight : [8192] : torch.bfloat16
612 : 8192 : layers.67.ffn_norm.weight : [8192] : torch.bfloat16
613 : 8388608 : layers.68.attention.wq.weight : [1024, 8192] : torch.bfloat16
614 : 1048576 : layers.68.attention.wk.weight : [128, 8192] : torch.bfloat16
615 : 1048576 : layers.68.attention.wv.weight : [128, 8192] : torch.bfloat16
616 : 8388608 : layers.68.attention.wo.weight : [8192, 1024] : torch.bfloat16
617 : 29360128 : layers.68.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
618 : 29360128 : layers.68.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
619 : 29360128 : layers.68.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
620 : 8192 : layers.68.attention_norm.weight : [8192] : torch.bfloat16
621 : 8192 : layers.68.ffn_norm.weight : [8192] : torch.bfloat16
622 : 8388608 : layers.69.attention.wq.weight : [1024, 8192] : torch.bfloat16
623 : 1048576 : layers.69.attention.wk.weight : [128, 8192] : torch.bfloat16
624 : 1048576 : layers.69.attention.wv.weight : [128, 8192] : torch.bfloat16
625 : 8388608 : layers.69.attention.wo.weight : [8192, 1024] : torch.bfloat16
626 : 29360128 : layers.69.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
627 : 29360128 : layers.69.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
628 : 29360128 : layers.69.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
629 : 8192 : layers.69.attention_norm.weight : [8192] : torch.bfloat16
630 : 8192 : layers.69.ffn_norm.weight : [8192] : torch.bfloat16
631 : 8388608 : layers.70.attention.wq.weight : [1024, 8192] : torch.bfloat16
632 : 1048576 : layers.70.attention.wk.weight : [128, 8192] : torch.bfloat16
633 : 1048576 : layers.70.attention.wv.weight : [128, 8192] : torch.bfloat16
634 : 8388608 : layers.70.attention.wo.weight : [8192, 1024] : torch.bfloat16
635 : 29360128 : layers.70.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
636 : 29360128 : layers.70.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
637 : 29360128 : layers.70.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
638 : 8192 : layers.70.attention_norm.weight : [8192] : torch.bfloat16
639 : 8192 : layers.70.ffn_norm.weight : [8192] : torch.bfloat16
640 : 8388608 : layers.71.attention.wq.weight : [1024, 8192] : torch.bfloat16
641 : 1048576 : layers.71.attention.wk.weight : [128, 8192] : torch.bfloat16
642 : 1048576 : layers.71.attention.wv.weight : [128, 8192] : torch.bfloat16
643 : 8388608 : layers.71.attention.wo.weight : [8192, 1024] : torch.bfloat16
644 : 29360128 : layers.71.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
645 : 29360128 : layers.71.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
646 : 29360128 : layers.71.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
647 : 8192 : layers.71.attention_norm.weight : [8192] : torch.bfloat16
648 : 8192 : layers.71.ffn_norm.weight : [8192] : torch.bfloat16
649 : 8388608 : layers.72.attention.wq.weight : [1024, 8192] : torch.bfloat16
650 : 1048576 : layers.72.attention.wk.weight : [128, 8192] : torch.bfloat16
651 : 1048576 : layers.72.attention.wv.weight : [128, 8192] : torch.bfloat16
652 : 8388608 : layers.72.attention.wo.weight : [8192, 1024] : torch.bfloat16
653 : 29360128 : layers.72.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
654 : 29360128 : layers.72.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
655 : 29360128 : layers.72.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
656 : 8192 : layers.72.attention_norm.weight : [8192] : torch.bfloat16
657 : 8192 : layers.72.ffn_norm.weight : [8192] : torch.bfloat16
658 : 8388608 : layers.73.attention.wq.weight : [1024, 8192] : torch.bfloat16
659 : 1048576 : layers.73.attention.wk.weight : [128, 8192] : torch.bfloat16
660 : 1048576 : layers.73.attention.wv.weight : [128, 8192] : torch.bfloat16
661 : 8388608 : layers.73.attention.wo.weight : [8192, 1024] : torch.bfloat16
662 : 29360128 : layers.73.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
663 : 29360128 : layers.73.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
664 : 29360128 : layers.73.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
665 : 8192 : layers.73.attention_norm.weight : [8192] : torch.bfloat16
666 : 8192 : layers.73.ffn_norm.weight : [8192] : torch.bfloat16
667 : 8388608 : layers.74.attention.wq.weight : [1024, 8192] : torch.bfloat16
668 : 1048576 : layers.74.attention.wk.weight : [128, 8192] : torch.bfloat16
669 : 1048576 : layers.74.attention.wv.weight : [128, 8192] : torch.bfloat16
670 : 8388608 : layers.74.attention.wo.weight : [8192, 1024] : torch.bfloat16
671 : 29360128 : layers.74.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
672 : 29360128 : layers.74.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
673 : 29360128 : layers.74.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
674 : 8192 : layers.74.attention_norm.weight : [8192] : torch.bfloat16
675 : 8192 : layers.74.ffn_norm.weight : [8192] : torch.bfloat16
676 : 8388608 : layers.75.attention.wq.weight : [1024, 8192] : torch.bfloat16
677 : 1048576 : layers.75.attention.wk.weight : [128, 8192] : torch.bfloat16
678 : 1048576 : layers.75.attention.wv.weight : [128, 8192] : torch.bfloat16
679 : 8388608 : layers.75.attention.wo.weight : [8192, 1024] : torch.bfloat16
680 : 29360128 : layers.75.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
681 : 29360128 : layers.75.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
682 : 29360128 : layers.75.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
683 : 8192 : layers.75.attention_norm.weight : [8192] : torch.bfloat16
684 : 8192 : layers.75.ffn_norm.weight : [8192] : torch.bfloat16
685 : 8388608 : layers.76.attention.wq.weight : [1024, 8192] : torch.bfloat16
686 : 1048576 : layers.76.attention.wk.weight : [128, 8192] : torch.bfloat16
687 : 1048576 : layers.76.attention.wv.weight : [128, 8192] : torch.bfloat16
688 : 8388608 : layers.76.attention.wo.weight : [8192, 1024] : torch.bfloat16
689 : 29360128 : layers.76.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
690 : 29360128 : layers.76.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
691 : 29360128 : layers.76.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
692 : 8192 : layers.76.attention_norm.weight : [8192] : torch.bfloat16
693 : 8192 : layers.76.ffn_norm.weight : [8192] : torch.bfloat16
694 : 8388608 : layers.77.attention.wq.weight : [1024, 8192] : torch.bfloat16
695 : 1048576 : layers.77.attention.wk.weight : [128, 8192] : torch.bfloat16
696 : 1048576 : layers.77.attention.wv.weight : [128, 8192] : torch.bfloat16
697 : 8388608 : layers.77.attention.wo.weight : [8192, 1024] : torch.bfloat16
698 : 29360128 : layers.77.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
699 : 29360128 : layers.77.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
700 : 29360128 : layers.77.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
701 : 8192 : layers.77.attention_norm.weight : [8192] : torch.bfloat16
702 : 8192 : layers.77.ffn_norm.weight : [8192] : torch.bfloat16
703 : 8388608 : layers.78.attention.wq.weight : [1024, 8192] : torch.bfloat16
704 : 1048576 : layers.78.attention.wk.weight : [128, 8192] : torch.bfloat16
705 : 1048576 : layers.78.attention.wv.weight : [128, 8192] : torch.bfloat16
706 : 8388608 : layers.78.attention.wo.weight : [8192, 1024] : torch.bfloat16
707 : 29360128 : layers.78.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
708 : 29360128 : layers.78.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
709 : 29360128 : layers.78.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
710 : 8192 : layers.78.attention_norm.weight : [8192] : torch.bfloat16
711 : 8192 : layers.78.ffn_norm.weight : [8192] : torch.bfloat16
712 : 8388608 : layers.79.attention.wq.weight : [1024, 8192] : torch.bfloat16
713 : 1048576 : layers.79.attention.wk.weight : [128, 8192] : torch.bfloat16
714 : 1048576 : layers.79.attention.wv.weight : [128, 8192] : torch.bfloat16
715 : 8388608 : layers.79.attention.wo.weight : [8192, 1024] : torch.bfloat16
716 : 29360128 : layers.79.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
717 : 29360128 : layers.79.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
718 : 29360128 : layers.79.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
719 : 8192 : layers.79.attention_norm.weight : [8192] : torch.bfloat16
720 : 8192 : layers.79.ffn_norm.weight : [8192] : torch.bfloat16
721 : 8192 : norm.weight : [8192] : torch.bfloat16
722 : 131334144 : output.weight : [16032, 8192] : torch.bfloat16
-----------------------------------------------------------------------------
26461102080 params in total.
52922204160 bytes in total.
18.55 sec, 6.39 sec, 2721.17 MB/s
-----------------------------------------------------------------------------
[4/8]: Loading original/consolidated.03.pth
0 : 131334144 : tok_embeddings.weight : [16032, 8192] : torch.bfloat16
1 : 8388608 : layers.0.attention.wq.weight : [1024, 8192] : torch.bfloat16
2 : 1048576 : layers.0.attention.wk.weight : [128, 8192] : torch.bfloat16
3 : 1048576 : layers.0.attention.wv.weight : [128, 8192] : torch.bfloat16
4 : 8388608 : layers.0.attention.wo.weight : [8192, 1024] : torch.bfloat16
5 : 29360128 : layers.0.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
6 : 29360128 : layers.0.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
7 : 29360128 : layers.0.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
8 : 8192 : layers.0.attention_norm.weight : [8192] : torch.bfloat16
9 : 8192 : layers.0.ffn_norm.weight : [8192] : torch.bfloat16
10 : 8388608 : layers.1.attention.wq.weight : [1024, 8192] : torch.bfloat16
11 : 1048576 : layers.1.attention.wk.weight : [128, 8192] : torch.bfloat16
12 : 1048576 : layers.1.attention.wv.weight : [128, 8192] : torch.bfloat16
13 : 8388608 : layers.1.attention.wo.weight : [8192, 1024] : torch.bfloat16
14 : 29360128 : layers.1.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
15 : 29360128 : layers.1.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
16 : 29360128 : layers.1.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
17 : 8192 : layers.1.attention_norm.weight : [8192] : torch.bfloat16
18 : 8192 : layers.1.ffn_norm.weight : [8192] : torch.bfloat16
19 : 8388608 : layers.2.attention.wq.weight : [1024, 8192] : torch.bfloat16
20 : 1048576 : layers.2.attention.wk.weight : [128, 8192] : torch.bfloat16
21 : 1048576 : layers.2.attention.wv.weight : [128, 8192] : torch.bfloat16
22 : 8388608 : layers.2.attention.wo.weight : [8192, 1024] : torch.bfloat16
23 : 29360128 : layers.2.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
24 : 29360128 : layers.2.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
25 : 29360128 : layers.2.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
26 : 8192 : layers.2.attention_norm.weight : [8192] : torch.bfloat16
27 : 8192 : layers.2.ffn_norm.weight : [8192] : torch.bfloat16
28 : 8388608 : layers.3.attention.wq.weight : [1024, 8192] : torch.bfloat16
29 : 1048576 : layers.3.attention.wk.weight : [128, 8192] : torch.bfloat16
30 : 1048576 : layers.3.attention.wv.weight : [128, 8192] : torch.bfloat16
31 : 8388608 : layers.3.attention.wo.weight : [8192, 1024] : torch.bfloat16
32 : 29360128 : layers.3.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
33 : 29360128 : layers.3.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
34 : 29360128 : layers.3.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
35 : 8192 : layers.3.attention_norm.weight : [8192] : torch.bfloat16
36 : 8192 : layers.3.ffn_norm.weight : [8192] : torch.bfloat16
37 : 8388608 : layers.4.attention.wq.weight : [1024, 8192] : torch.bfloat16
38 : 1048576 : layers.4.attention.wk.weight : [128, 8192] : torch.bfloat16
39 : 1048576 : layers.4.attention.wv.weight : [128, 8192] : torch.bfloat16
40 : 8388608 : layers.4.attention.wo.weight : [8192, 1024] : torch.bfloat16
41 : 29360128 : layers.4.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
42 : 29360128 : layers.4.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
43 : 29360128 : layers.4.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
44 : 8192 : layers.4.attention_norm.weight : [8192] : torch.bfloat16
45 : 8192 : layers.4.ffn_norm.weight : [8192] : torch.bfloat16
46 : 8388608 : layers.5.attention.wq.weight : [1024, 8192] : torch.bfloat16
47 : 1048576 : layers.5.attention.wk.weight : [128, 8192] : torch.bfloat16
48 : 1048576 : layers.5.attention.wv.weight : [128, 8192] : torch.bfloat16
49 : 8388608 : layers.5.attention.wo.weight : [8192, 1024] : torch.bfloat16
50 : 29360128 : layers.5.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
51 : 29360128 : layers.5.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
52 : 29360128 : layers.5.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
53 : 8192 : layers.5.attention_norm.weight : [8192] : torch.bfloat16
54 : 8192 : layers.5.ffn_norm.weight : [8192] : torch.bfloat16
55 : 8388608 : layers.6.attention.wq.weight : [1024, 8192] : torch.bfloat16
56 : 1048576 : layers.6.attention.wk.weight : [128, 8192] : torch.bfloat16
57 : 1048576 : layers.6.attention.wv.weight : [128, 8192] : torch.bfloat16
58 : 8388608 : layers.6.attention.wo.weight : [8192, 1024] : torch.bfloat16
59 : 29360128 : layers.6.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
60 : 29360128 : layers.6.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
61 : 29360128 : layers.6.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
62 : 8192 : layers.6.attention_norm.weight : [8192] : torch.bfloat16
63 : 8192 : layers.6.ffn_norm.weight : [8192] : torch.bfloat16
64 : 8388608 : layers.7.attention.wq.weight : [1024, 8192] : torch.bfloat16
65 : 1048576 : layers.7.attention.wk.weight : [128, 8192] : torch.bfloat16
66 : 1048576 : layers.7.attention.wv.weight : [128, 8192] : torch.bfloat16
67 : 8388608 : layers.7.attention.wo.weight : [8192, 1024] : torch.bfloat16
68 : 29360128 : layers.7.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
69 : 29360128 : layers.7.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
70 : 29360128 : layers.7.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
71 : 8192 : layers.7.attention_norm.weight : [8192] : torch.bfloat16
72 : 8192 : layers.7.ffn_norm.weight : [8192] : torch.bfloat16
73 : 8388608 : layers.8.attention.wq.weight : [1024, 8192] : torch.bfloat16
74 : 1048576 : layers.8.attention.wk.weight : [128, 8192] : torch.bfloat16
75 : 1048576 : layers.8.attention.wv.weight : [128, 8192] : torch.bfloat16
76 : 8388608 : layers.8.attention.wo.weight : [8192, 1024] : torch.bfloat16
77 : 29360128 : layers.8.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
78 : 29360128 : layers.8.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
79 : 29360128 : layers.8.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
80 : 8192 : layers.8.attention_norm.weight : [8192] : torch.bfloat16
81 : 8192 : layers.8.ffn_norm.weight : [8192] : torch.bfloat16
82 : 8388608 : layers.9.attention.wq.weight : [1024, 8192] : torch.bfloat16
83 : 1048576 : layers.9.attention.wk.weight : [128, 8192] : torch.bfloat16
84 : 1048576 : layers.9.attention.wv.weight : [128, 8192] : torch.bfloat16
85 : 8388608 : layers.9.attention.wo.weight : [8192, 1024] : torch.bfloat16
86 : 29360128 : layers.9.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
87 : 29360128 : layers.9.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
88 : 29360128 : layers.9.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
89 : 8192 : layers.9.attention_norm.weight : [8192] : torch.bfloat16
90 : 8192 : layers.9.ffn_norm.weight : [8192] : torch.bfloat16
91 : 8388608 : layers.10.attention.wq.weight : [1024, 8192] : torch.bfloat16
92 : 1048576 : layers.10.attention.wk.weight : [128, 8192] : torch.bfloat16
93 : 1048576 : layers.10.attention.wv.weight : [128, 8192] : torch.bfloat16
94 : 8388608 : layers.10.attention.wo.weight : [8192, 1024] : torch.bfloat16
95 : 29360128 : layers.10.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
96 : 29360128 : layers.10.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
97 : 29360128 : layers.10.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
98 : 8192 : layers.10.attention_norm.weight : [8192] : torch.bfloat16
99 : 8192 : layers.10.ffn_norm.weight : [8192] : torch.bfloat16
100 : 8388608 : layers.11.attention.wq.weight : [1024, 8192] : torch.bfloat16
101 : 1048576 : layers.11.attention.wk.weight : [128, 8192] : torch.bfloat16
102 : 1048576 : layers.11.attention.wv.weight : [128, 8192] : torch.bfloat16
103 : 8388608 : layers.11.attention.wo.weight : [8192, 1024] : torch.bfloat16
104 : 29360128 : layers.11.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
105 : 29360128 : layers.11.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
106 : 29360128 : layers.11.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
107 : 8192 : layers.11.attention_norm.weight : [8192] : torch.bfloat16
108 : 8192 : layers.11.ffn_norm.weight : [8192] : torch.bfloat16
109 : 8388608 : layers.12.attention.wq.weight : [1024, 8192] : torch.bfloat16
110 : 1048576 : layers.12.attention.wk.weight : [128, 8192] : torch.bfloat16
111 : 1048576 : layers.12.attention.wv.weight : [128, 8192] : torch.bfloat16
112 : 8388608 : layers.12.attention.wo.weight : [8192, 1024] : torch.bfloat16
113 : 29360128 : layers.12.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
114 : 29360128 : layers.12.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
115 : 29360128 : layers.12.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
116 : 8192 : layers.12.attention_norm.weight : [8192] : torch.bfloat16
117 : 8192 : layers.12.ffn_norm.weight : [8192] : torch.bfloat16
118 : 8388608 : layers.13.attention.wq.weight : [1024, 8192] : torch.bfloat16
119 : 1048576 : layers.13.attention.wk.weight : [128, 8192] : torch.bfloat16
120 : 1048576 : layers.13.attention.wv.weight : [128, 8192] : torch.bfloat16
121 : 8388608 : layers.13.attention.wo.weight : [8192, 1024] : torch.bfloat16
122 : 29360128 : layers.13.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
123 : 29360128 : layers.13.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
124 : 29360128 : layers.13.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
125 : 8192 : layers.13.attention_norm.weight : [8192] : torch.bfloat16
126 : 8192 : layers.13.ffn_norm.weight : [8192] : torch.bfloat16
127 : 8388608 : layers.14.attention.wq.weight : [1024, 8192] : torch.bfloat16
128 : 1048576 : layers.14.attention.wk.weight : [128, 8192] : torch.bfloat16
129 : 1048576 : layers.14.attention.wv.weight : [128, 8192] : torch.bfloat16
130 : 8388608 : layers.14.attention.wo.weight : [8192, 1024] : torch.bfloat16
131 : 29360128 : layers.14.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
132 : 29360128 : layers.14.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
133 : 29360128 : layers.14.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
134 : 8192 : layers.14.attention_norm.weight : [8192] : torch.bfloat16
135 : 8192 : layers.14.ffn_norm.weight : [8192] : torch.bfloat16
136 : 8388608 : layers.15.attention.wq.weight : [1024, 8192] : torch.bfloat16
137 : 1048576 : layers.15.attention.wk.weight : [128, 8192] : torch.bfloat16
138 : 1048576 : layers.15.attention.wv.weight : [128, 8192] : torch.bfloat16
139 : 8388608 : layers.15.attention.wo.weight : [8192, 1024] : torch.bfloat16
140 : 29360128 : layers.15.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
141 : 29360128 : layers.15.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
142 : 29360128 : layers.15.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
143 : 8192 : layers.15.attention_norm.weight : [8192] : torch.bfloat16
144 : 8192 : layers.15.ffn_norm.weight : [8192] : torch.bfloat16
145 : 8388608 : layers.16.attention.wq.weight : [1024, 8192] : torch.bfloat16
146 : 1048576 : layers.16.attention.wk.weight : [128, 8192] : torch.bfloat16
147 : 1048576 : layers.16.attention.wv.weight : [128, 8192] : torch.bfloat16
148 : 8388608 : layers.16.attention.wo.weight : [8192, 1024] : torch.bfloat16
149 : 29360128 : layers.16.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
150 : 29360128 : layers.16.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
151 : 29360128 : layers.16.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
152 : 8192 : layers.16.attention_norm.weight : [8192] : torch.bfloat16
153 : 8192 : layers.16.ffn_norm.weight : [8192] : torch.bfloat16
154 : 8388608 : layers.17.attention.wq.weight : [1024, 8192] : torch.bfloat16
155 : 1048576 : layers.17.attention.wk.weight : [128, 8192] : torch.bfloat16
156 : 1048576 : layers.17.attention.wv.weight : [128, 8192] : torch.bfloat16
157 : 8388608 : layers.17.attention.wo.weight : [8192, 1024] : torch.bfloat16
158 : 29360128 : layers.17.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
159 : 29360128 : layers.17.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
160 : 29360128 : layers.17.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
161 : 8192 : layers.17.attention_norm.weight : [8192] : torch.bfloat16
162 : 8192 : layers.17.ffn_norm.weight : [8192] : torch.bfloat16
163 : 8388608 : layers.18.attention.wq.weight : [1024, 8192] : torch.bfloat16
164 : 1048576 : layers.18.attention.wk.weight : [128, 8192] : torch.bfloat16
165 : 1048576 : layers.18.attention.wv.weight : [128, 8192] : torch.bfloat16
166 : 8388608 : layers.18.attention.wo.weight : [8192, 1024] : torch.bfloat16
167 : 29360128 : layers.18.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
168 : 29360128 : layers.18.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
169 : 29360128 : layers.18.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
170 : 8192 : layers.18.attention_norm.weight : [8192] : torch.bfloat16
171 : 8192 : layers.18.ffn_norm.weight : [8192] : torch.bfloat16
172 : 8388608 : layers.19.attention.wq.weight : [1024, 8192] : torch.bfloat16
173 : 1048576 : layers.19.attention.wk.weight : [128, 8192] : torch.bfloat16
174 : 1048576 : layers.19.attention.wv.weight : [128, 8192] : torch.bfloat16
175 : 8388608 : layers.19.attention.wo.weight : [8192, 1024] : torch.bfloat16
176 : 29360128 : layers.19.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
177 : 29360128 : layers.19.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
178 : 29360128 : layers.19.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
179 : 8192 : layers.19.attention_norm.weight : [8192] : torch.bfloat16
180 : 8192 : layers.19.ffn_norm.weight : [8192] : torch.bfloat16
181 : 8388608 : layers.20.attention.wq.weight : [1024, 8192] : torch.bfloat16
182 : 1048576 : layers.20.attention.wk.weight : [128, 8192] : torch.bfloat16
183 : 1048576 : layers.20.attention.wv.weight : [128, 8192] : torch.bfloat16
184 : 8388608 : layers.20.attention.wo.weight : [8192, 1024] : torch.bfloat16
185 : 29360128 : layers.20.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
186 : 29360128 : layers.20.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
187 : 29360128 : layers.20.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
188 : 8192 : layers.20.attention_norm.weight : [8192] : torch.bfloat16
189 : 8192 : layers.20.ffn_norm.weight : [8192] : torch.bfloat16
190 : 8388608 : layers.21.attention.wq.weight : [1024, 8192] : torch.bfloat16
191 : 1048576 : layers.21.attention.wk.weight : [128, 8192] : torch.bfloat16
192 : 1048576 : layers.21.attention.wv.weight : [128, 8192] : torch.bfloat16
193 : 8388608 : layers.21.attention.wo.weight : [8192, 1024] : torch.bfloat16
194 : 29360128 : layers.21.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
195 : 29360128 : layers.21.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
196 : 29360128 : layers.21.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
197 : 8192 : layers.21.attention_norm.weight : [8192] : torch.bfloat16
198 : 8192 : layers.21.ffn_norm.weight : [8192] : torch.bfloat16
199 : 8388608 : layers.22.attention.wq.weight : [1024, 8192] : torch.bfloat16
200 : 1048576 : layers.22.attention.wk.weight : [128, 8192] : torch.bfloat16
201 : 1048576 : layers.22.attention.wv.weight : [128, 8192] : torch.bfloat16
202 : 8388608 : layers.22.attention.wo.weight : [8192, 1024] : torch.bfloat16
203 : 29360128 : layers.22.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
204 : 29360128 : layers.22.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
205 : 29360128 : layers.22.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
206 : 8192 : layers.22.attention_norm.weight : [8192] : torch.bfloat16
207 : 8192 : layers.22.ffn_norm.weight : [8192] : torch.bfloat16
208 : 8388608 : layers.23.attention.wq.weight : [1024, 8192] : torch.bfloat16
209 : 1048576 : layers.23.attention.wk.weight : [128, 8192] : torch.bfloat16
210 : 1048576 : layers.23.attention.wv.weight : [128, 8192] : torch.bfloat16
211 : 8388608 : layers.23.attention.wo.weight : [8192, 1024] : torch.bfloat16
212 : 29360128 : layers.23.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
213 : 29360128 : layers.23.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
214 : 29360128 : layers.23.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
215 : 8192 : layers.23.attention_norm.weight : [8192] : torch.bfloat16
216 : 8192 : layers.23.ffn_norm.weight : [8192] : torch.bfloat16
217 : 8388608 : layers.24.attention.wq.weight : [1024, 8192] : torch.bfloat16
218 : 1048576 : layers.24.attention.wk.weight : [128, 8192] : torch.bfloat16
219 : 1048576 : layers.24.attention.wv.weight : [128, 8192] : torch.bfloat16
220 : 8388608 : layers.24.attention.wo.weight : [8192, 1024] : torch.bfloat16
221 : 29360128 : layers.24.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
222 : 29360128 : layers.24.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
223 : 29360128 : layers.24.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
224 : 8192 : layers.24.attention_norm.weight : [8192] : torch.bfloat16
225 : 8192 : layers.24.ffn_norm.weight : [8192] : torch.bfloat16
226 : 8388608 : layers.25.attention.wq.weight : [1024, 8192] : torch.bfloat16
227 : 1048576 : layers.25.attention.wk.weight : [128, 8192] : torch.bfloat16
228 : 1048576 : layers.25.attention.wv.weight : [128, 8192] : torch.bfloat16
229 : 8388608 : layers.25.attention.wo.weight : [8192, 1024] : torch.bfloat16
230 : 29360128 : layers.25.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
231 : 29360128 : layers.25.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
232 : 29360128 : layers.25.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
233 : 8192 : layers.25.attention_norm.weight : [8192] : torch.bfloat16
234 : 8192 : layers.25.ffn_norm.weight : [8192] : torch.bfloat16
235 : 8388608 : layers.26.attention.wq.weight : [1024, 8192] : torch.bfloat16
236 : 1048576 : layers.26.attention.wk.weight : [128, 8192] : torch.bfloat16
237 : 1048576 : layers.26.attention.wv.weight : [128, 8192] : torch.bfloat16
238 : 8388608 : layers.26.attention.wo.weight : [8192, 1024] : torch.bfloat16
239 : 29360128 : layers.26.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
240 : 29360128 : layers.26.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
241 : 29360128 : layers.26.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
242 : 8192 : layers.26.attention_norm.weight : [8192] : torch.bfloat16
243 : 8192 : layers.26.ffn_norm.weight : [8192] : torch.bfloat16
244 : 8388608 : layers.27.attention.wq.weight : [1024, 8192] : torch.bfloat16
245 : 1048576 : layers.27.attention.wk.weight : [128, 8192] : torch.bfloat16
246 : 1048576 : layers.27.attention.wv.weight : [128, 8192] : torch.bfloat16
247 : 8388608 : layers.27.attention.wo.weight : [8192, 1024] : torch.bfloat16
248 : 29360128 : layers.27.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
249 : 29360128 : layers.27.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
250 : 29360128 : layers.27.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
251 : 8192 : layers.27.attention_norm.weight : [8192] : torch.bfloat16
252 : 8192 : layers.27.ffn_norm.weight : [8192] : torch.bfloat16
253 : 8388608 : layers.28.attention.wq.weight : [1024, 8192] : torch.bfloat16
254 : 1048576 : layers.28.attention.wk.weight : [128, 8192] : torch.bfloat16
255 : 1048576 : layers.28.attention.wv.weight : [128, 8192] : torch.bfloat16
256 : 8388608 : layers.28.attention.wo.weight : [8192, 1024] : torch.bfloat16
257 : 29360128 : layers.28.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
258 : 29360128 : layers.28.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
259 : 29360128 : layers.28.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
260 : 8192 : layers.28.attention_norm.weight : [8192] : torch.bfloat16
261 : 8192 : layers.28.ffn_norm.weight : [8192] : torch.bfloat16
262 : 8388608 : layers.29.attention.wq.weight : [1024, 8192] : torch.bfloat16
263 : 1048576 : layers.29.attention.wk.weight : [128, 8192] : torch.bfloat16
264 : 1048576 : layers.29.attention.wv.weight : [128, 8192] : torch.bfloat16
265 : 8388608 : layers.29.attention.wo.weight : [8192, 1024] : torch.bfloat16
266 : 29360128 : layers.29.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
267 : 29360128 : layers.29.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
268 : 29360128 : layers.29.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
269 : 8192 : layers.29.attention_norm.weight : [8192] : torch.bfloat16
270 : 8192 : layers.29.ffn_norm.weight : [8192] : torch.bfloat16
271 : 8388608 : layers.30.attention.wq.weight : [1024, 8192] : torch.bfloat16
272 : 1048576 : layers.30.attention.wk.weight : [128, 8192] : torch.bfloat16
273 : 1048576 : layers.30.attention.wv.weight : [128, 8192] : torch.bfloat16
274 : 8388608 : layers.30.attention.wo.weight : [8192, 1024] : torch.bfloat16
275 : 29360128 : layers.30.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
276 : 29360128 : layers.30.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
277 : 29360128 : layers.30.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
278 : 8192 : layers.30.attention_norm.weight : [8192] : torch.bfloat16
279 : 8192 : layers.30.ffn_norm.weight : [8192] : torch.bfloat16
280 : 8388608 : layers.31.attention.wq.weight : [1024, 8192] : torch.bfloat16
281 : 1048576 : layers.31.attention.wk.weight : [128, 8192] : torch.bfloat16
282 : 1048576 : layers.31.attention.wv.weight : [128, 8192] : torch.bfloat16
283 : 8388608 : layers.31.attention.wo.weight : [8192, 1024] : torch.bfloat16
284 : 29360128 : layers.31.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
285 : 29360128 : layers.31.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
286 : 29360128 : layers.31.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
287 : 8192 : layers.31.attention_norm.weight : [8192] : torch.bfloat16
288 : 8192 : layers.31.ffn_norm.weight : [8192] : torch.bfloat16
289 : 8388608 : layers.32.attention.wq.weight : [1024, 8192] : torch.bfloat16
290 : 1048576 : layers.32.attention.wk.weight : [128, 8192] : torch.bfloat16
291 : 1048576 : layers.32.attention.wv.weight : [128, 8192] : torch.bfloat16
292 : 8388608 : layers.32.attention.wo.weight : [8192, 1024] : torch.bfloat16
293 : 29360128 : layers.32.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
294 : 29360128 : layers.32.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
295 : 29360128 : layers.32.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
296 : 8192 : layers.32.attention_norm.weight : [8192] : torch.bfloat16
297 : 8192 : layers.32.ffn_norm.weight : [8192] : torch.bfloat16
298 : 8388608 : layers.33.attention.wq.weight : [1024, 8192] : torch.bfloat16
299 : 1048576 : layers.33.attention.wk.weight : [128, 8192] : torch.bfloat16
300 : 1048576 : layers.33.attention.wv.weight : [128, 8192] : torch.bfloat16
301 : 8388608 : layers.33.attention.wo.weight : [8192, 1024] : torch.bfloat16
302 : 29360128 : layers.33.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
303 : 29360128 : layers.33.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
304 : 29360128 : layers.33.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
305 : 8192 : layers.33.attention_norm.weight : [8192] : torch.bfloat16
306 : 8192 : layers.33.ffn_norm.weight : [8192] : torch.bfloat16
307 : 8388608 : layers.34.attention.wq.weight : [1024, 8192] : torch.bfloat16
308 : 1048576 : layers.34.attention.wk.weight : [128, 8192] : torch.bfloat16
309 : 1048576 : layers.34.attention.wv.weight : [128, 8192] : torch.bfloat16
310 : 8388608 : layers.34.attention.wo.weight : [8192, 1024] : torch.bfloat16
311 : 29360128 : layers.34.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
312 : 29360128 : layers.34.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
313 : 29360128 : layers.34.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
314 : 8192 : layers.34.attention_norm.weight : [8192] : torch.bfloat16
315 : 8192 : layers.34.ffn_norm.weight : [8192] : torch.bfloat16
316 : 8388608 : layers.35.attention.wq.weight : [1024, 8192] : torch.bfloat16
317 : 1048576 : layers.35.attention.wk.weight : [128, 8192] : torch.bfloat16
318 : 1048576 : layers.35.attention.wv.weight : [128, 8192] : torch.bfloat16
319 : 8388608 : layers.35.attention.wo.weight : [8192, 1024] : torch.bfloat16
320 : 29360128 : layers.35.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
321 : 29360128 : layers.35.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
322 : 29360128 : layers.35.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
323 : 8192 : layers.35.attention_norm.weight : [8192] : torch.bfloat16
324 : 8192 : layers.35.ffn_norm.weight : [8192] : torch.bfloat16
325 : 8388608 : layers.36.attention.wq.weight : [1024, 8192] : torch.bfloat16
326 : 1048576 : layers.36.attention.wk.weight : [128, 8192] : torch.bfloat16
327 : 1048576 : layers.36.attention.wv.weight : [128, 8192] : torch.bfloat16
328 : 8388608 : layers.36.attention.wo.weight : [8192, 1024] : torch.bfloat16
329 : 29360128 : layers.36.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
330 : 29360128 : layers.36.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
331 : 29360128 : layers.36.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
332 : 8192 : layers.36.attention_norm.weight : [8192] : torch.bfloat16
333 : 8192 : layers.36.ffn_norm.weight : [8192] : torch.bfloat16
334 : 8388608 : layers.37.attention.wq.weight : [1024, 8192] : torch.bfloat16
335 : 1048576 : layers.37.attention.wk.weight : [128, 8192] : torch.bfloat16
336 : 1048576 : layers.37.attention.wv.weight : [128, 8192] : torch.bfloat16
337 : 8388608 : layers.37.attention.wo.weight : [8192, 1024] : torch.bfloat16
338 : 29360128 : layers.37.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
339 : 29360128 : layers.37.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
340 : 29360128 : layers.37.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
341 : 8192 : layers.37.attention_norm.weight : [8192] : torch.bfloat16
342 : 8192 : layers.37.ffn_norm.weight : [8192] : torch.bfloat16
343 : 8388608 : layers.38.attention.wq.weight : [1024, 8192] : torch.bfloat16
344 : 1048576 : layers.38.attention.wk.weight : [128, 8192] : torch.bfloat16
345 : 1048576 : layers.38.attention.wv.weight : [128, 8192] : torch.bfloat16
346 : 8388608 : layers.38.attention.wo.weight : [8192, 1024] : torch.bfloat16
347 : 29360128 : layers.38.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
348 : 29360128 : layers.38.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
349 : 29360128 : layers.38.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
350 : 8192 : layers.38.attention_norm.weight : [8192] : torch.bfloat16
351 : 8192 : layers.38.ffn_norm.weight : [8192] : torch.bfloat16
352 : 8388608 : layers.39.attention.wq.weight : [1024, 8192] : torch.bfloat16
353 : 1048576 : layers.39.attention.wk.weight : [128, 8192] : torch.bfloat16
354 : 1048576 : layers.39.attention.wv.weight : [128, 8192] : torch.bfloat16
355 : 8388608 : layers.39.attention.wo.weight : [8192, 1024] : torch.bfloat16
356 : 29360128 : layers.39.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
357 : 29360128 : layers.39.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
358 : 29360128 : layers.39.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
359 : 8192 : layers.39.attention_norm.weight : [8192] : torch.bfloat16
360 : 8192 : layers.39.ffn_norm.weight : [8192] : torch.bfloat16
361 : 8388608 : layers.40.attention.wq.weight : [1024, 8192] : torch.bfloat16
362 : 1048576 : layers.40.attention.wk.weight : [128, 8192] : torch.bfloat16
363 : 1048576 : layers.40.attention.wv.weight : [128, 8192] : torch.bfloat16
364 : 8388608 : layers.40.attention.wo.weight : [8192, 1024] : torch.bfloat16
365 : 29360128 : layers.40.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
366 : 29360128 : layers.40.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
367 : 29360128 : layers.40.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
368 : 8192 : layers.40.attention_norm.weight : [8192] : torch.bfloat16
369 : 8192 : layers.40.ffn_norm.weight : [8192] : torch.bfloat16
370 : 8388608 : layers.41.attention.wq.weight : [1024, 8192] : torch.bfloat16
371 : 1048576 : layers.41.attention.wk.weight : [128, 8192] : torch.bfloat16
372 : 1048576 : layers.41.attention.wv.weight : [128, 8192] : torch.bfloat16
373 : 8388608 : layers.41.attention.wo.weight : [8192, 1024] : torch.bfloat16
374 : 29360128 : layers.41.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
375 : 29360128 : layers.41.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
376 : 29360128 : layers.41.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
377 : 8192 : layers.41.attention_norm.weight : [8192] : torch.bfloat16
378 : 8192 : layers.41.ffn_norm.weight : [8192] : torch.bfloat16
379 : 8388608 : layers.42.attention.wq.weight : [1024, 8192] : torch.bfloat16
380 : 1048576 : layers.42.attention.wk.weight : [128, 8192] : torch.bfloat16
381 : 1048576 : layers.42.attention.wv.weight : [128, 8192] : torch.bfloat16
382 : 8388608 : layers.42.attention.wo.weight : [8192, 1024] : torch.bfloat16
383 : 29360128 : layers.42.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
384 : 29360128 : layers.42.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
385 : 29360128 : layers.42.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
386 : 8192 : layers.42.attention_norm.weight : [8192] : torch.bfloat16
387 : 8192 : layers.42.ffn_norm.weight : [8192] : torch.bfloat16
388 : 8388608 : layers.43.attention.wq.weight : [1024, 8192] : torch.bfloat16
389 : 1048576 : layers.43.attention.wk.weight : [128, 8192] : torch.bfloat16
390 : 1048576 : layers.43.attention.wv.weight : [128, 8192] : torch.bfloat16
391 : 8388608 : layers.43.attention.wo.weight : [8192, 1024] : torch.bfloat16
392 : 29360128 : layers.43.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
393 : 29360128 : layers.43.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
394 : 29360128 : layers.43.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
395 : 8192 : layers.43.attention_norm.weight : [8192] : torch.bfloat16
396 : 8192 : layers.43.ffn_norm.weight : [8192] : torch.bfloat16
397 : 8388608 : layers.44.attention.wq.weight : [1024, 8192] : torch.bfloat16
398 : 1048576 : layers.44.attention.wk.weight : [128, 8192] : torch.bfloat16
399 : 1048576 : layers.44.attention.wv.weight : [128, 8192] : torch.bfloat16
400 : 8388608 : layers.44.attention.wo.weight : [8192, 1024] : torch.bfloat16
401 : 29360128 : layers.44.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
402 : 29360128 : layers.44.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
403 : 29360128 : layers.44.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
404 : 8192 : layers.44.attention_norm.weight : [8192] : torch.bfloat16
405 : 8192 : layers.44.ffn_norm.weight : [8192] : torch.bfloat16
406 : 8388608 : layers.45.attention.wq.weight : [1024, 8192] : torch.bfloat16
407 : 1048576 : layers.45.attention.wk.weight : [128, 8192] : torch.bfloat16
408 : 1048576 : layers.45.attention.wv.weight : [128, 8192] : torch.bfloat16
409 : 8388608 : layers.45.attention.wo.weight : [8192, 1024] : torch.bfloat16
410 : 29360128 : layers.45.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
411 : 29360128 : layers.45.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
412 : 29360128 : layers.45.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
413 : 8192 : layers.45.attention_norm.weight : [8192] : torch.bfloat16
414 : 8192 : layers.45.ffn_norm.weight : [8192] : torch.bfloat16
415 : 8388608 : layers.46.attention.wq.weight : [1024, 8192] : torch.bfloat16
416 : 1048576 : layers.46.attention.wk.weight : [128, 8192] : torch.bfloat16
417 : 1048576 : layers.46.attention.wv.weight : [128, 8192] : torch.bfloat16
418 : 8388608 : layers.46.attention.wo.weight : [8192, 1024] : torch.bfloat16
419 : 29360128 : layers.46.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
420 : 29360128 : layers.46.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
421 : 29360128 : layers.46.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
422 : 8192 : layers.46.attention_norm.weight : [8192] : torch.bfloat16
423 : 8192 : layers.46.ffn_norm.weight : [8192] : torch.bfloat16
424 : 8388608 : layers.47.attention.wq.weight : [1024, 8192] : torch.bfloat16
425 : 1048576 : layers.47.attention.wk.weight : [128, 8192] : torch.bfloat16
426 : 1048576 : layers.47.attention.wv.weight : [128, 8192] : torch.bfloat16
427 : 8388608 : layers.47.attention.wo.weight : [8192, 1024] : torch.bfloat16
428 : 29360128 : layers.47.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
429 : 29360128 : layers.47.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
430 : 29360128 : layers.47.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
431 : 8192 : layers.47.attention_norm.weight : [8192] : torch.bfloat16
432 : 8192 : layers.47.ffn_norm.weight : [8192] : torch.bfloat16
433 : 8388608 : layers.48.attention.wq.weight : [1024, 8192] : torch.bfloat16
434 : 1048576 : layers.48.attention.wk.weight : [128, 8192] : torch.bfloat16
435 : 1048576 : layers.48.attention.wv.weight : [128, 8192] : torch.bfloat16
436 : 8388608 : layers.48.attention.wo.weight : [8192, 1024] : torch.bfloat16
437 : 29360128 : layers.48.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
438 : 29360128 : layers.48.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
439 : 29360128 : layers.48.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
440 : 8192 : layers.48.attention_norm.weight : [8192] : torch.bfloat16
441 : 8192 : layers.48.ffn_norm.weight : [8192] : torch.bfloat16
442 : 8388608 : layers.49.attention.wq.weight : [1024, 8192] : torch.bfloat16
443 : 1048576 : layers.49.attention.wk.weight : [128, 8192] : torch.bfloat16
444 : 1048576 : layers.49.attention.wv.weight : [128, 8192] : torch.bfloat16
445 : 8388608 : layers.49.attention.wo.weight : [8192, 1024] : torch.bfloat16
446 : 29360128 : layers.49.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
447 : 29360128 : layers.49.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
448 : 29360128 : layers.49.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
449 : 8192 : layers.49.attention_norm.weight : [8192] : torch.bfloat16
450 : 8192 : layers.49.ffn_norm.weight : [8192] : torch.bfloat16
451 : 8388608 : layers.50.attention.wq.weight : [1024, 8192] : torch.bfloat16
452 : 1048576 : layers.50.attention.wk.weight : [128, 8192] : torch.bfloat16
453 : 1048576 : layers.50.attention.wv.weight : [128, 8192] : torch.bfloat16
454 : 8388608 : layers.50.attention.wo.weight : [8192, 1024] : torch.bfloat16
455 : 29360128 : layers.50.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
456 : 29360128 : layers.50.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
457 : 29360128 : layers.50.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
458 : 8192 : layers.50.attention_norm.weight : [8192] : torch.bfloat16
459 : 8192 : layers.50.ffn_norm.weight : [8192] : torch.bfloat16
460 : 8388608 : layers.51.attention.wq.weight : [1024, 8192] : torch.bfloat16
461 : 1048576 : layers.51.attention.wk.weight : [128, 8192] : torch.bfloat16
462 : 1048576 : layers.51.attention.wv.weight : [128, 8192] : torch.bfloat16
463 : 8388608 : layers.51.attention.wo.weight : [8192, 1024] : torch.bfloat16
464 : 29360128 : layers.51.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
465 : 29360128 : layers.51.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
466 : 29360128 : layers.51.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
467 : 8192 : layers.51.attention_norm.weight : [8192] : torch.bfloat16
468 : 8192 : layers.51.ffn_norm.weight : [8192] : torch.bfloat16
469 : 8388608 : layers.52.attention.wq.weight : [1024, 8192] : torch.bfloat16
470 : 1048576 : layers.52.attention.wk.weight : [128, 8192] : torch.bfloat16
471 : 1048576 : layers.52.attention.wv.weight : [128, 8192] : torch.bfloat16
472 : 8388608 : layers.52.attention.wo.weight : [8192, 1024] : torch.bfloat16
473 : 29360128 : layers.52.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
474 : 29360128 : layers.52.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
475 : 29360128 : layers.52.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
476 : 8192 : layers.52.attention_norm.weight : [8192] : torch.bfloat16
477 : 8192 : layers.52.ffn_norm.weight : [8192] : torch.bfloat16
478 : 8388608 : layers.53.attention.wq.weight : [1024, 8192] : torch.bfloat16
479 : 1048576 : layers.53.attention.wk.weight : [128, 8192] : torch.bfloat16
480 : 1048576 : layers.53.attention.wv.weight : [128, 8192] : torch.bfloat16
481 : 8388608 : layers.53.attention.wo.weight : [8192, 1024] : torch.bfloat16
482 : 29360128 : layers.53.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
483 : 29360128 : layers.53.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
484 : 29360128 : layers.53.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
485 : 8192 : layers.53.attention_norm.weight : [8192] : torch.bfloat16
486 : 8192 : layers.53.ffn_norm.weight : [8192] : torch.bfloat16
487 : 8388608 : layers.54.attention.wq.weight : [1024, 8192] : torch.bfloat16
488 : 1048576 : layers.54.attention.wk.weight : [128, 8192] : torch.bfloat16
489 : 1048576 : layers.54.attention.wv.weight : [128, 8192] : torch.bfloat16
490 : 8388608 : layers.54.attention.wo.weight : [8192, 1024] : torch.bfloat16
491 : 29360128 : layers.54.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
492 : 29360128 : layers.54.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
493 : 29360128 : layers.54.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
494 : 8192 : layers.54.attention_norm.weight : [8192] : torch.bfloat16
495 : 8192 : layers.54.ffn_norm.weight : [8192] : torch.bfloat16
496 : 8388608 : layers.55.attention.wq.weight : [1024, 8192] : torch.bfloat16
497 : 1048576 : layers.55.attention.wk.weight : [128, 8192] : torch.bfloat16
498 : 1048576 : layers.55.attention.wv.weight : [128, 8192] : torch.bfloat16
499 : 8388608 : layers.55.attention.wo.weight : [8192, 1024] : torch.bfloat16
500 : 29360128 : layers.55.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
501 : 29360128 : layers.55.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
502 : 29360128 : layers.55.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
503 : 8192 : layers.55.attention_norm.weight : [8192] : torch.bfloat16
504 : 8192 : layers.55.ffn_norm.weight : [8192] : torch.bfloat16
505 : 8388608 : layers.56.attention.wq.weight : [1024, 8192] : torch.bfloat16
506 : 1048576 : layers.56.attention.wk.weight : [128, 8192] : torch.bfloat16
507 : 1048576 : layers.56.attention.wv.weight : [128, 8192] : torch.bfloat16
508 : 8388608 : layers.56.attention.wo.weight : [8192, 1024] : torch.bfloat16
509 : 29360128 : layers.56.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
510 : 29360128 : layers.56.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
511 : 29360128 : layers.56.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
512 : 8192 : layers.56.attention_norm.weight : [8192] : torch.bfloat16
513 : 8192 : layers.56.ffn_norm.weight : [8192] : torch.bfloat16
514 : 8388608 : layers.57.attention.wq.weight : [1024, 8192] : torch.bfloat16
515 : 1048576 : layers.57.attention.wk.weight : [128, 8192] : torch.bfloat16
516 : 1048576 : layers.57.attention.wv.weight : [128, 8192] : torch.bfloat16
517 : 8388608 : layers.57.attention.wo.weight : [8192, 1024] : torch.bfloat16
518 : 29360128 : layers.57.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
519 : 29360128 : layers.57.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
520 : 29360128 : layers.57.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
521 : 8192 : layers.57.attention_norm.weight : [8192] : torch.bfloat16
522 : 8192 : layers.57.ffn_norm.weight : [8192] : torch.bfloat16
523 : 8388608 : layers.58.attention.wq.weight : [1024, 8192] : torch.bfloat16
524 : 1048576 : layers.58.attention.wk.weight : [128, 8192] : torch.bfloat16
525 : 1048576 : layers.58.attention.wv.weight : [128, 8192] : torch.bfloat16
526 : 8388608 : layers.58.attention.wo.weight : [8192, 1024] : torch.bfloat16
527 : 29360128 : layers.58.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
528 : 29360128 : layers.58.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
529 : 29360128 : layers.58.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
530 : 8192 : layers.58.attention_norm.weight : [8192] : torch.bfloat16
531 : 8192 : layers.58.ffn_norm.weight : [8192] : torch.bfloat16
532 : 8388608 : layers.59.attention.wq.weight : [1024, 8192] : torch.bfloat16
533 : 1048576 : layers.59.attention.wk.weight : [128, 8192] : torch.bfloat16
534 : 1048576 : layers.59.attention.wv.weight : [128, 8192] : torch.bfloat16
535 : 8388608 : layers.59.attention.wo.weight : [8192, 1024] : torch.bfloat16
536 : 29360128 : layers.59.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
537 : 29360128 : layers.59.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
538 : 29360128 : layers.59.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
539 : 8192 : layers.59.attention_norm.weight : [8192] : torch.bfloat16
540 : 8192 : layers.59.ffn_norm.weight : [8192] : torch.bfloat16
541 : 8388608 : layers.60.attention.wq.weight : [1024, 8192] : torch.bfloat16
542 : 1048576 : layers.60.attention.wk.weight : [128, 8192] : torch.bfloat16
543 : 1048576 : layers.60.attention.wv.weight : [128, 8192] : torch.bfloat16
544 : 8388608 : layers.60.attention.wo.weight : [8192, 1024] : torch.bfloat16
545 : 29360128 : layers.60.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
546 : 29360128 : layers.60.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
547 : 29360128 : layers.60.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
548 : 8192 : layers.60.attention_norm.weight : [8192] : torch.bfloat16
549 : 8192 : layers.60.ffn_norm.weight : [8192] : torch.bfloat16
550 : 8388608 : layers.61.attention.wq.weight : [1024, 8192] : torch.bfloat16
551 : 1048576 : layers.61.attention.wk.weight : [128, 8192] : torch.bfloat16
552 : 1048576 : layers.61.attention.wv.weight : [128, 8192] : torch.bfloat16
553 : 8388608 : layers.61.attention.wo.weight : [8192, 1024] : torch.bfloat16
554 : 29360128 : layers.61.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
555 : 29360128 : layers.61.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
556 : 29360128 : layers.61.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
557 : 8192 : layers.61.attention_norm.weight : [8192] : torch.bfloat16
558 : 8192 : layers.61.ffn_norm.weight : [8192] : torch.bfloat16
559 : 8388608 : layers.62.attention.wq.weight : [1024, 8192] : torch.bfloat16
560 : 1048576 : layers.62.attention.wk.weight : [128, 8192] : torch.bfloat16
561 : 1048576 : layers.62.attention.wv.weight : [128, 8192] : torch.bfloat16
562 : 8388608 : layers.62.attention.wo.weight : [8192, 1024] : torch.bfloat16
563 : 29360128 : layers.62.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
564 : 29360128 : layers.62.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
565 : 29360128 : layers.62.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
566 : 8192 : layers.62.attention_norm.weight : [8192] : torch.bfloat16
567 : 8192 : layers.62.ffn_norm.weight : [8192] : torch.bfloat16
568 : 8388608 : layers.63.attention.wq.weight : [1024, 8192] : torch.bfloat16
569 : 1048576 : layers.63.attention.wk.weight : [128, 8192] : torch.bfloat16
570 : 1048576 : layers.63.attention.wv.weight : [128, 8192] : torch.bfloat16
571 : 8388608 : layers.63.attention.wo.weight : [8192, 1024] : torch.bfloat16
572 : 29360128 : layers.63.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
573 : 29360128 : layers.63.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
574 : 29360128 : layers.63.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
575 : 8192 : layers.63.attention_norm.weight : [8192] : torch.bfloat16
576 : 8192 : layers.63.ffn_norm.weight : [8192] : torch.bfloat16
577 : 8388608 : layers.64.attention.wq.weight : [1024, 8192] : torch.bfloat16
578 : 1048576 : layers.64.attention.wk.weight : [128, 8192] : torch.bfloat16
579 : 1048576 : layers.64.attention.wv.weight : [128, 8192] : torch.bfloat16
580 : 8388608 : layers.64.attention.wo.weight : [8192, 1024] : torch.bfloat16
581 : 29360128 : layers.64.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
582 : 29360128 : layers.64.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
583 : 29360128 : layers.64.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
584 : 8192 : layers.64.attention_norm.weight : [8192] : torch.bfloat16
585 : 8192 : layers.64.ffn_norm.weight : [8192] : torch.bfloat16
586 : 8388608 : layers.65.attention.wq.weight : [1024, 8192] : torch.bfloat16
587 : 1048576 : layers.65.attention.wk.weight : [128, 8192] : torch.bfloat16
588 : 1048576 : layers.65.attention.wv.weight : [128, 8192] : torch.bfloat16
589 : 8388608 : layers.65.attention.wo.weight : [8192, 1024] : torch.bfloat16
590 : 29360128 : layers.65.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
591 : 29360128 : layers.65.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
592 : 29360128 : layers.65.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
593 : 8192 : layers.65.attention_norm.weight : [8192] : torch.bfloat16
594 : 8192 : layers.65.ffn_norm.weight : [8192] : torch.bfloat16
595 : 8388608 : layers.66.attention.wq.weight : [1024, 8192] : torch.bfloat16
596 : 1048576 : layers.66.attention.wk.weight : [128, 8192] : torch.bfloat16
597 : 1048576 : layers.66.attention.wv.weight : [128, 8192] : torch.bfloat16
598 : 8388608 : layers.66.attention.wo.weight : [8192, 1024] : torch.bfloat16
599 : 29360128 : layers.66.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
600 : 29360128 : layers.66.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
601 : 29360128 : layers.66.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
602 : 8192 : layers.66.attention_norm.weight : [8192] : torch.bfloat16
603 : 8192 : layers.66.ffn_norm.weight : [8192] : torch.bfloat16
604 : 8388608 : layers.67.attention.wq.weight : [1024, 8192] : torch.bfloat16
605 : 1048576 : layers.67.attention.wk.weight : [128, 8192] : torch.bfloat16
606 : 1048576 : layers.67.attention.wv.weight : [128, 8192] : torch.bfloat16
607 : 8388608 : layers.67.attention.wo.weight : [8192, 1024] : torch.bfloat16
608 : 29360128 : layers.67.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
609 : 29360128 : layers.67.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
610 : 29360128 : layers.67.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
611 : 8192 : layers.67.attention_norm.weight : [8192] : torch.bfloat16
612 : 8192 : layers.67.ffn_norm.weight : [8192] : torch.bfloat16
613 : 8388608 : layers.68.attention.wq.weight : [1024, 8192] : torch.bfloat16
614 : 1048576 : layers.68.attention.wk.weight : [128, 8192] : torch.bfloat16
615 : 1048576 : layers.68.attention.wv.weight : [128, 8192] : torch.bfloat16
616 : 8388608 : layers.68.attention.wo.weight : [8192, 1024] : torch.bfloat16
617 : 29360128 : layers.68.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
618 : 29360128 : layers.68.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
619 : 29360128 : layers.68.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
620 : 8192 : layers.68.attention_norm.weight : [8192] : torch.bfloat16
621 : 8192 : layers.68.ffn_norm.weight : [8192] : torch.bfloat16
622 : 8388608 : layers.69.attention.wq.weight : [1024, 8192] : torch.bfloat16
623 : 1048576 : layers.69.attention.wk.weight : [128, 8192] : torch.bfloat16
624 : 1048576 : layers.69.attention.wv.weight : [128, 8192] : torch.bfloat16
625 : 8388608 : layers.69.attention.wo.weight : [8192, 1024] : torch.bfloat16
626 : 29360128 : layers.69.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
627 : 29360128 : layers.69.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
628 : 29360128 : layers.69.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
629 : 8192 : layers.69.attention_norm.weight : [8192] : torch.bfloat16
630 : 8192 : layers.69.ffn_norm.weight : [8192] : torch.bfloat16
631 : 8388608 : layers.70.attention.wq.weight : [1024, 8192] : torch.bfloat16
632 : 1048576 : layers.70.attention.wk.weight : [128, 8192] : torch.bfloat16
633 : 1048576 : layers.70.attention.wv.weight : [128, 8192] : torch.bfloat16
634 : 8388608 : layers.70.attention.wo.weight : [8192, 1024] : torch.bfloat16
635 : 29360128 : layers.70.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
636 : 29360128 : layers.70.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
637 : 29360128 : layers.70.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
638 : 8192 : layers.70.attention_norm.weight : [8192] : torch.bfloat16
639 : 8192 : layers.70.ffn_norm.weight : [8192] : torch.bfloat16
640 : 8388608 : layers.71.attention.wq.weight : [1024, 8192] : torch.bfloat16
641 : 1048576 : layers.71.attention.wk.weight : [128, 8192] : torch.bfloat16
642 : 1048576 : layers.71.attention.wv.weight : [128, 8192] : torch.bfloat16
643 : 8388608 : layers.71.attention.wo.weight : [8192, 1024] : torch.bfloat16
644 : 29360128 : layers.71.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
645 : 29360128 : layers.71.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
646 : 29360128 : layers.71.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
647 : 8192 : layers.71.attention_norm.weight : [8192] : torch.bfloat16
648 : 8192 : layers.71.ffn_norm.weight : [8192] : torch.bfloat16
649 : 8388608 : layers.72.attention.wq.weight : [1024, 8192] : torch.bfloat16
650 : 1048576 : layers.72.attention.wk.weight : [128, 8192] : torch.bfloat16
651 : 1048576 : layers.72.attention.wv.weight : [128, 8192] : torch.bfloat16
652 : 8388608 : layers.72.attention.wo.weight : [8192, 1024] : torch.bfloat16
653 : 29360128 : layers.72.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
654 : 29360128 : layers.72.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
655 : 29360128 : layers.72.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
656 : 8192 : layers.72.attention_norm.weight : [8192] : torch.bfloat16
657 : 8192 : layers.72.ffn_norm.weight : [8192] : torch.bfloat16
658 : 8388608 : layers.73.attention.wq.weight : [1024, 8192] : torch.bfloat16
659 : 1048576 : layers.73.attention.wk.weight : [128, 8192] : torch.bfloat16
660 : 1048576 : layers.73.attention.wv.weight : [128, 8192] : torch.bfloat16
661 : 8388608 : layers.73.attention.wo.weight : [8192, 1024] : torch.bfloat16
662 : 29360128 : layers.73.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
663 : 29360128 : layers.73.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
664 : 29360128 : layers.73.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
665 : 8192 : layers.73.attention_norm.weight : [8192] : torch.bfloat16
666 : 8192 : layers.73.ffn_norm.weight : [8192] : torch.bfloat16
667 : 8388608 : layers.74.attention.wq.weight : [1024, 8192] : torch.bfloat16
668 : 1048576 : layers.74.attention.wk.weight : [128, 8192] : torch.bfloat16
669 : 1048576 : layers.74.attention.wv.weight : [128, 8192] : torch.bfloat16
670 : 8388608 : layers.74.attention.wo.weight : [8192, 1024] : torch.bfloat16
671 : 29360128 : layers.74.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
672 : 29360128 : layers.74.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
673 : 29360128 : layers.74.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
674 : 8192 : layers.74.attention_norm.weight : [8192] : torch.bfloat16
675 : 8192 : layers.74.ffn_norm.weight : [8192] : torch.bfloat16
676 : 8388608 : layers.75.attention.wq.weight : [1024, 8192] : torch.bfloat16
677 : 1048576 : layers.75.attention.wk.weight : [128, 8192] : torch.bfloat16
678 : 1048576 : layers.75.attention.wv.weight : [128, 8192] : torch.bfloat16
679 : 8388608 : layers.75.attention.wo.weight : [8192, 1024] : torch.bfloat16
680 : 29360128 : layers.75.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
681 : 29360128 : layers.75.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
682 : 29360128 : layers.75.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
683 : 8192 : layers.75.attention_norm.weight : [8192] : torch.bfloat16
684 : 8192 : layers.75.ffn_norm.weight : [8192] : torch.bfloat16
685 : 8388608 : layers.76.attention.wq.weight : [1024, 8192] : torch.bfloat16
686 : 1048576 : layers.76.attention.wk.weight : [128, 8192] : torch.bfloat16
687 : 1048576 : layers.76.attention.wv.weight : [128, 8192] : torch.bfloat16
688 : 8388608 : layers.76.attention.wo.weight : [8192, 1024] : torch.bfloat16
689 : 29360128 : layers.76.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
690 : 29360128 : layers.76.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
691 : 29360128 : layers.76.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
692 : 8192 : layers.76.attention_norm.weight : [8192] : torch.bfloat16
693 : 8192 : layers.76.ffn_norm.weight : [8192] : torch.bfloat16
694 : 8388608 : layers.77.attention.wq.weight : [1024, 8192] : torch.bfloat16
695 : 1048576 : layers.77.attention.wk.weight : [128, 8192] : torch.bfloat16
696 : 1048576 : layers.77.attention.wv.weight : [128, 8192] : torch.bfloat16
697 : 8388608 : layers.77.attention.wo.weight : [8192, 1024] : torch.bfloat16
698 : 29360128 : layers.77.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
699 : 29360128 : layers.77.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
700 : 29360128 : layers.77.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
701 : 8192 : layers.77.attention_norm.weight : [8192] : torch.bfloat16
702 : 8192 : layers.77.ffn_norm.weight : [8192] : torch.bfloat16
703 : 8388608 : layers.78.attention.wq.weight : [1024, 8192] : torch.bfloat16
704 : 1048576 : layers.78.attention.wk.weight : [128, 8192] : torch.bfloat16
705 : 1048576 : layers.78.attention.wv.weight : [128, 8192] : torch.bfloat16
706 : 8388608 : layers.78.attention.wo.weight : [8192, 1024] : torch.bfloat16
707 : 29360128 : layers.78.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
708 : 29360128 : layers.78.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
709 : 29360128 : layers.78.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
710 : 8192 : layers.78.attention_norm.weight : [8192] : torch.bfloat16
711 : 8192 : layers.78.ffn_norm.weight : [8192] : torch.bfloat16
712 : 8388608 : layers.79.attention.wq.weight : [1024, 8192] : torch.bfloat16
713 : 1048576 : layers.79.attention.wk.weight : [128, 8192] : torch.bfloat16
714 : 1048576 : layers.79.attention.wv.weight : [128, 8192] : torch.bfloat16
715 : 8388608 : layers.79.attention.wo.weight : [8192, 1024] : torch.bfloat16
716 : 29360128 : layers.79.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
717 : 29360128 : layers.79.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
718 : 29360128 : layers.79.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
719 : 8192 : layers.79.attention_norm.weight : [8192] : torch.bfloat16
720 : 8192 : layers.79.ffn_norm.weight : [8192] : torch.bfloat16
721 : 8192 : norm.weight : [8192] : torch.bfloat16
722 : 131334144 : output.weight : [16032, 8192] : torch.bfloat16
-----------------------------------------------------------------------------
35281469440 params in total.
70562938880 bytes in total.
24.61 sec, 6.07 sec, 2734.13 MB/s
-----------------------------------------------------------------------------
[5/8]: Loading original/consolidated.04.pth
0 : 131334144 : tok_embeddings.weight : [16032, 8192] : torch.bfloat16
1 : 8388608 : layers.0.attention.wq.weight : [1024, 8192] : torch.bfloat16
2 : 1048576 : layers.0.attention.wk.weight : [128, 8192] : torch.bfloat16
3 : 1048576 : layers.0.attention.wv.weight : [128, 8192] : torch.bfloat16
4 : 8388608 : layers.0.attention.wo.weight : [8192, 1024] : torch.bfloat16
5 : 29360128 : layers.0.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
6 : 29360128 : layers.0.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
7 : 29360128 : layers.0.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
8 : 8192 : layers.0.attention_norm.weight : [8192] : torch.bfloat16
9 : 8192 : layers.0.ffn_norm.weight : [8192] : torch.bfloat16
10 : 8388608 : layers.1.attention.wq.weight : [1024, 8192] : torch.bfloat16
11 : 1048576 : layers.1.attention.wk.weight : [128, 8192] : torch.bfloat16
12 : 1048576 : layers.1.attention.wv.weight : [128, 8192] : torch.bfloat16
13 : 8388608 : layers.1.attention.wo.weight : [8192, 1024] : torch.bfloat16
14 : 29360128 : layers.1.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
15 : 29360128 : layers.1.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
16 : 29360128 : layers.1.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
17 : 8192 : layers.1.attention_norm.weight : [8192] : torch.bfloat16
18 : 8192 : layers.1.ffn_norm.weight : [8192] : torch.bfloat16
19 : 8388608 : layers.2.attention.wq.weight : [1024, 8192] : torch.bfloat16
20 : 1048576 : layers.2.attention.wk.weight : [128, 8192] : torch.bfloat16
21 : 1048576 : layers.2.attention.wv.weight : [128, 8192] : torch.bfloat16
22 : 8388608 : layers.2.attention.wo.weight : [8192, 1024] : torch.bfloat16
23 : 29360128 : layers.2.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
24 : 29360128 : layers.2.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
25 : 29360128 : layers.2.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
26 : 8192 : layers.2.attention_norm.weight : [8192] : torch.bfloat16
27 : 8192 : layers.2.ffn_norm.weight : [8192] : torch.bfloat16
28 : 8388608 : layers.3.attention.wq.weight : [1024, 8192] : torch.bfloat16
29 : 1048576 : layers.3.attention.wk.weight : [128, 8192] : torch.bfloat16
30 : 1048576 : layers.3.attention.wv.weight : [128, 8192] : torch.bfloat16
31 : 8388608 : layers.3.attention.wo.weight : [8192, 1024] : torch.bfloat16
32 : 29360128 : layers.3.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
33 : 29360128 : layers.3.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
34 : 29360128 : layers.3.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
35 : 8192 : layers.3.attention_norm.weight : [8192] : torch.bfloat16
36 : 8192 : layers.3.ffn_norm.weight : [8192] : torch.bfloat16
37 : 8388608 : layers.4.attention.wq.weight : [1024, 8192] : torch.bfloat16
38 : 1048576 : layers.4.attention.wk.weight : [128, 8192] : torch.bfloat16
39 : 1048576 : layers.4.attention.wv.weight : [128, 8192] : torch.bfloat16
40 : 8388608 : layers.4.attention.wo.weight : [8192, 1024] : torch.bfloat16
41 : 29360128 : layers.4.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
42 : 29360128 : layers.4.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
43 : 29360128 : layers.4.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
44 : 8192 : layers.4.attention_norm.weight : [8192] : torch.bfloat16
45 : 8192 : layers.4.ffn_norm.weight : [8192] : torch.bfloat16
46 : 8388608 : layers.5.attention.wq.weight : [1024, 8192] : torch.bfloat16
47 : 1048576 : layers.5.attention.wk.weight : [128, 8192] : torch.bfloat16
48 : 1048576 : layers.5.attention.wv.weight : [128, 8192] : torch.bfloat16
49 : 8388608 : layers.5.attention.wo.weight : [8192, 1024] : torch.bfloat16
50 : 29360128 : layers.5.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
51 : 29360128 : layers.5.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
52 : 29360128 : layers.5.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
53 : 8192 : layers.5.attention_norm.weight : [8192] : torch.bfloat16
54 : 8192 : layers.5.ffn_norm.weight : [8192] : torch.bfloat16
55 : 8388608 : layers.6.attention.wq.weight : [1024, 8192] : torch.bfloat16
56 : 1048576 : layers.6.attention.wk.weight : [128, 8192] : torch.bfloat16
57 : 1048576 : layers.6.attention.wv.weight : [128, 8192] : torch.bfloat16
58 : 8388608 : layers.6.attention.wo.weight : [8192, 1024] : torch.bfloat16
59 : 29360128 : layers.6.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
60 : 29360128 : layers.6.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
61 : 29360128 : layers.6.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
62 : 8192 : layers.6.attention_norm.weight : [8192] : torch.bfloat16
63 : 8192 : layers.6.ffn_norm.weight : [8192] : torch.bfloat16
64 : 8388608 : layers.7.attention.wq.weight : [1024, 8192] : torch.bfloat16
65 : 1048576 : layers.7.attention.wk.weight : [128, 8192] : torch.bfloat16
66 : 1048576 : layers.7.attention.wv.weight : [128, 8192] : torch.bfloat16
67 : 8388608 : layers.7.attention.wo.weight : [8192, 1024] : torch.bfloat16
68 : 29360128 : layers.7.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
69 : 29360128 : layers.7.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
70 : 29360128 : layers.7.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
71 : 8192 : layers.7.attention_norm.weight : [8192] : torch.bfloat16
72 : 8192 : layers.7.ffn_norm.weight : [8192] : torch.bfloat16
73 : 8388608 : layers.8.attention.wq.weight : [1024, 8192] : torch.bfloat16
74 : 1048576 : layers.8.attention.wk.weight : [128, 8192] : torch.bfloat16
75 : 1048576 : layers.8.attention.wv.weight : [128, 8192] : torch.bfloat16
76 : 8388608 : layers.8.attention.wo.weight : [8192, 1024] : torch.bfloat16
77 : 29360128 : layers.8.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
78 : 29360128 : layers.8.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
79 : 29360128 : layers.8.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
80 : 8192 : layers.8.attention_norm.weight : [8192] : torch.bfloat16
81 : 8192 : layers.8.ffn_norm.weight : [8192] : torch.bfloat16
82 : 8388608 : layers.9.attention.wq.weight : [1024, 8192] : torch.bfloat16
83 : 1048576 : layers.9.attention.wk.weight : [128, 8192] : torch.bfloat16
84 : 1048576 : layers.9.attention.wv.weight : [128, 8192] : torch.bfloat16
85 : 8388608 : layers.9.attention.wo.weight : [8192, 1024] : torch.bfloat16
86 : 29360128 : layers.9.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
87 : 29360128 : layers.9.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
88 : 29360128 : layers.9.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
89 : 8192 : layers.9.attention_norm.weight : [8192] : torch.bfloat16
90 : 8192 : layers.9.ffn_norm.weight : [8192] : torch.bfloat16
91 : 8388608 : layers.10.attention.wq.weight : [1024, 8192] : torch.bfloat16
92 : 1048576 : layers.10.attention.wk.weight : [128, 8192] : torch.bfloat16
93 : 1048576 : layers.10.attention.wv.weight : [128, 8192] : torch.bfloat16
94 : 8388608 : layers.10.attention.wo.weight : [8192, 1024] : torch.bfloat16
95 : 29360128 : layers.10.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
96 : 29360128 : layers.10.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
97 : 29360128 : layers.10.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
98 : 8192 : layers.10.attention_norm.weight : [8192] : torch.bfloat16
99 : 8192 : layers.10.ffn_norm.weight : [8192] : torch.bfloat16
100 : 8388608 : layers.11.attention.wq.weight : [1024, 8192] : torch.bfloat16
101 : 1048576 : layers.11.attention.wk.weight : [128, 8192] : torch.bfloat16
102 : 1048576 : layers.11.attention.wv.weight : [128, 8192] : torch.bfloat16
103 : 8388608 : layers.11.attention.wo.weight : [8192, 1024] : torch.bfloat16
104 : 29360128 : layers.11.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
105 : 29360128 : layers.11.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
106 : 29360128 : layers.11.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
107 : 8192 : layers.11.attention_norm.weight : [8192] : torch.bfloat16
108 : 8192 : layers.11.ffn_norm.weight : [8192] : torch.bfloat16
109 : 8388608 : layers.12.attention.wq.weight : [1024, 8192] : torch.bfloat16
110 : 1048576 : layers.12.attention.wk.weight : [128, 8192] : torch.bfloat16
111 : 1048576 : layers.12.attention.wv.weight : [128, 8192] : torch.bfloat16
112 : 8388608 : layers.12.attention.wo.weight : [8192, 1024] : torch.bfloat16
113 : 29360128 : layers.12.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
114 : 29360128 : layers.12.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
115 : 29360128 : layers.12.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
116 : 8192 : layers.12.attention_norm.weight : [8192] : torch.bfloat16
117 : 8192 : layers.12.ffn_norm.weight : [8192] : torch.bfloat16
118 : 8388608 : layers.13.attention.wq.weight : [1024, 8192] : torch.bfloat16
119 : 1048576 : layers.13.attention.wk.weight : [128, 8192] : torch.bfloat16
120 : 1048576 : layers.13.attention.wv.weight : [128, 8192] : torch.bfloat16
121 : 8388608 : layers.13.attention.wo.weight : [8192, 1024] : torch.bfloat16
122 : 29360128 : layers.13.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
123 : 29360128 : layers.13.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
124 : 29360128 : layers.13.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
125 : 8192 : layers.13.attention_norm.weight : [8192] : torch.bfloat16
126 : 8192 : layers.13.ffn_norm.weight : [8192] : torch.bfloat16
127 : 8388608 : layers.14.attention.wq.weight : [1024, 8192] : torch.bfloat16
128 : 1048576 : layers.14.attention.wk.weight : [128, 8192] : torch.bfloat16
129 : 1048576 : layers.14.attention.wv.weight : [128, 8192] : torch.bfloat16
130 : 8388608 : layers.14.attention.wo.weight : [8192, 1024] : torch.bfloat16
131 : 29360128 : layers.14.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
132 : 29360128 : layers.14.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
133 : 29360128 : layers.14.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
134 : 8192 : layers.14.attention_norm.weight : [8192] : torch.bfloat16
135 : 8192 : layers.14.ffn_norm.weight : [8192] : torch.bfloat16
136 : 8388608 : layers.15.attention.wq.weight : [1024, 8192] : torch.bfloat16
137 : 1048576 : layers.15.attention.wk.weight : [128, 8192] : torch.bfloat16
138 : 1048576 : layers.15.attention.wv.weight : [128, 8192] : torch.bfloat16
139 : 8388608 : layers.15.attention.wo.weight : [8192, 1024] : torch.bfloat16
140 : 29360128 : layers.15.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
141 : 29360128 : layers.15.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
142 : 29360128 : layers.15.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
143 : 8192 : layers.15.attention_norm.weight : [8192] : torch.bfloat16
144 : 8192 : layers.15.ffn_norm.weight : [8192] : torch.bfloat16
145 : 8388608 : layers.16.attention.wq.weight : [1024, 8192] : torch.bfloat16
146 : 1048576 : layers.16.attention.wk.weight : [128, 8192] : torch.bfloat16
147 : 1048576 : layers.16.attention.wv.weight : [128, 8192] : torch.bfloat16
148 : 8388608 : layers.16.attention.wo.weight : [8192, 1024] : torch.bfloat16
149 : 29360128 : layers.16.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
150 : 29360128 : layers.16.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
151 : 29360128 : layers.16.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
152 : 8192 : layers.16.attention_norm.weight : [8192] : torch.bfloat16
153 : 8192 : layers.16.ffn_norm.weight : [8192] : torch.bfloat16
154 : 8388608 : layers.17.attention.wq.weight : [1024, 8192] : torch.bfloat16
155 : 1048576 : layers.17.attention.wk.weight : [128, 8192] : torch.bfloat16
156 : 1048576 : layers.17.attention.wv.weight : [128, 8192] : torch.bfloat16
157 : 8388608 : layers.17.attention.wo.weight : [8192, 1024] : torch.bfloat16
158 : 29360128 : layers.17.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
159 : 29360128 : layers.17.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
160 : 29360128 : layers.17.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
161 : 8192 : layers.17.attention_norm.weight : [8192] : torch.bfloat16
162 : 8192 : layers.17.ffn_norm.weight : [8192] : torch.bfloat16
163 : 8388608 : layers.18.attention.wq.weight : [1024, 8192] : torch.bfloat16
164 : 1048576 : layers.18.attention.wk.weight : [128, 8192] : torch.bfloat16
165 : 1048576 : layers.18.attention.wv.weight : [128, 8192] : torch.bfloat16
166 : 8388608 : layers.18.attention.wo.weight : [8192, 1024] : torch.bfloat16
167 : 29360128 : layers.18.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
168 : 29360128 : layers.18.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
169 : 29360128 : layers.18.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
170 : 8192 : layers.18.attention_norm.weight : [8192] : torch.bfloat16
171 : 8192 : layers.18.ffn_norm.weight : [8192] : torch.bfloat16
172 : 8388608 : layers.19.attention.wq.weight : [1024, 8192] : torch.bfloat16
173 : 1048576 : layers.19.attention.wk.weight : [128, 8192] : torch.bfloat16
174 : 1048576 : layers.19.attention.wv.weight : [128, 8192] : torch.bfloat16
175 : 8388608 : layers.19.attention.wo.weight : [8192, 1024] : torch.bfloat16
176 : 29360128 : layers.19.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
177 : 29360128 : layers.19.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
178 : 29360128 : layers.19.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
179 : 8192 : layers.19.attention_norm.weight : [8192] : torch.bfloat16
180 : 8192 : layers.19.ffn_norm.weight : [8192] : torch.bfloat16
181 : 8388608 : layers.20.attention.wq.weight : [1024, 8192] : torch.bfloat16
182 : 1048576 : layers.20.attention.wk.weight : [128, 8192] : torch.bfloat16
183 : 1048576 : layers.20.attention.wv.weight : [128, 8192] : torch.bfloat16
184 : 8388608 : layers.20.attention.wo.weight : [8192, 1024] : torch.bfloat16
185 : 29360128 : layers.20.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
186 : 29360128 : layers.20.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
187 : 29360128 : layers.20.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
188 : 8192 : layers.20.attention_norm.weight : [8192] : torch.bfloat16
189 : 8192 : layers.20.ffn_norm.weight : [8192] : torch.bfloat16
190 : 8388608 : layers.21.attention.wq.weight : [1024, 8192] : torch.bfloat16
191 : 1048576 : layers.21.attention.wk.weight : [128, 8192] : torch.bfloat16
192 : 1048576 : layers.21.attention.wv.weight : [128, 8192] : torch.bfloat16
193 : 8388608 : layers.21.attention.wo.weight : [8192, 1024] : torch.bfloat16
194 : 29360128 : layers.21.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
195 : 29360128 : layers.21.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
196 : 29360128 : layers.21.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
197 : 8192 : layers.21.attention_norm.weight : [8192] : torch.bfloat16
198 : 8192 : layers.21.ffn_norm.weight : [8192] : torch.bfloat16
199 : 8388608 : layers.22.attention.wq.weight : [1024, 8192] : torch.bfloat16
200 : 1048576 : layers.22.attention.wk.weight : [128, 8192] : torch.bfloat16
201 : 1048576 : layers.22.attention.wv.weight : [128, 8192] : torch.bfloat16
202 : 8388608 : layers.22.attention.wo.weight : [8192, 1024] : torch.bfloat16
203 : 29360128 : layers.22.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
204 : 29360128 : layers.22.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
205 : 29360128 : layers.22.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
206 : 8192 : layers.22.attention_norm.weight : [8192] : torch.bfloat16
207 : 8192 : layers.22.ffn_norm.weight : [8192] : torch.bfloat16
208 : 8388608 : layers.23.attention.wq.weight : [1024, 8192] : torch.bfloat16
209 : 1048576 : layers.23.attention.wk.weight : [128, 8192] : torch.bfloat16
210 : 1048576 : layers.23.attention.wv.weight : [128, 8192] : torch.bfloat16
211 : 8388608 : layers.23.attention.wo.weight : [8192, 1024] : torch.bfloat16
212 : 29360128 : layers.23.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
213 : 29360128 : layers.23.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
214 : 29360128 : layers.23.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
215 : 8192 : layers.23.attention_norm.weight : [8192] : torch.bfloat16
216 : 8192 : layers.23.ffn_norm.weight : [8192] : torch.bfloat16
217 : 8388608 : layers.24.attention.wq.weight : [1024, 8192] : torch.bfloat16
218 : 1048576 : layers.24.attention.wk.weight : [128, 8192] : torch.bfloat16
219 : 1048576 : layers.24.attention.wv.weight : [128, 8192] : torch.bfloat16
220 : 8388608 : layers.24.attention.wo.weight : [8192, 1024] : torch.bfloat16
221 : 29360128 : layers.24.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
222 : 29360128 : layers.24.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
223 : 29360128 : layers.24.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
224 : 8192 : layers.24.attention_norm.weight : [8192] : torch.bfloat16
225 : 8192 : layers.24.ffn_norm.weight : [8192] : torch.bfloat16
226 : 8388608 : layers.25.attention.wq.weight : [1024, 8192] : torch.bfloat16
227 : 1048576 : layers.25.attention.wk.weight : [128, 8192] : torch.bfloat16
228 : 1048576 : layers.25.attention.wv.weight : [128, 8192] : torch.bfloat16
229 : 8388608 : layers.25.attention.wo.weight : [8192, 1024] : torch.bfloat16
230 : 29360128 : layers.25.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
231 : 29360128 : layers.25.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
232 : 29360128 : layers.25.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
233 : 8192 : layers.25.attention_norm.weight : [8192] : torch.bfloat16
234 : 8192 : layers.25.ffn_norm.weight : [8192] : torch.bfloat16
235 : 8388608 : layers.26.attention.wq.weight : [1024, 8192] : torch.bfloat16
236 : 1048576 : layers.26.attention.wk.weight : [128, 8192] : torch.bfloat16
237 : 1048576 : layers.26.attention.wv.weight : [128, 8192] : torch.bfloat16
238 : 8388608 : layers.26.attention.wo.weight : [8192, 1024] : torch.bfloat16
239 : 29360128 : layers.26.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
240 : 29360128 : layers.26.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
241 : 29360128 : layers.26.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
242 : 8192 : layers.26.attention_norm.weight : [8192] : torch.bfloat16
243 : 8192 : layers.26.ffn_norm.weight : [8192] : torch.bfloat16
244 : 8388608 : layers.27.attention.wq.weight : [1024, 8192] : torch.bfloat16
245 : 1048576 : layers.27.attention.wk.weight : [128, 8192] : torch.bfloat16
246 : 1048576 : layers.27.attention.wv.weight : [128, 8192] : torch.bfloat16
247 : 8388608 : layers.27.attention.wo.weight : [8192, 1024] : torch.bfloat16
248 : 29360128 : layers.27.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
249 : 29360128 : layers.27.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
250 : 29360128 : layers.27.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
251 : 8192 : layers.27.attention_norm.weight : [8192] : torch.bfloat16
252 : 8192 : layers.27.ffn_norm.weight : [8192] : torch.bfloat16
253 : 8388608 : layers.28.attention.wq.weight : [1024, 8192] : torch.bfloat16
254 : 1048576 : layers.28.attention.wk.weight : [128, 8192] : torch.bfloat16
255 : 1048576 : layers.28.attention.wv.weight : [128, 8192] : torch.bfloat16
256 : 8388608 : layers.28.attention.wo.weight : [8192, 1024] : torch.bfloat16
257 : 29360128 : layers.28.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
258 : 29360128 : layers.28.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
259 : 29360128 : layers.28.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
260 : 8192 : layers.28.attention_norm.weight : [8192] : torch.bfloat16
261 : 8192 : layers.28.ffn_norm.weight : [8192] : torch.bfloat16
262 : 8388608 : layers.29.attention.wq.weight : [1024, 8192] : torch.bfloat16
263 : 1048576 : layers.29.attention.wk.weight : [128, 8192] : torch.bfloat16
264 : 1048576 : layers.29.attention.wv.weight : [128, 8192] : torch.bfloat16
265 : 8388608 : layers.29.attention.wo.weight : [8192, 1024] : torch.bfloat16
266 : 29360128 : layers.29.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
267 : 29360128 : layers.29.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
268 : 29360128 : layers.29.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
269 : 8192 : layers.29.attention_norm.weight : [8192] : torch.bfloat16
270 : 8192 : layers.29.ffn_norm.weight : [8192] : torch.bfloat16
271 : 8388608 : layers.30.attention.wq.weight : [1024, 8192] : torch.bfloat16
272 : 1048576 : layers.30.attention.wk.weight : [128, 8192] : torch.bfloat16
273 : 1048576 : layers.30.attention.wv.weight : [128, 8192] : torch.bfloat16
274 : 8388608 : layers.30.attention.wo.weight : [8192, 1024] : torch.bfloat16
275 : 29360128 : layers.30.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
276 : 29360128 : layers.30.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
277 : 29360128 : layers.30.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
278 : 8192 : layers.30.attention_norm.weight : [8192] : torch.bfloat16
279 : 8192 : layers.30.ffn_norm.weight : [8192] : torch.bfloat16
280 : 8388608 : layers.31.attention.wq.weight : [1024, 8192] : torch.bfloat16
281 : 1048576 : layers.31.attention.wk.weight : [128, 8192] : torch.bfloat16
282 : 1048576 : layers.31.attention.wv.weight : [128, 8192] : torch.bfloat16
283 : 8388608 : layers.31.attention.wo.weight : [8192, 1024] : torch.bfloat16
284 : 29360128 : layers.31.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
285 : 29360128 : layers.31.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
286 : 29360128 : layers.31.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
287 : 8192 : layers.31.attention_norm.weight : [8192] : torch.bfloat16
288 : 8192 : layers.31.ffn_norm.weight : [8192] : torch.bfloat16
289 : 8388608 : layers.32.attention.wq.weight : [1024, 8192] : torch.bfloat16
290 : 1048576 : layers.32.attention.wk.weight : [128, 8192] : torch.bfloat16
291 : 1048576 : layers.32.attention.wv.weight : [128, 8192] : torch.bfloat16
292 : 8388608 : layers.32.attention.wo.weight : [8192, 1024] : torch.bfloat16
293 : 29360128 : layers.32.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
294 : 29360128 : layers.32.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
295 : 29360128 : layers.32.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
296 : 8192 : layers.32.attention_norm.weight : [8192] : torch.bfloat16
297 : 8192 : layers.32.ffn_norm.weight : [8192] : torch.bfloat16
298 : 8388608 : layers.33.attention.wq.weight : [1024, 8192] : torch.bfloat16
299 : 1048576 : layers.33.attention.wk.weight : [128, 8192] : torch.bfloat16
300 : 1048576 : layers.33.attention.wv.weight : [128, 8192] : torch.bfloat16
301 : 8388608 : layers.33.attention.wo.weight : [8192, 1024] : torch.bfloat16
302 : 29360128 : layers.33.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
303 : 29360128 : layers.33.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
304 : 29360128 : layers.33.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
305 : 8192 : layers.33.attention_norm.weight : [8192] : torch.bfloat16
306 : 8192 : layers.33.ffn_norm.weight : [8192] : torch.bfloat16
307 : 8388608 : layers.34.attention.wq.weight : [1024, 8192] : torch.bfloat16
308 : 1048576 : layers.34.attention.wk.weight : [128, 8192] : torch.bfloat16
309 : 1048576 : layers.34.attention.wv.weight : [128, 8192] : torch.bfloat16
310 : 8388608 : layers.34.attention.wo.weight : [8192, 1024] : torch.bfloat16
311 : 29360128 : layers.34.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
312 : 29360128 : layers.34.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
313 : 29360128 : layers.34.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
314 : 8192 : layers.34.attention_norm.weight : [8192] : torch.bfloat16
315 : 8192 : layers.34.ffn_norm.weight : [8192] : torch.bfloat16
316 : 8388608 : layers.35.attention.wq.weight : [1024, 8192] : torch.bfloat16
317 : 1048576 : layers.35.attention.wk.weight : [128, 8192] : torch.bfloat16
318 : 1048576 : layers.35.attention.wv.weight : [128, 8192] : torch.bfloat16
319 : 8388608 : layers.35.attention.wo.weight : [8192, 1024] : torch.bfloat16
320 : 29360128 : layers.35.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
321 : 29360128 : layers.35.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
322 : 29360128 : layers.35.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
323 : 8192 : layers.35.attention_norm.weight : [8192] : torch.bfloat16
324 : 8192 : layers.35.ffn_norm.weight : [8192] : torch.bfloat16
325 : 8388608 : layers.36.attention.wq.weight : [1024, 8192] : torch.bfloat16
326 : 1048576 : layers.36.attention.wk.weight : [128, 8192] : torch.bfloat16
327 : 1048576 : layers.36.attention.wv.weight : [128, 8192] : torch.bfloat16
328 : 8388608 : layers.36.attention.wo.weight : [8192, 1024] : torch.bfloat16
329 : 29360128 : layers.36.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
330 : 29360128 : layers.36.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
331 : 29360128 : layers.36.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
332 : 8192 : layers.36.attention_norm.weight : [8192] : torch.bfloat16
333 : 8192 : layers.36.ffn_norm.weight : [8192] : torch.bfloat16
334 : 8388608 : layers.37.attention.wq.weight : [1024, 8192] : torch.bfloat16
335 : 1048576 : layers.37.attention.wk.weight : [128, 8192] : torch.bfloat16
336 : 1048576 : layers.37.attention.wv.weight : [128, 8192] : torch.bfloat16
337 : 8388608 : layers.37.attention.wo.weight : [8192, 1024] : torch.bfloat16
338 : 29360128 : layers.37.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
339 : 29360128 : layers.37.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
340 : 29360128 : layers.37.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
341 : 8192 : layers.37.attention_norm.weight : [8192] : torch.bfloat16
342 : 8192 : layers.37.ffn_norm.weight : [8192] : torch.bfloat16
343 : 8388608 : layers.38.attention.wq.weight : [1024, 8192] : torch.bfloat16
344 : 1048576 : layers.38.attention.wk.weight : [128, 8192] : torch.bfloat16
345 : 1048576 : layers.38.attention.wv.weight : [128, 8192] : torch.bfloat16
346 : 8388608 : layers.38.attention.wo.weight : [8192, 1024] : torch.bfloat16
347 : 29360128 : layers.38.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
348 : 29360128 : layers.38.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
349 : 29360128 : layers.38.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
350 : 8192 : layers.38.attention_norm.weight : [8192] : torch.bfloat16
351 : 8192 : layers.38.ffn_norm.weight : [8192] : torch.bfloat16
352 : 8388608 : layers.39.attention.wq.weight : [1024, 8192] : torch.bfloat16
353 : 1048576 : layers.39.attention.wk.weight : [128, 8192] : torch.bfloat16
354 : 1048576 : layers.39.attention.wv.weight : [128, 8192] : torch.bfloat16
355 : 8388608 : layers.39.attention.wo.weight : [8192, 1024] : torch.bfloat16
356 : 29360128 : layers.39.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
357 : 29360128 : layers.39.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
358 : 29360128 : layers.39.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
359 : 8192 : layers.39.attention_norm.weight : [8192] : torch.bfloat16
360 : 8192 : layers.39.ffn_norm.weight : [8192] : torch.bfloat16
361 : 8388608 : layers.40.attention.wq.weight : [1024, 8192] : torch.bfloat16
362 : 1048576 : layers.40.attention.wk.weight : [128, 8192] : torch.bfloat16
363 : 1048576 : layers.40.attention.wv.weight : [128, 8192] : torch.bfloat16
364 : 8388608 : layers.40.attention.wo.weight : [8192, 1024] : torch.bfloat16
365 : 29360128 : layers.40.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
366 : 29360128 : layers.40.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
367 : 29360128 : layers.40.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
368 : 8192 : layers.40.attention_norm.weight : [8192] : torch.bfloat16
369 : 8192 : layers.40.ffn_norm.weight : [8192] : torch.bfloat16
370 : 8388608 : layers.41.attention.wq.weight : [1024, 8192] : torch.bfloat16
371 : 1048576 : layers.41.attention.wk.weight : [128, 8192] : torch.bfloat16
372 : 1048576 : layers.41.attention.wv.weight : [128, 8192] : torch.bfloat16
373 : 8388608 : layers.41.attention.wo.weight : [8192, 1024] : torch.bfloat16
374 : 29360128 : layers.41.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
375 : 29360128 : layers.41.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
376 : 29360128 : layers.41.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
377 : 8192 : layers.41.attention_norm.weight : [8192] : torch.bfloat16
378 : 8192 : layers.41.ffn_norm.weight : [8192] : torch.bfloat16
379 : 8388608 : layers.42.attention.wq.weight : [1024, 8192] : torch.bfloat16
380 : 1048576 : layers.42.attention.wk.weight : [128, 8192] : torch.bfloat16
381 : 1048576 : layers.42.attention.wv.weight : [128, 8192] : torch.bfloat16
382 : 8388608 : layers.42.attention.wo.weight : [8192, 1024] : torch.bfloat16
383 : 29360128 : layers.42.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
384 : 29360128 : layers.42.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
385 : 29360128 : layers.42.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
386 : 8192 : layers.42.attention_norm.weight : [8192] : torch.bfloat16
387 : 8192 : layers.42.ffn_norm.weight : [8192] : torch.bfloat16
388 : 8388608 : layers.43.attention.wq.weight : [1024, 8192] : torch.bfloat16
389 : 1048576 : layers.43.attention.wk.weight : [128, 8192] : torch.bfloat16
390 : 1048576 : layers.43.attention.wv.weight : [128, 8192] : torch.bfloat16
391 : 8388608 : layers.43.attention.wo.weight : [8192, 1024] : torch.bfloat16
392 : 29360128 : layers.43.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
393 : 29360128 : layers.43.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
394 : 29360128 : layers.43.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
395 : 8192 : layers.43.attention_norm.weight : [8192] : torch.bfloat16
396 : 8192 : layers.43.ffn_norm.weight : [8192] : torch.bfloat16
397 : 8388608 : layers.44.attention.wq.weight : [1024, 8192] : torch.bfloat16
398 : 1048576 : layers.44.attention.wk.weight : [128, 8192] : torch.bfloat16
399 : 1048576 : layers.44.attention.wv.weight : [128, 8192] : torch.bfloat16
400 : 8388608 : layers.44.attention.wo.weight : [8192, 1024] : torch.bfloat16
401 : 29360128 : layers.44.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
402 : 29360128 : layers.44.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
403 : 29360128 : layers.44.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
404 : 8192 : layers.44.attention_norm.weight : [8192] : torch.bfloat16
405 : 8192 : layers.44.ffn_norm.weight : [8192] : torch.bfloat16
406 : 8388608 : layers.45.attention.wq.weight : [1024, 8192] : torch.bfloat16
407 : 1048576 : layers.45.attention.wk.weight : [128, 8192] : torch.bfloat16
408 : 1048576 : layers.45.attention.wv.weight : [128, 8192] : torch.bfloat16
409 : 8388608 : layers.45.attention.wo.weight : [8192, 1024] : torch.bfloat16
410 : 29360128 : layers.45.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
411 : 29360128 : layers.45.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
412 : 29360128 : layers.45.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
413 : 8192 : layers.45.attention_norm.weight : [8192] : torch.bfloat16
414 : 8192 : layers.45.ffn_norm.weight : [8192] : torch.bfloat16
415 : 8388608 : layers.46.attention.wq.weight : [1024, 8192] : torch.bfloat16
416 : 1048576 : layers.46.attention.wk.weight : [128, 8192] : torch.bfloat16
417 : 1048576 : layers.46.attention.wv.weight : [128, 8192] : torch.bfloat16
418 : 8388608 : layers.46.attention.wo.weight : [8192, 1024] : torch.bfloat16
419 : 29360128 : layers.46.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
420 : 29360128 : layers.46.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
421 : 29360128 : layers.46.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
422 : 8192 : layers.46.attention_norm.weight : [8192] : torch.bfloat16
423 : 8192 : layers.46.ffn_norm.weight : [8192] : torch.bfloat16
424 : 8388608 : layers.47.attention.wq.weight : [1024, 8192] : torch.bfloat16
425 : 1048576 : layers.47.attention.wk.weight : [128, 8192] : torch.bfloat16
426 : 1048576 : layers.47.attention.wv.weight : [128, 8192] : torch.bfloat16
427 : 8388608 : layers.47.attention.wo.weight : [8192, 1024] : torch.bfloat16
428 : 29360128 : layers.47.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
429 : 29360128 : layers.47.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
430 : 29360128 : layers.47.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
431 : 8192 : layers.47.attention_norm.weight : [8192] : torch.bfloat16
432 : 8192 : layers.47.ffn_norm.weight : [8192] : torch.bfloat16
433 : 8388608 : layers.48.attention.wq.weight : [1024, 8192] : torch.bfloat16
434 : 1048576 : layers.48.attention.wk.weight : [128, 8192] : torch.bfloat16
435 : 1048576 : layers.48.attention.wv.weight : [128, 8192] : torch.bfloat16
436 : 8388608 : layers.48.attention.wo.weight : [8192, 1024] : torch.bfloat16
437 : 29360128 : layers.48.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
438 : 29360128 : layers.48.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
439 : 29360128 : layers.48.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
440 : 8192 : layers.48.attention_norm.weight : [8192] : torch.bfloat16
441 : 8192 : layers.48.ffn_norm.weight : [8192] : torch.bfloat16
442 : 8388608 : layers.49.attention.wq.weight : [1024, 8192] : torch.bfloat16
443 : 1048576 : layers.49.attention.wk.weight : [128, 8192] : torch.bfloat16
444 : 1048576 : layers.49.attention.wv.weight : [128, 8192] : torch.bfloat16
445 : 8388608 : layers.49.attention.wo.weight : [8192, 1024] : torch.bfloat16
446 : 29360128 : layers.49.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
447 : 29360128 : layers.49.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
448 : 29360128 : layers.49.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
449 : 8192 : layers.49.attention_norm.weight : [8192] : torch.bfloat16
450 : 8192 : layers.49.ffn_norm.weight : [8192] : torch.bfloat16
451 : 8388608 : layers.50.attention.wq.weight : [1024, 8192] : torch.bfloat16
452 : 1048576 : layers.50.attention.wk.weight : [128, 8192] : torch.bfloat16
453 : 1048576 : layers.50.attention.wv.weight : [128, 8192] : torch.bfloat16
454 : 8388608 : layers.50.attention.wo.weight : [8192, 1024] : torch.bfloat16
455 : 29360128 : layers.50.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
456 : 29360128 : layers.50.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
457 : 29360128 : layers.50.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
458 : 8192 : layers.50.attention_norm.weight : [8192] : torch.bfloat16
459 : 8192 : layers.50.ffn_norm.weight : [8192] : torch.bfloat16
460 : 8388608 : layers.51.attention.wq.weight : [1024, 8192] : torch.bfloat16
461 : 1048576 : layers.51.attention.wk.weight : [128, 8192] : torch.bfloat16
462 : 1048576 : layers.51.attention.wv.weight : [128, 8192] : torch.bfloat16
463 : 8388608 : layers.51.attention.wo.weight : [8192, 1024] : torch.bfloat16
464 : 29360128 : layers.51.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
465 : 29360128 : layers.51.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
466 : 29360128 : layers.51.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
467 : 8192 : layers.51.attention_norm.weight : [8192] : torch.bfloat16
468 : 8192 : layers.51.ffn_norm.weight : [8192] : torch.bfloat16
469 : 8388608 : layers.52.attention.wq.weight : [1024, 8192] : torch.bfloat16
470 : 1048576 : layers.52.attention.wk.weight : [128, 8192] : torch.bfloat16
471 : 1048576 : layers.52.attention.wv.weight : [128, 8192] : torch.bfloat16
472 : 8388608 : layers.52.attention.wo.weight : [8192, 1024] : torch.bfloat16
473 : 29360128 : layers.52.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
474 : 29360128 : layers.52.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
475 : 29360128 : layers.52.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
476 : 8192 : layers.52.attention_norm.weight : [8192] : torch.bfloat16
477 : 8192 : layers.52.ffn_norm.weight : [8192] : torch.bfloat16
478 : 8388608 : layers.53.attention.wq.weight : [1024, 8192] : torch.bfloat16
479 : 1048576 : layers.53.attention.wk.weight : [128, 8192] : torch.bfloat16
480 : 1048576 : layers.53.attention.wv.weight : [128, 8192] : torch.bfloat16
481 : 8388608 : layers.53.attention.wo.weight : [8192, 1024] : torch.bfloat16
482 : 29360128 : layers.53.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
483 : 29360128 : layers.53.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
484 : 29360128 : layers.53.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
485 : 8192 : layers.53.attention_norm.weight : [8192] : torch.bfloat16
486 : 8192 : layers.53.ffn_norm.weight : [8192] : torch.bfloat16
487 : 8388608 : layers.54.attention.wq.weight : [1024, 8192] : torch.bfloat16
488 : 1048576 : layers.54.attention.wk.weight : [128, 8192] : torch.bfloat16
489 : 1048576 : layers.54.attention.wv.weight : [128, 8192] : torch.bfloat16
490 : 8388608 : layers.54.attention.wo.weight : [8192, 1024] : torch.bfloat16
491 : 29360128 : layers.54.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
492 : 29360128 : layers.54.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
493 : 29360128 : layers.54.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
494 : 8192 : layers.54.attention_norm.weight : [8192] : torch.bfloat16
495 : 8192 : layers.54.ffn_norm.weight : [8192] : torch.bfloat16
496 : 8388608 : layers.55.attention.wq.weight : [1024, 8192] : torch.bfloat16
497 : 1048576 : layers.55.attention.wk.weight : [128, 8192] : torch.bfloat16
498 : 1048576 : layers.55.attention.wv.weight : [128, 8192] : torch.bfloat16
499 : 8388608 : layers.55.attention.wo.weight : [8192, 1024] : torch.bfloat16
500 : 29360128 : layers.55.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
501 : 29360128 : layers.55.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
502 : 29360128 : layers.55.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
503 : 8192 : layers.55.attention_norm.weight : [8192] : torch.bfloat16
504 : 8192 : layers.55.ffn_norm.weight : [8192] : torch.bfloat16
505 : 8388608 : layers.56.attention.wq.weight : [1024, 8192] : torch.bfloat16
506 : 1048576 : layers.56.attention.wk.weight : [128, 8192] : torch.bfloat16
507 : 1048576 : layers.56.attention.wv.weight : [128, 8192] : torch.bfloat16
508 : 8388608 : layers.56.attention.wo.weight : [8192, 1024] : torch.bfloat16
509 : 29360128 : layers.56.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
510 : 29360128 : layers.56.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
511 : 29360128 : layers.56.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
512 : 8192 : layers.56.attention_norm.weight : [8192] : torch.bfloat16
513 : 8192 : layers.56.ffn_norm.weight : [8192] : torch.bfloat16
514 : 8388608 : layers.57.attention.wq.weight : [1024, 8192] : torch.bfloat16
515 : 1048576 : layers.57.attention.wk.weight : [128, 8192] : torch.bfloat16
516 : 1048576 : layers.57.attention.wv.weight : [128, 8192] : torch.bfloat16
517 : 8388608 : layers.57.attention.wo.weight : [8192, 1024] : torch.bfloat16
518 : 29360128 : layers.57.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
519 : 29360128 : layers.57.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
520 : 29360128 : layers.57.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
521 : 8192 : layers.57.attention_norm.weight : [8192] : torch.bfloat16
522 : 8192 : layers.57.ffn_norm.weight : [8192] : torch.bfloat16
523 : 8388608 : layers.58.attention.wq.weight : [1024, 8192] : torch.bfloat16
524 : 1048576 : layers.58.attention.wk.weight : [128, 8192] : torch.bfloat16
525 : 1048576 : layers.58.attention.wv.weight : [128, 8192] : torch.bfloat16
526 : 8388608 : layers.58.attention.wo.weight : [8192, 1024] : torch.bfloat16
527 : 29360128 : layers.58.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
528 : 29360128 : layers.58.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
529 : 29360128 : layers.58.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
530 : 8192 : layers.58.attention_norm.weight : [8192] : torch.bfloat16
531 : 8192 : layers.58.ffn_norm.weight : [8192] : torch.bfloat16
532 : 8388608 : layers.59.attention.wq.weight : [1024, 8192] : torch.bfloat16
533 : 1048576 : layers.59.attention.wk.weight : [128, 8192] : torch.bfloat16
534 : 1048576 : layers.59.attention.wv.weight : [128, 8192] : torch.bfloat16
535 : 8388608 : layers.59.attention.wo.weight : [8192, 1024] : torch.bfloat16
536 : 29360128 : layers.59.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
537 : 29360128 : layers.59.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
538 : 29360128 : layers.59.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
539 : 8192 : layers.59.attention_norm.weight : [8192] : torch.bfloat16
540 : 8192 : layers.59.ffn_norm.weight : [8192] : torch.bfloat16
541 : 8388608 : layers.60.attention.wq.weight : [1024, 8192] : torch.bfloat16
542 : 1048576 : layers.60.attention.wk.weight : [128, 8192] : torch.bfloat16
543 : 1048576 : layers.60.attention.wv.weight : [128, 8192] : torch.bfloat16
544 : 8388608 : layers.60.attention.wo.weight : [8192, 1024] : torch.bfloat16
545 : 29360128 : layers.60.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
546 : 29360128 : layers.60.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
547 : 29360128 : layers.60.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
548 : 8192 : layers.60.attention_norm.weight : [8192] : torch.bfloat16
549 : 8192 : layers.60.ffn_norm.weight : [8192] : torch.bfloat16
550 : 8388608 : layers.61.attention.wq.weight : [1024, 8192] : torch.bfloat16
551 : 1048576 : layers.61.attention.wk.weight : [128, 8192] : torch.bfloat16
552 : 1048576 : layers.61.attention.wv.weight : [128, 8192] : torch.bfloat16
553 : 8388608 : layers.61.attention.wo.weight : [8192, 1024] : torch.bfloat16
554 : 29360128 : layers.61.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
555 : 29360128 : layers.61.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
556 : 29360128 : layers.61.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
557 : 8192 : layers.61.attention_norm.weight : [8192] : torch.bfloat16
558 : 8192 : layers.61.ffn_norm.weight : [8192] : torch.bfloat16
559 : 8388608 : layers.62.attention.wq.weight : [1024, 8192] : torch.bfloat16
560 : 1048576 : layers.62.attention.wk.weight : [128, 8192] : torch.bfloat16
561 : 1048576 : layers.62.attention.wv.weight : [128, 8192] : torch.bfloat16
562 : 8388608 : layers.62.attention.wo.weight : [8192, 1024] : torch.bfloat16
563 : 29360128 : layers.62.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
564 : 29360128 : layers.62.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
565 : 29360128 : layers.62.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
566 : 8192 : layers.62.attention_norm.weight : [8192] : torch.bfloat16
567 : 8192 : layers.62.ffn_norm.weight : [8192] : torch.bfloat16
568 : 8388608 : layers.63.attention.wq.weight : [1024, 8192] : torch.bfloat16
569 : 1048576 : layers.63.attention.wk.weight : [128, 8192] : torch.bfloat16
570 : 1048576 : layers.63.attention.wv.weight : [128, 8192] : torch.bfloat16
571 : 8388608 : layers.63.attention.wo.weight : [8192, 1024] : torch.bfloat16
572 : 29360128 : layers.63.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
573 : 29360128 : layers.63.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
574 : 29360128 : layers.63.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
575 : 8192 : layers.63.attention_norm.weight : [8192] : torch.bfloat16
576 : 8192 : layers.63.ffn_norm.weight : [8192] : torch.bfloat16
577 : 8388608 : layers.64.attention.wq.weight : [1024, 8192] : torch.bfloat16
578 : 1048576 : layers.64.attention.wk.weight : [128, 8192] : torch.bfloat16
579 : 1048576 : layers.64.attention.wv.weight : [128, 8192] : torch.bfloat16
580 : 8388608 : layers.64.attention.wo.weight : [8192, 1024] : torch.bfloat16
581 : 29360128 : layers.64.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
582 : 29360128 : layers.64.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
583 : 29360128 : layers.64.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
584 : 8192 : layers.64.attention_norm.weight : [8192] : torch.bfloat16
585 : 8192 : layers.64.ffn_norm.weight : [8192] : torch.bfloat16
586 : 8388608 : layers.65.attention.wq.weight : [1024, 8192] : torch.bfloat16
587 : 1048576 : layers.65.attention.wk.weight : [128, 8192] : torch.bfloat16
588 : 1048576 : layers.65.attention.wv.weight : [128, 8192] : torch.bfloat16
589 : 8388608 : layers.65.attention.wo.weight : [8192, 1024] : torch.bfloat16
590 : 29360128 : layers.65.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
591 : 29360128 : layers.65.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
592 : 29360128 : layers.65.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
593 : 8192 : layers.65.attention_norm.weight : [8192] : torch.bfloat16
594 : 8192 : layers.65.ffn_norm.weight : [8192] : torch.bfloat16
595 : 8388608 : layers.66.attention.wq.weight : [1024, 8192] : torch.bfloat16
596 : 1048576 : layers.66.attention.wk.weight : [128, 8192] : torch.bfloat16
597 : 1048576 : layers.66.attention.wv.weight : [128, 8192] : torch.bfloat16
598 : 8388608 : layers.66.attention.wo.weight : [8192, 1024] : torch.bfloat16
599 : 29360128 : layers.66.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
600 : 29360128 : layers.66.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
601 : 29360128 : layers.66.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
602 : 8192 : layers.66.attention_norm.weight : [8192] : torch.bfloat16
603 : 8192 : layers.66.ffn_norm.weight : [8192] : torch.bfloat16
604 : 8388608 : layers.67.attention.wq.weight : [1024, 8192] : torch.bfloat16
605 : 1048576 : layers.67.attention.wk.weight : [128, 8192] : torch.bfloat16
606 : 1048576 : layers.67.attention.wv.weight : [128, 8192] : torch.bfloat16
607 : 8388608 : layers.67.attention.wo.weight : [8192, 1024] : torch.bfloat16
608 : 29360128 : layers.67.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
609 : 29360128 : layers.67.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
610 : 29360128 : layers.67.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
611 : 8192 : layers.67.attention_norm.weight : [8192] : torch.bfloat16
612 : 8192 : layers.67.ffn_norm.weight : [8192] : torch.bfloat16
613 : 8388608 : layers.68.attention.wq.weight : [1024, 8192] : torch.bfloat16
614 : 1048576 : layers.68.attention.wk.weight : [128, 8192] : torch.bfloat16
615 : 1048576 : layers.68.attention.wv.weight : [128, 8192] : torch.bfloat16
616 : 8388608 : layers.68.attention.wo.weight : [8192, 1024] : torch.bfloat16
617 : 29360128 : layers.68.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
618 : 29360128 : layers.68.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
619 : 29360128 : layers.68.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
620 : 8192 : layers.68.attention_norm.weight : [8192] : torch.bfloat16
621 : 8192 : layers.68.ffn_norm.weight : [8192] : torch.bfloat16
622 : 8388608 : layers.69.attention.wq.weight : [1024, 8192] : torch.bfloat16
623 : 1048576 : layers.69.attention.wk.weight : [128, 8192] : torch.bfloat16
624 : 1048576 : layers.69.attention.wv.weight : [128, 8192] : torch.bfloat16
625 : 8388608 : layers.69.attention.wo.weight : [8192, 1024] : torch.bfloat16
626 : 29360128 : layers.69.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
627 : 29360128 : layers.69.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
628 : 29360128 : layers.69.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
629 : 8192 : layers.69.attention_norm.weight : [8192] : torch.bfloat16
630 : 8192 : layers.69.ffn_norm.weight : [8192] : torch.bfloat16
631 : 8388608 : layers.70.attention.wq.weight : [1024, 8192] : torch.bfloat16
632 : 1048576 : layers.70.attention.wk.weight : [128, 8192] : torch.bfloat16
633 : 1048576 : layers.70.attention.wv.weight : [128, 8192] : torch.bfloat16
634 : 8388608 : layers.70.attention.wo.weight : [8192, 1024] : torch.bfloat16
635 : 29360128 : layers.70.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
636 : 29360128 : layers.70.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
637 : 29360128 : layers.70.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
638 : 8192 : layers.70.attention_norm.weight : [8192] : torch.bfloat16
639 : 8192 : layers.70.ffn_norm.weight : [8192] : torch.bfloat16
640 : 8388608 : layers.71.attention.wq.weight : [1024, 8192] : torch.bfloat16
641 : 1048576 : layers.71.attention.wk.weight : [128, 8192] : torch.bfloat16
642 : 1048576 : layers.71.attention.wv.weight : [128, 8192] : torch.bfloat16
643 : 8388608 : layers.71.attention.wo.weight : [8192, 1024] : torch.bfloat16
644 : 29360128 : layers.71.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
645 : 29360128 : layers.71.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
646 : 29360128 : layers.71.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
647 : 8192 : layers.71.attention_norm.weight : [8192] : torch.bfloat16
648 : 8192 : layers.71.ffn_norm.weight : [8192] : torch.bfloat16
649 : 8388608 : layers.72.attention.wq.weight : [1024, 8192] : torch.bfloat16
650 : 1048576 : layers.72.attention.wk.weight : [128, 8192] : torch.bfloat16
651 : 1048576 : layers.72.attention.wv.weight : [128, 8192] : torch.bfloat16
652 : 8388608 : layers.72.attention.wo.weight : [8192, 1024] : torch.bfloat16
653 : 29360128 : layers.72.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
654 : 29360128 : layers.72.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
655 : 29360128 : layers.72.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
656 : 8192 : layers.72.attention_norm.weight : [8192] : torch.bfloat16
657 : 8192 : layers.72.ffn_norm.weight : [8192] : torch.bfloat16
658 : 8388608 : layers.73.attention.wq.weight : [1024, 8192] : torch.bfloat16
659 : 1048576 : layers.73.attention.wk.weight : [128, 8192] : torch.bfloat16
660 : 1048576 : layers.73.attention.wv.weight : [128, 8192] : torch.bfloat16
661 : 8388608 : layers.73.attention.wo.weight : [8192, 1024] : torch.bfloat16
662 : 29360128 : layers.73.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
663 : 29360128 : layers.73.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
664 : 29360128 : layers.73.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
665 : 8192 : layers.73.attention_norm.weight : [8192] : torch.bfloat16
666 : 8192 : layers.73.ffn_norm.weight : [8192] : torch.bfloat16
667 : 8388608 : layers.74.attention.wq.weight : [1024, 8192] : torch.bfloat16
668 : 1048576 : layers.74.attention.wk.weight : [128, 8192] : torch.bfloat16
669 : 1048576 : layers.74.attention.wv.weight : [128, 8192] : torch.bfloat16
670 : 8388608 : layers.74.attention.wo.weight : [8192, 1024] : torch.bfloat16
671 : 29360128 : layers.74.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
672 : 29360128 : layers.74.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
673 : 29360128 : layers.74.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
674 : 8192 : layers.74.attention_norm.weight : [8192] : torch.bfloat16
675 : 8192 : layers.74.ffn_norm.weight : [8192] : torch.bfloat16
676 : 8388608 : layers.75.attention.wq.weight : [1024, 8192] : torch.bfloat16
677 : 1048576 : layers.75.attention.wk.weight : [128, 8192] : torch.bfloat16
678 : 1048576 : layers.75.attention.wv.weight : [128, 8192] : torch.bfloat16
679 : 8388608 : layers.75.attention.wo.weight : [8192, 1024] : torch.bfloat16
680 : 29360128 : layers.75.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
681 : 29360128 : layers.75.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
682 : 29360128 : layers.75.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
683 : 8192 : layers.75.attention_norm.weight : [8192] : torch.bfloat16
684 : 8192 : layers.75.ffn_norm.weight : [8192] : torch.bfloat16
685 : 8388608 : layers.76.attention.wq.weight : [1024, 8192] : torch.bfloat16
686 : 1048576 : layers.76.attention.wk.weight : [128, 8192] : torch.bfloat16
687 : 1048576 : layers.76.attention.wv.weight : [128, 8192] : torch.bfloat16
688 : 8388608 : layers.76.attention.wo.weight : [8192, 1024] : torch.bfloat16
689 : 29360128 : layers.76.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
690 : 29360128 : layers.76.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
691 : 29360128 : layers.76.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
692 : 8192 : layers.76.attention_norm.weight : [8192] : torch.bfloat16
693 : 8192 : layers.76.ffn_norm.weight : [8192] : torch.bfloat16
694 : 8388608 : layers.77.attention.wq.weight : [1024, 8192] : torch.bfloat16
695 : 1048576 : layers.77.attention.wk.weight : [128, 8192] : torch.bfloat16
696 : 1048576 : layers.77.attention.wv.weight : [128, 8192] : torch.bfloat16
697 : 8388608 : layers.77.attention.wo.weight : [8192, 1024] : torch.bfloat16
698 : 29360128 : layers.77.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
699 : 29360128 : layers.77.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
700 : 29360128 : layers.77.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
701 : 8192 : layers.77.attention_norm.weight : [8192] : torch.bfloat16
702 : 8192 : layers.77.ffn_norm.weight : [8192] : torch.bfloat16
703 : 8388608 : layers.78.attention.wq.weight : [1024, 8192] : torch.bfloat16
704 : 1048576 : layers.78.attention.wk.weight : [128, 8192] : torch.bfloat16
705 : 1048576 : layers.78.attention.wv.weight : [128, 8192] : torch.bfloat16
706 : 8388608 : layers.78.attention.wo.weight : [8192, 1024] : torch.bfloat16
707 : 29360128 : layers.78.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
708 : 29360128 : layers.78.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
709 : 29360128 : layers.78.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
710 : 8192 : layers.78.attention_norm.weight : [8192] : torch.bfloat16
711 : 8192 : layers.78.ffn_norm.weight : [8192] : torch.bfloat16
712 : 8388608 : layers.79.attention.wq.weight : [1024, 8192] : torch.bfloat16
713 : 1048576 : layers.79.attention.wk.weight : [128, 8192] : torch.bfloat16
714 : 1048576 : layers.79.attention.wv.weight : [128, 8192] : torch.bfloat16
715 : 8388608 : layers.79.attention.wo.weight : [8192, 1024] : torch.bfloat16
716 : 29360128 : layers.79.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
717 : 29360128 : layers.79.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
718 : 29360128 : layers.79.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
719 : 8192 : layers.79.attention_norm.weight : [8192] : torch.bfloat16
720 : 8192 : layers.79.ffn_norm.weight : [8192] : torch.bfloat16
721 : 8192 : norm.weight : [8192] : torch.bfloat16
722 : 131334144 : output.weight : [16032, 8192] : torch.bfloat16
-----------------------------------------------------------------------------
44101836800 params in total.
88203673600 bytes in total.
30.09 sec, 5.48 sec, 2795.71 MB/s
-----------------------------------------------------------------------------
[6/8]: Loading original/consolidated.05.pth
0 : 131334144 : tok_embeddings.weight : [16032, 8192] : torch.bfloat16
1 : 8388608 : layers.0.attention.wq.weight : [1024, 8192] : torch.bfloat16
2 : 1048576 : layers.0.attention.wk.weight : [128, 8192] : torch.bfloat16
3 : 1048576 : layers.0.attention.wv.weight : [128, 8192] : torch.bfloat16
4 : 8388608 : layers.0.attention.wo.weight : [8192, 1024] : torch.bfloat16
5 : 29360128 : layers.0.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
6 : 29360128 : layers.0.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
7 : 29360128 : layers.0.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
8 : 8192 : layers.0.attention_norm.weight : [8192] : torch.bfloat16
9 : 8192 : layers.0.ffn_norm.weight : [8192] : torch.bfloat16
10 : 8388608 : layers.1.attention.wq.weight : [1024, 8192] : torch.bfloat16
11 : 1048576 : layers.1.attention.wk.weight : [128, 8192] : torch.bfloat16
12 : 1048576 : layers.1.attention.wv.weight : [128, 8192] : torch.bfloat16
13 : 8388608 : layers.1.attention.wo.weight : [8192, 1024] : torch.bfloat16
14 : 29360128 : layers.1.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
15 : 29360128 : layers.1.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
16 : 29360128 : layers.1.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
17 : 8192 : layers.1.attention_norm.weight : [8192] : torch.bfloat16
18 : 8192 : layers.1.ffn_norm.weight : [8192] : torch.bfloat16
19 : 8388608 : layers.2.attention.wq.weight : [1024, 8192] : torch.bfloat16
20 : 1048576 : layers.2.attention.wk.weight : [128, 8192] : torch.bfloat16
21 : 1048576 : layers.2.attention.wv.weight : [128, 8192] : torch.bfloat16
22 : 8388608 : layers.2.attention.wo.weight : [8192, 1024] : torch.bfloat16
23 : 29360128 : layers.2.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
24 : 29360128 : layers.2.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
25 : 29360128 : layers.2.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
26 : 8192 : layers.2.attention_norm.weight : [8192] : torch.bfloat16
27 : 8192 : layers.2.ffn_norm.weight : [8192] : torch.bfloat16
28 : 8388608 : layers.3.attention.wq.weight : [1024, 8192] : torch.bfloat16
29 : 1048576 : layers.3.attention.wk.weight : [128, 8192] : torch.bfloat16
30 : 1048576 : layers.3.attention.wv.weight : [128, 8192] : torch.bfloat16
31 : 8388608 : layers.3.attention.wo.weight : [8192, 1024] : torch.bfloat16
32 : 29360128 : layers.3.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
33 : 29360128 : layers.3.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
34 : 29360128 : layers.3.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
35 : 8192 : layers.3.attention_norm.weight : [8192] : torch.bfloat16
36 : 8192 : layers.3.ffn_norm.weight : [8192] : torch.bfloat16
37 : 8388608 : layers.4.attention.wq.weight : [1024, 8192] : torch.bfloat16
38 : 1048576 : layers.4.attention.wk.weight : [128, 8192] : torch.bfloat16
39 : 1048576 : layers.4.attention.wv.weight : [128, 8192] : torch.bfloat16
40 : 8388608 : layers.4.attention.wo.weight : [8192, 1024] : torch.bfloat16
41 : 29360128 : layers.4.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
42 : 29360128 : layers.4.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
43 : 29360128 : layers.4.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
44 : 8192 : layers.4.attention_norm.weight : [8192] : torch.bfloat16
45 : 8192 : layers.4.ffn_norm.weight : [8192] : torch.bfloat16
46 : 8388608 : layers.5.attention.wq.weight : [1024, 8192] : torch.bfloat16
47 : 1048576 : layers.5.attention.wk.weight : [128, 8192] : torch.bfloat16
48 : 1048576 : layers.5.attention.wv.weight : [128, 8192] : torch.bfloat16
49 : 8388608 : layers.5.attention.wo.weight : [8192, 1024] : torch.bfloat16
50 : 29360128 : layers.5.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
51 : 29360128 : layers.5.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
52 : 29360128 : layers.5.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
53 : 8192 : layers.5.attention_norm.weight : [8192] : torch.bfloat16
54 : 8192 : layers.5.ffn_norm.weight : [8192] : torch.bfloat16
55 : 8388608 : layers.6.attention.wq.weight : [1024, 8192] : torch.bfloat16
56 : 1048576 : layers.6.attention.wk.weight : [128, 8192] : torch.bfloat16
57 : 1048576 : layers.6.attention.wv.weight : [128, 8192] : torch.bfloat16
58 : 8388608 : layers.6.attention.wo.weight : [8192, 1024] : torch.bfloat16
59 : 29360128 : layers.6.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
60 : 29360128 : layers.6.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
61 : 29360128 : layers.6.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
62 : 8192 : layers.6.attention_norm.weight : [8192] : torch.bfloat16
63 : 8192 : layers.6.ffn_norm.weight : [8192] : torch.bfloat16
64 : 8388608 : layers.7.attention.wq.weight : [1024, 8192] : torch.bfloat16
65 : 1048576 : layers.7.attention.wk.weight : [128, 8192] : torch.bfloat16
66 : 1048576 : layers.7.attention.wv.weight : [128, 8192] : torch.bfloat16
67 : 8388608 : layers.7.attention.wo.weight : [8192, 1024] : torch.bfloat16
68 : 29360128 : layers.7.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
69 : 29360128 : layers.7.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
70 : 29360128 : layers.7.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
71 : 8192 : layers.7.attention_norm.weight : [8192] : torch.bfloat16
72 : 8192 : layers.7.ffn_norm.weight : [8192] : torch.bfloat16
73 : 8388608 : layers.8.attention.wq.weight : [1024, 8192] : torch.bfloat16
74 : 1048576 : layers.8.attention.wk.weight : [128, 8192] : torch.bfloat16
75 : 1048576 : layers.8.attention.wv.weight : [128, 8192] : torch.bfloat16
76 : 8388608 : layers.8.attention.wo.weight : [8192, 1024] : torch.bfloat16
77 : 29360128 : layers.8.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
78 : 29360128 : layers.8.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
79 : 29360128 : layers.8.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
80 : 8192 : layers.8.attention_norm.weight : [8192] : torch.bfloat16
81 : 8192 : layers.8.ffn_norm.weight : [8192] : torch.bfloat16
82 : 8388608 : layers.9.attention.wq.weight : [1024, 8192] : torch.bfloat16
83 : 1048576 : layers.9.attention.wk.weight : [128, 8192] : torch.bfloat16
84 : 1048576 : layers.9.attention.wv.weight : [128, 8192] : torch.bfloat16
85 : 8388608 : layers.9.attention.wo.weight : [8192, 1024] : torch.bfloat16
86 : 29360128 : layers.9.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
87 : 29360128 : layers.9.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
88 : 29360128 : layers.9.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
89 : 8192 : layers.9.attention_norm.weight : [8192] : torch.bfloat16
90 : 8192 : layers.9.ffn_norm.weight : [8192] : torch.bfloat16
91 : 8388608 : layers.10.attention.wq.weight : [1024, 8192] : torch.bfloat16
92 : 1048576 : layers.10.attention.wk.weight : [128, 8192] : torch.bfloat16
93 : 1048576 : layers.10.attention.wv.weight : [128, 8192] : torch.bfloat16
94 : 8388608 : layers.10.attention.wo.weight : [8192, 1024] : torch.bfloat16
95 : 29360128 : layers.10.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
96 : 29360128 : layers.10.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
97 : 29360128 : layers.10.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
98 : 8192 : layers.10.attention_norm.weight : [8192] : torch.bfloat16
99 : 8192 : layers.10.ffn_norm.weight : [8192] : torch.bfloat16
100 : 8388608 : layers.11.attention.wq.weight : [1024, 8192] : torch.bfloat16
101 : 1048576 : layers.11.attention.wk.weight : [128, 8192] : torch.bfloat16
102 : 1048576 : layers.11.attention.wv.weight : [128, 8192] : torch.bfloat16
103 : 8388608 : layers.11.attention.wo.weight : [8192, 1024] : torch.bfloat16
104 : 29360128 : layers.11.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
105 : 29360128 : layers.11.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
106 : 29360128 : layers.11.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
107 : 8192 : layers.11.attention_norm.weight : [8192] : torch.bfloat16
108 : 8192 : layers.11.ffn_norm.weight : [8192] : torch.bfloat16
109 : 8388608 : layers.12.attention.wq.weight : [1024, 8192] : torch.bfloat16
110 : 1048576 : layers.12.attention.wk.weight : [128, 8192] : torch.bfloat16
111 : 1048576 : layers.12.attention.wv.weight : [128, 8192] : torch.bfloat16
112 : 8388608 : layers.12.attention.wo.weight : [8192, 1024] : torch.bfloat16
113 : 29360128 : layers.12.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
114 : 29360128 : layers.12.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
115 : 29360128 : layers.12.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
116 : 8192 : layers.12.attention_norm.weight : [8192] : torch.bfloat16
117 : 8192 : layers.12.ffn_norm.weight : [8192] : torch.bfloat16
118 : 8388608 : layers.13.attention.wq.weight : [1024, 8192] : torch.bfloat16
119 : 1048576 : layers.13.attention.wk.weight : [128, 8192] : torch.bfloat16
120 : 1048576 : layers.13.attention.wv.weight : [128, 8192] : torch.bfloat16
121 : 8388608 : layers.13.attention.wo.weight : [8192, 1024] : torch.bfloat16
122 : 29360128 : layers.13.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
123 : 29360128 : layers.13.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
124 : 29360128 : layers.13.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
125 : 8192 : layers.13.attention_norm.weight : [8192] : torch.bfloat16
126 : 8192 : layers.13.ffn_norm.weight : [8192] : torch.bfloat16
127 : 8388608 : layers.14.attention.wq.weight : [1024, 8192] : torch.bfloat16
128 : 1048576 : layers.14.attention.wk.weight : [128, 8192] : torch.bfloat16
129 : 1048576 : layers.14.attention.wv.weight : [128, 8192] : torch.bfloat16
130 : 8388608 : layers.14.attention.wo.weight : [8192, 1024] : torch.bfloat16
131 : 29360128 : layers.14.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
132 : 29360128 : layers.14.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
133 : 29360128 : layers.14.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
134 : 8192 : layers.14.attention_norm.weight : [8192] : torch.bfloat16
135 : 8192 : layers.14.ffn_norm.weight : [8192] : torch.bfloat16
136 : 8388608 : layers.15.attention.wq.weight : [1024, 8192] : torch.bfloat16
137 : 1048576 : layers.15.attention.wk.weight : [128, 8192] : torch.bfloat16
138 : 1048576 : layers.15.attention.wv.weight : [128, 8192] : torch.bfloat16
139 : 8388608 : layers.15.attention.wo.weight : [8192, 1024] : torch.bfloat16
140 : 29360128 : layers.15.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
141 : 29360128 : layers.15.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
142 : 29360128 : layers.15.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
143 : 8192 : layers.15.attention_norm.weight : [8192] : torch.bfloat16
144 : 8192 : layers.15.ffn_norm.weight : [8192] : torch.bfloat16
145 : 8388608 : layers.16.attention.wq.weight : [1024, 8192] : torch.bfloat16
146 : 1048576 : layers.16.attention.wk.weight : [128, 8192] : torch.bfloat16
147 : 1048576 : layers.16.attention.wv.weight : [128, 8192] : torch.bfloat16
148 : 8388608 : layers.16.attention.wo.weight : [8192, 1024] : torch.bfloat16
149 : 29360128 : layers.16.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
150 : 29360128 : layers.16.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
151 : 29360128 : layers.16.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
152 : 8192 : layers.16.attention_norm.weight : [8192] : torch.bfloat16
153 : 8192 : layers.16.ffn_norm.weight : [8192] : torch.bfloat16
154 : 8388608 : layers.17.attention.wq.weight : [1024, 8192] : torch.bfloat16
155 : 1048576 : layers.17.attention.wk.weight : [128, 8192] : torch.bfloat16
156 : 1048576 : layers.17.attention.wv.weight : [128, 8192] : torch.bfloat16
157 : 8388608 : layers.17.attention.wo.weight : [8192, 1024] : torch.bfloat16
158 : 29360128 : layers.17.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
159 : 29360128 : layers.17.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
160 : 29360128 : layers.17.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
161 : 8192 : layers.17.attention_norm.weight : [8192] : torch.bfloat16
162 : 8192 : layers.17.ffn_norm.weight : [8192] : torch.bfloat16
163 : 8388608 : layers.18.attention.wq.weight : [1024, 8192] : torch.bfloat16
164 : 1048576 : layers.18.attention.wk.weight : [128, 8192] : torch.bfloat16
165 : 1048576 : layers.18.attention.wv.weight : [128, 8192] : torch.bfloat16
166 : 8388608 : layers.18.attention.wo.weight : [8192, 1024] : torch.bfloat16
167 : 29360128 : layers.18.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
168 : 29360128 : layers.18.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
169 : 29360128 : layers.18.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
170 : 8192 : layers.18.attention_norm.weight : [8192] : torch.bfloat16
171 : 8192 : layers.18.ffn_norm.weight : [8192] : torch.bfloat16
172 : 8388608 : layers.19.attention.wq.weight : [1024, 8192] : torch.bfloat16
173 : 1048576 : layers.19.attention.wk.weight : [128, 8192] : torch.bfloat16
174 : 1048576 : layers.19.attention.wv.weight : [128, 8192] : torch.bfloat16
175 : 8388608 : layers.19.attention.wo.weight : [8192, 1024] : torch.bfloat16
176 : 29360128 : layers.19.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
177 : 29360128 : layers.19.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
178 : 29360128 : layers.19.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
179 : 8192 : layers.19.attention_norm.weight : [8192] : torch.bfloat16
180 : 8192 : layers.19.ffn_norm.weight : [8192] : torch.bfloat16
181 : 8388608 : layers.20.attention.wq.weight : [1024, 8192] : torch.bfloat16
182 : 1048576 : layers.20.attention.wk.weight : [128, 8192] : torch.bfloat16
183 : 1048576 : layers.20.attention.wv.weight : [128, 8192] : torch.bfloat16
184 : 8388608 : layers.20.attention.wo.weight : [8192, 1024] : torch.bfloat16
185 : 29360128 : layers.20.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
186 : 29360128 : layers.20.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
187 : 29360128 : layers.20.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
188 : 8192 : layers.20.attention_norm.weight : [8192] : torch.bfloat16
189 : 8192 : layers.20.ffn_norm.weight : [8192] : torch.bfloat16
190 : 8388608 : layers.21.attention.wq.weight : [1024, 8192] : torch.bfloat16
191 : 1048576 : layers.21.attention.wk.weight : [128, 8192] : torch.bfloat16
192 : 1048576 : layers.21.attention.wv.weight : [128, 8192] : torch.bfloat16
193 : 8388608 : layers.21.attention.wo.weight : [8192, 1024] : torch.bfloat16
194 : 29360128 : layers.21.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
195 : 29360128 : layers.21.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
196 : 29360128 : layers.21.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
197 : 8192 : layers.21.attention_norm.weight : [8192] : torch.bfloat16
198 : 8192 : layers.21.ffn_norm.weight : [8192] : torch.bfloat16
199 : 8388608 : layers.22.attention.wq.weight : [1024, 8192] : torch.bfloat16
200 : 1048576 : layers.22.attention.wk.weight : [128, 8192] : torch.bfloat16
201 : 1048576 : layers.22.attention.wv.weight : [128, 8192] : torch.bfloat16
202 : 8388608 : layers.22.attention.wo.weight : [8192, 1024] : torch.bfloat16
203 : 29360128 : layers.22.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
204 : 29360128 : layers.22.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
205 : 29360128 : layers.22.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
206 : 8192 : layers.22.attention_norm.weight : [8192] : torch.bfloat16
207 : 8192 : layers.22.ffn_norm.weight : [8192] : torch.bfloat16
208 : 8388608 : layers.23.attention.wq.weight : [1024, 8192] : torch.bfloat16
209 : 1048576 : layers.23.attention.wk.weight : [128, 8192] : torch.bfloat16
210 : 1048576 : layers.23.attention.wv.weight : [128, 8192] : torch.bfloat16
211 : 8388608 : layers.23.attention.wo.weight : [8192, 1024] : torch.bfloat16
212 : 29360128 : layers.23.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
213 : 29360128 : layers.23.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
214 : 29360128 : layers.23.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
215 : 8192 : layers.23.attention_norm.weight : [8192] : torch.bfloat16
216 : 8192 : layers.23.ffn_norm.weight : [8192] : torch.bfloat16
217 : 8388608 : layers.24.attention.wq.weight : [1024, 8192] : torch.bfloat16
218 : 1048576 : layers.24.attention.wk.weight : [128, 8192] : torch.bfloat16
219 : 1048576 : layers.24.attention.wv.weight : [128, 8192] : torch.bfloat16
220 : 8388608 : layers.24.attention.wo.weight : [8192, 1024] : torch.bfloat16
221 : 29360128 : layers.24.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
222 : 29360128 : layers.24.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
223 : 29360128 : layers.24.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
224 : 8192 : layers.24.attention_norm.weight : [8192] : torch.bfloat16
225 : 8192 : layers.24.ffn_norm.weight : [8192] : torch.bfloat16
226 : 8388608 : layers.25.attention.wq.weight : [1024, 8192] : torch.bfloat16
227 : 1048576 : layers.25.attention.wk.weight : [128, 8192] : torch.bfloat16
228 : 1048576 : layers.25.attention.wv.weight : [128, 8192] : torch.bfloat16
229 : 8388608 : layers.25.attention.wo.weight : [8192, 1024] : torch.bfloat16
230 : 29360128 : layers.25.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
231 : 29360128 : layers.25.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
232 : 29360128 : layers.25.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
233 : 8192 : layers.25.attention_norm.weight : [8192] : torch.bfloat16
234 : 8192 : layers.25.ffn_norm.weight : [8192] : torch.bfloat16
235 : 8388608 : layers.26.attention.wq.weight : [1024, 8192] : torch.bfloat16
236 : 1048576 : layers.26.attention.wk.weight : [128, 8192] : torch.bfloat16
237 : 1048576 : layers.26.attention.wv.weight : [128, 8192] : torch.bfloat16
238 : 8388608 : layers.26.attention.wo.weight : [8192, 1024] : torch.bfloat16
239 : 29360128 : layers.26.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
240 : 29360128 : layers.26.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
241 : 29360128 : layers.26.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
242 : 8192 : layers.26.attention_norm.weight : [8192] : torch.bfloat16
243 : 8192 : layers.26.ffn_norm.weight : [8192] : torch.bfloat16
244 : 8388608 : layers.27.attention.wq.weight : [1024, 8192] : torch.bfloat16
245 : 1048576 : layers.27.attention.wk.weight : [128, 8192] : torch.bfloat16
246 : 1048576 : layers.27.attention.wv.weight : [128, 8192] : torch.bfloat16
247 : 8388608 : layers.27.attention.wo.weight : [8192, 1024] : torch.bfloat16
248 : 29360128 : layers.27.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
249 : 29360128 : layers.27.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
250 : 29360128 : layers.27.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
251 : 8192 : layers.27.attention_norm.weight : [8192] : torch.bfloat16
252 : 8192 : layers.27.ffn_norm.weight : [8192] : torch.bfloat16
253 : 8388608 : layers.28.attention.wq.weight : [1024, 8192] : torch.bfloat16
254 : 1048576 : layers.28.attention.wk.weight : [128, 8192] : torch.bfloat16
255 : 1048576 : layers.28.attention.wv.weight : [128, 8192] : torch.bfloat16
256 : 8388608 : layers.28.attention.wo.weight : [8192, 1024] : torch.bfloat16
257 : 29360128 : layers.28.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
258 : 29360128 : layers.28.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
259 : 29360128 : layers.28.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
260 : 8192 : layers.28.attention_norm.weight : [8192] : torch.bfloat16
261 : 8192 : layers.28.ffn_norm.weight : [8192] : torch.bfloat16
262 : 8388608 : layers.29.attention.wq.weight : [1024, 8192] : torch.bfloat16
263 : 1048576 : layers.29.attention.wk.weight : [128, 8192] : torch.bfloat16
264 : 1048576 : layers.29.attention.wv.weight : [128, 8192] : torch.bfloat16
265 : 8388608 : layers.29.attention.wo.weight : [8192, 1024] : torch.bfloat16
266 : 29360128 : layers.29.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
267 : 29360128 : layers.29.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
268 : 29360128 : layers.29.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
269 : 8192 : layers.29.attention_norm.weight : [8192] : torch.bfloat16
270 : 8192 : layers.29.ffn_norm.weight : [8192] : torch.bfloat16
271 : 8388608 : layers.30.attention.wq.weight : [1024, 8192] : torch.bfloat16
272 : 1048576 : layers.30.attention.wk.weight : [128, 8192] : torch.bfloat16
273 : 1048576 : layers.30.attention.wv.weight : [128, 8192] : torch.bfloat16
274 : 8388608 : layers.30.attention.wo.weight : [8192, 1024] : torch.bfloat16
275 : 29360128 : layers.30.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
276 : 29360128 : layers.30.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
277 : 29360128 : layers.30.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
278 : 8192 : layers.30.attention_norm.weight : [8192] : torch.bfloat16
279 : 8192 : layers.30.ffn_norm.weight : [8192] : torch.bfloat16
280 : 8388608 : layers.31.attention.wq.weight : [1024, 8192] : torch.bfloat16
281 : 1048576 : layers.31.attention.wk.weight : [128, 8192] : torch.bfloat16
282 : 1048576 : layers.31.attention.wv.weight : [128, 8192] : torch.bfloat16
283 : 8388608 : layers.31.attention.wo.weight : [8192, 1024] : torch.bfloat16
284 : 29360128 : layers.31.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
285 : 29360128 : layers.31.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
286 : 29360128 : layers.31.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
287 : 8192 : layers.31.attention_norm.weight : [8192] : torch.bfloat16
288 : 8192 : layers.31.ffn_norm.weight : [8192] : torch.bfloat16
289 : 8388608 : layers.32.attention.wq.weight : [1024, 8192] : torch.bfloat16
290 : 1048576 : layers.32.attention.wk.weight : [128, 8192] : torch.bfloat16
291 : 1048576 : layers.32.attention.wv.weight : [128, 8192] : torch.bfloat16
292 : 8388608 : layers.32.attention.wo.weight : [8192, 1024] : torch.bfloat16
293 : 29360128 : layers.32.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
294 : 29360128 : layers.32.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
295 : 29360128 : layers.32.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
296 : 8192 : layers.32.attention_norm.weight : [8192] : torch.bfloat16
297 : 8192 : layers.32.ffn_norm.weight : [8192] : torch.bfloat16
298 : 8388608 : layers.33.attention.wq.weight : [1024, 8192] : torch.bfloat16
299 : 1048576 : layers.33.attention.wk.weight : [128, 8192] : torch.bfloat16
300 : 1048576 : layers.33.attention.wv.weight : [128, 8192] : torch.bfloat16
301 : 8388608 : layers.33.attention.wo.weight : [8192, 1024] : torch.bfloat16
302 : 29360128 : layers.33.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
303 : 29360128 : layers.33.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
304 : 29360128 : layers.33.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
305 : 8192 : layers.33.attention_norm.weight : [8192] : torch.bfloat16
306 : 8192 : layers.33.ffn_norm.weight : [8192] : torch.bfloat16
307 : 8388608 : layers.34.attention.wq.weight : [1024, 8192] : torch.bfloat16
308 : 1048576 : layers.34.attention.wk.weight : [128, 8192] : torch.bfloat16
309 : 1048576 : layers.34.attention.wv.weight : [128, 8192] : torch.bfloat16
310 : 8388608 : layers.34.attention.wo.weight : [8192, 1024] : torch.bfloat16
311 : 29360128 : layers.34.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
312 : 29360128 : layers.34.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
313 : 29360128 : layers.34.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
314 : 8192 : layers.34.attention_norm.weight : [8192] : torch.bfloat16
315 : 8192 : layers.34.ffn_norm.weight : [8192] : torch.bfloat16
316 : 8388608 : layers.35.attention.wq.weight : [1024, 8192] : torch.bfloat16
317 : 1048576 : layers.35.attention.wk.weight : [128, 8192] : torch.bfloat16
318 : 1048576 : layers.35.attention.wv.weight : [128, 8192] : torch.bfloat16
319 : 8388608 : layers.35.attention.wo.weight : [8192, 1024] : torch.bfloat16
320 : 29360128 : layers.35.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
321 : 29360128 : layers.35.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
322 : 29360128 : layers.35.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
323 : 8192 : layers.35.attention_norm.weight : [8192] : torch.bfloat16
324 : 8192 : layers.35.ffn_norm.weight : [8192] : torch.bfloat16
325 : 8388608 : layers.36.attention.wq.weight : [1024, 8192] : torch.bfloat16
326 : 1048576 : layers.36.attention.wk.weight : [128, 8192] : torch.bfloat16
327 : 1048576 : layers.36.attention.wv.weight : [128, 8192] : torch.bfloat16
328 : 8388608 : layers.36.attention.wo.weight : [8192, 1024] : torch.bfloat16
329 : 29360128 : layers.36.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
330 : 29360128 : layers.36.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
331 : 29360128 : layers.36.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
332 : 8192 : layers.36.attention_norm.weight : [8192] : torch.bfloat16
333 : 8192 : layers.36.ffn_norm.weight : [8192] : torch.bfloat16
334 : 8388608 : layers.37.attention.wq.weight : [1024, 8192] : torch.bfloat16
335 : 1048576 : layers.37.attention.wk.weight : [128, 8192] : torch.bfloat16
336 : 1048576 : layers.37.attention.wv.weight : [128, 8192] : torch.bfloat16
337 : 8388608 : layers.37.attention.wo.weight : [8192, 1024] : torch.bfloat16
338 : 29360128 : layers.37.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
339 : 29360128 : layers.37.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
340 : 29360128 : layers.37.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
341 : 8192 : layers.37.attention_norm.weight : [8192] : torch.bfloat16
342 : 8192 : layers.37.ffn_norm.weight : [8192] : torch.bfloat16
343 : 8388608 : layers.38.attention.wq.weight : [1024, 8192] : torch.bfloat16
344 : 1048576 : layers.38.attention.wk.weight : [128, 8192] : torch.bfloat16
345 : 1048576 : layers.38.attention.wv.weight : [128, 8192] : torch.bfloat16
346 : 8388608 : layers.38.attention.wo.weight : [8192, 1024] : torch.bfloat16
347 : 29360128 : layers.38.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
348 : 29360128 : layers.38.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
349 : 29360128 : layers.38.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
350 : 8192 : layers.38.attention_norm.weight : [8192] : torch.bfloat16
351 : 8192 : layers.38.ffn_norm.weight : [8192] : torch.bfloat16
352 : 8388608 : layers.39.attention.wq.weight : [1024, 8192] : torch.bfloat16
353 : 1048576 : layers.39.attention.wk.weight : [128, 8192] : torch.bfloat16
354 : 1048576 : layers.39.attention.wv.weight : [128, 8192] : torch.bfloat16
355 : 8388608 : layers.39.attention.wo.weight : [8192, 1024] : torch.bfloat16
356 : 29360128 : layers.39.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
357 : 29360128 : layers.39.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
358 : 29360128 : layers.39.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
359 : 8192 : layers.39.attention_norm.weight : [8192] : torch.bfloat16
360 : 8192 : layers.39.ffn_norm.weight : [8192] : torch.bfloat16
361 : 8388608 : layers.40.attention.wq.weight : [1024, 8192] : torch.bfloat16
362 : 1048576 : layers.40.attention.wk.weight : [128, 8192] : torch.bfloat16
363 : 1048576 : layers.40.attention.wv.weight : [128, 8192] : torch.bfloat16
364 : 8388608 : layers.40.attention.wo.weight : [8192, 1024] : torch.bfloat16
365 : 29360128 : layers.40.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
366 : 29360128 : layers.40.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
367 : 29360128 : layers.40.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
368 : 8192 : layers.40.attention_norm.weight : [8192] : torch.bfloat16
369 : 8192 : layers.40.ffn_norm.weight : [8192] : torch.bfloat16
370 : 8388608 : layers.41.attention.wq.weight : [1024, 8192] : torch.bfloat16
371 : 1048576 : layers.41.attention.wk.weight : [128, 8192] : torch.bfloat16
372 : 1048576 : layers.41.attention.wv.weight : [128, 8192] : torch.bfloat16
373 : 8388608 : layers.41.attention.wo.weight : [8192, 1024] : torch.bfloat16
374 : 29360128 : layers.41.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
375 : 29360128 : layers.41.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
376 : 29360128 : layers.41.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
377 : 8192 : layers.41.attention_norm.weight : [8192] : torch.bfloat16
378 : 8192 : layers.41.ffn_norm.weight : [8192] : torch.bfloat16
379 : 8388608 : layers.42.attention.wq.weight : [1024, 8192] : torch.bfloat16
380 : 1048576 : layers.42.attention.wk.weight : [128, 8192] : torch.bfloat16
381 : 1048576 : layers.42.attention.wv.weight : [128, 8192] : torch.bfloat16
382 : 8388608 : layers.42.attention.wo.weight : [8192, 1024] : torch.bfloat16
383 : 29360128 : layers.42.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
384 : 29360128 : layers.42.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
385 : 29360128 : layers.42.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
386 : 8192 : layers.42.attention_norm.weight : [8192] : torch.bfloat16
387 : 8192 : layers.42.ffn_norm.weight : [8192] : torch.bfloat16
388 : 8388608 : layers.43.attention.wq.weight : [1024, 8192] : torch.bfloat16
389 : 1048576 : layers.43.attention.wk.weight : [128, 8192] : torch.bfloat16
390 : 1048576 : layers.43.attention.wv.weight : [128, 8192] : torch.bfloat16
391 : 8388608 : layers.43.attention.wo.weight : [8192, 1024] : torch.bfloat16
392 : 29360128 : layers.43.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
393 : 29360128 : layers.43.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
394 : 29360128 : layers.43.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
395 : 8192 : layers.43.attention_norm.weight : [8192] : torch.bfloat16
396 : 8192 : layers.43.ffn_norm.weight : [8192] : torch.bfloat16
397 : 8388608 : layers.44.attention.wq.weight : [1024, 8192] : torch.bfloat16
398 : 1048576 : layers.44.attention.wk.weight : [128, 8192] : torch.bfloat16
399 : 1048576 : layers.44.attention.wv.weight : [128, 8192] : torch.bfloat16
400 : 8388608 : layers.44.attention.wo.weight : [8192, 1024] : torch.bfloat16
401 : 29360128 : layers.44.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
402 : 29360128 : layers.44.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
403 : 29360128 : layers.44.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
404 : 8192 : layers.44.attention_norm.weight : [8192] : torch.bfloat16
405 : 8192 : layers.44.ffn_norm.weight : [8192] : torch.bfloat16
406 : 8388608 : layers.45.attention.wq.weight : [1024, 8192] : torch.bfloat16
407 : 1048576 : layers.45.attention.wk.weight : [128, 8192] : torch.bfloat16
408 : 1048576 : layers.45.attention.wv.weight : [128, 8192] : torch.bfloat16
409 : 8388608 : layers.45.attention.wo.weight : [8192, 1024] : torch.bfloat16
410 : 29360128 : layers.45.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
411 : 29360128 : layers.45.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
412 : 29360128 : layers.45.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
413 : 8192 : layers.45.attention_norm.weight : [8192] : torch.bfloat16
414 : 8192 : layers.45.ffn_norm.weight : [8192] : torch.bfloat16
415 : 8388608 : layers.46.attention.wq.weight : [1024, 8192] : torch.bfloat16
416 : 1048576 : layers.46.attention.wk.weight : [128, 8192] : torch.bfloat16
417 : 1048576 : layers.46.attention.wv.weight : [128, 8192] : torch.bfloat16
418 : 8388608 : layers.46.attention.wo.weight : [8192, 1024] : torch.bfloat16
419 : 29360128 : layers.46.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
420 : 29360128 : layers.46.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
421 : 29360128 : layers.46.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
422 : 8192 : layers.46.attention_norm.weight : [8192] : torch.bfloat16
423 : 8192 : layers.46.ffn_norm.weight : [8192] : torch.bfloat16
424 : 8388608 : layers.47.attention.wq.weight : [1024, 8192] : torch.bfloat16
425 : 1048576 : layers.47.attention.wk.weight : [128, 8192] : torch.bfloat16
426 : 1048576 : layers.47.attention.wv.weight : [128, 8192] : torch.bfloat16
427 : 8388608 : layers.47.attention.wo.weight : [8192, 1024] : torch.bfloat16
428 : 29360128 : layers.47.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
429 : 29360128 : layers.47.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
430 : 29360128 : layers.47.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
431 : 8192 : layers.47.attention_norm.weight : [8192] : torch.bfloat16
432 : 8192 : layers.47.ffn_norm.weight : [8192] : torch.bfloat16
433 : 8388608 : layers.48.attention.wq.weight : [1024, 8192] : torch.bfloat16
434 : 1048576 : layers.48.attention.wk.weight : [128, 8192] : torch.bfloat16
435 : 1048576 : layers.48.attention.wv.weight : [128, 8192] : torch.bfloat16
436 : 8388608 : layers.48.attention.wo.weight : [8192, 1024] : torch.bfloat16
437 : 29360128 : layers.48.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
438 : 29360128 : layers.48.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
439 : 29360128 : layers.48.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
440 : 8192 : layers.48.attention_norm.weight : [8192] : torch.bfloat16
441 : 8192 : layers.48.ffn_norm.weight : [8192] : torch.bfloat16
442 : 8388608 : layers.49.attention.wq.weight : [1024, 8192] : torch.bfloat16
443 : 1048576 : layers.49.attention.wk.weight : [128, 8192] : torch.bfloat16
444 : 1048576 : layers.49.attention.wv.weight : [128, 8192] : torch.bfloat16
445 : 8388608 : layers.49.attention.wo.weight : [8192, 1024] : torch.bfloat16
446 : 29360128 : layers.49.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
447 : 29360128 : layers.49.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
448 : 29360128 : layers.49.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
449 : 8192 : layers.49.attention_norm.weight : [8192] : torch.bfloat16
450 : 8192 : layers.49.ffn_norm.weight : [8192] : torch.bfloat16
451 : 8388608 : layers.50.attention.wq.weight : [1024, 8192] : torch.bfloat16
452 : 1048576 : layers.50.attention.wk.weight : [128, 8192] : torch.bfloat16
453 : 1048576 : layers.50.attention.wv.weight : [128, 8192] : torch.bfloat16
454 : 8388608 : layers.50.attention.wo.weight : [8192, 1024] : torch.bfloat16
455 : 29360128 : layers.50.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
456 : 29360128 : layers.50.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
457 : 29360128 : layers.50.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
458 : 8192 : layers.50.attention_norm.weight : [8192] : torch.bfloat16
459 : 8192 : layers.50.ffn_norm.weight : [8192] : torch.bfloat16
460 : 8388608 : layers.51.attention.wq.weight : [1024, 8192] : torch.bfloat16
461 : 1048576 : layers.51.attention.wk.weight : [128, 8192] : torch.bfloat16
462 : 1048576 : layers.51.attention.wv.weight : [128, 8192] : torch.bfloat16
463 : 8388608 : layers.51.attention.wo.weight : [8192, 1024] : torch.bfloat16
464 : 29360128 : layers.51.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
465 : 29360128 : layers.51.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
466 : 29360128 : layers.51.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
467 : 8192 : layers.51.attention_norm.weight : [8192] : torch.bfloat16
468 : 8192 : layers.51.ffn_norm.weight : [8192] : torch.bfloat16
469 : 8388608 : layers.52.attention.wq.weight : [1024, 8192] : torch.bfloat16
470 : 1048576 : layers.52.attention.wk.weight : [128, 8192] : torch.bfloat16
471 : 1048576 : layers.52.attention.wv.weight : [128, 8192] : torch.bfloat16
472 : 8388608 : layers.52.attention.wo.weight : [8192, 1024] : torch.bfloat16
473 : 29360128 : layers.52.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
474 : 29360128 : layers.52.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
475 : 29360128 : layers.52.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
476 : 8192 : layers.52.attention_norm.weight : [8192] : torch.bfloat16
477 : 8192 : layers.52.ffn_norm.weight : [8192] : torch.bfloat16
478 : 8388608 : layers.53.attention.wq.weight : [1024, 8192] : torch.bfloat16
479 : 1048576 : layers.53.attention.wk.weight : [128, 8192] : torch.bfloat16
480 : 1048576 : layers.53.attention.wv.weight : [128, 8192] : torch.bfloat16
481 : 8388608 : layers.53.attention.wo.weight : [8192, 1024] : torch.bfloat16
482 : 29360128 : layers.53.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
483 : 29360128 : layers.53.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
484 : 29360128 : layers.53.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
485 : 8192 : layers.53.attention_norm.weight : [8192] : torch.bfloat16
486 : 8192 : layers.53.ffn_norm.weight : [8192] : torch.bfloat16
487 : 8388608 : layers.54.attention.wq.weight : [1024, 8192] : torch.bfloat16
488 : 1048576 : layers.54.attention.wk.weight : [128, 8192] : torch.bfloat16
489 : 1048576 : layers.54.attention.wv.weight : [128, 8192] : torch.bfloat16
490 : 8388608 : layers.54.attention.wo.weight : [8192, 1024] : torch.bfloat16
491 : 29360128 : layers.54.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
492 : 29360128 : layers.54.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
493 : 29360128 : layers.54.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
494 : 8192 : layers.54.attention_norm.weight : [8192] : torch.bfloat16
495 : 8192 : layers.54.ffn_norm.weight : [8192] : torch.bfloat16
496 : 8388608 : layers.55.attention.wq.weight : [1024, 8192] : torch.bfloat16
497 : 1048576 : layers.55.attention.wk.weight : [128, 8192] : torch.bfloat16
498 : 1048576 : layers.55.attention.wv.weight : [128, 8192] : torch.bfloat16
499 : 8388608 : layers.55.attention.wo.weight : [8192, 1024] : torch.bfloat16
500 : 29360128 : layers.55.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
501 : 29360128 : layers.55.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
502 : 29360128 : layers.55.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
503 : 8192 : layers.55.attention_norm.weight : [8192] : torch.bfloat16
504 : 8192 : layers.55.ffn_norm.weight : [8192] : torch.bfloat16
505 : 8388608 : layers.56.attention.wq.weight : [1024, 8192] : torch.bfloat16
506 : 1048576 : layers.56.attention.wk.weight : [128, 8192] : torch.bfloat16
507 : 1048576 : layers.56.attention.wv.weight : [128, 8192] : torch.bfloat16
508 : 8388608 : layers.56.attention.wo.weight : [8192, 1024] : torch.bfloat16
509 : 29360128 : layers.56.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
510 : 29360128 : layers.56.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
511 : 29360128 : layers.56.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
512 : 8192 : layers.56.attention_norm.weight : [8192] : torch.bfloat16
513 : 8192 : layers.56.ffn_norm.weight : [8192] : torch.bfloat16
514 : 8388608 : layers.57.attention.wq.weight : [1024, 8192] : torch.bfloat16
515 : 1048576 : layers.57.attention.wk.weight : [128, 8192] : torch.bfloat16
516 : 1048576 : layers.57.attention.wv.weight : [128, 8192] : torch.bfloat16
517 : 8388608 : layers.57.attention.wo.weight : [8192, 1024] : torch.bfloat16
518 : 29360128 : layers.57.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
519 : 29360128 : layers.57.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
520 : 29360128 : layers.57.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
521 : 8192 : layers.57.attention_norm.weight : [8192] : torch.bfloat16
522 : 8192 : layers.57.ffn_norm.weight : [8192] : torch.bfloat16
523 : 8388608 : layers.58.attention.wq.weight : [1024, 8192] : torch.bfloat16
524 : 1048576 : layers.58.attention.wk.weight : [128, 8192] : torch.bfloat16
525 : 1048576 : layers.58.attention.wv.weight : [128, 8192] : torch.bfloat16
526 : 8388608 : layers.58.attention.wo.weight : [8192, 1024] : torch.bfloat16
527 : 29360128 : layers.58.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
528 : 29360128 : layers.58.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
529 : 29360128 : layers.58.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
530 : 8192 : layers.58.attention_norm.weight : [8192] : torch.bfloat16
531 : 8192 : layers.58.ffn_norm.weight : [8192] : torch.bfloat16
532 : 8388608 : layers.59.attention.wq.weight : [1024, 8192] : torch.bfloat16
533 : 1048576 : layers.59.attention.wk.weight : [128, 8192] : torch.bfloat16
534 : 1048576 : layers.59.attention.wv.weight : [128, 8192] : torch.bfloat16
535 : 8388608 : layers.59.attention.wo.weight : [8192, 1024] : torch.bfloat16
536 : 29360128 : layers.59.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
537 : 29360128 : layers.59.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
538 : 29360128 : layers.59.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
539 : 8192 : layers.59.attention_norm.weight : [8192] : torch.bfloat16
540 : 8192 : layers.59.ffn_norm.weight : [8192] : torch.bfloat16
541 : 8388608 : layers.60.attention.wq.weight : [1024, 8192] : torch.bfloat16
542 : 1048576 : layers.60.attention.wk.weight : [128, 8192] : torch.bfloat16
543 : 1048576 : layers.60.attention.wv.weight : [128, 8192] : torch.bfloat16
544 : 8388608 : layers.60.attention.wo.weight : [8192, 1024] : torch.bfloat16
545 : 29360128 : layers.60.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
546 : 29360128 : layers.60.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
547 : 29360128 : layers.60.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
548 : 8192 : layers.60.attention_norm.weight : [8192] : torch.bfloat16
549 : 8192 : layers.60.ffn_norm.weight : [8192] : torch.bfloat16
550 : 8388608 : layers.61.attention.wq.weight : [1024, 8192] : torch.bfloat16
551 : 1048576 : layers.61.attention.wk.weight : [128, 8192] : torch.bfloat16
552 : 1048576 : layers.61.attention.wv.weight : [128, 8192] : torch.bfloat16
553 : 8388608 : layers.61.attention.wo.weight : [8192, 1024] : torch.bfloat16
554 : 29360128 : layers.61.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
555 : 29360128 : layers.61.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
556 : 29360128 : layers.61.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
557 : 8192 : layers.61.attention_norm.weight : [8192] : torch.bfloat16
558 : 8192 : layers.61.ffn_norm.weight : [8192] : torch.bfloat16
559 : 8388608 : layers.62.attention.wq.weight : [1024, 8192] : torch.bfloat16
560 : 1048576 : layers.62.attention.wk.weight : [128, 8192] : torch.bfloat16
561 : 1048576 : layers.62.attention.wv.weight : [128, 8192] : torch.bfloat16
562 : 8388608 : layers.62.attention.wo.weight : [8192, 1024] : torch.bfloat16
563 : 29360128 : layers.62.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
564 : 29360128 : layers.62.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
565 : 29360128 : layers.62.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
566 : 8192 : layers.62.attention_norm.weight : [8192] : torch.bfloat16
567 : 8192 : layers.62.ffn_norm.weight : [8192] : torch.bfloat16
568 : 8388608 : layers.63.attention.wq.weight : [1024, 8192] : torch.bfloat16
569 : 1048576 : layers.63.attention.wk.weight : [128, 8192] : torch.bfloat16
570 : 1048576 : layers.63.attention.wv.weight : [128, 8192] : torch.bfloat16
571 : 8388608 : layers.63.attention.wo.weight : [8192, 1024] : torch.bfloat16
572 : 29360128 : layers.63.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
573 : 29360128 : layers.63.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
574 : 29360128 : layers.63.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
575 : 8192 : layers.63.attention_norm.weight : [8192] : torch.bfloat16
576 : 8192 : layers.63.ffn_norm.weight : [8192] : torch.bfloat16
577 : 8388608 : layers.64.attention.wq.weight : [1024, 8192] : torch.bfloat16
578 : 1048576 : layers.64.attention.wk.weight : [128, 8192] : torch.bfloat16
579 : 1048576 : layers.64.attention.wv.weight : [128, 8192] : torch.bfloat16
580 : 8388608 : layers.64.attention.wo.weight : [8192, 1024] : torch.bfloat16
581 : 29360128 : layers.64.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
582 : 29360128 : layers.64.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
583 : 29360128 : layers.64.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
584 : 8192 : layers.64.attention_norm.weight : [8192] : torch.bfloat16
585 : 8192 : layers.64.ffn_norm.weight : [8192] : torch.bfloat16
586 : 8388608 : layers.65.attention.wq.weight : [1024, 8192] : torch.bfloat16
587 : 1048576 : layers.65.attention.wk.weight : [128, 8192] : torch.bfloat16
588 : 1048576 : layers.65.attention.wv.weight : [128, 8192] : torch.bfloat16
589 : 8388608 : layers.65.attention.wo.weight : [8192, 1024] : torch.bfloat16
590 : 29360128 : layers.65.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
591 : 29360128 : layers.65.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
592 : 29360128 : layers.65.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
593 : 8192 : layers.65.attention_norm.weight : [8192] : torch.bfloat16
594 : 8192 : layers.65.ffn_norm.weight : [8192] : torch.bfloat16
595 : 8388608 : layers.66.attention.wq.weight : [1024, 8192] : torch.bfloat16
596 : 1048576 : layers.66.attention.wk.weight : [128, 8192] : torch.bfloat16
597 : 1048576 : layers.66.attention.wv.weight : [128, 8192] : torch.bfloat16
598 : 8388608 : layers.66.attention.wo.weight : [8192, 1024] : torch.bfloat16
599 : 29360128 : layers.66.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
600 : 29360128 : layers.66.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
601 : 29360128 : layers.66.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
602 : 8192 : layers.66.attention_norm.weight : [8192] : torch.bfloat16
603 : 8192 : layers.66.ffn_norm.weight : [8192] : torch.bfloat16
604 : 8388608 : layers.67.attention.wq.weight : [1024, 8192] : torch.bfloat16
605 : 1048576 : layers.67.attention.wk.weight : [128, 8192] : torch.bfloat16
606 : 1048576 : layers.67.attention.wv.weight : [128, 8192] : torch.bfloat16
607 : 8388608 : layers.67.attention.wo.weight : [8192, 1024] : torch.bfloat16
608 : 29360128 : layers.67.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
609 : 29360128 : layers.67.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
610 : 29360128 : layers.67.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
611 : 8192 : layers.67.attention_norm.weight : [8192] : torch.bfloat16
612 : 8192 : layers.67.ffn_norm.weight : [8192] : torch.bfloat16
613 : 8388608 : layers.68.attention.wq.weight : [1024, 8192] : torch.bfloat16
614 : 1048576 : layers.68.attention.wk.weight : [128, 8192] : torch.bfloat16
615 : 1048576 : layers.68.attention.wv.weight : [128, 8192] : torch.bfloat16
616 : 8388608 : layers.68.attention.wo.weight : [8192, 1024] : torch.bfloat16
617 : 29360128 : layers.68.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
618 : 29360128 : layers.68.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
619 : 29360128 : layers.68.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
620 : 8192 : layers.68.attention_norm.weight : [8192] : torch.bfloat16
621 : 8192 : layers.68.ffn_norm.weight : [8192] : torch.bfloat16
622 : 8388608 : layers.69.attention.wq.weight : [1024, 8192] : torch.bfloat16
623 : 1048576 : layers.69.attention.wk.weight : [128, 8192] : torch.bfloat16
624 : 1048576 : layers.69.attention.wv.weight : [128, 8192] : torch.bfloat16
625 : 8388608 : layers.69.attention.wo.weight : [8192, 1024] : torch.bfloat16
626 : 29360128 : layers.69.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
627 : 29360128 : layers.69.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
628 : 29360128 : layers.69.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
629 : 8192 : layers.69.attention_norm.weight : [8192] : torch.bfloat16
630 : 8192 : layers.69.ffn_norm.weight : [8192] : torch.bfloat16
631 : 8388608 : layers.70.attention.wq.weight : [1024, 8192] : torch.bfloat16
632 : 1048576 : layers.70.attention.wk.weight : [128, 8192] : torch.bfloat16
633 : 1048576 : layers.70.attention.wv.weight : [128, 8192] : torch.bfloat16
634 : 8388608 : layers.70.attention.wo.weight : [8192, 1024] : torch.bfloat16
635 : 29360128 : layers.70.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
636 : 29360128 : layers.70.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
637 : 29360128 : layers.70.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
638 : 8192 : layers.70.attention_norm.weight : [8192] : torch.bfloat16
639 : 8192 : layers.70.ffn_norm.weight : [8192] : torch.bfloat16
640 : 8388608 : layers.71.attention.wq.weight : [1024, 8192] : torch.bfloat16
641 : 1048576 : layers.71.attention.wk.weight : [128, 8192] : torch.bfloat16
642 : 1048576 : layers.71.attention.wv.weight : [128, 8192] : torch.bfloat16
643 : 8388608 : layers.71.attention.wo.weight : [8192, 1024] : torch.bfloat16
644 : 29360128 : layers.71.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
645 : 29360128 : layers.71.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
646 : 29360128 : layers.71.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
647 : 8192 : layers.71.attention_norm.weight : [8192] : torch.bfloat16
648 : 8192 : layers.71.ffn_norm.weight : [8192] : torch.bfloat16
649 : 8388608 : layers.72.attention.wq.weight : [1024, 8192] : torch.bfloat16
650 : 1048576 : layers.72.attention.wk.weight : [128, 8192] : torch.bfloat16
651 : 1048576 : layers.72.attention.wv.weight : [128, 8192] : torch.bfloat16
652 : 8388608 : layers.72.attention.wo.weight : [8192, 1024] : torch.bfloat16
653 : 29360128 : layers.72.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
654 : 29360128 : layers.72.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
655 : 29360128 : layers.72.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
656 : 8192 : layers.72.attention_norm.weight : [8192] : torch.bfloat16
657 : 8192 : layers.72.ffn_norm.weight : [8192] : torch.bfloat16
658 : 8388608 : layers.73.attention.wq.weight : [1024, 8192] : torch.bfloat16
659 : 1048576 : layers.73.attention.wk.weight : [128, 8192] : torch.bfloat16
660 : 1048576 : layers.73.attention.wv.weight : [128, 8192] : torch.bfloat16
661 : 8388608 : layers.73.attention.wo.weight : [8192, 1024] : torch.bfloat16
662 : 29360128 : layers.73.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
663 : 29360128 : layers.73.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
664 : 29360128 : layers.73.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
665 : 8192 : layers.73.attention_norm.weight : [8192] : torch.bfloat16
666 : 8192 : layers.73.ffn_norm.weight : [8192] : torch.bfloat16
667 : 8388608 : layers.74.attention.wq.weight : [1024, 8192] : torch.bfloat16
668 : 1048576 : layers.74.attention.wk.weight : [128, 8192] : torch.bfloat16
669 : 1048576 : layers.74.attention.wv.weight : [128, 8192] : torch.bfloat16
670 : 8388608 : layers.74.attention.wo.weight : [8192, 1024] : torch.bfloat16
671 : 29360128 : layers.74.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
672 : 29360128 : layers.74.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
673 : 29360128 : layers.74.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
674 : 8192 : layers.74.attention_norm.weight : [8192] : torch.bfloat16
675 : 8192 : layers.74.ffn_norm.weight : [8192] : torch.bfloat16
676 : 8388608 : layers.75.attention.wq.weight : [1024, 8192] : torch.bfloat16
677 : 1048576 : layers.75.attention.wk.weight : [128, 8192] : torch.bfloat16
678 : 1048576 : layers.75.attention.wv.weight : [128, 8192] : torch.bfloat16
679 : 8388608 : layers.75.attention.wo.weight : [8192, 1024] : torch.bfloat16
680 : 29360128 : layers.75.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
681 : 29360128 : layers.75.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
682 : 29360128 : layers.75.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
683 : 8192 : layers.75.attention_norm.weight : [8192] : torch.bfloat16
684 : 8192 : layers.75.ffn_norm.weight : [8192] : torch.bfloat16
685 : 8388608 : layers.76.attention.wq.weight : [1024, 8192] : torch.bfloat16
686 : 1048576 : layers.76.attention.wk.weight : [128, 8192] : torch.bfloat16
687 : 1048576 : layers.76.attention.wv.weight : [128, 8192] : torch.bfloat16
688 : 8388608 : layers.76.attention.wo.weight : [8192, 1024] : torch.bfloat16
689 : 29360128 : layers.76.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
690 : 29360128 : layers.76.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
691 : 29360128 : layers.76.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
692 : 8192 : layers.76.attention_norm.weight : [8192] : torch.bfloat16
693 : 8192 : layers.76.ffn_norm.weight : [8192] : torch.bfloat16
694 : 8388608 : layers.77.attention.wq.weight : [1024, 8192] : torch.bfloat16
695 : 1048576 : layers.77.attention.wk.weight : [128, 8192] : torch.bfloat16
696 : 1048576 : layers.77.attention.wv.weight : [128, 8192] : torch.bfloat16
697 : 8388608 : layers.77.attention.wo.weight : [8192, 1024] : torch.bfloat16
698 : 29360128 : layers.77.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
699 : 29360128 : layers.77.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
700 : 29360128 : layers.77.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
701 : 8192 : layers.77.attention_norm.weight : [8192] : torch.bfloat16
702 : 8192 : layers.77.ffn_norm.weight : [8192] : torch.bfloat16
703 : 8388608 : layers.78.attention.wq.weight : [1024, 8192] : torch.bfloat16
704 : 1048576 : layers.78.attention.wk.weight : [128, 8192] : torch.bfloat16
705 : 1048576 : layers.78.attention.wv.weight : [128, 8192] : torch.bfloat16
706 : 8388608 : layers.78.attention.wo.weight : [8192, 1024] : torch.bfloat16
707 : 29360128 : layers.78.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
708 : 29360128 : layers.78.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
709 : 29360128 : layers.78.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
710 : 8192 : layers.78.attention_norm.weight : [8192] : torch.bfloat16
711 : 8192 : layers.78.ffn_norm.weight : [8192] : torch.bfloat16
712 : 8388608 : layers.79.attention.wq.weight : [1024, 8192] : torch.bfloat16
713 : 1048576 : layers.79.attention.wk.weight : [128, 8192] : torch.bfloat16
714 : 1048576 : layers.79.attention.wv.weight : [128, 8192] : torch.bfloat16
715 : 8388608 : layers.79.attention.wo.weight : [8192, 1024] : torch.bfloat16
716 : 29360128 : layers.79.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
717 : 29360128 : layers.79.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
718 : 29360128 : layers.79.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
719 : 8192 : layers.79.attention_norm.weight : [8192] : torch.bfloat16
720 : 8192 : layers.79.ffn_norm.weight : [8192] : torch.bfloat16
721 : 8192 : norm.weight : [8192] : torch.bfloat16
722 : 131334144 : output.weight : [16032, 8192] : torch.bfloat16
-----------------------------------------------------------------------------
52922204160 params in total.
105844408320 bytes in total.
35.50 sec, 5.41 sec, 2843.42 MB/s
-----------------------------------------------------------------------------
[7/8]: Loading original/consolidated.06.pth
0 : 131334144 : tok_embeddings.weight : [16032, 8192] : torch.bfloat16
1 : 8388608 : layers.0.attention.wq.weight : [1024, 8192] : torch.bfloat16
2 : 1048576 : layers.0.attention.wk.weight : [128, 8192] : torch.bfloat16
3 : 1048576 : layers.0.attention.wv.weight : [128, 8192] : torch.bfloat16
4 : 8388608 : layers.0.attention.wo.weight : [8192, 1024] : torch.bfloat16
5 : 29360128 : layers.0.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
6 : 29360128 : layers.0.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
7 : 29360128 : layers.0.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
8 : 8192 : layers.0.attention_norm.weight : [8192] : torch.bfloat16
9 : 8192 : layers.0.ffn_norm.weight : [8192] : torch.bfloat16
10 : 8388608 : layers.1.attention.wq.weight : [1024, 8192] : torch.bfloat16
11 : 1048576 : layers.1.attention.wk.weight : [128, 8192] : torch.bfloat16
12 : 1048576 : layers.1.attention.wv.weight : [128, 8192] : torch.bfloat16
13 : 8388608 : layers.1.attention.wo.weight : [8192, 1024] : torch.bfloat16
14 : 29360128 : layers.1.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
15 : 29360128 : layers.1.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
16 : 29360128 : layers.1.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
17 : 8192 : layers.1.attention_norm.weight : [8192] : torch.bfloat16
18 : 8192 : layers.1.ffn_norm.weight : [8192] : torch.bfloat16
19 : 8388608 : layers.2.attention.wq.weight : [1024, 8192] : torch.bfloat16
20 : 1048576 : layers.2.attention.wk.weight : [128, 8192] : torch.bfloat16
21 : 1048576 : layers.2.attention.wv.weight : [128, 8192] : torch.bfloat16
22 : 8388608 : layers.2.attention.wo.weight : [8192, 1024] : torch.bfloat16
23 : 29360128 : layers.2.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
24 : 29360128 : layers.2.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
25 : 29360128 : layers.2.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
26 : 8192 : layers.2.attention_norm.weight : [8192] : torch.bfloat16
27 : 8192 : layers.2.ffn_norm.weight : [8192] : torch.bfloat16
28 : 8388608 : layers.3.attention.wq.weight : [1024, 8192] : torch.bfloat16
29 : 1048576 : layers.3.attention.wk.weight : [128, 8192] : torch.bfloat16
30 : 1048576 : layers.3.attention.wv.weight : [128, 8192] : torch.bfloat16
31 : 8388608 : layers.3.attention.wo.weight : [8192, 1024] : torch.bfloat16
32 : 29360128 : layers.3.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
33 : 29360128 : layers.3.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
34 : 29360128 : layers.3.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
35 : 8192 : layers.3.attention_norm.weight : [8192] : torch.bfloat16
36 : 8192 : layers.3.ffn_norm.weight : [8192] : torch.bfloat16
37 : 8388608 : layers.4.attention.wq.weight : [1024, 8192] : torch.bfloat16
38 : 1048576 : layers.4.attention.wk.weight : [128, 8192] : torch.bfloat16
39 : 1048576 : layers.4.attention.wv.weight : [128, 8192] : torch.bfloat16
40 : 8388608 : layers.4.attention.wo.weight : [8192, 1024] : torch.bfloat16
41 : 29360128 : layers.4.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
42 : 29360128 : layers.4.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
43 : 29360128 : layers.4.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
44 : 8192 : layers.4.attention_norm.weight : [8192] : torch.bfloat16
45 : 8192 : layers.4.ffn_norm.weight : [8192] : torch.bfloat16
46 : 8388608 : layers.5.attention.wq.weight : [1024, 8192] : torch.bfloat16
47 : 1048576 : layers.5.attention.wk.weight : [128, 8192] : torch.bfloat16
48 : 1048576 : layers.5.attention.wv.weight : [128, 8192] : torch.bfloat16
49 : 8388608 : layers.5.attention.wo.weight : [8192, 1024] : torch.bfloat16
50 : 29360128 : layers.5.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
51 : 29360128 : layers.5.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
52 : 29360128 : layers.5.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
53 : 8192 : layers.5.attention_norm.weight : [8192] : torch.bfloat16
54 : 8192 : layers.5.ffn_norm.weight : [8192] : torch.bfloat16
55 : 8388608 : layers.6.attention.wq.weight : [1024, 8192] : torch.bfloat16
56 : 1048576 : layers.6.attention.wk.weight : [128, 8192] : torch.bfloat16
57 : 1048576 : layers.6.attention.wv.weight : [128, 8192] : torch.bfloat16
58 : 8388608 : layers.6.attention.wo.weight : [8192, 1024] : torch.bfloat16
59 : 29360128 : layers.6.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
60 : 29360128 : layers.6.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
61 : 29360128 : layers.6.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
62 : 8192 : layers.6.attention_norm.weight : [8192] : torch.bfloat16
63 : 8192 : layers.6.ffn_norm.weight : [8192] : torch.bfloat16
64 : 8388608 : layers.7.attention.wq.weight : [1024, 8192] : torch.bfloat16
65 : 1048576 : layers.7.attention.wk.weight : [128, 8192] : torch.bfloat16
66 : 1048576 : layers.7.attention.wv.weight : [128, 8192] : torch.bfloat16
67 : 8388608 : layers.7.attention.wo.weight : [8192, 1024] : torch.bfloat16
68 : 29360128 : layers.7.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
69 : 29360128 : layers.7.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
70 : 29360128 : layers.7.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
71 : 8192 : layers.7.attention_norm.weight : [8192] : torch.bfloat16
72 : 8192 : layers.7.ffn_norm.weight : [8192] : torch.bfloat16
73 : 8388608 : layers.8.attention.wq.weight : [1024, 8192] : torch.bfloat16
74 : 1048576 : layers.8.attention.wk.weight : [128, 8192] : torch.bfloat16
75 : 1048576 : layers.8.attention.wv.weight : [128, 8192] : torch.bfloat16
76 : 8388608 : layers.8.attention.wo.weight : [8192, 1024] : torch.bfloat16
77 : 29360128 : layers.8.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
78 : 29360128 : layers.8.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
79 : 29360128 : layers.8.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
80 : 8192 : layers.8.attention_norm.weight : [8192] : torch.bfloat16
81 : 8192 : layers.8.ffn_norm.weight : [8192] : torch.bfloat16
82 : 8388608 : layers.9.attention.wq.weight : [1024, 8192] : torch.bfloat16
83 : 1048576 : layers.9.attention.wk.weight : [128, 8192] : torch.bfloat16
84 : 1048576 : layers.9.attention.wv.weight : [128, 8192] : torch.bfloat16
85 : 8388608 : layers.9.attention.wo.weight : [8192, 1024] : torch.bfloat16
86 : 29360128 : layers.9.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
87 : 29360128 : layers.9.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
88 : 29360128 : layers.9.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
89 : 8192 : layers.9.attention_norm.weight : [8192] : torch.bfloat16
90 : 8192 : layers.9.ffn_norm.weight : [8192] : torch.bfloat16
91 : 8388608 : layers.10.attention.wq.weight : [1024, 8192] : torch.bfloat16
92 : 1048576 : layers.10.attention.wk.weight : [128, 8192] : torch.bfloat16
93 : 1048576 : layers.10.attention.wv.weight : [128, 8192] : torch.bfloat16
94 : 8388608 : layers.10.attention.wo.weight : [8192, 1024] : torch.bfloat16
95 : 29360128 : layers.10.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
96 : 29360128 : layers.10.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
97 : 29360128 : layers.10.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
98 : 8192 : layers.10.attention_norm.weight : [8192] : torch.bfloat16
99 : 8192 : layers.10.ffn_norm.weight : [8192] : torch.bfloat16
100 : 8388608 : layers.11.attention.wq.weight : [1024, 8192] : torch.bfloat16
101 : 1048576 : layers.11.attention.wk.weight : [128, 8192] : torch.bfloat16
102 : 1048576 : layers.11.attention.wv.weight : [128, 8192] : torch.bfloat16
103 : 8388608 : layers.11.attention.wo.weight : [8192, 1024] : torch.bfloat16
104 : 29360128 : layers.11.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
105 : 29360128 : layers.11.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
106 : 29360128 : layers.11.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
107 : 8192 : layers.11.attention_norm.weight : [8192] : torch.bfloat16
108 : 8192 : layers.11.ffn_norm.weight : [8192] : torch.bfloat16
109 : 8388608 : layers.12.attention.wq.weight : [1024, 8192] : torch.bfloat16
110 : 1048576 : layers.12.attention.wk.weight : [128, 8192] : torch.bfloat16
111 : 1048576 : layers.12.attention.wv.weight : [128, 8192] : torch.bfloat16
112 : 8388608 : layers.12.attention.wo.weight : [8192, 1024] : torch.bfloat16
113 : 29360128 : layers.12.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
114 : 29360128 : layers.12.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
115 : 29360128 : layers.12.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
116 : 8192 : layers.12.attention_norm.weight : [8192] : torch.bfloat16
117 : 8192 : layers.12.ffn_norm.weight : [8192] : torch.bfloat16
118 : 8388608 : layers.13.attention.wq.weight : [1024, 8192] : torch.bfloat16
119 : 1048576 : layers.13.attention.wk.weight : [128, 8192] : torch.bfloat16
120 : 1048576 : layers.13.attention.wv.weight : [128, 8192] : torch.bfloat16
121 : 8388608 : layers.13.attention.wo.weight : [8192, 1024] : torch.bfloat16
122 : 29360128 : layers.13.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
123 : 29360128 : layers.13.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
124 : 29360128 : layers.13.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
125 : 8192 : layers.13.attention_norm.weight : [8192] : torch.bfloat16
126 : 8192 : layers.13.ffn_norm.weight : [8192] : torch.bfloat16
127 : 8388608 : layers.14.attention.wq.weight : [1024, 8192] : torch.bfloat16
128 : 1048576 : layers.14.attention.wk.weight : [128, 8192] : torch.bfloat16
129 : 1048576 : layers.14.attention.wv.weight : [128, 8192] : torch.bfloat16
130 : 8388608 : layers.14.attention.wo.weight : [8192, 1024] : torch.bfloat16
131 : 29360128 : layers.14.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
132 : 29360128 : layers.14.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
133 : 29360128 : layers.14.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
134 : 8192 : layers.14.attention_norm.weight : [8192] : torch.bfloat16
135 : 8192 : layers.14.ffn_norm.weight : [8192] : torch.bfloat16
136 : 8388608 : layers.15.attention.wq.weight : [1024, 8192] : torch.bfloat16
137 : 1048576 : layers.15.attention.wk.weight : [128, 8192] : torch.bfloat16
138 : 1048576 : layers.15.attention.wv.weight : [128, 8192] : torch.bfloat16
139 : 8388608 : layers.15.attention.wo.weight : [8192, 1024] : torch.bfloat16
140 : 29360128 : layers.15.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
141 : 29360128 : layers.15.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
142 : 29360128 : layers.15.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
143 : 8192 : layers.15.attention_norm.weight : [8192] : torch.bfloat16
144 : 8192 : layers.15.ffn_norm.weight : [8192] : torch.bfloat16
145 : 8388608 : layers.16.attention.wq.weight : [1024, 8192] : torch.bfloat16
146 : 1048576 : layers.16.attention.wk.weight : [128, 8192] : torch.bfloat16
147 : 1048576 : layers.16.attention.wv.weight : [128, 8192] : torch.bfloat16
148 : 8388608 : layers.16.attention.wo.weight : [8192, 1024] : torch.bfloat16
149 : 29360128 : layers.16.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
150 : 29360128 : layers.16.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
151 : 29360128 : layers.16.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
152 : 8192 : layers.16.attention_norm.weight : [8192] : torch.bfloat16
153 : 8192 : layers.16.ffn_norm.weight : [8192] : torch.bfloat16
154 : 8388608 : layers.17.attention.wq.weight : [1024, 8192] : torch.bfloat16
155 : 1048576 : layers.17.attention.wk.weight : [128, 8192] : torch.bfloat16
156 : 1048576 : layers.17.attention.wv.weight : [128, 8192] : torch.bfloat16
157 : 8388608 : layers.17.attention.wo.weight : [8192, 1024] : torch.bfloat16
158 : 29360128 : layers.17.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
159 : 29360128 : layers.17.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
160 : 29360128 : layers.17.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
161 : 8192 : layers.17.attention_norm.weight : [8192] : torch.bfloat16
162 : 8192 : layers.17.ffn_norm.weight : [8192] : torch.bfloat16
163 : 8388608 : layers.18.attention.wq.weight : [1024, 8192] : torch.bfloat16
164 : 1048576 : layers.18.attention.wk.weight : [128, 8192] : torch.bfloat16
165 : 1048576 : layers.18.attention.wv.weight : [128, 8192] : torch.bfloat16
166 : 8388608 : layers.18.attention.wo.weight : [8192, 1024] : torch.bfloat16
167 : 29360128 : layers.18.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
168 : 29360128 : layers.18.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
169 : 29360128 : layers.18.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
170 : 8192 : layers.18.attention_norm.weight : [8192] : torch.bfloat16
171 : 8192 : layers.18.ffn_norm.weight : [8192] : torch.bfloat16
172 : 8388608 : layers.19.attention.wq.weight : [1024, 8192] : torch.bfloat16
173 : 1048576 : layers.19.attention.wk.weight : [128, 8192] : torch.bfloat16
174 : 1048576 : layers.19.attention.wv.weight : [128, 8192] : torch.bfloat16
175 : 8388608 : layers.19.attention.wo.weight : [8192, 1024] : torch.bfloat16
176 : 29360128 : layers.19.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
177 : 29360128 : layers.19.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
178 : 29360128 : layers.19.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
179 : 8192 : layers.19.attention_norm.weight : [8192] : torch.bfloat16
180 : 8192 : layers.19.ffn_norm.weight : [8192] : torch.bfloat16
181 : 8388608 : layers.20.attention.wq.weight : [1024, 8192] : torch.bfloat16
182 : 1048576 : layers.20.attention.wk.weight : [128, 8192] : torch.bfloat16
183 : 1048576 : layers.20.attention.wv.weight : [128, 8192] : torch.bfloat16
184 : 8388608 : layers.20.attention.wo.weight : [8192, 1024] : torch.bfloat16
185 : 29360128 : layers.20.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
186 : 29360128 : layers.20.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
187 : 29360128 : layers.20.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
188 : 8192 : layers.20.attention_norm.weight : [8192] : torch.bfloat16
189 : 8192 : layers.20.ffn_norm.weight : [8192] : torch.bfloat16
190 : 8388608 : layers.21.attention.wq.weight : [1024, 8192] : torch.bfloat16
191 : 1048576 : layers.21.attention.wk.weight : [128, 8192] : torch.bfloat16
192 : 1048576 : layers.21.attention.wv.weight : [128, 8192] : torch.bfloat16
193 : 8388608 : layers.21.attention.wo.weight : [8192, 1024] : torch.bfloat16
194 : 29360128 : layers.21.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
195 : 29360128 : layers.21.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
196 : 29360128 : layers.21.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
197 : 8192 : layers.21.attention_norm.weight : [8192] : torch.bfloat16
198 : 8192 : layers.21.ffn_norm.weight : [8192] : torch.bfloat16
199 : 8388608 : layers.22.attention.wq.weight : [1024, 8192] : torch.bfloat16
200 : 1048576 : layers.22.attention.wk.weight : [128, 8192] : torch.bfloat16
201 : 1048576 : layers.22.attention.wv.weight : [128, 8192] : torch.bfloat16
202 : 8388608 : layers.22.attention.wo.weight : [8192, 1024] : torch.bfloat16
203 : 29360128 : layers.22.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
204 : 29360128 : layers.22.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
205 : 29360128 : layers.22.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
206 : 8192 : layers.22.attention_norm.weight : [8192] : torch.bfloat16
207 : 8192 : layers.22.ffn_norm.weight : [8192] : torch.bfloat16
208 : 8388608 : layers.23.attention.wq.weight : [1024, 8192] : torch.bfloat16
209 : 1048576 : layers.23.attention.wk.weight : [128, 8192] : torch.bfloat16
210 : 1048576 : layers.23.attention.wv.weight : [128, 8192] : torch.bfloat16
211 : 8388608 : layers.23.attention.wo.weight : [8192, 1024] : torch.bfloat16
212 : 29360128 : layers.23.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
213 : 29360128 : layers.23.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
214 : 29360128 : layers.23.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
215 : 8192 : layers.23.attention_norm.weight : [8192] : torch.bfloat16
216 : 8192 : layers.23.ffn_norm.weight : [8192] : torch.bfloat16
217 : 8388608 : layers.24.attention.wq.weight : [1024, 8192] : torch.bfloat16
218 : 1048576 : layers.24.attention.wk.weight : [128, 8192] : torch.bfloat16
219 : 1048576 : layers.24.attention.wv.weight : [128, 8192] : torch.bfloat16
220 : 8388608 : layers.24.attention.wo.weight : [8192, 1024] : torch.bfloat16
221 : 29360128 : layers.24.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
222 : 29360128 : layers.24.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
223 : 29360128 : layers.24.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
224 : 8192 : layers.24.attention_norm.weight : [8192] : torch.bfloat16
225 : 8192 : layers.24.ffn_norm.weight : [8192] : torch.bfloat16
226 : 8388608 : layers.25.attention.wq.weight : [1024, 8192] : torch.bfloat16
227 : 1048576 : layers.25.attention.wk.weight : [128, 8192] : torch.bfloat16
228 : 1048576 : layers.25.attention.wv.weight : [128, 8192] : torch.bfloat16
229 : 8388608 : layers.25.attention.wo.weight : [8192, 1024] : torch.bfloat16
230 : 29360128 : layers.25.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
231 : 29360128 : layers.25.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
232 : 29360128 : layers.25.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
233 : 8192 : layers.25.attention_norm.weight : [8192] : torch.bfloat16
234 : 8192 : layers.25.ffn_norm.weight : [8192] : torch.bfloat16
235 : 8388608 : layers.26.attention.wq.weight : [1024, 8192] : torch.bfloat16
236 : 1048576 : layers.26.attention.wk.weight : [128, 8192] : torch.bfloat16
237 : 1048576 : layers.26.attention.wv.weight : [128, 8192] : torch.bfloat16
238 : 8388608 : layers.26.attention.wo.weight : [8192, 1024] : torch.bfloat16
239 : 29360128 : layers.26.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
240 : 29360128 : layers.26.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
241 : 29360128 : layers.26.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
242 : 8192 : layers.26.attention_norm.weight : [8192] : torch.bfloat16
243 : 8192 : layers.26.ffn_norm.weight : [8192] : torch.bfloat16
244 : 8388608 : layers.27.attention.wq.weight : [1024, 8192] : torch.bfloat16
245 : 1048576 : layers.27.attention.wk.weight : [128, 8192] : torch.bfloat16
246 : 1048576 : layers.27.attention.wv.weight : [128, 8192] : torch.bfloat16
247 : 8388608 : layers.27.attention.wo.weight : [8192, 1024] : torch.bfloat16
248 : 29360128 : layers.27.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
249 : 29360128 : layers.27.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
250 : 29360128 : layers.27.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
251 : 8192 : layers.27.attention_norm.weight : [8192] : torch.bfloat16
252 : 8192 : layers.27.ffn_norm.weight : [8192] : torch.bfloat16
253 : 8388608 : layers.28.attention.wq.weight : [1024, 8192] : torch.bfloat16
254 : 1048576 : layers.28.attention.wk.weight : [128, 8192] : torch.bfloat16
255 : 1048576 : layers.28.attention.wv.weight : [128, 8192] : torch.bfloat16
256 : 8388608 : layers.28.attention.wo.weight : [8192, 1024] : torch.bfloat16
257 : 29360128 : layers.28.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
258 : 29360128 : layers.28.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
259 : 29360128 : layers.28.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
260 : 8192 : layers.28.attention_norm.weight : [8192] : torch.bfloat16
261 : 8192 : layers.28.ffn_norm.weight : [8192] : torch.bfloat16
262 : 8388608 : layers.29.attention.wq.weight : [1024, 8192] : torch.bfloat16
263 : 1048576 : layers.29.attention.wk.weight : [128, 8192] : torch.bfloat16
264 : 1048576 : layers.29.attention.wv.weight : [128, 8192] : torch.bfloat16
265 : 8388608 : layers.29.attention.wo.weight : [8192, 1024] : torch.bfloat16
266 : 29360128 : layers.29.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
267 : 29360128 : layers.29.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
268 : 29360128 : layers.29.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
269 : 8192 : layers.29.attention_norm.weight : [8192] : torch.bfloat16
270 : 8192 : layers.29.ffn_norm.weight : [8192] : torch.bfloat16
271 : 8388608 : layers.30.attention.wq.weight : [1024, 8192] : torch.bfloat16
272 : 1048576 : layers.30.attention.wk.weight : [128, 8192] : torch.bfloat16
273 : 1048576 : layers.30.attention.wv.weight : [128, 8192] : torch.bfloat16
274 : 8388608 : layers.30.attention.wo.weight : [8192, 1024] : torch.bfloat16
275 : 29360128 : layers.30.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
276 : 29360128 : layers.30.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
277 : 29360128 : layers.30.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
278 : 8192 : layers.30.attention_norm.weight : [8192] : torch.bfloat16
279 : 8192 : layers.30.ffn_norm.weight : [8192] : torch.bfloat16
280 : 8388608 : layers.31.attention.wq.weight : [1024, 8192] : torch.bfloat16
281 : 1048576 : layers.31.attention.wk.weight : [128, 8192] : torch.bfloat16
282 : 1048576 : layers.31.attention.wv.weight : [128, 8192] : torch.bfloat16
283 : 8388608 : layers.31.attention.wo.weight : [8192, 1024] : torch.bfloat16
284 : 29360128 : layers.31.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
285 : 29360128 : layers.31.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
286 : 29360128 : layers.31.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
287 : 8192 : layers.31.attention_norm.weight : [8192] : torch.bfloat16
288 : 8192 : layers.31.ffn_norm.weight : [8192] : torch.bfloat16
289 : 8388608 : layers.32.attention.wq.weight : [1024, 8192] : torch.bfloat16
290 : 1048576 : layers.32.attention.wk.weight : [128, 8192] : torch.bfloat16
291 : 1048576 : layers.32.attention.wv.weight : [128, 8192] : torch.bfloat16
292 : 8388608 : layers.32.attention.wo.weight : [8192, 1024] : torch.bfloat16
293 : 29360128 : layers.32.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
294 : 29360128 : layers.32.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
295 : 29360128 : layers.32.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
296 : 8192 : layers.32.attention_norm.weight : [8192] : torch.bfloat16
297 : 8192 : layers.32.ffn_norm.weight : [8192] : torch.bfloat16
298 : 8388608 : layers.33.attention.wq.weight : [1024, 8192] : torch.bfloat16
299 : 1048576 : layers.33.attention.wk.weight : [128, 8192] : torch.bfloat16
300 : 1048576 : layers.33.attention.wv.weight : [128, 8192] : torch.bfloat16
301 : 8388608 : layers.33.attention.wo.weight : [8192, 1024] : torch.bfloat16
302 : 29360128 : layers.33.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
303 : 29360128 : layers.33.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
304 : 29360128 : layers.33.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
305 : 8192 : layers.33.attention_norm.weight : [8192] : torch.bfloat16
306 : 8192 : layers.33.ffn_norm.weight : [8192] : torch.bfloat16
307 : 8388608 : layers.34.attention.wq.weight : [1024, 8192] : torch.bfloat16
308 : 1048576 : layers.34.attention.wk.weight : [128, 8192] : torch.bfloat16
309 : 1048576 : layers.34.attention.wv.weight : [128, 8192] : torch.bfloat16
310 : 8388608 : layers.34.attention.wo.weight : [8192, 1024] : torch.bfloat16
311 : 29360128 : layers.34.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
312 : 29360128 : layers.34.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
313 : 29360128 : layers.34.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
314 : 8192 : layers.34.attention_norm.weight : [8192] : torch.bfloat16
315 : 8192 : layers.34.ffn_norm.weight : [8192] : torch.bfloat16
316 : 8388608 : layers.35.attention.wq.weight : [1024, 8192] : torch.bfloat16
317 : 1048576 : layers.35.attention.wk.weight : [128, 8192] : torch.bfloat16
318 : 1048576 : layers.35.attention.wv.weight : [128, 8192] : torch.bfloat16
319 : 8388608 : layers.35.attention.wo.weight : [8192, 1024] : torch.bfloat16
320 : 29360128 : layers.35.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
321 : 29360128 : layers.35.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
322 : 29360128 : layers.35.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
323 : 8192 : layers.35.attention_norm.weight : [8192] : torch.bfloat16
324 : 8192 : layers.35.ffn_norm.weight : [8192] : torch.bfloat16
325 : 8388608 : layers.36.attention.wq.weight : [1024, 8192] : torch.bfloat16
326 : 1048576 : layers.36.attention.wk.weight : [128, 8192] : torch.bfloat16
327 : 1048576 : layers.36.attention.wv.weight : [128, 8192] : torch.bfloat16
328 : 8388608 : layers.36.attention.wo.weight : [8192, 1024] : torch.bfloat16
329 : 29360128 : layers.36.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
330 : 29360128 : layers.36.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
331 : 29360128 : layers.36.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
332 : 8192 : layers.36.attention_norm.weight : [8192] : torch.bfloat16
333 : 8192 : layers.36.ffn_norm.weight : [8192] : torch.bfloat16
334 : 8388608 : layers.37.attention.wq.weight : [1024, 8192] : torch.bfloat16
335 : 1048576 : layers.37.attention.wk.weight : [128, 8192] : torch.bfloat16
336 : 1048576 : layers.37.attention.wv.weight : [128, 8192] : torch.bfloat16
337 : 8388608 : layers.37.attention.wo.weight : [8192, 1024] : torch.bfloat16
338 : 29360128 : layers.37.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
339 : 29360128 : layers.37.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
340 : 29360128 : layers.37.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
341 : 8192 : layers.37.attention_norm.weight : [8192] : torch.bfloat16
342 : 8192 : layers.37.ffn_norm.weight : [8192] : torch.bfloat16
343 : 8388608 : layers.38.attention.wq.weight : [1024, 8192] : torch.bfloat16
344 : 1048576 : layers.38.attention.wk.weight : [128, 8192] : torch.bfloat16
345 : 1048576 : layers.38.attention.wv.weight : [128, 8192] : torch.bfloat16
346 : 8388608 : layers.38.attention.wo.weight : [8192, 1024] : torch.bfloat16
347 : 29360128 : layers.38.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
348 : 29360128 : layers.38.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
349 : 29360128 : layers.38.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
350 : 8192 : layers.38.attention_norm.weight : [8192] : torch.bfloat16
351 : 8192 : layers.38.ffn_norm.weight : [8192] : torch.bfloat16
352 : 8388608 : layers.39.attention.wq.weight : [1024, 8192] : torch.bfloat16
353 : 1048576 : layers.39.attention.wk.weight : [128, 8192] : torch.bfloat16
354 : 1048576 : layers.39.attention.wv.weight : [128, 8192] : torch.bfloat16
355 : 8388608 : layers.39.attention.wo.weight : [8192, 1024] : torch.bfloat16
356 : 29360128 : layers.39.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
357 : 29360128 : layers.39.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
358 : 29360128 : layers.39.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
359 : 8192 : layers.39.attention_norm.weight : [8192] : torch.bfloat16
360 : 8192 : layers.39.ffn_norm.weight : [8192] : torch.bfloat16
361 : 8388608 : layers.40.attention.wq.weight : [1024, 8192] : torch.bfloat16
362 : 1048576 : layers.40.attention.wk.weight : [128, 8192] : torch.bfloat16
363 : 1048576 : layers.40.attention.wv.weight : [128, 8192] : torch.bfloat16
364 : 8388608 : layers.40.attention.wo.weight : [8192, 1024] : torch.bfloat16
365 : 29360128 : layers.40.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
366 : 29360128 : layers.40.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
367 : 29360128 : layers.40.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
368 : 8192 : layers.40.attention_norm.weight : [8192] : torch.bfloat16
369 : 8192 : layers.40.ffn_norm.weight : [8192] : torch.bfloat16
370 : 8388608 : layers.41.attention.wq.weight : [1024, 8192] : torch.bfloat16
371 : 1048576 : layers.41.attention.wk.weight : [128, 8192] : torch.bfloat16
372 : 1048576 : layers.41.attention.wv.weight : [128, 8192] : torch.bfloat16
373 : 8388608 : layers.41.attention.wo.weight : [8192, 1024] : torch.bfloat16
374 : 29360128 : layers.41.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
375 : 29360128 : layers.41.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
376 : 29360128 : layers.41.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
377 : 8192 : layers.41.attention_norm.weight : [8192] : torch.bfloat16
378 : 8192 : layers.41.ffn_norm.weight : [8192] : torch.bfloat16
379 : 8388608 : layers.42.attention.wq.weight : [1024, 8192] : torch.bfloat16
380 : 1048576 : layers.42.attention.wk.weight : [128, 8192] : torch.bfloat16
381 : 1048576 : layers.42.attention.wv.weight : [128, 8192] : torch.bfloat16
382 : 8388608 : layers.42.attention.wo.weight : [8192, 1024] : torch.bfloat16
383 : 29360128 : layers.42.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
384 : 29360128 : layers.42.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
385 : 29360128 : layers.42.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
386 : 8192 : layers.42.attention_norm.weight : [8192] : torch.bfloat16
387 : 8192 : layers.42.ffn_norm.weight : [8192] : torch.bfloat16
388 : 8388608 : layers.43.attention.wq.weight : [1024, 8192] : torch.bfloat16
389 : 1048576 : layers.43.attention.wk.weight : [128, 8192] : torch.bfloat16
390 : 1048576 : layers.43.attention.wv.weight : [128, 8192] : torch.bfloat16
391 : 8388608 : layers.43.attention.wo.weight : [8192, 1024] : torch.bfloat16
392 : 29360128 : layers.43.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
393 : 29360128 : layers.43.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
394 : 29360128 : layers.43.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
395 : 8192 : layers.43.attention_norm.weight : [8192] : torch.bfloat16
396 : 8192 : layers.43.ffn_norm.weight : [8192] : torch.bfloat16
397 : 8388608 : layers.44.attention.wq.weight : [1024, 8192] : torch.bfloat16
398 : 1048576 : layers.44.attention.wk.weight : [128, 8192] : torch.bfloat16
399 : 1048576 : layers.44.attention.wv.weight : [128, 8192] : torch.bfloat16
400 : 8388608 : layers.44.attention.wo.weight : [8192, 1024] : torch.bfloat16
401 : 29360128 : layers.44.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
402 : 29360128 : layers.44.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
403 : 29360128 : layers.44.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
404 : 8192 : layers.44.attention_norm.weight : [8192] : torch.bfloat16
405 : 8192 : layers.44.ffn_norm.weight : [8192] : torch.bfloat16
406 : 8388608 : layers.45.attention.wq.weight : [1024, 8192] : torch.bfloat16
407 : 1048576 : layers.45.attention.wk.weight : [128, 8192] : torch.bfloat16
408 : 1048576 : layers.45.attention.wv.weight : [128, 8192] : torch.bfloat16
409 : 8388608 : layers.45.attention.wo.weight : [8192, 1024] : torch.bfloat16
410 : 29360128 : layers.45.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
411 : 29360128 : layers.45.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
412 : 29360128 : layers.45.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
413 : 8192 : layers.45.attention_norm.weight : [8192] : torch.bfloat16
414 : 8192 : layers.45.ffn_norm.weight : [8192] : torch.bfloat16
415 : 8388608 : layers.46.attention.wq.weight : [1024, 8192] : torch.bfloat16
416 : 1048576 : layers.46.attention.wk.weight : [128, 8192] : torch.bfloat16
417 : 1048576 : layers.46.attention.wv.weight : [128, 8192] : torch.bfloat16
418 : 8388608 : layers.46.attention.wo.weight : [8192, 1024] : torch.bfloat16
419 : 29360128 : layers.46.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
420 : 29360128 : layers.46.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
421 : 29360128 : layers.46.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
422 : 8192 : layers.46.attention_norm.weight : [8192] : torch.bfloat16
423 : 8192 : layers.46.ffn_norm.weight : [8192] : torch.bfloat16
424 : 8388608 : layers.47.attention.wq.weight : [1024, 8192] : torch.bfloat16
425 : 1048576 : layers.47.attention.wk.weight : [128, 8192] : torch.bfloat16
426 : 1048576 : layers.47.attention.wv.weight : [128, 8192] : torch.bfloat16
427 : 8388608 : layers.47.attention.wo.weight : [8192, 1024] : torch.bfloat16
428 : 29360128 : layers.47.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
429 : 29360128 : layers.47.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
430 : 29360128 : layers.47.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
431 : 8192 : layers.47.attention_norm.weight : [8192] : torch.bfloat16
432 : 8192 : layers.47.ffn_norm.weight : [8192] : torch.bfloat16
433 : 8388608 : layers.48.attention.wq.weight : [1024, 8192] : torch.bfloat16
434 : 1048576 : layers.48.attention.wk.weight : [128, 8192] : torch.bfloat16
435 : 1048576 : layers.48.attention.wv.weight : [128, 8192] : torch.bfloat16
436 : 8388608 : layers.48.attention.wo.weight : [8192, 1024] : torch.bfloat16
437 : 29360128 : layers.48.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
438 : 29360128 : layers.48.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
439 : 29360128 : layers.48.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
440 : 8192 : layers.48.attention_norm.weight : [8192] : torch.bfloat16
441 : 8192 : layers.48.ffn_norm.weight : [8192] : torch.bfloat16
442 : 8388608 : layers.49.attention.wq.weight : [1024, 8192] : torch.bfloat16
443 : 1048576 : layers.49.attention.wk.weight : [128, 8192] : torch.bfloat16
444 : 1048576 : layers.49.attention.wv.weight : [128, 8192] : torch.bfloat16
445 : 8388608 : layers.49.attention.wo.weight : [8192, 1024] : torch.bfloat16
446 : 29360128 : layers.49.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
447 : 29360128 : layers.49.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
448 : 29360128 : layers.49.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
449 : 8192 : layers.49.attention_norm.weight : [8192] : torch.bfloat16
450 : 8192 : layers.49.ffn_norm.weight : [8192] : torch.bfloat16
451 : 8388608 : layers.50.attention.wq.weight : [1024, 8192] : torch.bfloat16
452 : 1048576 : layers.50.attention.wk.weight : [128, 8192] : torch.bfloat16
453 : 1048576 : layers.50.attention.wv.weight : [128, 8192] : torch.bfloat16
454 : 8388608 : layers.50.attention.wo.weight : [8192, 1024] : torch.bfloat16
455 : 29360128 : layers.50.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
456 : 29360128 : layers.50.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
457 : 29360128 : layers.50.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
458 : 8192 : layers.50.attention_norm.weight : [8192] : torch.bfloat16
459 : 8192 : layers.50.ffn_norm.weight : [8192] : torch.bfloat16
460 : 8388608 : layers.51.attention.wq.weight : [1024, 8192] : torch.bfloat16
461 : 1048576 : layers.51.attention.wk.weight : [128, 8192] : torch.bfloat16
462 : 1048576 : layers.51.attention.wv.weight : [128, 8192] : torch.bfloat16
463 : 8388608 : layers.51.attention.wo.weight : [8192, 1024] : torch.bfloat16
464 : 29360128 : layers.51.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
465 : 29360128 : layers.51.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
466 : 29360128 : layers.51.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
467 : 8192 : layers.51.attention_norm.weight : [8192] : torch.bfloat16
468 : 8192 : layers.51.ffn_norm.weight : [8192] : torch.bfloat16
469 : 8388608 : layers.52.attention.wq.weight : [1024, 8192] : torch.bfloat16
470 : 1048576 : layers.52.attention.wk.weight : [128, 8192] : torch.bfloat16
471 : 1048576 : layers.52.attention.wv.weight : [128, 8192] : torch.bfloat16
472 : 8388608 : layers.52.attention.wo.weight : [8192, 1024] : torch.bfloat16
473 : 29360128 : layers.52.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
474 : 29360128 : layers.52.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
475 : 29360128 : layers.52.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
476 : 8192 : layers.52.attention_norm.weight : [8192] : torch.bfloat16
477 : 8192 : layers.52.ffn_norm.weight : [8192] : torch.bfloat16
478 : 8388608 : layers.53.attention.wq.weight : [1024, 8192] : torch.bfloat16
479 : 1048576 : layers.53.attention.wk.weight : [128, 8192] : torch.bfloat16
480 : 1048576 : layers.53.attention.wv.weight : [128, 8192] : torch.bfloat16
481 : 8388608 : layers.53.attention.wo.weight : [8192, 1024] : torch.bfloat16
482 : 29360128 : layers.53.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
483 : 29360128 : layers.53.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
484 : 29360128 : layers.53.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
485 : 8192 : layers.53.attention_norm.weight : [8192] : torch.bfloat16
486 : 8192 : layers.53.ffn_norm.weight : [8192] : torch.bfloat16
487 : 8388608 : layers.54.attention.wq.weight : [1024, 8192] : torch.bfloat16
488 : 1048576 : layers.54.attention.wk.weight : [128, 8192] : torch.bfloat16
489 : 1048576 : layers.54.attention.wv.weight : [128, 8192] : torch.bfloat16
490 : 8388608 : layers.54.attention.wo.weight : [8192, 1024] : torch.bfloat16
491 : 29360128 : layers.54.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
492 : 29360128 : layers.54.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
493 : 29360128 : layers.54.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
494 : 8192 : layers.54.attention_norm.weight : [8192] : torch.bfloat16
495 : 8192 : layers.54.ffn_norm.weight : [8192] : torch.bfloat16
496 : 8388608 : layers.55.attention.wq.weight : [1024, 8192] : torch.bfloat16
497 : 1048576 : layers.55.attention.wk.weight : [128, 8192] : torch.bfloat16
498 : 1048576 : layers.55.attention.wv.weight : [128, 8192] : torch.bfloat16
499 : 8388608 : layers.55.attention.wo.weight : [8192, 1024] : torch.bfloat16
500 : 29360128 : layers.55.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
501 : 29360128 : layers.55.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
502 : 29360128 : layers.55.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
503 : 8192 : layers.55.attention_norm.weight : [8192] : torch.bfloat16
504 : 8192 : layers.55.ffn_norm.weight : [8192] : torch.bfloat16
505 : 8388608 : layers.56.attention.wq.weight : [1024, 8192] : torch.bfloat16
506 : 1048576 : layers.56.attention.wk.weight : [128, 8192] : torch.bfloat16
507 : 1048576 : layers.56.attention.wv.weight : [128, 8192] : torch.bfloat16
508 : 8388608 : layers.56.attention.wo.weight : [8192, 1024] : torch.bfloat16
509 : 29360128 : layers.56.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
510 : 29360128 : layers.56.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
511 : 29360128 : layers.56.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
512 : 8192 : layers.56.attention_norm.weight : [8192] : torch.bfloat16
513 : 8192 : layers.56.ffn_norm.weight : [8192] : torch.bfloat16
514 : 8388608 : layers.57.attention.wq.weight : [1024, 8192] : torch.bfloat16
515 : 1048576 : layers.57.attention.wk.weight : [128, 8192] : torch.bfloat16
516 : 1048576 : layers.57.attention.wv.weight : [128, 8192] : torch.bfloat16
517 : 8388608 : layers.57.attention.wo.weight : [8192, 1024] : torch.bfloat16
518 : 29360128 : layers.57.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
519 : 29360128 : layers.57.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
520 : 29360128 : layers.57.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
521 : 8192 : layers.57.attention_norm.weight : [8192] : torch.bfloat16
522 : 8192 : layers.57.ffn_norm.weight : [8192] : torch.bfloat16
523 : 8388608 : layers.58.attention.wq.weight : [1024, 8192] : torch.bfloat16
524 : 1048576 : layers.58.attention.wk.weight : [128, 8192] : torch.bfloat16
525 : 1048576 : layers.58.attention.wv.weight : [128, 8192] : torch.bfloat16
526 : 8388608 : layers.58.attention.wo.weight : [8192, 1024] : torch.bfloat16
527 : 29360128 : layers.58.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
528 : 29360128 : layers.58.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
529 : 29360128 : layers.58.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
530 : 8192 : layers.58.attention_norm.weight : [8192] : torch.bfloat16
531 : 8192 : layers.58.ffn_norm.weight : [8192] : torch.bfloat16
532 : 8388608 : layers.59.attention.wq.weight : [1024, 8192] : torch.bfloat16
533 : 1048576 : layers.59.attention.wk.weight : [128, 8192] : torch.bfloat16
534 : 1048576 : layers.59.attention.wv.weight : [128, 8192] : torch.bfloat16
535 : 8388608 : layers.59.attention.wo.weight : [8192, 1024] : torch.bfloat16
536 : 29360128 : layers.59.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
537 : 29360128 : layers.59.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
538 : 29360128 : layers.59.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
539 : 8192 : layers.59.attention_norm.weight : [8192] : torch.bfloat16
540 : 8192 : layers.59.ffn_norm.weight : [8192] : torch.bfloat16
541 : 8388608 : layers.60.attention.wq.weight : [1024, 8192] : torch.bfloat16
542 : 1048576 : layers.60.attention.wk.weight : [128, 8192] : torch.bfloat16
543 : 1048576 : layers.60.attention.wv.weight : [128, 8192] : torch.bfloat16
544 : 8388608 : layers.60.attention.wo.weight : [8192, 1024] : torch.bfloat16
545 : 29360128 : layers.60.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
546 : 29360128 : layers.60.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
547 : 29360128 : layers.60.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
548 : 8192 : layers.60.attention_norm.weight : [8192] : torch.bfloat16
549 : 8192 : layers.60.ffn_norm.weight : [8192] : torch.bfloat16
550 : 8388608 : layers.61.attention.wq.weight : [1024, 8192] : torch.bfloat16
551 : 1048576 : layers.61.attention.wk.weight : [128, 8192] : torch.bfloat16
552 : 1048576 : layers.61.attention.wv.weight : [128, 8192] : torch.bfloat16
553 : 8388608 : layers.61.attention.wo.weight : [8192, 1024] : torch.bfloat16
554 : 29360128 : layers.61.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
555 : 29360128 : layers.61.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
556 : 29360128 : layers.61.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
557 : 8192 : layers.61.attention_norm.weight : [8192] : torch.bfloat16
558 : 8192 : layers.61.ffn_norm.weight : [8192] : torch.bfloat16
559 : 8388608 : layers.62.attention.wq.weight : [1024, 8192] : torch.bfloat16
560 : 1048576 : layers.62.attention.wk.weight : [128, 8192] : torch.bfloat16
561 : 1048576 : layers.62.attention.wv.weight : [128, 8192] : torch.bfloat16
562 : 8388608 : layers.62.attention.wo.weight : [8192, 1024] : torch.bfloat16
563 : 29360128 : layers.62.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
564 : 29360128 : layers.62.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
565 : 29360128 : layers.62.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
566 : 8192 : layers.62.attention_norm.weight : [8192] : torch.bfloat16
567 : 8192 : layers.62.ffn_norm.weight : [8192] : torch.bfloat16
568 : 8388608 : layers.63.attention.wq.weight : [1024, 8192] : torch.bfloat16
569 : 1048576 : layers.63.attention.wk.weight : [128, 8192] : torch.bfloat16
570 : 1048576 : layers.63.attention.wv.weight : [128, 8192] : torch.bfloat16
571 : 8388608 : layers.63.attention.wo.weight : [8192, 1024] : torch.bfloat16
572 : 29360128 : layers.63.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
573 : 29360128 : layers.63.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
574 : 29360128 : layers.63.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
575 : 8192 : layers.63.attention_norm.weight : [8192] : torch.bfloat16
576 : 8192 : layers.63.ffn_norm.weight : [8192] : torch.bfloat16
577 : 8388608 : layers.64.attention.wq.weight : [1024, 8192] : torch.bfloat16
578 : 1048576 : layers.64.attention.wk.weight : [128, 8192] : torch.bfloat16
579 : 1048576 : layers.64.attention.wv.weight : [128, 8192] : torch.bfloat16
580 : 8388608 : layers.64.attention.wo.weight : [8192, 1024] : torch.bfloat16
581 : 29360128 : layers.64.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
582 : 29360128 : layers.64.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
583 : 29360128 : layers.64.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
584 : 8192 : layers.64.attention_norm.weight : [8192] : torch.bfloat16
585 : 8192 : layers.64.ffn_norm.weight : [8192] : torch.bfloat16
586 : 8388608 : layers.65.attention.wq.weight : [1024, 8192] : torch.bfloat16
587 : 1048576 : layers.65.attention.wk.weight : [128, 8192] : torch.bfloat16
588 : 1048576 : layers.65.attention.wv.weight : [128, 8192] : torch.bfloat16
589 : 8388608 : layers.65.attention.wo.weight : [8192, 1024] : torch.bfloat16
590 : 29360128 : layers.65.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
591 : 29360128 : layers.65.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
592 : 29360128 : layers.65.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
593 : 8192 : layers.65.attention_norm.weight : [8192] : torch.bfloat16
594 : 8192 : layers.65.ffn_norm.weight : [8192] : torch.bfloat16
595 : 8388608 : layers.66.attention.wq.weight : [1024, 8192] : torch.bfloat16
596 : 1048576 : layers.66.attention.wk.weight : [128, 8192] : torch.bfloat16
597 : 1048576 : layers.66.attention.wv.weight : [128, 8192] : torch.bfloat16
598 : 8388608 : layers.66.attention.wo.weight : [8192, 1024] : torch.bfloat16
599 : 29360128 : layers.66.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
600 : 29360128 : layers.66.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
601 : 29360128 : layers.66.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
602 : 8192 : layers.66.attention_norm.weight : [8192] : torch.bfloat16
603 : 8192 : layers.66.ffn_norm.weight : [8192] : torch.bfloat16
604 : 8388608 : layers.67.attention.wq.weight : [1024, 8192] : torch.bfloat16
605 : 1048576 : layers.67.attention.wk.weight : [128, 8192] : torch.bfloat16
606 : 1048576 : layers.67.attention.wv.weight : [128, 8192] : torch.bfloat16
607 : 8388608 : layers.67.attention.wo.weight : [8192, 1024] : torch.bfloat16
608 : 29360128 : layers.67.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
609 : 29360128 : layers.67.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
610 : 29360128 : layers.67.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
611 : 8192 : layers.67.attention_norm.weight : [8192] : torch.bfloat16
612 : 8192 : layers.67.ffn_norm.weight : [8192] : torch.bfloat16
613 : 8388608 : layers.68.attention.wq.weight : [1024, 8192] : torch.bfloat16
614 : 1048576 : layers.68.attention.wk.weight : [128, 8192] : torch.bfloat16
615 : 1048576 : layers.68.attention.wv.weight : [128, 8192] : torch.bfloat16
616 : 8388608 : layers.68.attention.wo.weight : [8192, 1024] : torch.bfloat16
617 : 29360128 : layers.68.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
618 : 29360128 : layers.68.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
619 : 29360128 : layers.68.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
620 : 8192 : layers.68.attention_norm.weight : [8192] : torch.bfloat16
621 : 8192 : layers.68.ffn_norm.weight : [8192] : torch.bfloat16
622 : 8388608 : layers.69.attention.wq.weight : [1024, 8192] : torch.bfloat16
623 : 1048576 : layers.69.attention.wk.weight : [128, 8192] : torch.bfloat16
624 : 1048576 : layers.69.attention.wv.weight : [128, 8192] : torch.bfloat16
625 : 8388608 : layers.69.attention.wo.weight : [8192, 1024] : torch.bfloat16
626 : 29360128 : layers.69.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
627 : 29360128 : layers.69.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
628 : 29360128 : layers.69.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
629 : 8192 : layers.69.attention_norm.weight : [8192] : torch.bfloat16
630 : 8192 : layers.69.ffn_norm.weight : [8192] : torch.bfloat16
631 : 8388608 : layers.70.attention.wq.weight : [1024, 8192] : torch.bfloat16
632 : 1048576 : layers.70.attention.wk.weight : [128, 8192] : torch.bfloat16
633 : 1048576 : layers.70.attention.wv.weight : [128, 8192] : torch.bfloat16
634 : 8388608 : layers.70.attention.wo.weight : [8192, 1024] : torch.bfloat16
635 : 29360128 : layers.70.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
636 : 29360128 : layers.70.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
637 : 29360128 : layers.70.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
638 : 8192 : layers.70.attention_norm.weight : [8192] : torch.bfloat16
639 : 8192 : layers.70.ffn_norm.weight : [8192] : torch.bfloat16
640 : 8388608 : layers.71.attention.wq.weight : [1024, 8192] : torch.bfloat16
641 : 1048576 : layers.71.attention.wk.weight : [128, 8192] : torch.bfloat16
642 : 1048576 : layers.71.attention.wv.weight : [128, 8192] : torch.bfloat16
643 : 8388608 : layers.71.attention.wo.weight : [8192, 1024] : torch.bfloat16
644 : 29360128 : layers.71.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
645 : 29360128 : layers.71.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
646 : 29360128 : layers.71.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
647 : 8192 : layers.71.attention_norm.weight : [8192] : torch.bfloat16
648 : 8192 : layers.71.ffn_norm.weight : [8192] : torch.bfloat16
649 : 8388608 : layers.72.attention.wq.weight : [1024, 8192] : torch.bfloat16
650 : 1048576 : layers.72.attention.wk.weight : [128, 8192] : torch.bfloat16
651 : 1048576 : layers.72.attention.wv.weight : [128, 8192] : torch.bfloat16
652 : 8388608 : layers.72.attention.wo.weight : [8192, 1024] : torch.bfloat16
653 : 29360128 : layers.72.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
654 : 29360128 : layers.72.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
655 : 29360128 : layers.72.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
656 : 8192 : layers.72.attention_norm.weight : [8192] : torch.bfloat16
657 : 8192 : layers.72.ffn_norm.weight : [8192] : torch.bfloat16
658 : 8388608 : layers.73.attention.wq.weight : [1024, 8192] : torch.bfloat16
659 : 1048576 : layers.73.attention.wk.weight : [128, 8192] : torch.bfloat16
660 : 1048576 : layers.73.attention.wv.weight : [128, 8192] : torch.bfloat16
661 : 8388608 : layers.73.attention.wo.weight : [8192, 1024] : torch.bfloat16
662 : 29360128 : layers.73.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
663 : 29360128 : layers.73.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
664 : 29360128 : layers.73.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
665 : 8192 : layers.73.attention_norm.weight : [8192] : torch.bfloat16
666 : 8192 : layers.73.ffn_norm.weight : [8192] : torch.bfloat16
667 : 8388608 : layers.74.attention.wq.weight : [1024, 8192] : torch.bfloat16
668 : 1048576 : layers.74.attention.wk.weight : [128, 8192] : torch.bfloat16
669 : 1048576 : layers.74.attention.wv.weight : [128, 8192] : torch.bfloat16
670 : 8388608 : layers.74.attention.wo.weight : [8192, 1024] : torch.bfloat16
671 : 29360128 : layers.74.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
672 : 29360128 : layers.74.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
673 : 29360128 : layers.74.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
674 : 8192 : layers.74.attention_norm.weight : [8192] : torch.bfloat16
675 : 8192 : layers.74.ffn_norm.weight : [8192] : torch.bfloat16
676 : 8388608 : layers.75.attention.wq.weight : [1024, 8192] : torch.bfloat16
677 : 1048576 : layers.75.attention.wk.weight : [128, 8192] : torch.bfloat16
678 : 1048576 : layers.75.attention.wv.weight : [128, 8192] : torch.bfloat16
679 : 8388608 : layers.75.attention.wo.weight : [8192, 1024] : torch.bfloat16
680 : 29360128 : layers.75.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
681 : 29360128 : layers.75.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
682 : 29360128 : layers.75.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
683 : 8192 : layers.75.attention_norm.weight : [8192] : torch.bfloat16
684 : 8192 : layers.75.ffn_norm.weight : [8192] : torch.bfloat16
685 : 8388608 : layers.76.attention.wq.weight : [1024, 8192] : torch.bfloat16
686 : 1048576 : layers.76.attention.wk.weight : [128, 8192] : torch.bfloat16
687 : 1048576 : layers.76.attention.wv.weight : [128, 8192] : torch.bfloat16
688 : 8388608 : layers.76.attention.wo.weight : [8192, 1024] : torch.bfloat16
689 : 29360128 : layers.76.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
690 : 29360128 : layers.76.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
691 : 29360128 : layers.76.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
692 : 8192 : layers.76.attention_norm.weight : [8192] : torch.bfloat16
693 : 8192 : layers.76.ffn_norm.weight : [8192] : torch.bfloat16
694 : 8388608 : layers.77.attention.wq.weight : [1024, 8192] : torch.bfloat16
695 : 1048576 : layers.77.attention.wk.weight : [128, 8192] : torch.bfloat16
696 : 1048576 : layers.77.attention.wv.weight : [128, 8192] : torch.bfloat16
697 : 8388608 : layers.77.attention.wo.weight : [8192, 1024] : torch.bfloat16
698 : 29360128 : layers.77.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
699 : 29360128 : layers.77.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
700 : 29360128 : layers.77.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
701 : 8192 : layers.77.attention_norm.weight : [8192] : torch.bfloat16
702 : 8192 : layers.77.ffn_norm.weight : [8192] : torch.bfloat16
703 : 8388608 : layers.78.attention.wq.weight : [1024, 8192] : torch.bfloat16
704 : 1048576 : layers.78.attention.wk.weight : [128, 8192] : torch.bfloat16
705 : 1048576 : layers.78.attention.wv.weight : [128, 8192] : torch.bfloat16
706 : 8388608 : layers.78.attention.wo.weight : [8192, 1024] : torch.bfloat16
707 : 29360128 : layers.78.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
708 : 29360128 : layers.78.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
709 : 29360128 : layers.78.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
710 : 8192 : layers.78.attention_norm.weight : [8192] : torch.bfloat16
711 : 8192 : layers.78.ffn_norm.weight : [8192] : torch.bfloat16
712 : 8388608 : layers.79.attention.wq.weight : [1024, 8192] : torch.bfloat16
713 : 1048576 : layers.79.attention.wk.weight : [128, 8192] : torch.bfloat16
714 : 1048576 : layers.79.attention.wv.weight : [128, 8192] : torch.bfloat16
715 : 8388608 : layers.79.attention.wo.weight : [8192, 1024] : torch.bfloat16
716 : 29360128 : layers.79.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
717 : 29360128 : layers.79.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
718 : 29360128 : layers.79.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
719 : 8192 : layers.79.attention_norm.weight : [8192] : torch.bfloat16
720 : 8192 : layers.79.ffn_norm.weight : [8192] : torch.bfloat16
721 : 8192 : norm.weight : [8192] : torch.bfloat16
722 : 131334144 : output.weight : [16032, 8192] : torch.bfloat16
-----------------------------------------------------------------------------
61742571520 params in total.
123485143040 bytes in total.
40.41 sec, 4.91 sec, 2913.90 MB/s
-----------------------------------------------------------------------------
[8/8]: Loading original/consolidated.07.pth
0 : 131334144 : tok_embeddings.weight : [16032, 8192] : torch.bfloat16
1 : 8388608 : layers.0.attention.wq.weight : [1024, 8192] : torch.bfloat16
2 : 1048576 : layers.0.attention.wk.weight : [128, 8192] : torch.bfloat16
3 : 1048576 : layers.0.attention.wv.weight : [128, 8192] : torch.bfloat16
4 : 8388608 : layers.0.attention.wo.weight : [8192, 1024] : torch.bfloat16
5 : 29360128 : layers.0.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
6 : 29360128 : layers.0.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
7 : 29360128 : layers.0.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
8 : 8192 : layers.0.attention_norm.weight : [8192] : torch.bfloat16
9 : 8192 : layers.0.ffn_norm.weight : [8192] : torch.bfloat16
10 : 8388608 : layers.1.attention.wq.weight : [1024, 8192] : torch.bfloat16
11 : 1048576 : layers.1.attention.wk.weight : [128, 8192] : torch.bfloat16
12 : 1048576 : layers.1.attention.wv.weight : [128, 8192] : torch.bfloat16
13 : 8388608 : layers.1.attention.wo.weight : [8192, 1024] : torch.bfloat16
14 : 29360128 : layers.1.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
15 : 29360128 : layers.1.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
16 : 29360128 : layers.1.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
17 : 8192 : layers.1.attention_norm.weight : [8192] : torch.bfloat16
18 : 8192 : layers.1.ffn_norm.weight : [8192] : torch.bfloat16
19 : 8388608 : layers.2.attention.wq.weight : [1024, 8192] : torch.bfloat16
20 : 1048576 : layers.2.attention.wk.weight : [128, 8192] : torch.bfloat16
21 : 1048576 : layers.2.attention.wv.weight : [128, 8192] : torch.bfloat16
22 : 8388608 : layers.2.attention.wo.weight : [8192, 1024] : torch.bfloat16
23 : 29360128 : layers.2.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
24 : 29360128 : layers.2.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
25 : 29360128 : layers.2.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
26 : 8192 : layers.2.attention_norm.weight : [8192] : torch.bfloat16
27 : 8192 : layers.2.ffn_norm.weight : [8192] : torch.bfloat16
28 : 8388608 : layers.3.attention.wq.weight : [1024, 8192] : torch.bfloat16
29 : 1048576 : layers.3.attention.wk.weight : [128, 8192] : torch.bfloat16
30 : 1048576 : layers.3.attention.wv.weight : [128, 8192] : torch.bfloat16
31 : 8388608 : layers.3.attention.wo.weight : [8192, 1024] : torch.bfloat16
32 : 29360128 : layers.3.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
33 : 29360128 : layers.3.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
34 : 29360128 : layers.3.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
35 : 8192 : layers.3.attention_norm.weight : [8192] : torch.bfloat16
36 : 8192 : layers.3.ffn_norm.weight : [8192] : torch.bfloat16
37 : 8388608 : layers.4.attention.wq.weight : [1024, 8192] : torch.bfloat16
38 : 1048576 : layers.4.attention.wk.weight : [128, 8192] : torch.bfloat16
39 : 1048576 : layers.4.attention.wv.weight : [128, 8192] : torch.bfloat16
40 : 8388608 : layers.4.attention.wo.weight : [8192, 1024] : torch.bfloat16
41 : 29360128 : layers.4.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
42 : 29360128 : layers.4.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
43 : 29360128 : layers.4.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
44 : 8192 : layers.4.attention_norm.weight : [8192] : torch.bfloat16
45 : 8192 : layers.4.ffn_norm.weight : [8192] : torch.bfloat16
46 : 8388608 : layers.5.attention.wq.weight : [1024, 8192] : torch.bfloat16
47 : 1048576 : layers.5.attention.wk.weight : [128, 8192] : torch.bfloat16
48 : 1048576 : layers.5.attention.wv.weight : [128, 8192] : torch.bfloat16
49 : 8388608 : layers.5.attention.wo.weight : [8192, 1024] : torch.bfloat16
50 : 29360128 : layers.5.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
51 : 29360128 : layers.5.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
52 : 29360128 : layers.5.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
53 : 8192 : layers.5.attention_norm.weight : [8192] : torch.bfloat16
54 : 8192 : layers.5.ffn_norm.weight : [8192] : torch.bfloat16
55 : 8388608 : layers.6.attention.wq.weight : [1024, 8192] : torch.bfloat16
56 : 1048576 : layers.6.attention.wk.weight : [128, 8192] : torch.bfloat16
57 : 1048576 : layers.6.attention.wv.weight : [128, 8192] : torch.bfloat16
58 : 8388608 : layers.6.attention.wo.weight : [8192, 1024] : torch.bfloat16
59 : 29360128 : layers.6.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
60 : 29360128 : layers.6.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
61 : 29360128 : layers.6.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
62 : 8192 : layers.6.attention_norm.weight : [8192] : torch.bfloat16
63 : 8192 : layers.6.ffn_norm.weight : [8192] : torch.bfloat16
64 : 8388608 : layers.7.attention.wq.weight : [1024, 8192] : torch.bfloat16
65 : 1048576 : layers.7.attention.wk.weight : [128, 8192] : torch.bfloat16
66 : 1048576 : layers.7.attention.wv.weight : [128, 8192] : torch.bfloat16
67 : 8388608 : layers.7.attention.wo.weight : [8192, 1024] : torch.bfloat16
68 : 29360128 : layers.7.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
69 : 29360128 : layers.7.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
70 : 29360128 : layers.7.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
71 : 8192 : layers.7.attention_norm.weight : [8192] : torch.bfloat16
72 : 8192 : layers.7.ffn_norm.weight : [8192] : torch.bfloat16
73 : 8388608 : layers.8.attention.wq.weight : [1024, 8192] : torch.bfloat16
74 : 1048576 : layers.8.attention.wk.weight : [128, 8192] : torch.bfloat16
75 : 1048576 : layers.8.attention.wv.weight : [128, 8192] : torch.bfloat16
76 : 8388608 : layers.8.attention.wo.weight : [8192, 1024] : torch.bfloat16
77 : 29360128 : layers.8.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
78 : 29360128 : layers.8.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
79 : 29360128 : layers.8.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
80 : 8192 : layers.8.attention_norm.weight : [8192] : torch.bfloat16
81 : 8192 : layers.8.ffn_norm.weight : [8192] : torch.bfloat16
82 : 8388608 : layers.9.attention.wq.weight : [1024, 8192] : torch.bfloat16
83 : 1048576 : layers.9.attention.wk.weight : [128, 8192] : torch.bfloat16
84 : 1048576 : layers.9.attention.wv.weight : [128, 8192] : torch.bfloat16
85 : 8388608 : layers.9.attention.wo.weight : [8192, 1024] : torch.bfloat16
86 : 29360128 : layers.9.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
87 : 29360128 : layers.9.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
88 : 29360128 : layers.9.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
89 : 8192 : layers.9.attention_norm.weight : [8192] : torch.bfloat16
90 : 8192 : layers.9.ffn_norm.weight : [8192] : torch.bfloat16
91 : 8388608 : layers.10.attention.wq.weight : [1024, 8192] : torch.bfloat16
92 : 1048576 : layers.10.attention.wk.weight : [128, 8192] : torch.bfloat16
93 : 1048576 : layers.10.attention.wv.weight : [128, 8192] : torch.bfloat16
94 : 8388608 : layers.10.attention.wo.weight : [8192, 1024] : torch.bfloat16
95 : 29360128 : layers.10.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
96 : 29360128 : layers.10.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
97 : 29360128 : layers.10.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
98 : 8192 : layers.10.attention_norm.weight : [8192] : torch.bfloat16
99 : 8192 : layers.10.ffn_norm.weight : [8192] : torch.bfloat16
100 : 8388608 : layers.11.attention.wq.weight : [1024, 8192] : torch.bfloat16
101 : 1048576 : layers.11.attention.wk.weight : [128, 8192] : torch.bfloat16
102 : 1048576 : layers.11.attention.wv.weight : [128, 8192] : torch.bfloat16
103 : 8388608 : layers.11.attention.wo.weight : [8192, 1024] : torch.bfloat16
104 : 29360128 : layers.11.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
105 : 29360128 : layers.11.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
106 : 29360128 : layers.11.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
107 : 8192 : layers.11.attention_norm.weight : [8192] : torch.bfloat16
108 : 8192 : layers.11.ffn_norm.weight : [8192] : torch.bfloat16
109 : 8388608 : layers.12.attention.wq.weight : [1024, 8192] : torch.bfloat16
110 : 1048576 : layers.12.attention.wk.weight : [128, 8192] : torch.bfloat16
111 : 1048576 : layers.12.attention.wv.weight : [128, 8192] : torch.bfloat16
112 : 8388608 : layers.12.attention.wo.weight : [8192, 1024] : torch.bfloat16
113 : 29360128 : layers.12.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
114 : 29360128 : layers.12.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
115 : 29360128 : layers.12.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
116 : 8192 : layers.12.attention_norm.weight : [8192] : torch.bfloat16
117 : 8192 : layers.12.ffn_norm.weight : [8192] : torch.bfloat16
118 : 8388608 : layers.13.attention.wq.weight : [1024, 8192] : torch.bfloat16
119 : 1048576 : layers.13.attention.wk.weight : [128, 8192] : torch.bfloat16
120 : 1048576 : layers.13.attention.wv.weight : [128, 8192] : torch.bfloat16
121 : 8388608 : layers.13.attention.wo.weight : [8192, 1024] : torch.bfloat16
122 : 29360128 : layers.13.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
123 : 29360128 : layers.13.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
124 : 29360128 : layers.13.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
125 : 8192 : layers.13.attention_norm.weight : [8192] : torch.bfloat16
126 : 8192 : layers.13.ffn_norm.weight : [8192] : torch.bfloat16
127 : 8388608 : layers.14.attention.wq.weight : [1024, 8192] : torch.bfloat16
128 : 1048576 : layers.14.attention.wk.weight : [128, 8192] : torch.bfloat16
129 : 1048576 : layers.14.attention.wv.weight : [128, 8192] : torch.bfloat16
130 : 8388608 : layers.14.attention.wo.weight : [8192, 1024] : torch.bfloat16
131 : 29360128 : layers.14.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
132 : 29360128 : layers.14.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
133 : 29360128 : layers.14.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
134 : 8192 : layers.14.attention_norm.weight : [8192] : torch.bfloat16
135 : 8192 : layers.14.ffn_norm.weight : [8192] : torch.bfloat16
136 : 8388608 : layers.15.attention.wq.weight : [1024, 8192] : torch.bfloat16
137 : 1048576 : layers.15.attention.wk.weight : [128, 8192] : torch.bfloat16
138 : 1048576 : layers.15.attention.wv.weight : [128, 8192] : torch.bfloat16
139 : 8388608 : layers.15.attention.wo.weight : [8192, 1024] : torch.bfloat16
140 : 29360128 : layers.15.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
141 : 29360128 : layers.15.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
142 : 29360128 : layers.15.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
143 : 8192 : layers.15.attention_norm.weight : [8192] : torch.bfloat16
144 : 8192 : layers.15.ffn_norm.weight : [8192] : torch.bfloat16
145 : 8388608 : layers.16.attention.wq.weight : [1024, 8192] : torch.bfloat16
146 : 1048576 : layers.16.attention.wk.weight : [128, 8192] : torch.bfloat16
147 : 1048576 : layers.16.attention.wv.weight : [128, 8192] : torch.bfloat16
148 : 8388608 : layers.16.attention.wo.weight : [8192, 1024] : torch.bfloat16
149 : 29360128 : layers.16.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
150 : 29360128 : layers.16.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
151 : 29360128 : layers.16.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
152 : 8192 : layers.16.attention_norm.weight : [8192] : torch.bfloat16
153 : 8192 : layers.16.ffn_norm.weight : [8192] : torch.bfloat16
154 : 8388608 : layers.17.attention.wq.weight : [1024, 8192] : torch.bfloat16
155 : 1048576 : layers.17.attention.wk.weight : [128, 8192] : torch.bfloat16
156 : 1048576 : layers.17.attention.wv.weight : [128, 8192] : torch.bfloat16
157 : 8388608 : layers.17.attention.wo.weight : [8192, 1024] : torch.bfloat16
158 : 29360128 : layers.17.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
159 : 29360128 : layers.17.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
160 : 29360128 : layers.17.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
161 : 8192 : layers.17.attention_norm.weight : [8192] : torch.bfloat16
162 : 8192 : layers.17.ffn_norm.weight : [8192] : torch.bfloat16
163 : 8388608 : layers.18.attention.wq.weight : [1024, 8192] : torch.bfloat16
164 : 1048576 : layers.18.attention.wk.weight : [128, 8192] : torch.bfloat16
165 : 1048576 : layers.18.attention.wv.weight : [128, 8192] : torch.bfloat16
166 : 8388608 : layers.18.attention.wo.weight : [8192, 1024] : torch.bfloat16
167 : 29360128 : layers.18.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
168 : 29360128 : layers.18.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
169 : 29360128 : layers.18.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
170 : 8192 : layers.18.attention_norm.weight : [8192] : torch.bfloat16
171 : 8192 : layers.18.ffn_norm.weight : [8192] : torch.bfloat16
172 : 8388608 : layers.19.attention.wq.weight : [1024, 8192] : torch.bfloat16
173 : 1048576 : layers.19.attention.wk.weight : [128, 8192] : torch.bfloat16
174 : 1048576 : layers.19.attention.wv.weight : [128, 8192] : torch.bfloat16
175 : 8388608 : layers.19.attention.wo.weight : [8192, 1024] : torch.bfloat16
176 : 29360128 : layers.19.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
177 : 29360128 : layers.19.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
178 : 29360128 : layers.19.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
179 : 8192 : layers.19.attention_norm.weight : [8192] : torch.bfloat16
180 : 8192 : layers.19.ffn_norm.weight : [8192] : torch.bfloat16
181 : 8388608 : layers.20.attention.wq.weight : [1024, 8192] : torch.bfloat16
182 : 1048576 : layers.20.attention.wk.weight : [128, 8192] : torch.bfloat16
183 : 1048576 : layers.20.attention.wv.weight : [128, 8192] : torch.bfloat16
184 : 8388608 : layers.20.attention.wo.weight : [8192, 1024] : torch.bfloat16
185 : 29360128 : layers.20.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
186 : 29360128 : layers.20.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
187 : 29360128 : layers.20.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
188 : 8192 : layers.20.attention_norm.weight : [8192] : torch.bfloat16
189 : 8192 : layers.20.ffn_norm.weight : [8192] : torch.bfloat16
190 : 8388608 : layers.21.attention.wq.weight : [1024, 8192] : torch.bfloat16
191 : 1048576 : layers.21.attention.wk.weight : [128, 8192] : torch.bfloat16
192 : 1048576 : layers.21.attention.wv.weight : [128, 8192] : torch.bfloat16
193 : 8388608 : layers.21.attention.wo.weight : [8192, 1024] : torch.bfloat16
194 : 29360128 : layers.21.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
195 : 29360128 : layers.21.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
196 : 29360128 : layers.21.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
197 : 8192 : layers.21.attention_norm.weight : [8192] : torch.bfloat16
198 : 8192 : layers.21.ffn_norm.weight : [8192] : torch.bfloat16
199 : 8388608 : layers.22.attention.wq.weight : [1024, 8192] : torch.bfloat16
200 : 1048576 : layers.22.attention.wk.weight : [128, 8192] : torch.bfloat16
201 : 1048576 : layers.22.attention.wv.weight : [128, 8192] : torch.bfloat16
202 : 8388608 : layers.22.attention.wo.weight : [8192, 1024] : torch.bfloat16
203 : 29360128 : layers.22.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
204 : 29360128 : layers.22.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
205 : 29360128 : layers.22.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
206 : 8192 : layers.22.attention_norm.weight : [8192] : torch.bfloat16
207 : 8192 : layers.22.ffn_norm.weight : [8192] : torch.bfloat16
208 : 8388608 : layers.23.attention.wq.weight : [1024, 8192] : torch.bfloat16
209 : 1048576 : layers.23.attention.wk.weight : [128, 8192] : torch.bfloat16
210 : 1048576 : layers.23.attention.wv.weight : [128, 8192] : torch.bfloat16
211 : 8388608 : layers.23.attention.wo.weight : [8192, 1024] : torch.bfloat16
212 : 29360128 : layers.23.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
213 : 29360128 : layers.23.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
214 : 29360128 : layers.23.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
215 : 8192 : layers.23.attention_norm.weight : [8192] : torch.bfloat16
216 : 8192 : layers.23.ffn_norm.weight : [8192] : torch.bfloat16
217 : 8388608 : layers.24.attention.wq.weight : [1024, 8192] : torch.bfloat16
218 : 1048576 : layers.24.attention.wk.weight : [128, 8192] : torch.bfloat16
219 : 1048576 : layers.24.attention.wv.weight : [128, 8192] : torch.bfloat16
220 : 8388608 : layers.24.attention.wo.weight : [8192, 1024] : torch.bfloat16
221 : 29360128 : layers.24.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
222 : 29360128 : layers.24.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
223 : 29360128 : layers.24.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
224 : 8192 : layers.24.attention_norm.weight : [8192] : torch.bfloat16
225 : 8192 : layers.24.ffn_norm.weight : [8192] : torch.bfloat16
226 : 8388608 : layers.25.attention.wq.weight : [1024, 8192] : torch.bfloat16
227 : 1048576 : layers.25.attention.wk.weight : [128, 8192] : torch.bfloat16
228 : 1048576 : layers.25.attention.wv.weight : [128, 8192] : torch.bfloat16
229 : 8388608 : layers.25.attention.wo.weight : [8192, 1024] : torch.bfloat16
230 : 29360128 : layers.25.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
231 : 29360128 : layers.25.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
232 : 29360128 : layers.25.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
233 : 8192 : layers.25.attention_norm.weight : [8192] : torch.bfloat16
234 : 8192 : layers.25.ffn_norm.weight : [8192] : torch.bfloat16
235 : 8388608 : layers.26.attention.wq.weight : [1024, 8192] : torch.bfloat16
236 : 1048576 : layers.26.attention.wk.weight : [128, 8192] : torch.bfloat16
237 : 1048576 : layers.26.attention.wv.weight : [128, 8192] : torch.bfloat16
238 : 8388608 : layers.26.attention.wo.weight : [8192, 1024] : torch.bfloat16
239 : 29360128 : layers.26.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
240 : 29360128 : layers.26.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
241 : 29360128 : layers.26.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
242 : 8192 : layers.26.attention_norm.weight : [8192] : torch.bfloat16
243 : 8192 : layers.26.ffn_norm.weight : [8192] : torch.bfloat16
244 : 8388608 : layers.27.attention.wq.weight : [1024, 8192] : torch.bfloat16
245 : 1048576 : layers.27.attention.wk.weight : [128, 8192] : torch.bfloat16
246 : 1048576 : layers.27.attention.wv.weight : [128, 8192] : torch.bfloat16
247 : 8388608 : layers.27.attention.wo.weight : [8192, 1024] : torch.bfloat16
248 : 29360128 : layers.27.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
249 : 29360128 : layers.27.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
250 : 29360128 : layers.27.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
251 : 8192 : layers.27.attention_norm.weight : [8192] : torch.bfloat16
252 : 8192 : layers.27.ffn_norm.weight : [8192] : torch.bfloat16
253 : 8388608 : layers.28.attention.wq.weight : [1024, 8192] : torch.bfloat16
254 : 1048576 : layers.28.attention.wk.weight : [128, 8192] : torch.bfloat16
255 : 1048576 : layers.28.attention.wv.weight : [128, 8192] : torch.bfloat16
256 : 8388608 : layers.28.attention.wo.weight : [8192, 1024] : torch.bfloat16
257 : 29360128 : layers.28.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
258 : 29360128 : layers.28.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
259 : 29360128 : layers.28.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
260 : 8192 : layers.28.attention_norm.weight : [8192] : torch.bfloat16
261 : 8192 : layers.28.ffn_norm.weight : [8192] : torch.bfloat16
262 : 8388608 : layers.29.attention.wq.weight : [1024, 8192] : torch.bfloat16
263 : 1048576 : layers.29.attention.wk.weight : [128, 8192] : torch.bfloat16
264 : 1048576 : layers.29.attention.wv.weight : [128, 8192] : torch.bfloat16
265 : 8388608 : layers.29.attention.wo.weight : [8192, 1024] : torch.bfloat16
266 : 29360128 : layers.29.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
267 : 29360128 : layers.29.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
268 : 29360128 : layers.29.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
269 : 8192 : layers.29.attention_norm.weight : [8192] : torch.bfloat16
270 : 8192 : layers.29.ffn_norm.weight : [8192] : torch.bfloat16
271 : 8388608 : layers.30.attention.wq.weight : [1024, 8192] : torch.bfloat16
272 : 1048576 : layers.30.attention.wk.weight : [128, 8192] : torch.bfloat16
273 : 1048576 : layers.30.attention.wv.weight : [128, 8192] : torch.bfloat16
274 : 8388608 : layers.30.attention.wo.weight : [8192, 1024] : torch.bfloat16
275 : 29360128 : layers.30.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
276 : 29360128 : layers.30.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
277 : 29360128 : layers.30.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
278 : 8192 : layers.30.attention_norm.weight : [8192] : torch.bfloat16
279 : 8192 : layers.30.ffn_norm.weight : [8192] : torch.bfloat16
280 : 8388608 : layers.31.attention.wq.weight : [1024, 8192] : torch.bfloat16
281 : 1048576 : layers.31.attention.wk.weight : [128, 8192] : torch.bfloat16
282 : 1048576 : layers.31.attention.wv.weight : [128, 8192] : torch.bfloat16
283 : 8388608 : layers.31.attention.wo.weight : [8192, 1024] : torch.bfloat16
284 : 29360128 : layers.31.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
285 : 29360128 : layers.31.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
286 : 29360128 : layers.31.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
287 : 8192 : layers.31.attention_norm.weight : [8192] : torch.bfloat16
288 : 8192 : layers.31.ffn_norm.weight : [8192] : torch.bfloat16
289 : 8388608 : layers.32.attention.wq.weight : [1024, 8192] : torch.bfloat16
290 : 1048576 : layers.32.attention.wk.weight : [128, 8192] : torch.bfloat16
291 : 1048576 : layers.32.attention.wv.weight : [128, 8192] : torch.bfloat16
292 : 8388608 : layers.32.attention.wo.weight : [8192, 1024] : torch.bfloat16
293 : 29360128 : layers.32.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
294 : 29360128 : layers.32.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
295 : 29360128 : layers.32.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
296 : 8192 : layers.32.attention_norm.weight : [8192] : torch.bfloat16
297 : 8192 : layers.32.ffn_norm.weight : [8192] : torch.bfloat16
298 : 8388608 : layers.33.attention.wq.weight : [1024, 8192] : torch.bfloat16
299 : 1048576 : layers.33.attention.wk.weight : [128, 8192] : torch.bfloat16
300 : 1048576 : layers.33.attention.wv.weight : [128, 8192] : torch.bfloat16
301 : 8388608 : layers.33.attention.wo.weight : [8192, 1024] : torch.bfloat16
302 : 29360128 : layers.33.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
303 : 29360128 : layers.33.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
304 : 29360128 : layers.33.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
305 : 8192 : layers.33.attention_norm.weight : [8192] : torch.bfloat16
306 : 8192 : layers.33.ffn_norm.weight : [8192] : torch.bfloat16
307 : 8388608 : layers.34.attention.wq.weight : [1024, 8192] : torch.bfloat16
308 : 1048576 : layers.34.attention.wk.weight : [128, 8192] : torch.bfloat16
309 : 1048576 : layers.34.attention.wv.weight : [128, 8192] : torch.bfloat16
310 : 8388608 : layers.34.attention.wo.weight : [8192, 1024] : torch.bfloat16
311 : 29360128 : layers.34.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
312 : 29360128 : layers.34.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
313 : 29360128 : layers.34.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
314 : 8192 : layers.34.attention_norm.weight : [8192] : torch.bfloat16
315 : 8192 : layers.34.ffn_norm.weight : [8192] : torch.bfloat16
316 : 8388608 : layers.35.attention.wq.weight : [1024, 8192] : torch.bfloat16
317 : 1048576 : layers.35.attention.wk.weight : [128, 8192] : torch.bfloat16
318 : 1048576 : layers.35.attention.wv.weight : [128, 8192] : torch.bfloat16
319 : 8388608 : layers.35.attention.wo.weight : [8192, 1024] : torch.bfloat16
320 : 29360128 : layers.35.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
321 : 29360128 : layers.35.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
322 : 29360128 : layers.35.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
323 : 8192 : layers.35.attention_norm.weight : [8192] : torch.bfloat16
324 : 8192 : layers.35.ffn_norm.weight : [8192] : torch.bfloat16
325 : 8388608 : layers.36.attention.wq.weight : [1024, 8192] : torch.bfloat16
326 : 1048576 : layers.36.attention.wk.weight : [128, 8192] : torch.bfloat16
327 : 1048576 : layers.36.attention.wv.weight : [128, 8192] : torch.bfloat16
328 : 8388608 : layers.36.attention.wo.weight : [8192, 1024] : torch.bfloat16
329 : 29360128 : layers.36.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
330 : 29360128 : layers.36.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
331 : 29360128 : layers.36.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
332 : 8192 : layers.36.attention_norm.weight : [8192] : torch.bfloat16
333 : 8192 : layers.36.ffn_norm.weight : [8192] : torch.bfloat16
334 : 8388608 : layers.37.attention.wq.weight : [1024, 8192] : torch.bfloat16
335 : 1048576 : layers.37.attention.wk.weight : [128, 8192] : torch.bfloat16
336 : 1048576 : layers.37.attention.wv.weight : [128, 8192] : torch.bfloat16
337 : 8388608 : layers.37.attention.wo.weight : [8192, 1024] : torch.bfloat16
338 : 29360128 : layers.37.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
339 : 29360128 : layers.37.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
340 : 29360128 : layers.37.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
341 : 8192 : layers.37.attention_norm.weight : [8192] : torch.bfloat16
342 : 8192 : layers.37.ffn_norm.weight : [8192] : torch.bfloat16
343 : 8388608 : layers.38.attention.wq.weight : [1024, 8192] : torch.bfloat16
344 : 1048576 : layers.38.attention.wk.weight : [128, 8192] : torch.bfloat16
345 : 1048576 : layers.38.attention.wv.weight : [128, 8192] : torch.bfloat16
346 : 8388608 : layers.38.attention.wo.weight : [8192, 1024] : torch.bfloat16
347 : 29360128 : layers.38.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
348 : 29360128 : layers.38.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
349 : 29360128 : layers.38.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
350 : 8192 : layers.38.attention_norm.weight : [8192] : torch.bfloat16
351 : 8192 : layers.38.ffn_norm.weight : [8192] : torch.bfloat16
352 : 8388608 : layers.39.attention.wq.weight : [1024, 8192] : torch.bfloat16
353 : 1048576 : layers.39.attention.wk.weight : [128, 8192] : torch.bfloat16
354 : 1048576 : layers.39.attention.wv.weight : [128, 8192] : torch.bfloat16
355 : 8388608 : layers.39.attention.wo.weight : [8192, 1024] : torch.bfloat16
356 : 29360128 : layers.39.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
357 : 29360128 : layers.39.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
358 : 29360128 : layers.39.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
359 : 8192 : layers.39.attention_norm.weight : [8192] : torch.bfloat16
360 : 8192 : layers.39.ffn_norm.weight : [8192] : torch.bfloat16
361 : 8388608 : layers.40.attention.wq.weight : [1024, 8192] : torch.bfloat16
362 : 1048576 : layers.40.attention.wk.weight : [128, 8192] : torch.bfloat16
363 : 1048576 : layers.40.attention.wv.weight : [128, 8192] : torch.bfloat16
364 : 8388608 : layers.40.attention.wo.weight : [8192, 1024] : torch.bfloat16
365 : 29360128 : layers.40.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
366 : 29360128 : layers.40.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
367 : 29360128 : layers.40.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
368 : 8192 : layers.40.attention_norm.weight : [8192] : torch.bfloat16
369 : 8192 : layers.40.ffn_norm.weight : [8192] : torch.bfloat16
370 : 8388608 : layers.41.attention.wq.weight : [1024, 8192] : torch.bfloat16
371 : 1048576 : layers.41.attention.wk.weight : [128, 8192] : torch.bfloat16
372 : 1048576 : layers.41.attention.wv.weight : [128, 8192] : torch.bfloat16
373 : 8388608 : layers.41.attention.wo.weight : [8192, 1024] : torch.bfloat16
374 : 29360128 : layers.41.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
375 : 29360128 : layers.41.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
376 : 29360128 : layers.41.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
377 : 8192 : layers.41.attention_norm.weight : [8192] : torch.bfloat16
378 : 8192 : layers.41.ffn_norm.weight : [8192] : torch.bfloat16
379 : 8388608 : layers.42.attention.wq.weight : [1024, 8192] : torch.bfloat16
380 : 1048576 : layers.42.attention.wk.weight : [128, 8192] : torch.bfloat16
381 : 1048576 : layers.42.attention.wv.weight : [128, 8192] : torch.bfloat16
382 : 8388608 : layers.42.attention.wo.weight : [8192, 1024] : torch.bfloat16
383 : 29360128 : layers.42.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
384 : 29360128 : layers.42.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
385 : 29360128 : layers.42.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
386 : 8192 : layers.42.attention_norm.weight : [8192] : torch.bfloat16
387 : 8192 : layers.42.ffn_norm.weight : [8192] : torch.bfloat16
388 : 8388608 : layers.43.attention.wq.weight : [1024, 8192] : torch.bfloat16
389 : 1048576 : layers.43.attention.wk.weight : [128, 8192] : torch.bfloat16
390 : 1048576 : layers.43.attention.wv.weight : [128, 8192] : torch.bfloat16
391 : 8388608 : layers.43.attention.wo.weight : [8192, 1024] : torch.bfloat16
392 : 29360128 : layers.43.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
393 : 29360128 : layers.43.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
394 : 29360128 : layers.43.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
395 : 8192 : layers.43.attention_norm.weight : [8192] : torch.bfloat16
396 : 8192 : layers.43.ffn_norm.weight : [8192] : torch.bfloat16
397 : 8388608 : layers.44.attention.wq.weight : [1024, 8192] : torch.bfloat16
398 : 1048576 : layers.44.attention.wk.weight : [128, 8192] : torch.bfloat16
399 : 1048576 : layers.44.attention.wv.weight : [128, 8192] : torch.bfloat16
400 : 8388608 : layers.44.attention.wo.weight : [8192, 1024] : torch.bfloat16
401 : 29360128 : layers.44.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
402 : 29360128 : layers.44.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
403 : 29360128 : layers.44.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
404 : 8192 : layers.44.attention_norm.weight : [8192] : torch.bfloat16
405 : 8192 : layers.44.ffn_norm.weight : [8192] : torch.bfloat16
406 : 8388608 : layers.45.attention.wq.weight : [1024, 8192] : torch.bfloat16
407 : 1048576 : layers.45.attention.wk.weight : [128, 8192] : torch.bfloat16
408 : 1048576 : layers.45.attention.wv.weight : [128, 8192] : torch.bfloat16
409 : 8388608 : layers.45.attention.wo.weight : [8192, 1024] : torch.bfloat16
410 : 29360128 : layers.45.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
411 : 29360128 : layers.45.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
412 : 29360128 : layers.45.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
413 : 8192 : layers.45.attention_norm.weight : [8192] : torch.bfloat16
414 : 8192 : layers.45.ffn_norm.weight : [8192] : torch.bfloat16
415 : 8388608 : layers.46.attention.wq.weight : [1024, 8192] : torch.bfloat16
416 : 1048576 : layers.46.attention.wk.weight : [128, 8192] : torch.bfloat16
417 : 1048576 : layers.46.attention.wv.weight : [128, 8192] : torch.bfloat16
418 : 8388608 : layers.46.attention.wo.weight : [8192, 1024] : torch.bfloat16
419 : 29360128 : layers.46.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
420 : 29360128 : layers.46.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
421 : 29360128 : layers.46.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
422 : 8192 : layers.46.attention_norm.weight : [8192] : torch.bfloat16
423 : 8192 : layers.46.ffn_norm.weight : [8192] : torch.bfloat16
424 : 8388608 : layers.47.attention.wq.weight : [1024, 8192] : torch.bfloat16
425 : 1048576 : layers.47.attention.wk.weight : [128, 8192] : torch.bfloat16
426 : 1048576 : layers.47.attention.wv.weight : [128, 8192] : torch.bfloat16
427 : 8388608 : layers.47.attention.wo.weight : [8192, 1024] : torch.bfloat16
428 : 29360128 : layers.47.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
429 : 29360128 : layers.47.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
430 : 29360128 : layers.47.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
431 : 8192 : layers.47.attention_norm.weight : [8192] : torch.bfloat16
432 : 8192 : layers.47.ffn_norm.weight : [8192] : torch.bfloat16
433 : 8388608 : layers.48.attention.wq.weight : [1024, 8192] : torch.bfloat16
434 : 1048576 : layers.48.attention.wk.weight : [128, 8192] : torch.bfloat16
435 : 1048576 : layers.48.attention.wv.weight : [128, 8192] : torch.bfloat16
436 : 8388608 : layers.48.attention.wo.weight : [8192, 1024] : torch.bfloat16
437 : 29360128 : layers.48.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
438 : 29360128 : layers.48.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
439 : 29360128 : layers.48.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
440 : 8192 : layers.48.attention_norm.weight : [8192] : torch.bfloat16
441 : 8192 : layers.48.ffn_norm.weight : [8192] : torch.bfloat16
442 : 8388608 : layers.49.attention.wq.weight : [1024, 8192] : torch.bfloat16
443 : 1048576 : layers.49.attention.wk.weight : [128, 8192] : torch.bfloat16
444 : 1048576 : layers.49.attention.wv.weight : [128, 8192] : torch.bfloat16
445 : 8388608 : layers.49.attention.wo.weight : [8192, 1024] : torch.bfloat16
446 : 29360128 : layers.49.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
447 : 29360128 : layers.49.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
448 : 29360128 : layers.49.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
449 : 8192 : layers.49.attention_norm.weight : [8192] : torch.bfloat16
450 : 8192 : layers.49.ffn_norm.weight : [8192] : torch.bfloat16
451 : 8388608 : layers.50.attention.wq.weight : [1024, 8192] : torch.bfloat16
452 : 1048576 : layers.50.attention.wk.weight : [128, 8192] : torch.bfloat16
453 : 1048576 : layers.50.attention.wv.weight : [128, 8192] : torch.bfloat16
454 : 8388608 : layers.50.attention.wo.weight : [8192, 1024] : torch.bfloat16
455 : 29360128 : layers.50.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
456 : 29360128 : layers.50.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
457 : 29360128 : layers.50.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
458 : 8192 : layers.50.attention_norm.weight : [8192] : torch.bfloat16
459 : 8192 : layers.50.ffn_norm.weight : [8192] : torch.bfloat16
460 : 8388608 : layers.51.attention.wq.weight : [1024, 8192] : torch.bfloat16
461 : 1048576 : layers.51.attention.wk.weight : [128, 8192] : torch.bfloat16
462 : 1048576 : layers.51.attention.wv.weight : [128, 8192] : torch.bfloat16
463 : 8388608 : layers.51.attention.wo.weight : [8192, 1024] : torch.bfloat16
464 : 29360128 : layers.51.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
465 : 29360128 : layers.51.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
466 : 29360128 : layers.51.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
467 : 8192 : layers.51.attention_norm.weight : [8192] : torch.bfloat16
468 : 8192 : layers.51.ffn_norm.weight : [8192] : torch.bfloat16
469 : 8388608 : layers.52.attention.wq.weight : [1024, 8192] : torch.bfloat16
470 : 1048576 : layers.52.attention.wk.weight : [128, 8192] : torch.bfloat16
471 : 1048576 : layers.52.attention.wv.weight : [128, 8192] : torch.bfloat16
472 : 8388608 : layers.52.attention.wo.weight : [8192, 1024] : torch.bfloat16
473 : 29360128 : layers.52.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
474 : 29360128 : layers.52.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
475 : 29360128 : layers.52.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
476 : 8192 : layers.52.attention_norm.weight : [8192] : torch.bfloat16
477 : 8192 : layers.52.ffn_norm.weight : [8192] : torch.bfloat16
478 : 8388608 : layers.53.attention.wq.weight : [1024, 8192] : torch.bfloat16
479 : 1048576 : layers.53.attention.wk.weight : [128, 8192] : torch.bfloat16
480 : 1048576 : layers.53.attention.wv.weight : [128, 8192] : torch.bfloat16
481 : 8388608 : layers.53.attention.wo.weight : [8192, 1024] : torch.bfloat16
482 : 29360128 : layers.53.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
483 : 29360128 : layers.53.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
484 : 29360128 : layers.53.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
485 : 8192 : layers.53.attention_norm.weight : [8192] : torch.bfloat16
486 : 8192 : layers.53.ffn_norm.weight : [8192] : torch.bfloat16
487 : 8388608 : layers.54.attention.wq.weight : [1024, 8192] : torch.bfloat16
488 : 1048576 : layers.54.attention.wk.weight : [128, 8192] : torch.bfloat16
489 : 1048576 : layers.54.attention.wv.weight : [128, 8192] : torch.bfloat16
490 : 8388608 : layers.54.attention.wo.weight : [8192, 1024] : torch.bfloat16
491 : 29360128 : layers.54.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
492 : 29360128 : layers.54.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
493 : 29360128 : layers.54.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
494 : 8192 : layers.54.attention_norm.weight : [8192] : torch.bfloat16
495 : 8192 : layers.54.ffn_norm.weight : [8192] : torch.bfloat16
496 : 8388608 : layers.55.attention.wq.weight : [1024, 8192] : torch.bfloat16
497 : 1048576 : layers.55.attention.wk.weight : [128, 8192] : torch.bfloat16
498 : 1048576 : layers.55.attention.wv.weight : [128, 8192] : torch.bfloat16
499 : 8388608 : layers.55.attention.wo.weight : [8192, 1024] : torch.bfloat16
500 : 29360128 : layers.55.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
501 : 29360128 : layers.55.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
502 : 29360128 : layers.55.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
503 : 8192 : layers.55.attention_norm.weight : [8192] : torch.bfloat16
504 : 8192 : layers.55.ffn_norm.weight : [8192] : torch.bfloat16
505 : 8388608 : layers.56.attention.wq.weight : [1024, 8192] : torch.bfloat16
506 : 1048576 : layers.56.attention.wk.weight : [128, 8192] : torch.bfloat16
507 : 1048576 : layers.56.attention.wv.weight : [128, 8192] : torch.bfloat16
508 : 8388608 : layers.56.attention.wo.weight : [8192, 1024] : torch.bfloat16
509 : 29360128 : layers.56.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
510 : 29360128 : layers.56.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
511 : 29360128 : layers.56.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
512 : 8192 : layers.56.attention_norm.weight : [8192] : torch.bfloat16
513 : 8192 : layers.56.ffn_norm.weight : [8192] : torch.bfloat16
514 : 8388608 : layers.57.attention.wq.weight : [1024, 8192] : torch.bfloat16
515 : 1048576 : layers.57.attention.wk.weight : [128, 8192] : torch.bfloat16
516 : 1048576 : layers.57.attention.wv.weight : [128, 8192] : torch.bfloat16
517 : 8388608 : layers.57.attention.wo.weight : [8192, 1024] : torch.bfloat16
518 : 29360128 : layers.57.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
519 : 29360128 : layers.57.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
520 : 29360128 : layers.57.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
521 : 8192 : layers.57.attention_norm.weight : [8192] : torch.bfloat16
522 : 8192 : layers.57.ffn_norm.weight : [8192] : torch.bfloat16
523 : 8388608 : layers.58.attention.wq.weight : [1024, 8192] : torch.bfloat16
524 : 1048576 : layers.58.attention.wk.weight : [128, 8192] : torch.bfloat16
525 : 1048576 : layers.58.attention.wv.weight : [128, 8192] : torch.bfloat16
526 : 8388608 : layers.58.attention.wo.weight : [8192, 1024] : torch.bfloat16
527 : 29360128 : layers.58.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
528 : 29360128 : layers.58.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
529 : 29360128 : layers.58.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
530 : 8192 : layers.58.attention_norm.weight : [8192] : torch.bfloat16
531 : 8192 : layers.58.ffn_norm.weight : [8192] : torch.bfloat16
532 : 8388608 : layers.59.attention.wq.weight : [1024, 8192] : torch.bfloat16
533 : 1048576 : layers.59.attention.wk.weight : [128, 8192] : torch.bfloat16
534 : 1048576 : layers.59.attention.wv.weight : [128, 8192] : torch.bfloat16
535 : 8388608 : layers.59.attention.wo.weight : [8192, 1024] : torch.bfloat16
536 : 29360128 : layers.59.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
537 : 29360128 : layers.59.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
538 : 29360128 : layers.59.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
539 : 8192 : layers.59.attention_norm.weight : [8192] : torch.bfloat16
540 : 8192 : layers.59.ffn_norm.weight : [8192] : torch.bfloat16
541 : 8388608 : layers.60.attention.wq.weight : [1024, 8192] : torch.bfloat16
542 : 1048576 : layers.60.attention.wk.weight : [128, 8192] : torch.bfloat16
543 : 1048576 : layers.60.attention.wv.weight : [128, 8192] : torch.bfloat16
544 : 8388608 : layers.60.attention.wo.weight : [8192, 1024] : torch.bfloat16
545 : 29360128 : layers.60.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
546 : 29360128 : layers.60.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
547 : 29360128 : layers.60.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
548 : 8192 : layers.60.attention_norm.weight : [8192] : torch.bfloat16
549 : 8192 : layers.60.ffn_norm.weight : [8192] : torch.bfloat16
550 : 8388608 : layers.61.attention.wq.weight : [1024, 8192] : torch.bfloat16
551 : 1048576 : layers.61.attention.wk.weight : [128, 8192] : torch.bfloat16
552 : 1048576 : layers.61.attention.wv.weight : [128, 8192] : torch.bfloat16
553 : 8388608 : layers.61.attention.wo.weight : [8192, 1024] : torch.bfloat16
554 : 29360128 : layers.61.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
555 : 29360128 : layers.61.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
556 : 29360128 : layers.61.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
557 : 8192 : layers.61.attention_norm.weight : [8192] : torch.bfloat16
558 : 8192 : layers.61.ffn_norm.weight : [8192] : torch.bfloat16
559 : 8388608 : layers.62.attention.wq.weight : [1024, 8192] : torch.bfloat16
560 : 1048576 : layers.62.attention.wk.weight : [128, 8192] : torch.bfloat16
561 : 1048576 : layers.62.attention.wv.weight : [128, 8192] : torch.bfloat16
562 : 8388608 : layers.62.attention.wo.weight : [8192, 1024] : torch.bfloat16
563 : 29360128 : layers.62.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
564 : 29360128 : layers.62.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
565 : 29360128 : layers.62.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
566 : 8192 : layers.62.attention_norm.weight : [8192] : torch.bfloat16
567 : 8192 : layers.62.ffn_norm.weight : [8192] : torch.bfloat16
568 : 8388608 : layers.63.attention.wq.weight : [1024, 8192] : torch.bfloat16
569 : 1048576 : layers.63.attention.wk.weight : [128, 8192] : torch.bfloat16
570 : 1048576 : layers.63.attention.wv.weight : [128, 8192] : torch.bfloat16
571 : 8388608 : layers.63.attention.wo.weight : [8192, 1024] : torch.bfloat16
572 : 29360128 : layers.63.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
573 : 29360128 : layers.63.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
574 : 29360128 : layers.63.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
575 : 8192 : layers.63.attention_norm.weight : [8192] : torch.bfloat16
576 : 8192 : layers.63.ffn_norm.weight : [8192] : torch.bfloat16
577 : 8388608 : layers.64.attention.wq.weight : [1024, 8192] : torch.bfloat16
578 : 1048576 : layers.64.attention.wk.weight : [128, 8192] : torch.bfloat16
579 : 1048576 : layers.64.attention.wv.weight : [128, 8192] : torch.bfloat16
580 : 8388608 : layers.64.attention.wo.weight : [8192, 1024] : torch.bfloat16
581 : 29360128 : layers.64.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
582 : 29360128 : layers.64.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
583 : 29360128 : layers.64.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
584 : 8192 : layers.64.attention_norm.weight : [8192] : torch.bfloat16
585 : 8192 : layers.64.ffn_norm.weight : [8192] : torch.bfloat16
586 : 8388608 : layers.65.attention.wq.weight : [1024, 8192] : torch.bfloat16
587 : 1048576 : layers.65.attention.wk.weight : [128, 8192] : torch.bfloat16
588 : 1048576 : layers.65.attention.wv.weight : [128, 8192] : torch.bfloat16
589 : 8388608 : layers.65.attention.wo.weight : [8192, 1024] : torch.bfloat16
590 : 29360128 : layers.65.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
591 : 29360128 : layers.65.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
592 : 29360128 : layers.65.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
593 : 8192 : layers.65.attention_norm.weight : [8192] : torch.bfloat16
594 : 8192 : layers.65.ffn_norm.weight : [8192] : torch.bfloat16
595 : 8388608 : layers.66.attention.wq.weight : [1024, 8192] : torch.bfloat16
596 : 1048576 : layers.66.attention.wk.weight : [128, 8192] : torch.bfloat16
597 : 1048576 : layers.66.attention.wv.weight : [128, 8192] : torch.bfloat16
598 : 8388608 : layers.66.attention.wo.weight : [8192, 1024] : torch.bfloat16
599 : 29360128 : layers.66.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
600 : 29360128 : layers.66.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
601 : 29360128 : layers.66.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
602 : 8192 : layers.66.attention_norm.weight : [8192] : torch.bfloat16
603 : 8192 : layers.66.ffn_norm.weight : [8192] : torch.bfloat16
604 : 8388608 : layers.67.attention.wq.weight : [1024, 8192] : torch.bfloat16
605 : 1048576 : layers.67.attention.wk.weight : [128, 8192] : torch.bfloat16
606 : 1048576 : layers.67.attention.wv.weight : [128, 8192] : torch.bfloat16
607 : 8388608 : layers.67.attention.wo.weight : [8192, 1024] : torch.bfloat16
608 : 29360128 : layers.67.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
609 : 29360128 : layers.67.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
610 : 29360128 : layers.67.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
611 : 8192 : layers.67.attention_norm.weight : [8192] : torch.bfloat16
612 : 8192 : layers.67.ffn_norm.weight : [8192] : torch.bfloat16
613 : 8388608 : layers.68.attention.wq.weight : [1024, 8192] : torch.bfloat16
614 : 1048576 : layers.68.attention.wk.weight : [128, 8192] : torch.bfloat16
615 : 1048576 : layers.68.attention.wv.weight : [128, 8192] : torch.bfloat16
616 : 8388608 : layers.68.attention.wo.weight : [8192, 1024] : torch.bfloat16
617 : 29360128 : layers.68.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
618 : 29360128 : layers.68.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
619 : 29360128 : layers.68.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
620 : 8192 : layers.68.attention_norm.weight : [8192] : torch.bfloat16
621 : 8192 : layers.68.ffn_norm.weight : [8192] : torch.bfloat16
622 : 8388608 : layers.69.attention.wq.weight : [1024, 8192] : torch.bfloat16
623 : 1048576 : layers.69.attention.wk.weight : [128, 8192] : torch.bfloat16
624 : 1048576 : layers.69.attention.wv.weight : [128, 8192] : torch.bfloat16
625 : 8388608 : layers.69.attention.wo.weight : [8192, 1024] : torch.bfloat16
626 : 29360128 : layers.69.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
627 : 29360128 : layers.69.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
628 : 29360128 : layers.69.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
629 : 8192 : layers.69.attention_norm.weight : [8192] : torch.bfloat16
630 : 8192 : layers.69.ffn_norm.weight : [8192] : torch.bfloat16
631 : 8388608 : layers.70.attention.wq.weight : [1024, 8192] : torch.bfloat16
632 : 1048576 : layers.70.attention.wk.weight : [128, 8192] : torch.bfloat16
633 : 1048576 : layers.70.attention.wv.weight : [128, 8192] : torch.bfloat16
634 : 8388608 : layers.70.attention.wo.weight : [8192, 1024] : torch.bfloat16
635 : 29360128 : layers.70.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
636 : 29360128 : layers.70.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
637 : 29360128 : layers.70.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
638 : 8192 : layers.70.attention_norm.weight : [8192] : torch.bfloat16
639 : 8192 : layers.70.ffn_norm.weight : [8192] : torch.bfloat16
640 : 8388608 : layers.71.attention.wq.weight : [1024, 8192] : torch.bfloat16
641 : 1048576 : layers.71.attention.wk.weight : [128, 8192] : torch.bfloat16
642 : 1048576 : layers.71.attention.wv.weight : [128, 8192] : torch.bfloat16
643 : 8388608 : layers.71.attention.wo.weight : [8192, 1024] : torch.bfloat16
644 : 29360128 : layers.71.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
645 : 29360128 : layers.71.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
646 : 29360128 : layers.71.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
647 : 8192 : layers.71.attention_norm.weight : [8192] : torch.bfloat16
648 : 8192 : layers.71.ffn_norm.weight : [8192] : torch.bfloat16
649 : 8388608 : layers.72.attention.wq.weight : [1024, 8192] : torch.bfloat16
650 : 1048576 : layers.72.attention.wk.weight : [128, 8192] : torch.bfloat16
651 : 1048576 : layers.72.attention.wv.weight : [128, 8192] : torch.bfloat16
652 : 8388608 : layers.72.attention.wo.weight : [8192, 1024] : torch.bfloat16
653 : 29360128 : layers.72.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
654 : 29360128 : layers.72.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
655 : 29360128 : layers.72.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
656 : 8192 : layers.72.attention_norm.weight : [8192] : torch.bfloat16
657 : 8192 : layers.72.ffn_norm.weight : [8192] : torch.bfloat16
658 : 8388608 : layers.73.attention.wq.weight : [1024, 8192] : torch.bfloat16
659 : 1048576 : layers.73.attention.wk.weight : [128, 8192] : torch.bfloat16
660 : 1048576 : layers.73.attention.wv.weight : [128, 8192] : torch.bfloat16
661 : 8388608 : layers.73.attention.wo.weight : [8192, 1024] : torch.bfloat16
662 : 29360128 : layers.73.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
663 : 29360128 : layers.73.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
664 : 29360128 : layers.73.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
665 : 8192 : layers.73.attention_norm.weight : [8192] : torch.bfloat16
666 : 8192 : layers.73.ffn_norm.weight : [8192] : torch.bfloat16
667 : 8388608 : layers.74.attention.wq.weight : [1024, 8192] : torch.bfloat16
668 : 1048576 : layers.74.attention.wk.weight : [128, 8192] : torch.bfloat16
669 : 1048576 : layers.74.attention.wv.weight : [128, 8192] : torch.bfloat16
670 : 8388608 : layers.74.attention.wo.weight : [8192, 1024] : torch.bfloat16
671 : 29360128 : layers.74.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
672 : 29360128 : layers.74.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
673 : 29360128 : layers.74.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
674 : 8192 : layers.74.attention_norm.weight : [8192] : torch.bfloat16
675 : 8192 : layers.74.ffn_norm.weight : [8192] : torch.bfloat16
676 : 8388608 : layers.75.attention.wq.weight : [1024, 8192] : torch.bfloat16
677 : 1048576 : layers.75.attention.wk.weight : [128, 8192] : torch.bfloat16
678 : 1048576 : layers.75.attention.wv.weight : [128, 8192] : torch.bfloat16
679 : 8388608 : layers.75.attention.wo.weight : [8192, 1024] : torch.bfloat16
680 : 29360128 : layers.75.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
681 : 29360128 : layers.75.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
682 : 29360128 : layers.75.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
683 : 8192 : layers.75.attention_norm.weight : [8192] : torch.bfloat16
684 : 8192 : layers.75.ffn_norm.weight : [8192] : torch.bfloat16
685 : 8388608 : layers.76.attention.wq.weight : [1024, 8192] : torch.bfloat16
686 : 1048576 : layers.76.attention.wk.weight : [128, 8192] : torch.bfloat16
687 : 1048576 : layers.76.attention.wv.weight : [128, 8192] : torch.bfloat16
688 : 8388608 : layers.76.attention.wo.weight : [8192, 1024] : torch.bfloat16
689 : 29360128 : layers.76.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
690 : 29360128 : layers.76.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
691 : 29360128 : layers.76.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
692 : 8192 : layers.76.attention_norm.weight : [8192] : torch.bfloat16
693 : 8192 : layers.76.ffn_norm.weight : [8192] : torch.bfloat16
694 : 8388608 : layers.77.attention.wq.weight : [1024, 8192] : torch.bfloat16
695 : 1048576 : layers.77.attention.wk.weight : [128, 8192] : torch.bfloat16
696 : 1048576 : layers.77.attention.wv.weight : [128, 8192] : torch.bfloat16
697 : 8388608 : layers.77.attention.wo.weight : [8192, 1024] : torch.bfloat16
698 : 29360128 : layers.77.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
699 : 29360128 : layers.77.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
700 : 29360128 : layers.77.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
701 : 8192 : layers.77.attention_norm.weight : [8192] : torch.bfloat16
702 : 8192 : layers.77.ffn_norm.weight : [8192] : torch.bfloat16
703 : 8388608 : layers.78.attention.wq.weight : [1024, 8192] : torch.bfloat16
704 : 1048576 : layers.78.attention.wk.weight : [128, 8192] : torch.bfloat16
705 : 1048576 : layers.78.attention.wv.weight : [128, 8192] : torch.bfloat16
706 : 8388608 : layers.78.attention.wo.weight : [8192, 1024] : torch.bfloat16
707 : 29360128 : layers.78.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
708 : 29360128 : layers.78.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
709 : 29360128 : layers.78.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
710 : 8192 : layers.78.attention_norm.weight : [8192] : torch.bfloat16
711 : 8192 : layers.78.ffn_norm.weight : [8192] : torch.bfloat16
712 : 8388608 : layers.79.attention.wq.weight : [1024, 8192] : torch.bfloat16
713 : 1048576 : layers.79.attention.wk.weight : [128, 8192] : torch.bfloat16
714 : 1048576 : layers.79.attention.wv.weight : [128, 8192] : torch.bfloat16
715 : 8388608 : layers.79.attention.wo.weight : [8192, 1024] : torch.bfloat16
716 : 29360128 : layers.79.feed_forward.w1.weight : [3584, 8192] : torch.bfloat16
717 : 29360128 : layers.79.feed_forward.w3.weight : [3584, 8192] : torch.bfloat16
718 : 29360128 : layers.79.feed_forward.w2.weight : [8192, 3584] : torch.bfloat16
719 : 8192 : layers.79.attention_norm.weight : [8192] : torch.bfloat16
720 : 8192 : layers.79.ffn_norm.weight : [8192] : torch.bfloat16
721 : 8192 : norm.weight : [8192] : torch.bfloat16
722 : 131334144 : output.weight : [16032, 8192] : torch.bfloat16
Total number of parameters: 70562938880
70.5629 B
141125877760 Bytes
131.4337 GB