suredream commited on
Commit
45a67ae
1 Parent(s): 105014c

End of training

Browse files
Files changed (3) hide show
  1. all_results.json +6 -6
  2. train_results.json +6 -6
  3. trainer_state.json +1942 -255
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.873417721518987,
3
- "total_flos": 1.9321077770606223e+18,
4
- "train_loss": 0.4781974340096498,
5
- "train_runtime": 382.4209,
6
- "train_samples_per_second": 66.027,
7
- "train_steps_per_second": 1.02
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "total_flos": 1.3877265500181135e+19,
4
+ "train_loss": 0.3087472263830049,
5
+ "train_runtime": 6534.1253,
6
+ "train_samples_per_second": 27.407,
7
+ "train_steps_per_second": 0.429
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.873417721518987,
3
- "total_flos": 1.9321077770606223e+18,
4
- "train_loss": 0.4781974340096498,
5
- "train_runtime": 382.4209,
6
- "train_samples_per_second": 66.027,
7
- "train_steps_per_second": 1.02
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "total_flos": 1.3877265500181135e+19,
4
+ "train_loss": 0.3087472263830049,
5
+ "train_runtime": 6534.1253,
6
+ "train_samples_per_second": 27.407,
7
+ "train_steps_per_second": 0.429
8
  }
trainer_state.json CHANGED
@@ -1,388 +1,2075 @@
1
  {
2
- "best_metric": 0.7816455696202531,
3
- "best_model_checkpoint": "tsec_vit_model/checkpoint-276",
4
- "epoch": 9.873417721518987,
5
  "eval_steps": 500,
6
- "global_step": 390,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.25316455696202533,
13
- "grad_norm": 1.1729109287261963,
14
- "learning_rate": 1.282051282051282e-05,
15
- "loss": 0.5143,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.5063291139240507,
20
- "grad_norm": 1.1752610206604004,
21
- "learning_rate": 2.564102564102564e-05,
22
- "loss": 0.559,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.759493670886076,
27
- "grad_norm": 1.3603609800338745,
28
- "learning_rate": 3.846153846153846e-05,
29
- "loss": 0.4963,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.9873417721518988,
34
- "eval_accuracy": 0.6724683544303798,
35
- "eval_loss": 0.5699456334114075,
36
- "eval_runtime": 4.299,
37
- "eval_samples_per_second": 147.012,
38
- "eval_steps_per_second": 9.305,
39
- "step": 39
40
- },
41
- {
42
- "epoch": 1.0126582278481013,
43
- "grad_norm": 2.1996357440948486,
44
- "learning_rate": 4.985754985754986e-05,
45
- "loss": 0.5522,
46
  "step": 40
47
  },
48
  {
49
- "epoch": 1.2658227848101267,
50
- "grad_norm": 1.0542365312576294,
51
- "learning_rate": 4.8433048433048433e-05,
52
- "loss": 0.4911,
53
  "step": 50
54
  },
55
  {
56
- "epoch": 1.518987341772152,
57
- "grad_norm": 1.4736473560333252,
58
- "learning_rate": 4.700854700854701e-05,
59
- "loss": 0.5448,
60
  "step": 60
61
  },
62
  {
63
- "epoch": 1.7721518987341773,
64
- "grad_norm": 1.3524911403656006,
65
- "learning_rate": 4.558404558404559e-05,
66
- "loss": 0.4959,
67
  "step": 70
68
  },
69
  {
70
- "epoch": 2.0,
71
- "eval_accuracy": 0.7151898734177216,
72
- "eval_loss": 0.5484605431556702,
73
- "eval_runtime": 4.1719,
74
- "eval_samples_per_second": 151.489,
75
- "eval_steps_per_second": 9.588,
76
- "step": 79
77
- },
78
- {
79
- "epoch": 2.0253164556962027,
80
- "grad_norm": 1.6714565753936768,
81
- "learning_rate": 4.415954415954416e-05,
82
- "loss": 0.4969,
83
  "step": 80
84
  },
85
  {
86
- "epoch": 2.278481012658228,
87
- "grad_norm": 1.8887039422988892,
88
- "learning_rate": 4.2735042735042735e-05,
89
- "loss": 0.4863,
90
  "step": 90
91
  },
92
  {
93
- "epoch": 2.5316455696202533,
94
- "grad_norm": 2.640259027481079,
95
- "learning_rate": 4.131054131054131e-05,
96
- "loss": 0.5075,
97
  "step": 100
98
  },
99
  {
100
- "epoch": 2.7848101265822782,
101
- "grad_norm": 1.167457103729248,
102
- "learning_rate": 3.988603988603989e-05,
103
- "loss": 0.4879,
104
  "step": 110
105
  },
106
  {
107
- "epoch": 2.9873417721518987,
108
- "eval_accuracy": 0.7689873417721519,
109
- "eval_loss": 0.48862743377685547,
110
- "eval_runtime": 4.3228,
111
- "eval_samples_per_second": 146.201,
112
- "eval_steps_per_second": 9.253,
113
- "step": 118
114
- },
115
- {
116
- "epoch": 3.037974683544304,
117
- "grad_norm": 1.0525858402252197,
118
- "learning_rate": 3.846153846153846e-05,
119
- "loss": 0.4946,
120
  "step": 120
121
  },
122
  {
123
- "epoch": 3.291139240506329,
124
- "grad_norm": 2.056792736053467,
125
- "learning_rate": 3.7037037037037037e-05,
126
- "loss": 0.4884,
127
  "step": 130
128
  },
129
  {
130
- "epoch": 3.5443037974683547,
131
- "grad_norm": 1.7059816122055054,
132
- "learning_rate": 3.561253561253561e-05,
133
- "loss": 0.4926,
134
  "step": 140
135
  },
136
  {
137
- "epoch": 3.7974683544303796,
138
- "grad_norm": 1.2786283493041992,
139
- "learning_rate": 3.418803418803419e-05,
140
- "loss": 0.5243,
141
  "step": 150
142
  },
143
  {
144
- "epoch": 4.0,
145
- "eval_accuracy": 0.7468354430379747,
146
- "eval_loss": 0.5133278965950012,
147
- "eval_runtime": 4.3787,
148
- "eval_samples_per_second": 144.335,
149
- "eval_steps_per_second": 9.135,
150
- "step": 158
151
- },
152
- {
153
- "epoch": 4.050632911392405,
154
- "grad_norm": 2.1460752487182617,
155
- "learning_rate": 3.2763532763532764e-05,
156
- "loss": 0.4983,
157
  "step": 160
158
  },
159
  {
160
- "epoch": 4.30379746835443,
161
- "grad_norm": 1.596248984336853,
162
- "learning_rate": 3.133903133903134e-05,
163
- "loss": 0.4936,
164
  "step": 170
165
  },
166
  {
167
- "epoch": 4.556962025316456,
168
- "grad_norm": 1.6535227298736572,
169
- "learning_rate": 2.9914529914529915e-05,
170
- "loss": 0.5049,
171
  "step": 180
172
  },
173
  {
174
- "epoch": 4.810126582278481,
175
- "grad_norm": 1.3324358463287354,
176
- "learning_rate": 2.8490028490028492e-05,
177
- "loss": 0.4654,
178
  "step": 190
179
  },
180
  {
181
- "epoch": 4.987341772151899,
182
- "eval_accuracy": 0.7515822784810127,
183
- "eval_loss": 0.49274376034736633,
184
- "eval_runtime": 4.3657,
185
- "eval_samples_per_second": 144.763,
186
- "eval_steps_per_second": 9.162,
187
- "step": 197
188
- },
189
- {
190
- "epoch": 5.063291139240507,
191
- "grad_norm": 1.212802529335022,
192
- "learning_rate": 2.706552706552707e-05,
193
- "loss": 0.4828,
194
  "step": 200
195
  },
196
  {
197
- "epoch": 5.3164556962025316,
198
- "grad_norm": 1.189599633216858,
199
- "learning_rate": 2.564102564102564e-05,
200
- "loss": 0.4816,
201
  "step": 210
202
  },
203
  {
204
- "epoch": 5.569620253164557,
205
- "grad_norm": 1.3671377897262573,
206
- "learning_rate": 2.4216524216524217e-05,
207
- "loss": 0.4709,
208
  "step": 220
209
  },
210
  {
211
- "epoch": 5.822784810126582,
212
- "grad_norm": 1.5688464641571045,
213
- "learning_rate": 2.2792022792022794e-05,
214
- "loss": 0.4776,
215
  "step": 230
216
  },
217
  {
218
- "epoch": 6.0,
219
- "eval_accuracy": 0.7642405063291139,
220
- "eval_loss": 0.4901277422904968,
221
- "eval_runtime": 4.2593,
222
- "eval_samples_per_second": 148.382,
223
- "eval_steps_per_second": 9.391,
224
- "step": 237
225
- },
226
- {
227
- "epoch": 6.075949367088608,
228
- "grad_norm": 1.6727008819580078,
229
- "learning_rate": 2.1367521367521368e-05,
230
- "loss": 0.4878,
231
  "step": 240
232
  },
233
  {
234
- "epoch": 6.329113924050633,
235
- "grad_norm": 1.2271509170532227,
236
- "learning_rate": 1.9943019943019945e-05,
237
- "loss": 0.4725,
238
  "step": 250
239
  },
240
  {
241
- "epoch": 6.582278481012658,
242
- "grad_norm": 1.5308541059494019,
243
- "learning_rate": 1.8518518518518518e-05,
244
- "loss": 0.444,
245
  "step": 260
246
  },
247
  {
248
- "epoch": 6.8354430379746836,
249
- "grad_norm": 1.6207118034362793,
250
- "learning_rate": 1.7094017094017095e-05,
251
- "loss": 0.4767,
252
  "step": 270
253
  },
254
  {
255
- "epoch": 6.987341772151899,
256
- "eval_accuracy": 0.7816455696202531,
257
- "eval_loss": 0.46520036458969116,
258
- "eval_runtime": 4.1881,
259
- "eval_samples_per_second": 150.905,
260
- "eval_steps_per_second": 9.551,
261
- "step": 276
262
  },
263
  {
264
- "epoch": 7.0886075949367084,
265
- "grad_norm": 1.3883824348449707,
266
- "learning_rate": 1.566951566951567e-05,
267
- "loss": 0.4649,
 
 
268
  "step": 280
269
  },
270
  {
271
- "epoch": 7.341772151898734,
272
- "grad_norm": 2.2651216983795166,
273
- "learning_rate": 1.4245014245014246e-05,
274
- "loss": 0.4644,
275
  "step": 290
276
  },
277
  {
278
- "epoch": 7.594936708860759,
279
- "grad_norm": 1.2660713195800781,
280
- "learning_rate": 1.282051282051282e-05,
281
- "loss": 0.4448,
282
  "step": 300
283
  },
284
  {
285
- "epoch": 7.848101265822785,
286
- "grad_norm": 1.8906290531158447,
287
- "learning_rate": 1.1396011396011397e-05,
288
- "loss": 0.4465,
289
  "step": 310
290
  },
291
  {
292
- "epoch": 8.0,
293
- "eval_accuracy": 0.7642405063291139,
294
- "eval_loss": 0.4795072674751282,
295
- "eval_runtime": 4.3585,
296
- "eval_samples_per_second": 145.004,
297
- "eval_steps_per_second": 9.177,
298
- "step": 316
299
- },
300
- {
301
- "epoch": 8.10126582278481,
302
- "grad_norm": 1.7846542596817017,
303
- "learning_rate": 9.971509971509972e-06,
304
- "loss": 0.4552,
305
  "step": 320
306
  },
307
  {
308
- "epoch": 8.354430379746836,
309
- "grad_norm": 1.2796516418457031,
310
- "learning_rate": 8.547008547008548e-06,
311
- "loss": 0.4284,
312
  "step": 330
313
  },
314
  {
315
- "epoch": 8.60759493670886,
316
- "grad_norm": 1.181015133857727,
317
- "learning_rate": 7.122507122507123e-06,
318
- "loss": 0.3985,
319
  "step": 340
320
  },
321
  {
322
- "epoch": 8.860759493670885,
323
- "grad_norm": 1.352704644203186,
324
- "learning_rate": 5.6980056980056985e-06,
325
- "loss": 0.467,
326
  "step": 350
327
  },
328
  {
329
- "epoch": 8.987341772151899,
330
- "eval_accuracy": 0.7484177215189873,
331
- "eval_loss": 0.4690525233745575,
332
- "eval_runtime": 4.2658,
333
- "eval_samples_per_second": 148.154,
334
- "eval_steps_per_second": 9.377,
335
- "step": 355
336
- },
337
- {
338
- "epoch": 9.113924050632912,
339
- "grad_norm": 1.3692480325698853,
340
- "learning_rate": 4.273504273504274e-06,
341
- "loss": 0.437,
342
  "step": 360
343
  },
344
  {
345
- "epoch": 9.367088607594937,
346
- "grad_norm": 2.6086838245391846,
347
- "learning_rate": 2.8490028490028492e-06,
348
- "loss": 0.4062,
349
  "step": 370
350
  },
351
  {
352
- "epoch": 9.620253164556962,
353
- "grad_norm": 1.352748155593872,
354
- "learning_rate": 1.4245014245014246e-06,
355
- "loss": 0.4365,
356
  "step": 380
357
  },
358
  {
359
- "epoch": 9.873417721518987,
360
- "grad_norm": 1.9576495885849,
361
- "learning_rate": 0.0,
362
- "loss": 0.4121,
363
  "step": 390
364
  },
365
  {
366
- "epoch": 9.873417721518987,
367
- "eval_accuracy": 0.7689873417721519,
368
- "eval_loss": 0.482129842042923,
369
- "eval_runtime": 4.3754,
370
- "eval_samples_per_second": 144.443,
371
- "eval_steps_per_second": 9.142,
372
- "step": 390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  },
374
  {
375
- "epoch": 9.873417721518987,
376
- "step": 390,
377
- "total_flos": 1.9321077770606223e+18,
378
- "train_loss": 0.4781974340096498,
379
- "train_runtime": 382.4209,
380
- "train_samples_per_second": 66.027,
381
- "train_steps_per_second": 1.02
382
  }
383
  ],
384
  "logging_steps": 10,
385
- "max_steps": 390,
386
  "num_input_tokens_seen": 0,
387
  "num_train_epochs": 10,
388
  "save_steps": 500,
@@ -398,7 +2085,7 @@
398
  "attributes": {}
399
  }
400
  },
401
- "total_flos": 1.9321077770606223e+18,
402
  "train_batch_size": 16,
403
  "trial_name": null,
404
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.8894595801697186,
3
+ "best_model_checkpoint": "tsec_vit_model/checkpoint-2520",
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 2800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03571428571428571,
13
+ "grad_norm": 0.632759690284729,
14
+ "learning_rate": 1.7857142857142857e-06,
15
+ "loss": 0.7005,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.07142857142857142,
20
+ "grad_norm": 0.6090130805969238,
21
+ "learning_rate": 3.5714285714285714e-06,
22
+ "loss": 0.6888,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.10714285714285714,
27
+ "grad_norm": 0.5027034878730774,
28
+ "learning_rate": 5.357142857142857e-06,
29
+ "loss": 0.6841,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.14285714285714285,
34
+ "grad_norm": 0.7158761620521545,
35
+ "learning_rate": 7.142857142857143e-06,
36
+ "loss": 0.6711,
 
 
 
 
 
 
 
 
 
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.17857142857142858,
41
+ "grad_norm": 1.0677605867385864,
42
+ "learning_rate": 8.92857142857143e-06,
43
+ "loss": 0.6439,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.21428571428571427,
48
+ "grad_norm": 0.716444194316864,
49
+ "learning_rate": 1.0714285714285714e-05,
50
+ "loss": 0.6388,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.25,
55
+ "grad_norm": 0.9646293520927429,
56
+ "learning_rate": 1.25e-05,
57
+ "loss": 0.5942,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.2857142857142857,
62
+ "grad_norm": 1.5171552896499634,
63
+ "learning_rate": 1.4285714285714285e-05,
64
+ "loss": 0.587,
 
 
 
 
 
 
 
 
 
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.32142857142857145,
69
+ "grad_norm": 1.2230862379074097,
70
+ "learning_rate": 1.6071428571428572e-05,
71
+ "loss": 0.5882,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.35714285714285715,
76
+ "grad_norm": 1.0726110935211182,
77
+ "learning_rate": 1.785714285714286e-05,
78
+ "loss": 0.5506,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.39285714285714285,
83
+ "grad_norm": 2.488288640975952,
84
+ "learning_rate": 1.9642857142857145e-05,
85
+ "loss": 0.5487,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.42857142857142855,
90
+ "grad_norm": 2.186500310897827,
91
+ "learning_rate": 2.1428571428571428e-05,
92
+ "loss": 0.5138,
 
 
 
 
 
 
 
 
 
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.4642857142857143,
97
+ "grad_norm": 2.18605637550354,
98
+ "learning_rate": 2.3214285714285715e-05,
99
+ "loss": 0.5203,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.5,
104
+ "grad_norm": 1.5495938062667847,
105
+ "learning_rate": 2.5e-05,
106
+ "loss": 0.5006,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.5357142857142857,
111
+ "grad_norm": 1.3939014673233032,
112
+ "learning_rate": 2.6785714285714288e-05,
113
+ "loss": 0.5171,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.5714285714285714,
118
+ "grad_norm": 1.4516574144363403,
119
+ "learning_rate": 2.857142857142857e-05,
120
+ "loss": 0.4882,
 
 
 
 
 
 
 
 
 
121
  "step": 160
122
  },
123
  {
124
+ "epoch": 0.6071428571428571,
125
+ "grad_norm": 1.5352245569229126,
126
+ "learning_rate": 3.0357142857142857e-05,
127
+ "loss": 0.4809,
128
  "step": 170
129
  },
130
  {
131
+ "epoch": 0.6428571428571429,
132
+ "grad_norm": 2.6514480113983154,
133
+ "learning_rate": 3.2142857142857144e-05,
134
+ "loss": 0.458,
135
  "step": 180
136
  },
137
  {
138
+ "epoch": 0.6785714285714286,
139
+ "grad_norm": 2.1452901363372803,
140
+ "learning_rate": 3.392857142857143e-05,
141
+ "loss": 0.478,
142
  "step": 190
143
  },
144
  {
145
+ "epoch": 0.7142857142857143,
146
+ "grad_norm": 1.5536608695983887,
147
+ "learning_rate": 3.571428571428572e-05,
148
+ "loss": 0.4851,
 
 
 
 
 
 
 
 
 
149
  "step": 200
150
  },
151
  {
152
+ "epoch": 0.75,
153
+ "grad_norm": 1.7489731311798096,
154
+ "learning_rate": 3.7500000000000003e-05,
155
+ "loss": 0.4608,
156
  "step": 210
157
  },
158
  {
159
+ "epoch": 0.7857142857142857,
160
+ "grad_norm": 1.6109980344772339,
161
+ "learning_rate": 3.928571428571429e-05,
162
+ "loss": 0.4611,
163
  "step": 220
164
  },
165
  {
166
+ "epoch": 0.8214285714285714,
167
+ "grad_norm": 2.391719102859497,
168
+ "learning_rate": 4.107142857142857e-05,
169
+ "loss": 0.3998,
170
  "step": 230
171
  },
172
  {
173
+ "epoch": 0.8571428571428571,
174
+ "grad_norm": 3.6933391094207764,
175
+ "learning_rate": 4.2857142857142856e-05,
176
+ "loss": 0.4624,
 
 
 
 
 
 
 
 
 
177
  "step": 240
178
  },
179
  {
180
+ "epoch": 0.8928571428571429,
181
+ "grad_norm": 2.0588369369506836,
182
+ "learning_rate": 4.464285714285715e-05,
183
+ "loss": 0.4437,
184
  "step": 250
185
  },
186
  {
187
+ "epoch": 0.9285714285714286,
188
+ "grad_norm": 2.831156015396118,
189
+ "learning_rate": 4.642857142857143e-05,
190
+ "loss": 0.4652,
191
  "step": 260
192
  },
193
  {
194
+ "epoch": 0.9642857142857143,
195
+ "grad_norm": 2.0229790210723877,
196
+ "learning_rate": 4.8214285714285716e-05,
197
+ "loss": 0.4037,
198
  "step": 270
199
  },
200
  {
201
+ "epoch": 1.0,
202
+ "grad_norm": 2.1834700107574463,
203
+ "learning_rate": 5e-05,
204
+ "loss": 0.4387,
205
+ "step": 280
 
 
206
  },
207
  {
208
+ "epoch": 1.0,
209
+ "eval_accuracy": 0.8150960250111657,
210
+ "eval_loss": 0.4179099500179291,
211
+ "eval_runtime": 110.4006,
212
+ "eval_samples_per_second": 40.561,
213
+ "eval_steps_per_second": 2.536,
214
  "step": 280
215
  },
216
  {
217
+ "epoch": 1.0357142857142858,
218
+ "grad_norm": 1.7746831178665161,
219
+ "learning_rate": 4.9801587301587306e-05,
220
+ "loss": 0.4069,
221
  "step": 290
222
  },
223
  {
224
+ "epoch": 1.0714285714285714,
225
+ "grad_norm": 2.6947922706604004,
226
+ "learning_rate": 4.960317460317461e-05,
227
+ "loss": 0.4334,
228
  "step": 300
229
  },
230
  {
231
+ "epoch": 1.1071428571428572,
232
+ "grad_norm": 1.4933383464813232,
233
+ "learning_rate": 4.940476190476191e-05,
234
+ "loss": 0.4409,
235
  "step": 310
236
  },
237
  {
238
+ "epoch": 1.1428571428571428,
239
+ "grad_norm": 1.8830089569091797,
240
+ "learning_rate": 4.9206349206349204e-05,
241
+ "loss": 0.4065,
 
 
 
 
 
 
 
 
 
242
  "step": 320
243
  },
244
  {
245
+ "epoch": 1.1785714285714286,
246
+ "grad_norm": 1.978843092918396,
247
+ "learning_rate": 4.900793650793651e-05,
248
+ "loss": 0.4273,
249
  "step": 330
250
  },
251
  {
252
+ "epoch": 1.2142857142857142,
253
+ "grad_norm": 1.2928149700164795,
254
+ "learning_rate": 4.880952380952381e-05,
255
+ "loss": 0.4376,
256
  "step": 340
257
  },
258
  {
259
+ "epoch": 1.25,
260
+ "grad_norm": 1.5401108264923096,
261
+ "learning_rate": 4.8611111111111115e-05,
262
+ "loss": 0.4061,
263
  "step": 350
264
  },
265
  {
266
+ "epoch": 1.2857142857142856,
267
+ "grad_norm": 2.296520709991455,
268
+ "learning_rate": 4.841269841269841e-05,
269
+ "loss": 0.433,
 
 
 
 
 
 
 
 
 
270
  "step": 360
271
  },
272
  {
273
+ "epoch": 1.3214285714285714,
274
+ "grad_norm": 1.2640879154205322,
275
+ "learning_rate": 4.8214285714285716e-05,
276
+ "loss": 0.4142,
277
  "step": 370
278
  },
279
  {
280
+ "epoch": 1.3571428571428572,
281
+ "grad_norm": 1.3309462070465088,
282
+ "learning_rate": 4.801587301587302e-05,
283
+ "loss": 0.4222,
284
  "step": 380
285
  },
286
  {
287
+ "epoch": 1.3928571428571428,
288
+ "grad_norm": 1.8462568521499634,
289
+ "learning_rate": 4.781746031746032e-05,
290
+ "loss": 0.4483,
291
  "step": 390
292
  },
293
  {
294
+ "epoch": 1.4285714285714286,
295
+ "grad_norm": 1.074401617050171,
296
+ "learning_rate": 4.761904761904762e-05,
297
+ "loss": 0.3762,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 1.4642857142857144,
302
+ "grad_norm": 1.5691473484039307,
303
+ "learning_rate": 4.7420634920634924e-05,
304
+ "loss": 0.415,
305
+ "step": 410
306
+ },
307
+ {
308
+ "epoch": 1.5,
309
+ "grad_norm": 1.2222189903259277,
310
+ "learning_rate": 4.722222222222222e-05,
311
+ "loss": 0.3862,
312
+ "step": 420
313
+ },
314
+ {
315
+ "epoch": 1.5357142857142856,
316
+ "grad_norm": 1.4210410118103027,
317
+ "learning_rate": 4.7023809523809525e-05,
318
+ "loss": 0.352,
319
+ "step": 430
320
+ },
321
+ {
322
+ "epoch": 1.5714285714285714,
323
+ "grad_norm": 1.632934331893921,
324
+ "learning_rate": 4.682539682539683e-05,
325
+ "loss": 0.3873,
326
+ "step": 440
327
+ },
328
+ {
329
+ "epoch": 1.6071428571428572,
330
+ "grad_norm": 1.3977652788162231,
331
+ "learning_rate": 4.662698412698413e-05,
332
+ "loss": 0.3763,
333
+ "step": 450
334
+ },
335
+ {
336
+ "epoch": 1.6428571428571428,
337
+ "grad_norm": 2.74967622756958,
338
+ "learning_rate": 4.642857142857143e-05,
339
+ "loss": 0.3823,
340
+ "step": 460
341
+ },
342
+ {
343
+ "epoch": 1.6785714285714286,
344
+ "grad_norm": 1.1579408645629883,
345
+ "learning_rate": 4.623015873015873e-05,
346
+ "loss": 0.3967,
347
+ "step": 470
348
+ },
349
+ {
350
+ "epoch": 1.7142857142857144,
351
+ "grad_norm": 2.7291440963745117,
352
+ "learning_rate": 4.603174603174603e-05,
353
+ "loss": 0.4201,
354
+ "step": 480
355
+ },
356
+ {
357
+ "epoch": 1.75,
358
+ "grad_norm": 1.0126214027404785,
359
+ "learning_rate": 4.5833333333333334e-05,
360
+ "loss": 0.3598,
361
+ "step": 490
362
+ },
363
+ {
364
+ "epoch": 1.7857142857142856,
365
+ "grad_norm": 1.6705009937286377,
366
+ "learning_rate": 4.563492063492064e-05,
367
+ "loss": 0.366,
368
+ "step": 500
369
+ },
370
+ {
371
+ "epoch": 1.8214285714285714,
372
+ "grad_norm": 1.6749546527862549,
373
+ "learning_rate": 4.543650793650794e-05,
374
+ "loss": 0.3854,
375
+ "step": 510
376
+ },
377
+ {
378
+ "epoch": 1.8571428571428572,
379
+ "grad_norm": 1.0892480611801147,
380
+ "learning_rate": 4.523809523809524e-05,
381
+ "loss": 0.3563,
382
+ "step": 520
383
+ },
384
+ {
385
+ "epoch": 1.8928571428571428,
386
+ "grad_norm": 1.4169703722000122,
387
+ "learning_rate": 4.503968253968254e-05,
388
+ "loss": 0.3903,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 1.9285714285714286,
393
+ "grad_norm": 1.393864631652832,
394
+ "learning_rate": 4.4841269841269846e-05,
395
+ "loss": 0.3915,
396
+ "step": 540
397
+ },
398
+ {
399
+ "epoch": 1.9642857142857144,
400
+ "grad_norm": 1.6008644104003906,
401
+ "learning_rate": 4.464285714285715e-05,
402
+ "loss": 0.3665,
403
+ "step": 550
404
+ },
405
+ {
406
+ "epoch": 2.0,
407
+ "grad_norm": 1.0977710485458374,
408
+ "learning_rate": 4.4444444444444447e-05,
409
+ "loss": 0.4239,
410
+ "step": 560
411
+ },
412
+ {
413
+ "epoch": 2.0,
414
+ "eval_accuracy": 0.8398838767306833,
415
+ "eval_loss": 0.3610800504684448,
416
+ "eval_runtime": 114.7581,
417
+ "eval_samples_per_second": 39.021,
418
+ "eval_steps_per_second": 2.44,
419
+ "step": 560
420
+ },
421
+ {
422
+ "epoch": 2.0357142857142856,
423
+ "grad_norm": 1.4717609882354736,
424
+ "learning_rate": 4.4246031746031744e-05,
425
+ "loss": 0.3609,
426
+ "step": 570
427
+ },
428
+ {
429
+ "epoch": 2.0714285714285716,
430
+ "grad_norm": 1.0943901538848877,
431
+ "learning_rate": 4.404761904761905e-05,
432
+ "loss": 0.3379,
433
+ "step": 580
434
+ },
435
+ {
436
+ "epoch": 2.107142857142857,
437
+ "grad_norm": 1.8704146146774292,
438
+ "learning_rate": 4.384920634920635e-05,
439
+ "loss": 0.364,
440
+ "step": 590
441
+ },
442
+ {
443
+ "epoch": 2.142857142857143,
444
+ "grad_norm": 1.2977815866470337,
445
+ "learning_rate": 4.3650793650793655e-05,
446
+ "loss": 0.3834,
447
+ "step": 600
448
+ },
449
+ {
450
+ "epoch": 2.1785714285714284,
451
+ "grad_norm": 1.6529649496078491,
452
+ "learning_rate": 4.345238095238096e-05,
453
+ "loss": 0.3688,
454
+ "step": 610
455
+ },
456
+ {
457
+ "epoch": 2.2142857142857144,
458
+ "grad_norm": 1.1914763450622559,
459
+ "learning_rate": 4.3253968253968256e-05,
460
+ "loss": 0.3548,
461
+ "step": 620
462
+ },
463
+ {
464
+ "epoch": 2.25,
465
+ "grad_norm": 1.70374596118927,
466
+ "learning_rate": 4.305555555555556e-05,
467
+ "loss": 0.3323,
468
+ "step": 630
469
+ },
470
+ {
471
+ "epoch": 2.2857142857142856,
472
+ "grad_norm": 1.1331793069839478,
473
+ "learning_rate": 4.2857142857142856e-05,
474
+ "loss": 0.3494,
475
+ "step": 640
476
+ },
477
+ {
478
+ "epoch": 2.3214285714285716,
479
+ "grad_norm": 1.3151099681854248,
480
+ "learning_rate": 4.265873015873016e-05,
481
+ "loss": 0.3177,
482
+ "step": 650
483
+ },
484
+ {
485
+ "epoch": 2.357142857142857,
486
+ "grad_norm": 1.7670910358428955,
487
+ "learning_rate": 4.2460317460317464e-05,
488
+ "loss": 0.3145,
489
+ "step": 660
490
+ },
491
+ {
492
+ "epoch": 2.392857142857143,
493
+ "grad_norm": 1.441829800605774,
494
+ "learning_rate": 4.226190476190476e-05,
495
+ "loss": 0.2968,
496
+ "step": 670
497
+ },
498
+ {
499
+ "epoch": 2.4285714285714284,
500
+ "grad_norm": 1.5992521047592163,
501
+ "learning_rate": 4.2063492063492065e-05,
502
+ "loss": 0.353,
503
+ "step": 680
504
+ },
505
+ {
506
+ "epoch": 2.4642857142857144,
507
+ "grad_norm": 1.076711654663086,
508
+ "learning_rate": 4.186507936507937e-05,
509
+ "loss": 0.3461,
510
+ "step": 690
511
+ },
512
+ {
513
+ "epoch": 2.5,
514
+ "grad_norm": 1.0764169692993164,
515
+ "learning_rate": 4.166666666666667e-05,
516
+ "loss": 0.3345,
517
+ "step": 700
518
+ },
519
+ {
520
+ "epoch": 2.5357142857142856,
521
+ "grad_norm": 1.3735431432724,
522
+ "learning_rate": 4.1468253968253976e-05,
523
+ "loss": 0.3612,
524
+ "step": 710
525
+ },
526
+ {
527
+ "epoch": 2.571428571428571,
528
+ "grad_norm": 1.4703452587127686,
529
+ "learning_rate": 4.126984126984127e-05,
530
+ "loss": 0.3444,
531
+ "step": 720
532
+ },
533
+ {
534
+ "epoch": 2.607142857142857,
535
+ "grad_norm": 1.451257348060608,
536
+ "learning_rate": 4.107142857142857e-05,
537
+ "loss": 0.3287,
538
+ "step": 730
539
+ },
540
+ {
541
+ "epoch": 2.642857142857143,
542
+ "grad_norm": 1.8615336418151855,
543
+ "learning_rate": 4.0873015873015874e-05,
544
+ "loss": 0.3419,
545
+ "step": 740
546
+ },
547
+ {
548
+ "epoch": 2.678571428571429,
549
+ "grad_norm": 2.925323963165283,
550
+ "learning_rate": 4.067460317460318e-05,
551
+ "loss": 0.3217,
552
+ "step": 750
553
+ },
554
+ {
555
+ "epoch": 2.7142857142857144,
556
+ "grad_norm": 1.7260726690292358,
557
+ "learning_rate": 4.047619047619048e-05,
558
+ "loss": 0.3584,
559
+ "step": 760
560
+ },
561
+ {
562
+ "epoch": 2.75,
563
+ "grad_norm": 1.4659258127212524,
564
+ "learning_rate": 4.027777777777778e-05,
565
+ "loss": 0.3821,
566
+ "step": 770
567
+ },
568
+ {
569
+ "epoch": 2.7857142857142856,
570
+ "grad_norm": 1.7859466075897217,
571
+ "learning_rate": 4.007936507936508e-05,
572
+ "loss": 0.3595,
573
+ "step": 780
574
+ },
575
+ {
576
+ "epoch": 2.821428571428571,
577
+ "grad_norm": 1.2320784330368042,
578
+ "learning_rate": 3.9880952380952386e-05,
579
+ "loss": 0.3764,
580
+ "step": 790
581
+ },
582
+ {
583
+ "epoch": 2.857142857142857,
584
+ "grad_norm": 1.4097161293029785,
585
+ "learning_rate": 3.968253968253968e-05,
586
+ "loss": 0.3282,
587
+ "step": 800
588
+ },
589
+ {
590
+ "epoch": 2.892857142857143,
591
+ "grad_norm": 1.185285210609436,
592
+ "learning_rate": 3.9484126984126986e-05,
593
+ "loss": 0.3214,
594
+ "step": 810
595
+ },
596
+ {
597
+ "epoch": 2.928571428571429,
598
+ "grad_norm": 1.3706032037734985,
599
+ "learning_rate": 3.928571428571429e-05,
600
+ "loss": 0.3092,
601
+ "step": 820
602
+ },
603
+ {
604
+ "epoch": 2.9642857142857144,
605
+ "grad_norm": 2.1342084407806396,
606
+ "learning_rate": 3.908730158730159e-05,
607
+ "loss": 0.3274,
608
+ "step": 830
609
+ },
610
+ {
611
+ "epoch": 3.0,
612
+ "grad_norm": 1.438025712966919,
613
+ "learning_rate": 3.888888888888889e-05,
614
+ "loss": 0.3148,
615
+ "step": 840
616
+ },
617
+ {
618
+ "epoch": 3.0,
619
+ "eval_accuracy": 0.8599821348816435,
620
+ "eval_loss": 0.31560027599334717,
621
+ "eval_runtime": 116.0276,
622
+ "eval_samples_per_second": 38.594,
623
+ "eval_steps_per_second": 2.413,
624
+ "step": 840
625
+ },
626
+ {
627
+ "epoch": 3.0357142857142856,
628
+ "grad_norm": 1.6471174955368042,
629
+ "learning_rate": 3.8690476190476195e-05,
630
+ "loss": 0.3198,
631
+ "step": 850
632
+ },
633
+ {
634
+ "epoch": 3.0714285714285716,
635
+ "grad_norm": 1.5575629472732544,
636
+ "learning_rate": 3.84920634920635e-05,
637
+ "loss": 0.3127,
638
+ "step": 860
639
+ },
640
+ {
641
+ "epoch": 3.107142857142857,
642
+ "grad_norm": 1.1578933000564575,
643
+ "learning_rate": 3.8293650793650795e-05,
644
+ "loss": 0.3416,
645
+ "step": 870
646
+ },
647
+ {
648
+ "epoch": 3.142857142857143,
649
+ "grad_norm": 1.9167553186416626,
650
+ "learning_rate": 3.809523809523809e-05,
651
+ "loss": 0.3626,
652
+ "step": 880
653
+ },
654
+ {
655
+ "epoch": 3.1785714285714284,
656
+ "grad_norm": 1.038451910018921,
657
+ "learning_rate": 3.7896825396825396e-05,
658
+ "loss": 0.336,
659
+ "step": 890
660
+ },
661
+ {
662
+ "epoch": 3.2142857142857144,
663
+ "grad_norm": 1.2739081382751465,
664
+ "learning_rate": 3.76984126984127e-05,
665
+ "loss": 0.2838,
666
+ "step": 900
667
+ },
668
+ {
669
+ "epoch": 3.25,
670
+ "grad_norm": 1.2061238288879395,
671
+ "learning_rate": 3.7500000000000003e-05,
672
+ "loss": 0.3099,
673
+ "step": 910
674
+ },
675
+ {
676
+ "epoch": 3.2857142857142856,
677
+ "grad_norm": 1.5365681648254395,
678
+ "learning_rate": 3.730158730158731e-05,
679
+ "loss": 0.3163,
680
+ "step": 920
681
+ },
682
+ {
683
+ "epoch": 3.3214285714285716,
684
+ "grad_norm": 2.33838152885437,
685
+ "learning_rate": 3.7103174603174604e-05,
686
+ "loss": 0.3109,
687
+ "step": 930
688
+ },
689
+ {
690
+ "epoch": 3.357142857142857,
691
+ "grad_norm": 1.2280935049057007,
692
+ "learning_rate": 3.690476190476191e-05,
693
+ "loss": 0.3104,
694
+ "step": 940
695
+ },
696
+ {
697
+ "epoch": 3.392857142857143,
698
+ "grad_norm": 1.2506486177444458,
699
+ "learning_rate": 3.6706349206349205e-05,
700
+ "loss": 0.339,
701
+ "step": 950
702
+ },
703
+ {
704
+ "epoch": 3.4285714285714284,
705
+ "grad_norm": 1.0430885553359985,
706
+ "learning_rate": 3.650793650793651e-05,
707
+ "loss": 0.3015,
708
+ "step": 960
709
+ },
710
+ {
711
+ "epoch": 3.4642857142857144,
712
+ "grad_norm": 1.5323641300201416,
713
+ "learning_rate": 3.630952380952381e-05,
714
+ "loss": 0.3408,
715
+ "step": 970
716
+ },
717
+ {
718
+ "epoch": 3.5,
719
+ "grad_norm": 1.0826184749603271,
720
+ "learning_rate": 3.611111111111111e-05,
721
+ "loss": 0.3061,
722
+ "step": 980
723
+ },
724
+ {
725
+ "epoch": 3.5357142857142856,
726
+ "grad_norm": 2.019437313079834,
727
+ "learning_rate": 3.591269841269841e-05,
728
+ "loss": 0.3465,
729
+ "step": 990
730
+ },
731
+ {
732
+ "epoch": 3.571428571428571,
733
+ "grad_norm": 1.5877951383590698,
734
+ "learning_rate": 3.571428571428572e-05,
735
+ "loss": 0.3023,
736
+ "step": 1000
737
+ },
738
+ {
739
+ "epoch": 3.607142857142857,
740
+ "grad_norm": 1.5832830667495728,
741
+ "learning_rate": 3.551587301587302e-05,
742
+ "loss": 0.262,
743
+ "step": 1010
744
+ },
745
+ {
746
+ "epoch": 3.642857142857143,
747
+ "grad_norm": 1.5758150815963745,
748
+ "learning_rate": 3.5317460317460324e-05,
749
+ "loss": 0.3337,
750
+ "step": 1020
751
+ },
752
+ {
753
+ "epoch": 3.678571428571429,
754
+ "grad_norm": 1.255638837814331,
755
+ "learning_rate": 3.511904761904762e-05,
756
+ "loss": 0.2857,
757
+ "step": 1030
758
+ },
759
+ {
760
+ "epoch": 3.7142857142857144,
761
+ "grad_norm": 1.5360593795776367,
762
+ "learning_rate": 3.492063492063492e-05,
763
+ "loss": 0.2964,
764
+ "step": 1040
765
+ },
766
+ {
767
+ "epoch": 3.75,
768
+ "grad_norm": 1.0826270580291748,
769
+ "learning_rate": 3.472222222222222e-05,
770
+ "loss": 0.2872,
771
+ "step": 1050
772
+ },
773
+ {
774
+ "epoch": 3.7857142857142856,
775
+ "grad_norm": 1.6883878707885742,
776
+ "learning_rate": 3.4523809523809526e-05,
777
+ "loss": 0.3016,
778
+ "step": 1060
779
+ },
780
+ {
781
+ "epoch": 3.821428571428571,
782
+ "grad_norm": 1.8018616437911987,
783
+ "learning_rate": 3.432539682539683e-05,
784
+ "loss": 0.3115,
785
+ "step": 1070
786
+ },
787
+ {
788
+ "epoch": 3.857142857142857,
789
+ "grad_norm": 1.9364327192306519,
790
+ "learning_rate": 3.412698412698413e-05,
791
+ "loss": 0.2986,
792
+ "step": 1080
793
+ },
794
+ {
795
+ "epoch": 3.892857142857143,
796
+ "grad_norm": 1.6606508493423462,
797
+ "learning_rate": 3.392857142857143e-05,
798
+ "loss": 0.3017,
799
+ "step": 1090
800
+ },
801
+ {
802
+ "epoch": 3.928571428571429,
803
+ "grad_norm": 2.1849498748779297,
804
+ "learning_rate": 3.3730158730158734e-05,
805
+ "loss": 0.2779,
806
+ "step": 1100
807
+ },
808
+ {
809
+ "epoch": 3.9642857142857144,
810
+ "grad_norm": 1.5579859018325806,
811
+ "learning_rate": 3.353174603174603e-05,
812
+ "loss": 0.3199,
813
+ "step": 1110
814
+ },
815
+ {
816
+ "epoch": 4.0,
817
+ "grad_norm": 2.6287128925323486,
818
+ "learning_rate": 3.3333333333333335e-05,
819
+ "loss": 0.2988,
820
+ "step": 1120
821
+ },
822
+ {
823
+ "epoch": 4.0,
824
+ "eval_accuracy": 0.8729343456900402,
825
+ "eval_loss": 0.30018478631973267,
826
+ "eval_runtime": 115.4434,
827
+ "eval_samples_per_second": 38.79,
828
+ "eval_steps_per_second": 2.425,
829
+ "step": 1120
830
+ },
831
+ {
832
+ "epoch": 4.035714285714286,
833
+ "grad_norm": 1.5949360132217407,
834
+ "learning_rate": 3.313492063492064e-05,
835
+ "loss": 0.305,
836
+ "step": 1130
837
+ },
838
+ {
839
+ "epoch": 4.071428571428571,
840
+ "grad_norm": 1.678801417350769,
841
+ "learning_rate": 3.2936507936507936e-05,
842
+ "loss": 0.2677,
843
+ "step": 1140
844
+ },
845
+ {
846
+ "epoch": 4.107142857142857,
847
+ "grad_norm": 1.6015031337738037,
848
+ "learning_rate": 3.273809523809524e-05,
849
+ "loss": 0.3167,
850
+ "step": 1150
851
+ },
852
+ {
853
+ "epoch": 4.142857142857143,
854
+ "grad_norm": 1.525894045829773,
855
+ "learning_rate": 3.253968253968254e-05,
856
+ "loss": 0.2745,
857
+ "step": 1160
858
+ },
859
+ {
860
+ "epoch": 4.178571428571429,
861
+ "grad_norm": 1.2955286502838135,
862
+ "learning_rate": 3.234126984126985e-05,
863
+ "loss": 0.2925,
864
+ "step": 1170
865
+ },
866
+ {
867
+ "epoch": 4.214285714285714,
868
+ "grad_norm": 2.1041815280914307,
869
+ "learning_rate": 3.2142857142857144e-05,
870
+ "loss": 0.2782,
871
+ "step": 1180
872
+ },
873
+ {
874
+ "epoch": 4.25,
875
+ "grad_norm": 1.7325941324234009,
876
+ "learning_rate": 3.194444444444444e-05,
877
+ "loss": 0.302,
878
+ "step": 1190
879
+ },
880
+ {
881
+ "epoch": 4.285714285714286,
882
+ "grad_norm": 1.9786794185638428,
883
+ "learning_rate": 3.1746031746031745e-05,
884
+ "loss": 0.2864,
885
+ "step": 1200
886
+ },
887
+ {
888
+ "epoch": 4.321428571428571,
889
+ "grad_norm": 1.3869291543960571,
890
+ "learning_rate": 3.154761904761905e-05,
891
+ "loss": 0.2834,
892
+ "step": 1210
893
+ },
894
+ {
895
+ "epoch": 4.357142857142857,
896
+ "grad_norm": 2.1556620597839355,
897
+ "learning_rate": 3.134920634920635e-05,
898
+ "loss": 0.3011,
899
+ "step": 1220
900
+ },
901
+ {
902
+ "epoch": 4.392857142857143,
903
+ "grad_norm": 1.6058976650238037,
904
+ "learning_rate": 3.1150793650793656e-05,
905
+ "loss": 0.2893,
906
+ "step": 1230
907
+ },
908
+ {
909
+ "epoch": 4.428571428571429,
910
+ "grad_norm": 1.9528658390045166,
911
+ "learning_rate": 3.095238095238095e-05,
912
+ "loss": 0.2725,
913
+ "step": 1240
914
+ },
915
+ {
916
+ "epoch": 4.464285714285714,
917
+ "grad_norm": 1.5343618392944336,
918
+ "learning_rate": 3.075396825396826e-05,
919
+ "loss": 0.2578,
920
+ "step": 1250
921
+ },
922
+ {
923
+ "epoch": 4.5,
924
+ "grad_norm": 1.0154541730880737,
925
+ "learning_rate": 3.055555555555556e-05,
926
+ "loss": 0.2897,
927
+ "step": 1260
928
+ },
929
+ {
930
+ "epoch": 4.535714285714286,
931
+ "grad_norm": 2.355865001678467,
932
+ "learning_rate": 3.0357142857142857e-05,
933
+ "loss": 0.2658,
934
+ "step": 1270
935
+ },
936
+ {
937
+ "epoch": 4.571428571428571,
938
+ "grad_norm": 1.24091374874115,
939
+ "learning_rate": 3.0158730158730158e-05,
940
+ "loss": 0.3198,
941
+ "step": 1280
942
+ },
943
+ {
944
+ "epoch": 4.607142857142857,
945
+ "grad_norm": 1.0385922193527222,
946
+ "learning_rate": 2.996031746031746e-05,
947
+ "loss": 0.285,
948
+ "step": 1290
949
+ },
950
+ {
951
+ "epoch": 4.642857142857143,
952
+ "grad_norm": 1.032619833946228,
953
+ "learning_rate": 2.9761904761904762e-05,
954
+ "loss": 0.2741,
955
+ "step": 1300
956
+ },
957
+ {
958
+ "epoch": 4.678571428571429,
959
+ "grad_norm": 1.587849736213684,
960
+ "learning_rate": 2.9563492063492066e-05,
961
+ "loss": 0.3093,
962
+ "step": 1310
963
+ },
964
+ {
965
+ "epoch": 4.714285714285714,
966
+ "grad_norm": 1.2873854637145996,
967
+ "learning_rate": 2.9365079365079366e-05,
968
+ "loss": 0.299,
969
+ "step": 1320
970
+ },
971
+ {
972
+ "epoch": 4.75,
973
+ "grad_norm": 1.6315929889678955,
974
+ "learning_rate": 2.916666666666667e-05,
975
+ "loss": 0.2776,
976
+ "step": 1330
977
+ },
978
+ {
979
+ "epoch": 4.785714285714286,
980
+ "grad_norm": 1.8490331172943115,
981
+ "learning_rate": 2.8968253968253974e-05,
982
+ "loss": 0.2613,
983
+ "step": 1340
984
+ },
985
+ {
986
+ "epoch": 4.821428571428571,
987
+ "grad_norm": 1.8352344036102295,
988
+ "learning_rate": 2.876984126984127e-05,
989
+ "loss": 0.281,
990
+ "step": 1350
991
+ },
992
+ {
993
+ "epoch": 4.857142857142857,
994
+ "grad_norm": 1.0744656324386597,
995
+ "learning_rate": 2.857142857142857e-05,
996
+ "loss": 0.2622,
997
+ "step": 1360
998
+ },
999
+ {
1000
+ "epoch": 4.892857142857143,
1001
+ "grad_norm": 1.2477718591690063,
1002
+ "learning_rate": 2.8373015873015875e-05,
1003
+ "loss": 0.2997,
1004
+ "step": 1370
1005
+ },
1006
+ {
1007
+ "epoch": 4.928571428571429,
1008
+ "grad_norm": 1.5114529132843018,
1009
+ "learning_rate": 2.8174603174603175e-05,
1010
+ "loss": 0.2827,
1011
+ "step": 1380
1012
+ },
1013
+ {
1014
+ "epoch": 4.964285714285714,
1015
+ "grad_norm": 1.3266760110855103,
1016
+ "learning_rate": 2.797619047619048e-05,
1017
+ "loss": 0.283,
1018
+ "step": 1390
1019
+ },
1020
+ {
1021
+ "epoch": 5.0,
1022
+ "grad_norm": 2.2647006511688232,
1023
+ "learning_rate": 2.777777777777778e-05,
1024
+ "loss": 0.2498,
1025
+ "step": 1400
1026
+ },
1027
+ {
1028
+ "epoch": 5.0,
1029
+ "eval_accuracy": 0.8693613220187584,
1030
+ "eval_loss": 0.30874186754226685,
1031
+ "eval_runtime": 116.4377,
1032
+ "eval_samples_per_second": 38.458,
1033
+ "eval_steps_per_second": 2.405,
1034
+ "step": 1400
1035
+ },
1036
+ {
1037
+ "epoch": 5.035714285714286,
1038
+ "grad_norm": 1.6704837083816528,
1039
+ "learning_rate": 2.7579365079365083e-05,
1040
+ "loss": 0.2973,
1041
+ "step": 1410
1042
+ },
1043
+ {
1044
+ "epoch": 5.071428571428571,
1045
+ "grad_norm": 1.9291003942489624,
1046
+ "learning_rate": 2.7380952380952383e-05,
1047
+ "loss": 0.2756,
1048
+ "step": 1420
1049
+ },
1050
+ {
1051
+ "epoch": 5.107142857142857,
1052
+ "grad_norm": 2.44765043258667,
1053
+ "learning_rate": 2.718253968253968e-05,
1054
+ "loss": 0.2575,
1055
+ "step": 1430
1056
+ },
1057
+ {
1058
+ "epoch": 5.142857142857143,
1059
+ "grad_norm": 2.0583574771881104,
1060
+ "learning_rate": 2.6984126984126984e-05,
1061
+ "loss": 0.256,
1062
+ "step": 1440
1063
+ },
1064
+ {
1065
+ "epoch": 5.178571428571429,
1066
+ "grad_norm": 1.8882899284362793,
1067
+ "learning_rate": 2.6785714285714288e-05,
1068
+ "loss": 0.299,
1069
+ "step": 1450
1070
+ },
1071
+ {
1072
+ "epoch": 5.214285714285714,
1073
+ "grad_norm": 1.5178027153015137,
1074
+ "learning_rate": 2.6587301587301588e-05,
1075
+ "loss": 0.3209,
1076
+ "step": 1460
1077
+ },
1078
+ {
1079
+ "epoch": 5.25,
1080
+ "grad_norm": 1.4342873096466064,
1081
+ "learning_rate": 2.6388888888888892e-05,
1082
+ "loss": 0.2828,
1083
+ "step": 1470
1084
+ },
1085
+ {
1086
+ "epoch": 5.285714285714286,
1087
+ "grad_norm": 1.3157316446304321,
1088
+ "learning_rate": 2.6190476190476192e-05,
1089
+ "loss": 0.2655,
1090
+ "step": 1480
1091
+ },
1092
+ {
1093
+ "epoch": 5.321428571428571,
1094
+ "grad_norm": 1.0939053297042847,
1095
+ "learning_rate": 2.5992063492063496e-05,
1096
+ "loss": 0.278,
1097
+ "step": 1490
1098
+ },
1099
+ {
1100
+ "epoch": 5.357142857142857,
1101
+ "grad_norm": 1.9465833902359009,
1102
+ "learning_rate": 2.5793650793650796e-05,
1103
+ "loss": 0.2652,
1104
+ "step": 1500
1105
+ },
1106
+ {
1107
+ "epoch": 5.392857142857143,
1108
+ "grad_norm": 1.5688917636871338,
1109
+ "learning_rate": 2.5595238095238093e-05,
1110
+ "loss": 0.2622,
1111
+ "step": 1510
1112
+ },
1113
+ {
1114
+ "epoch": 5.428571428571429,
1115
+ "grad_norm": 1.3724976778030396,
1116
+ "learning_rate": 2.5396825396825397e-05,
1117
+ "loss": 0.2524,
1118
+ "step": 1520
1119
+ },
1120
+ {
1121
+ "epoch": 5.464285714285714,
1122
+ "grad_norm": 1.160733699798584,
1123
+ "learning_rate": 2.5198412698412697e-05,
1124
+ "loss": 0.2868,
1125
+ "step": 1530
1126
+ },
1127
+ {
1128
+ "epoch": 5.5,
1129
+ "grad_norm": 1.2799668312072754,
1130
+ "learning_rate": 2.5e-05,
1131
+ "loss": 0.2713,
1132
+ "step": 1540
1133
+ },
1134
+ {
1135
+ "epoch": 5.535714285714286,
1136
+ "grad_norm": 1.9062435626983643,
1137
+ "learning_rate": 2.4801587301587305e-05,
1138
+ "loss": 0.2527,
1139
+ "step": 1550
1140
+ },
1141
+ {
1142
+ "epoch": 5.571428571428571,
1143
+ "grad_norm": 2.5998220443725586,
1144
+ "learning_rate": 2.4603174603174602e-05,
1145
+ "loss": 0.2668,
1146
+ "step": 1560
1147
+ },
1148
+ {
1149
+ "epoch": 5.607142857142857,
1150
+ "grad_norm": 2.346576452255249,
1151
+ "learning_rate": 2.4404761904761906e-05,
1152
+ "loss": 0.2557,
1153
+ "step": 1570
1154
+ },
1155
+ {
1156
+ "epoch": 5.642857142857143,
1157
+ "grad_norm": 1.9609774351119995,
1158
+ "learning_rate": 2.4206349206349206e-05,
1159
+ "loss": 0.2706,
1160
+ "step": 1580
1161
+ },
1162
+ {
1163
+ "epoch": 5.678571428571429,
1164
+ "grad_norm": 1.8994617462158203,
1165
+ "learning_rate": 2.400793650793651e-05,
1166
+ "loss": 0.2878,
1167
+ "step": 1590
1168
+ },
1169
+ {
1170
+ "epoch": 5.714285714285714,
1171
+ "grad_norm": 1.677849531173706,
1172
+ "learning_rate": 2.380952380952381e-05,
1173
+ "loss": 0.2761,
1174
+ "step": 1600
1175
+ },
1176
+ {
1177
+ "epoch": 5.75,
1178
+ "grad_norm": 1.6345635652542114,
1179
+ "learning_rate": 2.361111111111111e-05,
1180
+ "loss": 0.2231,
1181
+ "step": 1610
1182
+ },
1183
+ {
1184
+ "epoch": 5.785714285714286,
1185
+ "grad_norm": 1.6198554039001465,
1186
+ "learning_rate": 2.3412698412698414e-05,
1187
+ "loss": 0.2612,
1188
+ "step": 1620
1189
+ },
1190
+ {
1191
+ "epoch": 5.821428571428571,
1192
+ "grad_norm": 2.052764415740967,
1193
+ "learning_rate": 2.3214285714285715e-05,
1194
+ "loss": 0.2738,
1195
+ "step": 1630
1196
+ },
1197
+ {
1198
+ "epoch": 5.857142857142857,
1199
+ "grad_norm": 1.0596450567245483,
1200
+ "learning_rate": 2.3015873015873015e-05,
1201
+ "loss": 0.2597,
1202
+ "step": 1640
1203
+ },
1204
+ {
1205
+ "epoch": 5.892857142857143,
1206
+ "grad_norm": 1.9113609790802002,
1207
+ "learning_rate": 2.281746031746032e-05,
1208
+ "loss": 0.2894,
1209
+ "step": 1650
1210
+ },
1211
+ {
1212
+ "epoch": 5.928571428571429,
1213
+ "grad_norm": 1.7055829763412476,
1214
+ "learning_rate": 2.261904761904762e-05,
1215
+ "loss": 0.226,
1216
+ "step": 1660
1217
+ },
1218
+ {
1219
+ "epoch": 5.964285714285714,
1220
+ "grad_norm": 0.969464898109436,
1221
+ "learning_rate": 2.2420634920634923e-05,
1222
+ "loss": 0.2977,
1223
+ "step": 1670
1224
+ },
1225
+ {
1226
+ "epoch": 6.0,
1227
+ "grad_norm": 3.1485939025878906,
1228
+ "learning_rate": 2.2222222222222223e-05,
1229
+ "loss": 0.3028,
1230
+ "step": 1680
1231
+ },
1232
+ {
1233
+ "epoch": 6.0,
1234
+ "eval_accuracy": 0.8715944618133095,
1235
+ "eval_loss": 0.29658079147338867,
1236
+ "eval_runtime": 113.6842,
1237
+ "eval_samples_per_second": 39.39,
1238
+ "eval_steps_per_second": 2.463,
1239
+ "step": 1680
1240
+ },
1241
+ {
1242
+ "epoch": 6.035714285714286,
1243
+ "grad_norm": 1.4179354906082153,
1244
+ "learning_rate": 2.2023809523809524e-05,
1245
+ "loss": 0.2681,
1246
+ "step": 1690
1247
+ },
1248
+ {
1249
+ "epoch": 6.071428571428571,
1250
+ "grad_norm": 2.099681854248047,
1251
+ "learning_rate": 2.1825396825396827e-05,
1252
+ "loss": 0.2441,
1253
+ "step": 1700
1254
+ },
1255
+ {
1256
+ "epoch": 6.107142857142857,
1257
+ "grad_norm": 1.5406619310379028,
1258
+ "learning_rate": 2.1626984126984128e-05,
1259
+ "loss": 0.2707,
1260
+ "step": 1710
1261
+ },
1262
+ {
1263
+ "epoch": 6.142857142857143,
1264
+ "grad_norm": 2.4518625736236572,
1265
+ "learning_rate": 2.1428571428571428e-05,
1266
+ "loss": 0.2942,
1267
+ "step": 1720
1268
+ },
1269
+ {
1270
+ "epoch": 6.178571428571429,
1271
+ "grad_norm": 1.1450200080871582,
1272
+ "learning_rate": 2.1230158730158732e-05,
1273
+ "loss": 0.2336,
1274
+ "step": 1730
1275
+ },
1276
+ {
1277
+ "epoch": 6.214285714285714,
1278
+ "grad_norm": 1.4170438051223755,
1279
+ "learning_rate": 2.1031746031746032e-05,
1280
+ "loss": 0.2857,
1281
+ "step": 1740
1282
+ },
1283
+ {
1284
+ "epoch": 6.25,
1285
+ "grad_norm": 1.2231560945510864,
1286
+ "learning_rate": 2.0833333333333336e-05,
1287
+ "loss": 0.2507,
1288
+ "step": 1750
1289
+ },
1290
+ {
1291
+ "epoch": 6.285714285714286,
1292
+ "grad_norm": 1.4597039222717285,
1293
+ "learning_rate": 2.0634920634920636e-05,
1294
+ "loss": 0.2688,
1295
+ "step": 1760
1296
+ },
1297
+ {
1298
+ "epoch": 6.321428571428571,
1299
+ "grad_norm": 1.9395854473114014,
1300
+ "learning_rate": 2.0436507936507937e-05,
1301
+ "loss": 0.2715,
1302
+ "step": 1770
1303
+ },
1304
+ {
1305
+ "epoch": 6.357142857142857,
1306
+ "grad_norm": 0.9649907946586609,
1307
+ "learning_rate": 2.023809523809524e-05,
1308
+ "loss": 0.2508,
1309
+ "step": 1780
1310
+ },
1311
+ {
1312
+ "epoch": 6.392857142857143,
1313
+ "grad_norm": 1.3842933177947998,
1314
+ "learning_rate": 2.003968253968254e-05,
1315
+ "loss": 0.2818,
1316
+ "step": 1790
1317
+ },
1318
+ {
1319
+ "epoch": 6.428571428571429,
1320
+ "grad_norm": 1.7550239562988281,
1321
+ "learning_rate": 1.984126984126984e-05,
1322
+ "loss": 0.2553,
1323
+ "step": 1800
1324
+ },
1325
+ {
1326
+ "epoch": 6.464285714285714,
1327
+ "grad_norm": 1.04232919216156,
1328
+ "learning_rate": 1.9642857142857145e-05,
1329
+ "loss": 0.2303,
1330
+ "step": 1810
1331
+ },
1332
+ {
1333
+ "epoch": 6.5,
1334
+ "grad_norm": 2.306325674057007,
1335
+ "learning_rate": 1.9444444444444445e-05,
1336
+ "loss": 0.257,
1337
+ "step": 1820
1338
+ },
1339
+ {
1340
+ "epoch": 6.535714285714286,
1341
+ "grad_norm": 1.704186201095581,
1342
+ "learning_rate": 1.924603174603175e-05,
1343
+ "loss": 0.2844,
1344
+ "step": 1830
1345
+ },
1346
+ {
1347
+ "epoch": 6.571428571428571,
1348
+ "grad_norm": 1.3930236101150513,
1349
+ "learning_rate": 1.9047619047619046e-05,
1350
+ "loss": 0.2784,
1351
+ "step": 1840
1352
+ },
1353
+ {
1354
+ "epoch": 6.607142857142857,
1355
+ "grad_norm": 1.5494309663772583,
1356
+ "learning_rate": 1.884920634920635e-05,
1357
+ "loss": 0.2735,
1358
+ "step": 1850
1359
+ },
1360
+ {
1361
+ "epoch": 6.642857142857143,
1362
+ "grad_norm": 1.5971505641937256,
1363
+ "learning_rate": 1.8650793650793654e-05,
1364
+ "loss": 0.2466,
1365
+ "step": 1860
1366
+ },
1367
+ {
1368
+ "epoch": 6.678571428571429,
1369
+ "grad_norm": 1.8419290781021118,
1370
+ "learning_rate": 1.8452380952380954e-05,
1371
+ "loss": 0.2476,
1372
+ "step": 1870
1373
+ },
1374
+ {
1375
+ "epoch": 6.714285714285714,
1376
+ "grad_norm": 1.3679064512252808,
1377
+ "learning_rate": 1.8253968253968254e-05,
1378
+ "loss": 0.2277,
1379
+ "step": 1880
1380
+ },
1381
+ {
1382
+ "epoch": 6.75,
1383
+ "grad_norm": 1.1586477756500244,
1384
+ "learning_rate": 1.8055555555555555e-05,
1385
+ "loss": 0.2266,
1386
+ "step": 1890
1387
+ },
1388
+ {
1389
+ "epoch": 6.785714285714286,
1390
+ "grad_norm": 1.9979654550552368,
1391
+ "learning_rate": 1.785714285714286e-05,
1392
+ "loss": 0.2747,
1393
+ "step": 1900
1394
+ },
1395
+ {
1396
+ "epoch": 6.821428571428571,
1397
+ "grad_norm": 1.4875764846801758,
1398
+ "learning_rate": 1.7658730158730162e-05,
1399
+ "loss": 0.2666,
1400
+ "step": 1910
1401
+ },
1402
+ {
1403
+ "epoch": 6.857142857142857,
1404
+ "grad_norm": 1.6196482181549072,
1405
+ "learning_rate": 1.746031746031746e-05,
1406
+ "loss": 0.2532,
1407
+ "step": 1920
1408
+ },
1409
+ {
1410
+ "epoch": 6.892857142857143,
1411
+ "grad_norm": 2.7135918140411377,
1412
+ "learning_rate": 1.7261904761904763e-05,
1413
+ "loss": 0.2265,
1414
+ "step": 1930
1415
+ },
1416
+ {
1417
+ "epoch": 6.928571428571429,
1418
+ "grad_norm": 1.423257827758789,
1419
+ "learning_rate": 1.7063492063492063e-05,
1420
+ "loss": 0.2621,
1421
+ "step": 1940
1422
+ },
1423
+ {
1424
+ "epoch": 6.964285714285714,
1425
+ "grad_norm": 2.1509084701538086,
1426
+ "learning_rate": 1.6865079365079367e-05,
1427
+ "loss": 0.2636,
1428
+ "step": 1950
1429
+ },
1430
+ {
1431
+ "epoch": 7.0,
1432
+ "grad_norm": 1.562657117843628,
1433
+ "learning_rate": 1.6666666666666667e-05,
1434
+ "loss": 0.2179,
1435
+ "step": 1960
1436
+ },
1437
+ {
1438
+ "epoch": 7.0,
1439
+ "eval_accuracy": 0.8807503349709692,
1440
+ "eval_loss": 0.2742190361022949,
1441
+ "eval_runtime": 113.8381,
1442
+ "eval_samples_per_second": 39.337,
1443
+ "eval_steps_per_second": 2.46,
1444
+ "step": 1960
1445
+ },
1446
+ {
1447
+ "epoch": 7.035714285714286,
1448
+ "grad_norm": 1.4138425588607788,
1449
+ "learning_rate": 1.6468253968253968e-05,
1450
+ "loss": 0.2251,
1451
+ "step": 1970
1452
+ },
1453
+ {
1454
+ "epoch": 7.071428571428571,
1455
+ "grad_norm": 2.6958255767822266,
1456
+ "learning_rate": 1.626984126984127e-05,
1457
+ "loss": 0.2521,
1458
+ "step": 1980
1459
+ },
1460
+ {
1461
+ "epoch": 7.107142857142857,
1462
+ "grad_norm": 2.014803647994995,
1463
+ "learning_rate": 1.6071428571428572e-05,
1464
+ "loss": 0.216,
1465
+ "step": 1990
1466
+ },
1467
+ {
1468
+ "epoch": 7.142857142857143,
1469
+ "grad_norm": 2.1656410694122314,
1470
+ "learning_rate": 1.5873015873015872e-05,
1471
+ "loss": 0.2085,
1472
+ "step": 2000
1473
+ },
1474
+ {
1475
+ "epoch": 7.178571428571429,
1476
+ "grad_norm": 2.207980155944824,
1477
+ "learning_rate": 1.5674603174603176e-05,
1478
+ "loss": 0.263,
1479
+ "step": 2010
1480
+ },
1481
+ {
1482
+ "epoch": 7.214285714285714,
1483
+ "grad_norm": 1.978162169456482,
1484
+ "learning_rate": 1.5476190476190476e-05,
1485
+ "loss": 0.2478,
1486
+ "step": 2020
1487
+ },
1488
+ {
1489
+ "epoch": 7.25,
1490
+ "grad_norm": 1.2705848217010498,
1491
+ "learning_rate": 1.527777777777778e-05,
1492
+ "loss": 0.2338,
1493
+ "step": 2030
1494
+ },
1495
+ {
1496
+ "epoch": 7.285714285714286,
1497
+ "grad_norm": 1.6287261247634888,
1498
+ "learning_rate": 1.5079365079365079e-05,
1499
+ "loss": 0.2063,
1500
+ "step": 2040
1501
+ },
1502
+ {
1503
+ "epoch": 7.321428571428571,
1504
+ "grad_norm": 2.0518875122070312,
1505
+ "learning_rate": 1.4880952380952381e-05,
1506
+ "loss": 0.3032,
1507
+ "step": 2050
1508
+ },
1509
+ {
1510
+ "epoch": 7.357142857142857,
1511
+ "grad_norm": 2.7869536876678467,
1512
+ "learning_rate": 1.4682539682539683e-05,
1513
+ "loss": 0.2242,
1514
+ "step": 2060
1515
+ },
1516
+ {
1517
+ "epoch": 7.392857142857143,
1518
+ "grad_norm": 1.007391095161438,
1519
+ "learning_rate": 1.4484126984126987e-05,
1520
+ "loss": 0.2589,
1521
+ "step": 2070
1522
+ },
1523
+ {
1524
+ "epoch": 7.428571428571429,
1525
+ "grad_norm": 1.2812851667404175,
1526
+ "learning_rate": 1.4285714285714285e-05,
1527
+ "loss": 0.2383,
1528
+ "step": 2080
1529
+ },
1530
+ {
1531
+ "epoch": 7.464285714285714,
1532
+ "grad_norm": 1.1465330123901367,
1533
+ "learning_rate": 1.4087301587301587e-05,
1534
+ "loss": 0.2284,
1535
+ "step": 2090
1536
+ },
1537
+ {
1538
+ "epoch": 7.5,
1539
+ "grad_norm": 2.2567813396453857,
1540
+ "learning_rate": 1.388888888888889e-05,
1541
+ "loss": 0.213,
1542
+ "step": 2100
1543
+ },
1544
+ {
1545
+ "epoch": 7.535714285714286,
1546
+ "grad_norm": 1.7949641942977905,
1547
+ "learning_rate": 1.3690476190476192e-05,
1548
+ "loss": 0.2368,
1549
+ "step": 2110
1550
+ },
1551
+ {
1552
+ "epoch": 7.571428571428571,
1553
+ "grad_norm": 2.443598985671997,
1554
+ "learning_rate": 1.3492063492063492e-05,
1555
+ "loss": 0.2665,
1556
+ "step": 2120
1557
+ },
1558
+ {
1559
+ "epoch": 7.607142857142857,
1560
+ "grad_norm": 1.9699336290359497,
1561
+ "learning_rate": 1.3293650793650794e-05,
1562
+ "loss": 0.2346,
1563
+ "step": 2130
1564
+ },
1565
+ {
1566
+ "epoch": 7.642857142857143,
1567
+ "grad_norm": 1.543039083480835,
1568
+ "learning_rate": 1.3095238095238096e-05,
1569
+ "loss": 0.2432,
1570
+ "step": 2140
1571
+ },
1572
+ {
1573
+ "epoch": 7.678571428571429,
1574
+ "grad_norm": 1.9814691543579102,
1575
+ "learning_rate": 1.2896825396825398e-05,
1576
+ "loss": 0.2575,
1577
+ "step": 2150
1578
+ },
1579
+ {
1580
+ "epoch": 7.714285714285714,
1581
+ "grad_norm": 2.1088602542877197,
1582
+ "learning_rate": 1.2698412698412699e-05,
1583
+ "loss": 0.2346,
1584
+ "step": 2160
1585
+ },
1586
+ {
1587
+ "epoch": 7.75,
1588
+ "grad_norm": 1.5648256540298462,
1589
+ "learning_rate": 1.25e-05,
1590
+ "loss": 0.2403,
1591
+ "step": 2170
1592
+ },
1593
+ {
1594
+ "epoch": 7.785714285714286,
1595
+ "grad_norm": 1.6079583168029785,
1596
+ "learning_rate": 1.2301587301587301e-05,
1597
+ "loss": 0.2414,
1598
+ "step": 2180
1599
+ },
1600
+ {
1601
+ "epoch": 7.821428571428571,
1602
+ "grad_norm": 1.2859593629837036,
1603
+ "learning_rate": 1.2103174603174603e-05,
1604
+ "loss": 0.2756,
1605
+ "step": 2190
1606
+ },
1607
+ {
1608
+ "epoch": 7.857142857142857,
1609
+ "grad_norm": 2.072089672088623,
1610
+ "learning_rate": 1.1904761904761905e-05,
1611
+ "loss": 0.2484,
1612
+ "step": 2200
1613
+ },
1614
+ {
1615
+ "epoch": 7.892857142857143,
1616
+ "grad_norm": 1.623353362083435,
1617
+ "learning_rate": 1.1706349206349207e-05,
1618
+ "loss": 0.274,
1619
+ "step": 2210
1620
+ },
1621
+ {
1622
+ "epoch": 7.928571428571429,
1623
+ "grad_norm": 1.70241379737854,
1624
+ "learning_rate": 1.1507936507936508e-05,
1625
+ "loss": 0.2307,
1626
+ "step": 2220
1627
+ },
1628
+ {
1629
+ "epoch": 7.964285714285714,
1630
+ "grad_norm": 1.7186700105667114,
1631
+ "learning_rate": 1.130952380952381e-05,
1632
+ "loss": 0.2223,
1633
+ "step": 2230
1634
+ },
1635
+ {
1636
+ "epoch": 8.0,
1637
+ "grad_norm": 2.0665862560272217,
1638
+ "learning_rate": 1.1111111111111112e-05,
1639
+ "loss": 0.2274,
1640
+ "step": 2240
1641
+ },
1642
+ {
1643
+ "epoch": 8.0,
1644
+ "eval_accuracy": 0.8814202769093346,
1645
+ "eval_loss": 0.2860513925552368,
1646
+ "eval_runtime": 115.9278,
1647
+ "eval_samples_per_second": 38.627,
1648
+ "eval_steps_per_second": 2.415,
1649
+ "step": 2240
1650
+ },
1651
+ {
1652
+ "epoch": 8.035714285714286,
1653
+ "grad_norm": 1.3822689056396484,
1654
+ "learning_rate": 1.0912698412698414e-05,
1655
+ "loss": 0.238,
1656
+ "step": 2250
1657
+ },
1658
+ {
1659
+ "epoch": 8.071428571428571,
1660
+ "grad_norm": 1.4226853847503662,
1661
+ "learning_rate": 1.0714285714285714e-05,
1662
+ "loss": 0.266,
1663
+ "step": 2260
1664
+ },
1665
+ {
1666
+ "epoch": 8.107142857142858,
1667
+ "grad_norm": 1.9675803184509277,
1668
+ "learning_rate": 1.0515873015873016e-05,
1669
+ "loss": 0.2814,
1670
+ "step": 2270
1671
+ },
1672
+ {
1673
+ "epoch": 8.142857142857142,
1674
+ "grad_norm": 1.8133440017700195,
1675
+ "learning_rate": 1.0317460317460318e-05,
1676
+ "loss": 0.2486,
1677
+ "step": 2280
1678
+ },
1679
+ {
1680
+ "epoch": 8.178571428571429,
1681
+ "grad_norm": 1.2323780059814453,
1682
+ "learning_rate": 1.011904761904762e-05,
1683
+ "loss": 0.2175,
1684
+ "step": 2290
1685
+ },
1686
+ {
1687
+ "epoch": 8.214285714285714,
1688
+ "grad_norm": 1.7063086032867432,
1689
+ "learning_rate": 9.92063492063492e-06,
1690
+ "loss": 0.2125,
1691
+ "step": 2300
1692
+ },
1693
+ {
1694
+ "epoch": 8.25,
1695
+ "grad_norm": 1.532769799232483,
1696
+ "learning_rate": 9.722222222222223e-06,
1697
+ "loss": 0.1813,
1698
+ "step": 2310
1699
+ },
1700
+ {
1701
+ "epoch": 8.285714285714286,
1702
+ "grad_norm": 1.5620160102844238,
1703
+ "learning_rate": 9.523809523809523e-06,
1704
+ "loss": 0.2355,
1705
+ "step": 2320
1706
+ },
1707
+ {
1708
+ "epoch": 8.321428571428571,
1709
+ "grad_norm": 1.6230847835540771,
1710
+ "learning_rate": 9.325396825396827e-06,
1711
+ "loss": 0.2199,
1712
+ "step": 2330
1713
+ },
1714
+ {
1715
+ "epoch": 8.357142857142858,
1716
+ "grad_norm": 1.552085041999817,
1717
+ "learning_rate": 9.126984126984127e-06,
1718
+ "loss": 0.2479,
1719
+ "step": 2340
1720
+ },
1721
+ {
1722
+ "epoch": 8.392857142857142,
1723
+ "grad_norm": 1.8006685972213745,
1724
+ "learning_rate": 8.92857142857143e-06,
1725
+ "loss": 0.2677,
1726
+ "step": 2350
1727
+ },
1728
+ {
1729
+ "epoch": 8.428571428571429,
1730
+ "grad_norm": 1.2057029008865356,
1731
+ "learning_rate": 8.73015873015873e-06,
1732
+ "loss": 0.2022,
1733
+ "step": 2360
1734
+ },
1735
+ {
1736
+ "epoch": 8.464285714285714,
1737
+ "grad_norm": 1.4805638790130615,
1738
+ "learning_rate": 8.531746031746032e-06,
1739
+ "loss": 0.2158,
1740
+ "step": 2370
1741
+ },
1742
+ {
1743
+ "epoch": 8.5,
1744
+ "grad_norm": 1.441603422164917,
1745
+ "learning_rate": 8.333333333333334e-06,
1746
+ "loss": 0.2166,
1747
+ "step": 2380
1748
+ },
1749
+ {
1750
+ "epoch": 8.535714285714286,
1751
+ "grad_norm": 2.171687602996826,
1752
+ "learning_rate": 8.134920634920636e-06,
1753
+ "loss": 0.2332,
1754
+ "step": 2390
1755
+ },
1756
+ {
1757
+ "epoch": 8.571428571428571,
1758
+ "grad_norm": 1.5523641109466553,
1759
+ "learning_rate": 7.936507936507936e-06,
1760
+ "loss": 0.2099,
1761
+ "step": 2400
1762
+ },
1763
+ {
1764
+ "epoch": 8.607142857142858,
1765
+ "grad_norm": 1.429527759552002,
1766
+ "learning_rate": 7.738095238095238e-06,
1767
+ "loss": 0.1934,
1768
+ "step": 2410
1769
+ },
1770
+ {
1771
+ "epoch": 8.642857142857142,
1772
+ "grad_norm": 3.8555209636688232,
1773
+ "learning_rate": 7.5396825396825394e-06,
1774
+ "loss": 0.2813,
1775
+ "step": 2420
1776
+ },
1777
+ {
1778
+ "epoch": 8.678571428571429,
1779
+ "grad_norm": 1.118416666984558,
1780
+ "learning_rate": 7.3412698412698415e-06,
1781
+ "loss": 0.2358,
1782
+ "step": 2430
1783
+ },
1784
+ {
1785
+ "epoch": 8.714285714285714,
1786
+ "grad_norm": 1.9037988185882568,
1787
+ "learning_rate": 7.142857142857143e-06,
1788
+ "loss": 0.2091,
1789
+ "step": 2440
1790
+ },
1791
+ {
1792
+ "epoch": 8.75,
1793
+ "grad_norm": 1.8768919706344604,
1794
+ "learning_rate": 6.944444444444445e-06,
1795
+ "loss": 0.2176,
1796
+ "step": 2450
1797
+ },
1798
+ {
1799
+ "epoch": 8.785714285714286,
1800
+ "grad_norm": 1.606123685836792,
1801
+ "learning_rate": 6.746031746031746e-06,
1802
+ "loss": 0.2388,
1803
+ "step": 2460
1804
+ },
1805
+ {
1806
+ "epoch": 8.821428571428571,
1807
+ "grad_norm": 2.6942107677459717,
1808
+ "learning_rate": 6.547619047619048e-06,
1809
+ "loss": 0.2182,
1810
+ "step": 2470
1811
+ },
1812
+ {
1813
+ "epoch": 8.857142857142858,
1814
+ "grad_norm": 1.449601173400879,
1815
+ "learning_rate": 6.349206349206349e-06,
1816
+ "loss": 0.2102,
1817
+ "step": 2480
1818
+ },
1819
+ {
1820
+ "epoch": 8.892857142857142,
1821
+ "grad_norm": 2.651686668395996,
1822
+ "learning_rate": 6.1507936507936505e-06,
1823
+ "loss": 0.2166,
1824
+ "step": 2490
1825
+ },
1826
+ {
1827
+ "epoch": 8.928571428571429,
1828
+ "grad_norm": 2.6743762493133545,
1829
+ "learning_rate": 5.9523809523809525e-06,
1830
+ "loss": 0.2468,
1831
+ "step": 2500
1832
+ },
1833
+ {
1834
+ "epoch": 8.964285714285714,
1835
+ "grad_norm": 2.29903244972229,
1836
+ "learning_rate": 5.753968253968254e-06,
1837
+ "loss": 0.2111,
1838
+ "step": 2510
1839
+ },
1840
+ {
1841
+ "epoch": 9.0,
1842
+ "grad_norm": 1.8269622325897217,
1843
+ "learning_rate": 5.555555555555556e-06,
1844
+ "loss": 0.2195,
1845
+ "step": 2520
1846
+ },
1847
+ {
1848
+ "epoch": 9.0,
1849
+ "eval_accuracy": 0.8894595801697186,
1850
+ "eval_loss": 0.26261791586875916,
1851
+ "eval_runtime": 114.2956,
1852
+ "eval_samples_per_second": 39.179,
1853
+ "eval_steps_per_second": 2.45,
1854
+ "step": 2520
1855
+ },
1856
+ {
1857
+ "epoch": 9.035714285714286,
1858
+ "grad_norm": 3.011329174041748,
1859
+ "learning_rate": 5.357142857142857e-06,
1860
+ "loss": 0.1827,
1861
+ "step": 2530
1862
+ },
1863
+ {
1864
+ "epoch": 9.071428571428571,
1865
+ "grad_norm": 2.3060262203216553,
1866
+ "learning_rate": 5.158730158730159e-06,
1867
+ "loss": 0.1862,
1868
+ "step": 2540
1869
+ },
1870
+ {
1871
+ "epoch": 9.107142857142858,
1872
+ "grad_norm": 1.7220100164413452,
1873
+ "learning_rate": 4.96031746031746e-06,
1874
+ "loss": 0.2215,
1875
+ "step": 2550
1876
+ },
1877
+ {
1878
+ "epoch": 9.142857142857142,
1879
+ "grad_norm": 2.463092803955078,
1880
+ "learning_rate": 4.7619047619047615e-06,
1881
+ "loss": 0.2228,
1882
+ "step": 2560
1883
+ },
1884
+ {
1885
+ "epoch": 9.178571428571429,
1886
+ "grad_norm": 1.204136848449707,
1887
+ "learning_rate": 4.563492063492064e-06,
1888
+ "loss": 0.2439,
1889
+ "step": 2570
1890
+ },
1891
+ {
1892
+ "epoch": 9.214285714285714,
1893
+ "grad_norm": 2.263396978378296,
1894
+ "learning_rate": 4.365079365079365e-06,
1895
+ "loss": 0.2266,
1896
+ "step": 2580
1897
+ },
1898
+ {
1899
+ "epoch": 9.25,
1900
+ "grad_norm": 2.7832555770874023,
1901
+ "learning_rate": 4.166666666666667e-06,
1902
+ "loss": 0.2089,
1903
+ "step": 2590
1904
+ },
1905
+ {
1906
+ "epoch": 9.285714285714286,
1907
+ "grad_norm": 2.0564024448394775,
1908
+ "learning_rate": 3.968253968253968e-06,
1909
+ "loss": 0.1839,
1910
+ "step": 2600
1911
+ },
1912
+ {
1913
+ "epoch": 9.321428571428571,
1914
+ "grad_norm": 2.0316998958587646,
1915
+ "learning_rate": 3.7698412698412697e-06,
1916
+ "loss": 0.2424,
1917
+ "step": 2610
1918
+ },
1919
+ {
1920
+ "epoch": 9.357142857142858,
1921
+ "grad_norm": 2.229687213897705,
1922
+ "learning_rate": 3.5714285714285714e-06,
1923
+ "loss": 0.2489,
1924
+ "step": 2620
1925
+ },
1926
+ {
1927
+ "epoch": 9.392857142857142,
1928
+ "grad_norm": 1.7529199123382568,
1929
+ "learning_rate": 3.373015873015873e-06,
1930
+ "loss": 0.2273,
1931
+ "step": 2630
1932
+ },
1933
+ {
1934
+ "epoch": 9.428571428571429,
1935
+ "grad_norm": 1.5242239236831665,
1936
+ "learning_rate": 3.1746031746031746e-06,
1937
+ "loss": 0.2224,
1938
+ "step": 2640
1939
+ },
1940
+ {
1941
+ "epoch": 9.464285714285714,
1942
+ "grad_norm": 1.5499508380889893,
1943
+ "learning_rate": 2.9761904761904763e-06,
1944
+ "loss": 0.2338,
1945
+ "step": 2650
1946
+ },
1947
+ {
1948
+ "epoch": 9.5,
1949
+ "grad_norm": 2.5902230739593506,
1950
+ "learning_rate": 2.777777777777778e-06,
1951
+ "loss": 0.1909,
1952
+ "step": 2660
1953
+ },
1954
+ {
1955
+ "epoch": 9.535714285714286,
1956
+ "grad_norm": 1.3243242502212524,
1957
+ "learning_rate": 2.5793650793650795e-06,
1958
+ "loss": 0.2199,
1959
+ "step": 2670
1960
+ },
1961
+ {
1962
+ "epoch": 9.571428571428571,
1963
+ "grad_norm": 1.9745112657546997,
1964
+ "learning_rate": 2.3809523809523808e-06,
1965
+ "loss": 0.2172,
1966
+ "step": 2680
1967
+ },
1968
+ {
1969
+ "epoch": 9.607142857142858,
1970
+ "grad_norm": 2.10951828956604,
1971
+ "learning_rate": 2.1825396825396824e-06,
1972
+ "loss": 0.2331,
1973
+ "step": 2690
1974
+ },
1975
+ {
1976
+ "epoch": 9.642857142857142,
1977
+ "grad_norm": 2.113539457321167,
1978
+ "learning_rate": 1.984126984126984e-06,
1979
+ "loss": 0.2373,
1980
+ "step": 2700
1981
+ },
1982
+ {
1983
+ "epoch": 9.678571428571429,
1984
+ "grad_norm": 1.548854112625122,
1985
+ "learning_rate": 1.7857142857142857e-06,
1986
+ "loss": 0.2035,
1987
+ "step": 2710
1988
+ },
1989
+ {
1990
+ "epoch": 9.714285714285714,
1991
+ "grad_norm": 1.797196626663208,
1992
+ "learning_rate": 1.5873015873015873e-06,
1993
+ "loss": 0.2015,
1994
+ "step": 2720
1995
+ },
1996
+ {
1997
+ "epoch": 9.75,
1998
+ "grad_norm": 1.9279841184616089,
1999
+ "learning_rate": 1.388888888888889e-06,
2000
+ "loss": 0.2208,
2001
+ "step": 2730
2002
+ },
2003
+ {
2004
+ "epoch": 9.785714285714286,
2005
+ "grad_norm": 0.950290858745575,
2006
+ "learning_rate": 1.1904761904761904e-06,
2007
+ "loss": 0.2394,
2008
+ "step": 2740
2009
+ },
2010
+ {
2011
+ "epoch": 9.821428571428571,
2012
+ "grad_norm": 1.2573094367980957,
2013
+ "learning_rate": 9.92063492063492e-07,
2014
+ "loss": 0.2491,
2015
+ "step": 2750
2016
+ },
2017
+ {
2018
+ "epoch": 9.857142857142858,
2019
+ "grad_norm": 1.8162927627563477,
2020
+ "learning_rate": 7.936507936507937e-07,
2021
+ "loss": 0.2229,
2022
+ "step": 2760
2023
+ },
2024
+ {
2025
+ "epoch": 9.892857142857142,
2026
+ "grad_norm": 2.0845260620117188,
2027
+ "learning_rate": 5.952380952380952e-07,
2028
+ "loss": 0.1984,
2029
+ "step": 2770
2030
+ },
2031
+ {
2032
+ "epoch": 9.928571428571429,
2033
+ "grad_norm": 1.8501282930374146,
2034
+ "learning_rate": 3.9682539682539683e-07,
2035
+ "loss": 0.2287,
2036
+ "step": 2780
2037
+ },
2038
+ {
2039
+ "epoch": 9.964285714285714,
2040
+ "grad_norm": 2.941807746887207,
2041
+ "learning_rate": 1.9841269841269841e-07,
2042
+ "loss": 0.2233,
2043
+ "step": 2790
2044
+ },
2045
+ {
2046
+ "epoch": 10.0,
2047
+ "grad_norm": 1.2508630752563477,
2048
+ "learning_rate": 0.0,
2049
+ "loss": 0.1886,
2050
+ "step": 2800
2051
+ },
2052
+ {
2053
+ "epoch": 10.0,
2054
+ "eval_accuracy": 0.8865564984368022,
2055
+ "eval_loss": 0.2717145085334778,
2056
+ "eval_runtime": 113.959,
2057
+ "eval_samples_per_second": 39.295,
2058
+ "eval_steps_per_second": 2.457,
2059
+ "step": 2800
2060
  },
2061
  {
2062
+ "epoch": 10.0,
2063
+ "step": 2800,
2064
+ "total_flos": 1.3877265500181135e+19,
2065
+ "train_loss": 0.3087472263830049,
2066
+ "train_runtime": 6534.1253,
2067
+ "train_samples_per_second": 27.407,
2068
+ "train_steps_per_second": 0.429
2069
  }
2070
  ],
2071
  "logging_steps": 10,
2072
+ "max_steps": 2800,
2073
  "num_input_tokens_seen": 0,
2074
  "num_train_epochs": 10,
2075
  "save_steps": 500,
 
2085
  "attributes": {}
2086
  }
2087
  },
2088
+ "total_flos": 1.3877265500181135e+19,
2089
  "train_batch_size": 16,
2090
  "trial_name": null,
2091
  "trial_params": null