c10 commited on
Commit
7b9714a
1 Parent(s): d316588

Model save

Browse files
Files changed (5) hide show
  1. README.md +2 -2
  2. all_results.json +8 -8
  3. eval_results.json +4 -4
  4. train_results.json +4 -4
  5. trainer_state.json +468 -468
README.md CHANGED
@@ -18,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [yanolja/EEVE-Korean-Instruct-2.8B-v1.0](https://huggingface.co/yanolja/EEVE-Korean-Instruct-2.8B-v1.0) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.3932
22
 
23
  ## Model description
24
 
@@ -54,7 +54,7 @@ The following hyperparameters were used during training:
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 0.3021 | 1.0 | 233 | 0.3932 |
58
 
59
 
60
  ### Framework versions
 
18
 
19
  This model is a fine-tuned version of [yanolja/EEVE-Korean-Instruct-2.8B-v1.0](https://huggingface.co/yanolja/EEVE-Korean-Instruct-2.8B-v1.0) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.3962
22
 
23
  ## Model description
24
 
 
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
+ | 0.3034 | 1.0 | 233 | 0.3962 |
58
 
59
 
60
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 0.3927557170391083,
4
- "eval_runtime": 1.5271,
5
  "eval_samples": 230,
6
- "eval_samples_per_second": 150.611,
7
- "eval_steps_per_second": 6.548,
8
  "total_flos": 1.933383782157517e+16,
9
- "train_loss": 0.43314852683840904,
10
- "train_runtime": 303.7389,
11
  "train_samples": 11140,
12
- "train_samples_per_second": 36.676,
13
- "train_steps_per_second": 0.767
14
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 0.3932158648967743,
4
+ "eval_runtime": 1.5613,
5
  "eval_samples": 230,
6
+ "eval_samples_per_second": 147.316,
7
+ "eval_steps_per_second": 6.405,
8
  "total_flos": 1.933383782157517e+16,
9
+ "train_loss": 0.4341796827162796,
10
+ "train_runtime": 302.6302,
11
  "train_samples": 11140,
12
+ "train_samples_per_second": 36.811,
13
+ "train_steps_per_second": 0.77
14
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 0.3927557170391083,
4
- "eval_runtime": 1.5271,
5
  "eval_samples": 230,
6
- "eval_samples_per_second": 150.611,
7
- "eval_steps_per_second": 6.548
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 0.3932158648967743,
4
+ "eval_runtime": 1.5613,
5
  "eval_samples": 230,
6
+ "eval_samples_per_second": 147.316,
7
+ "eval_steps_per_second": 6.405
8
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 1.933383782157517e+16,
4
- "train_loss": 0.43314852683840904,
5
- "train_runtime": 303.7389,
6
  "train_samples": 11140,
7
- "train_samples_per_second": 36.676,
8
- "train_steps_per_second": 0.767
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 1.933383782157517e+16,
4
+ "train_loss": 0.4341796827162796,
5
+ "train_runtime": 302.6302,
6
  "train_samples": 11140,
7
+ "train_samples_per_second": 36.811,
8
+ "train_steps_per_second": 0.77
9
  }
trainer_state.json CHANGED
@@ -10,1651 +10,1651 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.004291845493562232,
13
- "grad_norm": 6.858954429626465,
14
  "learning_rate": 4.1666666666666667e-07,
15
  "loss": 2.3401,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.008583690987124463,
20
- "grad_norm": 7.1901326179504395,
21
  "learning_rate": 8.333333333333333e-07,
22
  "loss": 2.3774,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.012875536480686695,
27
- "grad_norm": 6.998518466949463,
28
  "learning_rate": 1.25e-06,
29
- "loss": 2.3589,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.017167381974248927,
34
- "grad_norm": 6.999710559844971,
35
  "learning_rate": 1.6666666666666667e-06,
36
- "loss": 2.3641,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.02145922746781116,
41
- "grad_norm": 6.936703681945801,
42
  "learning_rate": 2.0833333333333334e-06,
43
- "loss": 2.3491,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.02575107296137339,
48
- "grad_norm": 6.817142963409424,
49
  "learning_rate": 2.5e-06,
50
- "loss": 2.28,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.030042918454935622,
55
- "grad_norm": 6.60942268371582,
56
  "learning_rate": 2.916666666666667e-06,
57
- "loss": 2.2736,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.034334763948497854,
62
- "grad_norm": 5.893615245819092,
63
  "learning_rate": 3.3333333333333333e-06,
64
- "loss": 2.1437,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.03862660944206009,
69
- "grad_norm": 5.829619407653809,
70
  "learning_rate": 3.7500000000000005e-06,
71
- "loss": 2.1175,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.04291845493562232,
76
- "grad_norm": 5.597039699554443,
77
  "learning_rate": 4.166666666666667e-06,
78
- "loss": 2.0907,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.04721030042918455,
83
- "grad_norm": 4.393044948577881,
84
  "learning_rate": 4.583333333333333e-06,
85
- "loss": 1.7745,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.05150214592274678,
90
- "grad_norm": 4.335526466369629,
91
  "learning_rate": 5e-06,
92
- "loss": 1.6767,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.055793991416309016,
97
- "grad_norm": 4.237323760986328,
98
  "learning_rate": 5.416666666666667e-06,
99
- "loss": 1.6224,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.060085836909871244,
104
- "grad_norm": 3.4697413444519043,
105
  "learning_rate": 5.833333333333334e-06,
106
- "loss": 1.1534,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.06437768240343347,
111
- "grad_norm": 3.5880987644195557,
112
  "learning_rate": 6.25e-06,
113
- "loss": 1.0938,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.06866952789699571,
118
- "grad_norm": 3.3989222049713135,
119
  "learning_rate": 6.666666666666667e-06,
120
- "loss": 1.0218,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.07296137339055794,
125
- "grad_norm": 2.876070976257324,
126
  "learning_rate": 7.083333333333335e-06,
127
- "loss": 0.9028,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.07725321888412018,
132
- "grad_norm": 2.5071043968200684,
133
  "learning_rate": 7.500000000000001e-06,
134
- "loss": 0.8486,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.0815450643776824,
139
- "grad_norm": 2.0654499530792236,
140
  "learning_rate": 7.916666666666667e-06,
141
- "loss": 0.5864,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.08583690987124463,
146
- "grad_norm": 1.3941829204559326,
147
  "learning_rate": 8.333333333333334e-06,
148
- "loss": 0.4791,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.09012875536480687,
153
- "grad_norm": 1.0642451047897339,
154
  "learning_rate": 8.750000000000001e-06,
155
- "loss": 0.4628,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.0944206008583691,
160
- "grad_norm": 0.8186908960342407,
161
  "learning_rate": 9.166666666666666e-06,
162
- "loss": 0.423,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.09871244635193133,
167
- "grad_norm": 0.714146614074707,
168
  "learning_rate": 9.583333333333335e-06,
169
- "loss": 0.4155,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.10300429184549356,
174
- "grad_norm": 0.6719127297401428,
175
  "learning_rate": 1e-05,
176
- "loss": 0.4172,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.1072961373390558,
181
- "grad_norm": 0.6532909870147705,
182
  "learning_rate": 9.999435142363484e-06,
183
- "loss": 0.3825,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.11158798283261803,
188
- "grad_norm": 0.7833576202392578,
189
  "learning_rate": 9.997740697079595e-06,
190
- "loss": 0.4211,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.11587982832618025,
195
- "grad_norm": 0.6481274366378784,
196
  "learning_rate": 9.994917046996472e-06,
197
- "loss": 0.3679,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.12017167381974249,
202
- "grad_norm": 0.6363394856452942,
203
  "learning_rate": 9.990964830098246e-06,
204
- "loss": 0.3789,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.12446351931330472,
209
- "grad_norm": 0.6472516655921936,
210
  "learning_rate": 9.985884939360873e-06,
211
- "loss": 0.4076,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.12875536480686695,
216
- "grad_norm": 0.5628584623336792,
217
  "learning_rate": 9.979678522550382e-06,
218
- "loss": 0.3832,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.13304721030042918,
223
- "grad_norm": 0.5358518362045288,
224
  "learning_rate": 9.972346981963546e-06,
225
- "loss": 0.3658,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.13733905579399142,
230
- "grad_norm": 0.5220690965652466,
231
  "learning_rate": 9.963891974111042e-06,
232
- "loss": 0.3743,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.14163090128755365,
237
- "grad_norm": 0.544533908367157,
238
  "learning_rate": 9.95431540934317e-06,
239
- "loss": 0.3732,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1459227467811159,
244
- "grad_norm": 0.5741094946861267,
245
  "learning_rate": 9.943619451418225e-06,
246
- "loss": 0.3981,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.15021459227467812,
251
- "grad_norm": 0.5270912647247314,
252
  "learning_rate": 9.931806517013612e-06,
253
- "loss": 0.3583,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.15450643776824036,
258
- "grad_norm": 0.49920132756233215,
259
  "learning_rate": 9.918879275179819e-06,
260
- "loss": 0.3414,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.15879828326180256,
265
- "grad_norm": 0.5417159199714661,
266
  "learning_rate": 9.904840646737346e-06,
267
  "loss": 0.3734,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.1630901287553648,
272
- "grad_norm": 0.4884810149669647,
273
  "learning_rate": 9.889693803616793e-06,
274
- "loss": 0.3682,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.16738197424892703,
279
- "grad_norm": 0.48894229531288147,
280
  "learning_rate": 9.873442168142158e-06,
281
- "loss": 0.3607,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.17167381974248927,
286
- "grad_norm": 0.4725055396556854,
287
  "learning_rate": 9.856089412257605e-06,
288
- "loss": 0.3348,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.1759656652360515,
293
- "grad_norm": 0.499160498380661,
294
  "learning_rate": 9.837639456697802e-06,
295
- "loss": 0.382,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.18025751072961374,
300
- "grad_norm": 0.4489183723926544,
301
  "learning_rate": 9.818096470102067e-06,
302
- "loss": 0.3341,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.18454935622317598,
307
- "grad_norm": 0.4789492189884186,
308
  "learning_rate": 9.797464868072489e-06,
309
- "loss": 0.3521,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.1888412017167382,
314
- "grad_norm": 0.47436925768852234,
315
  "learning_rate": 9.775749312176249e-06,
316
- "loss": 0.3302,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.19313304721030042,
321
- "grad_norm": 0.4419718086719513,
322
  "learning_rate": 9.752954708892379e-06,
323
- "loss": 0.3392,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.19742489270386265,
328
- "grad_norm": 0.4800013601779938,
329
  "learning_rate": 9.729086208503174e-06,
330
  "loss": 0.3416,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.2017167381974249,
335
- "grad_norm": 0.48074233531951904,
336
  "learning_rate": 9.704149203930522e-06,
337
- "loss": 0.3632,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.20600858369098712,
342
- "grad_norm": 0.48519065976142883,
343
  "learning_rate": 9.67814932951741e-06,
344
- "loss": 0.3706,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.21030042918454936,
349
- "grad_norm": 0.46741950511932373,
350
  "learning_rate": 9.651092459754879e-06,
351
- "loss": 0.3728,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.2145922746781116,
356
- "grad_norm": 0.46153953671455383,
357
  "learning_rate": 9.622984707954732e-06,
358
- "loss": 0.348,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.21888412017167383,
363
- "grad_norm": 0.42454302310943604,
364
  "learning_rate": 9.593832424868271e-06,
365
- "loss": 0.3319,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.22317596566523606,
370
- "grad_norm": 0.47633853554725647,
371
  "learning_rate": 9.563642197251382e-06,
372
- "loss": 0.3699,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.22746781115879827,
377
- "grad_norm": 0.46166110038757324,
378
  "learning_rate": 9.532420846376316e-06,
379
- "loss": 0.3278,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.2317596566523605,
384
- "grad_norm": 0.4451698362827301,
385
  "learning_rate": 9.500175426490455e-06,
386
- "loss": 0.3165,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.23605150214592274,
391
- "grad_norm": 0.4674447774887085,
392
  "learning_rate": 9.466913223222467e-06,
393
- "loss": 0.3487,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.24034334763948498,
398
- "grad_norm": 0.44866660237312317,
399
  "learning_rate": 9.432641751936162e-06,
400
  "loss": 0.3286,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.2446351931330472,
405
- "grad_norm": 0.4655809700489044,
406
  "learning_rate": 9.397368756032445e-06,
407
- "loss": 0.3435,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.24892703862660945,
412
- "grad_norm": 0.4543526768684387,
413
  "learning_rate": 9.361102205199762e-06,
414
- "loss": 0.3448,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.2532188841201717,
419
- "grad_norm": 0.46444252133369446,
420
  "learning_rate": 9.32385029361338e-06,
421
- "loss": 0.3244,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.2575107296137339,
426
- "grad_norm": 0.44596582651138306,
427
  "learning_rate": 9.285621438083997e-06,
428
- "loss": 0.3183,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.26180257510729615,
433
- "grad_norm": 0.4672187864780426,
434
  "learning_rate": 9.246424276156008e-06,
435
- "loss": 0.3288,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.26609442060085836,
440
- "grad_norm": 0.451576292514801,
441
  "learning_rate": 9.206267664155906e-06,
442
- "loss": 0.3268,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.2703862660944206,
447
- "grad_norm": 0.4450378119945526,
448
  "learning_rate": 9.165160675191272e-06,
449
- "loss": 0.3318,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.27467811158798283,
454
- "grad_norm": 0.4558464586734772,
455
  "learning_rate": 9.123112597100759e-06,
456
- "loss": 0.3366,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.27896995708154504,
461
- "grad_norm": 0.5316265821456909,
462
  "learning_rate": 9.080132930355567e-06,
463
- "loss": 0.3323,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.2832618025751073,
468
- "grad_norm": 0.42524275183677673,
469
  "learning_rate": 9.03623138591289e-06,
470
- "loss": 0.3076,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.2875536480686695,
475
- "grad_norm": 0.4055148661136627,
476
  "learning_rate": 8.99141788302178e-06,
477
- "loss": 0.3071,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.2918454935622318,
482
- "grad_norm": 0.4067463278770447,
483
  "learning_rate": 8.94570254698197e-06,
484
- "loss": 0.2992,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.296137339055794,
489
- "grad_norm": 0.4295370280742645,
490
  "learning_rate": 8.899095706856122e-06,
491
- "loss": 0.321,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.30042918454935624,
496
- "grad_norm": 0.44975805282592773,
497
  "learning_rate": 8.851607893136065e-06,
498
- "loss": 0.3521,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.30472103004291845,
503
- "grad_norm": 0.47437217831611633,
504
  "learning_rate": 8.803249835363486e-06,
505
- "loss": 0.2967,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.3090128755364807,
510
- "grad_norm": 0.4566970765590668,
511
  "learning_rate": 8.754032459705672e-06,
512
- "loss": 0.3449,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.3133047210300429,
517
- "grad_norm": 0.4559485614299774,
518
  "learning_rate": 8.703966886486819e-06,
519
- "loss": 0.3286,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.31759656652360513,
524
- "grad_norm": 0.42962053418159485,
525
  "learning_rate": 8.65306442767547e-06,
526
- "loss": 0.3195,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.3218884120171674,
531
- "grad_norm": 0.4547812342643738,
532
  "learning_rate": 8.601336584328659e-06,
533
- "loss": 0.3347,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3261802575107296,
538
- "grad_norm": 0.5012584328651428,
539
  "learning_rate": 8.548795043993316e-06,
540
- "loss": 0.2947,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.33047210300429186,
545
- "grad_norm": 0.4377043545246124,
546
  "learning_rate": 8.495451678065563e-06,
547
- "loss": 0.3323,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.33476394849785407,
552
- "grad_norm": 0.45930778980255127,
553
  "learning_rate": 8.441318539108433e-06,
554
- "loss": 0.3447,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.33905579399141633,
559
- "grad_norm": 0.43653759360313416,
560
  "learning_rate": 8.386407858128707e-06,
561
- "loss": 0.3362,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.34334763948497854,
566
- "grad_norm": 0.4360993206501007,
567
  "learning_rate": 8.330732041813367e-06,
568
- "loss": 0.313,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.34763948497854075,
573
- "grad_norm": 0.45763546228408813,
574
  "learning_rate": 8.274303669726427e-06,
575
- "loss": 0.3427,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.351931330472103,
580
- "grad_norm": 0.45791178941726685,
581
  "learning_rate": 8.217135491466636e-06,
582
- "loss": 0.3459,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.3562231759656652,
587
- "grad_norm": 0.4212036430835724,
588
  "learning_rate": 8.15924042378682e-06,
589
- "loss": 0.3039,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.3605150214592275,
594
- "grad_norm": 0.4088297486305237,
595
  "learning_rate": 8.100631547675417e-06,
596
- "loss": 0.3036,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.3648068669527897,
601
- "grad_norm": 0.44220373034477234,
602
  "learning_rate": 8.041322105400923e-06,
603
- "loss": 0.3334,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.36909871244635195,
608
- "grad_norm": 0.4390133023262024,
609
  "learning_rate": 7.981325497519892e-06,
610
- "loss": 0.3183,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.37339055793991416,
615
- "grad_norm": 0.46158358454704285,
616
  "learning_rate": 7.920655279849173e-06,
617
- "loss": 0.3211,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.3776824034334764,
622
- "grad_norm": 0.444137305021286,
623
  "learning_rate": 7.859325160403073e-06,
624
- "loss": 0.3392,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.38197424892703863,
629
- "grad_norm": 0.4566199779510498,
630
  "learning_rate": 7.797348996296116e-06,
631
- "loss": 0.3201,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.38626609442060084,
636
- "grad_norm": 0.45487260818481445,
637
  "learning_rate": 7.734740790612137e-06,
638
- "loss": 0.3399,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.3905579399141631,
643
- "grad_norm": 0.4396146833896637,
644
  "learning_rate": 7.671514689240366e-06,
645
  "loss": 0.3076,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.3948497854077253,
650
- "grad_norm": 0.45286861062049866,
651
  "learning_rate": 7.607684977679284e-06,
652
- "loss": 0.299,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.39914163090128757,
657
- "grad_norm": 0.4087929129600525,
658
  "learning_rate": 7.543266077808893e-06,
659
- "loss": 0.2845,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.4034334763948498,
664
- "grad_norm": 0.4473104774951935,
665
  "learning_rate": 7.478272544632204e-06,
666
- "loss": 0.3198,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.40772532188841204,
671
- "grad_norm": 0.4244577884674072,
672
  "learning_rate": 7.412719062986632e-06,
673
- "loss": 0.3367,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.41201716738197425,
678
- "grad_norm": 0.4132303297519684,
679
  "learning_rate": 7.3466204442260605e-06,
680
- "loss": 0.276,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.41630901287553645,
685
- "grad_norm": 0.4498296082019806,
686
  "learning_rate": 7.279991622874319e-06,
687
- "loss": 0.3412,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.4206008583690987,
692
- "grad_norm": 0.45308294892311096,
693
  "learning_rate": 7.212847653250828e-06,
694
- "loss": 0.3001,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.4248927038626609,
699
- "grad_norm": 0.4460461437702179,
700
  "learning_rate": 7.145203706069183e-06,
701
- "loss": 0.2855,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.4291845493562232,
706
- "grad_norm": 0.43338119983673096,
707
  "learning_rate": 7.0770750650094335e-06,
708
- "loss": 0.3366,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.4334763948497854,
713
- "grad_norm": 0.4553949534893036,
714
  "learning_rate": 7.008477123264849e-06,
715
- "loss": 0.3444,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.43776824034334766,
720
- "grad_norm": 0.42553162574768066,
721
  "learning_rate": 6.939425380063924e-06,
722
- "loss": 0.3146,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.44206008583690987,
727
- "grad_norm": 0.42176350951194763,
728
  "learning_rate": 6.869935437168449e-06,
729
- "loss": 0.3221,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.44635193133047213,
734
- "grad_norm": 0.44573134183883667,
735
  "learning_rate": 6.800022995348381e-06,
736
- "loss": 0.3182,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.45064377682403434,
741
- "grad_norm": 0.43147024512290955,
742
  "learning_rate": 6.729703850834381e-06,
743
- "loss": 0.3246,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.45493562231759654,
748
- "grad_norm": 0.402482807636261,
749
  "learning_rate": 6.65899389174876e-06,
750
- "loss": 0.2823,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.4592274678111588,
755
- "grad_norm": 0.4000365138053894,
756
  "learning_rate": 6.587909094515663e-06,
757
- "loss": 0.2955,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.463519313304721,
762
- "grad_norm": 0.3905521333217621,
763
  "learning_rate": 6.5164655202513135e-06,
764
- "loss": 0.2793,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.4678111587982833,
769
- "grad_norm": 0.4438071548938751,
770
  "learning_rate": 6.444679311135112e-06,
771
- "loss": 0.34,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.4721030042918455,
776
- "grad_norm": 0.45224955677986145,
777
  "learning_rate": 6.372566686762427e-06,
778
- "loss": 0.3406,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.47639484978540775,
783
- "grad_norm": 0.45422661304473877,
784
  "learning_rate": 6.300143940479881e-06,
785
- "loss": 0.3243,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.48068669527896996,
790
- "grad_norm": 0.526847779750824,
791
  "learning_rate": 6.227427435703997e-06,
792
- "loss": 0.2892,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.48497854077253216,
797
- "grad_norm": 0.4258354604244232,
798
  "learning_rate": 6.154433602223979e-06,
799
- "loss": 0.269,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.4892703862660944,
804
- "grad_norm": 0.40687990188598633,
805
  "learning_rate": 6.0811789324895365e-06,
806
- "loss": 0.3001,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.49356223175965663,
811
- "grad_norm": 0.43260207772254944,
812
  "learning_rate": 6.0076799778845105e-06,
813
- "loss": 0.306,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.4978540772532189,
818
- "grad_norm": 0.43823885917663574,
819
  "learning_rate": 5.933953344987215e-06,
820
- "loss": 0.3119,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.5021459227467812,
825
- "grad_norm": 0.4540681540966034,
826
  "learning_rate": 5.860015691818292e-06,
827
- "loss": 0.3307,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.5064377682403434,
832
- "grad_norm": 0.4211110770702362,
833
  "learning_rate": 5.78588372407695e-06,
834
- "loss": 0.2904,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.5107296137339056,
839
- "grad_norm": 0.42265021800994873,
840
  "learning_rate": 5.711574191366427e-06,
841
- "loss": 0.3069,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.5150214592274678,
846
- "grad_norm": 0.42844855785369873,
847
  "learning_rate": 5.637103883409525e-06,
848
- "loss": 0.3033,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.51931330472103,
853
- "grad_norm": 0.4592513144016266,
854
  "learning_rate": 5.562489626255104e-06,
855
- "loss": 0.3125,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.5236051502145923,
860
- "grad_norm": 0.4309280216693878,
861
  "learning_rate": 5.487748278476342e-06,
862
- "loss": 0.3124,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.5278969957081545,
867
- "grad_norm": 0.3942621946334839,
868
  "learning_rate": 5.412896727361663e-06,
869
- "loss": 0.2525,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.5321888412017167,
874
- "grad_norm": 0.40998613834381104,
875
  "learning_rate": 5.337951885099167e-06,
876
- "loss": 0.2895,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.5364806866952789,
881
- "grad_norm": 0.4335436522960663,
882
  "learning_rate": 5.262930684955439e-06,
883
- "loss": 0.2924,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.5407725321888412,
888
- "grad_norm": 0.4402632415294647,
889
  "learning_rate": 5.187850077449604e-06,
890
- "loss": 0.317,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.5450643776824035,
895
- "grad_norm": 0.39414867758750916,
896
  "learning_rate": 5.112727026523461e-06,
897
- "loss": 0.2753,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.5493562231759657,
902
- "grad_norm": 0.4759502410888672,
903
  "learning_rate": 5.03757850570861e-06,
904
- "loss": 0.3066,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.5536480686695279,
909
- "grad_norm": 0.41338634490966797,
910
  "learning_rate": 4.9624214942913916e-06,
911
- "loss": 0.3013,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.5579399141630901,
916
- "grad_norm": 0.4515892267227173,
917
  "learning_rate": 4.88727297347654e-06,
918
- "loss": 0.306,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.5622317596566524,
923
- "grad_norm": 0.3964577317237854,
924
  "learning_rate": 4.8121499225503974e-06,
925
- "loss": 0.2874,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.5665236051502146,
930
- "grad_norm": 0.41198039054870605,
931
  "learning_rate": 4.737069315044562e-06,
932
- "loss": 0.2947,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.5708154506437768,
937
- "grad_norm": 0.4514983892440796,
938
  "learning_rate": 4.662048114900837e-06,
939
- "loss": 0.3235,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.575107296137339,
944
- "grad_norm": 0.4610046446323395,
945
  "learning_rate": 4.587103272638339e-06,
946
- "loss": 0.3268,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.5793991416309013,
951
- "grad_norm": 0.4211605489253998,
952
  "learning_rate": 4.512251721523659e-06,
953
- "loss": 0.2671,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.5836909871244635,
958
- "grad_norm": 0.4412516951560974,
959
  "learning_rate": 4.437510373744897e-06,
960
- "loss": 0.3072,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.5879828326180258,
965
- "grad_norm": 0.4627492427825928,
966
  "learning_rate": 4.362896116590475e-06,
967
- "loss": 0.3031,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.592274678111588,
972
- "grad_norm": 0.4873397648334503,
973
  "learning_rate": 4.2884258086335755e-06,
974
- "loss": 0.3223,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.5965665236051502,
979
- "grad_norm": 0.41365429759025574,
980
  "learning_rate": 4.214116275923051e-06,
981
- "loss": 0.2736,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.6008583690987125,
986
- "grad_norm": 0.45329636335372925,
987
  "learning_rate": 4.1399843081817085e-06,
988
- "loss": 0.3119,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.6051502145922747,
993
- "grad_norm": 0.41913923621177673,
994
  "learning_rate": 4.066046655012786e-06,
995
- "loss": 0.3037,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.6094420600858369,
1000
- "grad_norm": 0.43529585003852844,
1001
  "learning_rate": 3.992320022115492e-06,
1002
- "loss": 0.3148,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.6137339055793991,
1007
- "grad_norm": 0.4258601665496826,
1008
  "learning_rate": 3.918821067510464e-06,
1009
- "loss": 0.2917,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.6180257510729614,
1014
- "grad_norm": 0.46047812700271606,
1015
  "learning_rate": 3.845566397776022e-06,
1016
- "loss": 0.3298,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.6223175965665236,
1021
- "grad_norm": 0.42097219824790955,
1022
  "learning_rate": 3.7725725642960047e-06,
1023
- "loss": 0.3056,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.6266094420600858,
1028
- "grad_norm": 0.43911123275756836,
1029
  "learning_rate": 3.6998560595201188e-06,
1030
- "loss": 0.2912,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.630901287553648,
1035
- "grad_norm": 0.42664578557014465,
1036
  "learning_rate": 3.627433313237576e-06,
1037
- "loss": 0.2955,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.6351931330472103,
1042
- "grad_norm": 0.42000484466552734,
1043
  "learning_rate": 3.555320688864889e-06,
1044
- "loss": 0.2903,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.6394849785407726,
1049
- "grad_norm": 0.4376209080219269,
1050
  "learning_rate": 3.483534479748688e-06,
1051
- "loss": 0.291,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.6437768240343348,
1056
- "grad_norm": 0.4324072301387787,
1057
  "learning_rate": 3.4120909054843375e-06,
1058
- "loss": 0.2824,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.648068669527897,
1063
- "grad_norm": 0.38668328523635864,
1064
  "learning_rate": 3.3410061082512422e-06,
1065
- "loss": 0.2586,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.6523605150214592,
1070
- "grad_norm": 0.40362098813056946,
1071
  "learning_rate": 3.2702961491656197e-06,
1072
- "loss": 0.2853,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.6566523605150214,
1077
- "grad_norm": 0.3973497152328491,
1078
  "learning_rate": 3.1999770046516198e-06,
1079
- "loss": 0.2826,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.6609442060085837,
1084
- "grad_norm": 0.4138431251049042,
1085
  "learning_rate": 3.130064562831553e-06,
1086
- "loss": 0.2757,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.6652360515021459,
1091
- "grad_norm": 0.4256836771965027,
1092
  "learning_rate": 3.0605746199360755e-06,
1093
- "loss": 0.2978,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.6695278969957081,
1098
- "grad_norm": 0.41062039136886597,
1099
  "learning_rate": 2.991522876735154e-06,
1100
- "loss": 0.2859,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.6738197424892703,
1105
- "grad_norm": 0.44024190306663513,
1106
  "learning_rate": 2.9229249349905686e-06,
1107
- "loss": 0.3014,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.6781115879828327,
1112
- "grad_norm": 0.44576936960220337,
1113
  "learning_rate": 2.8547962939308187e-06,
1114
- "loss": 0.2828,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.6824034334763949,
1119
- "grad_norm": 0.4253944456577301,
1120
  "learning_rate": 2.787152346749173e-06,
1121
- "loss": 0.3,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.6866952789699571,
1126
- "grad_norm": 0.40661880373954773,
1127
  "learning_rate": 2.720008377125682e-06,
1128
- "loss": 0.2811,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.6909871244635193,
1133
- "grad_norm": 0.45691847801208496,
1134
  "learning_rate": 2.6533795557739407e-06,
1135
- "loss": 0.3238,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.6952789699570815,
1140
- "grad_norm": 0.42418181896209717,
1141
  "learning_rate": 2.5872809370133704e-06,
1142
- "loss": 0.2935,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.6995708154506438,
1147
- "grad_norm": 0.42245230078697205,
1148
  "learning_rate": 2.5217274553677975e-06,
1149
- "loss": 0.2985,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.703862660944206,
1154
- "grad_norm": 0.41597869992256165,
1155
  "learning_rate": 2.4567339221911086e-06,
1156
- "loss": 0.2923,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.7081545064377682,
1161
- "grad_norm": 0.44589149951934814,
1162
  "learning_rate": 2.3923150223207176e-06,
1163
- "loss": 0.3106,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.7124463519313304,
1168
- "grad_norm": 0.40725013613700867,
1169
  "learning_rate": 2.328485310759635e-06,
1170
- "loss": 0.2861,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.7167381974248928,
1175
- "grad_norm": 0.44540056586265564,
1176
  "learning_rate": 2.265259209387867e-06,
1177
- "loss": 0.3039,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.721030042918455,
1182
- "grad_norm": 0.4068719446659088,
1183
  "learning_rate": 2.202651003703885e-06,
1184
- "loss": 0.2802,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.7253218884120172,
1189
- "grad_norm": 0.39111173152923584,
1190
  "learning_rate": 2.140674839596931e-06,
1191
- "loss": 0.2656,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.7296137339055794,
1196
- "grad_norm": 0.4605322480201721,
1197
  "learning_rate": 2.0793447201508288e-06,
1198
- "loss": 0.3151,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.7339055793991416,
1203
- "grad_norm": 0.4269479513168335,
1204
  "learning_rate": 2.01867450248011e-06,
1205
- "loss": 0.3022,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.7381974248927039,
1210
- "grad_norm": 0.3947524130344391,
1211
  "learning_rate": 1.9586778945990785e-06,
1212
- "loss": 0.2815,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.7424892703862661,
1217
- "grad_norm": 0.43399283289909363,
1218
  "learning_rate": 1.8993684523245842e-06,
1219
- "loss": 0.2899,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.7467811158798283,
1224
- "grad_norm": 0.4567514955997467,
1225
  "learning_rate": 1.8407595762131814e-06,
1226
- "loss": 0.3411,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.7510729613733905,
1231
- "grad_norm": 0.4480878710746765,
1232
  "learning_rate": 1.7828645085333645e-06,
1233
- "loss": 0.3458,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.7553648068669528,
1238
- "grad_norm": 0.44001972675323486,
1239
  "learning_rate": 1.7256963302735752e-06,
1240
- "loss": 0.3171,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.759656652360515,
1245
- "grad_norm": 0.42058637738227844,
1246
  "learning_rate": 1.6692679581866334e-06,
1247
- "loss": 0.3039,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.7639484978540773,
1252
- "grad_norm": 0.42307794094085693,
1253
  "learning_rate": 1.6135921418712959e-06,
1254
- "loss": 0.2734,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.7682403433476395,
1259
- "grad_norm": 0.42678767442703247,
1260
  "learning_rate": 1.5586814608915673e-06,
1261
- "loss": 0.3135,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.7725321888412017,
1266
- "grad_norm": 0.4102153480052948,
1267
  "learning_rate": 1.5045483219344387e-06,
1268
- "loss": 0.2792,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.776824034334764,
1273
- "grad_norm": 0.40929415822029114,
1274
  "learning_rate": 1.4512049560066837e-06,
1275
- "loss": 0.2758,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.7811158798283262,
1280
- "grad_norm": 0.42089492082595825,
1281
  "learning_rate": 1.3986634156713418e-06,
1282
- "loss": 0.2835,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.7854077253218884,
1287
- "grad_norm": 0.420242041349411,
1288
  "learning_rate": 1.3469355723245303e-06,
1289
- "loss": 0.2855,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.7896995708154506,
1294
- "grad_norm": 0.45139551162719727,
1295
  "learning_rate": 1.2960331135131826e-06,
1296
- "loss": 0.3284,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.7939914163090128,
1301
- "grad_norm": 0.4033341109752655,
1302
  "learning_rate": 1.245967540294329e-06,
1303
- "loss": 0.2848,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.7982832618025751,
1308
- "grad_norm": 0.37155604362487793,
1309
  "learning_rate": 1.1967501646365147e-06,
1310
- "loss": 0.2757,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.8025751072961373,
1315
- "grad_norm": 0.4291873872280121,
1316
  "learning_rate": 1.1483921068639353e-06,
1317
- "loss": 0.2755,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.8068669527896996,
1322
- "grad_norm": 0.4211515784263611,
1323
  "learning_rate": 1.1009042931438784e-06,
1324
- "loss": 0.298,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.8111587982832618,
1329
- "grad_norm": 0.4309849441051483,
1330
  "learning_rate": 1.0542974530180327e-06,
1331
- "loss": 0.3067,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.8154506437768241,
1336
- "grad_norm": 0.3826170563697815,
1337
  "learning_rate": 1.00858211697822e-06,
1338
- "loss": 0.265,
1339
  "step": 190
1340
  },
1341
  {
1342
  "epoch": 0.8197424892703863,
1343
- "grad_norm": 0.40116748213768005,
1344
  "learning_rate": 9.637686140871121e-07,
1345
- "loss": 0.2901,
1346
  "step": 191
1347
  },
1348
  {
1349
  "epoch": 0.8240343347639485,
1350
- "grad_norm": 0.40444761514663696,
1351
  "learning_rate": 9.198670696444339e-07,
1352
- "loss": 0.2805,
1353
  "step": 192
1354
  },
1355
  {
1356
  "epoch": 0.8283261802575107,
1357
- "grad_norm": 0.521791934967041,
1358
  "learning_rate": 8.768874028992431e-07,
1359
- "loss": 0.3184,
1360
  "step": 193
1361
  },
1362
  {
1363
  "epoch": 0.8326180257510729,
1364
- "grad_norm": 0.4177843928337097,
1365
  "learning_rate": 8.348393248087289e-07,
1366
- "loss": 0.3183,
1367
  "step": 194
1368
  },
1369
  {
1370
  "epoch": 0.8369098712446352,
1371
- "grad_norm": 0.44546449184417725,
1372
  "learning_rate": 7.937323358440935e-07,
1373
- "loss": 0.3407,
1374
  "step": 195
1375
  },
1376
  {
1377
  "epoch": 0.8412017167381974,
1378
- "grad_norm": 0.43112218379974365,
1379
  "learning_rate": 7.535757238439939e-07,
1380
- "loss": 0.3112,
1381
  "step": 196
1382
  },
1383
  {
1384
  "epoch": 0.8454935622317596,
1385
- "grad_norm": 0.38515380024909973,
1386
  "learning_rate": 7.143785619160026e-07,
1387
- "loss": 0.2781,
1388
  "step": 197
1389
  },
1390
  {
1391
  "epoch": 0.8497854077253219,
1392
- "grad_norm": 0.40132516622543335,
1393
  "learning_rate": 6.761497063866207e-07,
1394
- "loss": 0.2836,
1395
  "step": 198
1396
  },
1397
  {
1398
  "epoch": 0.8540772532188842,
1399
- "grad_norm": 0.43590113520622253,
1400
  "learning_rate": 6.388977948002406e-07,
1401
- "loss": 0.3019,
1402
  "step": 199
1403
  },
1404
  {
1405
  "epoch": 0.8583690987124464,
1406
- "grad_norm": 0.39947134256362915,
1407
  "learning_rate": 6.026312439675553e-07,
1408
- "loss": 0.2741,
1409
  "step": 200
1410
  },
1411
  {
1412
  "epoch": 0.8626609442060086,
1413
- "grad_norm": 0.3957541882991791,
1414
  "learning_rate": 5.673582480638395e-07,
1415
- "loss": 0.3023,
1416
  "step": 201
1417
  },
1418
  {
1419
  "epoch": 0.8669527896995708,
1420
- "grad_norm": 0.44793784618377686,
1421
  "learning_rate": 5.330867767775333e-07,
1422
- "loss": 0.3392,
1423
  "step": 202
1424
  },
1425
  {
1426
  "epoch": 0.871244635193133,
1427
- "grad_norm": 0.40923067927360535,
1428
  "learning_rate": 4.998245735095459e-07,
1429
- "loss": 0.3041,
1430
  "step": 203
1431
  },
1432
  {
1433
  "epoch": 0.8755364806866953,
1434
- "grad_norm": 0.413678377866745,
1435
  "learning_rate": 4.6757915362368567e-07,
1436
- "loss": 0.2865,
1437
  "step": 204
1438
  },
1439
  {
1440
  "epoch": 0.8798283261802575,
1441
- "grad_norm": 0.4134410619735718,
1442
  "learning_rate": 4.363578027486187e-07,
1443
- "loss": 0.302,
1444
  "step": 205
1445
  },
1446
  {
1447
  "epoch": 0.8841201716738197,
1448
- "grad_norm": 0.41365522146224976,
1449
  "learning_rate": 4.0616757513173123e-07,
1450
- "loss": 0.2828,
1451
  "step": 206
1452
  },
1453
  {
1454
  "epoch": 0.8884120171673819,
1455
- "grad_norm": 0.4147285521030426,
1456
  "learning_rate": 3.7701529204526856e-07,
1457
- "loss": 0.2937,
1458
  "step": 207
1459
  },
1460
  {
1461
  "epoch": 0.8927038626609443,
1462
- "grad_norm": 0.41686710715293884,
1463
  "learning_rate": 3.4890754024512254e-07,
1464
- "loss": 0.3021,
1465
  "step": 208
1466
  },
1467
  {
1468
  "epoch": 0.8969957081545065,
1469
- "grad_norm": 0.4329932630062103,
1470
  "learning_rate": 3.2185067048259245e-07,
1471
- "loss": 0.2926,
1472
  "step": 209
1473
  },
1474
  {
1475
  "epoch": 0.9012875536480687,
1476
- "grad_norm": 0.3969036936759949,
1477
  "learning_rate": 2.9585079606947843e-07,
1478
- "loss": 0.2973,
1479
  "step": 210
1480
  },
1481
  {
1482
  "epoch": 0.9055793991416309,
1483
- "grad_norm": 0.42916810512542725,
1484
  "learning_rate": 2.7091379149682683e-07,
1485
- "loss": 0.2745,
1486
  "step": 211
1487
  },
1488
  {
1489
  "epoch": 0.9098712446351931,
1490
- "grad_norm": 0.3912080228328705,
1491
  "learning_rate": 2.470452911076227e-07,
1492
- "loss": 0.2763,
1493
  "step": 212
1494
  },
1495
  {
1496
  "epoch": 0.9141630901287554,
1497
- "grad_norm": 0.4045410454273224,
1498
  "learning_rate": 2.242506878237538e-07,
1499
- "loss": 0.2944,
1500
  "step": 213
1501
  },
1502
  {
1503
  "epoch": 0.9184549356223176,
1504
- "grad_norm": 0.3928059935569763,
1505
  "learning_rate": 2.0253513192751374e-07,
1506
- "loss": 0.2821,
1507
  "step": 214
1508
  },
1509
  {
1510
  "epoch": 0.9227467811158798,
1511
- "grad_norm": 0.42325636744499207,
1512
  "learning_rate": 1.8190352989793325e-07,
1513
- "loss": 0.2807,
1514
  "step": 215
1515
  },
1516
  {
1517
  "epoch": 0.927038626609442,
1518
- "grad_norm": 0.4236631691455841,
1519
  "learning_rate": 1.6236054330219853e-07,
1520
- "loss": 0.2996,
1521
  "step": 216
1522
  },
1523
  {
1524
  "epoch": 0.9313304721030042,
1525
- "grad_norm": 0.416293740272522,
1526
  "learning_rate": 1.439105877423963e-07,
1527
- "loss": 0.2958,
1528
  "step": 217
1529
  },
1530
  {
1531
  "epoch": 0.9356223175965666,
1532
- "grad_norm": 0.4194984436035156,
1533
  "learning_rate": 1.2655783185784253e-07,
1534
- "loss": 0.2971,
1535
  "step": 218
1536
  },
1537
  {
1538
  "epoch": 0.9399141630901288,
1539
- "grad_norm": 0.4220767617225647,
1540
  "learning_rate": 1.1030619638320805e-07,
1541
- "loss": 0.2947,
1542
  "step": 219
1543
  },
1544
  {
1545
  "epoch": 0.944206008583691,
1546
- "grad_norm": 0.463470995426178,
1547
  "learning_rate": 9.51593532626538e-08,
1548
- "loss": 0.2804,
1549
  "step": 220
1550
  },
1551
  {
1552
  "epoch": 0.9484978540772532,
1553
- "grad_norm": 0.44654470682144165,
1554
  "learning_rate": 8.11207248201834e-08,
1555
- "loss": 0.2907,
1556
  "step": 221
1557
  },
1558
  {
1559
  "epoch": 0.9527896995708155,
1560
- "grad_norm": 0.5606074333190918,
1561
  "learning_rate": 6.819348298638839e-08,
1562
- "loss": 0.3248,
1563
  "step": 222
1564
  },
1565
  {
1566
  "epoch": 0.9570815450643777,
1567
- "grad_norm": 0.41357025504112244,
1568
  "learning_rate": 5.638054858177644e-08,
1569
- "loss": 0.2782,
1570
  "step": 223
1571
  },
1572
  {
1573
  "epoch": 0.9613733905579399,
1574
- "grad_norm": 0.4360516369342804,
1575
  "learning_rate": 4.568459065683206e-08,
1576
- "loss": 0.2866,
1577
  "step": 224
1578
  },
1579
  {
1580
  "epoch": 0.9656652360515021,
1581
- "grad_norm": 0.43068474531173706,
1582
  "learning_rate": 3.610802588895845e-08,
1583
- "loss": 0.3177,
1584
  "step": 225
1585
  },
1586
  {
1587
  "epoch": 0.9699570815450643,
1588
- "grad_norm": 0.39279019832611084,
1589
  "learning_rate": 2.765301803645426e-08,
1590
- "loss": 0.2584,
1591
  "step": 226
1592
  },
1593
  {
1594
  "epoch": 0.9742489270386266,
1595
- "grad_norm": 0.42877382040023804,
1596
  "learning_rate": 2.0321477449619098e-08,
1597
- "loss": 0.3091,
1598
  "step": 227
1599
  },
1600
  {
1601
  "epoch": 0.9785407725321889,
1602
- "grad_norm": 0.4641684889793396,
1603
  "learning_rate": 1.411506063912882e-08,
1604
- "loss": 0.3223,
1605
  "step": 228
1606
  },
1607
  {
1608
  "epoch": 0.9828326180257511,
1609
- "grad_norm": 0.41311776638031006,
1610
  "learning_rate": 9.035169901754902e-09,
1611
- "loss": 0.2827,
1612
  "step": 229
1613
  },
1614
  {
1615
  "epoch": 0.9871244635193133,
1616
- "grad_norm": 0.40819963812828064,
1617
  "learning_rate": 5.082953003528457e-09,
1618
- "loss": 0.2759,
1619
  "step": 230
1620
  },
1621
  {
1622
  "epoch": 0.9914163090128756,
1623
- "grad_norm": 0.39194202423095703,
1624
  "learning_rate": 2.2593029204076578e-09,
1625
- "loss": 0.2787,
1626
  "step": 231
1627
  },
1628
  {
1629
  "epoch": 0.9957081545064378,
1630
- "grad_norm": 0.43726563453674316,
1631
  "learning_rate": 5.648576365169245e-10,
1632
- "loss": 0.3068,
1633
  "step": 232
1634
  },
1635
  {
1636
  "epoch": 1.0,
1637
- "grad_norm": 0.40751731395721436,
1638
  "learning_rate": 0.0,
1639
- "loss": 0.3021,
1640
  "step": 233
1641
  },
1642
  {
1643
  "epoch": 1.0,
1644
- "eval_loss": 0.3932158648967743,
1645
- "eval_runtime": 1.5342,
1646
- "eval_samples_per_second": 149.92,
1647
- "eval_steps_per_second": 6.518,
1648
  "step": 233
1649
  },
1650
  {
1651
  "epoch": 1.0,
1652
  "step": 233,
1653
  "total_flos": 1.933383782157517e+16,
1654
- "train_loss": 0.43314852683840904,
1655
- "train_runtime": 303.7389,
1656
- "train_samples_per_second": 36.676,
1657
- "train_steps_per_second": 0.767
1658
  }
1659
  ],
1660
  "logging_steps": 1,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.004291845493562232,
13
+ "grad_norm": 6.858922481536865,
14
  "learning_rate": 4.1666666666666667e-07,
15
  "loss": 2.3401,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.008583690987124463,
20
+ "grad_norm": 7.190420150756836,
21
  "learning_rate": 8.333333333333333e-07,
22
  "loss": 2.3774,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.012875536480686695,
27
+ "grad_norm": 7.038424015045166,
28
  "learning_rate": 1.25e-06,
29
+ "loss": 2.3585,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.017167381974248927,
34
+ "grad_norm": 6.984631538391113,
35
  "learning_rate": 1.6666666666666667e-06,
36
+ "loss": 2.3643,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.02145922746781116,
41
+ "grad_norm": 6.922569751739502,
42
  "learning_rate": 2.0833333333333334e-06,
43
+ "loss": 2.3493,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.02575107296137339,
48
+ "grad_norm": 6.812183380126953,
49
  "learning_rate": 2.5e-06,
50
+ "loss": 2.2802,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.030042918454935622,
55
+ "grad_norm": 6.6095428466796875,
56
  "learning_rate": 2.916666666666667e-06,
57
+ "loss": 2.2738,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.034334763948497854,
62
+ "grad_norm": 5.877363204956055,
63
  "learning_rate": 3.3333333333333333e-06,
64
+ "loss": 2.1442,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.03862660944206009,
69
+ "grad_norm": 5.8311381340026855,
70
  "learning_rate": 3.7500000000000005e-06,
71
+ "loss": 2.1174,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.04291845493562232,
76
+ "grad_norm": 5.568020820617676,
77
  "learning_rate": 4.166666666666667e-06,
78
+ "loss": 2.0909,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.04721030042918455,
83
+ "grad_norm": 4.405425071716309,
84
  "learning_rate": 4.583333333333333e-06,
85
+ "loss": 1.7761,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.05150214592274678,
90
+ "grad_norm": 4.338817119598389,
91
  "learning_rate": 5e-06,
92
+ "loss": 1.677,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.055793991416309016,
97
+ "grad_norm": 4.237913131713867,
98
  "learning_rate": 5.416666666666667e-06,
99
+ "loss": 1.623,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.060085836909871244,
104
+ "grad_norm": 3.4793739318847656,
105
  "learning_rate": 5.833333333333334e-06,
106
+ "loss": 1.1545,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.06437768240343347,
111
+ "grad_norm": 3.606027364730835,
112
  "learning_rate": 6.25e-06,
113
+ "loss": 1.0947,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.06866952789699571,
118
+ "grad_norm": 3.3845248222351074,
119
  "learning_rate": 6.666666666666667e-06,
120
+ "loss": 1.0227,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.07296137339055794,
125
+ "grad_norm": 2.8718042373657227,
126
  "learning_rate": 7.083333333333335e-06,
127
+ "loss": 0.9031,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.07725321888412018,
132
+ "grad_norm": 2.50516939163208,
133
  "learning_rate": 7.500000000000001e-06,
134
+ "loss": 0.849,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.0815450643776824,
139
+ "grad_norm": 2.0661134719848633,
140
  "learning_rate": 7.916666666666667e-06,
141
+ "loss": 0.587,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.08583690987124463,
146
+ "grad_norm": 1.3932267427444458,
147
  "learning_rate": 8.333333333333334e-06,
148
+ "loss": 0.4799,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.09012875536480687,
153
+ "grad_norm": 1.0650274753570557,
154
  "learning_rate": 8.750000000000001e-06,
155
+ "loss": 0.4626,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.0944206008583691,
160
+ "grad_norm": 0.8739241361618042,
161
  "learning_rate": 9.166666666666666e-06,
162
+ "loss": 0.4227,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.09871244635193133,
167
+ "grad_norm": 0.7223235368728638,
168
  "learning_rate": 9.583333333333335e-06,
169
+ "loss": 0.4152,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.10300429184549356,
174
+ "grad_norm": 0.694736123085022,
175
  "learning_rate": 1e-05,
176
+ "loss": 0.417,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.1072961373390558,
181
+ "grad_norm": 0.6262380480766296,
182
  "learning_rate": 9.999435142363484e-06,
183
+ "loss": 0.3824,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.11158798283261803,
188
+ "grad_norm": 0.7715851068496704,
189
  "learning_rate": 9.997740697079595e-06,
190
+ "loss": 0.4202,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.11587982832618025,
195
+ "grad_norm": 0.650015652179718,
196
  "learning_rate": 9.994917046996472e-06,
197
+ "loss": 0.367,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.12017167381974249,
202
+ "grad_norm": 0.6557629704475403,
203
  "learning_rate": 9.990964830098246e-06,
204
+ "loss": 0.3784,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.12446351931330472,
209
+ "grad_norm": 0.6453913450241089,
210
  "learning_rate": 9.985884939360873e-06,
211
+ "loss": 0.4083,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.12875536480686695,
216
+ "grad_norm": 0.5659867525100708,
217
  "learning_rate": 9.979678522550382e-06,
218
+ "loss": 0.3833,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.13304721030042918,
223
+ "grad_norm": 0.5851492881774902,
224
  "learning_rate": 9.972346981963546e-06,
225
+ "loss": 0.3664,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.13733905579399142,
230
+ "grad_norm": 0.6640633940696716,
231
  "learning_rate": 9.963891974111042e-06,
232
+ "loss": 0.3746,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.14163090128755365,
237
+ "grad_norm": 0.5522991418838501,
238
  "learning_rate": 9.95431540934317e-06,
239
+ "loss": 0.3726,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1459227467811159,
244
+ "grad_norm": 0.5857948660850525,
245
  "learning_rate": 9.943619451418225e-06,
246
+ "loss": 0.3985,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.15021459227467812,
251
+ "grad_norm": 0.5637620687484741,
252
  "learning_rate": 9.931806517013612e-06,
253
+ "loss": 0.358,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.15450643776824036,
258
+ "grad_norm": 0.5759857892990112,
259
  "learning_rate": 9.918879275179819e-06,
260
+ "loss": 0.3415,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.15879828326180256,
265
+ "grad_norm": 0.5764205455780029,
266
  "learning_rate": 9.904840646737346e-06,
267
  "loss": 0.3734,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.1630901287553648,
272
+ "grad_norm": 0.5272979736328125,
273
  "learning_rate": 9.889693803616793e-06,
274
+ "loss": 0.3689,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.16738197424892703,
279
+ "grad_norm": 0.49785128235816956,
280
  "learning_rate": 9.873442168142158e-06,
281
+ "loss": 0.3608,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.17167381974248927,
286
+ "grad_norm": 0.5396416187286377,
287
  "learning_rate": 9.856089412257605e-06,
288
+ "loss": 0.3354,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.1759656652360515,
293
+ "grad_norm": 0.6894033551216125,
294
  "learning_rate": 9.837639456697802e-06,
295
+ "loss": 0.3833,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.18025751072961374,
300
+ "grad_norm": 0.4849610924720764,
301
  "learning_rate": 9.818096470102067e-06,
302
+ "loss": 0.3338,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.18454935622317598,
307
+ "grad_norm": 0.5170153379440308,
308
  "learning_rate": 9.797464868072489e-06,
309
+ "loss": 0.3529,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.1888412017167382,
314
+ "grad_norm": 0.5858620405197144,
315
  "learning_rate": 9.775749312176249e-06,
316
+ "loss": 0.3305,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.19313304721030042,
321
+ "grad_norm": 0.4724620282649994,
322
  "learning_rate": 9.752954708892379e-06,
323
+ "loss": 0.3386,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.19742489270386265,
328
+ "grad_norm": 0.5135810971260071,
329
  "learning_rate": 9.729086208503174e-06,
330
  "loss": 0.3416,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.2017167381974249,
335
+ "grad_norm": 0.4790899455547333,
336
  "learning_rate": 9.704149203930522e-06,
337
+ "loss": 0.3642,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.20600858369098712,
342
+ "grad_norm": 0.5261183977127075,
343
  "learning_rate": 9.67814932951741e-06,
344
+ "loss": 0.372,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.21030042918454936,
349
+ "grad_norm": 0.7327371835708618,
350
  "learning_rate": 9.651092459754879e-06,
351
+ "loss": 0.3721,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.2145922746781116,
356
+ "grad_norm": 0.4842984974384308,
357
  "learning_rate": 9.622984707954732e-06,
358
+ "loss": 0.3484,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.21888412017167383,
363
+ "grad_norm": 0.5505157113075256,
364
  "learning_rate": 9.593832424868271e-06,
365
+ "loss": 0.3323,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.22317596566523606,
370
+ "grad_norm": 0.7517397999763489,
371
  "learning_rate": 9.563642197251382e-06,
372
+ "loss": 0.3712,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.22746781115879827,
377
+ "grad_norm": 0.5905287265777588,
378
  "learning_rate": 9.532420846376316e-06,
379
+ "loss": 0.3285,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.2317596566523605,
384
+ "grad_norm": 0.47292834520339966,
385
  "learning_rate": 9.500175426490455e-06,
386
+ "loss": 0.3177,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.23605150214592274,
391
+ "grad_norm": 0.5047996640205383,
392
  "learning_rate": 9.466913223222467e-06,
393
+ "loss": 0.3494,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.24034334763948498,
398
+ "grad_norm": 0.46452632546424866,
399
  "learning_rate": 9.432641751936162e-06,
400
  "loss": 0.3286,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.2446351931330472,
405
+ "grad_norm": 0.485176146030426,
406
  "learning_rate": 9.397368756032445e-06,
407
+ "loss": 0.3434,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.24892703862660945,
412
+ "grad_norm": 0.5602961778640747,
413
  "learning_rate": 9.361102205199762e-06,
414
+ "loss": 0.346,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.2532188841201717,
419
+ "grad_norm": 0.4957013726234436,
420
  "learning_rate": 9.32385029361338e-06,
421
+ "loss": 0.3238,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.2575107296137339,
426
+ "grad_norm": 0.5618671178817749,
427
  "learning_rate": 9.285621438083997e-06,
428
+ "loss": 0.3189,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.26180257510729615,
433
+ "grad_norm": 0.5188417434692383,
434
  "learning_rate": 9.246424276156008e-06,
435
+ "loss": 0.3294,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.26609442060085836,
440
+ "grad_norm": 0.5750749111175537,
441
  "learning_rate": 9.206267664155906e-06,
442
+ "loss": 0.3271,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.2703862660944206,
447
+ "grad_norm": 0.4488634169101715,
448
  "learning_rate": 9.165160675191272e-06,
449
+ "loss": 0.3317,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.27467811158798283,
454
+ "grad_norm": 0.8180350661277771,
455
  "learning_rate": 9.123112597100759e-06,
456
+ "loss": 0.3379,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.27896995708154504,
461
+ "grad_norm": 0.5122578740119934,
462
  "learning_rate": 9.080132930355567e-06,
463
+ "loss": 0.3331,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.2832618025751073,
468
+ "grad_norm": 0.45875608921051025,
469
  "learning_rate": 9.03623138591289e-06,
470
+ "loss": 0.308,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.2875536480686695,
475
+ "grad_norm": 0.44469574093818665,
476
  "learning_rate": 8.99141788302178e-06,
477
+ "loss": 0.3068,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.2918454935622318,
482
+ "grad_norm": 0.5081714987754822,
483
  "learning_rate": 8.94570254698197e-06,
484
+ "loss": 0.3,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.296137339055794,
489
+ "grad_norm": 0.45017769932746887,
490
  "learning_rate": 8.899095706856122e-06,
491
+ "loss": 0.3223,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.30042918454935624,
496
+ "grad_norm": 0.4976506233215332,
497
  "learning_rate": 8.851607893136065e-06,
498
+ "loss": 0.352,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.30472103004291845,
503
+ "grad_norm": 0.4459227919578552,
504
  "learning_rate": 8.803249835363486e-06,
505
+ "loss": 0.2959,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.3090128755364807,
510
+ "grad_norm": 0.4941282272338867,
511
  "learning_rate": 8.754032459705672e-06,
512
+ "loss": 0.3462,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.3133047210300429,
517
+ "grad_norm": 0.4608596861362457,
518
  "learning_rate": 8.703966886486819e-06,
519
+ "loss": 0.3291,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.31759656652360513,
524
+ "grad_norm": 0.5216560959815979,
525
  "learning_rate": 8.65306442767547e-06,
526
+ "loss": 0.3205,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.3218884120171674,
531
+ "grad_norm": 0.6270391941070557,
532
  "learning_rate": 8.601336584328659e-06,
533
+ "loss": 0.3354,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3261802575107296,
538
+ "grad_norm": 0.5226686000823975,
539
  "learning_rate": 8.548795043993316e-06,
540
+ "loss": 0.2959,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.33047210300429186,
545
+ "grad_norm": 0.45553216338157654,
546
  "learning_rate": 8.495451678065563e-06,
547
+ "loss": 0.3333,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.33476394849785407,
552
+ "grad_norm": 0.4647076427936554,
553
  "learning_rate": 8.441318539108433e-06,
554
+ "loss": 0.3463,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.33905579399141633,
559
+ "grad_norm": 0.4420306086540222,
560
  "learning_rate": 8.386407858128707e-06,
561
+ "loss": 0.3377,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.34334763948497854,
566
+ "grad_norm": 0.4489307403564453,
567
  "learning_rate": 8.330732041813367e-06,
568
+ "loss": 0.3138,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.34763948497854075,
573
+ "grad_norm": 0.47731906175613403,
574
  "learning_rate": 8.274303669726427e-06,
575
+ "loss": 0.345,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.351931330472103,
580
+ "grad_norm": 0.5177859663963318,
581
  "learning_rate": 8.217135491466636e-06,
582
+ "loss": 0.3468,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.3562231759656652,
587
+ "grad_norm": 0.4289628565311432,
588
  "learning_rate": 8.15924042378682e-06,
589
+ "loss": 0.3045,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.3605150214592275,
594
+ "grad_norm": 0.4363357424736023,
595
  "learning_rate": 8.100631547675417e-06,
596
+ "loss": 0.3042,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.3648068669527897,
601
+ "grad_norm": 0.46649646759033203,
602
  "learning_rate": 8.041322105400923e-06,
603
+ "loss": 0.3346,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.36909871244635195,
608
+ "grad_norm": 0.45827436447143555,
609
  "learning_rate": 7.981325497519892e-06,
610
+ "loss": 0.3199,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.37339055793991416,
615
+ "grad_norm": 0.46927300095558167,
616
  "learning_rate": 7.920655279849173e-06,
617
+ "loss": 0.3216,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.3776824034334764,
622
+ "grad_norm": 0.4526705741882324,
623
  "learning_rate": 7.859325160403073e-06,
624
+ "loss": 0.341,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.38197424892703863,
629
+ "grad_norm": 0.47333866357803345,
630
  "learning_rate": 7.797348996296116e-06,
631
+ "loss": 0.3214,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.38626609442060084,
636
+ "grad_norm": 0.4578205645084381,
637
  "learning_rate": 7.734740790612137e-06,
638
+ "loss": 0.3408,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.3905579399141631,
643
+ "grad_norm": 0.43222102522850037,
644
  "learning_rate": 7.671514689240366e-06,
645
  "loss": 0.3076,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.3948497854077253,
650
+ "grad_norm": 0.4864787459373474,
651
  "learning_rate": 7.607684977679284e-06,
652
+ "loss": 0.2992,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.39914163090128757,
657
+ "grad_norm": 0.4219631850719452,
658
  "learning_rate": 7.543266077808893e-06,
659
+ "loss": 0.2855,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.4034334763948498,
664
+ "grad_norm": 0.4421772062778473,
665
  "learning_rate": 7.478272544632204e-06,
666
+ "loss": 0.3204,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.40772532188841204,
671
+ "grad_norm": 0.5428329110145569,
672
  "learning_rate": 7.412719062986632e-06,
673
+ "loss": 0.3378,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.41201716738197425,
678
+ "grad_norm": 0.42145970463752747,
679
  "learning_rate": 7.3466204442260605e-06,
680
+ "loss": 0.2763,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.41630901287553645,
685
+ "grad_norm": 0.459547221660614,
686
  "learning_rate": 7.279991622874319e-06,
687
+ "loss": 0.3421,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.4206008583690987,
692
+ "grad_norm": 0.4329896569252014,
693
  "learning_rate": 7.212847653250828e-06,
694
+ "loss": 0.2995,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.4248927038626609,
699
+ "grad_norm": 0.4259698688983917,
700
  "learning_rate": 7.145203706069183e-06,
701
+ "loss": 0.2866,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.4291845493562232,
706
+ "grad_norm": 0.46346020698547363,
707
  "learning_rate": 7.0770750650094335e-06,
708
+ "loss": 0.3374,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.4334763948497854,
713
+ "grad_norm": 0.4846978187561035,
714
  "learning_rate": 7.008477123264849e-06,
715
+ "loss": 0.3449,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.43776824034334766,
720
+ "grad_norm": 0.43872982263565063,
721
  "learning_rate": 6.939425380063924e-06,
722
+ "loss": 0.3158,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.44206008583690987,
727
+ "grad_norm": 0.4564816951751709,
728
  "learning_rate": 6.869935437168449e-06,
729
+ "loss": 0.3233,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.44635193133047213,
734
+ "grad_norm": 0.4504660964012146,
735
  "learning_rate": 6.800022995348381e-06,
736
+ "loss": 0.3186,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.45064377682403434,
741
+ "grad_norm": 0.4534120261669159,
742
  "learning_rate": 6.729703850834381e-06,
743
+ "loss": 0.3256,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.45493562231759654,
748
+ "grad_norm": 0.41408586502075195,
749
  "learning_rate": 6.65899389174876e-06,
750
+ "loss": 0.2824,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.4592274678111588,
755
+ "grad_norm": 0.4310588836669922,
756
  "learning_rate": 6.587909094515663e-06,
757
+ "loss": 0.2962,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.463519313304721,
762
+ "grad_norm": 0.39895761013031006,
763
  "learning_rate": 6.5164655202513135e-06,
764
+ "loss": 0.2789,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.4678111587982833,
769
+ "grad_norm": 0.4631725251674652,
770
  "learning_rate": 6.444679311135112e-06,
771
+ "loss": 0.3409,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.4721030042918455,
776
+ "grad_norm": 0.4815923869609833,
777
  "learning_rate": 6.372566686762427e-06,
778
+ "loss": 0.3413,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.47639484978540775,
783
+ "grad_norm": 0.4571560025215149,
784
  "learning_rate": 6.300143940479881e-06,
785
+ "loss": 0.3255,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.48068669527896996,
790
+ "grad_norm": 0.44677498936653137,
791
  "learning_rate": 6.227427435703997e-06,
792
+ "loss": 0.2897,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.48497854077253216,
797
+ "grad_norm": 0.47044360637664795,
798
  "learning_rate": 6.154433602223979e-06,
799
+ "loss": 0.2694,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.4892703862660944,
804
+ "grad_norm": 0.45956674218177795,
805
  "learning_rate": 6.0811789324895365e-06,
806
+ "loss": 0.3009,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.49356223175965663,
811
+ "grad_norm": 0.4583991467952728,
812
  "learning_rate": 6.0076799778845105e-06,
813
+ "loss": 0.3071,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.4978540772532189,
818
+ "grad_norm": 0.4753432273864746,
819
  "learning_rate": 5.933953344987215e-06,
820
+ "loss": 0.3127,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.5021459227467812,
825
+ "grad_norm": 0.47773176431655884,
826
  "learning_rate": 5.860015691818292e-06,
827
+ "loss": 0.3316,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.5064377682403434,
832
+ "grad_norm": 0.4430916905403137,
833
  "learning_rate": 5.78588372407695e-06,
834
+ "loss": 0.2908,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.5107296137339056,
839
+ "grad_norm": 0.4451543390750885,
840
  "learning_rate": 5.711574191366427e-06,
841
+ "loss": 0.3078,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.5150214592274678,
846
+ "grad_norm": 1.239274024963379,
847
  "learning_rate": 5.637103883409525e-06,
848
+ "loss": 0.3047,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.51931330472103,
853
+ "grad_norm": 0.45960766077041626,
854
  "learning_rate": 5.562489626255104e-06,
855
+ "loss": 0.313,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.5236051502145923,
860
+ "grad_norm": 0.46485236287117004,
861
  "learning_rate": 5.487748278476342e-06,
862
+ "loss": 0.3129,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.5278969957081545,
867
+ "grad_norm": 0.4218948781490326,
868
  "learning_rate": 5.412896727361663e-06,
869
+ "loss": 0.2524,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.5321888412017167,
874
+ "grad_norm": 0.44945651292800903,
875
  "learning_rate": 5.337951885099167e-06,
876
+ "loss": 0.2905,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.5364806866952789,
881
+ "grad_norm": 0.45700693130493164,
882
  "learning_rate": 5.262930684955439e-06,
883
+ "loss": 0.2936,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.5407725321888412,
888
+ "grad_norm": 0.46893808245658875,
889
  "learning_rate": 5.187850077449604e-06,
890
+ "loss": 0.3184,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.5450643776824035,
895
+ "grad_norm": 0.4275188148021698,
896
  "learning_rate": 5.112727026523461e-06,
897
+ "loss": 0.2762,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.5493562231759657,
902
+ "grad_norm": 0.47469258308410645,
903
  "learning_rate": 5.03757850570861e-06,
904
+ "loss": 0.3085,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.5536480686695279,
909
+ "grad_norm": 0.49836552143096924,
910
  "learning_rate": 4.9624214942913916e-06,
911
+ "loss": 0.3023,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.5579399141630901,
916
+ "grad_norm": 0.5247652530670166,
917
  "learning_rate": 4.88727297347654e-06,
918
+ "loss": 0.3083,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.5622317596566524,
923
+ "grad_norm": 0.4549153447151184,
924
  "learning_rate": 4.8121499225503974e-06,
925
+ "loss": 0.2881,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.5665236051502146,
930
+ "grad_norm": 0.44566160440444946,
931
  "learning_rate": 4.737069315044562e-06,
932
+ "loss": 0.2951,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.5708154506437768,
937
+ "grad_norm": 1.0022189617156982,
938
  "learning_rate": 4.662048114900837e-06,
939
+ "loss": 0.3241,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.575107296137339,
944
+ "grad_norm": 1.2487646341323853,
945
  "learning_rate": 4.587103272638339e-06,
946
+ "loss": 0.3282,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.5793991416309013,
951
+ "grad_norm": 0.4649348855018616,
952
  "learning_rate": 4.512251721523659e-06,
953
+ "loss": 0.2695,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.5836909871244635,
958
+ "grad_norm": 0.6003963947296143,
959
  "learning_rate": 4.437510373744897e-06,
960
+ "loss": 0.3098,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.5879828326180258,
965
+ "grad_norm": 0.5494801998138428,
966
  "learning_rate": 4.362896116590475e-06,
967
+ "loss": 0.3057,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.592274678111588,
972
+ "grad_norm": 0.46754059195518494,
973
  "learning_rate": 4.2884258086335755e-06,
974
+ "loss": 0.3237,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.5965665236051502,
979
+ "grad_norm": 0.6345877051353455,
980
  "learning_rate": 4.214116275923051e-06,
981
+ "loss": 0.2745,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.6008583690987125,
986
+ "grad_norm": 0.5413657426834106,
987
  "learning_rate": 4.1399843081817085e-06,
988
+ "loss": 0.3143,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.6051502145922747,
993
+ "grad_norm": 0.47255265712738037,
994
  "learning_rate": 4.066046655012786e-06,
995
+ "loss": 0.3053,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.6094420600858369,
1000
+ "grad_norm": 0.5756471753120422,
1001
  "learning_rate": 3.992320022115492e-06,
1002
+ "loss": 0.3173,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.6137339055793991,
1007
+ "grad_norm": 0.5067482590675354,
1008
  "learning_rate": 3.918821067510464e-06,
1009
+ "loss": 0.2936,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.6180257510729614,
1014
+ "grad_norm": 0.5305939316749573,
1015
  "learning_rate": 3.845566397776022e-06,
1016
+ "loss": 0.3315,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.6223175965665236,
1021
+ "grad_norm": 0.5009644627571106,
1022
  "learning_rate": 3.7725725642960047e-06,
1023
+ "loss": 0.3065,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.6266094420600858,
1028
+ "grad_norm": 0.8466928005218506,
1029
  "learning_rate": 3.6998560595201188e-06,
1030
+ "loss": 0.2926,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.630901287553648,
1035
+ "grad_norm": 0.4850812256336212,
1036
  "learning_rate": 3.627433313237576e-06,
1037
+ "loss": 0.2967,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.6351931330472103,
1042
+ "grad_norm": 0.49088186025619507,
1043
  "learning_rate": 3.555320688864889e-06,
1044
+ "loss": 0.2921,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.6394849785407726,
1049
+ "grad_norm": 0.4341699481010437,
1050
  "learning_rate": 3.483534479748688e-06,
1051
+ "loss": 0.2927,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.6437768240343348,
1056
+ "grad_norm": 0.48990392684936523,
1057
  "learning_rate": 3.4120909054843375e-06,
1058
+ "loss": 0.2847,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.648068669527897,
1063
+ "grad_norm": 0.39915674924850464,
1064
  "learning_rate": 3.3410061082512422e-06,
1065
+ "loss": 0.259,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.6523605150214592,
1070
+ "grad_norm": 0.4499885141849518,
1071
  "learning_rate": 3.2702961491656197e-06,
1072
+ "loss": 0.2874,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.6566523605150214,
1077
+ "grad_norm": 0.4216214120388031,
1078
  "learning_rate": 3.1999770046516198e-06,
1079
+ "loss": 0.2852,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.6609442060085837,
1084
+ "grad_norm": 0.4931308627128601,
1085
  "learning_rate": 3.130064562831553e-06,
1086
+ "loss": 0.2776,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.6652360515021459,
1091
+ "grad_norm": 0.5528554320335388,
1092
  "learning_rate": 3.0605746199360755e-06,
1093
+ "loss": 0.2991,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.6695278969957081,
1098
+ "grad_norm": 0.5081768035888672,
1099
  "learning_rate": 2.991522876735154e-06,
1100
+ "loss": 0.2876,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.6738197424892703,
1105
+ "grad_norm": 0.5041592121124268,
1106
  "learning_rate": 2.9229249349905686e-06,
1107
+ "loss": 0.303,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.6781115879828327,
1112
+ "grad_norm": 0.4566456377506256,
1113
  "learning_rate": 2.8547962939308187e-06,
1114
+ "loss": 0.2842,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.6824034334763949,
1119
+ "grad_norm": 0.6664004325866699,
1120
  "learning_rate": 2.787152346749173e-06,
1121
+ "loss": 0.3024,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.6866952789699571,
1126
+ "grad_norm": 0.4658995568752289,
1127
  "learning_rate": 2.720008377125682e-06,
1128
+ "loss": 0.2821,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.6909871244635193,
1133
+ "grad_norm": 0.5254700779914856,
1134
  "learning_rate": 2.6533795557739407e-06,
1135
+ "loss": 0.3255,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.6952789699570815,
1140
+ "grad_norm": 0.4895904064178467,
1141
  "learning_rate": 2.5872809370133704e-06,
1142
+ "loss": 0.295,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.6995708154506438,
1147
+ "grad_norm": 0.5141322612762451,
1148
  "learning_rate": 2.5217274553677975e-06,
1149
+ "loss": 0.3003,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.703862660944206,
1154
+ "grad_norm": 0.4485315978527069,
1155
  "learning_rate": 2.4567339221911086e-06,
1156
+ "loss": 0.294,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.7081545064377682,
1161
+ "grad_norm": 0.4758903384208679,
1162
  "learning_rate": 2.3923150223207176e-06,
1163
+ "loss": 0.3118,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.7124463519313304,
1168
+ "grad_norm": 0.43379268050193787,
1169
  "learning_rate": 2.328485310759635e-06,
1170
+ "loss": 0.2881,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.7167381974248928,
1175
+ "grad_norm": 0.5669881701469421,
1176
  "learning_rate": 2.265259209387867e-06,
1177
+ "loss": 0.3063,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.721030042918455,
1182
+ "grad_norm": 0.43821701407432556,
1183
  "learning_rate": 2.202651003703885e-06,
1184
+ "loss": 0.2817,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.7253218884120172,
1189
+ "grad_norm": 0.5514321327209473,
1190
  "learning_rate": 2.140674839596931e-06,
1191
+ "loss": 0.2665,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.7296137339055794,
1196
+ "grad_norm": 0.543703019618988,
1197
  "learning_rate": 2.0793447201508288e-06,
1198
+ "loss": 0.3159,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.7339055793991416,
1203
+ "grad_norm": 0.4547806978225708,
1204
  "learning_rate": 2.01867450248011e-06,
1205
+ "loss": 0.3043,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.7381974248927039,
1210
+ "grad_norm": 0.413406640291214,
1211
  "learning_rate": 1.9586778945990785e-06,
1212
+ "loss": 0.2827,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.7424892703862661,
1217
+ "grad_norm": 0.46883872151374817,
1218
  "learning_rate": 1.8993684523245842e-06,
1219
+ "loss": 0.291,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.7467811158798283,
1224
+ "grad_norm": 0.5354252457618713,
1225
  "learning_rate": 1.8407595762131814e-06,
1226
+ "loss": 0.3433,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.7510729613733905,
1231
+ "grad_norm": 0.4768048822879791,
1232
  "learning_rate": 1.7828645085333645e-06,
1233
+ "loss": 0.3483,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.7553648068669528,
1238
+ "grad_norm": 0.46260949969291687,
1239
  "learning_rate": 1.7256963302735752e-06,
1240
+ "loss": 0.3188,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.759656652360515,
1245
+ "grad_norm": 0.42871591448783875,
1246
  "learning_rate": 1.6692679581866334e-06,
1247
+ "loss": 0.3064,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.7639484978540773,
1252
+ "grad_norm": 0.4325142800807953,
1253
  "learning_rate": 1.6135921418712959e-06,
1254
+ "loss": 0.2752,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.7682403433476395,
1259
+ "grad_norm": 0.7094153761863708,
1260
  "learning_rate": 1.5586814608915673e-06,
1261
+ "loss": 0.3147,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.7725321888412017,
1266
+ "grad_norm": 0.5599787831306458,
1267
  "learning_rate": 1.5045483219344387e-06,
1268
+ "loss": 0.2803,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.776824034334764,
1273
+ "grad_norm": 0.4438364505767822,
1274
  "learning_rate": 1.4512049560066837e-06,
1275
+ "loss": 0.2767,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.7811158798283262,
1280
+ "grad_norm": 0.46111562848091125,
1281
  "learning_rate": 1.3986634156713418e-06,
1282
+ "loss": 0.286,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.7854077253218884,
1287
+ "grad_norm": 0.6384255886077881,
1288
  "learning_rate": 1.3469355723245303e-06,
1289
+ "loss": 0.2875,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.7896995708154506,
1294
+ "grad_norm": 0.46488675475120544,
1295
  "learning_rate": 1.2960331135131826e-06,
1296
+ "loss": 0.3303,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.7939914163090128,
1301
+ "grad_norm": 0.4769670367240906,
1302
  "learning_rate": 1.245967540294329e-06,
1303
+ "loss": 0.286,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.7982832618025751,
1308
+ "grad_norm": 0.4902667701244354,
1309
  "learning_rate": 1.1967501646365147e-06,
1310
+ "loss": 0.2775,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.8025751072961373,
1315
+ "grad_norm": 0.4696778953075409,
1316
  "learning_rate": 1.1483921068639353e-06,
1317
+ "loss": 0.276,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.8068669527896996,
1322
+ "grad_norm": 0.49862194061279297,
1323
  "learning_rate": 1.1009042931438784e-06,
1324
+ "loss": 0.3005,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.8111587982832618,
1329
+ "grad_norm": 0.46521204710006714,
1330
  "learning_rate": 1.0542974530180327e-06,
1331
+ "loss": 0.3083,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.8154506437768241,
1336
+ "grad_norm": 0.3978651165962219,
1337
  "learning_rate": 1.00858211697822e-06,
1338
+ "loss": 0.2661,
1339
  "step": 190
1340
  },
1341
  {
1342
  "epoch": 0.8197424892703863,
1343
+ "grad_norm": 0.41702109575271606,
1344
  "learning_rate": 9.637686140871121e-07,
1345
+ "loss": 0.2915,
1346
  "step": 191
1347
  },
1348
  {
1349
  "epoch": 0.8240343347639485,
1350
+ "grad_norm": 0.42670562863349915,
1351
  "learning_rate": 9.198670696444339e-07,
1352
+ "loss": 0.2825,
1353
  "step": 192
1354
  },
1355
  {
1356
  "epoch": 0.8283261802575107,
1357
+ "grad_norm": 0.47617340087890625,
1358
  "learning_rate": 8.768874028992431e-07,
1359
+ "loss": 0.3216,
1360
  "step": 193
1361
  },
1362
  {
1363
  "epoch": 0.8326180257510729,
1364
+ "grad_norm": 0.4543740451335907,
1365
  "learning_rate": 8.348393248087289e-07,
1366
+ "loss": 0.3192,
1367
  "step": 194
1368
  },
1369
  {
1370
  "epoch": 0.8369098712446352,
1371
+ "grad_norm": 0.5380436182022095,
1372
  "learning_rate": 7.937323358440935e-07,
1373
+ "loss": 0.3419,
1374
  "step": 195
1375
  },
1376
  {
1377
  "epoch": 0.8412017167381974,
1378
+ "grad_norm": 1.2507916688919067,
1379
  "learning_rate": 7.535757238439939e-07,
1380
+ "loss": 0.313,
1381
  "step": 196
1382
  },
1383
  {
1384
  "epoch": 0.8454935622317596,
1385
+ "grad_norm": 0.4088553488254547,
1386
  "learning_rate": 7.143785619160026e-07,
1387
+ "loss": 0.2796,
1388
  "step": 197
1389
  },
1390
  {
1391
  "epoch": 0.8497854077253219,
1392
+ "grad_norm": 0.41873860359191895,
1393
  "learning_rate": 6.761497063866207e-07,
1394
+ "loss": 0.2863,
1395
  "step": 198
1396
  },
1397
  {
1398
  "epoch": 0.8540772532188842,
1399
+ "grad_norm": 0.5995304584503174,
1400
  "learning_rate": 6.388977948002406e-07,
1401
+ "loss": 0.3035,
1402
  "step": 199
1403
  },
1404
  {
1405
  "epoch": 0.8583690987124464,
1406
+ "grad_norm": 0.46327289938926697,
1407
  "learning_rate": 6.026312439675553e-07,
1408
+ "loss": 0.2753,
1409
  "step": 200
1410
  },
1411
  {
1412
  "epoch": 0.8626609442060086,
1413
+ "grad_norm": 0.4259130358695984,
1414
  "learning_rate": 5.673582480638395e-07,
1415
+ "loss": 0.3039,
1416
  "step": 201
1417
  },
1418
  {
1419
  "epoch": 0.8669527896995708,
1420
+ "grad_norm": 0.47707608342170715,
1421
  "learning_rate": 5.330867767775333e-07,
1422
+ "loss": 0.3406,
1423
  "step": 202
1424
  },
1425
  {
1426
  "epoch": 0.871244635193133,
1427
+ "grad_norm": 0.585634708404541,
1428
  "learning_rate": 4.998245735095459e-07,
1429
+ "loss": 0.305,
1430
  "step": 203
1431
  },
1432
  {
1433
  "epoch": 0.8755364806866953,
1434
+ "grad_norm": 0.46183857321739197,
1435
  "learning_rate": 4.6757915362368567e-07,
1436
+ "loss": 0.2884,
1437
  "step": 204
1438
  },
1439
  {
1440
  "epoch": 0.8798283261802575,
1441
+ "grad_norm": 0.45979636907577515,
1442
  "learning_rate": 4.363578027486187e-07,
1443
+ "loss": 0.3036,
1444
  "step": 205
1445
  },
1446
  {
1447
  "epoch": 0.8841201716738197,
1448
+ "grad_norm": 0.4133254289627075,
1449
  "learning_rate": 4.0616757513173123e-07,
1450
+ "loss": 0.2845,
1451
  "step": 206
1452
  },
1453
  {
1454
  "epoch": 0.8884120171673819,
1455
+ "grad_norm": 0.45074373483657837,
1456
  "learning_rate": 3.7701529204526856e-07,
1457
+ "loss": 0.2955,
1458
  "step": 207
1459
  },
1460
  {
1461
  "epoch": 0.8927038626609443,
1462
+ "grad_norm": 0.4666990339756012,
1463
  "learning_rate": 3.4890754024512254e-07,
1464
+ "loss": 0.3046,
1465
  "step": 208
1466
  },
1467
  {
1468
  "epoch": 0.8969957081545065,
1469
+ "grad_norm": 0.45079725980758667,
1470
  "learning_rate": 3.2185067048259245e-07,
1471
+ "loss": 0.2938,
1472
  "step": 209
1473
  },
1474
  {
1475
  "epoch": 0.9012875536480687,
1476
+ "grad_norm": 0.45044079422950745,
1477
  "learning_rate": 2.9585079606947843e-07,
1478
+ "loss": 0.3,
1479
  "step": 210
1480
  },
1481
  {
1482
  "epoch": 0.9055793991416309,
1483
+ "grad_norm": 0.4355252683162689,
1484
  "learning_rate": 2.7091379149682683e-07,
1485
+ "loss": 0.2758,
1486
  "step": 211
1487
  },
1488
  {
1489
  "epoch": 0.9098712446351931,
1490
+ "grad_norm": 0.43034669756889343,
1491
  "learning_rate": 2.470452911076227e-07,
1492
+ "loss": 0.2785,
1493
  "step": 212
1494
  },
1495
  {
1496
  "epoch": 0.9141630901287554,
1497
+ "grad_norm": 0.4334523379802704,
1498
  "learning_rate": 2.242506878237538e-07,
1499
+ "loss": 0.297,
1500
  "step": 213
1501
  },
1502
  {
1503
  "epoch": 0.9184549356223176,
1504
+ "grad_norm": 0.40220150351524353,
1505
  "learning_rate": 2.0253513192751374e-07,
1506
+ "loss": 0.2833,
1507
  "step": 214
1508
  },
1509
  {
1510
  "epoch": 0.9227467811158798,
1511
+ "grad_norm": 0.4706878364086151,
1512
  "learning_rate": 1.8190352989793325e-07,
1513
+ "loss": 0.2824,
1514
  "step": 215
1515
  },
1516
  {
1517
  "epoch": 0.927038626609442,
1518
+ "grad_norm": 0.44920873641967773,
1519
  "learning_rate": 1.6236054330219853e-07,
1520
+ "loss": 0.3015,
1521
  "step": 216
1522
  },
1523
  {
1524
  "epoch": 0.9313304721030042,
1525
+ "grad_norm": 0.48133671283721924,
1526
  "learning_rate": 1.439105877423963e-07,
1527
+ "loss": 0.2982,
1528
  "step": 217
1529
  },
1530
  {
1531
  "epoch": 0.9356223175965666,
1532
+ "grad_norm": 0.4548247456550598,
1533
  "learning_rate": 1.2655783185784253e-07,
1534
+ "loss": 0.2984,
1535
  "step": 218
1536
  },
1537
  {
1538
  "epoch": 0.9399141630901288,
1539
+ "grad_norm": 0.4537779986858368,
1540
  "learning_rate": 1.1030619638320805e-07,
1541
+ "loss": 0.2971,
1542
  "step": 219
1543
  },
1544
  {
1545
  "epoch": 0.944206008583691,
1546
+ "grad_norm": 0.42610111832618713,
1547
  "learning_rate": 9.51593532626538e-08,
1548
+ "loss": 0.2809,
1549
  "step": 220
1550
  },
1551
  {
1552
  "epoch": 0.9484978540772532,
1553
+ "grad_norm": 0.5735429525375366,
1554
  "learning_rate": 8.11207248201834e-08,
1555
+ "loss": 0.2924,
1556
  "step": 221
1557
  },
1558
  {
1559
  "epoch": 0.9527896995708155,
1560
+ "grad_norm": 0.8572208285331726,
1561
  "learning_rate": 6.819348298638839e-08,
1562
+ "loss": 0.3269,
1563
  "step": 222
1564
  },
1565
  {
1566
  "epoch": 0.9570815450643777,
1567
+ "grad_norm": 0.5738445520401001,
1568
  "learning_rate": 5.638054858177644e-08,
1569
+ "loss": 0.2796,
1570
  "step": 223
1571
  },
1572
  {
1573
  "epoch": 0.9613733905579399,
1574
+ "grad_norm": 0.4766637980937958,
1575
  "learning_rate": 4.568459065683206e-08,
1576
+ "loss": 0.2876,
1577
  "step": 224
1578
  },
1579
  {
1580
  "epoch": 0.9656652360515021,
1581
+ "grad_norm": 0.4772549867630005,
1582
  "learning_rate": 3.610802588895845e-08,
1583
+ "loss": 0.3196,
1584
  "step": 225
1585
  },
1586
  {
1587
  "epoch": 0.9699570815450643,
1588
+ "grad_norm": 0.4136182963848114,
1589
  "learning_rate": 2.765301803645426e-08,
1590
+ "loss": 0.2603,
1591
  "step": 226
1592
  },
1593
  {
1594
  "epoch": 0.9742489270386266,
1595
+ "grad_norm": 0.4521878659725189,
1596
  "learning_rate": 2.0321477449619098e-08,
1597
+ "loss": 0.3104,
1598
  "step": 227
1599
  },
1600
  {
1601
  "epoch": 0.9785407725321889,
1602
+ "grad_norm": 0.4722403287887573,
1603
  "learning_rate": 1.411506063912882e-08,
1604
+ "loss": 0.3228,
1605
  "step": 228
1606
  },
1607
  {
1608
  "epoch": 0.9828326180257511,
1609
+ "grad_norm": 0.46703073382377625,
1610
  "learning_rate": 9.035169901754902e-09,
1611
+ "loss": 0.2847,
1612
  "step": 229
1613
  },
1614
  {
1615
  "epoch": 0.9871244635193133,
1616
+ "grad_norm": 0.4283360540866852,
1617
  "learning_rate": 5.082953003528457e-09,
1618
+ "loss": 0.276,
1619
  "step": 230
1620
  },
1621
  {
1622
  "epoch": 0.9914163090128756,
1623
+ "grad_norm": 0.4464006721973419,
1624
  "learning_rate": 2.2593029204076578e-09,
1625
+ "loss": 0.2801,
1626
  "step": 231
1627
  },
1628
  {
1629
  "epoch": 0.9957081545064378,
1630
+ "grad_norm": 0.4991028904914856,
1631
  "learning_rate": 5.648576365169245e-10,
1632
+ "loss": 0.3083,
1633
  "step": 232
1634
  },
1635
  {
1636
  "epoch": 1.0,
1637
+ "grad_norm": 0.4361727237701416,
1638
  "learning_rate": 0.0,
1639
+ "loss": 0.3034,
1640
  "step": 233
1641
  },
1642
  {
1643
  "epoch": 1.0,
1644
+ "eval_loss": 0.39615777134895325,
1645
+ "eval_runtime": 1.5197,
1646
+ "eval_samples_per_second": 151.345,
1647
+ "eval_steps_per_second": 6.58,
1648
  "step": 233
1649
  },
1650
  {
1651
  "epoch": 1.0,
1652
  "step": 233,
1653
  "total_flos": 1.933383782157517e+16,
1654
+ "train_loss": 0.4341796827162796,
1655
+ "train_runtime": 302.6302,
1656
+ "train_samples_per_second": 36.811,
1657
+ "train_steps_per_second": 0.77
1658
  }
1659
  ],
1660
  "logging_steps": 1,