c10 commited on
Commit
d316588
1 Parent(s): af81b99

Model save

Browse files
Files changed (5) hide show
  1. README.md +3 -3
  2. all_results.json +8 -8
  3. eval_results.json +4 -4
  4. train_results.json +4 -4
  5. trainer_state.json +457 -457
README.md CHANGED
@@ -4,8 +4,8 @@ base_model: yanolja/EEVE-Korean-Instruct-2.8B-v1.0
4
  tags:
5
  - trl
6
  - sft
7
- - generated_from_trainer
8
  - alignment-handbook
 
9
  model-index:
10
  - name: eeve-2.8B_C
11
  results: []
@@ -18,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [yanolja/EEVE-Korean-Instruct-2.8B-v1.0](https://huggingface.co/yanolja/EEVE-Korean-Instruct-2.8B-v1.0) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.3928
22
 
23
  ## Model description
24
 
@@ -54,7 +54,7 @@ The following hyperparameters were used during training:
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 0.3028 | 1.0 | 233 | 0.3928 |
58
 
59
 
60
  ### Framework versions
 
4
  tags:
5
  - trl
6
  - sft
 
7
  - alignment-handbook
8
+ - generated_from_trainer
9
  model-index:
10
  - name: eeve-2.8B_C
11
  results: []
 
18
 
19
  This model is a fine-tuned version of [yanolja/EEVE-Korean-Instruct-2.8B-v1.0](https://huggingface.co/yanolja/EEVE-Korean-Instruct-2.8B-v1.0) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.3932
22
 
23
  ## Model description
24
 
 
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
+ | 0.3021 | 1.0 | 233 | 0.3932 |
58
 
59
 
60
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 0.39601820707321167,
4
- "eval_runtime": 1.5467,
5
  "eval_samples": 230,
6
- "eval_samples_per_second": 148.707,
7
- "eval_steps_per_second": 6.466,
8
  "total_flos": 1.933383782157517e+16,
9
- "train_loss": 0.43344665942273936,
10
- "train_runtime": 303.441,
11
  "train_samples": 11140,
12
- "train_samples_per_second": 36.712,
13
- "train_steps_per_second": 0.768
14
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 0.3927557170391083,
4
+ "eval_runtime": 1.5271,
5
  "eval_samples": 230,
6
+ "eval_samples_per_second": 150.611,
7
+ "eval_steps_per_second": 6.548,
8
  "total_flos": 1.933383782157517e+16,
9
+ "train_loss": 0.43314852683840904,
10
+ "train_runtime": 303.7389,
11
  "train_samples": 11140,
12
+ "train_samples_per_second": 36.676,
13
+ "train_steps_per_second": 0.767
14
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 0.39601820707321167,
4
- "eval_runtime": 1.5467,
5
  "eval_samples": 230,
6
- "eval_samples_per_second": 148.707,
7
- "eval_steps_per_second": 6.466
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 0.3927557170391083,
4
+ "eval_runtime": 1.5271,
5
  "eval_samples": 230,
6
+ "eval_samples_per_second": 150.611,
7
+ "eval_steps_per_second": 6.548
8
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 1.933383782157517e+16,
4
- "train_loss": 0.43344665942273936,
5
- "train_runtime": 303.441,
6
  "train_samples": 11140,
7
- "train_samples_per_second": 36.712,
8
- "train_steps_per_second": 0.768
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 1.933383782157517e+16,
4
+ "train_loss": 0.43314852683840904,
5
+ "train_runtime": 303.7389,
6
  "train_samples": 11140,
7
+ "train_samples_per_second": 36.676,
8
+ "train_steps_per_second": 0.767
9
  }
trainer_state.json CHANGED
@@ -10,1651 +10,1651 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.004291845493562232,
13
- "grad_norm": 6.858878135681152,
14
  "learning_rate": 4.1666666666666667e-07,
15
  "loss": 2.3401,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.008583690987124463,
20
- "grad_norm": 7.190281391143799,
21
  "learning_rate": 8.333333333333333e-07,
22
  "loss": 2.3774,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.012875536480686695,
27
- "grad_norm": 7.002199172973633,
28
  "learning_rate": 1.25e-06,
29
- "loss": 2.3584,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.017167381974248927,
34
- "grad_norm": 6.990989685058594,
35
  "learning_rate": 1.6666666666666667e-06,
36
- "loss": 2.3648,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.02145922746781116,
41
- "grad_norm": 6.870188236236572,
42
  "learning_rate": 2.0833333333333334e-06,
43
- "loss": 2.3489,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.02575107296137339,
48
- "grad_norm": 6.819883346557617,
49
  "learning_rate": 2.5e-06,
50
- "loss": 2.2795,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.030042918454935622,
55
- "grad_norm": 6.639195442199707,
56
  "learning_rate": 2.916666666666667e-06,
57
- "loss": 2.2742,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.034334763948497854,
62
- "grad_norm": 5.921014308929443,
63
  "learning_rate": 3.3333333333333333e-06,
64
- "loss": 2.1436,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.03862660944206009,
69
- "grad_norm": 5.771921157836914,
70
  "learning_rate": 3.7500000000000005e-06,
71
- "loss": 2.1177,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.04291845493562232,
76
- "grad_norm": 5.56660270690918,
77
  "learning_rate": 4.166666666666667e-06,
78
- "loss": 2.0906,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.04721030042918455,
83
- "grad_norm": 4.383533477783203,
84
  "learning_rate": 4.583333333333333e-06,
85
- "loss": 1.7752,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.05150214592274678,
90
- "grad_norm": 4.370767593383789,
91
  "learning_rate": 5e-06,
92
- "loss": 1.6766,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.055793991416309016,
97
- "grad_norm": 4.208242893218994,
98
  "learning_rate": 5.416666666666667e-06,
99
- "loss": 1.622,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.060085836909871244,
104
- "grad_norm": 3.4597911834716797,
105
  "learning_rate": 5.833333333333334e-06,
106
- "loss": 1.1528,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.06437768240343347,
111
- "grad_norm": 3.6010146141052246,
112
  "learning_rate": 6.25e-06,
113
- "loss": 1.0936,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.06866952789699571,
118
- "grad_norm": 3.400857448577881,
119
  "learning_rate": 6.666666666666667e-06,
120
- "loss": 1.0217,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.07296137339055794,
125
- "grad_norm": 2.8686187267303467,
126
  "learning_rate": 7.083333333333335e-06,
127
- "loss": 0.9024,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.07725321888412018,
132
- "grad_norm": 2.4871363639831543,
133
  "learning_rate": 7.500000000000001e-06,
134
- "loss": 0.848,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.0815450643776824,
139
- "grad_norm": 2.057180881500244,
140
  "learning_rate": 7.916666666666667e-06,
141
- "loss": 0.586,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.08583690987124463,
146
- "grad_norm": 1.4040249586105347,
147
  "learning_rate": 8.333333333333334e-06,
148
- "loss": 0.4792,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.09012875536480687,
153
- "grad_norm": 1.0943259000778198,
154
  "learning_rate": 8.750000000000001e-06,
155
- "loss": 0.4627,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.0944206008583691,
160
- "grad_norm": 0.8370047807693481,
161
  "learning_rate": 9.166666666666666e-06,
162
- "loss": 0.4231,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.09871244635193133,
167
- "grad_norm": 0.7553132772445679,
168
  "learning_rate": 9.583333333333335e-06,
169
- "loss": 0.4156,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.10300429184549356,
174
- "grad_norm": 0.7301490902900696,
175
  "learning_rate": 1e-05,
176
- "loss": 0.417,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.1072961373390558,
181
- "grad_norm": 0.696942150592804,
182
  "learning_rate": 9.999435142363484e-06,
183
- "loss": 0.3828,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.11158798283261803,
188
- "grad_norm": 0.7832161784172058,
189
  "learning_rate": 9.997740697079595e-06,
190
- "loss": 0.4205,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.11587982832618025,
195
- "grad_norm": 0.6943851709365845,
196
  "learning_rate": 9.994917046996472e-06,
197
- "loss": 0.3683,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.12017167381974249,
202
- "grad_norm": 0.6518245935440063,
203
  "learning_rate": 9.990964830098246e-06,
204
- "loss": 0.3797,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.12446351931330472,
209
- "grad_norm": 0.6293859481811523,
210
  "learning_rate": 9.985884939360873e-06,
211
- "loss": 0.4082,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.12875536480686695,
216
- "grad_norm": 0.5730544924736023,
217
  "learning_rate": 9.979678522550382e-06,
218
- "loss": 0.3835,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.13304721030042918,
223
- "grad_norm": 0.5572949647903442,
224
  "learning_rate": 9.972346981963546e-06,
225
- "loss": 0.3656,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.13733905579399142,
230
- "grad_norm": 0.5439539551734924,
231
  "learning_rate": 9.963891974111042e-06,
232
- "loss": 0.3745,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.14163090128755365,
237
- "grad_norm": 0.5525590181350708,
238
  "learning_rate": 9.95431540934317e-06,
239
- "loss": 0.3736,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1459227467811159,
244
- "grad_norm": 0.5891681909561157,
245
  "learning_rate": 9.943619451418225e-06,
246
- "loss": 0.3988,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.15021459227467812,
251
- "grad_norm": 0.5311374068260193,
252
  "learning_rate": 9.931806517013612e-06,
253
- "loss": 0.3577,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.15450643776824036,
258
- "grad_norm": 0.48534247279167175,
259
  "learning_rate": 9.918879275179819e-06,
260
- "loss": 0.3407,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.15879828326180256,
265
- "grad_norm": 0.5369582772254944,
266
  "learning_rate": 9.904840646737346e-06,
267
- "loss": 0.3724,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.1630901287553648,
272
- "grad_norm": 0.4897380471229553,
273
  "learning_rate": 9.889693803616793e-06,
274
- "loss": 0.3685,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.16738197424892703,
279
- "grad_norm": 0.5207609534263611,
280
  "learning_rate": 9.873442168142158e-06,
281
- "loss": 0.3601,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.17167381974248927,
286
- "grad_norm": 0.4904135763645172,
287
  "learning_rate": 9.856089412257605e-06,
288
- "loss": 0.335,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.1759656652360515,
293
- "grad_norm": 0.5017397403717041,
294
  "learning_rate": 9.837639456697802e-06,
295
- "loss": 0.3815,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.18025751072961374,
300
- "grad_norm": 0.4633351266384125,
301
  "learning_rate": 9.818096470102067e-06,
302
- "loss": 0.3338,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.18454935622317598,
307
- "grad_norm": 0.4839096665382385,
308
  "learning_rate": 9.797464868072489e-06,
309
- "loss": 0.352,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.1888412017167382,
314
- "grad_norm": 0.48044103384017944,
315
  "learning_rate": 9.775749312176249e-06,
316
- "loss": 0.3295,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.19313304721030042,
321
- "grad_norm": 0.4903911352157593,
322
  "learning_rate": 9.752954708892379e-06,
323
- "loss": 0.339,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.19742489270386265,
328
- "grad_norm": 0.5130500197410583,
329
  "learning_rate": 9.729086208503174e-06,
330
- "loss": 0.3409,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.2017167381974249,
335
- "grad_norm": 0.5341370701789856,
336
  "learning_rate": 9.704149203930522e-06,
337
- "loss": 0.3637,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.20600858369098712,
342
- "grad_norm": 0.534353494644165,
343
  "learning_rate": 9.67814932951741e-06,
344
- "loss": 0.3708,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.21030042918454936,
349
- "grad_norm": 0.5091646909713745,
350
  "learning_rate": 9.651092459754879e-06,
351
- "loss": 0.372,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.2145922746781116,
356
- "grad_norm": 0.46878930926322937,
357
  "learning_rate": 9.622984707954732e-06,
358
- "loss": 0.3481,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.21888412017167383,
363
- "grad_norm": 0.43085217475891113,
364
  "learning_rate": 9.593832424868271e-06,
365
- "loss": 0.3324,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.22317596566523606,
370
- "grad_norm": 0.5066664218902588,
371
  "learning_rate": 9.563642197251382e-06,
372
- "loss": 0.3703,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.22746781115879827,
377
- "grad_norm": 0.4604777991771698,
378
  "learning_rate": 9.532420846376316e-06,
379
- "loss": 0.3281,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.2317596566523605,
384
- "grad_norm": 0.46397772431373596,
385
  "learning_rate": 9.500175426490455e-06,
386
- "loss": 0.3161,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.23605150214592274,
391
- "grad_norm": 0.4790486693382263,
392
  "learning_rate": 9.466913223222467e-06,
393
- "loss": 0.3485,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.24034334763948498,
398
- "grad_norm": 0.46051961183547974,
399
  "learning_rate": 9.432641751936162e-06,
400
- "loss": 0.3293,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.2446351931330472,
405
- "grad_norm": 0.4890981912612915,
406
  "learning_rate": 9.397368756032445e-06,
407
  "loss": 0.3435,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.24892703862660945,
412
- "grad_norm": 0.4803486764431,
413
  "learning_rate": 9.361102205199762e-06,
414
- "loss": 0.3455,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.2532188841201717,
419
- "grad_norm": 0.4754889905452728,
420
  "learning_rate": 9.32385029361338e-06,
421
- "loss": 0.3235,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.2575107296137339,
426
- "grad_norm": 0.4711175858974457,
427
  "learning_rate": 9.285621438083997e-06,
428
- "loss": 0.3184,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.26180257510729615,
433
- "grad_norm": 0.508696973323822,
434
  "learning_rate": 9.246424276156008e-06,
435
- "loss": 0.329,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.26609442060085836,
440
- "grad_norm": 0.45938849449157715,
441
  "learning_rate": 9.206267664155906e-06,
442
- "loss": 0.3269,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.2703862660944206,
447
- "grad_norm": 0.4641532003879547,
448
  "learning_rate": 9.165160675191272e-06,
449
- "loss": 0.3315,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.27467811158798283,
454
- "grad_norm": 0.4699811041355133,
455
  "learning_rate": 9.123112597100759e-06,
456
- "loss": 0.3362,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.27896995708154504,
461
- "grad_norm": 0.5028325319290161,
462
  "learning_rate": 9.080132930355567e-06,
463
- "loss": 0.332,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.2832618025751073,
468
- "grad_norm": 0.45282426476478577,
469
  "learning_rate": 9.03623138591289e-06,
470
- "loss": 0.3073,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.2875536480686695,
475
- "grad_norm": 0.43206533789634705,
476
  "learning_rate": 8.99141788302178e-06,
477
- "loss": 0.3068,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.2918454935622318,
482
- "grad_norm": 0.45906496047973633,
483
  "learning_rate": 8.94570254698197e-06,
484
- "loss": 0.2993,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.296137339055794,
489
- "grad_norm": 0.44725412130355835,
490
  "learning_rate": 8.899095706856122e-06,
491
- "loss": 0.3211,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.30042918454935624,
496
- "grad_norm": 0.48440036177635193,
497
  "learning_rate": 8.851607893136065e-06,
498
- "loss": 0.353,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.30472103004291845,
503
- "grad_norm": 0.5581377744674683,
504
  "learning_rate": 8.803249835363486e-06,
505
- "loss": 0.2963,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.3090128755364807,
510
- "grad_norm": 0.5228906869888306,
511
  "learning_rate": 8.754032459705672e-06,
512
- "loss": 0.3458,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.3133047210300429,
517
- "grad_norm": 0.5844430327415466,
518
  "learning_rate": 8.703966886486819e-06,
519
- "loss": 0.3285,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.31759656652360513,
524
- "grad_norm": 0.4595542848110199,
525
  "learning_rate": 8.65306442767547e-06,
526
- "loss": 0.32,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.3218884120171674,
531
- "grad_norm": 0.47302451729774475,
532
  "learning_rate": 8.601336584328659e-06,
533
- "loss": 0.3359,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3261802575107296,
538
- "grad_norm": 0.5394320487976074,
539
  "learning_rate": 8.548795043993316e-06,
540
- "loss": 0.2956,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.33047210300429186,
545
- "grad_norm": 0.4741763472557068,
546
  "learning_rate": 8.495451678065563e-06,
547
- "loss": 0.332,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.33476394849785407,
552
- "grad_norm": 0.5460072159767151,
553
  "learning_rate": 8.441318539108433e-06,
554
- "loss": 0.3453,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.33905579399141633,
559
- "grad_norm": 0.4681239128112793,
560
  "learning_rate": 8.386407858128707e-06,
561
- "loss": 0.3374,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.34334763948497854,
566
- "grad_norm": 0.44867822527885437,
567
  "learning_rate": 8.330732041813367e-06,
568
- "loss": 0.3129,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.34763948497854075,
573
- "grad_norm": 0.4977342188358307,
574
  "learning_rate": 8.274303669726427e-06,
575
- "loss": 0.3433,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.351931330472103,
580
- "grad_norm": 0.488757461309433,
581
  "learning_rate": 8.217135491466636e-06,
582
- "loss": 0.3465,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.3562231759656652,
587
- "grad_norm": 0.42737138271331787,
588
  "learning_rate": 8.15924042378682e-06,
589
- "loss": 0.3045,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.3605150214592275,
594
- "grad_norm": 0.4217222332954407,
595
  "learning_rate": 8.100631547675417e-06,
596
- "loss": 0.3042,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.3648068669527897,
601
- "grad_norm": 0.4778914749622345,
602
  "learning_rate": 8.041322105400923e-06,
603
- "loss": 0.3344,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.36909871244635195,
608
- "grad_norm": 0.454506516456604,
609
  "learning_rate": 7.981325497519892e-06,
610
- "loss": 0.3193,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.37339055793991416,
615
- "grad_norm": 0.4511786997318268,
616
  "learning_rate": 7.920655279849173e-06,
617
- "loss": 0.3219,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.3776824034334764,
622
- "grad_norm": 0.42557790875434875,
623
  "learning_rate": 7.859325160403073e-06,
624
- "loss": 0.3402,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.38197424892703863,
629
- "grad_norm": 0.46827811002731323,
630
  "learning_rate": 7.797348996296116e-06,
631
- "loss": 0.3206,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.38626609442060084,
636
- "grad_norm": 0.45431816577911377,
637
  "learning_rate": 7.734740790612137e-06,
638
- "loss": 0.3408,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.3905579399141631,
643
- "grad_norm": 0.4279470145702362,
644
  "learning_rate": 7.671514689240366e-06,
645
- "loss": 0.3078,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.3948497854077253,
650
- "grad_norm": 0.4454224109649658,
651
  "learning_rate": 7.607684977679284e-06,
652
- "loss": 0.2979,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.39914163090128757,
657
- "grad_norm": 0.4116591513156891,
658
  "learning_rate": 7.543266077808893e-06,
659
- "loss": 0.2848,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.4034334763948498,
664
- "grad_norm": 0.45302438735961914,
665
  "learning_rate": 7.478272544632204e-06,
666
- "loss": 0.3202,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.40772532188841204,
671
- "grad_norm": 0.4374842941761017,
672
  "learning_rate": 7.412719062986632e-06,
673
- "loss": 0.3376,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.41201716738197425,
678
- "grad_norm": 0.40668705105781555,
679
  "learning_rate": 7.3466204442260605e-06,
680
  "loss": 0.276,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.41630901287553645,
685
- "grad_norm": 0.4532172381877899,
686
  "learning_rate": 7.279991622874319e-06,
687
- "loss": 0.342,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.4206008583690987,
692
- "grad_norm": 0.46435272693634033,
693
  "learning_rate": 7.212847653250828e-06,
694
- "loss": 0.3,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.4248927038626609,
699
- "grad_norm": 0.48846691846847534,
700
  "learning_rate": 7.145203706069183e-06,
701
- "loss": 0.2866,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.4291845493562232,
706
- "grad_norm": 0.43814221024513245,
707
  "learning_rate": 7.0770750650094335e-06,
708
- "loss": 0.3376,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.4334763948497854,
713
- "grad_norm": 0.45952096581459045,
714
  "learning_rate": 7.008477123264849e-06,
715
- "loss": 0.344,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.43776824034334766,
720
- "grad_norm": 0.452181875705719,
721
  "learning_rate": 6.939425380063924e-06,
722
- "loss": 0.315,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.44206008583690987,
727
- "grad_norm": 0.44957470893859863,
728
  "learning_rate": 6.869935437168449e-06,
729
- "loss": 0.3224,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.44635193133047213,
734
- "grad_norm": 0.4448547959327698,
735
  "learning_rate": 6.800022995348381e-06,
736
  "loss": 0.3182,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.45064377682403434,
741
- "grad_norm": 0.43706539273262024,
742
  "learning_rate": 6.729703850834381e-06,
743
- "loss": 0.3254,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.45493562231759654,
748
- "grad_norm": 0.4131150245666504,
749
  "learning_rate": 6.65899389174876e-06,
750
- "loss": 0.2827,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.4592274678111588,
755
- "grad_norm": 0.41083696484565735,
756
  "learning_rate": 6.587909094515663e-06,
757
- "loss": 0.2954,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.463519313304721,
762
- "grad_norm": 0.3907100558280945,
763
  "learning_rate": 6.5164655202513135e-06,
764
- "loss": 0.2797,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.4678111587982833,
769
- "grad_norm": 0.4586903154850006,
770
  "learning_rate": 6.444679311135112e-06,
771
- "loss": 0.3409,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.4721030042918455,
776
- "grad_norm": 0.46873989701271057,
777
  "learning_rate": 6.372566686762427e-06,
778
- "loss": 0.3423,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.47639484978540775,
783
- "grad_norm": 0.44646352529525757,
784
  "learning_rate": 6.300143940479881e-06,
785
- "loss": 0.325,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.48068669527896996,
790
- "grad_norm": 0.4229104518890381,
791
  "learning_rate": 6.227427435703997e-06,
792
  "loss": 0.2892,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.48497854077253216,
797
- "grad_norm": 0.409986674785614,
798
  "learning_rate": 6.154433602223979e-06,
799
- "loss": 0.2689,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.4892703862660944,
804
- "grad_norm": 0.4755241572856903,
805
  "learning_rate": 6.0811789324895365e-06,
806
- "loss": 0.3003,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.49356223175965663,
811
- "grad_norm": 0.4277583062648773,
812
  "learning_rate": 6.0076799778845105e-06,
813
- "loss": 0.3062,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.4978540772532189,
818
- "grad_norm": 0.422399640083313,
819
  "learning_rate": 5.933953344987215e-06,
820
- "loss": 0.3116,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.5021459227467812,
825
- "grad_norm": 0.44285473227500916,
826
  "learning_rate": 5.860015691818292e-06,
827
- "loss": 0.3313,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.5064377682403434,
832
- "grad_norm": 0.4164673089981079,
833
  "learning_rate": 5.78588372407695e-06,
834
- "loss": 0.2907,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.5107296137339056,
839
- "grad_norm": 0.4171607196331024,
840
  "learning_rate": 5.711574191366427e-06,
841
- "loss": 0.3078,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.5150214592274678,
846
- "grad_norm": 0.43039482831954956,
847
  "learning_rate": 5.637103883409525e-06,
848
- "loss": 0.3045,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.51931330472103,
853
- "grad_norm": 0.44303321838378906,
854
  "learning_rate": 5.562489626255104e-06,
855
- "loss": 0.3128,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.5236051502145923,
860
- "grad_norm": 0.4535645544528961,
861
  "learning_rate": 5.487748278476342e-06,
862
- "loss": 0.3126,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.5278969957081545,
867
- "grad_norm": 0.3880036771297455,
868
  "learning_rate": 5.412896727361663e-06,
869
- "loss": 0.2516,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.5321888412017167,
874
- "grad_norm": 0.4334699213504791,
875
  "learning_rate": 5.337951885099167e-06,
876
- "loss": 0.29,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.5364806866952789,
881
- "grad_norm": 0.46000969409942627,
882
  "learning_rate": 5.262930684955439e-06,
883
- "loss": 0.2925,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.5407725321888412,
888
- "grad_norm": 0.4380834996700287,
889
  "learning_rate": 5.187850077449604e-06,
890
  "loss": 0.317,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.5450643776824035,
895
- "grad_norm": 0.47088897228240967,
896
  "learning_rate": 5.112727026523461e-06,
897
- "loss": 0.2757,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.5493562231759657,
902
- "grad_norm": 0.4383191764354706,
903
  "learning_rate": 5.03757850570861e-06,
904
  "loss": 0.3066,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.5536480686695279,
909
- "grad_norm": 0.4193175733089447,
910
  "learning_rate": 4.9624214942913916e-06,
911
- "loss": 0.3012,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.5579399141630901,
916
- "grad_norm": 0.4122987687587738,
917
  "learning_rate": 4.88727297347654e-06,
918
- "loss": 0.3068,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.5622317596566524,
923
- "grad_norm": 0.4264082908630371,
924
  "learning_rate": 4.8121499225503974e-06,
925
  "loss": 0.2874,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.5665236051502146,
930
- "grad_norm": 0.4167967140674591,
931
  "learning_rate": 4.737069315044562e-06,
932
- "loss": 0.2961,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.5708154506437768,
937
- "grad_norm": 0.4843411445617676,
938
  "learning_rate": 4.662048114900837e-06,
939
- "loss": 0.3242,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.575107296137339,
944
- "grad_norm": 0.4601851999759674,
945
  "learning_rate": 4.587103272638339e-06,
946
- "loss": 0.3267,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.5793991416309013,
951
- "grad_norm": 0.41821733117103577,
952
  "learning_rate": 4.512251721523659e-06,
953
  "loss": 0.2671,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.5836909871244635,
958
- "grad_norm": 0.4290674328804016,
959
  "learning_rate": 4.437510373744897e-06,
960
- "loss": 0.3081,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.5879828326180258,
965
- "grad_norm": 0.5026841163635254,
966
  "learning_rate": 4.362896116590475e-06,
967
- "loss": 0.3039,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.592274678111588,
972
- "grad_norm": 0.4369226396083832,
973
  "learning_rate": 4.2884258086335755e-06,
974
- "loss": 0.3234,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.5965665236051502,
979
- "grad_norm": 0.4451539218425751,
980
  "learning_rate": 4.214116275923051e-06,
981
- "loss": 0.2728,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.6008583690987125,
986
- "grad_norm": 0.4773256480693817,
987
  "learning_rate": 4.1399843081817085e-06,
988
- "loss": 0.3123,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.6051502145922747,
993
- "grad_norm": 0.42359718680381775,
994
  "learning_rate": 4.066046655012786e-06,
995
- "loss": 0.3048,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.6094420600858369,
1000
- "grad_norm": 0.45454850792884827,
1001
  "learning_rate": 3.992320022115492e-06,
1002
- "loss": 0.3159,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.6137339055793991,
1007
- "grad_norm": 0.4412500262260437,
1008
  "learning_rate": 3.918821067510464e-06,
1009
- "loss": 0.2924,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.6180257510729614,
1014
- "grad_norm": 0.4579533338546753,
1015
  "learning_rate": 3.845566397776022e-06,
1016
- "loss": 0.3308,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.6223175965665236,
1021
- "grad_norm": 0.44342905282974243,
1022
  "learning_rate": 3.7725725642960047e-06,
1023
- "loss": 0.3059,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.6266094420600858,
1028
- "grad_norm": 0.4391109347343445,
1029
  "learning_rate": 3.6998560595201188e-06,
1030
- "loss": 0.2911,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.630901287553648,
1035
- "grad_norm": 0.4317631125450134,
1036
  "learning_rate": 3.627433313237576e-06,
1037
- "loss": 0.2961,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.6351931330472103,
1042
- "grad_norm": 0.4535035789012909,
1043
  "learning_rate": 3.555320688864889e-06,
1044
- "loss": 0.291,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.6394849785407726,
1049
- "grad_norm": 0.4161687195301056,
1050
  "learning_rate": 3.483534479748688e-06,
1051
- "loss": 0.2915,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.6437768240343348,
1056
- "grad_norm": 0.4346662163734436,
1057
  "learning_rate": 3.4120909054843375e-06,
1058
- "loss": 0.2831,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.648068669527897,
1063
- "grad_norm": 0.4250935912132263,
1064
  "learning_rate": 3.3410061082512422e-06,
1065
- "loss": 0.2583,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.6523605150214592,
1070
- "grad_norm": 0.43957194685935974,
1071
  "learning_rate": 3.2702961491656197e-06,
1072
- "loss": 0.286,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.6566523605150214,
1077
- "grad_norm": 0.4373530149459839,
1078
  "learning_rate": 3.1999770046516198e-06,
1079
- "loss": 0.2828,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.6609442060085837,
1084
- "grad_norm": 0.40731024742126465,
1085
  "learning_rate": 3.130064562831553e-06,
1086
- "loss": 0.2761,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.6652360515021459,
1091
- "grad_norm": 0.6041237711906433,
1092
  "learning_rate": 3.0605746199360755e-06,
1093
- "loss": 0.2984,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.6695278969957081,
1098
- "grad_norm": 0.41370561718940735,
1099
  "learning_rate": 2.991522876735154e-06,
1100
- "loss": 0.2862,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.6738197424892703,
1105
- "grad_norm": 0.4284398853778839,
1106
  "learning_rate": 2.9229249349905686e-06,
1107
- "loss": 0.3019,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.6781115879828327,
1112
- "grad_norm": 0.45468419790267944,
1113
  "learning_rate": 2.8547962939308187e-06,
1114
- "loss": 0.2831,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.6824034334763949,
1119
- "grad_norm": 0.4251733124256134,
1120
  "learning_rate": 2.787152346749173e-06,
1121
- "loss": 0.2993,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.6866952789699571,
1126
- "grad_norm": 0.4290471374988556,
1127
  "learning_rate": 2.720008377125682e-06,
1128
- "loss": 0.2805,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.6909871244635193,
1133
- "grad_norm": 0.4311181902885437,
1134
  "learning_rate": 2.6533795557739407e-06,
1135
  "loss": 0.3238,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.6952789699570815,
1140
- "grad_norm": 0.4299990236759186,
1141
  "learning_rate": 2.5872809370133704e-06,
1142
- "loss": 0.2939,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.6995708154506438,
1147
- "grad_norm": 0.47888845205307007,
1148
  "learning_rate": 2.5217274553677975e-06,
1149
  "loss": 0.2985,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.703862660944206,
1154
- "grad_norm": 0.44146785140037537,
1155
  "learning_rate": 2.4567339221911086e-06,
1156
- "loss": 0.2931,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.7081545064377682,
1161
- "grad_norm": 0.44416606426239014,
1162
  "learning_rate": 2.3923150223207176e-06,
1163
- "loss": 0.3113,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.7124463519313304,
1168
- "grad_norm": 0.42069002985954285,
1169
  "learning_rate": 2.328485310759635e-06,
1170
- "loss": 0.2866,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.7167381974248928,
1175
- "grad_norm": 0.46958863735198975,
1176
  "learning_rate": 2.265259209387867e-06,
1177
- "loss": 0.3047,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.721030042918455,
1182
- "grad_norm": 0.41116032004356384,
1183
  "learning_rate": 2.202651003703885e-06,
1184
- "loss": 0.2811,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.7253218884120172,
1189
- "grad_norm": 0.42484262585639954,
1190
  "learning_rate": 2.140674839596931e-06,
1191
- "loss": 0.2657,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.7296137339055794,
1196
- "grad_norm": 0.44194865226745605,
1197
  "learning_rate": 2.0793447201508288e-06,
1198
- "loss": 0.315,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.7339055793991416,
1203
- "grad_norm": 0.43406444787979126,
1204
  "learning_rate": 2.01867450248011e-06,
1205
- "loss": 0.3026,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.7381974248927039,
1210
- "grad_norm": 0.4179142117500305,
1211
  "learning_rate": 1.9586778945990785e-06,
1212
- "loss": 0.2812,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.7424892703862661,
1217
- "grad_norm": 0.4454743564128876,
1218
  "learning_rate": 1.8993684523245842e-06,
1219
- "loss": 0.291,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.7467811158798283,
1224
- "grad_norm": 0.4816957116127014,
1225
  "learning_rate": 1.8407595762131814e-06,
1226
- "loss": 0.3409,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.7510729613733905,
1231
- "grad_norm": 0.46107807755470276,
1232
  "learning_rate": 1.7828645085333645e-06,
1233
- "loss": 0.3469,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.7553648068669528,
1238
- "grad_norm": 0.4322707951068878,
1239
  "learning_rate": 1.7256963302735752e-06,
1240
- "loss": 0.3175,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.759656652360515,
1245
- "grad_norm": 0.45718640089035034,
1246
  "learning_rate": 1.6692679581866334e-06,
1247
- "loss": 0.3053,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.7639484978540773,
1252
- "grad_norm": 0.42895740270614624,
1253
  "learning_rate": 1.6135921418712959e-06,
1254
- "loss": 0.2732,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.7682403433476395,
1259
- "grad_norm": 0.4421488046646118,
1260
  "learning_rate": 1.5586814608915673e-06,
1261
- "loss": 0.3137,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.7725321888412017,
1266
- "grad_norm": 0.4177413582801819,
1267
  "learning_rate": 1.5045483219344387e-06,
1268
- "loss": 0.279,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.776824034334764,
1273
- "grad_norm": 0.4410814344882965,
1274
  "learning_rate": 1.4512049560066837e-06,
1275
  "loss": 0.2758,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.7811158798283262,
1280
- "grad_norm": 0.4211938679218292,
1281
  "learning_rate": 1.3986634156713418e-06,
1282
- "loss": 0.2844,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.7854077253218884,
1287
- "grad_norm": 0.41497108340263367,
1288
  "learning_rate": 1.3469355723245303e-06,
1289
- "loss": 0.2867,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.7896995708154506,
1294
- "grad_norm": 0.4306085705757141,
1295
  "learning_rate": 1.2960331135131826e-06,
1296
- "loss": 0.3287,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.7939914163090128,
1301
- "grad_norm": 0.4141402542591095,
1302
  "learning_rate": 1.245967540294329e-06,
1303
- "loss": 0.2855,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.7982832618025751,
1308
- "grad_norm": 0.3995893597602844,
1309
  "learning_rate": 1.1967501646365147e-06,
1310
- "loss": 0.2761,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.8025751072961373,
1315
- "grad_norm": 0.4385574758052826,
1316
  "learning_rate": 1.1483921068639353e-06,
1317
- "loss": 0.2758,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.8068669527896996,
1322
- "grad_norm": 0.45672720670700073,
1323
  "learning_rate": 1.1009042931438784e-06,
1324
- "loss": 0.2989,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.8111587982832618,
1329
- "grad_norm": 0.44737428426742554,
1330
  "learning_rate": 1.0542974530180327e-06,
1331
- "loss": 0.3073,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.8154506437768241,
1336
- "grad_norm": 0.3771262764930725,
1337
  "learning_rate": 1.00858211697822e-06,
1338
- "loss": 0.2655,
1339
  "step": 190
1340
  },
1341
  {
1342
  "epoch": 0.8197424892703863,
1343
- "grad_norm": 0.42601287364959717,
1344
  "learning_rate": 9.637686140871121e-07,
1345
- "loss": 0.2911,
1346
  "step": 191
1347
  },
1348
  {
1349
  "epoch": 0.8240343347639485,
1350
- "grad_norm": 0.40493884682655334,
1351
  "learning_rate": 9.198670696444339e-07,
1352
- "loss": 0.2806,
1353
  "step": 192
1354
  },
1355
  {
1356
  "epoch": 0.8283261802575107,
1357
- "grad_norm": 0.496890664100647,
1358
  "learning_rate": 8.768874028992431e-07,
1359
- "loss": 0.3193,
1360
  "step": 193
1361
  },
1362
  {
1363
  "epoch": 0.8326180257510729,
1364
- "grad_norm": 0.41898757219314575,
1365
  "learning_rate": 8.348393248087289e-07,
1366
- "loss": 0.3184,
1367
  "step": 194
1368
  },
1369
  {
1370
  "epoch": 0.8369098712446352,
1371
- "grad_norm": 0.45460304617881775,
1372
  "learning_rate": 7.937323358440935e-07,
1373
- "loss": 0.3412,
1374
  "step": 195
1375
  },
1376
  {
1377
  "epoch": 0.8412017167381974,
1378
- "grad_norm": 0.44133260846138,
1379
  "learning_rate": 7.535757238439939e-07,
1380
- "loss": 0.3116,
1381
  "step": 196
1382
  },
1383
  {
1384
  "epoch": 0.8454935622317596,
1385
- "grad_norm": 0.393711119890213,
1386
  "learning_rate": 7.143785619160026e-07,
1387
- "loss": 0.2788,
1388
  "step": 197
1389
  },
1390
  {
1391
  "epoch": 0.8497854077253219,
1392
- "grad_norm": 0.4073408842086792,
1393
  "learning_rate": 6.761497063866207e-07,
1394
- "loss": 0.2842,
1395
  "step": 198
1396
  },
1397
  {
1398
  "epoch": 0.8540772532188842,
1399
- "grad_norm": 0.43793272972106934,
1400
  "learning_rate": 6.388977948002406e-07,
1401
- "loss": 0.3027,
1402
  "step": 199
1403
  },
1404
  {
1405
  "epoch": 0.8583690987124464,
1406
- "grad_norm": 0.3962232172489166,
1407
  "learning_rate": 6.026312439675553e-07,
1408
- "loss": 0.2752,
1409
  "step": 200
1410
  },
1411
  {
1412
  "epoch": 0.8626609442060086,
1413
- "grad_norm": 0.44113269448280334,
1414
  "learning_rate": 5.673582480638395e-07,
1415
- "loss": 0.3019,
1416
  "step": 201
1417
  },
1418
  {
1419
  "epoch": 0.8669527896995708,
1420
- "grad_norm": 0.44335755705833435,
1421
  "learning_rate": 5.330867767775333e-07,
1422
- "loss": 0.3393,
1423
  "step": 202
1424
  },
1425
  {
1426
  "epoch": 0.871244635193133,
1427
- "grad_norm": 0.41239792108535767,
1428
  "learning_rate": 4.998245735095459e-07,
1429
- "loss": 0.3049,
1430
  "step": 203
1431
  },
1432
  {
1433
  "epoch": 0.8755364806866953,
1434
- "grad_norm": 0.4250977337360382,
1435
  "learning_rate": 4.6757915362368567e-07,
1436
- "loss": 0.2875,
1437
  "step": 204
1438
  },
1439
  {
1440
  "epoch": 0.8798283261802575,
1441
- "grad_norm": 0.43631571531295776,
1442
  "learning_rate": 4.363578027486187e-07,
1443
- "loss": 0.3031,
1444
  "step": 205
1445
  },
1446
  {
1447
  "epoch": 0.8841201716738197,
1448
- "grad_norm": 0.4137750566005707,
1449
  "learning_rate": 4.0616757513173123e-07,
1450
  "loss": 0.2828,
1451
  "step": 206
1452
  },
1453
  {
1454
  "epoch": 0.8884120171673819,
1455
- "grad_norm": 0.4102308452129364,
1456
  "learning_rate": 3.7701529204526856e-07,
1457
- "loss": 0.2942,
1458
  "step": 207
1459
  },
1460
  {
1461
  "epoch": 0.8927038626609443,
1462
- "grad_norm": 0.4059811532497406,
1463
  "learning_rate": 3.4890754024512254e-07,
1464
- "loss": 0.3024,
1465
  "step": 208
1466
  },
1467
  {
1468
  "epoch": 0.8969957081545065,
1469
- "grad_norm": 0.4469408392906189,
1470
  "learning_rate": 3.2185067048259245e-07,
1471
  "loss": 0.2926,
1472
  "step": 209
1473
  },
1474
  {
1475
  "epoch": 0.9012875536480687,
1476
- "grad_norm": 0.4502747356891632,
1477
  "learning_rate": 2.9585079606947843e-07,
1478
- "loss": 0.2982,
1479
  "step": 210
1480
  },
1481
  {
1482
  "epoch": 0.9055793991416309,
1483
- "grad_norm": 0.4147951602935791,
1484
  "learning_rate": 2.7091379149682683e-07,
1485
- "loss": 0.2735,
1486
  "step": 211
1487
  },
1488
  {
1489
  "epoch": 0.9098712446351931,
1490
- "grad_norm": 0.39468127489089966,
1491
  "learning_rate": 2.470452911076227e-07,
1492
- "loss": 0.2775,
1493
  "step": 212
1494
  },
1495
  {
1496
  "epoch": 0.9141630901287554,
1497
- "grad_norm": 0.4191775321960449,
1498
  "learning_rate": 2.242506878237538e-07,
1499
- "loss": 0.2956,
1500
  "step": 213
1501
  },
1502
  {
1503
  "epoch": 0.9184549356223176,
1504
- "grad_norm": 0.384409636259079,
1505
  "learning_rate": 2.0253513192751374e-07,
1506
- "loss": 0.2829,
1507
  "step": 214
1508
  },
1509
  {
1510
  "epoch": 0.9227467811158798,
1511
- "grad_norm": 0.41374218463897705,
1512
  "learning_rate": 1.8190352989793325e-07,
1513
  "loss": 0.2807,
1514
  "step": 215
1515
  },
1516
  {
1517
  "epoch": 0.927038626609442,
1518
- "grad_norm": 0.45746806263923645,
1519
  "learning_rate": 1.6236054330219853e-07,
1520
- "loss": 0.3002,
1521
  "step": 216
1522
  },
1523
  {
1524
  "epoch": 0.9313304721030042,
1525
- "grad_norm": 0.41695234179496765,
1526
  "learning_rate": 1.439105877423963e-07,
1527
- "loss": 0.2962,
1528
  "step": 217
1529
  },
1530
  {
1531
  "epoch": 0.9356223175965666,
1532
- "grad_norm": 0.44064605236053467,
1533
  "learning_rate": 1.2655783185784253e-07,
1534
- "loss": 0.2977,
1535
  "step": 218
1536
  },
1537
  {
1538
  "epoch": 0.9399141630901288,
1539
- "grad_norm": 0.471942663192749,
1540
  "learning_rate": 1.1030619638320805e-07,
1541
- "loss": 0.2961,
1542
  "step": 219
1543
  },
1544
  {
1545
  "epoch": 0.944206008583691,
1546
- "grad_norm": 0.4514838755130768,
1547
  "learning_rate": 9.51593532626538e-08,
1548
- "loss": 0.2805,
1549
  "step": 220
1550
  },
1551
  {
1552
  "epoch": 0.9484978540772532,
1553
- "grad_norm": 0.4152979552745819,
1554
  "learning_rate": 8.11207248201834e-08,
1555
- "loss": 0.291,
1556
  "step": 221
1557
  },
1558
  {
1559
  "epoch": 0.9527896995708155,
1560
- "grad_norm": 0.4407040476799011,
1561
  "learning_rate": 6.819348298638839e-08,
1562
- "loss": 0.3254,
1563
  "step": 222
1564
  },
1565
  {
1566
  "epoch": 0.9570815450643777,
1567
- "grad_norm": 0.4180196225643158,
1568
  "learning_rate": 5.638054858177644e-08,
1569
- "loss": 0.2787,
1570
  "step": 223
1571
  },
1572
  {
1573
  "epoch": 0.9613733905579399,
1574
- "grad_norm": 0.4244682192802429,
1575
  "learning_rate": 4.568459065683206e-08,
1576
- "loss": 0.2874,
1577
  "step": 224
1578
  },
1579
  {
1580
  "epoch": 0.9656652360515021,
1581
- "grad_norm": 0.43256813287734985,
1582
  "learning_rate": 3.610802588895845e-08,
1583
- "loss": 0.3175,
1584
  "step": 225
1585
  },
1586
  {
1587
  "epoch": 0.9699570815450643,
1588
- "grad_norm": 0.39380866289138794,
1589
  "learning_rate": 2.765301803645426e-08,
1590
- "loss": 0.2589,
1591
  "step": 226
1592
  },
1593
  {
1594
  "epoch": 0.9742489270386266,
1595
- "grad_norm": 0.43451765179634094,
1596
  "learning_rate": 2.0321477449619098e-08,
1597
- "loss": 0.3095,
1598
  "step": 227
1599
  },
1600
  {
1601
  "epoch": 0.9785407725321889,
1602
- "grad_norm": 0.4835575819015503,
1603
  "learning_rate": 1.411506063912882e-08,
1604
- "loss": 0.323,
1605
  "step": 228
1606
  },
1607
  {
1608
  "epoch": 0.9828326180257511,
1609
- "grad_norm": 0.43744510412216187,
1610
  "learning_rate": 9.035169901754902e-09,
1611
- "loss": 0.2833,
1612
  "step": 229
1613
  },
1614
  {
1615
  "epoch": 0.9871244635193133,
1616
- "grad_norm": 0.41378504037857056,
1617
  "learning_rate": 5.082953003528457e-09,
1618
  "loss": 0.2759,
1619
  "step": 230
1620
  },
1621
  {
1622
  "epoch": 0.9914163090128756,
1623
- "grad_norm": 0.39382418990135193,
1624
  "learning_rate": 2.2593029204076578e-09,
1625
- "loss": 0.2789,
1626
  "step": 231
1627
  },
1628
  {
1629
  "epoch": 0.9957081545064378,
1630
- "grad_norm": 0.4490097761154175,
1631
  "learning_rate": 5.648576365169245e-10,
1632
- "loss": 0.3077,
1633
  "step": 232
1634
  },
1635
  {
1636
  "epoch": 1.0,
1637
- "grad_norm": 0.4198805093765259,
1638
  "learning_rate": 0.0,
1639
- "loss": 0.3028,
1640
  "step": 233
1641
  },
1642
  {
1643
  "epoch": 1.0,
1644
- "eval_loss": 0.3927557170391083,
1645
- "eval_runtime": 1.5152,
1646
- "eval_samples_per_second": 151.796,
1647
- "eval_steps_per_second": 6.6,
1648
  "step": 233
1649
  },
1650
  {
1651
  "epoch": 1.0,
1652
  "step": 233,
1653
  "total_flos": 1.933383782157517e+16,
1654
- "train_loss": 0.43344665942273936,
1655
- "train_runtime": 303.441,
1656
- "train_samples_per_second": 36.712,
1657
- "train_steps_per_second": 0.768
1658
  }
1659
  ],
1660
  "logging_steps": 1,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.004291845493562232,
13
+ "grad_norm": 6.858954429626465,
14
  "learning_rate": 4.1666666666666667e-07,
15
  "loss": 2.3401,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.008583690987124463,
20
+ "grad_norm": 7.1901326179504395,
21
  "learning_rate": 8.333333333333333e-07,
22
  "loss": 2.3774,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.012875536480686695,
27
+ "grad_norm": 6.998518466949463,
28
  "learning_rate": 1.25e-06,
29
+ "loss": 2.3589,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.017167381974248927,
34
+ "grad_norm": 6.999710559844971,
35
  "learning_rate": 1.6666666666666667e-06,
36
+ "loss": 2.3641,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.02145922746781116,
41
+ "grad_norm": 6.936703681945801,
42
  "learning_rate": 2.0833333333333334e-06,
43
+ "loss": 2.3491,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.02575107296137339,
48
+ "grad_norm": 6.817142963409424,
49
  "learning_rate": 2.5e-06,
50
+ "loss": 2.28,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.030042918454935622,
55
+ "grad_norm": 6.60942268371582,
56
  "learning_rate": 2.916666666666667e-06,
57
+ "loss": 2.2736,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.034334763948497854,
62
+ "grad_norm": 5.893615245819092,
63
  "learning_rate": 3.3333333333333333e-06,
64
+ "loss": 2.1437,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.03862660944206009,
69
+ "grad_norm": 5.829619407653809,
70
  "learning_rate": 3.7500000000000005e-06,
71
+ "loss": 2.1175,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.04291845493562232,
76
+ "grad_norm": 5.597039699554443,
77
  "learning_rate": 4.166666666666667e-06,
78
+ "loss": 2.0907,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.04721030042918455,
83
+ "grad_norm": 4.393044948577881,
84
  "learning_rate": 4.583333333333333e-06,
85
+ "loss": 1.7745,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.05150214592274678,
90
+ "grad_norm": 4.335526466369629,
91
  "learning_rate": 5e-06,
92
+ "loss": 1.6767,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.055793991416309016,
97
+ "grad_norm": 4.237323760986328,
98
  "learning_rate": 5.416666666666667e-06,
99
+ "loss": 1.6224,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.060085836909871244,
104
+ "grad_norm": 3.4697413444519043,
105
  "learning_rate": 5.833333333333334e-06,
106
+ "loss": 1.1534,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.06437768240343347,
111
+ "grad_norm": 3.5880987644195557,
112
  "learning_rate": 6.25e-06,
113
+ "loss": 1.0938,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.06866952789699571,
118
+ "grad_norm": 3.3989222049713135,
119
  "learning_rate": 6.666666666666667e-06,
120
+ "loss": 1.0218,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.07296137339055794,
125
+ "grad_norm": 2.876070976257324,
126
  "learning_rate": 7.083333333333335e-06,
127
+ "loss": 0.9028,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.07725321888412018,
132
+ "grad_norm": 2.5071043968200684,
133
  "learning_rate": 7.500000000000001e-06,
134
+ "loss": 0.8486,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.0815450643776824,
139
+ "grad_norm": 2.0654499530792236,
140
  "learning_rate": 7.916666666666667e-06,
141
+ "loss": 0.5864,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.08583690987124463,
146
+ "grad_norm": 1.3941829204559326,
147
  "learning_rate": 8.333333333333334e-06,
148
+ "loss": 0.4791,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.09012875536480687,
153
+ "grad_norm": 1.0642451047897339,
154
  "learning_rate": 8.750000000000001e-06,
155
+ "loss": 0.4628,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.0944206008583691,
160
+ "grad_norm": 0.8186908960342407,
161
  "learning_rate": 9.166666666666666e-06,
162
+ "loss": 0.423,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.09871244635193133,
167
+ "grad_norm": 0.714146614074707,
168
  "learning_rate": 9.583333333333335e-06,
169
+ "loss": 0.4155,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.10300429184549356,
174
+ "grad_norm": 0.6719127297401428,
175
  "learning_rate": 1e-05,
176
+ "loss": 0.4172,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.1072961373390558,
181
+ "grad_norm": 0.6532909870147705,
182
  "learning_rate": 9.999435142363484e-06,
183
+ "loss": 0.3825,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.11158798283261803,
188
+ "grad_norm": 0.7833576202392578,
189
  "learning_rate": 9.997740697079595e-06,
190
+ "loss": 0.4211,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.11587982832618025,
195
+ "grad_norm": 0.6481274366378784,
196
  "learning_rate": 9.994917046996472e-06,
197
+ "loss": 0.3679,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.12017167381974249,
202
+ "grad_norm": 0.6363394856452942,
203
  "learning_rate": 9.990964830098246e-06,
204
+ "loss": 0.3789,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.12446351931330472,
209
+ "grad_norm": 0.6472516655921936,
210
  "learning_rate": 9.985884939360873e-06,
211
+ "loss": 0.4076,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.12875536480686695,
216
+ "grad_norm": 0.5628584623336792,
217
  "learning_rate": 9.979678522550382e-06,
218
+ "loss": 0.3832,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.13304721030042918,
223
+ "grad_norm": 0.5358518362045288,
224
  "learning_rate": 9.972346981963546e-06,
225
+ "loss": 0.3658,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.13733905579399142,
230
+ "grad_norm": 0.5220690965652466,
231
  "learning_rate": 9.963891974111042e-06,
232
+ "loss": 0.3743,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.14163090128755365,
237
+ "grad_norm": 0.544533908367157,
238
  "learning_rate": 9.95431540934317e-06,
239
+ "loss": 0.3732,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1459227467811159,
244
+ "grad_norm": 0.5741094946861267,
245
  "learning_rate": 9.943619451418225e-06,
246
+ "loss": 0.3981,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.15021459227467812,
251
+ "grad_norm": 0.5270912647247314,
252
  "learning_rate": 9.931806517013612e-06,
253
+ "loss": 0.3583,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.15450643776824036,
258
+ "grad_norm": 0.49920132756233215,
259
  "learning_rate": 9.918879275179819e-06,
260
+ "loss": 0.3414,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.15879828326180256,
265
+ "grad_norm": 0.5417159199714661,
266
  "learning_rate": 9.904840646737346e-06,
267
+ "loss": 0.3734,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.1630901287553648,
272
+ "grad_norm": 0.4884810149669647,
273
  "learning_rate": 9.889693803616793e-06,
274
+ "loss": 0.3682,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.16738197424892703,
279
+ "grad_norm": 0.48894229531288147,
280
  "learning_rate": 9.873442168142158e-06,
281
+ "loss": 0.3607,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.17167381974248927,
286
+ "grad_norm": 0.4725055396556854,
287
  "learning_rate": 9.856089412257605e-06,
288
+ "loss": 0.3348,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.1759656652360515,
293
+ "grad_norm": 0.499160498380661,
294
  "learning_rate": 9.837639456697802e-06,
295
+ "loss": 0.382,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.18025751072961374,
300
+ "grad_norm": 0.4489183723926544,
301
  "learning_rate": 9.818096470102067e-06,
302
+ "loss": 0.3341,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.18454935622317598,
307
+ "grad_norm": 0.4789492189884186,
308
  "learning_rate": 9.797464868072489e-06,
309
+ "loss": 0.3521,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.1888412017167382,
314
+ "grad_norm": 0.47436925768852234,
315
  "learning_rate": 9.775749312176249e-06,
316
+ "loss": 0.3302,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.19313304721030042,
321
+ "grad_norm": 0.4419718086719513,
322
  "learning_rate": 9.752954708892379e-06,
323
+ "loss": 0.3392,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.19742489270386265,
328
+ "grad_norm": 0.4800013601779938,
329
  "learning_rate": 9.729086208503174e-06,
330
+ "loss": 0.3416,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.2017167381974249,
335
+ "grad_norm": 0.48074233531951904,
336
  "learning_rate": 9.704149203930522e-06,
337
+ "loss": 0.3632,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.20600858369098712,
342
+ "grad_norm": 0.48519065976142883,
343
  "learning_rate": 9.67814932951741e-06,
344
+ "loss": 0.3706,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.21030042918454936,
349
+ "grad_norm": 0.46741950511932373,
350
  "learning_rate": 9.651092459754879e-06,
351
+ "loss": 0.3728,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.2145922746781116,
356
+ "grad_norm": 0.46153953671455383,
357
  "learning_rate": 9.622984707954732e-06,
358
+ "loss": 0.348,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.21888412017167383,
363
+ "grad_norm": 0.42454302310943604,
364
  "learning_rate": 9.593832424868271e-06,
365
+ "loss": 0.3319,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.22317596566523606,
370
+ "grad_norm": 0.47633853554725647,
371
  "learning_rate": 9.563642197251382e-06,
372
+ "loss": 0.3699,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.22746781115879827,
377
+ "grad_norm": 0.46166110038757324,
378
  "learning_rate": 9.532420846376316e-06,
379
+ "loss": 0.3278,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.2317596566523605,
384
+ "grad_norm": 0.4451698362827301,
385
  "learning_rate": 9.500175426490455e-06,
386
+ "loss": 0.3165,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.23605150214592274,
391
+ "grad_norm": 0.4674447774887085,
392
  "learning_rate": 9.466913223222467e-06,
393
+ "loss": 0.3487,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.24034334763948498,
398
+ "grad_norm": 0.44866660237312317,
399
  "learning_rate": 9.432641751936162e-06,
400
+ "loss": 0.3286,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.2446351931330472,
405
+ "grad_norm": 0.4655809700489044,
406
  "learning_rate": 9.397368756032445e-06,
407
  "loss": 0.3435,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.24892703862660945,
412
+ "grad_norm": 0.4543526768684387,
413
  "learning_rate": 9.361102205199762e-06,
414
+ "loss": 0.3448,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.2532188841201717,
419
+ "grad_norm": 0.46444252133369446,
420
  "learning_rate": 9.32385029361338e-06,
421
+ "loss": 0.3244,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.2575107296137339,
426
+ "grad_norm": 0.44596582651138306,
427
  "learning_rate": 9.285621438083997e-06,
428
+ "loss": 0.3183,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.26180257510729615,
433
+ "grad_norm": 0.4672187864780426,
434
  "learning_rate": 9.246424276156008e-06,
435
+ "loss": 0.3288,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.26609442060085836,
440
+ "grad_norm": 0.451576292514801,
441
  "learning_rate": 9.206267664155906e-06,
442
+ "loss": 0.3268,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.2703862660944206,
447
+ "grad_norm": 0.4450378119945526,
448
  "learning_rate": 9.165160675191272e-06,
449
+ "loss": 0.3318,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.27467811158798283,
454
+ "grad_norm": 0.4558464586734772,
455
  "learning_rate": 9.123112597100759e-06,
456
+ "loss": 0.3366,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.27896995708154504,
461
+ "grad_norm": 0.5316265821456909,
462
  "learning_rate": 9.080132930355567e-06,
463
+ "loss": 0.3323,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.2832618025751073,
468
+ "grad_norm": 0.42524275183677673,
469
  "learning_rate": 9.03623138591289e-06,
470
+ "loss": 0.3076,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.2875536480686695,
475
+ "grad_norm": 0.4055148661136627,
476
  "learning_rate": 8.99141788302178e-06,
477
+ "loss": 0.3071,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.2918454935622318,
482
+ "grad_norm": 0.4067463278770447,
483
  "learning_rate": 8.94570254698197e-06,
484
+ "loss": 0.2992,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.296137339055794,
489
+ "grad_norm": 0.4295370280742645,
490
  "learning_rate": 8.899095706856122e-06,
491
+ "loss": 0.321,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.30042918454935624,
496
+ "grad_norm": 0.44975805282592773,
497
  "learning_rate": 8.851607893136065e-06,
498
+ "loss": 0.3521,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.30472103004291845,
503
+ "grad_norm": 0.47437217831611633,
504
  "learning_rate": 8.803249835363486e-06,
505
+ "loss": 0.2967,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.3090128755364807,
510
+ "grad_norm": 0.4566970765590668,
511
  "learning_rate": 8.754032459705672e-06,
512
+ "loss": 0.3449,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.3133047210300429,
517
+ "grad_norm": 0.4559485614299774,
518
  "learning_rate": 8.703966886486819e-06,
519
+ "loss": 0.3286,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.31759656652360513,
524
+ "grad_norm": 0.42962053418159485,
525
  "learning_rate": 8.65306442767547e-06,
526
+ "loss": 0.3195,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.3218884120171674,
531
+ "grad_norm": 0.4547812342643738,
532
  "learning_rate": 8.601336584328659e-06,
533
+ "loss": 0.3347,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3261802575107296,
538
+ "grad_norm": 0.5012584328651428,
539
  "learning_rate": 8.548795043993316e-06,
540
+ "loss": 0.2947,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.33047210300429186,
545
+ "grad_norm": 0.4377043545246124,
546
  "learning_rate": 8.495451678065563e-06,
547
+ "loss": 0.3323,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.33476394849785407,
552
+ "grad_norm": 0.45930778980255127,
553
  "learning_rate": 8.441318539108433e-06,
554
+ "loss": 0.3447,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.33905579399141633,
559
+ "grad_norm": 0.43653759360313416,
560
  "learning_rate": 8.386407858128707e-06,
561
+ "loss": 0.3362,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.34334763948497854,
566
+ "grad_norm": 0.4360993206501007,
567
  "learning_rate": 8.330732041813367e-06,
568
+ "loss": 0.313,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.34763948497854075,
573
+ "grad_norm": 0.45763546228408813,
574
  "learning_rate": 8.274303669726427e-06,
575
+ "loss": 0.3427,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.351931330472103,
580
+ "grad_norm": 0.45791178941726685,
581
  "learning_rate": 8.217135491466636e-06,
582
+ "loss": 0.3459,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.3562231759656652,
587
+ "grad_norm": 0.4212036430835724,
588
  "learning_rate": 8.15924042378682e-06,
589
+ "loss": 0.3039,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.3605150214592275,
594
+ "grad_norm": 0.4088297486305237,
595
  "learning_rate": 8.100631547675417e-06,
596
+ "loss": 0.3036,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.3648068669527897,
601
+ "grad_norm": 0.44220373034477234,
602
  "learning_rate": 8.041322105400923e-06,
603
+ "loss": 0.3334,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.36909871244635195,
608
+ "grad_norm": 0.4390133023262024,
609
  "learning_rate": 7.981325497519892e-06,
610
+ "loss": 0.3183,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.37339055793991416,
615
+ "grad_norm": 0.46158358454704285,
616
  "learning_rate": 7.920655279849173e-06,
617
+ "loss": 0.3211,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.3776824034334764,
622
+ "grad_norm": 0.444137305021286,
623
  "learning_rate": 7.859325160403073e-06,
624
+ "loss": 0.3392,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.38197424892703863,
629
+ "grad_norm": 0.4566199779510498,
630
  "learning_rate": 7.797348996296116e-06,
631
+ "loss": 0.3201,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.38626609442060084,
636
+ "grad_norm": 0.45487260818481445,
637
  "learning_rate": 7.734740790612137e-06,
638
+ "loss": 0.3399,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.3905579399141631,
643
+ "grad_norm": 0.4396146833896637,
644
  "learning_rate": 7.671514689240366e-06,
645
+ "loss": 0.3076,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.3948497854077253,
650
+ "grad_norm": 0.45286861062049866,
651
  "learning_rate": 7.607684977679284e-06,
652
+ "loss": 0.299,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.39914163090128757,
657
+ "grad_norm": 0.4087929129600525,
658
  "learning_rate": 7.543266077808893e-06,
659
+ "loss": 0.2845,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.4034334763948498,
664
+ "grad_norm": 0.4473104774951935,
665
  "learning_rate": 7.478272544632204e-06,
666
+ "loss": 0.3198,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.40772532188841204,
671
+ "grad_norm": 0.4244577884674072,
672
  "learning_rate": 7.412719062986632e-06,
673
+ "loss": 0.3367,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.41201716738197425,
678
+ "grad_norm": 0.4132303297519684,
679
  "learning_rate": 7.3466204442260605e-06,
680
  "loss": 0.276,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.41630901287553645,
685
+ "grad_norm": 0.4498296082019806,
686
  "learning_rate": 7.279991622874319e-06,
687
+ "loss": 0.3412,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.4206008583690987,
692
+ "grad_norm": 0.45308294892311096,
693
  "learning_rate": 7.212847653250828e-06,
694
+ "loss": 0.3001,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.4248927038626609,
699
+ "grad_norm": 0.4460461437702179,
700
  "learning_rate": 7.145203706069183e-06,
701
+ "loss": 0.2855,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.4291845493562232,
706
+ "grad_norm": 0.43338119983673096,
707
  "learning_rate": 7.0770750650094335e-06,
708
+ "loss": 0.3366,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.4334763948497854,
713
+ "grad_norm": 0.4553949534893036,
714
  "learning_rate": 7.008477123264849e-06,
715
+ "loss": 0.3444,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.43776824034334766,
720
+ "grad_norm": 0.42553162574768066,
721
  "learning_rate": 6.939425380063924e-06,
722
+ "loss": 0.3146,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.44206008583690987,
727
+ "grad_norm": 0.42176350951194763,
728
  "learning_rate": 6.869935437168449e-06,
729
+ "loss": 0.3221,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.44635193133047213,
734
+ "grad_norm": 0.44573134183883667,
735
  "learning_rate": 6.800022995348381e-06,
736
  "loss": 0.3182,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.45064377682403434,
741
+ "grad_norm": 0.43147024512290955,
742
  "learning_rate": 6.729703850834381e-06,
743
+ "loss": 0.3246,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.45493562231759654,
748
+ "grad_norm": 0.402482807636261,
749
  "learning_rate": 6.65899389174876e-06,
750
+ "loss": 0.2823,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.4592274678111588,
755
+ "grad_norm": 0.4000365138053894,
756
  "learning_rate": 6.587909094515663e-06,
757
+ "loss": 0.2955,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.463519313304721,
762
+ "grad_norm": 0.3905521333217621,
763
  "learning_rate": 6.5164655202513135e-06,
764
+ "loss": 0.2793,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.4678111587982833,
769
+ "grad_norm": 0.4438071548938751,
770
  "learning_rate": 6.444679311135112e-06,
771
+ "loss": 0.34,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.4721030042918455,
776
+ "grad_norm": 0.45224955677986145,
777
  "learning_rate": 6.372566686762427e-06,
778
+ "loss": 0.3406,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.47639484978540775,
783
+ "grad_norm": 0.45422661304473877,
784
  "learning_rate": 6.300143940479881e-06,
785
+ "loss": 0.3243,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.48068669527896996,
790
+ "grad_norm": 0.526847779750824,
791
  "learning_rate": 6.227427435703997e-06,
792
  "loss": 0.2892,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.48497854077253216,
797
+ "grad_norm": 0.4258354604244232,
798
  "learning_rate": 6.154433602223979e-06,
799
+ "loss": 0.269,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.4892703862660944,
804
+ "grad_norm": 0.40687990188598633,
805
  "learning_rate": 6.0811789324895365e-06,
806
+ "loss": 0.3001,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.49356223175965663,
811
+ "grad_norm": 0.43260207772254944,
812
  "learning_rate": 6.0076799778845105e-06,
813
+ "loss": 0.306,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.4978540772532189,
818
+ "grad_norm": 0.43823885917663574,
819
  "learning_rate": 5.933953344987215e-06,
820
+ "loss": 0.3119,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.5021459227467812,
825
+ "grad_norm": 0.4540681540966034,
826
  "learning_rate": 5.860015691818292e-06,
827
+ "loss": 0.3307,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.5064377682403434,
832
+ "grad_norm": 0.4211110770702362,
833
  "learning_rate": 5.78588372407695e-06,
834
+ "loss": 0.2904,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.5107296137339056,
839
+ "grad_norm": 0.42265021800994873,
840
  "learning_rate": 5.711574191366427e-06,
841
+ "loss": 0.3069,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.5150214592274678,
846
+ "grad_norm": 0.42844855785369873,
847
  "learning_rate": 5.637103883409525e-06,
848
+ "loss": 0.3033,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.51931330472103,
853
+ "grad_norm": 0.4592513144016266,
854
  "learning_rate": 5.562489626255104e-06,
855
+ "loss": 0.3125,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.5236051502145923,
860
+ "grad_norm": 0.4309280216693878,
861
  "learning_rate": 5.487748278476342e-06,
862
+ "loss": 0.3124,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.5278969957081545,
867
+ "grad_norm": 0.3942621946334839,
868
  "learning_rate": 5.412896727361663e-06,
869
+ "loss": 0.2525,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.5321888412017167,
874
+ "grad_norm": 0.40998613834381104,
875
  "learning_rate": 5.337951885099167e-06,
876
+ "loss": 0.2895,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.5364806866952789,
881
+ "grad_norm": 0.4335436522960663,
882
  "learning_rate": 5.262930684955439e-06,
883
+ "loss": 0.2924,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.5407725321888412,
888
+ "grad_norm": 0.4402632415294647,
889
  "learning_rate": 5.187850077449604e-06,
890
  "loss": 0.317,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.5450643776824035,
895
+ "grad_norm": 0.39414867758750916,
896
  "learning_rate": 5.112727026523461e-06,
897
+ "loss": 0.2753,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.5493562231759657,
902
+ "grad_norm": 0.4759502410888672,
903
  "learning_rate": 5.03757850570861e-06,
904
  "loss": 0.3066,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.5536480686695279,
909
+ "grad_norm": 0.41338634490966797,
910
  "learning_rate": 4.9624214942913916e-06,
911
+ "loss": 0.3013,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.5579399141630901,
916
+ "grad_norm": 0.4515892267227173,
917
  "learning_rate": 4.88727297347654e-06,
918
+ "loss": 0.306,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.5622317596566524,
923
+ "grad_norm": 0.3964577317237854,
924
  "learning_rate": 4.8121499225503974e-06,
925
  "loss": 0.2874,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.5665236051502146,
930
+ "grad_norm": 0.41198039054870605,
931
  "learning_rate": 4.737069315044562e-06,
932
+ "loss": 0.2947,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.5708154506437768,
937
+ "grad_norm": 0.4514983892440796,
938
  "learning_rate": 4.662048114900837e-06,
939
+ "loss": 0.3235,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.575107296137339,
944
+ "grad_norm": 0.4610046446323395,
945
  "learning_rate": 4.587103272638339e-06,
946
+ "loss": 0.3268,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.5793991416309013,
951
+ "grad_norm": 0.4211605489253998,
952
  "learning_rate": 4.512251721523659e-06,
953
  "loss": 0.2671,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.5836909871244635,
958
+ "grad_norm": 0.4412516951560974,
959
  "learning_rate": 4.437510373744897e-06,
960
+ "loss": 0.3072,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.5879828326180258,
965
+ "grad_norm": 0.4627492427825928,
966
  "learning_rate": 4.362896116590475e-06,
967
+ "loss": 0.3031,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.592274678111588,
972
+ "grad_norm": 0.4873397648334503,
973
  "learning_rate": 4.2884258086335755e-06,
974
+ "loss": 0.3223,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.5965665236051502,
979
+ "grad_norm": 0.41365429759025574,
980
  "learning_rate": 4.214116275923051e-06,
981
+ "loss": 0.2736,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.6008583690987125,
986
+ "grad_norm": 0.45329636335372925,
987
  "learning_rate": 4.1399843081817085e-06,
988
+ "loss": 0.3119,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.6051502145922747,
993
+ "grad_norm": 0.41913923621177673,
994
  "learning_rate": 4.066046655012786e-06,
995
+ "loss": 0.3037,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.6094420600858369,
1000
+ "grad_norm": 0.43529585003852844,
1001
  "learning_rate": 3.992320022115492e-06,
1002
+ "loss": 0.3148,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.6137339055793991,
1007
+ "grad_norm": 0.4258601665496826,
1008
  "learning_rate": 3.918821067510464e-06,
1009
+ "loss": 0.2917,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.6180257510729614,
1014
+ "grad_norm": 0.46047812700271606,
1015
  "learning_rate": 3.845566397776022e-06,
1016
+ "loss": 0.3298,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.6223175965665236,
1021
+ "grad_norm": 0.42097219824790955,
1022
  "learning_rate": 3.7725725642960047e-06,
1023
+ "loss": 0.3056,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.6266094420600858,
1028
+ "grad_norm": 0.43911123275756836,
1029
  "learning_rate": 3.6998560595201188e-06,
1030
+ "loss": 0.2912,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.630901287553648,
1035
+ "grad_norm": 0.42664578557014465,
1036
  "learning_rate": 3.627433313237576e-06,
1037
+ "loss": 0.2955,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.6351931330472103,
1042
+ "grad_norm": 0.42000484466552734,
1043
  "learning_rate": 3.555320688864889e-06,
1044
+ "loss": 0.2903,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.6394849785407726,
1049
+ "grad_norm": 0.4376209080219269,
1050
  "learning_rate": 3.483534479748688e-06,
1051
+ "loss": 0.291,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.6437768240343348,
1056
+ "grad_norm": 0.4324072301387787,
1057
  "learning_rate": 3.4120909054843375e-06,
1058
+ "loss": 0.2824,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.648068669527897,
1063
+ "grad_norm": 0.38668328523635864,
1064
  "learning_rate": 3.3410061082512422e-06,
1065
+ "loss": 0.2586,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.6523605150214592,
1070
+ "grad_norm": 0.40362098813056946,
1071
  "learning_rate": 3.2702961491656197e-06,
1072
+ "loss": 0.2853,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.6566523605150214,
1077
+ "grad_norm": 0.3973497152328491,
1078
  "learning_rate": 3.1999770046516198e-06,
1079
+ "loss": 0.2826,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.6609442060085837,
1084
+ "grad_norm": 0.4138431251049042,
1085
  "learning_rate": 3.130064562831553e-06,
1086
+ "loss": 0.2757,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.6652360515021459,
1091
+ "grad_norm": 0.4256836771965027,
1092
  "learning_rate": 3.0605746199360755e-06,
1093
+ "loss": 0.2978,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.6695278969957081,
1098
+ "grad_norm": 0.41062039136886597,
1099
  "learning_rate": 2.991522876735154e-06,
1100
+ "loss": 0.2859,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.6738197424892703,
1105
+ "grad_norm": 0.44024190306663513,
1106
  "learning_rate": 2.9229249349905686e-06,
1107
+ "loss": 0.3014,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.6781115879828327,
1112
+ "grad_norm": 0.44576936960220337,
1113
  "learning_rate": 2.8547962939308187e-06,
1114
+ "loss": 0.2828,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.6824034334763949,
1119
+ "grad_norm": 0.4253944456577301,
1120
  "learning_rate": 2.787152346749173e-06,
1121
+ "loss": 0.3,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.6866952789699571,
1126
+ "grad_norm": 0.40661880373954773,
1127
  "learning_rate": 2.720008377125682e-06,
1128
+ "loss": 0.2811,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.6909871244635193,
1133
+ "grad_norm": 0.45691847801208496,
1134
  "learning_rate": 2.6533795557739407e-06,
1135
  "loss": 0.3238,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.6952789699570815,
1140
+ "grad_norm": 0.42418181896209717,
1141
  "learning_rate": 2.5872809370133704e-06,
1142
+ "loss": 0.2935,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.6995708154506438,
1147
+ "grad_norm": 0.42245230078697205,
1148
  "learning_rate": 2.5217274553677975e-06,
1149
  "loss": 0.2985,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.703862660944206,
1154
+ "grad_norm": 0.41597869992256165,
1155
  "learning_rate": 2.4567339221911086e-06,
1156
+ "loss": 0.2923,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.7081545064377682,
1161
+ "grad_norm": 0.44589149951934814,
1162
  "learning_rate": 2.3923150223207176e-06,
1163
+ "loss": 0.3106,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.7124463519313304,
1168
+ "grad_norm": 0.40725013613700867,
1169
  "learning_rate": 2.328485310759635e-06,
1170
+ "loss": 0.2861,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.7167381974248928,
1175
+ "grad_norm": 0.44540056586265564,
1176
  "learning_rate": 2.265259209387867e-06,
1177
+ "loss": 0.3039,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.721030042918455,
1182
+ "grad_norm": 0.4068719446659088,
1183
  "learning_rate": 2.202651003703885e-06,
1184
+ "loss": 0.2802,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.7253218884120172,
1189
+ "grad_norm": 0.39111173152923584,
1190
  "learning_rate": 2.140674839596931e-06,
1191
+ "loss": 0.2656,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.7296137339055794,
1196
+ "grad_norm": 0.4605322480201721,
1197
  "learning_rate": 2.0793447201508288e-06,
1198
+ "loss": 0.3151,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.7339055793991416,
1203
+ "grad_norm": 0.4269479513168335,
1204
  "learning_rate": 2.01867450248011e-06,
1205
+ "loss": 0.3022,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.7381974248927039,
1210
+ "grad_norm": 0.3947524130344391,
1211
  "learning_rate": 1.9586778945990785e-06,
1212
+ "loss": 0.2815,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.7424892703862661,
1217
+ "grad_norm": 0.43399283289909363,
1218
  "learning_rate": 1.8993684523245842e-06,
1219
+ "loss": 0.2899,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.7467811158798283,
1224
+ "grad_norm": 0.4567514955997467,
1225
  "learning_rate": 1.8407595762131814e-06,
1226
+ "loss": 0.3411,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.7510729613733905,
1231
+ "grad_norm": 0.4480878710746765,
1232
  "learning_rate": 1.7828645085333645e-06,
1233
+ "loss": 0.3458,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.7553648068669528,
1238
+ "grad_norm": 0.44001972675323486,
1239
  "learning_rate": 1.7256963302735752e-06,
1240
+ "loss": 0.3171,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.759656652360515,
1245
+ "grad_norm": 0.42058637738227844,
1246
  "learning_rate": 1.6692679581866334e-06,
1247
+ "loss": 0.3039,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.7639484978540773,
1252
+ "grad_norm": 0.42307794094085693,
1253
  "learning_rate": 1.6135921418712959e-06,
1254
+ "loss": 0.2734,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.7682403433476395,
1259
+ "grad_norm": 0.42678767442703247,
1260
  "learning_rate": 1.5586814608915673e-06,
1261
+ "loss": 0.3135,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.7725321888412017,
1266
+ "grad_norm": 0.4102153480052948,
1267
  "learning_rate": 1.5045483219344387e-06,
1268
+ "loss": 0.2792,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.776824034334764,
1273
+ "grad_norm": 0.40929415822029114,
1274
  "learning_rate": 1.4512049560066837e-06,
1275
  "loss": 0.2758,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.7811158798283262,
1280
+ "grad_norm": 0.42089492082595825,
1281
  "learning_rate": 1.3986634156713418e-06,
1282
+ "loss": 0.2835,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.7854077253218884,
1287
+ "grad_norm": 0.420242041349411,
1288
  "learning_rate": 1.3469355723245303e-06,
1289
+ "loss": 0.2855,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.7896995708154506,
1294
+ "grad_norm": 0.45139551162719727,
1295
  "learning_rate": 1.2960331135131826e-06,
1296
+ "loss": 0.3284,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.7939914163090128,
1301
+ "grad_norm": 0.4033341109752655,
1302
  "learning_rate": 1.245967540294329e-06,
1303
+ "loss": 0.2848,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.7982832618025751,
1308
+ "grad_norm": 0.37155604362487793,
1309
  "learning_rate": 1.1967501646365147e-06,
1310
+ "loss": 0.2757,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.8025751072961373,
1315
+ "grad_norm": 0.4291873872280121,
1316
  "learning_rate": 1.1483921068639353e-06,
1317
+ "loss": 0.2755,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.8068669527896996,
1322
+ "grad_norm": 0.4211515784263611,
1323
  "learning_rate": 1.1009042931438784e-06,
1324
+ "loss": 0.298,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.8111587982832618,
1329
+ "grad_norm": 0.4309849441051483,
1330
  "learning_rate": 1.0542974530180327e-06,
1331
+ "loss": 0.3067,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.8154506437768241,
1336
+ "grad_norm": 0.3826170563697815,
1337
  "learning_rate": 1.00858211697822e-06,
1338
+ "loss": 0.265,
1339
  "step": 190
1340
  },
1341
  {
1342
  "epoch": 0.8197424892703863,
1343
+ "grad_norm": 0.40116748213768005,
1344
  "learning_rate": 9.637686140871121e-07,
1345
+ "loss": 0.2901,
1346
  "step": 191
1347
  },
1348
  {
1349
  "epoch": 0.8240343347639485,
1350
+ "grad_norm": 0.40444761514663696,
1351
  "learning_rate": 9.198670696444339e-07,
1352
+ "loss": 0.2805,
1353
  "step": 192
1354
  },
1355
  {
1356
  "epoch": 0.8283261802575107,
1357
+ "grad_norm": 0.521791934967041,
1358
  "learning_rate": 8.768874028992431e-07,
1359
+ "loss": 0.3184,
1360
  "step": 193
1361
  },
1362
  {
1363
  "epoch": 0.8326180257510729,
1364
+ "grad_norm": 0.4177843928337097,
1365
  "learning_rate": 8.348393248087289e-07,
1366
+ "loss": 0.3183,
1367
  "step": 194
1368
  },
1369
  {
1370
  "epoch": 0.8369098712446352,
1371
+ "grad_norm": 0.44546449184417725,
1372
  "learning_rate": 7.937323358440935e-07,
1373
+ "loss": 0.3407,
1374
  "step": 195
1375
  },
1376
  {
1377
  "epoch": 0.8412017167381974,
1378
+ "grad_norm": 0.43112218379974365,
1379
  "learning_rate": 7.535757238439939e-07,
1380
+ "loss": 0.3112,
1381
  "step": 196
1382
  },
1383
  {
1384
  "epoch": 0.8454935622317596,
1385
+ "grad_norm": 0.38515380024909973,
1386
  "learning_rate": 7.143785619160026e-07,
1387
+ "loss": 0.2781,
1388
  "step": 197
1389
  },
1390
  {
1391
  "epoch": 0.8497854077253219,
1392
+ "grad_norm": 0.40132516622543335,
1393
  "learning_rate": 6.761497063866207e-07,
1394
+ "loss": 0.2836,
1395
  "step": 198
1396
  },
1397
  {
1398
  "epoch": 0.8540772532188842,
1399
+ "grad_norm": 0.43590113520622253,
1400
  "learning_rate": 6.388977948002406e-07,
1401
+ "loss": 0.3019,
1402
  "step": 199
1403
  },
1404
  {
1405
  "epoch": 0.8583690987124464,
1406
+ "grad_norm": 0.39947134256362915,
1407
  "learning_rate": 6.026312439675553e-07,
1408
+ "loss": 0.2741,
1409
  "step": 200
1410
  },
1411
  {
1412
  "epoch": 0.8626609442060086,
1413
+ "grad_norm": 0.3957541882991791,
1414
  "learning_rate": 5.673582480638395e-07,
1415
+ "loss": 0.3023,
1416
  "step": 201
1417
  },
1418
  {
1419
  "epoch": 0.8669527896995708,
1420
+ "grad_norm": 0.44793784618377686,
1421
  "learning_rate": 5.330867767775333e-07,
1422
+ "loss": 0.3392,
1423
  "step": 202
1424
  },
1425
  {
1426
  "epoch": 0.871244635193133,
1427
+ "grad_norm": 0.40923067927360535,
1428
  "learning_rate": 4.998245735095459e-07,
1429
+ "loss": 0.3041,
1430
  "step": 203
1431
  },
1432
  {
1433
  "epoch": 0.8755364806866953,
1434
+ "grad_norm": 0.413678377866745,
1435
  "learning_rate": 4.6757915362368567e-07,
1436
+ "loss": 0.2865,
1437
  "step": 204
1438
  },
1439
  {
1440
  "epoch": 0.8798283261802575,
1441
+ "grad_norm": 0.4134410619735718,
1442
  "learning_rate": 4.363578027486187e-07,
1443
+ "loss": 0.302,
1444
  "step": 205
1445
  },
1446
  {
1447
  "epoch": 0.8841201716738197,
1448
+ "grad_norm": 0.41365522146224976,
1449
  "learning_rate": 4.0616757513173123e-07,
1450
  "loss": 0.2828,
1451
  "step": 206
1452
  },
1453
  {
1454
  "epoch": 0.8884120171673819,
1455
+ "grad_norm": 0.4147285521030426,
1456
  "learning_rate": 3.7701529204526856e-07,
1457
+ "loss": 0.2937,
1458
  "step": 207
1459
  },
1460
  {
1461
  "epoch": 0.8927038626609443,
1462
+ "grad_norm": 0.41686710715293884,
1463
  "learning_rate": 3.4890754024512254e-07,
1464
+ "loss": 0.3021,
1465
  "step": 208
1466
  },
1467
  {
1468
  "epoch": 0.8969957081545065,
1469
+ "grad_norm": 0.4329932630062103,
1470
  "learning_rate": 3.2185067048259245e-07,
1471
  "loss": 0.2926,
1472
  "step": 209
1473
  },
1474
  {
1475
  "epoch": 0.9012875536480687,
1476
+ "grad_norm": 0.3969036936759949,
1477
  "learning_rate": 2.9585079606947843e-07,
1478
+ "loss": 0.2973,
1479
  "step": 210
1480
  },
1481
  {
1482
  "epoch": 0.9055793991416309,
1483
+ "grad_norm": 0.42916810512542725,
1484
  "learning_rate": 2.7091379149682683e-07,
1485
+ "loss": 0.2745,
1486
  "step": 211
1487
  },
1488
  {
1489
  "epoch": 0.9098712446351931,
1490
+ "grad_norm": 0.3912080228328705,
1491
  "learning_rate": 2.470452911076227e-07,
1492
+ "loss": 0.2763,
1493
  "step": 212
1494
  },
1495
  {
1496
  "epoch": 0.9141630901287554,
1497
+ "grad_norm": 0.4045410454273224,
1498
  "learning_rate": 2.242506878237538e-07,
1499
+ "loss": 0.2944,
1500
  "step": 213
1501
  },
1502
  {
1503
  "epoch": 0.9184549356223176,
1504
+ "grad_norm": 0.3928059935569763,
1505
  "learning_rate": 2.0253513192751374e-07,
1506
+ "loss": 0.2821,
1507
  "step": 214
1508
  },
1509
  {
1510
  "epoch": 0.9227467811158798,
1511
+ "grad_norm": 0.42325636744499207,
1512
  "learning_rate": 1.8190352989793325e-07,
1513
  "loss": 0.2807,
1514
  "step": 215
1515
  },
1516
  {
1517
  "epoch": 0.927038626609442,
1518
+ "grad_norm": 0.4236631691455841,
1519
  "learning_rate": 1.6236054330219853e-07,
1520
+ "loss": 0.2996,
1521
  "step": 216
1522
  },
1523
  {
1524
  "epoch": 0.9313304721030042,
1525
+ "grad_norm": 0.416293740272522,
1526
  "learning_rate": 1.439105877423963e-07,
1527
+ "loss": 0.2958,
1528
  "step": 217
1529
  },
1530
  {
1531
  "epoch": 0.9356223175965666,
1532
+ "grad_norm": 0.4194984436035156,
1533
  "learning_rate": 1.2655783185784253e-07,
1534
+ "loss": 0.2971,
1535
  "step": 218
1536
  },
1537
  {
1538
  "epoch": 0.9399141630901288,
1539
+ "grad_norm": 0.4220767617225647,
1540
  "learning_rate": 1.1030619638320805e-07,
1541
+ "loss": 0.2947,
1542
  "step": 219
1543
  },
1544
  {
1545
  "epoch": 0.944206008583691,
1546
+ "grad_norm": 0.463470995426178,
1547
  "learning_rate": 9.51593532626538e-08,
1548
+ "loss": 0.2804,
1549
  "step": 220
1550
  },
1551
  {
1552
  "epoch": 0.9484978540772532,
1553
+ "grad_norm": 0.44654470682144165,
1554
  "learning_rate": 8.11207248201834e-08,
1555
+ "loss": 0.2907,
1556
  "step": 221
1557
  },
1558
  {
1559
  "epoch": 0.9527896995708155,
1560
+ "grad_norm": 0.5606074333190918,
1561
  "learning_rate": 6.819348298638839e-08,
1562
+ "loss": 0.3248,
1563
  "step": 222
1564
  },
1565
  {
1566
  "epoch": 0.9570815450643777,
1567
+ "grad_norm": 0.41357025504112244,
1568
  "learning_rate": 5.638054858177644e-08,
1569
+ "loss": 0.2782,
1570
  "step": 223
1571
  },
1572
  {
1573
  "epoch": 0.9613733905579399,
1574
+ "grad_norm": 0.4360516369342804,
1575
  "learning_rate": 4.568459065683206e-08,
1576
+ "loss": 0.2866,
1577
  "step": 224
1578
  },
1579
  {
1580
  "epoch": 0.9656652360515021,
1581
+ "grad_norm": 0.43068474531173706,
1582
  "learning_rate": 3.610802588895845e-08,
1583
+ "loss": 0.3177,
1584
  "step": 225
1585
  },
1586
  {
1587
  "epoch": 0.9699570815450643,
1588
+ "grad_norm": 0.39279019832611084,
1589
  "learning_rate": 2.765301803645426e-08,
1590
+ "loss": 0.2584,
1591
  "step": 226
1592
  },
1593
  {
1594
  "epoch": 0.9742489270386266,
1595
+ "grad_norm": 0.42877382040023804,
1596
  "learning_rate": 2.0321477449619098e-08,
1597
+ "loss": 0.3091,
1598
  "step": 227
1599
  },
1600
  {
1601
  "epoch": 0.9785407725321889,
1602
+ "grad_norm": 0.4641684889793396,
1603
  "learning_rate": 1.411506063912882e-08,
1604
+ "loss": 0.3223,
1605
  "step": 228
1606
  },
1607
  {
1608
  "epoch": 0.9828326180257511,
1609
+ "grad_norm": 0.41311776638031006,
1610
  "learning_rate": 9.035169901754902e-09,
1611
+ "loss": 0.2827,
1612
  "step": 229
1613
  },
1614
  {
1615
  "epoch": 0.9871244635193133,
1616
+ "grad_norm": 0.40819963812828064,
1617
  "learning_rate": 5.082953003528457e-09,
1618
  "loss": 0.2759,
1619
  "step": 230
1620
  },
1621
  {
1622
  "epoch": 0.9914163090128756,
1623
+ "grad_norm": 0.39194202423095703,
1624
  "learning_rate": 2.2593029204076578e-09,
1625
+ "loss": 0.2787,
1626
  "step": 231
1627
  },
1628
  {
1629
  "epoch": 0.9957081545064378,
1630
+ "grad_norm": 0.43726563453674316,
1631
  "learning_rate": 5.648576365169245e-10,
1632
+ "loss": 0.3068,
1633
  "step": 232
1634
  },
1635
  {
1636
  "epoch": 1.0,
1637
+ "grad_norm": 0.40751731395721436,
1638
  "learning_rate": 0.0,
1639
+ "loss": 0.3021,
1640
  "step": 233
1641
  },
1642
  {
1643
  "epoch": 1.0,
1644
+ "eval_loss": 0.3932158648967743,
1645
+ "eval_runtime": 1.5342,
1646
+ "eval_samples_per_second": 149.92,
1647
+ "eval_steps_per_second": 6.518,
1648
  "step": 233
1649
  },
1650
  {
1651
  "epoch": 1.0,
1652
  "step": 233,
1653
  "total_flos": 1.933383782157517e+16,
1654
+ "train_loss": 0.43314852683840904,
1655
+ "train_runtime": 303.7389,
1656
+ "train_samples_per_second": 36.676,
1657
+ "train_steps_per_second": 0.767
1658
  }
1659
  ],
1660
  "logging_steps": 1,