terry69 commited on
Commit
8a3dd4b
1 Parent(s): 769cefd

Model save

Browse files
Files changed (4) hide show
  1. README.md +6 -10
  2. all_results.json +7 -12
  3. train_results.json +7 -7
  4. trainer_state.json +958 -958
README.md CHANGED
@@ -2,15 +2,11 @@
2
  license: apache-2.0
3
  base_model: mistralai/Mistral-7B-Instruct-v0.2
4
  tags:
5
- - alignment-handbook
6
- - trl
7
- - sft
8
- - generated_from_trainer
9
  - trl
10
  - sft
11
  - generated_from_trainer
12
  datasets:
13
- - preference-data
14
  model-index:
15
  - name: preference_p0.1_seed42_level3_rare
16
  results: []
@@ -21,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
21
 
22
  # preference_p0.1_seed42_level3_rare
23
 
24
- This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on the preference-data dataset.
25
  It achieves the following results on the evaluation set:
26
- - Loss: 0.3398
27
 
28
  ## Model description
29
 
@@ -58,9 +54,9 @@ The following hyperparameters were used during training:
58
 
59
  ### Training results
60
 
61
- | Training Loss | Epoch | Step | Validation Loss |
62
- |:-------------:|:-----:|:----:|:---------------:|
63
- | 0.3499 | 1.0 | 1206 | 0.3398 |
64
 
65
 
66
  ### Framework versions
 
2
  license: apache-2.0
3
  base_model: mistralai/Mistral-7B-Instruct-v0.2
4
  tags:
 
 
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
8
  datasets:
9
+ - generator
10
  model-index:
11
  - name: preference_p0.1_seed42_level3_rare
12
  results: []
 
17
 
18
  # preference_p0.1_seed42_level3_rare
19
 
20
+ This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on the generator dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.3488
23
 
24
  ## Model description
25
 
 
54
 
55
  ### Training results
56
 
57
+ | Training Loss | Epoch | Step | Validation Loss |
58
+ |:-------------:|:------:|:----:|:---------------:|
59
+ | 0.3416 | 0.9998 | 1207 | 0.3488 |
60
 
61
 
62
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_loss": 0.3398154377937317,
4
- "eval_runtime": 123.7322,
5
- "eval_samples": 999,
6
- "eval_samples_per_second": 3.152,
7
- "eval_steps_per_second": 0.792,
8
- "total_flos": 252459519836160.0,
9
- "train_loss": 0.5258198334506495,
10
- "train_runtime": 27352.0353,
11
- "train_samples": 98793,
12
- "train_samples_per_second": 1.411,
13
- "train_steps_per_second": 0.044
14
  }
 
1
  {
2
+ "epoch": 0.999792917788362,
3
+ "total_flos": 252668899491840.0,
4
+ "train_loss": 0.5272446889569172,
5
+ "train_runtime": 29415.576,
6
+ "train_samples": 98881,
7
+ "train_samples_per_second": 1.313,
8
+ "train_steps_per_second": 0.041
 
 
 
 
 
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "total_flos": 252459519836160.0,
4
- "train_loss": 0.5258198334506495,
5
- "train_runtime": 27352.0353,
6
- "train_samples": 98793,
7
- "train_samples_per_second": 1.411,
8
- "train_steps_per_second": 0.044
9
  }
 
1
  {
2
+ "epoch": 0.999792917788362,
3
+ "total_flos": 252668899491840.0,
4
+ "train_loss": 0.5272446889569172,
5
+ "train_runtime": 29415.576,
6
+ "train_samples": 98881,
7
+ "train_samples_per_second": 1.313,
8
+ "train_steps_per_second": 0.041
9
  }
trainer_state.json CHANGED
@@ -1,1727 +1,1727 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 1206,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0008291873963515755,
13
- "grad_norm": 24.653067933234663,
14
  "learning_rate": 8.264462809917357e-08,
15
- "loss": 1.4306,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.0041459369817578775,
20
- "grad_norm": 23.892643599825455,
21
  "learning_rate": 4.132231404958678e-07,
22
- "loss": 1.4152,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.008291873963515755,
27
- "grad_norm": 8.240455406065408,
28
  "learning_rate": 8.264462809917356e-07,
29
- "loss": 1.3087,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.012437810945273632,
34
- "grad_norm": 10.488796830355087,
35
  "learning_rate": 1.2396694214876035e-06,
36
- "loss": 1.1492,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.01658374792703151,
41
- "grad_norm": 2.925786467239926,
42
  "learning_rate": 1.6528925619834712e-06,
43
- "loss": 1.0067,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.020729684908789386,
48
- "grad_norm": 2.883660781210465,
49
  "learning_rate": 2.066115702479339e-06,
50
- "loss": 0.9584,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.024875621890547265,
55
- "grad_norm": 2.4669088438574134,
56
  "learning_rate": 2.479338842975207e-06,
57
- "loss": 0.9292,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.02902155887230514,
62
- "grad_norm": 2.3299527650938723,
63
  "learning_rate": 2.8925619834710743e-06,
64
- "loss": 0.9085,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.03316749585406302,
69
- "grad_norm": 2.2708144285624106,
70
  "learning_rate": 3.3057851239669424e-06,
71
- "loss": 0.8881,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.03731343283582089,
76
- "grad_norm": 2.316177321692127,
77
  "learning_rate": 3.71900826446281e-06,
78
- "loss": 0.8868,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.04145936981757877,
83
- "grad_norm": 2.232211217344898,
84
  "learning_rate": 4.132231404958678e-06,
85
- "loss": 0.8621,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.04560530679933665,
90
- "grad_norm": 2.2356345009499874,
91
  "learning_rate": 4.5454545454545455e-06,
92
- "loss": 0.8638,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.04975124378109453,
97
- "grad_norm": 2.3393115031498475,
98
  "learning_rate": 4.958677685950414e-06,
99
- "loss": 0.8651,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.0538971807628524,
104
- "grad_norm": 2.457898545521906,
105
  "learning_rate": 5.371900826446281e-06,
106
- "loss": 0.8538,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.05804311774461028,
111
- "grad_norm": 2.426055506518116,
112
  "learning_rate": 5.785123966942149e-06,
113
- "loss": 0.8332,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.06218905472636816,
118
- "grad_norm": 2.2865749273548115,
119
  "learning_rate": 6.198347107438017e-06,
120
- "loss": 0.8421,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.06633499170812604,
125
- "grad_norm": 2.659505510747107,
126
  "learning_rate": 6.611570247933885e-06,
127
- "loss": 0.8332,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.07048092868988391,
132
- "grad_norm": 2.3495584049644394,
133
  "learning_rate": 7.0247933884297525e-06,
134
- "loss": 0.8288,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.07462686567164178,
139
- "grad_norm": 2.4089715032979266,
140
  "learning_rate": 7.43801652892562e-06,
141
- "loss": 0.8125,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.07877280265339967,
146
- "grad_norm": 2.441259064281342,
147
  "learning_rate": 7.851239669421489e-06,
148
- "loss": 0.8197,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.08291873963515754,
153
- "grad_norm": 2.9656175255591606,
154
  "learning_rate": 8.264462809917356e-06,
155
- "loss": 0.8201,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.08706467661691543,
160
- "grad_norm": 2.506253302731775,
161
  "learning_rate": 8.677685950413224e-06,
162
- "loss": 0.8131,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.0912106135986733,
167
- "grad_norm": 2.5306709875051787,
168
  "learning_rate": 9.090909090909091e-06,
169
- "loss": 0.8067,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.09535655058043117,
174
- "grad_norm": 2.474766693594476,
175
  "learning_rate": 9.50413223140496e-06,
176
- "loss": 0.7954,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.09950248756218906,
181
- "grad_norm": 2.549282498557303,
182
  "learning_rate": 9.917355371900828e-06,
183
- "loss": 0.7941,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.10364842454394693,
188
- "grad_norm": 2.2132395708930708,
189
- "learning_rate": 9.999664652243188e-06,
190
- "loss": 0.7909,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.1077943615257048,
195
- "grad_norm": 2.6772912694676796,
196
- "learning_rate": 9.99830237907608e-06,
197
- "loss": 0.7904,
198
  "step": 130
199
  },
200
  {
201
- "epoch": 0.11194029850746269,
202
- "grad_norm": 2.5540345209896733,
203
- "learning_rate": 9.995892506564461e-06,
204
- "loss": 0.7885,
205
  "step": 135
206
  },
207
  {
208
- "epoch": 0.11608623548922056,
209
- "grad_norm": 2.6275103138831826,
210
- "learning_rate": 9.992435539796e-06,
211
- "loss": 0.7734,
212
  "step": 140
213
  },
214
  {
215
- "epoch": 0.12023217247097844,
216
- "grad_norm": 2.287915872289185,
217
- "learning_rate": 9.987932203319917e-06,
218
- "loss": 0.7707,
219
  "step": 145
220
  },
221
  {
222
- "epoch": 0.12437810945273632,
223
- "grad_norm": 2.2854068094875983,
224
- "learning_rate": 9.982383440995146e-06,
225
- "loss": 0.7781,
226
  "step": 150
227
  },
228
  {
229
- "epoch": 0.1285240464344942,
230
- "grad_norm": 2.316022902309436,
231
- "learning_rate": 9.975790415792497e-06,
232
- "loss": 0.7647,
233
  "step": 155
234
  },
235
  {
236
- "epoch": 0.13266998341625208,
237
- "grad_norm": 2.3292620561056503,
238
- "learning_rate": 9.968154509550914e-06,
239
- "loss": 0.7585,
240
  "step": 160
241
  },
242
  {
243
- "epoch": 0.13681592039800994,
244
- "grad_norm": 2.3705724640289154,
245
- "learning_rate": 9.959477322687852e-06,
246
- "loss": 0.7603,
247
  "step": 165
248
  },
249
  {
250
- "epoch": 0.14096185737976782,
251
- "grad_norm": 2.3622727865331123,
252
- "learning_rate": 9.949760673863846e-06,
253
- "loss": 0.7485,
254
  "step": 170
255
  },
256
  {
257
- "epoch": 0.1451077943615257,
258
- "grad_norm": 2.7004571213256656,
259
- "learning_rate": 9.93900659960133e-06,
260
- "loss": 0.7487,
261
  "step": 175
262
  },
263
  {
264
- "epoch": 0.14925373134328357,
265
- "grad_norm": 2.3887038368874762,
266
- "learning_rate": 9.927217353857809e-06,
267
- "loss": 0.7491,
268
  "step": 180
269
  },
270
  {
271
- "epoch": 0.15339966832504145,
272
- "grad_norm": 2.6241901921933706,
273
- "learning_rate": 9.914395407553444e-06,
274
- "loss": 0.7393,
275
  "step": 185
276
  },
277
  {
278
- "epoch": 0.15754560530679934,
279
- "grad_norm": 2.2436866055233975,
280
- "learning_rate": 9.900543448053164e-06,
281
- "loss": 0.7307,
282
  "step": 190
283
  },
284
  {
285
- "epoch": 0.16169154228855723,
286
- "grad_norm": 2.2704152830824373,
287
- "learning_rate": 9.885664378603432e-06,
288
- "loss": 0.7294,
289
  "step": 195
290
  },
291
  {
292
- "epoch": 0.16583747927031509,
293
- "grad_norm": 2.421716993627199,
294
- "learning_rate": 9.869761317723744e-06,
295
- "loss": 0.7333,
296
  "step": 200
297
  },
298
  {
299
- "epoch": 0.16998341625207297,
300
- "grad_norm": 2.331419486103847,
301
- "learning_rate": 9.85283759855301e-06,
302
- "loss": 0.717,
303
  "step": 205
304
  },
305
  {
306
- "epoch": 0.17412935323383086,
307
- "grad_norm": 2.2723125567631173,
308
- "learning_rate": 9.834896768150963e-06,
309
- "loss": 0.7163,
310
  "step": 210
311
  },
312
  {
313
- "epoch": 0.17827529021558872,
314
- "grad_norm": 2.611863422806305,
315
- "learning_rate": 9.81594258675473e-06,
316
- "loss": 0.7112,
317
  "step": 215
318
  },
319
  {
320
- "epoch": 0.1824212271973466,
321
- "grad_norm": 2.5015371044620442,
322
- "learning_rate": 9.795979026990717e-06,
323
- "loss": 0.7229,
324
  "step": 220
325
  },
326
  {
327
- "epoch": 0.1865671641791045,
328
- "grad_norm": 2.3673630495470723,
329
- "learning_rate": 9.775010273041975e-06,
330
- "loss": 0.714,
331
  "step": 225
332
  },
333
  {
334
- "epoch": 0.19071310116086235,
335
- "grad_norm": 2.465794360848315,
336
- "learning_rate": 9.753040719771249e-06,
337
- "loss": 0.7014,
338
  "step": 230
339
  },
340
  {
341
- "epoch": 0.19485903814262023,
342
- "grad_norm": 2.499380902535721,
343
- "learning_rate": 9.730074971799837e-06,
344
- "loss": 0.7016,
345
  "step": 235
346
  },
347
  {
348
- "epoch": 0.19900497512437812,
349
- "grad_norm": 2.370365787728058,
350
- "learning_rate": 9.706117842542517e-06,
351
- "loss": 0.6892,
352
  "step": 240
353
  },
354
  {
355
- "epoch": 0.20315091210613598,
356
- "grad_norm": 2.2231212477650164,
357
- "learning_rate": 9.681174353198687e-06,
358
- "loss": 0.679,
359
  "step": 245
360
  },
361
  {
362
- "epoch": 0.20729684908789386,
363
- "grad_norm": 2.2184988368745944,
364
- "learning_rate": 9.655249731699973e-06,
365
- "loss": 0.6727,
366
  "step": 250
367
  },
368
  {
369
- "epoch": 0.21144278606965175,
370
- "grad_norm": 2.4670630040508765,
371
- "learning_rate": 9.628349411614503e-06,
372
- "loss": 0.6741,
373
  "step": 255
374
  },
375
  {
376
- "epoch": 0.2155887230514096,
377
- "grad_norm": 2.3340004265252867,
378
- "learning_rate": 9.600479031008072e-06,
379
- "loss": 0.6645,
380
  "step": 260
381
  },
382
  {
383
- "epoch": 0.2197346600331675,
384
- "grad_norm": 2.345025105358215,
385
- "learning_rate": 9.571644431262463e-06,
386
- "loss": 0.6718,
387
  "step": 265
388
  },
389
  {
390
- "epoch": 0.22388059701492538,
391
- "grad_norm": 2.2781158471416934,
392
- "learning_rate": 9.54185165585114e-06,
393
- "loss": 0.6554,
394
  "step": 270
395
  },
396
  {
397
- "epoch": 0.22802653399668324,
398
- "grad_norm": 2.196935873469807,
399
- "learning_rate": 9.511106949072588e-06,
400
- "loss": 0.675,
401
  "step": 275
402
  },
403
  {
404
- "epoch": 0.23217247097844113,
405
- "grad_norm": 2.145734254726692,
406
- "learning_rate": 9.479416754741577e-06,
407
- "loss": 0.6518,
408
  "step": 280
409
  },
410
  {
411
- "epoch": 0.236318407960199,
412
- "grad_norm": 2.2924487460591134,
413
- "learning_rate": 9.446787714838579e-06,
414
- "loss": 0.6545,
415
  "step": 285
416
  },
417
  {
418
- "epoch": 0.24046434494195687,
419
- "grad_norm": 2.3437070209698367,
420
- "learning_rate": 9.413226668117679e-06,
421
- "loss": 0.6535,
422
  "step": 290
423
  },
424
  {
425
- "epoch": 0.24461028192371476,
426
- "grad_norm": 2.342026944702776,
427
- "learning_rate": 9.37874064867323e-06,
428
  "loss": 0.6473,
429
  "step": 295
430
  },
431
  {
432
- "epoch": 0.24875621890547264,
433
- "grad_norm": 2.238260770607696,
434
- "learning_rate": 9.343336884465577e-06,
435
- "loss": 0.6548,
436
  "step": 300
437
  },
438
  {
439
- "epoch": 0.25290215588723053,
440
- "grad_norm": 2.3937794704139486,
441
- "learning_rate": 9.307022795806125e-06,
442
- "loss": 0.6428,
443
  "step": 305
444
  },
445
  {
446
- "epoch": 0.2570480928689884,
447
- "grad_norm": 2.3251089898222994,
448
- "learning_rate": 9.26980599380213e-06,
449
- "loss": 0.6468,
450
  "step": 310
451
  },
452
  {
453
- "epoch": 0.26119402985074625,
454
- "grad_norm": 2.3078161738975633,
455
- "learning_rate": 9.231694278761455e-06,
456
- "loss": 0.6187,
457
  "step": 315
458
  },
459
  {
460
- "epoch": 0.26533996683250416,
461
- "grad_norm": 2.2552528979591155,
462
- "learning_rate": 9.192695638557723e-06,
463
- "loss": 0.6291,
464
  "step": 320
465
  },
466
  {
467
- "epoch": 0.269485903814262,
468
- "grad_norm": 2.154063394322633,
469
- "learning_rate": 9.1528182469561e-06,
470
- "loss": 0.6346,
471
  "step": 325
472
  },
473
  {
474
- "epoch": 0.2736318407960199,
475
- "grad_norm": 2.189247030250631,
476
- "learning_rate": 9.112070461900178e-06,
477
- "loss": 0.6375,
478
  "step": 330
479
  },
480
  {
481
- "epoch": 0.2777777777777778,
482
- "grad_norm": 2.40766844049897,
483
- "learning_rate": 9.070460823760197e-06,
484
- "loss": 0.608,
485
  "step": 335
486
  },
487
  {
488
- "epoch": 0.28192371475953565,
489
- "grad_norm": 2.3493489039694704,
490
- "learning_rate": 9.027998053543079e-06,
491
- "loss": 0.6239,
492
  "step": 340
493
  },
494
  {
495
- "epoch": 0.2860696517412935,
496
- "grad_norm": 2.2327563648233917,
497
- "learning_rate": 8.984691051064576e-06,
498
- "loss": 0.6153,
499
  "step": 345
500
  },
501
  {
502
- "epoch": 0.2902155887230514,
503
- "grad_norm": 2.254418513410284,
504
- "learning_rate": 8.94054889308395e-06,
505
- "loss": 0.6107,
506
  "step": 350
507
  },
508
  {
509
- "epoch": 0.2943615257048093,
510
- "grad_norm": 2.257147694372905,
511
- "learning_rate": 8.895580831401563e-06,
512
- "loss": 0.6109,
513
  "step": 355
514
  },
515
  {
516
- "epoch": 0.29850746268656714,
517
- "grad_norm": 2.2584599022264906,
518
- "learning_rate": 8.849796290919787e-06,
519
- "loss": 0.6049,
520
  "step": 360
521
  },
522
  {
523
- "epoch": 0.30265339966832505,
524
- "grad_norm": 2.232877679923525,
525
- "learning_rate": 8.803204867667624e-06,
526
- "loss": 0.6046,
527
  "step": 365
528
  },
529
  {
530
- "epoch": 0.3067993366500829,
531
- "grad_norm": 2.197359769046609,
532
- "learning_rate": 8.755816326789469e-06,
533
- "loss": 0.5954,
534
  "step": 370
535
  },
536
  {
537
- "epoch": 0.31094527363184077,
538
- "grad_norm": 2.46361591666781,
539
- "learning_rate": 8.70764060049842e-06,
540
- "loss": 0.5916,
541
  "step": 375
542
  },
543
  {
544
- "epoch": 0.3150912106135987,
545
- "grad_norm": 2.3755525081223303,
546
- "learning_rate": 8.658687785994579e-06,
547
- "loss": 0.581,
548
  "step": 380
549
  },
550
  {
551
- "epoch": 0.31923714759535654,
552
- "grad_norm": 2.258602083288232,
553
- "learning_rate": 8.608968143348765e-06,
554
- "loss": 0.588,
555
  "step": 385
556
  },
557
  {
558
- "epoch": 0.32338308457711445,
559
- "grad_norm": 2.3996445145589096,
560
- "learning_rate": 8.558492093352098e-06,
561
- "loss": 0.5912,
562
  "step": 390
563
  },
564
  {
565
- "epoch": 0.3275290215588723,
566
- "grad_norm": 2.3912474752480413,
567
- "learning_rate": 8.50727021533189e-06,
568
- "loss": 0.5794,
569
  "step": 395
570
  },
571
  {
572
- "epoch": 0.33167495854063017,
573
- "grad_norm": 2.408562619975618,
574
- "learning_rate": 8.455313244934324e-06,
575
- "loss": 0.5743,
576
  "step": 400
577
  },
578
  {
579
- "epoch": 0.3358208955223881,
580
- "grad_norm": 2.4095385411305132,
581
- "learning_rate": 8.402632071874348e-06,
582
- "loss": 0.554,
583
  "step": 405
584
  },
585
  {
586
- "epoch": 0.33996683250414594,
587
- "grad_norm": 2.5087046445805594,
588
- "learning_rate": 8.349237737653288e-06,
589
- "loss": 0.5644,
590
  "step": 410
591
  },
592
  {
593
- "epoch": 0.3441127694859038,
594
- "grad_norm": 2.248409385546328,
595
- "learning_rate": 8.29514143324466e-06,
596
- "loss": 0.5679,
597
  "step": 415
598
  },
599
  {
600
- "epoch": 0.3482587064676617,
601
- "grad_norm": 2.4818862999723605,
602
- "learning_rate": 8.24035449674863e-06,
603
- "loss": 0.5681,
604
  "step": 420
605
  },
606
  {
607
- "epoch": 0.3524046434494196,
608
- "grad_norm": 2.2920752370693216,
609
- "learning_rate": 8.184888411015655e-06,
610
- "loss": 0.5573,
611
  "step": 425
612
  },
613
  {
614
- "epoch": 0.35655058043117743,
615
- "grad_norm": 2.1843535452336704,
616
- "learning_rate": 8.128754801239781e-06,
617
- "loss": 0.5667,
618
  "step": 430
619
  },
620
  {
621
- "epoch": 0.36069651741293535,
622
- "grad_norm": 2.290594364416105,
623
- "learning_rate": 8.071965432522107e-06,
624
- "loss": 0.5545,
625
  "step": 435
626
  },
627
  {
628
- "epoch": 0.3648424543946932,
629
- "grad_norm": 2.154813761206833,
630
- "learning_rate": 8.01453220740492e-06,
631
- "loss": 0.5396,
632
  "step": 440
633
  },
634
  {
635
- "epoch": 0.36898839137645106,
636
- "grad_norm": 2.242838598303811,
637
- "learning_rate": 7.956467163377037e-06,
638
- "loss": 0.5569,
639
  "step": 445
640
  },
641
  {
642
- "epoch": 0.373134328358209,
643
- "grad_norm": 2.2432428272389813,
644
- "learning_rate": 7.89778247035085e-06,
645
- "loss": 0.5477,
646
  "step": 450
647
  },
648
  {
649
- "epoch": 0.37728026533996684,
650
- "grad_norm": 2.153768298203764,
651
- "learning_rate": 7.838490428111625e-06,
652
- "loss": 0.5456,
653
  "step": 455
654
  },
655
  {
656
- "epoch": 0.3814262023217247,
657
- "grad_norm": 2.498209859766411,
658
- "learning_rate": 7.77860346373957e-06,
659
- "loss": 0.5458,
660
  "step": 460
661
  },
662
  {
663
- "epoch": 0.3855721393034826,
664
- "grad_norm": 2.212126590749594,
665
- "learning_rate": 7.718134129005238e-06,
666
- "loss": 0.5416,
667
  "step": 465
668
  },
669
  {
670
- "epoch": 0.38971807628524047,
671
- "grad_norm": 2.110625634068855,
672
- "learning_rate": 7.657095097738793e-06,
673
- "loss": 0.5371,
674
  "step": 470
675
  },
676
  {
677
- "epoch": 0.3938640132669983,
678
- "grad_norm": 2.253003518667003,
679
- "learning_rate": 7.59549916317367e-06,
680
- "loss": 0.5323,
681
  "step": 475
682
  },
683
  {
684
- "epoch": 0.39800995024875624,
685
- "grad_norm": 2.1328709298310846,
686
- "learning_rate": 7.533359235265248e-06,
687
- "loss": 0.5276,
688
  "step": 480
689
  },
690
  {
691
- "epoch": 0.4021558872305141,
692
- "grad_norm": 2.234017981235709,
693
- "learning_rate": 7.470688337985029e-06,
694
- "loss": 0.5301,
695
  "step": 485
696
  },
697
  {
698
- "epoch": 0.40630182421227196,
699
- "grad_norm": 2.2000987111293404,
700
- "learning_rate": 7.407499606590934e-06,
701
- "loss": 0.5148,
702
  "step": 490
703
  },
704
  {
705
- "epoch": 0.41044776119402987,
706
- "grad_norm": 2.2432120641590654,
707
- "learning_rate": 7.343806284874268e-06,
708
- "loss": 0.5283,
709
  "step": 495
710
  },
711
  {
712
- "epoch": 0.41459369817578773,
713
- "grad_norm": 2.23585573867938,
714
- "learning_rate": 7.279621722383939e-06,
715
- "loss": 0.5207,
716
  "step": 500
717
  },
718
  {
719
- "epoch": 0.4187396351575456,
720
- "grad_norm": 2.329754879846752,
721
- "learning_rate": 7.214959371628522e-06,
722
- "loss": 0.5082,
723
  "step": 505
724
  },
725
  {
726
- "epoch": 0.4228855721393035,
727
- "grad_norm": 2.2504503512558562,
728
- "learning_rate": 7.149832785256718e-06,
729
- "loss": 0.5184,
730
  "step": 510
731
  },
732
  {
733
- "epoch": 0.42703150912106136,
734
- "grad_norm": 2.5157494964752396,
735
- "learning_rate": 7.084255613216855e-06,
736
- "loss": 0.5032,
737
  "step": 515
738
  },
739
  {
740
- "epoch": 0.4311774461028192,
741
- "grad_norm": 2.1030059356060447,
742
- "learning_rate": 7.018241599895974e-06,
743
- "loss": 0.513,
744
  "step": 520
745
  },
746
  {
747
- "epoch": 0.43532338308457713,
748
- "grad_norm": 2.4065308180353764,
749
- "learning_rate": 6.95180458123913e-06,
750
- "loss": 0.5078,
751
  "step": 525
752
  },
753
  {
754
- "epoch": 0.439469320066335,
755
- "grad_norm": 2.177713989913874,
756
- "learning_rate": 6.8849584818494984e-06,
757
- "loss": 0.5031,
758
  "step": 530
759
  },
760
  {
761
- "epoch": 0.44361525704809285,
762
- "grad_norm": 2.393877760969861,
763
- "learning_rate": 6.817717312069913e-06,
764
- "loss": 0.5065,
765
  "step": 535
766
  },
767
  {
768
- "epoch": 0.44776119402985076,
769
- "grad_norm": 2.247681749025056,
770
- "learning_rate": 6.750095165046415e-06,
771
- "loss": 0.4938,
772
  "step": 540
773
  },
774
  {
775
- "epoch": 0.4519071310116086,
776
- "grad_norm": 2.281463166027544,
777
- "learning_rate": 6.682106213774459e-06,
778
- "loss": 0.4923,
779
  "step": 545
780
  },
781
  {
782
- "epoch": 0.4560530679933665,
783
- "grad_norm": 2.3727012799023193,
784
- "learning_rate": 6.6137647081283776e-06,
785
- "loss": 0.489,
786
  "step": 550
787
  },
788
  {
789
- "epoch": 0.4601990049751244,
790
- "grad_norm": 2.1180016685410226,
791
- "learning_rate": 6.545084971874738e-06,
792
- "loss": 0.4991,
793
  "step": 555
794
  },
795
  {
796
- "epoch": 0.46434494195688225,
797
- "grad_norm": 2.127384804925787,
798
- "learning_rate": 6.476081399670212e-06,
799
- "loss": 0.4792,
800
  "step": 560
801
  },
802
  {
803
- "epoch": 0.4684908789386401,
804
- "grad_norm": 2.16820368052577,
805
- "learning_rate": 6.406768454044581e-06,
806
- "loss": 0.4844,
807
  "step": 565
808
  },
809
  {
810
- "epoch": 0.472636815920398,
811
- "grad_norm": 2.014070529324509,
812
- "learning_rate": 6.337160662369519e-06,
813
- "loss": 0.4863,
814
  "step": 570
815
  },
816
  {
817
- "epoch": 0.4767827529021559,
818
- "grad_norm": 2.3092001470719623,
819
- "learning_rate": 6.267272613813789e-06,
820
- "loss": 0.4812,
821
  "step": 575
822
  },
823
  {
824
- "epoch": 0.48092868988391374,
825
- "grad_norm": 2.109627929383052,
826
- "learning_rate": 6.19711895628548e-06,
827
- "loss": 0.488,
828
  "step": 580
829
  },
830
  {
831
- "epoch": 0.48507462686567165,
832
- "grad_norm": 2.243414203622559,
833
- "learning_rate": 6.126714393361939e-06,
834
- "loss": 0.4808,
835
  "step": 585
836
  },
837
  {
838
- "epoch": 0.4892205638474295,
839
- "grad_norm": 2.1766103678064113,
840
- "learning_rate": 6.056073681208038e-06,
841
- "loss": 0.4744,
842
  "step": 590
843
  },
844
  {
845
- "epoch": 0.49336650082918737,
846
- "grad_norm": 2.174639170089267,
847
- "learning_rate": 5.985211625483403e-06,
848
- "loss": 0.4701,
849
  "step": 595
850
  },
851
  {
852
- "epoch": 0.4975124378109453,
853
- "grad_norm": 1.9848040293839193,
854
- "learning_rate": 5.914143078239293e-06,
855
- "loss": 0.4692,
856
  "step": 600
857
  },
858
  {
859
- "epoch": 0.5016583747927031,
860
- "grad_norm": 1.9915667633435625,
861
- "learning_rate": 5.842882934805731e-06,
862
- "loss": 0.466,
863
  "step": 605
864
  },
865
  {
866
- "epoch": 0.5058043117744611,
867
- "grad_norm": 2.1843494006695705,
868
- "learning_rate": 5.771446130669589e-06,
869
- "loss": 0.4629,
870
  "step": 610
871
  },
872
  {
873
- "epoch": 0.5099502487562189,
874
- "grad_norm": 2.207052329929412,
875
- "learning_rate": 5.6998476383442345e-06,
876
- "loss": 0.4684,
877
  "step": 615
878
  },
879
  {
880
- "epoch": 0.5140961857379768,
881
- "grad_norm": 2.175310031084628,
882
- "learning_rate": 5.628102464231429e-06,
883
- "loss": 0.4689,
884
  "step": 620
885
  },
886
  {
887
- "epoch": 0.5182421227197347,
888
- "grad_norm": 2.0357834023415244,
889
- "learning_rate": 5.556225645476119e-06,
890
- "loss": 0.4566,
891
  "step": 625
892
  },
893
  {
894
- "epoch": 0.5223880597014925,
895
- "grad_norm": 2.1019221590828963,
896
- "learning_rate": 5.4842322468147926e-06,
897
- "loss": 0.456,
898
  "step": 630
899
  },
900
  {
901
- "epoch": 0.5265339966832504,
902
- "grad_norm": 2.04812559469483,
903
- "learning_rate": 5.412137357418037e-06,
904
- "loss": 0.4558,
905
  "step": 635
906
  },
907
  {
908
- "epoch": 0.5306799336650083,
909
- "grad_norm": 2.134530649370582,
910
- "learning_rate": 5.339956087727985e-06,
911
- "loss": 0.4629,
912
  "step": 640
913
  },
914
  {
915
- "epoch": 0.5348258706467661,
916
- "grad_norm": 2.313675421645146,
917
- "learning_rate": 5.2677035662913116e-06,
918
- "loss": 0.4498,
919
  "step": 645
920
  },
921
  {
922
- "epoch": 0.538971807628524,
923
- "grad_norm": 2.1806979711435406,
924
- "learning_rate": 5.195394936588409e-06,
925
- "loss": 0.461,
926
  "step": 650
927
  },
928
  {
929
- "epoch": 0.543117744610282,
930
- "grad_norm": 2.0940375884841,
931
- "learning_rate": 5.123045353859465e-06,
932
- "loss": 0.4563,
933
  "step": 655
934
  },
935
  {
936
- "epoch": 0.5472636815920398,
937
- "grad_norm": 2.130208767432883,
938
- "learning_rate": 5.050669981928056e-06,
939
- "loss": 0.4517,
940
  "step": 660
941
  },
942
  {
943
- "epoch": 0.5514096185737977,
944
- "grad_norm": 2.1879449516075407,
945
- "learning_rate": 4.978283990022936e-06,
946
- "loss": 0.4313,
947
  "step": 665
948
  },
949
  {
950
- "epoch": 0.5555555555555556,
951
- "grad_norm": 2.0202401459826747,
952
- "learning_rate": 4.905902549598719e-06,
953
- "loss": 0.4366,
954
  "step": 670
955
  },
956
  {
957
- "epoch": 0.5597014925373134,
958
- "grad_norm": 2.2819105075976256,
959
- "learning_rate": 4.833540831156062e-06,
960
- "loss": 0.4419,
961
  "step": 675
962
  },
963
  {
964
- "epoch": 0.5638474295190713,
965
- "grad_norm": 2.0594548949793596,
966
- "learning_rate": 4.761214001062079e-06,
967
- "loss": 0.4328,
968
  "step": 680
969
  },
970
  {
971
- "epoch": 0.5679933665008292,
972
- "grad_norm": 2.0960663433169846,
973
- "learning_rate": 4.688937218371592e-06,
974
- "loss": 0.4398,
975
  "step": 685
976
  },
977
  {
978
- "epoch": 0.572139303482587,
979
- "grad_norm": 2.1187005469004907,
980
- "learning_rate": 4.616725631649938e-06,
981
- "loss": 0.4375,
982
  "step": 690
983
  },
984
  {
985
- "epoch": 0.5762852404643449,
986
- "grad_norm": 2.1860498031954716,
987
- "learning_rate": 4.544594375797969e-06,
988
- "loss": 0.4447,
989
  "step": 695
990
  },
991
  {
992
- "epoch": 0.5804311774461028,
993
- "grad_norm": 2.066319100687514,
994
- "learning_rate": 4.472558568879901e-06,
995
- "loss": 0.4312,
996
  "step": 700
997
  },
998
  {
999
- "epoch": 0.5845771144278606,
1000
- "grad_norm": 2.155667714913593,
1001
- "learning_rate": 4.400633308954713e-06,
1002
- "loss": 0.4247,
1003
  "step": 705
1004
  },
1005
  {
1006
- "epoch": 0.5887230514096186,
1007
- "grad_norm": 2.0969883178836617,
1008
- "learning_rate": 4.3288336709117246e-06,
1009
- "loss": 0.4216,
1010
  "step": 710
1011
  },
1012
  {
1013
- "epoch": 0.5928689883913765,
1014
- "grad_norm": 2.0786568268164896,
1015
- "learning_rate": 4.257174703311032e-06,
1016
- "loss": 0.4294,
1017
  "step": 715
1018
  },
1019
  {
1020
- "epoch": 0.5970149253731343,
1021
- "grad_norm": 1.987423177907522,
1022
- "learning_rate": 4.185671425229477e-06,
1023
- "loss": 0.4156,
1024
  "step": 720
1025
  },
1026
  {
1027
- "epoch": 0.6011608623548922,
1028
- "grad_norm": 2.078141690630155,
1029
- "learning_rate": 4.11433882311277e-06,
1030
- "loss": 0.4184,
1031
  "step": 725
1032
  },
1033
  {
1034
- "epoch": 0.6053067993366501,
1035
- "grad_norm": 2.017150658693192,
1036
- "learning_rate": 4.043191847634469e-06,
1037
- "loss": 0.4115,
1038
  "step": 730
1039
  },
1040
  {
1041
- "epoch": 0.6094527363184079,
1042
- "grad_norm": 2.1300235136817895,
1043
- "learning_rate": 3.9722454105624545e-06,
1044
- "loss": 0.4287,
1045
  "step": 735
1046
  },
1047
  {
1048
- "epoch": 0.6135986733001658,
1049
- "grad_norm": 2.1125657621582943,
1050
- "learning_rate": 3.901514381633555e-06,
1051
- "loss": 0.417,
1052
  "step": 740
1053
  },
1054
  {
1055
- "epoch": 0.6177446102819237,
1056
- "grad_norm": 2.0283875800596656,
1057
- "learning_rate": 3.831013585436985e-06,
1058
- "loss": 0.4233,
1059
  "step": 745
1060
  },
1061
  {
1062
- "epoch": 0.6218905472636815,
1063
- "grad_norm": 2.137354636363815,
1064
- "learning_rate": 3.7607577983072486e-06,
1065
- "loss": 0.4164,
1066
  "step": 750
1067
  },
1068
  {
1069
- "epoch": 0.6260364842454395,
1070
- "grad_norm": 2.050231849456643,
1071
- "learning_rate": 3.6907617452271394e-06,
1072
- "loss": 0.4139,
1073
  "step": 755
1074
  },
1075
  {
1076
- "epoch": 0.6301824212271974,
1077
- "grad_norm": 2.0468536479970463,
1078
- "learning_rate": 3.621040096741526e-06,
1079
- "loss": 0.4103,
1080
  "step": 760
1081
  },
1082
  {
1083
- "epoch": 0.6343283582089553,
1084
- "grad_norm": 2.00832500150709,
1085
- "learning_rate": 3.55160746588254e-06,
1086
  "loss": 0.4047,
1087
  "step": 765
1088
  },
1089
  {
1090
- "epoch": 0.6384742951907131,
1091
- "grad_norm": 2.037614199654265,
1092
- "learning_rate": 3.482478405106803e-06,
1093
- "loss": 0.3997,
1094
  "step": 770
1095
  },
1096
  {
1097
- "epoch": 0.642620232172471,
1098
- "grad_norm": 2.0372146147706034,
1099
- "learning_rate": 3.4136674032453787e-06,
1100
- "loss": 0.395,
1101
  "step": 775
1102
  },
1103
  {
1104
- "epoch": 0.6467661691542289,
1105
- "grad_norm": 2.106013747364502,
1106
- "learning_rate": 3.34518888246703e-06,
1107
- "loss": 0.4043,
1108
  "step": 780
1109
  },
1110
  {
1111
- "epoch": 0.6509121061359867,
1112
- "grad_norm": 1.9589776388157505,
1113
- "learning_rate": 3.2770571952554674e-06,
1114
- "loss": 0.4019,
1115
  "step": 785
1116
  },
1117
  {
1118
- "epoch": 0.6550580431177446,
1119
- "grad_norm": 2.0694547079126493,
1120
- "learning_rate": 3.2092866214011984e-06,
1121
- "loss": 0.3981,
1122
  "step": 790
1123
  },
1124
  {
1125
- "epoch": 0.6592039800995025,
1126
- "grad_norm": 2.0599722406561756,
1127
- "learning_rate": 3.141891365008609e-06,
1128
- "loss": 0.3952,
1129
  "step": 795
1130
  },
1131
  {
1132
- "epoch": 0.6633499170812603,
1133
- "grad_norm": 2.0970343412952355,
1134
- "learning_rate": 3.0748855515189104e-06,
1135
- "loss": 0.3959,
1136
  "step": 800
1137
  },
1138
  {
1139
- "epoch": 0.6674958540630183,
1140
- "grad_norm": 2.273610851728786,
1141
- "learning_rate": 3.00828322474958e-06,
1142
- "loss": 0.3969,
1143
  "step": 805
1144
  },
1145
  {
1146
- "epoch": 0.6716417910447762,
1147
- "grad_norm": 2.0436745191048624,
1148
- "learning_rate": 2.942098343950891e-06,
1149
- "loss": 0.3884,
1150
  "step": 810
1151
  },
1152
  {
1153
- "epoch": 0.675787728026534,
1154
- "grad_norm": 2.072808845055863,
1155
- "learning_rate": 2.8763447808801914e-06,
1156
- "loss": 0.3975,
1157
  "step": 815
1158
  },
1159
  {
1160
- "epoch": 0.6799336650082919,
1161
- "grad_norm": 1.9533416222096471,
1162
- "learning_rate": 2.8110363168944976e-06,
1163
- "loss": 0.3895,
1164
  "step": 820
1165
  },
1166
  {
1167
- "epoch": 0.6840796019900498,
1168
- "grad_norm": 2.0748312813551637,
1169
- "learning_rate": 2.7461866400620506e-06,
1170
- "loss": 0.3835,
1171
  "step": 825
1172
  },
1173
  {
1174
- "epoch": 0.6882255389718076,
1175
- "grad_norm": 2.0084951743665136,
1176
- "learning_rate": 2.6818093422934254e-06,
1177
- "loss": 0.3875,
1178
  "step": 830
1179
  },
1180
  {
1181
- "epoch": 0.6923714759535655,
1182
- "grad_norm": 2.07333923962738,
1183
- "learning_rate": 2.617917916492776e-06,
1184
- "loss": 0.3885,
1185
  "step": 835
1186
  },
1187
  {
1188
- "epoch": 0.6965174129353234,
1189
- "grad_norm": 2.1253041540124604,
1190
- "learning_rate": 2.5545257537298497e-06,
1191
- "loss": 0.3868,
1192
  "step": 840
1193
  },
1194
  {
1195
- "epoch": 0.7006633499170812,
1196
- "grad_norm": 2.1599859472141487,
1197
- "learning_rate": 2.491646140433346e-06,
1198
- "loss": 0.3833,
1199
  "step": 845
1200
  },
1201
  {
1202
- "epoch": 0.7048092868988391,
1203
- "grad_norm": 1.988352776572986,
1204
- "learning_rate": 2.4292922556061877e-06,
1205
- "loss": 0.3873,
1206
  "step": 850
1207
  },
1208
  {
1209
- "epoch": 0.7089552238805971,
1210
- "grad_norm": 1.9733722941292888,
1211
- "learning_rate": 2.367477168063326e-06,
1212
- "loss": 0.378,
1213
  "step": 855
1214
  },
1215
  {
1216
- "epoch": 0.7131011608623549,
1217
- "grad_norm": 2.03005557972935,
1218
- "learning_rate": 2.3062138336926406e-06,
1219
- "loss": 0.3755,
1220
  "step": 860
1221
  },
1222
  {
1223
- "epoch": 0.7172470978441128,
1224
- "grad_norm": 1.9703455185572258,
1225
- "learning_rate": 2.245515092739488e-06,
1226
- "loss": 0.3826,
1227
  "step": 865
1228
  },
1229
  {
1230
- "epoch": 0.7213930348258707,
1231
- "grad_norm": 1.943797269427651,
1232
- "learning_rate": 2.185393667115513e-06,
1233
- "loss": 0.3787,
1234
  "step": 870
1235
  },
1236
  {
1237
- "epoch": 0.7255389718076285,
1238
- "grad_norm": 2.0343796970980774,
1239
- "learning_rate": 2.125862157732245e-06,
1240
- "loss": 0.3808,
1241
  "step": 875
1242
  },
1243
  {
1244
- "epoch": 0.7296849087893864,
1245
- "grad_norm": 2.0769747558322234,
1246
- "learning_rate": 2.066933041860059e-06,
1247
- "loss": 0.3739,
1248
  "step": 880
1249
  },
1250
  {
1251
- "epoch": 0.7338308457711443,
1252
- "grad_norm": 1.9514126276051975,
1253
- "learning_rate": 2.0086186705130545e-06,
1254
- "loss": 0.3763,
1255
  "step": 885
1256
  },
1257
  {
1258
- "epoch": 0.7379767827529021,
1259
- "grad_norm": 2.0280333984472474,
1260
- "learning_rate": 1.9509312658603954e-06,
1261
- "loss": 0.3644,
1262
  "step": 890
1263
  },
1264
  {
1265
- "epoch": 0.74212271973466,
1266
- "grad_norm": 1.993637787785018,
1267
- "learning_rate": 1.8938829186646484e-06,
1268
- "loss": 0.3778,
1269
  "step": 895
1270
  },
1271
  {
1272
- "epoch": 0.746268656716418,
1273
- "grad_norm": 1.999011643093531,
1274
- "learning_rate": 1.8374855857476687e-06,
1275
- "loss": 0.3671,
1276
  "step": 900
1277
  },
1278
  {
1279
- "epoch": 0.7504145936981758,
1280
- "grad_norm": 1.9814629158594814,
1281
- "learning_rate": 1.7817510874845585e-06,
1282
- "loss": 0.3654,
1283
  "step": 905
1284
  },
1285
  {
1286
- "epoch": 0.7545605306799337,
1287
- "grad_norm": 2.1351789026964565,
1288
- "learning_rate": 1.7266911053262196e-06,
1289
- "loss": 0.3627,
1290
  "step": 910
1291
  },
1292
  {
1293
- "epoch": 0.7587064676616916,
1294
- "grad_norm": 1.986179341864742,
1295
- "learning_rate": 1.6723171793510363e-06,
1296
- "loss": 0.3663,
1297
  "step": 915
1298
  },
1299
  {
1300
- "epoch": 0.7628524046434494,
1301
- "grad_norm": 2.0106970309059786,
1302
- "learning_rate": 1.6186407058461622e-06,
1303
- "loss": 0.3626,
1304
  "step": 920
1305
  },
1306
  {
1307
- "epoch": 0.7669983416252073,
1308
- "grad_norm": 1.9928576393268793,
1309
- "learning_rate": 1.5656729349189742e-06,
1310
- "loss": 0.3654,
1311
  "step": 925
1312
  },
1313
  {
1314
- "epoch": 0.7711442786069652,
1315
- "grad_norm": 2.020840965925827,
1316
- "learning_rate": 1.5134249681391416e-06,
1317
- "loss": 0.3662,
1318
  "step": 930
1319
  },
1320
  {
1321
- "epoch": 0.775290215588723,
1322
- "grad_norm": 2.1327926693330883,
1323
- "learning_rate": 1.4619077562118477e-06,
1324
- "loss": 0.3672,
1325
  "step": 935
1326
  },
1327
  {
1328
- "epoch": 0.7794361525704809,
1329
- "grad_norm": 2.1694762004743935,
1330
- "learning_rate": 1.411132096682606e-06,
1331
- "loss": 0.3665,
1332
  "step": 940
1333
  },
1334
  {
1335
- "epoch": 0.7835820895522388,
1336
- "grad_norm": 2.0850207502091083,
1337
- "learning_rate": 1.3611086316742057e-06,
1338
- "loss": 0.3601,
1339
  "step": 945
1340
  },
1341
  {
1342
- "epoch": 0.7877280265339967,
1343
- "grad_norm": 2.1205257199932803,
1344
- "learning_rate": 1.3118478456562073e-06,
1345
- "loss": 0.3657,
1346
  "step": 950
1347
  },
1348
  {
1349
- "epoch": 0.7918739635157546,
1350
- "grad_norm": 2.0711590900208567,
1351
- "learning_rate": 1.2633600632474962e-06,
1352
- "loss": 0.3728,
1353
  "step": 955
1354
  },
1355
  {
1356
- "epoch": 0.7960199004975125,
1357
- "grad_norm": 2.0253305540465454,
1358
- "learning_rate": 1.2156554470523364e-06,
1359
- "loss": 0.3588,
1360
  "step": 960
1361
  },
1362
  {
1363
- "epoch": 0.8001658374792703,
1364
- "grad_norm": 1.981963075200491,
1365
- "learning_rate": 1.1687439955303764e-06,
1366
- "loss": 0.356,
1367
  "step": 965
1368
  },
1369
  {
1370
- "epoch": 0.8043117744610282,
1371
- "grad_norm": 2.056494179090333,
1372
- "learning_rate": 1.1226355409010686e-06,
1373
- "loss": 0.3597,
1374
  "step": 970
1375
  },
1376
  {
1377
- "epoch": 0.8084577114427861,
1378
- "grad_norm": 2.0936384545536786,
1379
- "learning_rate": 1.0773397470829145e-06,
1380
- "loss": 0.3557,
1381
  "step": 975
1382
  },
1383
  {
1384
- "epoch": 0.8126036484245439,
1385
- "grad_norm": 2.111422019440366,
1386
- "learning_rate": 1.032866107667999e-06,
1387
- "loss": 0.3537,
1388
  "step": 980
1389
  },
1390
  {
1391
- "epoch": 0.8167495854063018,
1392
- "grad_norm": 1.9975991623707552,
1393
- "learning_rate": 9.892239439322243e-07,
1394
- "loss": 0.3567,
1395
  "step": 985
1396
  },
1397
  {
1398
- "epoch": 0.8208955223880597,
1399
- "grad_norm": 2.0987069308783908,
1400
- "learning_rate": 9.464224028816427e-07,
1401
- "loss": 0.3562,
1402
  "step": 990
1403
  },
1404
  {
1405
- "epoch": 0.8250414593698175,
1406
- "grad_norm": 2.0295869072161,
1407
- "learning_rate": 9.044704553353323e-07,
1408
- "loss": 0.3474,
1409
  "step": 995
1410
  },
1411
  {
1412
- "epoch": 0.8291873963515755,
1413
- "grad_norm": 2.018938720136701,
1414
- "learning_rate": 8.633768940451981e-07,
1415
- "loss": 0.3525,
1416
  "step": 1000
1417
  },
1418
  {
1419
- "epoch": 0.8333333333333334,
1420
- "grad_norm": 2.1287062697049834,
1421
- "learning_rate": 8.231503318530814e-07,
1422
- "loss": 0.3572,
1423
  "step": 1005
1424
  },
1425
  {
1426
- "epoch": 0.8374792703150912,
1427
- "grad_norm": 2.0030071560181963,
1428
- "learning_rate": 7.837991998855899e-07,
1429
- "loss": 0.3465,
1430
  "step": 1010
1431
  },
1432
  {
1433
- "epoch": 0.8416252072968491,
1434
- "grad_norm": 2.117089697184247,
1435
- "learning_rate": 7.453317457870096e-07,
1436
- "loss": 0.3524,
1437
  "step": 1015
1438
  },
1439
  {
1440
- "epoch": 0.845771144278607,
1441
- "grad_norm": 2.0518789378391293,
1442
- "learning_rate": 7.077560319906696e-07,
1443
- "loss": 0.3522,
1444
  "step": 1020
1445
  },
1446
  {
1447
- "epoch": 0.8499170812603648,
1448
- "grad_norm": 2.009728646972428,
1449
- "learning_rate": 6.710799340291341e-07,
1450
- "loss": 0.3416,
1451
  "step": 1025
1452
  },
1453
  {
1454
- "epoch": 0.8540630182421227,
1455
- "grad_norm": 2.007424635826345,
1456
- "learning_rate": 6.353111388835564e-07,
1457
- "loss": 0.3439,
1458
  "step": 1030
1459
  },
1460
  {
1461
- "epoch": 0.8582089552238806,
1462
- "grad_norm": 2.0389234580139273,
1463
- "learning_rate": 6.00457143372557e-07,
1464
- "loss": 0.3512,
1465
  "step": 1035
1466
  },
1467
  {
1468
- "epoch": 0.8623548922056384,
1469
- "grad_norm": 2.1162834962104795,
1470
- "learning_rate": 5.665252525809583e-07,
1471
- "loss": 0.3516,
1472
  "step": 1040
1473
  },
1474
  {
1475
- "epoch": 0.8665008291873963,
1476
- "grad_norm": 2.058625243809751,
1477
- "learning_rate": 5.335225783287051e-07,
1478
- "loss": 0.3498,
1479
  "step": 1045
1480
  },
1481
  {
1482
- "epoch": 0.8706467661691543,
1483
- "grad_norm": 1.9590304695730698,
1484
- "learning_rate": 5.014560376802913e-07,
1485
- "loss": 0.3518,
1486
  "step": 1050
1487
  },
1488
  {
1489
- "epoch": 0.8747927031509121,
1490
- "grad_norm": 2.0713636327974583,
1491
- "learning_rate": 4.703323514950042e-07,
1492
- "loss": 0.3458,
1493
  "step": 1055
1494
  },
1495
  {
1496
- "epoch": 0.87893864013267,
1497
- "grad_norm": 1.9932756170654953,
1498
- "learning_rate": 4.401580430182928e-07,
1499
- "loss": 0.3507,
1500
  "step": 1060
1501
  },
1502
  {
1503
- "epoch": 0.8830845771144279,
1504
- "grad_norm": 2.050397152398351,
1505
- "learning_rate": 4.1093943651455305e-07,
1506
- "loss": 0.3309,
1507
  "step": 1065
1508
  },
1509
  {
1510
- "epoch": 0.8872305140961857,
1511
- "grad_norm": 2.0116982015299123,
1512
- "learning_rate": 3.826826559416219e-07,
1513
- "loss": 0.3447,
1514
  "step": 1070
1515
  },
1516
  {
1517
- "epoch": 0.8913764510779436,
1518
- "grad_norm": 2.0174389673279833,
1519
- "learning_rate": 3.5539362366724784e-07,
1520
- "loss": 0.332,
1521
  "step": 1075
1522
  },
1523
  {
1524
- "epoch": 0.8955223880597015,
1525
- "grad_norm": 2.050607162918268,
1526
- "learning_rate": 3.290780592278148e-07,
1527
- "loss": 0.3442,
1528
  "step": 1080
1529
  },
1530
  {
1531
- "epoch": 0.8996683250414593,
1532
- "grad_norm": 2.049143185917848,
1533
- "learning_rate": 3.0374147812958387e-07,
1534
- "loss": 0.3458,
1535
  "step": 1085
1536
  },
1537
  {
1538
- "epoch": 0.9038142620232172,
1539
- "grad_norm": 2.100447867253057,
1540
- "learning_rate": 2.7938919069268654e-07,
1541
- "loss": 0.349,
1542
  "step": 1090
1543
  },
1544
  {
1545
- "epoch": 0.9079601990049752,
1546
- "grad_norm": 2.0177747343002967,
1547
- "learning_rate": 2.5602630093813253e-07,
1548
- "loss": 0.3434,
1549
  "step": 1095
1550
  },
1551
  {
1552
- "epoch": 0.912106135986733,
1553
- "grad_norm": 2.0945058270699066,
1554
- "learning_rate": 2.3365770551805223e-07,
1555
- "loss": 0.337,
1556
  "step": 1100
1557
  },
1558
  {
1559
- "epoch": 0.9162520729684909,
1560
- "grad_norm": 2.0268787956916015,
1561
- "learning_rate": 2.1228809268940164e-07,
1562
- "loss": 0.3392,
1563
  "step": 1105
1564
  },
1565
  {
1566
- "epoch": 0.9203980099502488,
1567
- "grad_norm": 2.0520304126033646,
1568
- "learning_rate": 1.919219413313478e-07,
1569
- "loss": 0.3368,
1570
  "step": 1110
1571
  },
1572
  {
1573
- "epoch": 0.9245439469320066,
1574
- "grad_norm": 1.9890419081513053,
1575
- "learning_rate": 1.725635200065323e-07,
1576
- "loss": 0.3458,
1577
  "step": 1115
1578
  },
1579
  {
1580
- "epoch": 0.9286898839137645,
1581
- "grad_norm": 2.029134597632683,
1582
- "learning_rate": 1.5421688606642392e-07,
1583
- "loss": 0.3421,
1584
  "step": 1120
1585
  },
1586
  {
1587
- "epoch": 0.9328358208955224,
1588
- "grad_norm": 2.102444347839262,
1589
- "learning_rate": 1.3688588480092913e-07,
1590
- "loss": 0.3359,
1591
  "step": 1125
1592
  },
1593
  {
1594
- "epoch": 0.9369817578772802,
1595
- "grad_norm": 2.0301370927071685,
1596
- "learning_rate": 1.205741486324552e-07,
1597
- "loss": 0.3321,
1598
  "step": 1130
1599
  },
1600
  {
1601
- "epoch": 0.9411276948590381,
1602
- "grad_norm": 2.0653340548808314,
1603
- "learning_rate": 1.0528509635458873e-07,
1604
- "loss": 0.3366,
1605
  "step": 1135
1606
  },
1607
  {
1608
- "epoch": 0.945273631840796,
1609
- "grad_norm": 2.095296286821954,
1610
- "learning_rate": 9.102193241554757e-08,
1611
- "loss": 0.3441,
1612
  "step": 1140
1613
  },
1614
  {
1615
- "epoch": 0.9494195688225538,
1616
- "grad_norm": 2.0208009819161634,
1617
- "learning_rate": 7.778764624655433e-08,
1618
- "loss": 0.3423,
1619
  "step": 1145
1620
  },
1621
  {
1622
- "epoch": 0.9535655058043118,
1623
- "grad_norm": 2.0824764268805898,
1624
- "learning_rate": 6.558501163527964e-08,
1625
- "loss": 0.3384,
1626
  "step": 1150
1627
  },
1628
  {
1629
- "epoch": 0.9577114427860697,
1630
- "grad_norm": 2.25374621388535,
1631
- "learning_rate": 5.44165861444812e-08,
1632
- "loss": 0.3407,
1633
  "step": 1155
1634
  },
1635
  {
1636
- "epoch": 0.9618573797678275,
1637
- "grad_norm": 2.119777820031747,
1638
- "learning_rate": 4.428471057596362e-08,
1639
- "loss": 0.3411,
1640
  "step": 1160
1641
  },
1642
  {
1643
- "epoch": 0.9660033167495854,
1644
- "grad_norm": 2.2374805577258012,
1645
- "learning_rate": 3.519150847996422e-08,
1646
- "loss": 0.3384,
1647
  "step": 1165
1648
  },
1649
  {
1650
- "epoch": 0.9701492537313433,
1651
- "grad_norm": 2.018401744262374,
1652
- "learning_rate": 2.713888571007739e-08,
1653
- "loss": 0.3381,
1654
  "step": 1170
1655
  },
1656
  {
1657
- "epoch": 0.9742951907131011,
1658
- "grad_norm": 1.9890275712431682,
1659
- "learning_rate": 2.012853002380466e-08,
1660
- "loss": 0.3367,
1661
  "step": 1175
1662
  },
1663
  {
1664
- "epoch": 0.978441127694859,
1665
- "grad_norm": 2.0156279224759746,
1666
- "learning_rate": 1.4161910728816009e-08,
1667
- "loss": 0.3354,
1668
  "step": 1180
1669
  },
1670
  {
1671
- "epoch": 0.9825870646766169,
1672
- "grad_norm": 2.060081895158268,
1673
- "learning_rate": 9.240278374995637e-09,
1674
- "loss": 0.338,
1675
  "step": 1185
1676
  },
1677
  {
1678
- "epoch": 0.9867330016583747,
1679
- "grad_norm": 1.9543352226188724,
1680
- "learning_rate": 5.364664492337746e-09,
1681
- "loss": 0.339,
1682
  "step": 1190
1683
  },
1684
  {
1685
- "epoch": 0.9908789386401327,
1686
- "grad_norm": 2.1043063879668145,
1687
- "learning_rate": 2.5358813747500266e-09,
1688
- "loss": 0.335,
1689
  "step": 1195
1690
  },
1691
  {
1692
- "epoch": 0.9950248756218906,
1693
- "grad_norm": 2.040091056235316,
1694
- "learning_rate": 7.545219097987444e-10,
1695
- "loss": 0.3364,
1696
  "step": 1200
1697
  },
1698
  {
1699
- "epoch": 0.9991708126036484,
1700
- "grad_norm": 1.9127034435135999,
1701
- "learning_rate": 2.0959454449243076e-11,
1702
- "loss": 0.3499,
1703
  "step": 1205
1704
  },
1705
  {
1706
- "epoch": 1.0,
1707
- "eval_loss": 0.3398154377937317,
1708
- "eval_runtime": 123.1865,
1709
- "eval_samples_per_second": 3.166,
1710
- "eval_steps_per_second": 0.796,
1711
- "step": 1206
1712
  },
1713
  {
1714
- "epoch": 1.0,
1715
- "step": 1206,
1716
- "total_flos": 252459519836160.0,
1717
- "train_loss": 0.5258198334506495,
1718
- "train_runtime": 27352.0353,
1719
- "train_samples_per_second": 1.411,
1720
- "train_steps_per_second": 0.044
1721
  }
1722
  ],
1723
  "logging_steps": 5,
1724
- "max_steps": 1206,
1725
  "num_input_tokens_seen": 0,
1726
  "num_train_epochs": 1,
1727
  "save_steps": 100,
@@ -1737,7 +1737,7 @@
1737
  "attributes": {}
1738
  }
1739
  },
1740
- "total_flos": 252459519836160.0,
1741
  "train_batch_size": 2,
1742
  "trial_name": null,
1743
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.999792917788362,
5
  "eval_steps": 500,
6
+ "global_step": 1207,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0008283288465520812,
13
+ "grad_norm": 24.23784679529398,
14
  "learning_rate": 8.264462809917357e-08,
15
+ "loss": 1.4304,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.0041416442327604054,
20
+ "grad_norm": 23.234915420644615,
21
  "learning_rate": 4.132231404958678e-07,
22
+ "loss": 1.4144,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.008283288465520811,
27
+ "grad_norm": 8.222025109804092,
28
  "learning_rate": 8.264462809917356e-07,
29
+ "loss": 1.307,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.012424932698281217,
34
+ "grad_norm": 10.133789775285283,
35
  "learning_rate": 1.2396694214876035e-06,
36
+ "loss": 1.1636,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.016566576931041622,
41
+ "grad_norm": 2.9246570997342918,
42
  "learning_rate": 1.6528925619834712e-06,
43
+ "loss": 1.0151,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.02070822116380203,
48
+ "grad_norm": 2.7388624531591717,
49
  "learning_rate": 2.066115702479339e-06,
50
+ "loss": 0.9484,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.024849865396562434,
55
+ "grad_norm": 2.365875950285307,
56
  "learning_rate": 2.479338842975207e-06,
57
+ "loss": 0.9332,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.028991509629322842,
62
+ "grad_norm": 2.374922722027774,
63
  "learning_rate": 2.8925619834710743e-06,
64
+ "loss": 0.9044,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.033133153862083244,
69
+ "grad_norm": 2.238010214125612,
70
  "learning_rate": 3.3057851239669424e-06,
71
+ "loss": 0.8913,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.03727479809484365,
76
+ "grad_norm": 2.4490973058865078,
77
  "learning_rate": 3.71900826446281e-06,
78
+ "loss": 0.8833,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.04141644232760406,
83
+ "grad_norm": 2.469516335084805,
84
  "learning_rate": 4.132231404958678e-06,
85
+ "loss": 0.8813,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.04555808656036447,
90
+ "grad_norm": 2.2371905595594495,
91
  "learning_rate": 4.5454545454545455e-06,
92
+ "loss": 0.8533,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.04969973079312487,
97
+ "grad_norm": 2.409737572391393,
98
  "learning_rate": 4.958677685950414e-06,
99
+ "loss": 0.8581,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.05384137502588528,
104
+ "grad_norm": 2.2291530452515977,
105
  "learning_rate": 5.371900826446281e-06,
106
+ "loss": 0.8519,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.057983019258645685,
111
+ "grad_norm": 2.35453266908653,
112
  "learning_rate": 5.785123966942149e-06,
113
+ "loss": 0.8515,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.062124663491406086,
118
+ "grad_norm": 2.336463988259032,
119
  "learning_rate": 6.198347107438017e-06,
120
+ "loss": 0.8295,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.06626630772416649,
125
+ "grad_norm": 2.4980329883385624,
126
  "learning_rate": 6.611570247933885e-06,
127
+ "loss": 0.832,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.0704079519569269,
132
+ "grad_norm": 2.6275854974083384,
133
  "learning_rate": 7.0247933884297525e-06,
134
+ "loss": 0.816,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.0745495961896873,
139
+ "grad_norm": 2.684536070785031,
140
  "learning_rate": 7.43801652892562e-06,
141
+ "loss": 0.8162,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.07869124042244771,
146
+ "grad_norm": 2.805145187507444,
147
  "learning_rate": 7.851239669421489e-06,
148
+ "loss": 0.8309,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.08283288465520812,
153
+ "grad_norm": 2.4685576686167874,
154
  "learning_rate": 8.264462809917356e-06,
155
+ "loss": 0.8155,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.08697452888796853,
160
+ "grad_norm": 2.540099419306748,
161
  "learning_rate": 8.677685950413224e-06,
162
+ "loss": 0.8071,
163
  "step": 105
164
  },
165
  {
166
+ "epoch": 0.09111617312072894,
167
+ "grad_norm": 2.562872480103855,
168
  "learning_rate": 9.090909090909091e-06,
169
+ "loss": 0.8029,
170
  "step": 110
171
  },
172
  {
173
+ "epoch": 0.09525781735348933,
174
+ "grad_norm": 2.5970244377170313,
175
  "learning_rate": 9.50413223140496e-06,
176
+ "loss": 0.7873,
177
  "step": 115
178
  },
179
  {
180
+ "epoch": 0.09939946158624974,
181
+ "grad_norm": 2.4308464508943346,
182
  "learning_rate": 9.917355371900828e-06,
183
+ "loss": 0.7915,
184
  "step": 120
185
  },
186
  {
187
+ "epoch": 0.10354110581901015,
188
+ "grad_norm": 2.6905207268547158,
189
+ "learning_rate": 9.999665269535307e-06,
190
+ "loss": 0.7993,
191
  "step": 125
192
  },
193
  {
194
+ "epoch": 0.10768275005177055,
195
+ "grad_norm": 34.133455991097065,
196
+ "learning_rate": 9.998305503833872e-06,
197
+ "loss": 0.8009,
198
  "step": 130
199
  },
200
  {
201
+ "epoch": 0.11182439428453096,
202
+ "grad_norm": 2.3166119518346866,
203
+ "learning_rate": 9.995900066492902e-06,
204
+ "loss": 0.785,
205
  "step": 135
206
  },
207
  {
208
+ "epoch": 0.11596603851729137,
209
+ "grad_norm": 2.5556998820575076,
210
+ "learning_rate": 9.992449460742464e-06,
211
+ "loss": 0.8134,
212
  "step": 140
213
  },
214
  {
215
+ "epoch": 0.12010768275005176,
216
+ "grad_norm": 2.615095977165256,
217
+ "learning_rate": 9.98795440846732e-06,
218
+ "loss": 0.7898,
219
  "step": 145
220
  },
221
  {
222
+ "epoch": 0.12424932698281217,
223
+ "grad_norm": 2.729536225853297,
224
+ "learning_rate": 9.982415850055902e-06,
225
+ "loss": 0.7681,
226
  "step": 150
227
  },
228
  {
229
+ "epoch": 0.12839097121557258,
230
+ "grad_norm": 3.895339838479258,
231
+ "learning_rate": 9.975834944203581e-06,
232
+ "loss": 0.7503,
233
  "step": 155
234
  },
235
  {
236
+ "epoch": 0.13253261544833297,
237
+ "grad_norm": 2.4875917416718187,
238
+ "learning_rate": 9.968213067670265e-06,
239
+ "loss": 0.7632,
240
  "step": 160
241
  },
242
  {
243
+ "epoch": 0.1366742596810934,
244
+ "grad_norm": 2.5249141397756847,
245
+ "learning_rate": 9.959551814992364e-06,
246
+ "loss": 0.7444,
247
  "step": 165
248
  },
249
  {
250
+ "epoch": 0.1408159039138538,
251
+ "grad_norm": 2.6613194516011656,
252
+ "learning_rate": 9.949852998149217e-06,
253
+ "loss": 0.7592,
254
  "step": 170
255
  },
256
  {
257
+ "epoch": 0.1449575481466142,
258
+ "grad_norm": 2.535010612771426,
259
+ "learning_rate": 9.939118646184007e-06,
260
+ "loss": 0.7625,
261
  "step": 175
262
  },
263
  {
264
+ "epoch": 0.1490991923793746,
265
+ "grad_norm": 2.658211254916015,
266
+ "learning_rate": 9.927351004779275e-06,
267
+ "loss": 0.7405,
268
  "step": 180
269
  },
270
  {
271
+ "epoch": 0.15324083661213503,
272
+ "grad_norm": 2.343186435782198,
273
+ "learning_rate": 9.914552535787122e-06,
274
+ "loss": 0.7285,
275
  "step": 185
276
  },
277
  {
278
+ "epoch": 0.15738248084489542,
279
+ "grad_norm": 2.4661024566837653,
280
+ "learning_rate": 9.900725916714157e-06,
281
+ "loss": 0.7405,
282
  "step": 190
283
  },
284
  {
285
+ "epoch": 0.16152412507765582,
286
+ "grad_norm": 2.7732706905434497,
287
+ "learning_rate": 9.885874040161373e-06,
288
+ "loss": 0.7373,
289
  "step": 195
290
  },
291
  {
292
+ "epoch": 0.16566576931041624,
293
+ "grad_norm": 2.523332136572105,
294
+ "learning_rate": 9.87000001321898e-06,
295
+ "loss": 0.7261,
296
  "step": 200
297
  },
298
  {
299
+ "epoch": 0.16980741354317663,
300
+ "grad_norm": 2.2311284634557937,
301
+ "learning_rate": 9.853107156816393e-06,
302
+ "loss": 0.7184,
303
  "step": 205
304
  },
305
  {
306
+ "epoch": 0.17394905777593705,
307
+ "grad_norm": 2.379531410344561,
308
+ "learning_rate": 9.835199005027477e-06,
309
+ "loss": 0.7144,
310
  "step": 210
311
  },
312
  {
313
+ "epoch": 0.17809070200869745,
314
+ "grad_norm": 2.3058139982021215,
315
+ "learning_rate": 9.816279304331202e-06,
316
+ "loss": 0.7252,
317
  "step": 215
318
  },
319
  {
320
+ "epoch": 0.18223234624145787,
321
+ "grad_norm": 2.2361882677362974,
322
+ "learning_rate": 9.79635201282785e-06,
323
+ "loss": 0.7151,
324
  "step": 220
325
  },
326
  {
327
+ "epoch": 0.18637399047421827,
328
+ "grad_norm": 2.7702057401479006,
329
+ "learning_rate": 9.775421299410977e-06,
330
+ "loss": 0.7045,
331
  "step": 225
332
  },
333
  {
334
+ "epoch": 0.19051563470697866,
335
+ "grad_norm": 2.5524695305725404,
336
+ "learning_rate": 9.753491542895237e-06,
337
+ "loss": 0.715,
338
  "step": 230
339
  },
340
  {
341
+ "epoch": 0.19465727893973908,
342
+ "grad_norm": 2.4487445215481305,
343
+ "learning_rate": 9.730567331100333e-06,
344
+ "loss": 0.7082,
345
  "step": 235
346
  },
347
  {
348
+ "epoch": 0.19879892317249948,
349
+ "grad_norm": 2.2482641063346116,
350
+ "learning_rate": 9.706653459891207e-06,
351
+ "loss": 0.699,
352
  "step": 240
353
  },
354
  {
355
+ "epoch": 0.2029405674052599,
356
+ "grad_norm": 2.451295868720965,
357
+ "learning_rate": 9.681754932174719e-06,
358
+ "loss": 0.6913,
359
  "step": 245
360
  },
361
  {
362
+ "epoch": 0.2070822116380203,
363
+ "grad_norm": 2.56626439770444,
364
+ "learning_rate": 9.655876956853025e-06,
365
+ "loss": 0.6819,
366
  "step": 250
367
  },
368
  {
369
+ "epoch": 0.2112238558707807,
370
+ "grad_norm": 2.490881018154125,
371
+ "learning_rate": 9.629024947733836e-06,
372
+ "loss": 0.6843,
373
  "step": 255
374
  },
375
  {
376
+ "epoch": 0.2153655001035411,
377
+ "grad_norm": 2.4655094247796283,
378
+ "learning_rate": 9.601204522397826e-06,
379
+ "loss": 0.6874,
380
  "step": 260
381
  },
382
  {
383
+ "epoch": 0.2195071443363015,
384
+ "grad_norm": 2.6029112443690625,
385
+ "learning_rate": 9.572421501023403e-06,
386
+ "loss": 0.6773,
387
  "step": 265
388
  },
389
  {
390
+ "epoch": 0.22364878856906192,
391
+ "grad_norm": 2.303292362769761,
392
+ "learning_rate": 9.5426819051691e-06,
393
+ "loss": 0.6669,
394
  "step": 270
395
  },
396
  {
397
+ "epoch": 0.22779043280182232,
398
+ "grad_norm": 2.2947735744081568,
399
+ "learning_rate": 9.511991956513828e-06,
400
+ "loss": 0.6636,
401
  "step": 275
402
  },
403
  {
404
+ "epoch": 0.23193207703458274,
405
+ "grad_norm": 2.3561731246407045,
406
+ "learning_rate": 9.480358075555278e-06,
407
+ "loss": 0.6671,
408
  "step": 280
409
  },
410
  {
411
+ "epoch": 0.23607372126734313,
412
+ "grad_norm": 2.4666706522590442,
413
+ "learning_rate": 9.447786880266706e-06,
414
+ "loss": 0.6618,
415
  "step": 285
416
  },
417
  {
418
+ "epoch": 0.24021536550010353,
419
+ "grad_norm": 2.186824941231579,
420
+ "learning_rate": 9.414285184712432e-06,
421
+ "loss": 0.6619,
422
  "step": 290
423
  },
424
  {
425
+ "epoch": 0.24435700973286395,
426
+ "grad_norm": 2.1988625035941047,
427
+ "learning_rate": 9.37985999762229e-06,
428
  "loss": 0.6473,
429
  "step": 295
430
  },
431
  {
432
+ "epoch": 0.24849865396562434,
433
+ "grad_norm": 2.43262245251786,
434
+ "learning_rate": 9.344518520925377e-06,
435
+ "loss": 0.6534,
436
  "step": 300
437
  },
438
  {
439
+ "epoch": 0.25264029819838474,
440
+ "grad_norm": 2.3140467166469527,
441
+ "learning_rate": 9.308268148243355e-06,
442
+ "loss": 0.637,
443
  "step": 305
444
  },
445
  {
446
+ "epoch": 0.25678194243114516,
447
+ "grad_norm": 2.5057247056088996,
448
+ "learning_rate": 9.271116463343692e-06,
449
+ "loss": 0.6417,
450
  "step": 310
451
  },
452
  {
453
+ "epoch": 0.2609235866639056,
454
+ "grad_norm": 2.288973054378257,
455
+ "learning_rate": 9.23307123855307e-06,
456
+ "loss": 0.663,
457
  "step": 315
458
  },
459
  {
460
+ "epoch": 0.26506523089666595,
461
+ "grad_norm": 2.416778922999839,
462
+ "learning_rate": 9.194140433131397e-06,
463
+ "loss": 0.6552,
464
  "step": 320
465
  },
466
  {
467
+ "epoch": 0.26920687512942637,
468
+ "grad_norm": 2.2651292744956244,
469
+ "learning_rate": 9.154332191606671e-06,
470
+ "loss": 0.6267,
471
  "step": 325
472
  },
473
  {
474
+ "epoch": 0.2733485193621868,
475
+ "grad_norm": 2.3801866420145954,
476
+ "learning_rate": 9.113654842071114e-06,
477
+ "loss": 0.6306,
478
  "step": 330
479
  },
480
  {
481
+ "epoch": 0.2774901635949472,
482
+ "grad_norm": 2.5824252886751964,
483
+ "learning_rate": 9.072116894438885e-06,
484
+ "loss": 0.6369,
485
  "step": 335
486
  },
487
  {
488
+ "epoch": 0.2816318078277076,
489
+ "grad_norm": 2.3287880261931875,
490
+ "learning_rate": 9.029727038665765e-06,
491
+ "loss": 0.6252,
492
  "step": 340
493
  },
494
  {
495
+ "epoch": 0.285773452060468,
496
+ "grad_norm": 2.3253013276198358,
497
+ "learning_rate": 8.986494142931168e-06,
498
+ "loss": 0.6165,
499
  "step": 345
500
  },
501
  {
502
+ "epoch": 0.2899150962932284,
503
+ "grad_norm": 2.3141576363210086,
504
+ "learning_rate": 8.94242725178288e-06,
505
+ "loss": 0.6003,
506
  "step": 350
507
  },
508
  {
509
+ "epoch": 0.2940567405259888,
510
+ "grad_norm": 2.2453531818296595,
511
+ "learning_rate": 8.89753558424488e-06,
512
+ "loss": 0.6,
513
  "step": 355
514
  },
515
  {
516
+ "epoch": 0.2981983847587492,
517
+ "grad_norm": 2.224520930970933,
518
+ "learning_rate": 8.851828531888692e-06,
519
+ "loss": 0.6117,
520
  "step": 360
521
  },
522
  {
523
+ "epoch": 0.30234002899150964,
524
+ "grad_norm": 2.312078602771557,
525
+ "learning_rate": 8.805315656868587e-06,
526
+ "loss": 0.6067,
527
  "step": 365
528
  },
529
  {
530
+ "epoch": 0.30648167322427006,
531
+ "grad_norm": 2.287152946702887,
532
+ "learning_rate": 8.75800668992117e-06,
533
+ "loss": 0.5979,
534
  "step": 370
535
  },
536
  {
537
+ "epoch": 0.3106233174570304,
538
+ "grad_norm": 2.0335651194660596,
539
+ "learning_rate": 8.709911528329623e-06,
540
+ "loss": 0.5911,
541
  "step": 375
542
  },
543
  {
544
+ "epoch": 0.31476496168979085,
545
+ "grad_norm": 2.190338892871162,
546
+ "learning_rate": 8.661040233853166e-06,
547
+ "loss": 0.5884,
548
  "step": 380
549
  },
550
  {
551
+ "epoch": 0.31890660592255127,
552
+ "grad_norm": 2.198737460879657,
553
+ "learning_rate": 8.611403030622074e-06,
554
+ "loss": 0.578,
555
  "step": 385
556
  },
557
  {
558
+ "epoch": 0.32304825015531163,
559
+ "grad_norm": 2.320069378855735,
560
+ "learning_rate": 8.561010302998734e-06,
561
+ "loss": 0.5809,
562
  "step": 390
563
  },
564
  {
565
+ "epoch": 0.32718989438807206,
566
+ "grad_norm": 2.1718618035260793,
567
+ "learning_rate": 8.509872593405189e-06,
568
+ "loss": 0.5834,
569
  "step": 395
570
  },
571
  {
572
+ "epoch": 0.3313315386208325,
573
+ "grad_norm": 2.306249404711425,
574
+ "learning_rate": 8.458000600117604e-06,
575
+ "loss": 0.5817,
576
  "step": 400
577
  },
578
  {
579
+ "epoch": 0.3354731828535929,
580
+ "grad_norm": 2.2320669776692377,
581
+ "learning_rate": 8.40540517502813e-06,
582
+ "loss": 0.579,
583
  "step": 405
584
  },
585
  {
586
+ "epoch": 0.33961482708635327,
587
+ "grad_norm": 2.380541232391175,
588
+ "learning_rate": 8.35209732137463e-06,
589
+ "loss": 0.5657,
590
  "step": 410
591
  },
592
  {
593
+ "epoch": 0.3437564713191137,
594
+ "grad_norm": 2.270445952132619,
595
+ "learning_rate": 8.298088191438753e-06,
596
+ "loss": 0.5569,
597
  "step": 415
598
  },
599
  {
600
+ "epoch": 0.3478981155518741,
601
+ "grad_norm": 2.353449916475927,
602
+ "learning_rate": 8.243389084212808e-06,
603
+ "loss": 0.5642,
604
  "step": 420
605
  },
606
  {
607
+ "epoch": 0.3520397597846345,
608
+ "grad_norm": 2.321086875711968,
609
+ "learning_rate": 8.188011443035962e-06,
610
+ "loss": 0.5519,
611
  "step": 425
612
  },
613
  {
614
+ "epoch": 0.3561814040173949,
615
+ "grad_norm": 2.112886796528374,
616
+ "learning_rate": 8.131966853200226e-06,
617
+ "loss": 0.561,
618
  "step": 430
619
  },
620
  {
621
+ "epoch": 0.3603230482501553,
622
+ "grad_norm": 2.354896774745211,
623
+ "learning_rate": 8.075267039526764e-06,
624
+ "loss": 0.5586,
625
  "step": 435
626
  },
627
  {
628
+ "epoch": 0.36446469248291574,
629
+ "grad_norm": 2.291143610369071,
630
+ "learning_rate": 8.017923863912989e-06,
631
+ "loss": 0.5491,
632
  "step": 440
633
  },
634
  {
635
+ "epoch": 0.3686063367156761,
636
+ "grad_norm": 2.2300308493171115,
637
+ "learning_rate": 7.959949322850994e-06,
638
+ "loss": 0.5562,
639
  "step": 445
640
  },
641
  {
642
+ "epoch": 0.37274798094843653,
643
+ "grad_norm": 2.28359859552775,
644
+ "learning_rate": 7.901355544917827e-06,
645
+ "loss": 0.5494,
646
  "step": 450
647
  },
648
  {
649
+ "epoch": 0.37688962518119695,
650
+ "grad_norm": 2.251157158354555,
651
+ "learning_rate": 7.842154788238124e-06,
652
+ "loss": 0.5424,
653
  "step": 455
654
  },
655
  {
656
+ "epoch": 0.3810312694139573,
657
+ "grad_norm": 2.285389132836112,
658
+ "learning_rate": 7.782359437919644e-06,
659
+ "loss": 0.5423,
660
  "step": 460
661
  },
662
  {
663
+ "epoch": 0.38517291364671774,
664
+ "grad_norm": 2.578954308129232,
665
+ "learning_rate": 7.721982003462255e-06,
666
+ "loss": 0.5335,
667
  "step": 465
668
  },
669
  {
670
+ "epoch": 0.38931455787947816,
671
+ "grad_norm": 2.1334475270465774,
672
+ "learning_rate": 7.661035116140856e-06,
673
+ "loss": 0.5342,
674
  "step": 470
675
  },
676
  {
677
+ "epoch": 0.3934562021122386,
678
+ "grad_norm": 2.3233034598991797,
679
+ "learning_rate": 7.599531526362873e-06,
680
+ "loss": 0.5358,
681
  "step": 475
682
  },
683
  {
684
+ "epoch": 0.39759784634499895,
685
+ "grad_norm": 2.2758511904313785,
686
+ "learning_rate": 7.537484101000787e-06,
687
+ "loss": 0.532,
688
  "step": 480
689
  },
690
  {
691
+ "epoch": 0.4017394905777594,
692
+ "grad_norm": 2.151297761784936,
693
+ "learning_rate": 7.474905820700334e-06,
694
+ "loss": 0.5135,
695
  "step": 485
696
  },
697
  {
698
+ "epoch": 0.4058811348105198,
699
+ "grad_norm": 2.3747508627507066,
700
+ "learning_rate": 7.411809777164873e-06,
701
+ "loss": 0.5306,
702
  "step": 490
703
  },
704
  {
705
+ "epoch": 0.41002277904328016,
706
+ "grad_norm": 2.202301523886571,
707
+ "learning_rate": 7.3482091704165405e-06,
708
+ "loss": 0.5247,
709
  "step": 495
710
  },
711
  {
712
+ "epoch": 0.4141644232760406,
713
+ "grad_norm": 2.153444078468182,
714
+ "learning_rate": 7.284117306034733e-06,
715
+ "loss": 0.5243,
716
  "step": 500
717
  },
718
  {
719
+ "epoch": 0.418306067508801,
720
+ "grad_norm": 2.171064236273241,
721
+ "learning_rate": 7.219547592372512e-06,
722
+ "loss": 0.5187,
723
  "step": 505
724
  },
725
  {
726
+ "epoch": 0.4224477117415614,
727
+ "grad_norm": 2.2667901238143586,
728
+ "learning_rate": 7.15451353775151e-06,
729
+ "loss": 0.5126,
730
  "step": 510
731
  },
732
  {
733
+ "epoch": 0.4265893559743218,
734
+ "grad_norm": 2.4217664685492997,
735
+ "learning_rate": 7.089028747635908e-06,
736
+ "loss": 0.5166,
737
  "step": 515
738
  },
739
  {
740
+ "epoch": 0.4307310002070822,
741
+ "grad_norm": 2.269198948091921,
742
+ "learning_rate": 7.023106921786118e-06,
743
+ "loss": 0.5102,
744
  "step": 520
745
  },
746
  {
747
+ "epoch": 0.43487264443984264,
748
+ "grad_norm": 2.3389291631103717,
749
+ "learning_rate": 6.956761851392706e-06,
750
+ "loss": 0.5147,
751
  "step": 525
752
  },
753
  {
754
+ "epoch": 0.439014288672603,
755
+ "grad_norm": 2.130307708383959,
756
+ "learning_rate": 6.890007416191209e-06,
757
+ "loss": 0.5,
758
  "step": 530
759
  },
760
  {
761
+ "epoch": 0.4431559329053634,
762
+ "grad_norm": 2.2319755819000133,
763
+ "learning_rate": 6.822857581558423e-06,
764
+ "loss": 0.5031,
765
  "step": 535
766
  },
767
  {
768
+ "epoch": 0.44729757713812385,
769
+ "grad_norm": 2.201034276249066,
770
+ "learning_rate": 6.7553263955907755e-06,
771
+ "loss": 0.5003,
772
  "step": 540
773
  },
774
  {
775
+ "epoch": 0.4514392213708842,
776
+ "grad_norm": 2.1906710526713877,
777
+ "learning_rate": 6.687427986165379e-06,
778
+ "loss": 0.498,
779
  "step": 545
780
  },
781
  {
782
+ "epoch": 0.45558086560364464,
783
+ "grad_norm": 2.421545219253794,
784
+ "learning_rate": 6.6191765579844205e-06,
785
+ "loss": 0.4996,
786
  "step": 550
787
  },
788
  {
789
+ "epoch": 0.45972250983640506,
790
+ "grad_norm": 2.2909926333329977,
791
+ "learning_rate": 6.550586389603451e-06,
792
+ "loss": 0.4969,
793
  "step": 555
794
  },
795
  {
796
+ "epoch": 0.4638641540691655,
797
+ "grad_norm": 2.3444032170833116,
798
+ "learning_rate": 6.481671830444243e-06,
799
+ "loss": 0.4945,
800
  "step": 560
801
  },
802
  {
803
+ "epoch": 0.46800579830192585,
804
+ "grad_norm": 2.1725157981177916,
805
+ "learning_rate": 6.412447297792818e-06,
806
+ "loss": 0.4863,
807
  "step": 565
808
  },
809
  {
810
+ "epoch": 0.47214744253468627,
811
+ "grad_norm": 2.1573058335782753,
812
+ "learning_rate": 6.3429272737832726e-06,
813
+ "loss": 0.4891,
814
  "step": 570
815
  },
816
  {
817
+ "epoch": 0.4762890867674467,
818
+ "grad_norm": 2.2084194407586373,
819
+ "learning_rate": 6.273126302368037e-06,
820
+ "loss": 0.487,
821
  "step": 575
822
  },
823
  {
824
+ "epoch": 0.48043073100020706,
825
+ "grad_norm": 2.21344464284995,
826
+ "learning_rate": 6.203058986275207e-06,
827
+ "loss": 0.4857,
828
  "step": 580
829
  },
830
  {
831
+ "epoch": 0.4845723752329675,
832
+ "grad_norm": 2.2203608524225666,
833
+ "learning_rate": 6.132739983953579e-06,
834
+ "loss": 0.4828,
835
  "step": 585
836
  },
837
  {
838
+ "epoch": 0.4887140194657279,
839
+ "grad_norm": 2.113371175419138,
840
+ "learning_rate": 6.062184006506027e-06,
841
+ "loss": 0.4826,
842
  "step": 590
843
  },
844
  {
845
+ "epoch": 0.4928556636984883,
846
+ "grad_norm": 2.3454680885393464,
847
+ "learning_rate": 5.991405814611855e-06,
848
+ "loss": 0.4676,
849
  "step": 595
850
  },
851
  {
852
+ "epoch": 0.4969973079312487,
853
+ "grad_norm": 2.321964160860843,
854
+ "learning_rate": 5.920420215438794e-06,
855
+ "loss": 0.4737,
856
  "step": 600
857
  },
858
  {
859
+ "epoch": 0.5011389521640092,
860
+ "grad_norm": 2.1991634408721943,
861
+ "learning_rate": 5.849242059545259e-06,
862
+ "loss": 0.465,
863
  "step": 605
864
  },
865
  {
866
+ "epoch": 0.5052805963967695,
867
+ "grad_norm": 2.165800122945675,
868
+ "learning_rate": 5.777886237773542e-06,
869
+ "loss": 0.4623,
870
  "step": 610
871
  },
872
  {
873
+ "epoch": 0.5094222406295299,
874
+ "grad_norm": 2.1403595970614133,
875
+ "learning_rate": 5.706367678134562e-06,
876
+ "loss": 0.4767,
877
  "step": 615
878
  },
879
  {
880
+ "epoch": 0.5135638848622903,
881
+ "grad_norm": 2.1427662247598986,
882
+ "learning_rate": 5.634701342684852e-06,
883
+ "loss": 0.4607,
884
  "step": 620
885
  },
886
  {
887
+ "epoch": 0.5177055290950507,
888
+ "grad_norm": 2.0626908104196673,
889
+ "learning_rate": 5.562902224396416e-06,
890
+ "loss": 0.4617,
891
  "step": 625
892
  },
893
  {
894
+ "epoch": 0.5218471733278112,
895
+ "grad_norm": 2.0098208614700326,
896
+ "learning_rate": 5.49098534402012e-06,
897
+ "loss": 0.4618,
898
  "step": 630
899
  },
900
  {
901
+ "epoch": 0.5259888175605716,
902
+ "grad_norm": 2.0735743487283607,
903
+ "learning_rate": 5.418965746943281e-06,
904
+ "loss": 0.459,
905
  "step": 635
906
  },
907
  {
908
+ "epoch": 0.5301304617933319,
909
+ "grad_norm": 2.024087007627894,
910
+ "learning_rate": 5.34685850004208e-06,
911
+ "loss": 0.4539,
912
  "step": 640
913
  },
914
  {
915
+ "epoch": 0.5342721060260923,
916
+ "grad_norm": 2.130234594696483,
917
+ "learning_rate": 5.2746786885295034e-06,
918
+ "loss": 0.453,
919
  "step": 645
920
  },
921
  {
922
+ "epoch": 0.5384137502588527,
923
+ "grad_norm": 2.0986269383387706,
924
+ "learning_rate": 5.2024414127994325e-06,
925
+ "loss": 0.4538,
926
  "step": 650
927
  },
928
  {
929
+ "epoch": 0.5425553944916132,
930
+ "grad_norm": 2.209414073449144,
931
+ "learning_rate": 5.13016178526756e-06,
932
+ "loss": 0.4448,
933
  "step": 655
934
  },
935
  {
936
+ "epoch": 0.5466970387243736,
937
+ "grad_norm": 2.15118663280229,
938
+ "learning_rate": 5.057854927209804e-06,
939
+ "loss": 0.4446,
940
  "step": 660
941
  },
942
  {
943
+ "epoch": 0.550838682957134,
944
+ "grad_norm": 2.033860154274491,
945
+ "learning_rate": 4.985535965598843e-06,
946
+ "loss": 0.4447,
947
  "step": 665
948
  },
949
  {
950
+ "epoch": 0.5549803271898944,
951
+ "grad_norm": 2.129135189760016,
952
+ "learning_rate": 4.913220029939491e-06,
953
+ "loss": 0.4512,
954
  "step": 670
955
  },
956
  {
957
+ "epoch": 0.5591219714226547,
958
+ "grad_norm": 2.090833400754654,
959
+ "learning_rate": 4.840922249103506e-06,
960
+ "loss": 0.4467,
961
  "step": 675
962
  },
963
  {
964
+ "epoch": 0.5632636156554152,
965
+ "grad_norm": 2.0680502896348836,
966
+ "learning_rate": 4.7686577481645745e-06,
967
+ "loss": 0.4316,
968
  "step": 680
969
  },
970
  {
971
+ "epoch": 0.5674052598881756,
972
+ "grad_norm": 2.07449578249477,
973
+ "learning_rate": 4.696441645234042e-06,
974
+ "loss": 0.4421,
975
  "step": 685
976
  },
977
  {
978
+ "epoch": 0.571546904120936,
979
+ "grad_norm": 2.2932786860060235,
980
+ "learning_rate": 4.624289048298147e-06,
981
+ "loss": 0.4433,
982
  "step": 690
983
  },
984
  {
985
+ "epoch": 0.5756885483536964,
986
+ "grad_norm": 2.058591317854592,
987
+ "learning_rate": 4.55221505205734e-06,
988
+ "loss": 0.4298,
989
  "step": 695
990
  },
991
  {
992
+ "epoch": 0.5798301925864568,
993
+ "grad_norm": 2.3196625048015704,
994
+ "learning_rate": 4.480234734768393e-06,
995
+ "loss": 0.4326,
996
  "step": 700
997
  },
998
  {
999
+ "epoch": 0.5839718368192173,
1000
+ "grad_norm": 2.073473068971661,
1001
+ "learning_rate": 4.408363155089952e-06,
1002
+ "loss": 0.4335,
1003
  "step": 705
1004
  },
1005
  {
1006
+ "epoch": 0.5881134810519776,
1007
+ "grad_norm": 2.029766952429309,
1008
+ "learning_rate": 4.3366153489321855e-06,
1009
+ "loss": 0.4273,
1010
  "step": 710
1011
  },
1012
  {
1013
+ "epoch": 0.592255125284738,
1014
+ "grad_norm": 2.1078664280012007,
1015
+ "learning_rate": 4.265006326311199e-06,
1016
+ "loss": 0.415,
1017
  "step": 715
1018
  },
1019
  {
1020
+ "epoch": 0.5963967695174984,
1021
+ "grad_norm": 2.0038145759132915,
1022
+ "learning_rate": 4.1935510682088545e-06,
1023
+ "loss": 0.4244,
1024
  "step": 720
1025
  },
1026
  {
1027
+ "epoch": 0.6005384137502588,
1028
+ "grad_norm": 2.0468827081169976,
1029
+ "learning_rate": 4.122264523438668e-06,
1030
+ "loss": 0.4226,
1031
  "step": 725
1032
  },
1033
  {
1034
+ "epoch": 0.6046800579830193,
1035
+ "grad_norm": 2.0660110564277923,
1036
+ "learning_rate": 4.051161605518453e-06,
1037
+ "loss": 0.4222,
1038
  "step": 730
1039
  },
1040
  {
1041
+ "epoch": 0.6088217022157797,
1042
+ "grad_norm": 2.088150514221739,
1043
+ "learning_rate": 3.980257189550316e-06,
1044
+ "loss": 0.433,
1045
  "step": 735
1046
  },
1047
  {
1048
+ "epoch": 0.6129633464485401,
1049
+ "grad_norm": 2.1225320313052447,
1050
+ "learning_rate": 3.909566109108727e-06,
1051
+ "loss": 0.4161,
1052
  "step": 740
1053
  },
1054
  {
1055
+ "epoch": 0.6171049906813004,
1056
+ "grad_norm": 2.0990451349998227,
1057
+ "learning_rate": 3.839103153137247e-06,
1058
+ "loss": 0.417,
1059
  "step": 745
1060
  },
1061
  {
1062
+ "epoch": 0.6212466349140608,
1063
+ "grad_norm": 2.104242942119707,
1064
+ "learning_rate": 3.768883062854598e-06,
1065
+ "loss": 0.4081,
1066
  "step": 750
1067
  },
1068
  {
1069
+ "epoch": 0.6253882791468213,
1070
+ "grad_norm": 2.051137757693821,
1071
+ "learning_rate": 3.6989205286707398e-06,
1072
+ "loss": 0.4108,
1073
  "step": 755
1074
  },
1075
  {
1076
+ "epoch": 0.6295299233795817,
1077
+ "grad_norm": 2.099437836109555,
1078
+ "learning_rate": 3.6292301871135425e-06,
1079
+ "loss": 0.411,
1080
  "step": 760
1081
  },
1082
  {
1083
+ "epoch": 0.6336715676123421,
1084
+ "grad_norm": 2.056296390597689,
1085
+ "learning_rate": 3.55982661776676e-06,
1086
  "loss": 0.4047,
1087
  "step": 765
1088
  },
1089
  {
1090
+ "epoch": 0.6378132118451025,
1091
+ "grad_norm": 2.0454981613636503,
1092
+ "learning_rate": 3.4907243402199013e-06,
1093
+ "loss": 0.4044,
1094
  "step": 770
1095
  },
1096
  {
1097
+ "epoch": 0.641954856077863,
1098
+ "grad_norm": 2.0625058430327092,
1099
+ "learning_rate": 3.4219378110306523e-06,
1100
+ "loss": 0.4103,
1101
  "step": 775
1102
  },
1103
  {
1104
+ "epoch": 0.6460965003106233,
1105
+ "grad_norm": 2.0640184519207123,
1106
+ "learning_rate": 3.353481420700495e-06,
1107
+ "loss": 0.4109,
1108
  "step": 780
1109
  },
1110
  {
1111
+ "epoch": 0.6502381445433837,
1112
+ "grad_norm": 2.096435391565135,
1113
+ "learning_rate": 3.285369490664133e-06,
1114
+ "loss": 0.4103,
1115
  "step": 785
1116
  },
1117
  {
1118
+ "epoch": 0.6543797887761441,
1119
+ "grad_norm": 2.0511959776297983,
1120
+ "learning_rate": 3.2176162702933816e-06,
1121
+ "loss": 0.3991,
1122
  "step": 790
1123
  },
1124
  {
1125
+ "epoch": 0.6585214330089045,
1126
+ "grad_norm": 2.004915794597741,
1127
+ "learning_rate": 3.150235933916115e-06,
1128
+ "loss": 0.401,
1129
  "step": 795
1130
  },
1131
  {
1132
+ "epoch": 0.662663077241665,
1133
+ "grad_norm": 1.989931694202458,
1134
+ "learning_rate": 3.0832425778509235e-06,
1135
+ "loss": 0.4015,
1136
  "step": 800
1137
  },
1138
  {
1139
+ "epoch": 0.6668047214744254,
1140
+ "grad_norm": 2.0927029677784383,
1141
+ "learning_rate": 3.0166502174581012e-06,
1142
+ "loss": 0.3904,
1143
  "step": 805
1144
  },
1145
  {
1146
+ "epoch": 0.6709463657071858,
1147
+ "grad_norm": 2.05569943435716,
1148
+ "learning_rate": 2.950472784207544e-06,
1149
+ "loss": 0.3976,
1150
  "step": 810
1151
  },
1152
  {
1153
+ "epoch": 0.6750880099399461,
1154
+ "grad_norm": 2.0070990032518456,
1155
+ "learning_rate": 2.8847241227642255e-06,
1156
+ "loss": 0.3855,
1157
  "step": 815
1158
  },
1159
  {
1160
+ "epoch": 0.6792296541727065,
1161
+ "grad_norm": 1.9791880977464777,
1162
+ "learning_rate": 2.819417988091814e-06,
1163
+ "loss": 0.3831,
1164
  "step": 820
1165
  },
1166
  {
1167
+ "epoch": 0.683371298405467,
1168
+ "grad_norm": 1.9874217780709027,
1169
+ "learning_rate": 2.754568042575061e-06,
1170
+ "loss": 0.3928,
1171
  "step": 825
1172
  },
1173
  {
1174
+ "epoch": 0.6875129426382274,
1175
+ "grad_norm": 1.9444889966862584,
1176
+ "learning_rate": 2.6901878531615677e-06,
1177
+ "loss": 0.3967,
1178
  "step": 830
1179
  },
1180
  {
1181
+ "epoch": 0.6916545868709878,
1182
+ "grad_norm": 2.190695001941064,
1183
+ "learning_rate": 2.6262908885235046e-06,
1184
+ "loss": 0.384,
1185
  "step": 835
1186
  },
1187
  {
1188
+ "epoch": 0.6957962311037482,
1189
+ "grad_norm": 1.991552664612379,
1190
+ "learning_rate": 2.5628905162398797e-06,
1191
+ "loss": 0.3831,
1192
  "step": 840
1193
  },
1194
  {
1195
+ "epoch": 0.6999378753365086,
1196
+ "grad_norm": 2.017973650880143,
1197
+ "learning_rate": 2.5000000000000015e-06,
1198
+ "loss": 0.3851,
1199
  "step": 845
1200
  },
1201
  {
1202
+ "epoch": 0.704079519569269,
1203
+ "grad_norm": 2.138390804530181,
1204
+ "learning_rate": 2.4376324968286154e-06,
1205
+ "loss": 0.3777,
1206
  "step": 850
1207
  },
1208
  {
1209
+ "epoch": 0.7082211638020294,
1210
+ "grad_norm": 2.0021049336250814,
1211
+ "learning_rate": 2.375801054333409e-06,
1212
+ "loss": 0.3891,
1213
  "step": 855
1214
  },
1215
  {
1216
+ "epoch": 0.7123628080347898,
1217
+ "grad_norm": 2.1027439407928505,
1218
+ "learning_rate": 2.3145186079753685e-06,
1219
+ "loss": 0.381,
1220
  "step": 860
1221
  },
1222
  {
1223
+ "epoch": 0.7165044522675502,
1224
+ "grad_norm": 2.0200252919367823,
1225
+ "learning_rate": 2.253797978362617e-06,
1226
+ "loss": 0.3754,
1227
  "step": 865
1228
  },
1229
  {
1230
+ "epoch": 0.7206460965003106,
1231
+ "grad_norm": 2.022944794755911,
1232
+ "learning_rate": 2.193651868568285e-06,
1233
+ "loss": 0.3719,
1234
  "step": 870
1235
  },
1236
  {
1237
+ "epoch": 0.7247877407330711,
1238
+ "grad_norm": 2.011383665562108,
1239
+ "learning_rate": 2.1340928614729445e-06,
1240
+ "loss": 0.3716,
1241
  "step": 875
1242
  },
1243
  {
1244
+ "epoch": 0.7289293849658315,
1245
+ "grad_norm": 2.1000401745759767,
1246
+ "learning_rate": 2.075133417132223e-06,
1247
+ "loss": 0.3773,
1248
  "step": 880
1249
  },
1250
  {
1251
+ "epoch": 0.7330710291985918,
1252
+ "grad_norm": 2.0322415551222277,
1253
+ "learning_rate": 2.016785870170079e-06,
1254
+ "loss": 0.3755,
1255
  "step": 885
1256
  },
1257
  {
1258
+ "epoch": 0.7372126734313522,
1259
+ "grad_norm": 2.0586335139183327,
1260
+ "learning_rate": 1.9590624271983406e-06,
1261
+ "loss": 0.3749,
1262
  "step": 890
1263
  },
1264
  {
1265
+ "epoch": 0.7413543176641126,
1266
+ "grad_norm": 2.057098686991852,
1267
+ "learning_rate": 1.9019751642630252e-06,
1268
+ "loss": 0.3733,
1269
  "step": 895
1270
  },
1271
  {
1272
+ "epoch": 0.7454959618968731,
1273
+ "grad_norm": 1.9856940958346814,
1274
+ "learning_rate": 1.8455360243179537e-06,
1275
+ "loss": 0.3737,
1276
  "step": 900
1277
  },
1278
  {
1279
+ "epoch": 0.7496376061296335,
1280
+ "grad_norm": 2.049797779671496,
1281
+ "learning_rate": 1.7897568147262323e-06,
1282
+ "loss": 0.3678,
1283
  "step": 905
1284
  },
1285
  {
1286
+ "epoch": 0.7537792503623939,
1287
+ "grad_norm": 2.1869162271482083,
1288
+ "learning_rate": 1.7346492047900897e-06,
1289
+ "loss": 0.3769,
1290
  "step": 910
1291
  },
1292
  {
1293
+ "epoch": 0.7579208945951543,
1294
+ "grad_norm": 2.0166177063427444,
1295
+ "learning_rate": 1.6802247233095914e-06,
1296
+ "loss": 0.3722,
1297
  "step": 915
1298
  },
1299
  {
1300
+ "epoch": 0.7620625388279146,
1301
+ "grad_norm": 2.0284630775550094,
1302
+ "learning_rate": 1.626494756170765e-06,
1303
+ "loss": 0.3562,
1304
  "step": 920
1305
  },
1306
  {
1307
+ "epoch": 0.7662041830606751,
1308
+ "grad_norm": 1.9844703515401159,
1309
+ "learning_rate": 1.5734705439636017e-06,
1310
+ "loss": 0.3641,
1311
  "step": 925
1312
  },
1313
  {
1314
+ "epoch": 0.7703458272934355,
1315
+ "grad_norm": 2.026880974919187,
1316
+ "learning_rate": 1.5211631796304721e-06,
1317
+ "loss": 0.3671,
1318
  "step": 930
1319
  },
1320
  {
1321
+ "epoch": 0.7744874715261959,
1322
+ "grad_norm": 2.2261033018640775,
1323
+ "learning_rate": 1.46958360614543e-06,
1324
+ "loss": 0.3677,
1325
  "step": 935
1326
  },
1327
  {
1328
+ "epoch": 0.7786291157589563,
1329
+ "grad_norm": 2.0419572811826527,
1330
+ "learning_rate": 1.4187426142248723e-06,
1331
+ "loss": 0.3567,
1332
  "step": 940
1333
  },
1334
  {
1335
+ "epoch": 0.7827707599917167,
1336
+ "grad_norm": 1.9436995231419443,
1337
+ "learning_rate": 1.3686508400700787e-06,
1338
+ "loss": 0.3659,
1339
  "step": 945
1340
  },
1341
  {
1342
+ "epoch": 0.7869124042244772,
1343
+ "grad_norm": 2.118970998919544,
1344
+ "learning_rate": 1.3193187631420462e-06,
1345
+ "loss": 0.3621,
1346
  "step": 950
1347
  },
1348
  {
1349
+ "epoch": 0.7910540484572375,
1350
+ "grad_norm": 2.059747456229496,
1351
+ "learning_rate": 1.2707567039691505e-06,
1352
+ "loss": 0.3565,
1353
  "step": 955
1354
  },
1355
  {
1356
+ "epoch": 0.7951956926899979,
1357
+ "grad_norm": 2.090689117796637,
1358
+ "learning_rate": 1.222974821988024e-06,
1359
+ "loss": 0.3583,
1360
  "step": 960
1361
  },
1362
  {
1363
+ "epoch": 0.7993373369227583,
1364
+ "grad_norm": 2.059076495278081,
1365
+ "learning_rate": 1.1759831134181504e-06,
1366
+ "loss": 0.3622,
1367
  "step": 965
1368
  },
1369
  {
1370
+ "epoch": 0.8034789811555187,
1371
+ "grad_norm": 1.9884532847703864,
1372
+ "learning_rate": 1.1297914091706086e-06,
1373
+ "loss": 0.3541,
1374
  "step": 970
1375
  },
1376
  {
1377
+ "epoch": 0.8076206253882792,
1378
+ "grad_norm": 1.8905068597211632,
1379
+ "learning_rate": 1.0844093727913868e-06,
1380
+ "loss": 0.3578,
1381
  "step": 975
1382
  },
1383
  {
1384
+ "epoch": 0.8117622696210396,
1385
+ "grad_norm": 2.0536970678158206,
1386
+ "learning_rate": 1.039846498439727e-06,
1387
+ "loss": 0.353,
1388
  "step": 980
1389
  },
1390
  {
1391
+ "epoch": 0.8159039138538,
1392
+ "grad_norm": 1.9394493842379006,
1393
+ "learning_rate": 9.961121089018933e-07,
1394
+ "loss": 0.3552,
1395
  "step": 985
1396
  },
1397
  {
1398
+ "epoch": 0.8200455580865603,
1399
+ "grad_norm": 1.9161234213176144,
1400
+ "learning_rate": 9.532153536407923e-07,
1401
+ "loss": 0.3572,
1402
  "step": 990
1403
  },
1404
  {
1405
+ "epoch": 0.8241872023193207,
1406
+ "grad_norm": 2.0679533219870394,
1407
+ "learning_rate": 9.111652068818621e-07,
1408
+ "loss": 0.3499,
1409
  "step": 995
1410
  },
1411
  {
1412
+ "epoch": 0.8283288465520812,
1413
+ "grad_norm": 2.007021263318756,
1414
+ "learning_rate": 8.699704657356195e-07,
1415
+ "loss": 0.3503,
1416
  "step": 1000
1417
  },
1418
  {
1419
+ "epoch": 0.8324704907848416,
1420
+ "grad_norm": 1.940869076922602,
1421
+ "learning_rate": 8.296397483572515e-07,
1422
+ "loss": 0.3588,
1423
  "step": 1005
1424
  },
1425
  {
1426
+ "epoch": 0.836612135017602,
1427
+ "grad_norm": 2.0578809556631774,
1428
+ "learning_rate": 7.901814921436624e-07,
1429
+ "loss": 0.3497,
1430
  "step": 1010
1431
  },
1432
  {
1433
+ "epoch": 0.8407537792503624,
1434
+ "grad_norm": 1.9963820836617243,
1435
+ "learning_rate": 7.516039519683105e-07,
1436
+ "loss": 0.3459,
1437
  "step": 1015
1438
  },
1439
  {
1440
+ "epoch": 0.8448954234831229,
1441
+ "grad_norm": 2.1312384802093707,
1442
+ "learning_rate": 7.139151984542636e-07,
1443
+ "loss": 0.3515,
1444
  "step": 1020
1445
  },
1446
  {
1447
+ "epoch": 0.8490370677158832,
1448
+ "grad_norm": 2.051969582922354,
1449
+ "learning_rate": 6.771231162857722e-07,
1450
+ "loss": 0.3497,
1451
  "step": 1025
1452
  },
1453
  {
1454
+ "epoch": 0.8531787119486436,
1455
+ "grad_norm": 1.944237335752014,
1456
+ "learning_rate": 6.412354025587509e-07,
1457
+ "loss": 0.3454,
1458
  "step": 1030
1459
  },
1460
  {
1461
+ "epoch": 0.857320356181404,
1462
+ "grad_norm": 2.0013927970500824,
1463
+ "learning_rate": 6.062595651705111e-07,
1464
+ "loss": 0.3484,
1465
  "step": 1035
1466
  },
1467
  {
1468
+ "epoch": 0.8614620004141644,
1469
+ "grad_norm": 1.9953241481292785,
1470
+ "learning_rate": 5.722029212490666e-07,
1471
+ "loss": 0.3467,
1472
  "step": 1040
1473
  },
1474
  {
1475
+ "epoch": 0.8656036446469249,
1476
+ "grad_norm": 1.8562753423514788,
1477
+ "learning_rate": 5.390725956223531e-07,
1478
+ "loss": 0.3439,
1479
  "step": 1045
1480
  },
1481
  {
1482
+ "epoch": 0.8697452888796853,
1483
+ "grad_norm": 2.014412721333329,
1484
+ "learning_rate": 5.068755193276798e-07,
1485
+ "loss": 0.3475,
1486
  "step": 1050
1487
  },
1488
  {
1489
+ "epoch": 0.8738869331124457,
1490
+ "grad_norm": 2.0883606449971013,
1491
+ "learning_rate": 4.756184281617121e-07,
1492
+ "loss": 0.3442,
1493
  "step": 1055
1494
  },
1495
  {
1496
+ "epoch": 0.878028577345206,
1497
+ "grad_norm": 1.9479502623498623,
1498
+ "learning_rate": 4.4530786127131575e-07,
1499
+ "loss": 0.3516,
1500
  "step": 1060
1501
  },
1502
  {
1503
+ "epoch": 0.8821702215779664,
1504
+ "grad_norm": 1.9878832197736611,
1505
+ "learning_rate": 4.159501597855287e-07,
1506
+ "loss": 0.3468,
1507
  "step": 1065
1508
  },
1509
  {
1510
+ "epoch": 0.8863118658107269,
1511
+ "grad_norm": 1.981044415868457,
1512
+ "learning_rate": 3.8755146548896784e-07,
1513
+ "loss": 0.3442,
1514
  "step": 1070
1515
  },
1516
  {
1517
+ "epoch": 0.8904535100434873,
1518
+ "grad_norm": 2.078622119500157,
1519
+ "learning_rate": 3.6011771953693044e-07,
1520
+ "loss": 0.3414,
1521
  "step": 1075
1522
  },
1523
  {
1524
+ "epoch": 0.8945951542762477,
1525
+ "grad_norm": 1.9832065578238605,
1526
+ "learning_rate": 3.336546612124758e-07,
1527
+ "loss": 0.3462,
1528
  "step": 1080
1529
  },
1530
  {
1531
+ "epoch": 0.8987367985090081,
1532
+ "grad_norm": 2.077188737775822,
1533
+ "learning_rate": 3.081678267257404e-07,
1534
+ "loss": 0.3445,
1535
  "step": 1085
1536
  },
1537
  {
1538
+ "epoch": 0.9028784427417684,
1539
+ "grad_norm": 1.9680080545892447,
1540
+ "learning_rate": 2.836625480557265e-07,
1541
+ "loss": 0.3433,
1542
  "step": 1090
1543
  },
1544
  {
1545
+ "epoch": 0.9070200869745288,
1546
+ "grad_norm": 2.0554539318334344,
1547
+ "learning_rate": 2.601439518348331e-07,
1548
+ "loss": 0.3411,
1549
  "step": 1095
1550
  },
1551
  {
1552
+ "epoch": 0.9111617312072893,
1553
+ "grad_norm": 1.9738460247856697,
1554
+ "learning_rate": 2.376169582763288e-07,
1555
+ "loss": 0.3423,
1556
  "step": 1100
1557
  },
1558
  {
1559
+ "epoch": 0.9153033754400497,
1560
+ "grad_norm": 2.100671455290073,
1561
+ "learning_rate": 2.1608628014502364e-07,
1562
+ "loss": 0.3412,
1563
  "step": 1105
1564
  },
1565
  {
1566
+ "epoch": 0.9194450196728101,
1567
+ "grad_norm": 2.0790276531527865,
1568
+ "learning_rate": 1.955564217713335e-07,
1569
+ "loss": 0.3376,
1570
  "step": 1110
1571
  },
1572
  {
1573
+ "epoch": 0.9235866639055705,
1574
+ "grad_norm": 2.039950191776352,
1575
+ "learning_rate": 1.7603167810894662e-07,
1576
+ "loss": 0.3406,
1577
  "step": 1115
1578
  },
1579
  {
1580
+ "epoch": 0.927728308138331,
1581
+ "grad_norm": 2.0162487373459435,
1582
+ "learning_rate": 1.5751613383630128e-07,
1583
+ "loss": 0.3465,
1584
  "step": 1120
1585
  },
1586
  {
1587
+ "epoch": 0.9318699523710913,
1588
+ "grad_norm": 2.092197022372298,
1589
+ "learning_rate": 1.4001366250204762e-07,
1590
+ "loss": 0.337,
1591
  "step": 1125
1592
  },
1593
  {
1594
+ "epoch": 0.9360115966038517,
1595
+ "grad_norm": 1.944362051868436,
1596
+ "learning_rate": 1.235279257146804e-07,
1597
+ "loss": 0.3378,
1598
  "step": 1130
1599
  },
1600
  {
1601
+ "epoch": 0.9401532408366121,
1602
+ "grad_norm": 2.06431956869675,
1603
+ "learning_rate": 1.080623723765134e-07,
1604
+ "loss": 0.3352,
1605
  "step": 1135
1606
  },
1607
  {
1608
+ "epoch": 0.9442948850693725,
1609
+ "grad_norm": 2.067267199918096,
1610
+ "learning_rate": 9.362023796215036e-08,
1611
+ "loss": 0.3385,
1612
  "step": 1140
1613
  },
1614
  {
1615
+ "epoch": 0.948436529302133,
1616
+ "grad_norm": 2.050374938357445,
1617
+ "learning_rate": 8.020454384160437e-08,
1618
+ "loss": 0.345,
1619
  "step": 1145
1620
  },
1621
  {
1622
+ "epoch": 0.9525781735348934,
1623
+ "grad_norm": 2.058842049841172,
1624
+ "learning_rate": 6.78180966482156e-08,
1625
+ "loss": 0.3431,
1626
  "step": 1150
1627
  },
1628
  {
1629
+ "epoch": 0.9567198177676538,
1630
+ "grad_norm": 2.1228820426864825,
1631
+ "learning_rate": 5.646348769148491e-08,
1632
+ "loss": 0.3415,
1633
  "step": 1155
1634
  },
1635
  {
1636
+ "epoch": 0.9608614620004141,
1637
+ "grad_norm": 1.9574417210663868,
1638
+ "learning_rate": 4.6143092414961396e-08,
1639
+ "loss": 0.3346,
1640
  "step": 1160
1641
  },
1642
  {
1643
+ "epoch": 0.9650031062331745,
1644
+ "grad_norm": 2.0568028289295293,
1645
+ "learning_rate": 3.685906989928656e-08,
1646
+ "loss": 0.3404,
1647
  "step": 1165
1648
  },
1649
  {
1650
+ "epoch": 0.969144750465935,
1651
+ "grad_norm": 2.187483815397574,
1652
+ "learning_rate": 2.861336241050061e-08,
1653
+ "loss": 0.3366,
1654
  "step": 1170
1655
  },
1656
  {
1657
+ "epoch": 0.9732863946986954,
1658
+ "grad_norm": 2.0602914938549626,
1659
+ "learning_rate": 2.1407694993714755e-08,
1660
+ "loss": 0.3419,
1661
  "step": 1175
1662
  },
1663
  {
1664
+ "epoch": 0.9774280389314558,
1665
+ "grad_norm": 2.0097264769694174,
1666
+ "learning_rate": 1.5243575112218744e-08,
1667
+ "loss": 0.3391,
1668
  "step": 1180
1669
  },
1670
  {
1671
+ "epoch": 0.9815696831642162,
1672
+ "grad_norm": 2.1300950765468456,
1673
+ "learning_rate": 1.0122292332114814e-08,
1674
+ "loss": 0.3479,
1675
  "step": 1185
1676
  },
1677
  {
1678
+ "epoch": 0.9857113273969766,
1679
+ "grad_norm": 1.9814636182776308,
1680
+ "learning_rate": 6.044918052531268e-09,
1681
+ "loss": 0.3359,
1682
  "step": 1190
1683
  },
1684
  {
1685
+ "epoch": 0.989852971629737,
1686
+ "grad_norm": 2.0905112637682053,
1687
+ "learning_rate": 3.0123052814812203e-09,
1688
+ "loss": 0.3383,
1689
  "step": 1195
1690
  },
1691
  {
1692
+ "epoch": 0.9939946158624974,
1693
+ "grad_norm": 1.966590567136286,
1694
+ "learning_rate": 1.025088457409229e-09,
1695
+ "loss": 0.3325,
1696
  "step": 1200
1697
  },
1698
  {
1699
+ "epoch": 0.9981362600952578,
1700
+ "grad_norm": 2.035603528671296,
1701
+ "learning_rate": 8.368331646302353e-11,
1702
+ "loss": 0.3416,
1703
  "step": 1205
1704
  },
1705
  {
1706
+ "epoch": 0.999792917788362,
1707
+ "eval_loss": 0.3488326072692871,
1708
+ "eval_runtime": 0.9501,
1709
+ "eval_samples_per_second": 3.158,
1710
+ "eval_steps_per_second": 1.053,
1711
+ "step": 1207
1712
  },
1713
  {
1714
+ "epoch": 0.999792917788362,
1715
+ "step": 1207,
1716
+ "total_flos": 252668899491840.0,
1717
+ "train_loss": 0.5272446889569172,
1718
+ "train_runtime": 29415.576,
1719
+ "train_samples_per_second": 1.313,
1720
+ "train_steps_per_second": 0.041
1721
  }
1722
  ],
1723
  "logging_steps": 5,
1724
+ "max_steps": 1207,
1725
  "num_input_tokens_seen": 0,
1726
  "num_train_epochs": 1,
1727
  "save_steps": 100,
 
1737
  "attributes": {}
1738
  }
1739
  },
1740
+ "total_flos": 252668899491840.0,
1741
  "train_batch_size": 2,
1742
  "trial_name": null,
1743
  "trial_params": null