huranokuma commited on
Commit
a5f1839
1 Parent(s): 87ec6ce

End of training

Browse files
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_accuracy": 0.5895981374153325,
4
- "eval_loss": 1.842421054840088,
5
- "eval_runtime": 6167.0931,
6
  "eval_samples": 134942,
7
- "eval_samples_per_second": 21.881,
8
- "eval_steps_per_second": 10.94,
9
- "perplexity": 6.311800993960819,
10
- "train_loss": 0.612412237187797,
11
- "train_runtime": 12901.2681,
12
  "train_samples": 134942,
13
- "train_samples_per_second": 20.919,
14
- "train_steps_per_second": 20.919
15
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "eval_accuracy": 0.6914216185247494,
4
+ "eval_loss": 1.320469856262207,
5
+ "eval_runtime": 4412.1637,
6
  "eval_samples": 134942,
7
+ "eval_samples_per_second": 30.584,
8
+ "eval_steps_per_second": 7.646,
9
+ "perplexity": 3.745180660509808,
10
+ "train_loss": 1.6101340950849867,
11
+ "train_runtime": 65579.3531,
12
  "train_samples": 134942,
13
+ "train_samples_per_second": 10.288,
14
+ "train_steps_per_second": 2.572
15
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_accuracy": 0.5895981374153325,
4
- "eval_loss": 1.842421054840088,
5
- "eval_runtime": 6167.0931,
6
  "eval_samples": 134942,
7
- "eval_samples_per_second": 21.881,
8
- "eval_steps_per_second": 10.94,
9
- "perplexity": 6.311800993960819
10
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "eval_accuracy": 0.6914216185247494,
4
+ "eval_loss": 1.320469856262207,
5
+ "eval_runtime": 4412.1637,
6
  "eval_samples": 134942,
7
+ "eval_samples_per_second": 30.584,
8
+ "eval_steps_per_second": 7.646,
9
+ "perplexity": 3.745180660509808
10
  }
runs/Aug14_14-24-46_09d4b99a5f37/events.out.tfevents.1660558183.09d4b99a5f37.1218.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03653415a61db97e52147e1726fe1f136f239c212512936b8405e4e4c7073131
3
+ size 369
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.0,
3
- "train_loss": 0.612412237187797,
4
- "train_runtime": 12901.2681,
5
  "train_samples": 134942,
6
- "train_samples_per_second": 20.919,
7
- "train_steps_per_second": 20.919
8
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "train_loss": 1.6101340950849867,
4
+ "train_runtime": 65579.3531,
5
  "train_samples": 134942,
6
+ "train_samples_per_second": 10.288,
7
+ "train_steps_per_second": 2.572
8
  }
trainer_state.json CHANGED
@@ -1,3259 +1,2047 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.0,
5
- "global_step": 269884,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
- "learning_rate": 4.985179037230259e-05,
13
- "loss": 1.9823,
14
  "step": 500
15
  },
16
  {
17
  "epoch": 0.03,
18
- "learning_rate": 4.970358074460517e-05,
19
- "loss": 2.0137,
20
  "step": 1000
21
  },
22
  {
23
  "epoch": 0.04,
24
- "learning_rate": 4.9555371116907756e-05,
25
- "loss": 2.0097,
26
  "step": 1500
27
  },
28
  {
29
  "epoch": 0.06,
30
- "learning_rate": 4.940716148921034e-05,
31
- "loss": 2.0074,
32
  "step": 2000
33
  },
34
  {
35
  "epoch": 0.07,
36
- "learning_rate": 4.9258951861512924e-05,
37
- "loss": 2.0169,
38
  "step": 2500
39
  },
40
  {
41
  "epoch": 0.09,
42
- "learning_rate": 4.911074223381551e-05,
43
- "loss": 2.0187,
44
  "step": 3000
45
  },
46
  {
47
  "epoch": 0.1,
48
- "learning_rate": 4.896253260611809e-05,
49
- "loss": 2.0182,
50
  "step": 3500
51
  },
52
  {
53
  "epoch": 0.12,
54
- "learning_rate": 4.881432297842068e-05,
55
- "loss": 2.035,
56
  "step": 4000
57
  },
58
  {
59
  "epoch": 0.13,
60
  "learning_rate": 4.866611335072326e-05,
61
- "loss": 2.0271,
62
  "step": 4500
63
  },
64
  {
65
  "epoch": 0.15,
66
  "learning_rate": 4.8517903723025847e-05,
67
- "loss": 2.0089,
68
  "step": 5000
69
  },
70
  {
71
  "epoch": 0.16,
72
  "learning_rate": 4.836969409532843e-05,
73
- "loss": 2.0152,
74
  "step": 5500
75
  },
76
  {
77
  "epoch": 0.18,
78
  "learning_rate": 4.8221484467631015e-05,
79
- "loss": 2.0181,
80
  "step": 6000
81
  },
82
  {
83
  "epoch": 0.19,
84
  "learning_rate": 4.80732748399336e-05,
85
- "loss": 2.0154,
86
  "step": 6500
87
  },
88
  {
89
  "epoch": 0.21,
90
  "learning_rate": 4.7925065212236184e-05,
91
- "loss": 2.026,
92
  "step": 7000
93
  },
94
  {
95
  "epoch": 0.22,
96
  "learning_rate": 4.7776855584538775e-05,
97
- "loss": 1.9978,
98
  "step": 7500
99
  },
100
  {
101
  "epoch": 0.24,
102
  "learning_rate": 4.762864595684136e-05,
103
- "loss": 2.0187,
104
  "step": 8000
105
  },
106
  {
107
  "epoch": 0.25,
108
  "learning_rate": 4.7480436329143944e-05,
109
- "loss": 2.0338,
110
  "step": 8500
111
  },
112
  {
113
  "epoch": 0.27,
114
  "learning_rate": 4.733222670144653e-05,
115
- "loss": 2.0258,
116
  "step": 9000
117
  },
118
  {
119
  "epoch": 0.28,
120
  "learning_rate": 4.718401707374911e-05,
121
- "loss": 2.0205,
122
  "step": 9500
123
  },
124
  {
125
  "epoch": 0.3,
126
  "learning_rate": 4.70358074460517e-05,
127
- "loss": 2.0218,
128
  "step": 10000
129
  },
130
  {
131
  "epoch": 0.31,
132
  "learning_rate": 4.688759781835428e-05,
133
- "loss": 2.0151,
134
  "step": 10500
135
  },
136
  {
137
  "epoch": 0.33,
138
  "learning_rate": 4.6739388190656866e-05,
139
- "loss": 2.0203,
140
  "step": 11000
141
  },
142
  {
143
  "epoch": 0.34,
144
  "learning_rate": 4.659117856295945e-05,
145
- "loss": 2.0439,
146
  "step": 11500
147
  },
148
  {
149
  "epoch": 0.36,
150
  "learning_rate": 4.6442968935262035e-05,
151
- "loss": 2.0389,
152
  "step": 12000
153
  },
154
  {
155
  "epoch": 0.37,
156
  "learning_rate": 4.629475930756462e-05,
157
- "loss": 2.019,
158
  "step": 12500
159
  },
160
  {
161
  "epoch": 0.39,
162
  "learning_rate": 4.6146549679867204e-05,
163
- "loss": 2.0375,
164
  "step": 13000
165
  },
166
  {
167
  "epoch": 0.4,
168
  "learning_rate": 4.599834005216979e-05,
169
- "loss": 2.0539,
170
  "step": 13500
171
  },
172
  {
173
  "epoch": 0.41,
174
  "learning_rate": 4.585013042447238e-05,
175
- "loss": 2.0415,
176
  "step": 14000
177
  },
178
  {
179
  "epoch": 0.43,
180
  "learning_rate": 4.5701920796774964e-05,
181
- "loss": 2.0262,
182
  "step": 14500
183
  },
184
  {
185
  "epoch": 0.44,
186
  "learning_rate": 4.555371116907755e-05,
187
- "loss": 2.0349,
188
  "step": 15000
189
  },
190
  {
191
  "epoch": 0.46,
192
  "learning_rate": 4.540550154138013e-05,
193
- "loss": 2.0392,
194
  "step": 15500
195
  },
196
  {
197
  "epoch": 0.47,
198
  "learning_rate": 4.525729191368272e-05,
199
- "loss": 2.037,
200
  "step": 16000
201
  },
202
  {
203
  "epoch": 0.49,
204
  "learning_rate": 4.51090822859853e-05,
205
- "loss": 2.0583,
206
  "step": 16500
207
  },
208
  {
209
  "epoch": 0.5,
210
  "learning_rate": 4.4960872658287886e-05,
211
- "loss": 2.0516,
212
  "step": 17000
213
  },
214
  {
215
  "epoch": 0.52,
216
  "learning_rate": 4.481266303059047e-05,
217
- "loss": 2.0336,
218
  "step": 17500
219
  },
220
  {
221
  "epoch": 0.53,
222
  "learning_rate": 4.4664453402893055e-05,
223
- "loss": 2.0478,
224
  "step": 18000
225
  },
226
  {
227
  "epoch": 0.55,
228
  "learning_rate": 4.451624377519564e-05,
229
- "loss": 2.0305,
230
  "step": 18500
231
  },
232
  {
233
  "epoch": 0.56,
234
  "learning_rate": 4.4368034147498224e-05,
235
- "loss": 2.0388,
236
  "step": 19000
237
  },
238
  {
239
  "epoch": 0.58,
240
  "learning_rate": 4.421982451980081e-05,
241
- "loss": 2.0297,
242
  "step": 19500
243
  },
244
  {
245
  "epoch": 0.59,
246
  "learning_rate": 4.407161489210339e-05,
247
- "loss": 2.0473,
248
  "step": 20000
249
  },
250
  {
251
  "epoch": 0.61,
252
  "learning_rate": 4.392340526440598e-05,
253
- "loss": 2.0396,
254
  "step": 20500
255
  },
256
  {
257
  "epoch": 0.62,
258
  "learning_rate": 4.377519563670856e-05,
259
- "loss": 2.0398,
260
  "step": 21000
261
  },
262
  {
263
  "epoch": 0.64,
264
  "learning_rate": 4.3626986009011146e-05,
265
- "loss": 2.0358,
266
  "step": 21500
267
  },
268
  {
269
  "epoch": 0.65,
270
  "learning_rate": 4.347877638131374e-05,
271
- "loss": 2.0565,
272
  "step": 22000
273
  },
274
  {
275
  "epoch": 0.67,
276
  "learning_rate": 4.333056675361632e-05,
277
- "loss": 2.0321,
278
  "step": 22500
279
  },
280
  {
281
  "epoch": 0.68,
282
  "learning_rate": 4.3182357125918906e-05,
283
- "loss": 2.0502,
284
  "step": 23000
285
  },
286
  {
287
  "epoch": 0.7,
288
  "learning_rate": 4.303414749822149e-05,
289
- "loss": 2.064,
290
  "step": 23500
291
  },
292
  {
293
  "epoch": 0.71,
294
  "learning_rate": 4.2885937870524075e-05,
295
- "loss": 2.0351,
296
  "step": 24000
297
  },
298
  {
299
  "epoch": 0.73,
300
  "learning_rate": 4.273772824282666e-05,
301
- "loss": 2.0362,
302
  "step": 24500
303
  },
304
  {
305
  "epoch": 0.74,
306
  "learning_rate": 4.258951861512924e-05,
307
- "loss": 2.0603,
308
  "step": 25000
309
  },
310
  {
311
  "epoch": 0.76,
312
  "learning_rate": 4.244130898743183e-05,
313
- "loss": 2.0452,
314
  "step": 25500
315
  },
316
  {
317
  "epoch": 0.77,
318
  "learning_rate": 4.229309935973441e-05,
319
- "loss": 2.0752,
320
  "step": 26000
321
  },
322
  {
323
  "epoch": 0.79,
324
  "learning_rate": 4.2144889732036997e-05,
325
- "loss": 2.062,
326
  "step": 26500
327
  },
328
  {
329
  "epoch": 0.8,
330
  "learning_rate": 4.199668010433958e-05,
331
- "loss": 2.0593,
332
  "step": 27000
333
  },
334
  {
335
  "epoch": 0.82,
336
  "learning_rate": 4.1848470476642165e-05,
337
- "loss": 2.0548,
338
  "step": 27500
339
  },
340
  {
341
  "epoch": 0.83,
342
  "learning_rate": 4.170026084894475e-05,
343
- "loss": 2.0516,
344
  "step": 28000
345
  },
346
  {
347
  "epoch": 0.84,
348
  "learning_rate": 4.1552051221247334e-05,
349
- "loss": 2.0609,
350
  "step": 28500
351
  },
352
  {
353
  "epoch": 0.86,
354
  "learning_rate": 4.140384159354992e-05,
355
- "loss": 2.0655,
356
  "step": 29000
357
  },
358
  {
359
  "epoch": 0.87,
360
  "learning_rate": 4.12556319658525e-05,
361
- "loss": 2.0562,
362
  "step": 29500
363
  },
364
  {
365
  "epoch": 0.89,
366
  "learning_rate": 4.110742233815509e-05,
367
- "loss": 2.0557,
368
  "step": 30000
369
  },
370
  {
371
  "epoch": 0.9,
372
  "learning_rate": 4.095921271045767e-05,
373
- "loss": 2.0608,
374
  "step": 30500
375
  },
376
  {
377
  "epoch": 0.92,
378
  "learning_rate": 4.0811003082760256e-05,
379
- "loss": 2.0641,
380
  "step": 31000
381
  },
382
  {
383
  "epoch": 0.93,
384
  "learning_rate": 4.066279345506284e-05,
385
- "loss": 2.0649,
386
  "step": 31500
387
  },
388
  {
389
  "epoch": 0.95,
390
  "learning_rate": 4.0514583827365425e-05,
391
- "loss": 2.0603,
392
  "step": 32000
393
  },
394
  {
395
  "epoch": 0.96,
396
  "learning_rate": 4.036637419966801e-05,
397
- "loss": 2.0696,
398
  "step": 32500
399
  },
400
  {
401
  "epoch": 0.98,
402
  "learning_rate": 4.0218164571970594e-05,
403
- "loss": 2.0619,
404
  "step": 33000
405
  },
406
  {
407
  "epoch": 0.99,
408
  "learning_rate": 4.006995494427318e-05,
409
- "loss": 2.0762,
410
  "step": 33500
411
  },
412
  {
413
  "epoch": 1.01,
414
  "learning_rate": 3.992174531657576e-05,
415
- "loss": 1.9612,
416
  "step": 34000
417
  },
418
  {
419
  "epoch": 1.02,
420
  "learning_rate": 3.977353568887835e-05,
421
- "loss": 1.8788,
422
  "step": 34500
423
  },
424
  {
425
  "epoch": 1.04,
426
  "learning_rate": 3.962532606118093e-05,
427
- "loss": 1.8705,
428
  "step": 35000
429
  },
430
  {
431
  "epoch": 1.05,
432
  "learning_rate": 3.9477116433483516e-05,
433
- "loss": 1.8726,
434
  "step": 35500
435
  },
436
  {
437
  "epoch": 1.07,
438
  "learning_rate": 3.93289068057861e-05,
439
- "loss": 1.887,
440
  "step": 36000
441
  },
442
  {
443
  "epoch": 1.08,
444
  "learning_rate": 3.9180697178088685e-05,
445
- "loss": 1.8795,
446
  "step": 36500
447
  },
448
  {
449
  "epoch": 1.1,
450
  "learning_rate": 3.9032487550391276e-05,
451
- "loss": 1.8778,
452
  "step": 37000
453
  },
454
  {
455
  "epoch": 1.11,
456
  "learning_rate": 3.888427792269386e-05,
457
- "loss": 1.8933,
458
  "step": 37500
459
  },
460
  {
461
  "epoch": 1.13,
462
  "learning_rate": 3.8736068294996445e-05,
463
- "loss": 1.8897,
464
  "step": 38000
465
  },
466
  {
467
  "epoch": 1.14,
468
  "learning_rate": 3.858785866729903e-05,
469
- "loss": 1.9019,
470
  "step": 38500
471
  },
472
  {
473
  "epoch": 1.16,
474
  "learning_rate": 3.8439649039601614e-05,
475
- "loss": 1.9074,
476
  "step": 39000
477
  },
478
  {
479
  "epoch": 1.17,
480
  "learning_rate": 3.82914394119042e-05,
481
- "loss": 1.916,
482
  "step": 39500
483
  },
484
  {
485
  "epoch": 1.19,
486
  "learning_rate": 3.814322978420678e-05,
487
- "loss": 1.9083,
488
  "step": 40000
489
  },
490
  {
491
  "epoch": 1.2,
492
  "learning_rate": 3.799502015650937e-05,
493
- "loss": 1.8971,
494
  "step": 40500
495
  },
496
  {
497
  "epoch": 1.22,
498
  "learning_rate": 3.784681052881195e-05,
499
- "loss": 1.9045,
500
  "step": 41000
501
  },
502
  {
503
  "epoch": 1.23,
504
  "learning_rate": 3.7698600901114536e-05,
505
- "loss": 1.9176,
506
  "step": 41500
507
  },
508
  {
509
  "epoch": 1.24,
510
  "learning_rate": 3.755039127341713e-05,
511
- "loss": 1.9223,
512
  "step": 42000
513
  },
514
  {
515
  "epoch": 1.26,
516
  "learning_rate": 3.740218164571971e-05,
517
- "loss": 1.9123,
518
  "step": 42500
519
  },
520
  {
521
  "epoch": 1.27,
522
  "learning_rate": 3.7253972018022296e-05,
523
- "loss": 1.9281,
524
  "step": 43000
525
  },
526
  {
527
  "epoch": 1.29,
528
  "learning_rate": 3.710576239032488e-05,
529
- "loss": 1.926,
530
  "step": 43500
531
  },
532
  {
533
  "epoch": 1.3,
534
  "learning_rate": 3.6957552762627465e-05,
535
- "loss": 1.9125,
536
  "step": 44000
537
  },
538
  {
539
  "epoch": 1.32,
540
  "learning_rate": 3.680934313493005e-05,
541
- "loss": 1.9354,
542
  "step": 44500
543
  },
544
  {
545
  "epoch": 1.33,
546
  "learning_rate": 3.6661133507232633e-05,
547
- "loss": 1.9339,
548
  "step": 45000
549
  },
550
  {
551
  "epoch": 1.35,
552
  "learning_rate": 3.651292387953522e-05,
553
- "loss": 1.9329,
554
  "step": 45500
555
  },
556
  {
557
  "epoch": 1.36,
558
  "learning_rate": 3.63647142518378e-05,
559
- "loss": 1.9223,
560
  "step": 46000
561
  },
562
  {
563
  "epoch": 1.38,
564
  "learning_rate": 3.621650462414039e-05,
565
- "loss": 1.9371,
566
  "step": 46500
567
  },
568
  {
569
  "epoch": 1.39,
570
  "learning_rate": 3.606829499644297e-05,
571
- "loss": 1.9348,
572
  "step": 47000
573
  },
574
  {
575
  "epoch": 1.41,
576
  "learning_rate": 3.5920085368745556e-05,
577
- "loss": 1.9424,
578
  "step": 47500
579
  },
580
  {
581
  "epoch": 1.42,
582
  "learning_rate": 3.577187574104814e-05,
583
- "loss": 1.9366,
584
  "step": 48000
585
  },
586
  {
587
  "epoch": 1.44,
588
  "learning_rate": 3.5623666113350724e-05,
589
- "loss": 1.9425,
590
  "step": 48500
591
  },
592
  {
593
  "epoch": 1.45,
594
  "learning_rate": 3.547545648565331e-05,
595
- "loss": 1.9292,
596
  "step": 49000
597
  },
598
  {
599
  "epoch": 1.47,
600
  "learning_rate": 3.532724685795589e-05,
601
- "loss": 1.9534,
602
  "step": 49500
603
  },
604
  {
605
  "epoch": 1.48,
606
  "learning_rate": 3.517903723025848e-05,
607
- "loss": 1.9499,
608
  "step": 50000
609
  },
610
  {
611
  "epoch": 1.5,
612
  "learning_rate": 3.503082760256106e-05,
613
- "loss": 1.9541,
614
  "step": 50500
615
  },
616
  {
617
  "epoch": 1.51,
618
  "learning_rate": 3.4882617974863646e-05,
619
- "loss": 1.9433,
620
  "step": 51000
621
  },
622
  {
623
  "epoch": 1.53,
624
  "learning_rate": 3.473440834716624e-05,
625
- "loss": 1.9481,
626
  "step": 51500
627
  },
628
  {
629
  "epoch": 1.54,
630
  "learning_rate": 3.458619871946882e-05,
631
- "loss": 1.9647,
632
  "step": 52000
633
  },
634
  {
635
  "epoch": 1.56,
636
  "learning_rate": 3.4437989091771406e-05,
637
- "loss": 1.9608,
638
  "step": 52500
639
  },
640
  {
641
  "epoch": 1.57,
642
  "learning_rate": 3.428977946407399e-05,
643
- "loss": 1.9775,
644
  "step": 53000
645
  },
646
  {
647
  "epoch": 1.59,
648
  "learning_rate": 3.4141569836376575e-05,
649
- "loss": 1.9676,
650
  "step": 53500
651
  },
652
  {
653
  "epoch": 1.6,
654
  "learning_rate": 3.399336020867916e-05,
655
- "loss": 1.978,
656
  "step": 54000
657
  },
658
  {
659
  "epoch": 1.62,
660
  "learning_rate": 3.3845150580981744e-05,
661
- "loss": 1.9705,
662
  "step": 54500
663
  },
664
  {
665
  "epoch": 1.63,
666
  "learning_rate": 3.369694095328433e-05,
667
- "loss": 1.9706,
668
  "step": 55000
669
  },
670
  {
671
  "epoch": 1.65,
672
  "learning_rate": 3.354873132558691e-05,
673
- "loss": 1.9605,
674
  "step": 55500
675
  },
676
  {
677
  "epoch": 1.66,
678
  "learning_rate": 3.34005216978895e-05,
679
- "loss": 1.9515,
680
  "step": 56000
681
  },
682
  {
683
  "epoch": 1.67,
684
  "learning_rate": 3.325231207019208e-05,
685
- "loss": 1.9836,
686
  "step": 56500
687
  },
688
  {
689
  "epoch": 1.69,
690
  "learning_rate": 3.3104102442494666e-05,
691
- "loss": 1.9752,
692
  "step": 57000
693
  },
694
  {
695
  "epoch": 1.7,
696
  "learning_rate": 3.295589281479725e-05,
697
- "loss": 1.9636,
698
  "step": 57500
699
  },
700
  {
701
  "epoch": 1.72,
702
  "learning_rate": 3.2807683187099835e-05,
703
- "loss": 1.9643,
704
  "step": 58000
705
  },
706
  {
707
  "epoch": 1.73,
708
  "learning_rate": 3.265947355940242e-05,
709
- "loss": 1.9778,
710
  "step": 58500
711
  },
712
  {
713
  "epoch": 1.75,
714
  "learning_rate": 3.2511263931705004e-05,
715
- "loss": 1.9894,
716
  "step": 59000
717
  },
718
  {
719
  "epoch": 1.76,
720
  "learning_rate": 3.236305430400759e-05,
721
- "loss": 1.9787,
722
  "step": 59500
723
  },
724
  {
725
  "epoch": 1.78,
726
  "learning_rate": 3.221484467631017e-05,
727
- "loss": 1.9945,
728
  "step": 60000
729
  },
730
  {
731
  "epoch": 1.79,
732
  "learning_rate": 3.206663504861276e-05,
733
- "loss": 1.9872,
734
  "step": 60500
735
  },
736
  {
737
  "epoch": 1.81,
738
  "learning_rate": 3.191842542091534e-05,
739
- "loss": 1.9684,
740
  "step": 61000
741
  },
742
  {
743
  "epoch": 1.82,
744
  "learning_rate": 3.1770215793217926e-05,
745
- "loss": 1.9923,
746
  "step": 61500
747
  },
748
  {
749
  "epoch": 1.84,
750
  "learning_rate": 3.162200616552051e-05,
751
- "loss": 1.9902,
752
  "step": 62000
753
  },
754
  {
755
  "epoch": 1.85,
756
  "learning_rate": 3.1473796537823095e-05,
757
- "loss": 1.991,
758
  "step": 62500
759
  },
760
  {
761
  "epoch": 1.87,
762
  "learning_rate": 3.132558691012568e-05,
763
- "loss": 1.9865,
764
  "step": 63000
765
  },
766
  {
767
  "epoch": 1.88,
768
  "learning_rate": 3.1177377282428263e-05,
769
- "loss": 2.0036,
770
  "step": 63500
771
  },
772
  {
773
  "epoch": 1.9,
774
  "learning_rate": 3.102916765473085e-05,
775
- "loss": 1.9841,
776
  "step": 64000
777
  },
778
  {
779
  "epoch": 1.91,
780
  "learning_rate": 3.088095802703343e-05,
781
- "loss": 1.9879,
782
  "step": 64500
783
  },
784
  {
785
  "epoch": 1.93,
786
  "learning_rate": 3.073274839933602e-05,
787
- "loss": 2.0156,
788
  "step": 65000
789
  },
790
  {
791
  "epoch": 1.94,
792
  "learning_rate": 3.05845387716386e-05,
793
- "loss": 1.9924,
794
  "step": 65500
795
  },
796
  {
797
  "epoch": 1.96,
798
  "learning_rate": 3.0436329143941196e-05,
799
- "loss": 2.0209,
800
  "step": 66000
801
  },
802
  {
803
  "epoch": 1.97,
804
  "learning_rate": 3.028811951624378e-05,
805
- "loss": 2.0109,
806
  "step": 66500
807
  },
808
  {
809
  "epoch": 1.99,
810
  "learning_rate": 3.0139909888546365e-05,
811
- "loss": 2.0246,
812
  "step": 67000
813
  },
814
  {
815
  "epoch": 2.0,
816
  "learning_rate": 2.999170026084895e-05,
817
- "loss": 2.0058,
818
  "step": 67500
819
  },
820
  {
821
  "epoch": 2.02,
822
  "learning_rate": 2.9843490633151533e-05,
823
- "loss": 1.8351,
824
  "step": 68000
825
  },
826
  {
827
  "epoch": 2.03,
828
  "learning_rate": 2.9695281005454118e-05,
829
- "loss": 1.8244,
830
  "step": 68500
831
  },
832
  {
833
  "epoch": 2.05,
834
  "learning_rate": 2.9547071377756702e-05,
835
- "loss": 1.8274,
836
  "step": 69000
837
  },
838
  {
839
  "epoch": 2.06,
840
  "learning_rate": 2.9398861750059287e-05,
841
- "loss": 1.8208,
842
  "step": 69500
843
  },
844
  {
845
  "epoch": 2.07,
846
  "learning_rate": 2.925065212236187e-05,
847
- "loss": 1.8353,
848
  "step": 70000
849
  },
850
  {
851
  "epoch": 2.09,
852
  "learning_rate": 2.9102442494664455e-05,
853
- "loss": 1.8316,
854
  "step": 70500
855
  },
856
  {
857
  "epoch": 2.1,
858
  "learning_rate": 2.895423286696704e-05,
859
- "loss": 1.8313,
860
  "step": 71000
861
  },
862
  {
863
  "epoch": 2.12,
864
  "learning_rate": 2.8806023239269624e-05,
865
- "loss": 1.8318,
866
  "step": 71500
867
  },
868
  {
869
  "epoch": 2.13,
870
  "learning_rate": 2.865781361157221e-05,
871
- "loss": 1.8448,
872
  "step": 72000
873
  },
874
  {
875
  "epoch": 2.15,
876
  "learning_rate": 2.8509603983874793e-05,
877
- "loss": 1.8367,
878
  "step": 72500
879
  },
880
  {
881
  "epoch": 2.16,
882
  "learning_rate": 2.8361394356177377e-05,
883
- "loss": 1.8535,
884
  "step": 73000
885
  },
886
  {
887
  "epoch": 2.18,
888
  "learning_rate": 2.8213184728479962e-05,
889
- "loss": 1.8474,
890
  "step": 73500
891
  },
892
  {
893
  "epoch": 2.19,
894
  "learning_rate": 2.8064975100782546e-05,
895
- "loss": 1.8525,
896
  "step": 74000
897
  },
898
  {
899
  "epoch": 2.21,
900
  "learning_rate": 2.7916765473085134e-05,
901
- "loss": 1.8612,
902
  "step": 74500
903
  },
904
  {
905
  "epoch": 2.22,
906
  "learning_rate": 2.776855584538772e-05,
907
- "loss": 1.8486,
908
  "step": 75000
909
  },
910
  {
911
  "epoch": 2.24,
912
  "learning_rate": 2.7620346217690303e-05,
913
- "loss": 1.8339,
914
  "step": 75500
915
  },
916
  {
917
  "epoch": 2.25,
918
  "learning_rate": 2.7472136589992887e-05,
919
- "loss": 1.85,
920
  "step": 76000
921
  },
922
  {
923
  "epoch": 2.27,
924
  "learning_rate": 2.7323926962295472e-05,
925
- "loss": 1.8645,
926
  "step": 76500
927
  },
928
  {
929
  "epoch": 2.28,
930
  "learning_rate": 2.7175717334598056e-05,
931
- "loss": 1.849,
932
  "step": 77000
933
  },
934
  {
935
  "epoch": 2.3,
936
  "learning_rate": 2.702750770690064e-05,
937
- "loss": 1.8608,
938
  "step": 77500
939
  },
940
  {
941
  "epoch": 2.31,
942
  "learning_rate": 2.6879298079203225e-05,
943
- "loss": 1.8568,
944
  "step": 78000
945
  },
946
  {
947
  "epoch": 2.33,
948
  "learning_rate": 2.673108845150581e-05,
949
- "loss": 1.8607,
950
  "step": 78500
951
  },
952
  {
953
  "epoch": 2.34,
954
  "learning_rate": 2.6582878823808394e-05,
955
- "loss": 1.8607,
956
  "step": 79000
957
  },
958
  {
959
  "epoch": 2.36,
960
  "learning_rate": 2.6434669196110978e-05,
961
- "loss": 1.8478,
962
  "step": 79500
963
  },
964
  {
965
  "epoch": 2.37,
966
  "learning_rate": 2.6286459568413563e-05,
967
- "loss": 1.8785,
968
  "step": 80000
969
  },
970
  {
971
  "epoch": 2.39,
972
  "learning_rate": 2.6138249940716147e-05,
973
- "loss": 1.8729,
974
  "step": 80500
975
  },
976
  {
977
  "epoch": 2.4,
978
  "learning_rate": 2.5990040313018738e-05,
979
- "loss": 1.8511,
980
  "step": 81000
981
  },
982
  {
983
  "epoch": 2.42,
984
  "learning_rate": 2.5841830685321323e-05,
985
- "loss": 1.8701,
986
  "step": 81500
987
  },
988
  {
989
  "epoch": 2.43,
990
  "learning_rate": 2.5693621057623907e-05,
991
- "loss": 1.8792,
992
  "step": 82000
993
  },
994
  {
995
  "epoch": 2.45,
996
  "learning_rate": 2.554541142992649e-05,
997
- "loss": 1.8703,
998
  "step": 82500
999
  },
1000
  {
1001
  "epoch": 2.46,
1002
  "learning_rate": 2.5397201802229076e-05,
1003
- "loss": 1.8479,
1004
  "step": 83000
1005
  },
1006
  {
1007
  "epoch": 2.48,
1008
  "learning_rate": 2.524899217453166e-05,
1009
- "loss": 1.8603,
1010
  "step": 83500
1011
  },
1012
  {
1013
  "epoch": 2.49,
1014
  "learning_rate": 2.5100782546834245e-05,
1015
- "loss": 1.8752,
1016
  "step": 84000
1017
  },
1018
  {
1019
  "epoch": 2.5,
1020
  "learning_rate": 2.495257291913683e-05,
1021
- "loss": 1.8685,
1022
  "step": 84500
1023
  },
1024
  {
1025
  "epoch": 2.52,
1026
  "learning_rate": 2.4804363291439414e-05,
1027
- "loss": 1.8814,
1028
  "step": 85000
1029
  },
1030
  {
1031
  "epoch": 2.53,
1032
  "learning_rate": 2.4656153663741998e-05,
1033
- "loss": 1.8903,
1034
  "step": 85500
1035
  },
1036
  {
1037
  "epoch": 2.55,
1038
  "learning_rate": 2.4507944036044582e-05,
1039
- "loss": 1.8863,
1040
  "step": 86000
1041
  },
1042
  {
1043
  "epoch": 2.56,
1044
  "learning_rate": 2.4359734408347167e-05,
1045
- "loss": 1.8704,
1046
  "step": 86500
1047
  },
1048
  {
1049
  "epoch": 2.58,
1050
  "learning_rate": 2.421152478064975e-05,
1051
- "loss": 1.8822,
1052
  "step": 87000
1053
  },
1054
  {
1055
  "epoch": 2.59,
1056
  "learning_rate": 2.4063315152952336e-05,
1057
- "loss": 1.8917,
1058
  "step": 87500
1059
  },
1060
  {
1061
  "epoch": 2.61,
1062
  "learning_rate": 2.391510552525492e-05,
1063
- "loss": 1.8734,
1064
  "step": 88000
1065
  },
1066
  {
1067
  "epoch": 2.62,
1068
  "learning_rate": 2.3766895897557508e-05,
1069
- "loss": 1.8797,
1070
  "step": 88500
1071
  },
1072
  {
1073
  "epoch": 2.64,
1074
  "learning_rate": 2.3618686269860092e-05,
1075
- "loss": 1.8659,
1076
  "step": 89000
1077
  },
1078
  {
1079
  "epoch": 2.65,
1080
  "learning_rate": 2.3470476642162677e-05,
1081
- "loss": 1.8844,
1082
  "step": 89500
1083
  },
1084
  {
1085
  "epoch": 2.67,
1086
  "learning_rate": 2.332226701446526e-05,
1087
- "loss": 1.8956,
1088
  "step": 90000
1089
  },
1090
  {
1091
  "epoch": 2.68,
1092
  "learning_rate": 2.3174057386767846e-05,
1093
- "loss": 1.89,
1094
  "step": 90500
1095
  },
1096
  {
1097
  "epoch": 2.7,
1098
  "learning_rate": 2.302584775907043e-05,
1099
- "loss": 1.8722,
1100
  "step": 91000
1101
  },
1102
  {
1103
  "epoch": 2.71,
1104
  "learning_rate": 2.2877638131373014e-05,
1105
- "loss": 1.8978,
1106
  "step": 91500
1107
  },
1108
  {
1109
  "epoch": 2.73,
1110
  "learning_rate": 2.2729428503675602e-05,
1111
- "loss": 1.8858,
1112
  "step": 92000
1113
  },
1114
  {
1115
  "epoch": 2.74,
1116
  "learning_rate": 2.2581218875978187e-05,
1117
- "loss": 1.8845,
1118
  "step": 92500
1119
  },
1120
  {
1121
  "epoch": 2.76,
1122
  "learning_rate": 2.243300924828077e-05,
1123
- "loss": 1.879,
1124
  "step": 93000
1125
  },
1126
  {
1127
  "epoch": 2.77,
1128
  "learning_rate": 2.2284799620583355e-05,
1129
- "loss": 1.8963,
1130
  "step": 93500
1131
  },
1132
  {
1133
  "epoch": 2.79,
1134
  "learning_rate": 2.213658999288594e-05,
1135
- "loss": 1.9058,
1136
  "step": 94000
1137
  },
1138
  {
1139
  "epoch": 2.8,
1140
  "learning_rate": 2.1988380365188524e-05,
1141
- "loss": 1.8961,
1142
  "step": 94500
1143
  },
1144
  {
1145
  "epoch": 2.82,
1146
  "learning_rate": 2.184017073749111e-05,
1147
- "loss": 1.8718,
1148
  "step": 95000
1149
  },
1150
  {
1151
  "epoch": 2.83,
1152
  "learning_rate": 2.1691961109793693e-05,
1153
- "loss": 1.8841,
1154
  "step": 95500
1155
  },
1156
  {
1157
  "epoch": 2.85,
1158
  "learning_rate": 2.1543751482096277e-05,
1159
- "loss": 1.8928,
1160
  "step": 96000
1161
  },
1162
  {
1163
  "epoch": 2.86,
1164
  "learning_rate": 2.1395541854398862e-05,
1165
- "loss": 1.8924,
1166
  "step": 96500
1167
  },
1168
  {
1169
  "epoch": 2.88,
1170
  "learning_rate": 2.1247332226701446e-05,
1171
- "loss": 1.912,
1172
  "step": 97000
1173
  },
1174
  {
1175
  "epoch": 2.89,
1176
  "learning_rate": 2.109912259900403e-05,
1177
- "loss": 1.886,
1178
  "step": 97500
1179
  },
1180
  {
1181
  "epoch": 2.9,
1182
  "learning_rate": 2.0950912971306615e-05,
1183
- "loss": 1.9036,
1184
  "step": 98000
1185
  },
1186
  {
1187
  "epoch": 2.92,
1188
  "learning_rate": 2.08027033436092e-05,
1189
- "loss": 1.9081,
1190
  "step": 98500
1191
  },
1192
  {
1193
  "epoch": 2.93,
1194
  "learning_rate": 2.0654493715911787e-05,
1195
- "loss": 1.8979,
1196
  "step": 99000
1197
  },
1198
  {
1199
  "epoch": 2.95,
1200
  "learning_rate": 2.0506284088214372e-05,
1201
- "loss": 1.9121,
1202
  "step": 99500
1203
  },
1204
  {
1205
  "epoch": 2.96,
1206
  "learning_rate": 2.0358074460516956e-05,
1207
- "loss": 1.9092,
1208
  "step": 100000
1209
  },
1210
  {
1211
  "epoch": 2.98,
1212
  "learning_rate": 2.020986483281954e-05,
1213
- "loss": 1.8997,
1214
  "step": 100500
1215
  },
1216
  {
1217
  "epoch": 2.99,
1218
  "learning_rate": 2.0061655205122125e-05,
1219
- "loss": 1.9143,
1220
  "step": 101000
1221
  },
1222
  {
1223
  "epoch": 3.01,
1224
  "learning_rate": 1.991344557742471e-05,
1225
- "loss": 1.8386,
1226
  "step": 101500
1227
  },
1228
  {
1229
  "epoch": 3.02,
1230
  "learning_rate": 1.9765235949727294e-05,
1231
- "loss": 1.7796,
1232
  "step": 102000
1233
  },
1234
  {
1235
  "epoch": 3.04,
1236
  "learning_rate": 1.961702632202988e-05,
1237
- "loss": 1.7844,
1238
  "step": 102500
1239
  },
1240
  {
1241
  "epoch": 3.05,
1242
  "learning_rate": 1.9468816694332466e-05,
1243
- "loss": 1.7941,
1244
  "step": 103000
1245
  },
1246
  {
1247
  "epoch": 3.07,
1248
  "learning_rate": 1.932060706663505e-05,
1249
- "loss": 1.8003,
1250
  "step": 103500
1251
  },
1252
  {
1253
  "epoch": 3.08,
1254
  "learning_rate": 1.9172397438937635e-05,
1255
- "loss": 1.7926,
1256
  "step": 104000
1257
  },
1258
  {
1259
  "epoch": 3.1,
1260
  "learning_rate": 1.902418781124022e-05,
1261
- "loss": 1.7893,
1262
  "step": 104500
1263
  },
1264
  {
1265
  "epoch": 3.11,
1266
  "learning_rate": 1.8875978183542804e-05,
1267
- "loss": 1.8033,
1268
  "step": 105000
1269
  },
1270
  {
1271
  "epoch": 3.13,
1272
  "learning_rate": 1.8727768555845388e-05,
1273
- "loss": 1.8031,
1274
  "step": 105500
1275
  },
1276
  {
1277
  "epoch": 3.14,
1278
  "learning_rate": 1.8579558928147972e-05,
1279
- "loss": 1.7973,
1280
  "step": 106000
1281
  },
1282
  {
1283
  "epoch": 3.16,
1284
  "learning_rate": 1.843134930045056e-05,
1285
- "loss": 1.7955,
1286
  "step": 106500
1287
  },
1288
  {
1289
  "epoch": 3.17,
1290
  "learning_rate": 1.8283139672753145e-05,
1291
- "loss": 1.7894,
1292
  "step": 107000
1293
  },
1294
  {
1295
  "epoch": 3.19,
1296
  "learning_rate": 1.813493004505573e-05,
1297
- "loss": 1.8005,
1298
  "step": 107500
1299
  },
1300
  {
1301
  "epoch": 3.2,
1302
  "learning_rate": 1.7986720417358314e-05,
1303
- "loss": 1.8061,
1304
  "step": 108000
1305
  },
1306
  {
1307
  "epoch": 3.22,
1308
  "learning_rate": 1.7838510789660898e-05,
1309
- "loss": 1.8076,
1310
  "step": 108500
1311
  },
1312
  {
1313
  "epoch": 3.23,
1314
  "learning_rate": 1.7690301161963482e-05,
1315
- "loss": 1.7835,
1316
  "step": 109000
1317
  },
1318
  {
1319
  "epoch": 3.25,
1320
  "learning_rate": 1.7542091534266067e-05,
1321
- "loss": 1.805,
1322
  "step": 109500
1323
  },
1324
  {
1325
  "epoch": 3.26,
1326
  "learning_rate": 1.739388190656865e-05,
1327
- "loss": 1.8052,
1328
  "step": 110000
1329
  },
1330
  {
1331
  "epoch": 3.28,
1332
  "learning_rate": 1.7245672278871236e-05,
1333
- "loss": 1.8022,
1334
  "step": 110500
1335
  },
1336
  {
1337
  "epoch": 3.29,
1338
  "learning_rate": 1.709746265117382e-05,
1339
- "loss": 1.7789,
1340
  "step": 111000
1341
  },
1342
  {
1343
  "epoch": 3.31,
1344
  "learning_rate": 1.6949253023476404e-05,
1345
- "loss": 1.8049,
1346
  "step": 111500
1347
  },
1348
  {
1349
  "epoch": 3.32,
1350
  "learning_rate": 1.680104339577899e-05,
1351
- "loss": 1.8087,
1352
  "step": 112000
1353
  },
1354
  {
1355
  "epoch": 3.33,
1356
  "learning_rate": 1.6652833768081573e-05,
1357
- "loss": 1.8048,
1358
  "step": 112500
1359
  },
1360
  {
1361
  "epoch": 3.35,
1362
  "learning_rate": 1.6504624140384158e-05,
1363
- "loss": 1.7941,
1364
  "step": 113000
1365
  },
1366
  {
1367
  "epoch": 3.36,
1368
  "learning_rate": 1.6356414512686745e-05,
1369
- "loss": 1.8048,
1370
  "step": 113500
1371
  },
1372
  {
1373
  "epoch": 3.38,
1374
  "learning_rate": 1.620820488498933e-05,
1375
- "loss": 1.7987,
1376
  "step": 114000
1377
  },
1378
  {
1379
  "epoch": 3.39,
1380
  "learning_rate": 1.6059995257291914e-05,
1381
- "loss": 1.7982,
1382
  "step": 114500
1383
  },
1384
  {
1385
  "epoch": 3.41,
1386
  "learning_rate": 1.59117856295945e-05,
1387
- "loss": 1.8063,
1388
  "step": 115000
1389
  },
1390
  {
1391
  "epoch": 3.42,
1392
  "learning_rate": 1.5763576001897083e-05,
1393
- "loss": 1.8015,
1394
  "step": 115500
1395
  },
1396
  {
1397
  "epoch": 3.44,
1398
  "learning_rate": 1.5615366374199667e-05,
1399
- "loss": 1.8037,
1400
  "step": 116000
1401
  },
1402
  {
1403
  "epoch": 3.45,
1404
  "learning_rate": 1.5467156746502255e-05,
1405
- "loss": 1.8172,
1406
  "step": 116500
1407
  },
1408
  {
1409
  "epoch": 3.47,
1410
  "learning_rate": 1.531894711880484e-05,
1411
- "loss": 1.806,
1412
  "step": 117000
1413
  },
1414
  {
1415
  "epoch": 3.48,
1416
  "learning_rate": 1.5170737491107422e-05,
1417
- "loss": 1.8122,
1418
  "step": 117500
1419
  },
1420
  {
1421
  "epoch": 3.5,
1422
  "learning_rate": 1.5022527863410007e-05,
1423
- "loss": 1.7992,
1424
  "step": 118000
1425
  },
1426
  {
1427
  "epoch": 3.51,
1428
  "learning_rate": 1.4874318235712591e-05,
1429
- "loss": 1.7984,
1430
  "step": 118500
1431
  },
1432
  {
1433
  "epoch": 3.53,
1434
  "learning_rate": 1.4726108608015177e-05,
1435
- "loss": 1.7979,
1436
  "step": 119000
1437
  },
1438
  {
1439
  "epoch": 3.54,
1440
  "learning_rate": 1.4577898980317762e-05,
1441
- "loss": 1.8054,
1442
  "step": 119500
1443
  },
1444
  {
1445
  "epoch": 3.56,
1446
  "learning_rate": 1.4429689352620346e-05,
1447
- "loss": 1.7976,
1448
  "step": 120000
1449
  },
1450
  {
1451
  "epoch": 3.57,
1452
  "learning_rate": 1.428147972492293e-05,
1453
- "loss": 1.8096,
1454
  "step": 120500
1455
  },
1456
  {
1457
  "epoch": 3.59,
1458
  "learning_rate": 1.4133270097225518e-05,
1459
- "loss": 1.8067,
1460
  "step": 121000
1461
  },
1462
  {
1463
  "epoch": 3.6,
1464
  "learning_rate": 1.3985060469528103e-05,
1465
- "loss": 1.8065,
1466
  "step": 121500
1467
  },
1468
  {
1469
  "epoch": 3.62,
1470
  "learning_rate": 1.3836850841830687e-05,
1471
- "loss": 1.8052,
1472
  "step": 122000
1473
  },
1474
  {
1475
  "epoch": 3.63,
1476
  "learning_rate": 1.3688641214133272e-05,
1477
- "loss": 1.8016,
1478
  "step": 122500
1479
  },
1480
  {
1481
  "epoch": 3.65,
1482
  "learning_rate": 1.3540431586435856e-05,
1483
- "loss": 1.8101,
1484
  "step": 123000
1485
  },
1486
  {
1487
  "epoch": 3.66,
1488
  "learning_rate": 1.339222195873844e-05,
1489
- "loss": 1.8089,
1490
  "step": 123500
1491
  },
1492
  {
1493
  "epoch": 3.68,
1494
  "learning_rate": 1.3244012331041025e-05,
1495
- "loss": 1.8038,
1496
  "step": 124000
1497
  },
1498
  {
1499
  "epoch": 3.69,
1500
  "learning_rate": 1.309580270334361e-05,
1501
- "loss": 1.8165,
1502
  "step": 124500
1503
  },
1504
  {
1505
  "epoch": 3.71,
1506
  "learning_rate": 1.2947593075646194e-05,
1507
- "loss": 1.807,
1508
  "step": 125000
1509
  },
1510
  {
1511
  "epoch": 3.72,
1512
  "learning_rate": 1.279938344794878e-05,
1513
- "loss": 1.8113,
1514
  "step": 125500
1515
  },
1516
  {
1517
  "epoch": 3.73,
1518
  "learning_rate": 1.2651173820251364e-05,
1519
- "loss": 1.8073,
1520
  "step": 126000
1521
  },
1522
  {
1523
  "epoch": 3.75,
1524
  "learning_rate": 1.2502964192553949e-05,
1525
- "loss": 1.8129,
1526
  "step": 126500
1527
  },
1528
  {
1529
  "epoch": 3.76,
1530
  "learning_rate": 1.2354754564856535e-05,
1531
- "loss": 1.8475,
1532
  "step": 127000
1533
  },
1534
  {
1535
  "epoch": 3.78,
1536
  "learning_rate": 1.220654493715912e-05,
1537
- "loss": 1.8003,
1538
  "step": 127500
1539
  },
1540
  {
1541
  "epoch": 3.79,
1542
  "learning_rate": 1.2058335309461704e-05,
1543
- "loss": 1.816,
1544
  "step": 128000
1545
  },
1546
  {
1547
  "epoch": 3.81,
1548
  "learning_rate": 1.1910125681764288e-05,
1549
- "loss": 1.8198,
1550
  "step": 128500
1551
  },
1552
  {
1553
  "epoch": 3.82,
1554
  "learning_rate": 1.1761916054066872e-05,
1555
- "loss": 1.8339,
1556
  "step": 129000
1557
  },
1558
  {
1559
  "epoch": 3.84,
1560
  "learning_rate": 1.1613706426369457e-05,
1561
- "loss": 1.8053,
1562
  "step": 129500
1563
  },
1564
  {
1565
  "epoch": 3.85,
1566
  "learning_rate": 1.1465496798672043e-05,
1567
- "loss": 1.7948,
1568
  "step": 130000
1569
  },
1570
  {
1571
  "epoch": 3.87,
1572
  "learning_rate": 1.1317287170974627e-05,
1573
- "loss": 1.8261,
1574
  "step": 130500
1575
  },
1576
  {
1577
  "epoch": 3.88,
1578
  "learning_rate": 1.1169077543277212e-05,
1579
- "loss": 1.8097,
1580
  "step": 131000
1581
  },
1582
  {
1583
  "epoch": 3.9,
1584
  "learning_rate": 1.1020867915579796e-05,
1585
- "loss": 1.8101,
1586
  "step": 131500
1587
  },
1588
  {
1589
  "epoch": 3.91,
1590
  "learning_rate": 1.087265828788238e-05,
1591
- "loss": 1.8135,
1592
  "step": 132000
1593
  },
1594
  {
1595
  "epoch": 3.93,
1596
  "learning_rate": 1.0724448660184967e-05,
1597
- "loss": 1.8245,
1598
  "step": 132500
1599
  },
1600
  {
1601
  "epoch": 3.94,
1602
  "learning_rate": 1.0576239032487551e-05,
1603
- "loss": 1.8059,
1604
  "step": 133000
1605
  },
1606
  {
1607
  "epoch": 3.96,
1608
  "learning_rate": 1.0428029404790136e-05,
1609
- "loss": 1.8147,
1610
  "step": 133500
1611
  },
1612
  {
1613
  "epoch": 3.97,
1614
  "learning_rate": 1.0279819777092722e-05,
1615
- "loss": 1.8108,
1616
  "step": 134000
1617
  },
1618
  {
1619
  "epoch": 3.99,
1620
  "learning_rate": 1.0131610149395306e-05,
1621
- "loss": 1.8207,
1622
  "step": 134500
1623
  },
1624
  {
1625
  "epoch": 4.0,
1626
  "learning_rate": 9.98340052169789e-06,
1627
- "loss": 1.8035,
1628
  "step": 135000
1629
  },
1630
  {
1631
  "epoch": 4.02,
1632
  "learning_rate": 9.835190894000475e-06,
1633
- "loss": 1.7493,
1634
  "step": 135500
1635
  },
1636
  {
1637
  "epoch": 4.03,
1638
  "learning_rate": 9.68698126630306e-06,
1639
- "loss": 1.7577,
1640
  "step": 136000
1641
  },
1642
  {
1643
  "epoch": 4.05,
1644
  "learning_rate": 9.538771638605644e-06,
1645
- "loss": 1.7447,
1646
  "step": 136500
1647
  },
1648
  {
1649
  "epoch": 4.06,
1650
  "learning_rate": 9.390562010908228e-06,
1651
- "loss": 1.7555,
1652
  "step": 137000
1653
  },
1654
  {
1655
  "epoch": 4.08,
1656
  "learning_rate": 9.242352383210814e-06,
1657
- "loss": 1.7644,
1658
  "step": 137500
1659
  },
1660
  {
1661
  "epoch": 4.09,
1662
  "learning_rate": 9.094142755513399e-06,
1663
- "loss": 1.7464,
1664
  "step": 138000
1665
  },
1666
  {
1667
  "epoch": 4.11,
1668
  "learning_rate": 8.945933127815983e-06,
1669
- "loss": 1.751,
1670
  "step": 138500
1671
  },
1672
  {
1673
  "epoch": 4.12,
1674
  "learning_rate": 8.797723500118567e-06,
1675
- "loss": 1.7587,
1676
  "step": 139000
1677
  },
1678
  {
1679
  "epoch": 4.14,
1680
  "learning_rate": 8.649513872421154e-06,
1681
- "loss": 1.7638,
1682
  "step": 139500
1683
  },
1684
  {
1685
  "epoch": 4.15,
1686
  "learning_rate": 8.501304244723738e-06,
1687
- "loss": 1.763,
1688
  "step": 140000
1689
  },
1690
  {
1691
  "epoch": 4.16,
1692
  "learning_rate": 8.353094617026322e-06,
1693
- "loss": 1.7529,
1694
  "step": 140500
1695
  },
1696
  {
1697
  "epoch": 4.18,
1698
  "learning_rate": 8.204884989328908e-06,
1699
- "loss": 1.745,
1700
  "step": 141000
1701
  },
1702
  {
1703
  "epoch": 4.19,
1704
  "learning_rate": 8.056675361631493e-06,
1705
- "loss": 1.7497,
1706
  "step": 141500
1707
  },
1708
  {
1709
  "epoch": 4.21,
1710
  "learning_rate": 7.908465733934077e-06,
1711
- "loss": 1.7582,
1712
  "step": 142000
1713
  },
1714
  {
1715
  "epoch": 4.22,
1716
  "learning_rate": 7.760256106236662e-06,
1717
- "loss": 1.7425,
1718
  "step": 142500
1719
  },
1720
  {
1721
  "epoch": 4.24,
1722
  "learning_rate": 7.612046478539246e-06,
1723
- "loss": 1.7576,
1724
  "step": 143000
1725
  },
1726
  {
1727
  "epoch": 4.25,
1728
  "learning_rate": 7.4638368508418305e-06,
1729
- "loss": 1.7482,
1730
  "step": 143500
1731
  },
1732
  {
1733
  "epoch": 4.27,
1734
  "learning_rate": 7.315627223144415e-06,
1735
- "loss": 1.7449,
1736
  "step": 144000
1737
  },
1738
  {
1739
  "epoch": 4.28,
1740
  "learning_rate": 7.167417595447e-06,
1741
- "loss": 1.7451,
1742
  "step": 144500
1743
  },
1744
  {
1745
  "epoch": 4.3,
1746
  "learning_rate": 7.0192079677495855e-06,
1747
- "loss": 1.765,
1748
  "step": 145000
1749
  },
1750
  {
1751
  "epoch": 4.31,
1752
  "learning_rate": 6.870998340052171e-06,
1753
- "loss": 1.7539,
1754
  "step": 145500
1755
  },
1756
  {
1757
  "epoch": 4.33,
1758
  "learning_rate": 6.722788712354755e-06,
1759
- "loss": 1.7476,
1760
  "step": 146000
1761
  },
1762
  {
1763
  "epoch": 4.34,
1764
  "learning_rate": 6.5745790846573396e-06,
1765
- "loss": 1.7635,
1766
  "step": 146500
1767
  },
1768
  {
1769
  "epoch": 4.36,
1770
  "learning_rate": 6.426369456959924e-06,
1771
- "loss": 1.7672,
1772
  "step": 147000
1773
  },
1774
  {
1775
  "epoch": 4.37,
1776
  "learning_rate": 6.278159829262508e-06,
1777
- "loss": 1.7673,
1778
  "step": 147500
1779
  },
1780
  {
1781
  "epoch": 4.39,
1782
  "learning_rate": 6.129950201565094e-06,
1783
- "loss": 1.7783,
1784
  "step": 148000
1785
  },
1786
  {
1787
  "epoch": 4.4,
1788
  "learning_rate": 5.981740573867679e-06,
1789
- "loss": 1.7514,
1790
  "step": 148500
1791
  },
1792
  {
1793
  "epoch": 4.42,
1794
  "learning_rate": 5.833530946170263e-06,
1795
- "loss": 1.7546,
1796
  "step": 149000
1797
  },
1798
  {
1799
  "epoch": 4.43,
1800
  "learning_rate": 5.6853213184728486e-06,
1801
- "loss": 1.7502,
1802
  "step": 149500
1803
  },
1804
  {
1805
  "epoch": 4.45,
1806
  "learning_rate": 5.537111690775433e-06,
1807
- "loss": 1.7542,
1808
  "step": 150000
1809
  },
1810
  {
1811
  "epoch": 4.46,
1812
  "learning_rate": 5.388902063078017e-06,
1813
- "loss": 1.772,
1814
  "step": 150500
1815
  },
1816
  {
1817
  "epoch": 4.48,
1818
  "learning_rate": 5.240692435380603e-06,
1819
- "loss": 1.7575,
1820
  "step": 151000
1821
  },
1822
  {
1823
  "epoch": 4.49,
1824
  "learning_rate": 5.092482807683188e-06,
1825
- "loss": 1.755,
1826
  "step": 151500
1827
  },
1828
  {
1829
  "epoch": 4.51,
1830
  "learning_rate": 4.944273179985772e-06,
1831
- "loss": 1.7393,
1832
  "step": 152000
1833
  },
1834
  {
1835
  "epoch": 4.52,
1836
  "learning_rate": 4.796063552288357e-06,
1837
- "loss": 1.7534,
1838
  "step": 152500
1839
  },
1840
  {
1841
  "epoch": 4.54,
1842
  "learning_rate": 4.647853924590942e-06,
1843
- "loss": 1.75,
1844
  "step": 153000
1845
  },
1846
  {
1847
  "epoch": 4.55,
1848
  "learning_rate": 4.499644296893526e-06,
1849
- "loss": 1.7493,
1850
  "step": 153500
1851
  },
1852
  {
1853
  "epoch": 4.56,
1854
  "learning_rate": 4.351434669196111e-06,
1855
- "loss": 1.7508,
1856
  "step": 154000
1857
  },
1858
  {
1859
  "epoch": 4.58,
1860
  "learning_rate": 4.203225041498696e-06,
1861
- "loss": 1.7482,
1862
  "step": 154500
1863
  },
1864
  {
1865
  "epoch": 4.59,
1866
  "learning_rate": 4.055015413801281e-06,
1867
- "loss": 1.7632,
1868
  "step": 155000
1869
  },
1870
  {
1871
  "epoch": 4.61,
1872
  "learning_rate": 3.906805786103866e-06,
1873
- "loss": 1.7621,
1874
  "step": 155500
1875
  },
1876
  {
1877
  "epoch": 4.62,
1878
  "learning_rate": 3.75859615840645e-06,
1879
- "loss": 1.7632,
1880
  "step": 156000
1881
  },
1882
  {
1883
  "epoch": 4.64,
1884
  "learning_rate": 3.6103865307090354e-06,
1885
- "loss": 1.7635,
1886
  "step": 156500
1887
  },
1888
  {
1889
  "epoch": 4.65,
1890
  "learning_rate": 3.46217690301162e-06,
1891
- "loss": 1.7587,
1892
  "step": 157000
1893
  },
1894
  {
1895
  "epoch": 4.67,
1896
  "learning_rate": 3.3139672753142047e-06,
1897
- "loss": 1.7453,
1898
  "step": 157500
1899
  },
1900
  {
1901
  "epoch": 4.68,
1902
  "learning_rate": 3.165757647616789e-06,
1903
- "loss": 1.7471,
1904
  "step": 158000
1905
  },
1906
  {
1907
  "epoch": 4.7,
1908
  "learning_rate": 3.017548019919374e-06,
1909
- "loss": 1.7476,
1910
  "step": 158500
1911
  },
1912
  {
1913
  "epoch": 4.71,
1914
  "learning_rate": 2.8693383922219588e-06,
1915
- "loss": 1.7567,
1916
  "step": 159000
1917
  },
1918
  {
1919
  "epoch": 4.73,
1920
  "learning_rate": 2.7211287645245436e-06,
1921
- "loss": 1.7796,
1922
  "step": 159500
1923
  },
1924
  {
1925
  "epoch": 4.74,
1926
  "learning_rate": 2.5729191368271284e-06,
1927
- "loss": 1.7471,
1928
  "step": 160000
1929
  },
1930
  {
1931
  "epoch": 4.76,
1932
  "learning_rate": 2.424709509129713e-06,
1933
- "loss": 1.7699,
1934
  "step": 160500
1935
  },
1936
  {
1937
  "epoch": 4.77,
1938
  "learning_rate": 2.276499881432298e-06,
1939
- "loss": 1.739,
1940
  "step": 161000
1941
  },
1942
  {
1943
  "epoch": 4.79,
1944
  "learning_rate": 2.1282902537348825e-06,
1945
- "loss": 1.7413,
1946
  "step": 161500
1947
  },
1948
  {
1949
  "epoch": 4.8,
1950
  "learning_rate": 1.9800806260374674e-06,
1951
- "loss": 1.7553,
1952
  "step": 162000
1953
  },
1954
  {
1955
  "epoch": 4.82,
1956
  "learning_rate": 1.8318709983400524e-06,
1957
- "loss": 1.7391,
1958
  "step": 162500
1959
  },
1960
  {
1961
  "epoch": 4.83,
1962
  "learning_rate": 1.683661370642637e-06,
1963
- "loss": 1.7446,
1964
  "step": 163000
1965
  },
1966
  {
1967
  "epoch": 4.85,
1968
  "learning_rate": 1.5354517429452217e-06,
1969
- "loss": 1.7626,
1970
  "step": 163500
1971
  },
1972
  {
1973
  "epoch": 4.86,
1974
  "learning_rate": 1.3872421152478065e-06,
1975
- "loss": 1.7454,
1976
  "step": 164000
1977
  },
1978
  {
1979
  "epoch": 4.88,
1980
  "learning_rate": 1.2390324875503913e-06,
1981
- "loss": 1.7657,
1982
  "step": 164500
1983
  },
1984
  {
1985
  "epoch": 4.89,
1986
  "learning_rate": 1.0908228598529762e-06,
1987
- "loss": 1.758,
1988
  "step": 165000
1989
  },
1990
  {
1991
- "epoch": 1.23,
1992
- "learning_rate": 3.7735471535919134e-05,
1993
- "loss": 1.7432,
1994
  "step": 165500
1995
  },
1996
  {
1997
- "epoch": 1.23,
1998
- "learning_rate": 3.76984185798343e-05,
1999
- "loss": 1.7559,
2000
  "step": 166000
2001
  },
2002
  {
2003
- "epoch": 1.23,
2004
- "learning_rate": 3.766136562374947e-05,
2005
- "loss": 1.7691,
2006
  "step": 166500
2007
  },
2008
  {
2009
- "epoch": 1.24,
2010
- "learning_rate": 3.762431266766463e-05,
2011
- "loss": 1.7535,
2012
  "step": 167000
2013
  },
2014
  {
2015
- "epoch": 1.24,
2016
- "learning_rate": 3.7587259711579795e-05,
2017
- "loss": 1.7727,
2018
  "step": 167500
2019
  },
2020
  {
2021
- "epoch": 1.24,
2022
- "learning_rate": 3.755020675549495e-05,
2023
- "loss": 1.7932,
2024
  "step": 168000
2025
  },
2026
  {
2027
- "epoch": 1.25,
2028
- "learning_rate": 3.7513153799410116e-05,
2029
- "loss": 1.7775,
2030
  "step": 168500
2031
  },
2032
  {
2033
- "epoch": 1.25,
2034
- "learning_rate": 3.7476100843325286e-05,
2035
- "loss": 1.7774,
2036
- "step": 169000
2037
- },
2038
- {
2039
- "epoch": 1.26,
2040
- "learning_rate": 3.743904788724045e-05,
2041
- "loss": 1.7898,
2042
- "step": 169500
2043
- },
2044
- {
2045
- "epoch": 1.26,
2046
- "learning_rate": 3.7401994931155607e-05,
2047
- "loss": 1.7823,
2048
- "step": 170000
2049
- },
2050
- {
2051
- "epoch": 1.26,
2052
- "learning_rate": 3.736494197507077e-05,
2053
- "loss": 1.8264,
2054
- "step": 170500
2055
- },
2056
- {
2057
- "epoch": 1.27,
2058
- "learning_rate": 3.7327889018985934e-05,
2059
- "loss": 1.792,
2060
- "step": 171000
2061
- },
2062
- {
2063
- "epoch": 1.27,
2064
- "learning_rate": 3.7290836062901104e-05,
2065
- "loss": 1.8131,
2066
- "step": 171500
2067
- },
2068
- {
2069
- "epoch": 1.27,
2070
- "learning_rate": 3.725378310681626e-05,
2071
- "loss": 1.8023,
2072
- "step": 172000
2073
- },
2074
- {
2075
- "epoch": 1.28,
2076
- "learning_rate": 3.7216730150731425e-05,
2077
- "loss": 1.8306,
2078
- "step": 172500
2079
- },
2080
- {
2081
- "epoch": 1.28,
2082
- "learning_rate": 3.717967719464659e-05,
2083
- "loss": 1.7956,
2084
- "step": 173000
2085
- },
2086
- {
2087
- "epoch": 1.29,
2088
- "learning_rate": 3.714262423856175e-05,
2089
- "loss": 1.8114,
2090
- "step": 173500
2091
- },
2092
- {
2093
- "epoch": 1.29,
2094
- "learning_rate": 3.710557128247692e-05,
2095
- "loss": 1.8154,
2096
- "step": 174000
2097
- },
2098
- {
2099
- "epoch": 1.29,
2100
- "learning_rate": 3.706851832639208e-05,
2101
- "loss": 1.8034,
2102
- "step": 174500
2103
- },
2104
- {
2105
- "epoch": 1.3,
2106
- "learning_rate": 3.703146537030724e-05,
2107
- "loss": 1.8171,
2108
- "step": 175000
2109
- },
2110
- {
2111
- "epoch": 1.3,
2112
- "learning_rate": 3.699441241422241e-05,
2113
- "loss": 1.8111,
2114
- "step": 175500
2115
- },
2116
- {
2117
- "epoch": 1.3,
2118
- "learning_rate": 3.695735945813757e-05,
2119
- "loss": 1.8063,
2120
- "step": 176000
2121
- },
2122
- {
2123
- "epoch": 1.31,
2124
- "learning_rate": 3.6920306502052734e-05,
2125
- "loss": 1.828,
2126
- "step": 176500
2127
- },
2128
- {
2129
- "epoch": 1.31,
2130
- "learning_rate": 3.68832535459679e-05,
2131
- "loss": 1.8398,
2132
- "step": 177000
2133
- },
2134
- {
2135
- "epoch": 1.32,
2136
- "learning_rate": 3.684620058988306e-05,
2137
- "loss": 1.8362,
2138
- "step": 177500
2139
- },
2140
- {
2141
- "epoch": 1.32,
2142
- "learning_rate": 3.6809147633798225e-05,
2143
- "loss": 1.8502,
2144
- "step": 178000
2145
- },
2146
- {
2147
- "epoch": 1.32,
2148
- "learning_rate": 3.6772094677713396e-05,
2149
- "loss": 1.8581,
2150
- "step": 178500
2151
- },
2152
- {
2153
- "epoch": 1.33,
2154
- "learning_rate": 3.673504172162855e-05,
2155
- "loss": 1.8401,
2156
- "step": 179000
2157
- },
2158
- {
2159
- "epoch": 1.33,
2160
- "learning_rate": 3.6697988765543716e-05,
2161
- "loss": 1.8324,
2162
- "step": 179500
2163
- },
2164
- {
2165
- "epoch": 1.33,
2166
- "learning_rate": 3.666093580945888e-05,
2167
- "loss": 1.8523,
2168
- "step": 180000
2169
- },
2170
- {
2171
- "epoch": 1.34,
2172
- "learning_rate": 1.6559707133435105e-05,
2173
- "loss": 1.817,
2174
- "step": 180500
2175
- },
2176
- {
2177
- "epoch": 1.34,
2178
- "learning_rate": 1.6467074743223014e-05,
2179
- "loss": 1.8734,
2180
- "step": 181000
2181
- },
2182
- {
2183
- "epoch": 1.35,
2184
- "learning_rate": 1.6374442353010923e-05,
2185
- "loss": 1.812,
2186
- "step": 181500
2187
- },
2188
- {
2189
- "epoch": 1.35,
2190
- "learning_rate": 1.6281809962798833e-05,
2191
- "loss": 1.8239,
2192
- "step": 182000
2193
- },
2194
- {
2195
- "epoch": 1.35,
2196
- "learning_rate": 1.6189177572586742e-05,
2197
- "loss": 1.8156,
2198
- "step": 182500
2199
- },
2200
- {
2201
- "epoch": 1.36,
2202
- "learning_rate": 1.609654518237465e-05,
2203
- "loss": 1.8019,
2204
- "step": 183000
2205
- },
2206
- {
2207
- "epoch": 1.36,
2208
- "learning_rate": 1.600391279216256e-05,
2209
- "loss": 1.8153,
2210
- "step": 183500
2211
- },
2212
- {
2213
- "epoch": 1.36,
2214
- "learning_rate": 1.591128040195047e-05,
2215
- "loss": 1.8063,
2216
- "step": 184000
2217
- },
2218
- {
2219
- "epoch": 1.37,
2220
- "learning_rate": 1.581864801173838e-05,
2221
- "loss": 1.8251,
2222
- "step": 184500
2223
- },
2224
- {
2225
- "epoch": 1.37,
2226
- "learning_rate": 1.5726015621526284e-05,
2227
- "loss": 1.8205,
2228
- "step": 185000
2229
- },
2230
- {
2231
- "epoch": 1.37,
2232
- "learning_rate": 1.5633383231314193e-05,
2233
- "loss": 1.805,
2234
- "step": 185500
2235
- },
2236
- {
2237
- "epoch": 1.38,
2238
- "learning_rate": 1.5540750841102102e-05,
2239
- "loss": 1.839,
2240
- "step": 186000
2241
- },
2242
- {
2243
- "epoch": 1.38,
2244
- "learning_rate": 1.544811845089001e-05,
2245
- "loss": 1.8183,
2246
- "step": 186500
2247
- },
2248
- {
2249
- "epoch": 1.39,
2250
- "learning_rate": 1.535548606067792e-05,
2251
- "loss": 1.8262,
2252
- "step": 187000
2253
- },
2254
- {
2255
- "epoch": 1.39,
2256
- "learning_rate": 1.526285367046583e-05,
2257
- "loss": 1.7968,
2258
- "step": 187500
2259
- },
2260
- {
2261
- "epoch": 1.39,
2262
- "learning_rate": 1.5170221280253737e-05,
2263
- "loss": 1.822,
2264
- "step": 188000
2265
- },
2266
- {
2267
- "epoch": 1.4,
2268
- "learning_rate": 1.507758889004165e-05,
2269
- "loss": 1.8239,
2270
- "step": 188500
2271
- },
2272
- {
2273
- "epoch": 1.4,
2274
- "learning_rate": 1.4984956499829559e-05,
2275
- "loss": 1.8336,
2276
- "step": 189000
2277
- },
2278
- {
2279
- "epoch": 1.4,
2280
- "learning_rate": 1.4892324109617466e-05,
2281
- "loss": 1.8248,
2282
- "step": 189500
2283
- },
2284
- {
2285
- "epoch": 1.41,
2286
- "learning_rate": 1.4799691719405376e-05,
2287
- "loss": 1.8142,
2288
- "step": 190000
2289
- },
2290
- {
2291
- "epoch": 1.41,
2292
- "learning_rate": 1.4707059329193285e-05,
2293
- "loss": 1.8187,
2294
- "step": 190500
2295
- },
2296
- {
2297
- "epoch": 1.42,
2298
- "learning_rate": 1.4614426938981194e-05,
2299
- "loss": 1.8219,
2300
- "step": 191000
2301
- },
2302
- {
2303
- "epoch": 1.42,
2304
- "learning_rate": 1.4521794548769101e-05,
2305
- "loss": 1.8274,
2306
- "step": 191500
2307
- },
2308
- {
2309
- "epoch": 1.42,
2310
- "learning_rate": 1.442916215855701e-05,
2311
- "loss": 1.7992,
2312
- "step": 192000
2313
- },
2314
- {
2315
- "epoch": 1.43,
2316
- "learning_rate": 1.433652976834492e-05,
2317
- "loss": 1.8345,
2318
- "step": 192500
2319
- },
2320
- {
2321
- "epoch": 1.43,
2322
- "learning_rate": 1.4243897378132829e-05,
2323
- "loss": 1.8225,
2324
- "step": 193000
2325
- },
2326
- {
2327
- "epoch": 1.43,
2328
- "learning_rate": 1.4151264987920736e-05,
2329
- "loss": 1.8417,
2330
- "step": 193500
2331
- },
2332
- {
2333
- "epoch": 1.44,
2334
- "learning_rate": 1.4058632597708645e-05,
2335
- "loss": 1.8025,
2336
- "step": 194000
2337
- },
2338
- {
2339
- "epoch": 1.44,
2340
- "learning_rate": 1.3966000207496555e-05,
2341
- "loss": 1.818,
2342
- "step": 194500
2343
- },
2344
- {
2345
- "epoch": 1.45,
2346
- "learning_rate": 1.3873367817284464e-05,
2347
- "loss": 1.803,
2348
- "step": 195000
2349
- },
2350
- {
2351
- "epoch": 1.45,
2352
- "learning_rate": 1.3780735427072371e-05,
2353
- "loss": 1.8012,
2354
- "step": 195500
2355
- },
2356
- {
2357
- "epoch": 1.45,
2358
- "learning_rate": 1.368810303686028e-05,
2359
- "loss": 1.8183,
2360
- "step": 196000
2361
- },
2362
- {
2363
- "epoch": 1.46,
2364
- "learning_rate": 1.359547064664819e-05,
2365
- "loss": 1.822,
2366
- "step": 196500
2367
- },
2368
- {
2369
- "epoch": 1.46,
2370
- "learning_rate": 1.3502838256436099e-05,
2371
- "loss": 1.8482,
2372
- "step": 197000
2373
- },
2374
- {
2375
- "epoch": 1.46,
2376
- "learning_rate": 1.3410205866224008e-05,
2377
- "loss": 1.8322,
2378
- "step": 197500
2379
- },
2380
- {
2381
- "epoch": 1.47,
2382
- "learning_rate": 1.3317573476011915e-05,
2383
- "loss": 1.8187,
2384
- "step": 198000
2385
- },
2386
- {
2387
- "epoch": 1.47,
2388
- "learning_rate": 1.3224941085799824e-05,
2389
- "loss": 1.8166,
2390
- "step": 198500
2391
- },
2392
- {
2393
- "epoch": 1.47,
2394
- "learning_rate": 1.3132308695587734e-05,
2395
- "loss": 1.8481,
2396
- "step": 199000
2397
- },
2398
- {
2399
- "epoch": 1.48,
2400
- "learning_rate": 1.3039676305375643e-05,
2401
- "loss": 1.8319,
2402
- "step": 199500
2403
- },
2404
- {
2405
- "epoch": 1.48,
2406
- "learning_rate": 1.2947043915163554e-05,
2407
- "loss": 1.8159,
2408
- "step": 200000
2409
- },
2410
- {
2411
- "epoch": 1.49,
2412
- "learning_rate": 1.2854411524951463e-05,
2413
- "loss": 1.8474,
2414
- "step": 200500
2415
- },
2416
- {
2417
- "epoch": 1.49,
2418
- "learning_rate": 1.2761779134739372e-05,
2419
- "loss": 1.8305,
2420
- "step": 201000
2421
- },
2422
- {
2423
- "epoch": 1.49,
2424
- "learning_rate": 1.266914674452728e-05,
2425
- "loss": 1.827,
2426
- "step": 201500
2427
- },
2428
- {
2429
- "epoch": 1.5,
2430
- "learning_rate": 1.2576514354315189e-05,
2431
- "loss": 1.8219,
2432
- "step": 202000
2433
- },
2434
- {
2435
- "epoch": 1.5,
2436
- "learning_rate": 1.2483881964103098e-05,
2437
- "loss": 1.8142,
2438
- "step": 202500
2439
- },
2440
- {
2441
- "epoch": 1.5,
2442
- "learning_rate": 1.2391249573891007e-05,
2443
- "loss": 1.8161,
2444
- "step": 203000
2445
- },
2446
- {
2447
- "epoch": 1.51,
2448
- "learning_rate": 1.2298617183678914e-05,
2449
- "loss": 1.8336,
2450
- "step": 203500
2451
- },
2452
- {
2453
- "epoch": 1.51,
2454
- "learning_rate": 1.2205984793466823e-05,
2455
- "loss": 1.816,
2456
- "step": 204000
2457
- },
2458
- {
2459
- "epoch": 1.52,
2460
- "learning_rate": 1.2113352403254733e-05,
2461
- "loss": 1.8304,
2462
- "step": 204500
2463
- },
2464
- {
2465
- "epoch": 1.52,
2466
- "learning_rate": 1.2020720013042642e-05,
2467
- "loss": 1.8101,
2468
- "step": 205000
2469
- },
2470
- {
2471
- "epoch": 1.52,
2472
- "learning_rate": 1.192808762283055e-05,
2473
- "loss": 1.8219,
2474
- "step": 205500
2475
- },
2476
- {
2477
- "epoch": 1.53,
2478
- "learning_rate": 1.1835455232618458e-05,
2479
- "loss": 1.8271,
2480
- "step": 206000
2481
- },
2482
- {
2483
- "epoch": 1.53,
2484
- "learning_rate": 1.1742822842406368e-05,
2485
- "loss": 1.8542,
2486
- "step": 206500
2487
- },
2488
- {
2489
- "epoch": 1.53,
2490
- "learning_rate": 1.1650190452194277e-05,
2491
- "loss": 1.8504,
2492
- "step": 207000
2493
- },
2494
- {
2495
- "epoch": 1.54,
2496
- "learning_rate": 1.1557558061982184e-05,
2497
- "loss": 1.8177,
2498
- "step": 207500
2499
- },
2500
- {
2501
- "epoch": 1.54,
2502
- "learning_rate": 1.1464925671770093e-05,
2503
- "loss": 1.8297,
2504
- "step": 208000
2505
- },
2506
- {
2507
- "epoch": 1.55,
2508
- "learning_rate": 1.1372293281558004e-05,
2509
- "loss": 1.8154,
2510
- "step": 208500
2511
- },
2512
- {
2513
- "epoch": 1.55,
2514
- "learning_rate": 1.1279660891345913e-05,
2515
- "loss": 1.8371,
2516
- "step": 209000
2517
- },
2518
- {
2519
- "epoch": 1.55,
2520
- "learning_rate": 1.118702850113382e-05,
2521
- "loss": 1.8386,
2522
- "step": 209500
2523
- },
2524
- {
2525
- "epoch": 1.56,
2526
- "learning_rate": 1.109439611092173e-05,
2527
- "loss": 1.8414,
2528
- "step": 210000
2529
- },
2530
- {
2531
- "epoch": 1.56,
2532
- "learning_rate": 1.1001763720709639e-05,
2533
- "loss": 1.8708,
2534
- "step": 210500
2535
- },
2536
- {
2537
- "epoch": 1.56,
2538
- "learning_rate": 1.0909131330497548e-05,
2539
- "loss": 1.8323,
2540
- "step": 211000
2541
- },
2542
- {
2543
- "epoch": 1.57,
2544
- "learning_rate": 1.0816498940285456e-05,
2545
- "loss": 1.8587,
2546
- "step": 211500
2547
- },
2548
- {
2549
- "epoch": 1.57,
2550
- "learning_rate": 1.0723866550073365e-05,
2551
- "loss": 1.8364,
2552
- "step": 212000
2553
- },
2554
- {
2555
- "epoch": 1.57,
2556
- "learning_rate": 1.0631234159861274e-05,
2557
- "loss": 1.8504,
2558
- "step": 212500
2559
- },
2560
- {
2561
- "epoch": 1.58,
2562
- "learning_rate": 1.0538601769649183e-05,
2563
- "loss": 1.8394,
2564
- "step": 213000
2565
- },
2566
- {
2567
- "epoch": 1.58,
2568
- "learning_rate": 1.044596937943709e-05,
2569
- "loss": 1.811,
2570
- "step": 213500
2571
- },
2572
- {
2573
- "epoch": 1.59,
2574
- "learning_rate": 1.0353336989225001e-05,
2575
- "loss": 1.8517,
2576
- "step": 214000
2577
- },
2578
- {
2579
- "epoch": 1.59,
2580
- "learning_rate": 1.026070459901291e-05,
2581
- "loss": 1.8225,
2582
- "step": 214500
2583
- },
2584
- {
2585
- "epoch": 1.59,
2586
- "learning_rate": 1.016807220880082e-05,
2587
- "loss": 1.8488,
2588
- "step": 215000
2589
- },
2590
- {
2591
- "epoch": 1.6,
2592
- "learning_rate": 1.0075439818588727e-05,
2593
- "loss": 1.8289,
2594
- "step": 215500
2595
- },
2596
- {
2597
- "epoch": 1.6,
2598
- "learning_rate": 9.982807428376636e-06,
2599
- "loss": 1.8847,
2600
- "step": 216000
2601
- },
2602
- {
2603
- "epoch": 1.6,
2604
- "learning_rate": 9.890175038164546e-06,
2605
- "loss": 1.8436,
2606
- "step": 216500
2607
- },
2608
- {
2609
- "epoch": 1.61,
2610
- "learning_rate": 9.797542647952455e-06,
2611
- "loss": 1.8496,
2612
- "step": 217000
2613
- },
2614
- {
2615
- "epoch": 1.61,
2616
- "learning_rate": 9.704910257740362e-06,
2617
- "loss": 1.8299,
2618
- "step": 217500
2619
- },
2620
- {
2621
- "epoch": 1.62,
2622
- "learning_rate": 9.612277867528271e-06,
2623
- "loss": 1.8302,
2624
- "step": 218000
2625
- },
2626
- {
2627
- "epoch": 1.62,
2628
- "learning_rate": 9.51964547731618e-06,
2629
- "loss": 1.8361,
2630
- "step": 218500
2631
- },
2632
- {
2633
- "epoch": 1.62,
2634
- "learning_rate": 9.42701308710409e-06,
2635
- "loss": 1.8672,
2636
- "step": 219000
2637
- },
2638
- {
2639
- "epoch": 1.63,
2640
- "learning_rate": 9.334380696891999e-06,
2641
- "loss": 1.8126,
2642
- "step": 219500
2643
- },
2644
- {
2645
- "epoch": 1.63,
2646
- "learning_rate": 9.241748306679908e-06,
2647
- "loss": 1.8328,
2648
- "step": 220000
2649
- },
2650
- {
2651
- "epoch": 1.63,
2652
- "learning_rate": 9.149115916467817e-06,
2653
- "loss": 1.8235,
2654
- "step": 220500
2655
- },
2656
- {
2657
- "epoch": 1.64,
2658
- "learning_rate": 9.056483526255726e-06,
2659
- "loss": 1.834,
2660
- "step": 221000
2661
- },
2662
- {
2663
- "epoch": 1.64,
2664
- "learning_rate": 8.963851136043634e-06,
2665
- "loss": 1.8367,
2666
- "step": 221500
2667
- },
2668
- {
2669
- "epoch": 1.65,
2670
- "learning_rate": 8.871218745831543e-06,
2671
- "loss": 1.8178,
2672
- "step": 222000
2673
- },
2674
- {
2675
- "epoch": 1.65,
2676
- "learning_rate": 8.778586355619452e-06,
2677
- "loss": 1.827,
2678
- "step": 222500
2679
- },
2680
- {
2681
- "epoch": 1.65,
2682
- "learning_rate": 8.685953965407361e-06,
2683
- "loss": 1.8176,
2684
- "step": 223000
2685
- },
2686
- {
2687
- "epoch": 1.66,
2688
- "learning_rate": 8.593321575195269e-06,
2689
- "loss": 1.8043,
2690
- "step": 223500
2691
- },
2692
- {
2693
- "epoch": 1.66,
2694
- "learning_rate": 8.500689184983178e-06,
2695
- "loss": 1.824,
2696
- "step": 224000
2697
- },
2698
- {
2699
- "epoch": 1.66,
2700
- "learning_rate": 8.408056794771087e-06,
2701
- "loss": 1.8549,
2702
- "step": 224500
2703
- },
2704
- {
2705
- "epoch": 1.67,
2706
- "learning_rate": 8.315424404558996e-06,
2707
- "loss": 1.8264,
2708
- "step": 225000
2709
- },
2710
- {
2711
- "epoch": 1.67,
2712
- "learning_rate": 8.222792014346905e-06,
2713
- "loss": 1.8622,
2714
- "step": 225500
2715
- },
2716
- {
2717
- "epoch": 1.67,
2718
- "learning_rate": 8.130159624134814e-06,
2719
- "loss": 1.8566,
2720
- "step": 226000
2721
- },
2722
- {
2723
- "epoch": 1.68,
2724
- "learning_rate": 8.037527233922724e-06,
2725
- "loss": 1.8803,
2726
- "step": 226500
2727
- },
2728
- {
2729
- "epoch": 1.68,
2730
- "learning_rate": 7.944894843710633e-06,
2731
- "loss": 1.8482,
2732
- "step": 227000
2733
- },
2734
- {
2735
- "epoch": 1.69,
2736
- "learning_rate": 7.85226245349854e-06,
2737
- "loss": 1.8063,
2738
- "step": 227500
2739
- },
2740
- {
2741
- "epoch": 1.69,
2742
- "learning_rate": 7.75963006328645e-06,
2743
- "loss": 1.8298,
2744
- "step": 228000
2745
- },
2746
- {
2747
- "epoch": 1.69,
2748
- "learning_rate": 7.666997673074358e-06,
2749
- "loss": 1.8222,
2750
- "step": 228500
2751
- },
2752
- {
2753
- "epoch": 1.7,
2754
- "learning_rate": 7.574365282862267e-06,
2755
- "loss": 1.8158,
2756
- "step": 229000
2757
- },
2758
- {
2759
- "epoch": 1.7,
2760
- "learning_rate": 7.481732892650176e-06,
2761
- "loss": 1.8452,
2762
- "step": 229500
2763
- },
2764
- {
2765
- "epoch": 1.7,
2766
- "learning_rate": 7.389100502438084e-06,
2767
- "loss": 1.8256,
2768
- "step": 230000
2769
- },
2770
- {
2771
- "epoch": 1.71,
2772
- "learning_rate": 7.296468112225993e-06,
2773
- "loss": 1.7952,
2774
- "step": 230500
2775
- },
2776
- {
2777
- "epoch": 1.71,
2778
- "learning_rate": 7.203835722013903e-06,
2779
- "loss": 1.8425,
2780
- "step": 231000
2781
- },
2782
- {
2783
- "epoch": 1.72,
2784
- "learning_rate": 7.111203331801812e-06,
2785
- "loss": 1.838,
2786
- "step": 231500
2787
- },
2788
- {
2789
- "epoch": 1.72,
2790
- "learning_rate": 7.018570941589721e-06,
2791
- "loss": 1.8279,
2792
- "step": 232000
2793
- },
2794
- {
2795
- "epoch": 1.72,
2796
- "learning_rate": 6.925938551377629e-06,
2797
- "loss": 1.8274,
2798
- "step": 232500
2799
- },
2800
- {
2801
- "epoch": 1.73,
2802
- "learning_rate": 6.833306161165538e-06,
2803
- "loss": 1.8322,
2804
- "step": 233000
2805
- },
2806
- {
2807
- "epoch": 1.73,
2808
- "learning_rate": 6.7406737709534474e-06,
2809
- "loss": 1.8451,
2810
- "step": 233500
2811
- },
2812
- {
2813
- "epoch": 1.73,
2814
- "learning_rate": 6.648041380741356e-06,
2815
- "loss": 1.8469,
2816
- "step": 234000
2817
- },
2818
- {
2819
- "epoch": 1.74,
2820
- "learning_rate": 6.555408990529265e-06,
2821
- "loss": 1.8336,
2822
- "step": 234500
2823
- },
2824
- {
2825
- "epoch": 1.74,
2826
- "learning_rate": 6.462776600317173e-06,
2827
- "loss": 1.8538,
2828
- "step": 235000
2829
- },
2830
- {
2831
- "epoch": 1.75,
2832
- "learning_rate": 6.370144210105082e-06,
2833
- "loss": 1.8638,
2834
- "step": 235500
2835
- },
2836
- {
2837
- "epoch": 1.75,
2838
- "learning_rate": 6.277511819892991e-06,
2839
- "loss": 1.8497,
2840
- "step": 236000
2841
- },
2842
- {
2843
- "epoch": 1.75,
2844
- "learning_rate": 6.184879429680901e-06,
2845
- "loss": 1.8547,
2846
- "step": 236500
2847
- },
2848
- {
2849
- "epoch": 1.76,
2850
- "learning_rate": 6.092247039468809e-06,
2851
- "loss": 1.8248,
2852
- "step": 237000
2853
- },
2854
- {
2855
- "epoch": 1.76,
2856
- "learning_rate": 5.999614649256718e-06,
2857
- "loss": 1.8354,
2858
- "step": 237500
2859
- },
2860
- {
2861
- "epoch": 1.76,
2862
- "learning_rate": 5.906982259044627e-06,
2863
- "loss": 1.8425,
2864
- "step": 238000
2865
- },
2866
- {
2867
- "epoch": 1.77,
2868
- "learning_rate": 5.8143498688325356e-06,
2869
- "loss": 1.8571,
2870
- "step": 238500
2871
- },
2872
- {
2873
- "epoch": 1.77,
2874
- "learning_rate": 5.721717478620445e-06,
2875
- "loss": 1.8436,
2876
- "step": 239000
2877
- },
2878
- {
2879
- "epoch": 1.77,
2880
- "learning_rate": 5.629085088408354e-06,
2881
- "loss": 1.863,
2882
- "step": 239500
2883
- },
2884
- {
2885
- "epoch": 1.78,
2886
- "learning_rate": 5.536452698196262e-06,
2887
- "loss": 1.8557,
2888
- "step": 240000
2889
- },
2890
- {
2891
- "epoch": 1.78,
2892
- "learning_rate": 5.443820307984171e-06,
2893
- "loss": 1.8647,
2894
- "step": 240500
2895
- },
2896
- {
2897
- "epoch": 1.79,
2898
- "learning_rate": 5.3511879177720805e-06,
2899
- "loss": 1.8469,
2900
- "step": 241000
2901
- },
2902
- {
2903
- "epoch": 1.79,
2904
- "learning_rate": 5.258555527559989e-06,
2905
- "loss": 1.8427,
2906
- "step": 241500
2907
- },
2908
- {
2909
- "epoch": 1.79,
2910
- "learning_rate": 5.165923137347898e-06,
2911
- "loss": 1.8385,
2912
- "step": 242000
2913
- },
2914
- {
2915
- "epoch": 1.8,
2916
- "learning_rate": 5.073290747135806e-06,
2917
- "loss": 1.8224,
2918
- "step": 242500
2919
- },
2920
- {
2921
- "epoch": 1.8,
2922
- "learning_rate": 4.980658356923715e-06,
2923
- "loss": 1.8266,
2924
- "step": 243000
2925
- },
2926
- {
2927
- "epoch": 1.8,
2928
- "learning_rate": 4.8880259667116246e-06,
2929
- "loss": 1.8374,
2930
- "step": 243500
2931
- },
2932
- {
2933
- "epoch": 1.81,
2934
- "learning_rate": 4.795393576499534e-06,
2935
- "loss": 1.8227,
2936
- "step": 244000
2937
- },
2938
- {
2939
- "epoch": 1.81,
2940
- "learning_rate": 4.702761186287442e-06,
2941
- "loss": 1.9014,
2942
- "step": 244500
2943
- },
2944
- {
2945
- "epoch": 1.82,
2946
- "learning_rate": 4.610128796075351e-06,
2947
- "loss": 1.8057,
2948
- "step": 245000
2949
- },
2950
- {
2951
- "epoch": 1.82,
2952
- "learning_rate": 4.5174964058632595e-06,
2953
- "loss": 1.8626,
2954
- "step": 245500
2955
- },
2956
- {
2957
- "epoch": 1.82,
2958
- "learning_rate": 4.424864015651169e-06,
2959
- "loss": 1.8353,
2960
- "step": 246000
2961
- },
2962
- {
2963
- "epoch": 1.83,
2964
- "learning_rate": 4.332231625439078e-06,
2965
- "loss": 1.8497,
2966
- "step": 246500
2967
- },
2968
- {
2969
- "epoch": 1.83,
2970
- "learning_rate": 4.239599235226987e-06,
2971
- "loss": 1.8413,
2972
- "step": 247000
2973
- },
2974
- {
2975
- "epoch": 1.83,
2976
- "learning_rate": 4.146966845014895e-06,
2977
- "loss": 1.8455,
2978
- "step": 247500
2979
- },
2980
- {
2981
- "epoch": 1.84,
2982
- "learning_rate": 4.054334454802804e-06,
2983
- "loss": 1.8563,
2984
- "step": 248000
2985
- },
2986
- {
2987
- "epoch": 1.84,
2988
- "learning_rate": 3.961702064590713e-06,
2989
- "loss": 1.8326,
2990
- "step": 248500
2991
- },
2992
- {
2993
- "epoch": 1.85,
2994
- "learning_rate": 3.869069674378622e-06,
2995
- "loss": 1.862,
2996
- "step": 249000
2997
- },
2998
- {
2999
- "epoch": 1.85,
3000
- "learning_rate": 3.7764372841665314e-06,
3001
- "loss": 1.8478,
3002
- "step": 249500
3003
- },
3004
- {
3005
- "epoch": 1.85,
3006
- "learning_rate": 3.68380489395444e-06,
3007
- "loss": 1.8536,
3008
- "step": 250000
3009
- },
3010
- {
3011
- "epoch": 1.86,
3012
- "learning_rate": 3.591172503742349e-06,
3013
- "loss": 1.8461,
3014
- "step": 250500
3015
- },
3016
- {
3017
- "epoch": 1.86,
3018
- "learning_rate": 3.4985401135302576e-06,
3019
- "loss": 1.8516,
3020
- "step": 251000
3021
- },
3022
- {
3023
- "epoch": 1.86,
3024
- "learning_rate": 3.4059077233181664e-06,
3025
- "loss": 1.8275,
3026
- "step": 251500
3027
- },
3028
- {
3029
- "epoch": 1.87,
3030
- "learning_rate": 3.313275333106075e-06,
3031
- "loss": 1.842,
3032
- "step": 252000
3033
- },
3034
- {
3035
- "epoch": 1.87,
3036
- "learning_rate": 3.2206429428939847e-06,
3037
- "loss": 1.8754,
3038
- "step": 252500
3039
- },
3040
- {
3041
- "epoch": 1.87,
3042
- "learning_rate": 3.1280105526818934e-06,
3043
- "loss": 1.8505,
3044
- "step": 253000
3045
- },
3046
- {
3047
- "epoch": 1.88,
3048
- "learning_rate": 3.035378162469802e-06,
3049
- "loss": 1.8354,
3050
- "step": 253500
3051
- },
3052
- {
3053
- "epoch": 1.88,
3054
- "learning_rate": 2.942745772257711e-06,
3055
- "loss": 1.871,
3056
- "step": 254000
3057
- },
3058
- {
3059
- "epoch": 1.89,
3060
- "learning_rate": 2.85011338204562e-06,
3061
- "loss": 1.8274,
3062
- "step": 254500
3063
- },
3064
- {
3065
- "epoch": 1.89,
3066
- "learning_rate": 2.7574809918335287e-06,
3067
- "loss": 1.8193,
3068
- "step": 255000
3069
- },
3070
- {
3071
- "epoch": 1.89,
3072
- "learning_rate": 2.6648486016214375e-06,
3073
- "loss": 1.8336,
3074
- "step": 255500
3075
- },
3076
- {
3077
- "epoch": 1.9,
3078
- "learning_rate": 2.5722162114093466e-06,
3079
- "loss": 1.8599,
3080
- "step": 256000
3081
- },
3082
- {
3083
- "epoch": 1.9,
3084
- "learning_rate": 2.4795838211972554e-06,
3085
- "loss": 1.8584,
3086
- "step": 256500
3087
- },
3088
- {
3089
- "epoch": 1.9,
3090
- "learning_rate": 2.386951430985164e-06,
3091
- "loss": 1.8074,
3092
- "step": 257000
3093
- },
3094
- {
3095
- "epoch": 1.91,
3096
- "learning_rate": 2.2943190407730732e-06,
3097
- "loss": 1.8509,
3098
- "step": 257500
3099
- },
3100
- {
3101
- "epoch": 1.91,
3102
- "learning_rate": 2.201686650560982e-06,
3103
- "loss": 1.8579,
3104
- "step": 258000
3105
- },
3106
- {
3107
- "epoch": 1.92,
3108
- "learning_rate": 2.1090542603488907e-06,
3109
- "loss": 1.8529,
3110
- "step": 258500
3111
- },
3112
- {
3113
- "epoch": 1.92,
3114
- "learning_rate": 2.0164218701368e-06,
3115
- "loss": 1.8535,
3116
- "step": 259000
3117
- },
3118
- {
3119
- "epoch": 1.92,
3120
- "learning_rate": 1.9237894799247086e-06,
3121
- "loss": 1.899,
3122
- "step": 259500
3123
- },
3124
- {
3125
- "epoch": 1.93,
3126
- "learning_rate": 1.8311570897126173e-06,
3127
- "loss": 1.8707,
3128
- "step": 260000
3129
- },
3130
- {
3131
- "epoch": 1.93,
3132
- "learning_rate": 1.7385246995005263e-06,
3133
- "loss": 1.8482,
3134
- "step": 260500
3135
- },
3136
- {
3137
- "epoch": 1.93,
3138
- "learning_rate": 1.645892309288435e-06,
3139
- "loss": 1.8445,
3140
- "step": 261000
3141
- },
3142
- {
3143
- "epoch": 1.94,
3144
- "learning_rate": 1.553259919076344e-06,
3145
- "loss": 1.8472,
3146
- "step": 261500
3147
- },
3148
- {
3149
- "epoch": 1.94,
3150
- "learning_rate": 1.4606275288642529e-06,
3151
- "loss": 1.838,
3152
- "step": 262000
3153
- },
3154
- {
3155
- "epoch": 1.95,
3156
- "learning_rate": 1.3679951386521616e-06,
3157
- "loss": 1.8691,
3158
- "step": 262500
3159
- },
3160
- {
3161
- "epoch": 1.95,
3162
- "learning_rate": 1.2753627484400705e-06,
3163
- "loss": 1.8888,
3164
- "step": 263000
3165
- },
3166
- {
3167
- "epoch": 1.95,
3168
- "learning_rate": 1.1827303582279795e-06,
3169
- "loss": 1.8937,
3170
- "step": 263500
3171
- },
3172
- {
3173
- "epoch": 1.96,
3174
- "learning_rate": 1.0900979680158882e-06,
3175
- "loss": 1.8475,
3176
- "step": 264000
3177
- },
3178
- {
3179
- "epoch": 1.96,
3180
- "learning_rate": 9.974655778037972e-07,
3181
- "loss": 1.8725,
3182
- "step": 264500
3183
- },
3184
- {
3185
- "epoch": 1.96,
3186
- "learning_rate": 9.048331875917061e-07,
3187
- "loss": 1.8697,
3188
- "step": 265000
3189
- },
3190
- {
3191
- "epoch": 1.97,
3192
- "learning_rate": 8.12200797379615e-07,
3193
- "loss": 1.8696,
3194
- "step": 265500
3195
- },
3196
- {
3197
- "epoch": 1.97,
3198
- "learning_rate": 7.195684071675239e-07,
3199
- "loss": 1.852,
3200
- "step": 266000
3201
- },
3202
- {
3203
- "epoch": 1.97,
3204
- "learning_rate": 6.269360169554327e-07,
3205
- "loss": 1.8908,
3206
- "step": 266500
3207
- },
3208
- {
3209
- "epoch": 1.98,
3210
- "learning_rate": 5.343036267433415e-07,
3211
- "loss": 1.8715,
3212
- "step": 267000
3213
- },
3214
- {
3215
- "epoch": 1.98,
3216
- "learning_rate": 4.416712365312505e-07,
3217
- "loss": 1.8736,
3218
- "step": 267500
3219
- },
3220
- {
3221
- "epoch": 1.99,
3222
- "learning_rate": 3.490388463191593e-07,
3223
- "loss": 1.8706,
3224
- "step": 268000
3225
- },
3226
- {
3227
- "epoch": 1.99,
3228
- "learning_rate": 2.564064561070682e-07,
3229
- "loss": 1.8902,
3230
- "step": 268500
3231
- },
3232
- {
3233
- "epoch": 1.99,
3234
- "learning_rate": 1.637740658949771e-07,
3235
- "loss": 1.8956,
3236
- "step": 269000
3237
- },
3238
- {
3239
- "epoch": 2.0,
3240
- "learning_rate": 7.114167568288598e-08,
3241
- "loss": 1.8512,
3242
- "step": 269500
3243
- },
3244
- {
3245
- "epoch": 2.0,
3246
- "step": 269884,
3247
- "total_flos": 7.103408724277985e+17,
3248
- "train_loss": 0.612412237187797,
3249
- "train_runtime": 12901.2681,
3250
- "train_samples_per_second": 20.919,
3251
- "train_steps_per_second": 20.919
3252
  }
3253
  ],
3254
- "max_steps": 269884,
3255
- "num_train_epochs": 2,
3256
- "total_flos": 7.103408724277985e+17,
3257
  "trial_name": null,
3258
  "trial_params": null
3259
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "global_step": 168680,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
+ "learning_rate": 4.9925895186151295e-05,
13
+ "loss": 1.6378,
14
  "step": 500
15
  },
16
  {
17
  "epoch": 0.03,
18
+ "learning_rate": 4.985179037230259e-05,
19
+ "loss": 1.6723,
20
  "step": 1000
21
  },
22
  {
23
  "epoch": 0.04,
24
+ "learning_rate": 4.977768555845388e-05,
25
+ "loss": 1.6694,
26
  "step": 1500
27
  },
28
  {
29
  "epoch": 0.06,
30
+ "learning_rate": 4.970358074460517e-05,
31
+ "loss": 1.6685,
32
  "step": 2000
33
  },
34
  {
35
  "epoch": 0.07,
36
+ "learning_rate": 4.9629475930756463e-05,
37
+ "loss": 1.6812,
38
  "step": 2500
39
  },
40
  {
41
  "epoch": 0.09,
42
+ "learning_rate": 4.9555371116907756e-05,
43
+ "loss": 1.6792,
44
  "step": 3000
45
  },
46
  {
47
  "epoch": 0.1,
48
+ "learning_rate": 4.948126630305905e-05,
49
+ "loss": 1.6873,
50
  "step": 3500
51
  },
52
  {
53
  "epoch": 0.12,
54
+ "learning_rate": 4.940716148921034e-05,
55
+ "loss": 1.7036,
56
  "step": 4000
57
  },
58
  {
59
  "epoch": 0.13,
60
  "learning_rate": 4.866611335072326e-05,
61
+ "loss": 1.7003,
62
  "step": 4500
63
  },
64
  {
65
  "epoch": 0.15,
66
  "learning_rate": 4.8517903723025847e-05,
67
+ "loss": 1.6828,
68
  "step": 5000
69
  },
70
  {
71
  "epoch": 0.16,
72
  "learning_rate": 4.836969409532843e-05,
73
+ "loss": 1.6904,
74
  "step": 5500
75
  },
76
  {
77
  "epoch": 0.18,
78
  "learning_rate": 4.8221484467631015e-05,
79
+ "loss": 1.6975,
80
  "step": 6000
81
  },
82
  {
83
  "epoch": 0.19,
84
  "learning_rate": 4.80732748399336e-05,
85
+ "loss": 1.6904,
86
  "step": 6500
87
  },
88
  {
89
  "epoch": 0.21,
90
  "learning_rate": 4.7925065212236184e-05,
91
+ "loss": 1.6979,
92
  "step": 7000
93
  },
94
  {
95
  "epoch": 0.22,
96
  "learning_rate": 4.7776855584538775e-05,
97
+ "loss": 1.6774,
98
  "step": 7500
99
  },
100
  {
101
  "epoch": 0.24,
102
  "learning_rate": 4.762864595684136e-05,
103
+ "loss": 1.6976,
104
  "step": 8000
105
  },
106
  {
107
  "epoch": 0.25,
108
  "learning_rate": 4.7480436329143944e-05,
109
+ "loss": 1.715,
110
  "step": 8500
111
  },
112
  {
113
  "epoch": 0.27,
114
  "learning_rate": 4.733222670144653e-05,
115
+ "loss": 1.7125,
116
  "step": 9000
117
  },
118
  {
119
  "epoch": 0.28,
120
  "learning_rate": 4.718401707374911e-05,
121
+ "loss": 1.7065,
122
  "step": 9500
123
  },
124
  {
125
  "epoch": 0.3,
126
  "learning_rate": 4.70358074460517e-05,
127
+ "loss": 1.7062,
128
  "step": 10000
129
  },
130
  {
131
  "epoch": 0.31,
132
  "learning_rate": 4.688759781835428e-05,
133
+ "loss": 1.7038,
134
  "step": 10500
135
  },
136
  {
137
  "epoch": 0.33,
138
  "learning_rate": 4.6739388190656866e-05,
139
+ "loss": 1.7092,
140
  "step": 11000
141
  },
142
  {
143
  "epoch": 0.34,
144
  "learning_rate": 4.659117856295945e-05,
145
+ "loss": 1.7377,
146
  "step": 11500
147
  },
148
  {
149
  "epoch": 0.36,
150
  "learning_rate": 4.6442968935262035e-05,
151
+ "loss": 1.7306,
152
  "step": 12000
153
  },
154
  {
155
  "epoch": 0.37,
156
  "learning_rate": 4.629475930756462e-05,
157
+ "loss": 1.7128,
158
  "step": 12500
159
  },
160
  {
161
  "epoch": 0.39,
162
  "learning_rate": 4.6146549679867204e-05,
163
+ "loss": 1.7324,
164
  "step": 13000
165
  },
166
  {
167
  "epoch": 0.4,
168
  "learning_rate": 4.599834005216979e-05,
169
+ "loss": 1.7483,
170
  "step": 13500
171
  },
172
  {
173
  "epoch": 0.41,
174
  "learning_rate": 4.585013042447238e-05,
175
+ "loss": 1.7351,
176
  "step": 14000
177
  },
178
  {
179
  "epoch": 0.43,
180
  "learning_rate": 4.5701920796774964e-05,
181
+ "loss": 1.7238,
182
  "step": 14500
183
  },
184
  {
185
  "epoch": 0.44,
186
  "learning_rate": 4.555371116907755e-05,
187
+ "loss": 1.7315,
188
  "step": 15000
189
  },
190
  {
191
  "epoch": 0.46,
192
  "learning_rate": 4.540550154138013e-05,
193
+ "loss": 1.7391,
194
  "step": 15500
195
  },
196
  {
197
  "epoch": 0.47,
198
  "learning_rate": 4.525729191368272e-05,
199
+ "loss": 1.7384,
200
  "step": 16000
201
  },
202
  {
203
  "epoch": 0.49,
204
  "learning_rate": 4.51090822859853e-05,
205
+ "loss": 1.7613,
206
  "step": 16500
207
  },
208
  {
209
  "epoch": 0.5,
210
  "learning_rate": 4.4960872658287886e-05,
211
+ "loss": 1.7558,
212
  "step": 17000
213
  },
214
  {
215
  "epoch": 0.52,
216
  "learning_rate": 4.481266303059047e-05,
217
+ "loss": 1.7353,
218
  "step": 17500
219
  },
220
  {
221
  "epoch": 0.53,
222
  "learning_rate": 4.4664453402893055e-05,
223
+ "loss": 1.7513,
224
  "step": 18000
225
  },
226
  {
227
  "epoch": 0.55,
228
  "learning_rate": 4.451624377519564e-05,
229
+ "loss": 1.7365,
230
  "step": 18500
231
  },
232
  {
233
  "epoch": 0.56,
234
  "learning_rate": 4.4368034147498224e-05,
235
+ "loss": 1.7428,
236
  "step": 19000
237
  },
238
  {
239
  "epoch": 0.58,
240
  "learning_rate": 4.421982451980081e-05,
241
+ "loss": 1.7352,
242
  "step": 19500
243
  },
244
  {
245
  "epoch": 0.59,
246
  "learning_rate": 4.407161489210339e-05,
247
+ "loss": 1.7561,
248
  "step": 20000
249
  },
250
  {
251
  "epoch": 0.61,
252
  "learning_rate": 4.392340526440598e-05,
253
+ "loss": 1.7478,
254
  "step": 20500
255
  },
256
  {
257
  "epoch": 0.62,
258
  "learning_rate": 4.377519563670856e-05,
259
+ "loss": 1.7504,
260
  "step": 21000
261
  },
262
  {
263
  "epoch": 0.64,
264
  "learning_rate": 4.3626986009011146e-05,
265
+ "loss": 1.7484,
266
  "step": 21500
267
  },
268
  {
269
  "epoch": 0.65,
270
  "learning_rate": 4.347877638131374e-05,
271
+ "loss": 1.7685,
272
  "step": 22000
273
  },
274
  {
275
  "epoch": 0.67,
276
  "learning_rate": 4.333056675361632e-05,
277
+ "loss": 1.743,
278
  "step": 22500
279
  },
280
  {
281
  "epoch": 0.68,
282
  "learning_rate": 4.3182357125918906e-05,
283
+ "loss": 1.7624,
284
  "step": 23000
285
  },
286
  {
287
  "epoch": 0.7,
288
  "learning_rate": 4.303414749822149e-05,
289
+ "loss": 1.7769,
290
  "step": 23500
291
  },
292
  {
293
  "epoch": 0.71,
294
  "learning_rate": 4.2885937870524075e-05,
295
+ "loss": 1.7504,
296
  "step": 24000
297
  },
298
  {
299
  "epoch": 0.73,
300
  "learning_rate": 4.273772824282666e-05,
301
+ "loss": 1.7516,
302
  "step": 24500
303
  },
304
  {
305
  "epoch": 0.74,
306
  "learning_rate": 4.258951861512924e-05,
307
+ "loss": 1.7763,
308
  "step": 25000
309
  },
310
  {
311
  "epoch": 0.76,
312
  "learning_rate": 4.244130898743183e-05,
313
+ "loss": 1.7626,
314
  "step": 25500
315
  },
316
  {
317
  "epoch": 0.77,
318
  "learning_rate": 4.229309935973441e-05,
319
+ "loss": 1.7935,
320
  "step": 26000
321
  },
322
  {
323
  "epoch": 0.79,
324
  "learning_rate": 4.2144889732036997e-05,
325
+ "loss": 1.7821,
326
  "step": 26500
327
  },
328
  {
329
  "epoch": 0.8,
330
  "learning_rate": 4.199668010433958e-05,
331
+ "loss": 1.7785,
332
  "step": 27000
333
  },
334
  {
335
  "epoch": 0.82,
336
  "learning_rate": 4.1848470476642165e-05,
337
+ "loss": 1.7732,
338
  "step": 27500
339
  },
340
  {
341
  "epoch": 0.83,
342
  "learning_rate": 4.170026084894475e-05,
343
+ "loss": 1.7696,
344
  "step": 28000
345
  },
346
  {
347
  "epoch": 0.84,
348
  "learning_rate": 4.1552051221247334e-05,
349
+ "loss": 1.7821,
350
  "step": 28500
351
  },
352
  {
353
  "epoch": 0.86,
354
  "learning_rate": 4.140384159354992e-05,
355
+ "loss": 1.7855,
356
  "step": 29000
357
  },
358
  {
359
  "epoch": 0.87,
360
  "learning_rate": 4.12556319658525e-05,
361
+ "loss": 1.7815,
362
  "step": 29500
363
  },
364
  {
365
  "epoch": 0.89,
366
  "learning_rate": 4.110742233815509e-05,
367
+ "loss": 1.7776,
368
  "step": 30000
369
  },
370
  {
371
  "epoch": 0.9,
372
  "learning_rate": 4.095921271045767e-05,
373
+ "loss": 1.7864,
374
  "step": 30500
375
  },
376
  {
377
  "epoch": 0.92,
378
  "learning_rate": 4.0811003082760256e-05,
379
+ "loss": 1.7894,
380
  "step": 31000
381
  },
382
  {
383
  "epoch": 0.93,
384
  "learning_rate": 4.066279345506284e-05,
385
+ "loss": 1.7879,
386
  "step": 31500
387
  },
388
  {
389
  "epoch": 0.95,
390
  "learning_rate": 4.0514583827365425e-05,
391
+ "loss": 1.7928,
392
  "step": 32000
393
  },
394
  {
395
  "epoch": 0.96,
396
  "learning_rate": 4.036637419966801e-05,
397
+ "loss": 1.7965,
398
  "step": 32500
399
  },
400
  {
401
  "epoch": 0.98,
402
  "learning_rate": 4.0218164571970594e-05,
403
+ "loss": 1.7906,
404
  "step": 33000
405
  },
406
  {
407
  "epoch": 0.99,
408
  "learning_rate": 4.006995494427318e-05,
409
+ "loss": 1.8053,
410
  "step": 33500
411
  },
412
  {
413
  "epoch": 1.01,
414
  "learning_rate": 3.992174531657576e-05,
415
+ "loss": 1.6997,
416
  "step": 34000
417
  },
418
  {
419
  "epoch": 1.02,
420
  "learning_rate": 3.977353568887835e-05,
421
+ "loss": 1.627,
422
  "step": 34500
423
  },
424
  {
425
  "epoch": 1.04,
426
  "learning_rate": 3.962532606118093e-05,
427
+ "loss": 1.6205,
428
  "step": 35000
429
  },
430
  {
431
  "epoch": 1.05,
432
  "learning_rate": 3.9477116433483516e-05,
433
+ "loss": 1.624,
434
  "step": 35500
435
  },
436
  {
437
  "epoch": 1.07,
438
  "learning_rate": 3.93289068057861e-05,
439
+ "loss": 1.6387,
440
  "step": 36000
441
  },
442
  {
443
  "epoch": 1.08,
444
  "learning_rate": 3.9180697178088685e-05,
445
+ "loss": 1.632,
446
  "step": 36500
447
  },
448
  {
449
  "epoch": 1.1,
450
  "learning_rate": 3.9032487550391276e-05,
451
+ "loss": 1.6339,
452
  "step": 37000
453
  },
454
  {
455
  "epoch": 1.11,
456
  "learning_rate": 3.888427792269386e-05,
457
+ "loss": 1.6487,
458
  "step": 37500
459
  },
460
  {
461
  "epoch": 1.13,
462
  "learning_rate": 3.8736068294996445e-05,
463
+ "loss": 1.6449,
464
  "step": 38000
465
  },
466
  {
467
  "epoch": 1.14,
468
  "learning_rate": 3.858785866729903e-05,
469
+ "loss": 1.6598,
470
  "step": 38500
471
  },
472
  {
473
  "epoch": 1.16,
474
  "learning_rate": 3.8439649039601614e-05,
475
+ "loss": 1.6621,
476
  "step": 39000
477
  },
478
  {
479
  "epoch": 1.17,
480
  "learning_rate": 3.82914394119042e-05,
481
+ "loss": 1.6735,
482
  "step": 39500
483
  },
484
  {
485
  "epoch": 1.19,
486
  "learning_rate": 3.814322978420678e-05,
487
+ "loss": 1.665,
488
  "step": 40000
489
  },
490
  {
491
  "epoch": 1.2,
492
  "learning_rate": 3.799502015650937e-05,
493
+ "loss": 1.6542,
494
  "step": 40500
495
  },
496
  {
497
  "epoch": 1.22,
498
  "learning_rate": 3.784681052881195e-05,
499
+ "loss": 1.6632,
500
  "step": 41000
501
  },
502
  {
503
  "epoch": 1.23,
504
  "learning_rate": 3.7698600901114536e-05,
505
+ "loss": 1.6096,
506
  "step": 41500
507
  },
508
  {
509
  "epoch": 1.24,
510
  "learning_rate": 3.755039127341713e-05,
511
+ "loss": 1.5457,
512
  "step": 42000
513
  },
514
  {
515
  "epoch": 1.26,
516
  "learning_rate": 3.740218164571971e-05,
517
+ "loss": 1.538,
518
  "step": 42500
519
  },
520
  {
521
  "epoch": 1.27,
522
  "learning_rate": 3.7253972018022296e-05,
523
+ "loss": 1.5538,
524
  "step": 43000
525
  },
526
  {
527
  "epoch": 1.29,
528
  "learning_rate": 3.710576239032488e-05,
529
+ "loss": 1.5505,
530
  "step": 43500
531
  },
532
  {
533
  "epoch": 1.3,
534
  "learning_rate": 3.6957552762627465e-05,
535
+ "loss": 1.5386,
536
  "step": 44000
537
  },
538
  {
539
  "epoch": 1.32,
540
  "learning_rate": 3.680934313493005e-05,
541
+ "loss": 1.5629,
542
  "step": 44500
543
  },
544
  {
545
  "epoch": 1.33,
546
  "learning_rate": 3.6661133507232633e-05,
547
+ "loss": 1.5639,
548
  "step": 45000
549
  },
550
  {
551
  "epoch": 1.35,
552
  "learning_rate": 3.651292387953522e-05,
553
+ "loss": 1.6376,
554
  "step": 45500
555
  },
556
  {
557
  "epoch": 1.36,
558
  "learning_rate": 3.63647142518378e-05,
559
+ "loss": 1.6273,
560
  "step": 46000
561
  },
562
  {
563
  "epoch": 1.38,
564
  "learning_rate": 3.621650462414039e-05,
565
+ "loss": 1.6433,
566
  "step": 46500
567
  },
568
  {
569
  "epoch": 1.39,
570
  "learning_rate": 3.606829499644297e-05,
571
+ "loss": 1.6421,
572
  "step": 47000
573
  },
574
  {
575
  "epoch": 1.41,
576
  "learning_rate": 3.5920085368745556e-05,
577
+ "loss": 1.6529,
578
  "step": 47500
579
  },
580
  {
581
  "epoch": 1.42,
582
  "learning_rate": 3.577187574104814e-05,
583
+ "loss": 1.6501,
584
  "step": 48000
585
  },
586
  {
587
  "epoch": 1.44,
588
  "learning_rate": 3.5623666113350724e-05,
589
+ "loss": 1.6568,
590
  "step": 48500
591
  },
592
  {
593
  "epoch": 1.45,
594
  "learning_rate": 3.547545648565331e-05,
595
+ "loss": 1.6454,
596
  "step": 49000
597
  },
598
  {
599
  "epoch": 1.47,
600
  "learning_rate": 3.532724685795589e-05,
601
+ "loss": 1.6698,
602
  "step": 49500
603
  },
604
  {
605
  "epoch": 1.48,
606
  "learning_rate": 3.517903723025848e-05,
607
+ "loss": 1.672,
608
  "step": 50000
609
  },
610
  {
611
  "epoch": 1.5,
612
  "learning_rate": 3.503082760256106e-05,
613
+ "loss": 1.6761,
614
  "step": 50500
615
  },
616
  {
617
  "epoch": 1.51,
618
  "learning_rate": 3.4882617974863646e-05,
619
+ "loss": 1.6686,
620
  "step": 51000
621
  },
622
  {
623
  "epoch": 1.53,
624
  "learning_rate": 3.473440834716624e-05,
625
+ "loss": 1.6736,
626
  "step": 51500
627
  },
628
  {
629
  "epoch": 1.54,
630
  "learning_rate": 3.458619871946882e-05,
631
+ "loss": 1.6939,
632
  "step": 52000
633
  },
634
  {
635
  "epoch": 1.56,
636
  "learning_rate": 3.4437989091771406e-05,
637
+ "loss": 1.6905,
638
  "step": 52500
639
  },
640
  {
641
  "epoch": 1.57,
642
  "learning_rate": 3.428977946407399e-05,
643
+ "loss": 1.7088,
644
  "step": 53000
645
  },
646
  {
647
  "epoch": 1.59,
648
  "learning_rate": 3.4141569836376575e-05,
649
+ "loss": 1.7015,
650
  "step": 53500
651
  },
652
  {
653
  "epoch": 1.6,
654
  "learning_rate": 3.399336020867916e-05,
655
+ "loss": 1.7107,
656
  "step": 54000
657
  },
658
  {
659
  "epoch": 1.62,
660
  "learning_rate": 3.3845150580981744e-05,
661
+ "loss": 1.705,
662
  "step": 54500
663
  },
664
  {
665
  "epoch": 1.63,
666
  "learning_rate": 3.369694095328433e-05,
667
+ "loss": 1.71,
668
  "step": 55000
669
  },
670
  {
671
  "epoch": 1.65,
672
  "learning_rate": 3.354873132558691e-05,
673
+ "loss": 1.7015,
674
  "step": 55500
675
  },
676
  {
677
  "epoch": 1.66,
678
  "learning_rate": 3.34005216978895e-05,
679
+ "loss": 1.6962,
680
  "step": 56000
681
  },
682
  {
683
  "epoch": 1.67,
684
  "learning_rate": 3.325231207019208e-05,
685
+ "loss": 1.7304,
686
  "step": 56500
687
  },
688
  {
689
  "epoch": 1.69,
690
  "learning_rate": 3.3104102442494666e-05,
691
+ "loss": 1.7234,
692
  "step": 57000
693
  },
694
  {
695
  "epoch": 1.7,
696
  "learning_rate": 3.295589281479725e-05,
697
+ "loss": 1.7124,
698
  "step": 57500
699
  },
700
  {
701
  "epoch": 1.72,
702
  "learning_rate": 3.2807683187099835e-05,
703
+ "loss": 1.7161,
704
  "step": 58000
705
  },
706
  {
707
  "epoch": 1.73,
708
  "learning_rate": 3.265947355940242e-05,
709
+ "loss": 1.7298,
710
  "step": 58500
711
  },
712
  {
713
  "epoch": 1.75,
714
  "learning_rate": 3.2511263931705004e-05,
715
+ "loss": 1.7463,
716
  "step": 59000
717
  },
718
  {
719
  "epoch": 1.76,
720
  "learning_rate": 3.236305430400759e-05,
721
+ "loss": 1.7358,
722
  "step": 59500
723
  },
724
  {
725
  "epoch": 1.78,
726
  "learning_rate": 3.221484467631017e-05,
727
+ "loss": 1.7545,
728
  "step": 60000
729
  },
730
  {
731
  "epoch": 1.79,
732
  "learning_rate": 3.206663504861276e-05,
733
+ "loss": 1.7504,
734
  "step": 60500
735
  },
736
  {
737
  "epoch": 1.81,
738
  "learning_rate": 3.191842542091534e-05,
739
+ "loss": 1.7291,
740
  "step": 61000
741
  },
742
  {
743
  "epoch": 1.82,
744
  "learning_rate": 3.1770215793217926e-05,
745
+ "loss": 1.7584,
746
  "step": 61500
747
  },
748
  {
749
  "epoch": 1.84,
750
  "learning_rate": 3.162200616552051e-05,
751
+ "loss": 1.7563,
752
  "step": 62000
753
  },
754
  {
755
  "epoch": 1.85,
756
  "learning_rate": 3.1473796537823095e-05,
757
+ "loss": 1.7621,
758
  "step": 62500
759
  },
760
  {
761
  "epoch": 1.87,
762
  "learning_rate": 3.132558691012568e-05,
763
+ "loss": 1.7567,
764
  "step": 63000
765
  },
766
  {
767
  "epoch": 1.88,
768
  "learning_rate": 3.1177377282428263e-05,
769
+ "loss": 1.7759,
770
  "step": 63500
771
  },
772
  {
773
  "epoch": 1.9,
774
  "learning_rate": 3.102916765473085e-05,
775
+ "loss": 1.7544,
776
  "step": 64000
777
  },
778
  {
779
  "epoch": 1.91,
780
  "learning_rate": 3.088095802703343e-05,
781
+ "loss": 1.7663,
782
  "step": 64500
783
  },
784
  {
785
  "epoch": 1.93,
786
  "learning_rate": 3.073274839933602e-05,
787
+ "loss": 1.7943,
788
  "step": 65000
789
  },
790
  {
791
  "epoch": 1.94,
792
  "learning_rate": 3.05845387716386e-05,
793
+ "loss": 1.7706,
794
  "step": 65500
795
  },
796
  {
797
  "epoch": 1.96,
798
  "learning_rate": 3.0436329143941196e-05,
799
+ "loss": 1.8033,
800
  "step": 66000
801
  },
802
  {
803
  "epoch": 1.97,
804
  "learning_rate": 3.028811951624378e-05,
805
+ "loss": 1.7962,
806
  "step": 66500
807
  },
808
  {
809
  "epoch": 1.99,
810
  "learning_rate": 3.0139909888546365e-05,
811
+ "loss": 1.8109,
812
  "step": 67000
813
  },
814
  {
815
  "epoch": 2.0,
816
  "learning_rate": 2.999170026084895e-05,
817
+ "loss": 1.7925,
818
  "step": 67500
819
  },
820
  {
821
  "epoch": 2.02,
822
  "learning_rate": 2.9843490633151533e-05,
823
+ "loss": 1.5786,
824
  "step": 68000
825
  },
826
  {
827
  "epoch": 2.03,
828
  "learning_rate": 2.9695281005454118e-05,
829
+ "loss": 1.5676,
830
  "step": 68500
831
  },
832
  {
833
  "epoch": 2.05,
834
  "learning_rate": 2.9547071377756702e-05,
835
+ "loss": 1.5703,
836
  "step": 69000
837
  },
838
  {
839
  "epoch": 2.06,
840
  "learning_rate": 2.9398861750059287e-05,
841
+ "loss": 1.566,
842
  "step": 69500
843
  },
844
  {
845
  "epoch": 2.07,
846
  "learning_rate": 2.925065212236187e-05,
847
+ "loss": 1.5811,
848
  "step": 70000
849
  },
850
  {
851
  "epoch": 2.09,
852
  "learning_rate": 2.9102442494664455e-05,
853
+ "loss": 1.5771,
854
  "step": 70500
855
  },
856
  {
857
  "epoch": 2.1,
858
  "learning_rate": 2.895423286696704e-05,
859
+ "loss": 1.5829,
860
  "step": 71000
861
  },
862
  {
863
  "epoch": 2.12,
864
  "learning_rate": 2.8806023239269624e-05,
865
+ "loss": 1.5809,
866
  "step": 71500
867
  },
868
  {
869
  "epoch": 2.13,
870
  "learning_rate": 2.865781361157221e-05,
871
+ "loss": 1.5957,
872
  "step": 72000
873
  },
874
  {
875
  "epoch": 2.15,
876
  "learning_rate": 2.8509603983874793e-05,
877
+ "loss": 1.5873,
878
  "step": 72500
879
  },
880
  {
881
  "epoch": 2.16,
882
  "learning_rate": 2.8361394356177377e-05,
883
+ "loss": 1.6051,
884
  "step": 73000
885
  },
886
  {
887
  "epoch": 2.18,
888
  "learning_rate": 2.8213184728479962e-05,
889
+ "loss": 1.6021,
890
  "step": 73500
891
  },
892
  {
893
  "epoch": 2.19,
894
  "learning_rate": 2.8064975100782546e-05,
895
+ "loss": 1.6071,
896
  "step": 74000
897
  },
898
  {
899
  "epoch": 2.21,
900
  "learning_rate": 2.7916765473085134e-05,
901
+ "loss": 1.6173,
902
  "step": 74500
903
  },
904
  {
905
  "epoch": 2.22,
906
  "learning_rate": 2.776855584538772e-05,
907
+ "loss": 1.6023,
908
  "step": 75000
909
  },
910
  {
911
  "epoch": 2.24,
912
  "learning_rate": 2.7620346217690303e-05,
913
+ "loss": 1.5913,
914
  "step": 75500
915
  },
916
  {
917
  "epoch": 2.25,
918
  "learning_rate": 2.7472136589992887e-05,
919
+ "loss": 1.6075,
920
  "step": 76000
921
  },
922
  {
923
  "epoch": 2.27,
924
  "learning_rate": 2.7323926962295472e-05,
925
+ "loss": 1.6212,
926
  "step": 76500
927
  },
928
  {
929
  "epoch": 2.28,
930
  "learning_rate": 2.7175717334598056e-05,
931
+ "loss": 1.6085,
932
  "step": 77000
933
  },
934
  {
935
  "epoch": 2.3,
936
  "learning_rate": 2.702750770690064e-05,
937
+ "loss": 1.6209,
938
  "step": 77500
939
  },
940
  {
941
  "epoch": 2.31,
942
  "learning_rate": 2.6879298079203225e-05,
943
+ "loss": 1.6171,
944
  "step": 78000
945
  },
946
  {
947
  "epoch": 2.33,
948
  "learning_rate": 2.673108845150581e-05,
949
+ "loss": 1.6225,
950
  "step": 78500
951
  },
952
  {
953
  "epoch": 2.34,
954
  "learning_rate": 2.6582878823808394e-05,
955
+ "loss": 1.6236,
956
  "step": 79000
957
  },
958
  {
959
  "epoch": 2.36,
960
  "learning_rate": 2.6434669196110978e-05,
961
+ "loss": 1.6109,
962
  "step": 79500
963
  },
964
  {
965
  "epoch": 2.37,
966
  "learning_rate": 2.6286459568413563e-05,
967
+ "loss": 1.6405,
968
  "step": 80000
969
  },
970
  {
971
  "epoch": 2.39,
972
  "learning_rate": 2.6138249940716147e-05,
973
+ "loss": 1.6367,
974
  "step": 80500
975
  },
976
  {
977
  "epoch": 2.4,
978
  "learning_rate": 2.5990040313018738e-05,
979
+ "loss": 1.6158,
980
  "step": 81000
981
  },
982
  {
983
  "epoch": 2.42,
984
  "learning_rate": 2.5841830685321323e-05,
985
+ "loss": 1.6349,
986
  "step": 81500
987
  },
988
  {
989
  "epoch": 2.43,
990
  "learning_rate": 2.5693621057623907e-05,
991
+ "loss": 1.6462,
992
  "step": 82000
993
  },
994
  {
995
  "epoch": 2.45,
996
  "learning_rate": 2.554541142992649e-05,
997
+ "loss": 1.6367,
998
  "step": 82500
999
  },
1000
  {
1001
  "epoch": 2.46,
1002
  "learning_rate": 2.5397201802229076e-05,
1003
+ "loss": 1.6175,
1004
  "step": 83000
1005
  },
1006
  {
1007
  "epoch": 2.48,
1008
  "learning_rate": 2.524899217453166e-05,
1009
+ "loss": 1.6294,
1010
  "step": 83500
1011
  },
1012
  {
1013
  "epoch": 2.49,
1014
  "learning_rate": 2.5100782546834245e-05,
1015
+ "loss": 1.6447,
1016
  "step": 84000
1017
  },
1018
  {
1019
  "epoch": 2.5,
1020
  "learning_rate": 2.495257291913683e-05,
1021
+ "loss": 1.6362,
1022
  "step": 84500
1023
  },
1024
  {
1025
  "epoch": 2.52,
1026
  "learning_rate": 2.4804363291439414e-05,
1027
+ "loss": 1.6515,
1028
  "step": 85000
1029
  },
1030
  {
1031
  "epoch": 2.53,
1032
  "learning_rate": 2.4656153663741998e-05,
1033
+ "loss": 1.6596,
1034
  "step": 85500
1035
  },
1036
  {
1037
  "epoch": 2.55,
1038
  "learning_rate": 2.4507944036044582e-05,
1039
+ "loss": 1.6572,
1040
  "step": 86000
1041
  },
1042
  {
1043
  "epoch": 2.56,
1044
  "learning_rate": 2.4359734408347167e-05,
1045
+ "loss": 1.6417,
1046
  "step": 86500
1047
  },
1048
  {
1049
  "epoch": 2.58,
1050
  "learning_rate": 2.421152478064975e-05,
1051
+ "loss": 1.656,
1052
  "step": 87000
1053
  },
1054
  {
1055
  "epoch": 2.59,
1056
  "learning_rate": 2.4063315152952336e-05,
1057
+ "loss": 1.6642,
1058
  "step": 87500
1059
  },
1060
  {
1061
  "epoch": 2.61,
1062
  "learning_rate": 2.391510552525492e-05,
1063
+ "loss": 1.6481,
1064
  "step": 88000
1065
  },
1066
  {
1067
  "epoch": 2.62,
1068
  "learning_rate": 2.3766895897557508e-05,
1069
+ "loss": 1.6528,
1070
  "step": 88500
1071
  },
1072
  {
1073
  "epoch": 2.64,
1074
  "learning_rate": 2.3618686269860092e-05,
1075
+ "loss": 1.64,
1076
  "step": 89000
1077
  },
1078
  {
1079
  "epoch": 2.65,
1080
  "learning_rate": 2.3470476642162677e-05,
1081
+ "loss": 1.6621,
1082
  "step": 89500
1083
  },
1084
  {
1085
  "epoch": 2.67,
1086
  "learning_rate": 2.332226701446526e-05,
1087
+ "loss": 1.6731,
1088
  "step": 90000
1089
  },
1090
  {
1091
  "epoch": 2.68,
1092
  "learning_rate": 2.3174057386767846e-05,
1093
+ "loss": 1.6672,
1094
  "step": 90500
1095
  },
1096
  {
1097
  "epoch": 2.7,
1098
  "learning_rate": 2.302584775907043e-05,
1099
+ "loss": 1.6476,
1100
  "step": 91000
1101
  },
1102
  {
1103
  "epoch": 2.71,
1104
  "learning_rate": 2.2877638131373014e-05,
1105
+ "loss": 1.6773,
1106
  "step": 91500
1107
  },
1108
  {
1109
  "epoch": 2.73,
1110
  "learning_rate": 2.2729428503675602e-05,
1111
+ "loss": 1.6671,
1112
  "step": 92000
1113
  },
1114
  {
1115
  "epoch": 2.74,
1116
  "learning_rate": 2.2581218875978187e-05,
1117
+ "loss": 1.6623,
1118
  "step": 92500
1119
  },
1120
  {
1121
  "epoch": 2.76,
1122
  "learning_rate": 2.243300924828077e-05,
1123
+ "loss": 1.6602,
1124
  "step": 93000
1125
  },
1126
  {
1127
  "epoch": 2.77,
1128
  "learning_rate": 2.2284799620583355e-05,
1129
+ "loss": 1.6765,
1130
  "step": 93500
1131
  },
1132
  {
1133
  "epoch": 2.79,
1134
  "learning_rate": 2.213658999288594e-05,
1135
+ "loss": 1.6874,
1136
  "step": 94000
1137
  },
1138
  {
1139
  "epoch": 2.8,
1140
  "learning_rate": 2.1988380365188524e-05,
1141
+ "loss": 1.6777,
1142
  "step": 94500
1143
  },
1144
  {
1145
  "epoch": 2.82,
1146
  "learning_rate": 2.184017073749111e-05,
1147
+ "loss": 1.6582,
1148
  "step": 95000
1149
  },
1150
  {
1151
  "epoch": 2.83,
1152
  "learning_rate": 2.1691961109793693e-05,
1153
+ "loss": 1.6663,
1154
  "step": 95500
1155
  },
1156
  {
1157
  "epoch": 2.85,
1158
  "learning_rate": 2.1543751482096277e-05,
1159
+ "loss": 1.6773,
1160
  "step": 96000
1161
  },
1162
  {
1163
  "epoch": 2.86,
1164
  "learning_rate": 2.1395541854398862e-05,
1165
+ "loss": 1.6781,
1166
  "step": 96500
1167
  },
1168
  {
1169
  "epoch": 2.88,
1170
  "learning_rate": 2.1247332226701446e-05,
1171
+ "loss": 1.6983,
1172
  "step": 97000
1173
  },
1174
  {
1175
  "epoch": 2.89,
1176
  "learning_rate": 2.109912259900403e-05,
1177
+ "loss": 1.6733,
1178
  "step": 97500
1179
  },
1180
  {
1181
  "epoch": 2.9,
1182
  "learning_rate": 2.0950912971306615e-05,
1183
+ "loss": 1.6931,
1184
  "step": 98000
1185
  },
1186
  {
1187
  "epoch": 2.92,
1188
  "learning_rate": 2.08027033436092e-05,
1189
+ "loss": 1.6948,
1190
  "step": 98500
1191
  },
1192
  {
1193
  "epoch": 2.93,
1194
  "learning_rate": 2.0654493715911787e-05,
1195
+ "loss": 1.6866,
1196
  "step": 99000
1197
  },
1198
  {
1199
  "epoch": 2.95,
1200
  "learning_rate": 2.0506284088214372e-05,
1201
+ "loss": 1.7013,
1202
  "step": 99500
1203
  },
1204
  {
1205
  "epoch": 2.96,
1206
  "learning_rate": 2.0358074460516956e-05,
1207
+ "loss": 1.6963,
1208
  "step": 100000
1209
  },
1210
  {
1211
  "epoch": 2.98,
1212
  "learning_rate": 2.020986483281954e-05,
1213
+ "loss": 1.6904,
1214
  "step": 100500
1215
  },
1216
  {
1217
  "epoch": 2.99,
1218
  "learning_rate": 2.0061655205122125e-05,
1219
+ "loss": 1.7072,
1220
  "step": 101000
1221
  },
1222
  {
1223
  "epoch": 3.01,
1224
  "learning_rate": 1.991344557742471e-05,
1225
+ "loss": 1.6192,
1226
  "step": 101500
1227
  },
1228
  {
1229
  "epoch": 3.02,
1230
  "learning_rate": 1.9765235949727294e-05,
1231
+ "loss": 1.5539,
1232
  "step": 102000
1233
  },
1234
  {
1235
  "epoch": 3.04,
1236
  "learning_rate": 1.961702632202988e-05,
1237
+ "loss": 1.5617,
1238
  "step": 102500
1239
  },
1240
  {
1241
  "epoch": 3.05,
1242
  "learning_rate": 1.9468816694332466e-05,
1243
+ "loss": 1.5699,
1244
  "step": 103000
1245
  },
1246
  {
1247
  "epoch": 3.07,
1248
  "learning_rate": 1.932060706663505e-05,
1249
+ "loss": 1.5772,
1250
  "step": 103500
1251
  },
1252
  {
1253
  "epoch": 3.08,
1254
  "learning_rate": 1.9172397438937635e-05,
1255
+ "loss": 1.5697,
1256
  "step": 104000
1257
  },
1258
  {
1259
  "epoch": 3.1,
1260
  "learning_rate": 1.902418781124022e-05,
1261
+ "loss": 1.5681,
1262
  "step": 104500
1263
  },
1264
  {
1265
  "epoch": 3.11,
1266
  "learning_rate": 1.8875978183542804e-05,
1267
+ "loss": 1.5799,
1268
  "step": 105000
1269
  },
1270
  {
1271
  "epoch": 3.13,
1272
  "learning_rate": 1.8727768555845388e-05,
1273
+ "loss": 1.5845,
1274
  "step": 105500
1275
  },
1276
  {
1277
  "epoch": 3.14,
1278
  "learning_rate": 1.8579558928147972e-05,
1279
+ "loss": 1.5768,
1280
  "step": 106000
1281
  },
1282
  {
1283
  "epoch": 3.16,
1284
  "learning_rate": 1.843134930045056e-05,
1285
+ "loss": 1.5776,
1286
  "step": 106500
1287
  },
1288
  {
1289
  "epoch": 3.17,
1290
  "learning_rate": 1.8283139672753145e-05,
1291
+ "loss": 1.5728,
1292
  "step": 107000
1293
  },
1294
  {
1295
  "epoch": 3.19,
1296
  "learning_rate": 1.813493004505573e-05,
1297
+ "loss": 1.5831,
1298
  "step": 107500
1299
  },
1300
  {
1301
  "epoch": 3.2,
1302
  "learning_rate": 1.7986720417358314e-05,
1303
+ "loss": 1.5911,
1304
  "step": 108000
1305
  },
1306
  {
1307
  "epoch": 3.22,
1308
  "learning_rate": 1.7838510789660898e-05,
1309
+ "loss": 1.5932,
1310
  "step": 108500
1311
  },
1312
  {
1313
  "epoch": 3.23,
1314
  "learning_rate": 1.7690301161963482e-05,
1315
+ "loss": 1.5702,
1316
  "step": 109000
1317
  },
1318
  {
1319
  "epoch": 3.25,
1320
  "learning_rate": 1.7542091534266067e-05,
1321
+ "loss": 1.5901,
1322
  "step": 109500
1323
  },
1324
  {
1325
  "epoch": 3.26,
1326
  "learning_rate": 1.739388190656865e-05,
1327
+ "loss": 1.594,
1328
  "step": 110000
1329
  },
1330
  {
1331
  "epoch": 3.28,
1332
  "learning_rate": 1.7245672278871236e-05,
1333
+ "loss": 1.5922,
1334
  "step": 110500
1335
  },
1336
  {
1337
  "epoch": 3.29,
1338
  "learning_rate": 1.709746265117382e-05,
1339
+ "loss": 1.5669,
1340
  "step": 111000
1341
  },
1342
  {
1343
  "epoch": 3.31,
1344
  "learning_rate": 1.6949253023476404e-05,
1345
+ "loss": 1.5945,
1346
  "step": 111500
1347
  },
1348
  {
1349
  "epoch": 3.32,
1350
  "learning_rate": 1.680104339577899e-05,
1351
+ "loss": 1.5999,
1352
  "step": 112000
1353
  },
1354
  {
1355
  "epoch": 3.33,
1356
  "learning_rate": 1.6652833768081573e-05,
1357
+ "loss": 1.5991,
1358
  "step": 112500
1359
  },
1360
  {
1361
  "epoch": 3.35,
1362
  "learning_rate": 1.6504624140384158e-05,
1363
+ "loss": 1.5865,
1364
  "step": 113000
1365
  },
1366
  {
1367
  "epoch": 3.36,
1368
  "learning_rate": 1.6356414512686745e-05,
1369
+ "loss": 1.598,
1370
  "step": 113500
1371
  },
1372
  {
1373
  "epoch": 3.38,
1374
  "learning_rate": 1.620820488498933e-05,
1375
+ "loss": 1.5958,
1376
  "step": 114000
1377
  },
1378
  {
1379
  "epoch": 3.39,
1380
  "learning_rate": 1.6059995257291914e-05,
1381
+ "loss": 1.5898,
1382
  "step": 114500
1383
  },
1384
  {
1385
  "epoch": 3.41,
1386
  "learning_rate": 1.59117856295945e-05,
1387
+ "loss": 1.602,
1388
  "step": 115000
1389
  },
1390
  {
1391
  "epoch": 3.42,
1392
  "learning_rate": 1.5763576001897083e-05,
1393
+ "loss": 1.5987,
1394
  "step": 115500
1395
  },
1396
  {
1397
  "epoch": 3.44,
1398
  "learning_rate": 1.5615366374199667e-05,
1399
+ "loss": 1.5998,
1400
  "step": 116000
1401
  },
1402
  {
1403
  "epoch": 3.45,
1404
  "learning_rate": 1.5467156746502255e-05,
1405
+ "loss": 1.6131,
1406
  "step": 116500
1407
  },
1408
  {
1409
  "epoch": 3.47,
1410
  "learning_rate": 1.531894711880484e-05,
1411
+ "loss": 1.6055,
1412
  "step": 117000
1413
  },
1414
  {
1415
  "epoch": 3.48,
1416
  "learning_rate": 1.5170737491107422e-05,
1417
+ "loss": 1.6107,
1418
  "step": 117500
1419
  },
1420
  {
1421
  "epoch": 3.5,
1422
  "learning_rate": 1.5022527863410007e-05,
1423
+ "loss": 1.5989,
1424
  "step": 118000
1425
  },
1426
  {
1427
  "epoch": 3.51,
1428
  "learning_rate": 1.4874318235712591e-05,
1429
+ "loss": 1.6009,
1430
  "step": 118500
1431
  },
1432
  {
1433
  "epoch": 3.53,
1434
  "learning_rate": 1.4726108608015177e-05,
1435
+ "loss": 1.601,
1436
  "step": 119000
1437
  },
1438
  {
1439
  "epoch": 3.54,
1440
  "learning_rate": 1.4577898980317762e-05,
1441
+ "loss": 1.6077,
1442
  "step": 119500
1443
  },
1444
  {
1445
  "epoch": 3.56,
1446
  "learning_rate": 1.4429689352620346e-05,
1447
+ "loss": 1.5987,
1448
  "step": 120000
1449
  },
1450
  {
1451
  "epoch": 3.57,
1452
  "learning_rate": 1.428147972492293e-05,
1453
+ "loss": 1.6148,
1454
  "step": 120500
1455
  },
1456
  {
1457
  "epoch": 3.59,
1458
  "learning_rate": 1.4133270097225518e-05,
1459
+ "loss": 1.6126,
1460
  "step": 121000
1461
  },
1462
  {
1463
  "epoch": 3.6,
1464
  "learning_rate": 1.3985060469528103e-05,
1465
+ "loss": 1.6129,
1466
  "step": 121500
1467
  },
1468
  {
1469
  "epoch": 3.62,
1470
  "learning_rate": 1.3836850841830687e-05,
1471
+ "loss": 1.6093,
1472
  "step": 122000
1473
  },
1474
  {
1475
  "epoch": 3.63,
1476
  "learning_rate": 1.3688641214133272e-05,
1477
+ "loss": 1.6098,
1478
  "step": 122500
1479
  },
1480
  {
1481
  "epoch": 3.65,
1482
  "learning_rate": 1.3540431586435856e-05,
1483
+ "loss": 1.6178,
1484
  "step": 123000
1485
  },
1486
  {
1487
  "epoch": 3.66,
1488
  "learning_rate": 1.339222195873844e-05,
1489
+ "loss": 1.6175,
1490
  "step": 123500
1491
  },
1492
  {
1493
  "epoch": 3.68,
1494
  "learning_rate": 1.3244012331041025e-05,
1495
+ "loss": 1.6113,
1496
  "step": 124000
1497
  },
1498
  {
1499
  "epoch": 3.69,
1500
  "learning_rate": 1.309580270334361e-05,
1501
+ "loss": 1.6271,
1502
  "step": 124500
1503
  },
1504
  {
1505
  "epoch": 3.71,
1506
  "learning_rate": 1.2947593075646194e-05,
1507
+ "loss": 1.6193,
1508
  "step": 125000
1509
  },
1510
  {
1511
  "epoch": 3.72,
1512
  "learning_rate": 1.279938344794878e-05,
1513
+ "loss": 1.621,
1514
  "step": 125500
1515
  },
1516
  {
1517
  "epoch": 3.73,
1518
  "learning_rate": 1.2651173820251364e-05,
1519
+ "loss": 1.6174,
1520
  "step": 126000
1521
  },
1522
  {
1523
  "epoch": 3.75,
1524
  "learning_rate": 1.2502964192553949e-05,
1525
+ "loss": 1.6252,
1526
  "step": 126500
1527
  },
1528
  {
1529
  "epoch": 3.76,
1530
  "learning_rate": 1.2354754564856535e-05,
1531
+ "loss": 1.6591,
1532
  "step": 127000
1533
  },
1534
  {
1535
  "epoch": 3.78,
1536
  "learning_rate": 1.220654493715912e-05,
1537
+ "loss": 1.6134,
1538
  "step": 127500
1539
  },
1540
  {
1541
  "epoch": 3.79,
1542
  "learning_rate": 1.2058335309461704e-05,
1543
+ "loss": 1.6309,
1544
  "step": 128000
1545
  },
1546
  {
1547
  "epoch": 3.81,
1548
  "learning_rate": 1.1910125681764288e-05,
1549
+ "loss": 1.6346,
1550
  "step": 128500
1551
  },
1552
  {
1553
  "epoch": 3.82,
1554
  "learning_rate": 1.1761916054066872e-05,
1555
+ "loss": 1.6497,
1556
  "step": 129000
1557
  },
1558
  {
1559
  "epoch": 3.84,
1560
  "learning_rate": 1.1613706426369457e-05,
1561
+ "loss": 1.6228,
1562
  "step": 129500
1563
  },
1564
  {
1565
  "epoch": 3.85,
1566
  "learning_rate": 1.1465496798672043e-05,
1567
+ "loss": 1.6137,
1568
  "step": 130000
1569
  },
1570
  {
1571
  "epoch": 3.87,
1572
  "learning_rate": 1.1317287170974627e-05,
1573
+ "loss": 1.6424,
1574
  "step": 130500
1575
  },
1576
  {
1577
  "epoch": 3.88,
1578
  "learning_rate": 1.1169077543277212e-05,
1579
+ "loss": 1.6277,
1580
  "step": 131000
1581
  },
1582
  {
1583
  "epoch": 3.9,
1584
  "learning_rate": 1.1020867915579796e-05,
1585
+ "loss": 1.6309,
1586
  "step": 131500
1587
  },
1588
  {
1589
  "epoch": 3.91,
1590
  "learning_rate": 1.087265828788238e-05,
1591
+ "loss": 1.6329,
1592
  "step": 132000
1593
  },
1594
  {
1595
  "epoch": 3.93,
1596
  "learning_rate": 1.0724448660184967e-05,
1597
+ "loss": 1.6469,
1598
  "step": 132500
1599
  },
1600
  {
1601
  "epoch": 3.94,
1602
  "learning_rate": 1.0576239032487551e-05,
1603
+ "loss": 1.628,
1604
  "step": 133000
1605
  },
1606
  {
1607
  "epoch": 3.96,
1608
  "learning_rate": 1.0428029404790136e-05,
1609
+ "loss": 1.6392,
1610
  "step": 133500
1611
  },
1612
  {
1613
  "epoch": 3.97,
1614
  "learning_rate": 1.0279819777092722e-05,
1615
+ "loss": 1.6337,
1616
  "step": 134000
1617
  },
1618
  {
1619
  "epoch": 3.99,
1620
  "learning_rate": 1.0131610149395306e-05,
1621
+ "loss": 1.6455,
1622
  "step": 134500
1623
  },
1624
  {
1625
  "epoch": 4.0,
1626
  "learning_rate": 9.98340052169789e-06,
1627
+ "loss": 1.6251,
1628
  "step": 135000
1629
  },
1630
  {
1631
  "epoch": 4.02,
1632
  "learning_rate": 9.835190894000475e-06,
1633
+ "loss": 1.5574,
1634
  "step": 135500
1635
  },
1636
  {
1637
  "epoch": 4.03,
1638
  "learning_rate": 9.68698126630306e-06,
1639
+ "loss": 1.5637,
1640
  "step": 136000
1641
  },
1642
  {
1643
  "epoch": 4.05,
1644
  "learning_rate": 9.538771638605644e-06,
1645
+ "loss": 1.5522,
1646
  "step": 136500
1647
  },
1648
  {
1649
  "epoch": 4.06,
1650
  "learning_rate": 9.390562010908228e-06,
1651
+ "loss": 1.5643,
1652
  "step": 137000
1653
  },
1654
  {
1655
  "epoch": 4.08,
1656
  "learning_rate": 9.242352383210814e-06,
1657
+ "loss": 1.5713,
1658
  "step": 137500
1659
  },
1660
  {
1661
  "epoch": 4.09,
1662
  "learning_rate": 9.094142755513399e-06,
1663
+ "loss": 1.5562,
1664
  "step": 138000
1665
  },
1666
  {
1667
  "epoch": 4.11,
1668
  "learning_rate": 8.945933127815983e-06,
1669
+ "loss": 1.561,
1670
  "step": 138500
1671
  },
1672
  {
1673
  "epoch": 4.12,
1674
  "learning_rate": 8.797723500118567e-06,
1675
+ "loss": 1.57,
1676
  "step": 139000
1677
  },
1678
  {
1679
  "epoch": 4.14,
1680
  "learning_rate": 8.649513872421154e-06,
1681
+ "loss": 1.5747,
1682
  "step": 139500
1683
  },
1684
  {
1685
  "epoch": 4.15,
1686
  "learning_rate": 8.501304244723738e-06,
1687
+ "loss": 1.5745,
1688
  "step": 140000
1689
  },
1690
  {
1691
  "epoch": 4.16,
1692
  "learning_rate": 8.353094617026322e-06,
1693
+ "loss": 1.5642,
1694
  "step": 140500
1695
  },
1696
  {
1697
  "epoch": 4.18,
1698
  "learning_rate": 8.204884989328908e-06,
1699
+ "loss": 1.5595,
1700
  "step": 141000
1701
  },
1702
  {
1703
  "epoch": 4.19,
1704
  "learning_rate": 8.056675361631493e-06,
1705
+ "loss": 1.563,
1706
  "step": 141500
1707
  },
1708
  {
1709
  "epoch": 4.21,
1710
  "learning_rate": 7.908465733934077e-06,
1711
+ "loss": 1.573,
1712
  "step": 142000
1713
  },
1714
  {
1715
  "epoch": 4.22,
1716
  "learning_rate": 7.760256106236662e-06,
1717
+ "loss": 1.5599,
1718
  "step": 142500
1719
  },
1720
  {
1721
  "epoch": 4.24,
1722
  "learning_rate": 7.612046478539246e-06,
1723
+ "loss": 1.5728,
1724
  "step": 143000
1725
  },
1726
  {
1727
  "epoch": 4.25,
1728
  "learning_rate": 7.4638368508418305e-06,
1729
+ "loss": 1.5663,
1730
  "step": 143500
1731
  },
1732
  {
1733
  "epoch": 4.27,
1734
  "learning_rate": 7.315627223144415e-06,
1735
+ "loss": 1.5639,
1736
  "step": 144000
1737
  },
1738
  {
1739
  "epoch": 4.28,
1740
  "learning_rate": 7.167417595447e-06,
1741
+ "loss": 1.5638,
1742
  "step": 144500
1743
  },
1744
  {
1745
  "epoch": 4.3,
1746
  "learning_rate": 7.0192079677495855e-06,
1747
+ "loss": 1.5848,
1748
  "step": 145000
1749
  },
1750
  {
1751
  "epoch": 4.31,
1752
  "learning_rate": 6.870998340052171e-06,
1753
+ "loss": 1.5749,
1754
  "step": 145500
1755
  },
1756
  {
1757
  "epoch": 4.33,
1758
  "learning_rate": 6.722788712354755e-06,
1759
+ "loss": 1.5706,
1760
  "step": 146000
1761
  },
1762
  {
1763
  "epoch": 4.34,
1764
  "learning_rate": 6.5745790846573396e-06,
1765
+ "loss": 1.5832,
1766
  "step": 146500
1767
  },
1768
  {
1769
  "epoch": 4.36,
1770
  "learning_rate": 6.426369456959924e-06,
1771
+ "loss": 1.5897,
1772
  "step": 147000
1773
  },
1774
  {
1775
  "epoch": 4.37,
1776
  "learning_rate": 6.278159829262508e-06,
1777
+ "loss": 1.5883,
1778
  "step": 147500
1779
  },
1780
  {
1781
  "epoch": 4.39,
1782
  "learning_rate": 6.129950201565094e-06,
1783
+ "loss": 1.6014,
1784
  "step": 148000
1785
  },
1786
  {
1787
  "epoch": 4.4,
1788
  "learning_rate": 5.981740573867679e-06,
1789
+ "loss": 1.5756,
1790
  "step": 148500
1791
  },
1792
  {
1793
  "epoch": 4.42,
1794
  "learning_rate": 5.833530946170263e-06,
1795
+ "loss": 1.5797,
1796
  "step": 149000
1797
  },
1798
  {
1799
  "epoch": 4.43,
1800
  "learning_rate": 5.6853213184728486e-06,
1801
+ "loss": 1.577,
1802
  "step": 149500
1803
  },
1804
  {
1805
  "epoch": 4.45,
1806
  "learning_rate": 5.537111690775433e-06,
1807
+ "loss": 1.583,
1808
  "step": 150000
1809
  },
1810
  {
1811
  "epoch": 4.46,
1812
  "learning_rate": 5.388902063078017e-06,
1813
+ "loss": 1.5979,
1814
  "step": 150500
1815
  },
1816
  {
1817
  "epoch": 4.48,
1818
  "learning_rate": 5.240692435380603e-06,
1819
+ "loss": 1.5847,
1820
  "step": 151000
1821
  },
1822
  {
1823
  "epoch": 4.49,
1824
  "learning_rate": 5.092482807683188e-06,
1825
+ "loss": 1.5839,
1826
  "step": 151500
1827
  },
1828
  {
1829
  "epoch": 4.51,
1830
  "learning_rate": 4.944273179985772e-06,
1831
+ "loss": 1.5707,
1832
  "step": 152000
1833
  },
1834
  {
1835
  "epoch": 4.52,
1836
  "learning_rate": 4.796063552288357e-06,
1837
+ "loss": 1.5825,
1838
  "step": 152500
1839
  },
1840
  {
1841
  "epoch": 4.54,
1842
  "learning_rate": 4.647853924590942e-06,
1843
+ "loss": 1.5799,
1844
  "step": 153000
1845
  },
1846
  {
1847
  "epoch": 4.55,
1848
  "learning_rate": 4.499644296893526e-06,
1849
+ "loss": 1.5804,
1850
  "step": 153500
1851
  },
1852
  {
1853
  "epoch": 4.56,
1854
  "learning_rate": 4.351434669196111e-06,
1855
+ "loss": 1.5841,
1856
  "step": 154000
1857
  },
1858
  {
1859
  "epoch": 4.58,
1860
  "learning_rate": 4.203225041498696e-06,
1861
+ "loss": 1.582,
1862
  "step": 154500
1863
  },
1864
  {
1865
  "epoch": 4.59,
1866
  "learning_rate": 4.055015413801281e-06,
1867
+ "loss": 1.5959,
1868
  "step": 155000
1869
  },
1870
  {
1871
  "epoch": 4.61,
1872
  "learning_rate": 3.906805786103866e-06,
1873
+ "loss": 1.5959,
1874
  "step": 155500
1875
  },
1876
  {
1877
  "epoch": 4.62,
1878
  "learning_rate": 3.75859615840645e-06,
1879
+ "loss": 1.6,
1880
  "step": 156000
1881
  },
1882
  {
1883
  "epoch": 4.64,
1884
  "learning_rate": 3.6103865307090354e-06,
1885
+ "loss": 1.5994,
1886
  "step": 156500
1887
  },
1888
  {
1889
  "epoch": 4.65,
1890
  "learning_rate": 3.46217690301162e-06,
1891
+ "loss": 1.5941,
1892
  "step": 157000
1893
  },
1894
  {
1895
  "epoch": 4.67,
1896
  "learning_rate": 3.3139672753142047e-06,
1897
+ "loss": 1.582,
1898
  "step": 157500
1899
  },
1900
  {
1901
  "epoch": 4.68,
1902
  "learning_rate": 3.165757647616789e-06,
1903
+ "loss": 1.5842,
1904
  "step": 158000
1905
  },
1906
  {
1907
  "epoch": 4.7,
1908
  "learning_rate": 3.017548019919374e-06,
1909
+ "loss": 1.5866,
1910
  "step": 158500
1911
  },
1912
  {
1913
  "epoch": 4.71,
1914
  "learning_rate": 2.8693383922219588e-06,
1915
+ "loss": 1.5969,
1916
  "step": 159000
1917
  },
1918
  {
1919
  "epoch": 4.73,
1920
  "learning_rate": 2.7211287645245436e-06,
1921
+ "loss": 1.6188,
1922
  "step": 159500
1923
  },
1924
  {
1925
  "epoch": 4.74,
1926
  "learning_rate": 2.5729191368271284e-06,
1927
+ "loss": 1.5876,
1928
  "step": 160000
1929
  },
1930
  {
1931
  "epoch": 4.76,
1932
  "learning_rate": 2.424709509129713e-06,
1933
+ "loss": 1.6092,
1934
  "step": 160500
1935
  },
1936
  {
1937
  "epoch": 4.77,
1938
  "learning_rate": 2.276499881432298e-06,
1939
+ "loss": 1.5818,
1940
  "step": 161000
1941
  },
1942
  {
1943
  "epoch": 4.79,
1944
  "learning_rate": 2.1282902537348825e-06,
1945
+ "loss": 1.586,
1946
  "step": 161500
1947
  },
1948
  {
1949
  "epoch": 4.8,
1950
  "learning_rate": 1.9800806260374674e-06,
1951
+ "loss": 1.5991,
1952
  "step": 162000
1953
  },
1954
  {
1955
  "epoch": 4.82,
1956
  "learning_rate": 1.8318709983400524e-06,
1957
+ "loss": 1.5824,
1958
  "step": 162500
1959
  },
1960
  {
1961
  "epoch": 4.83,
1962
  "learning_rate": 1.683661370642637e-06,
1963
+ "loss": 1.5907,
1964
  "step": 163000
1965
  },
1966
  {
1967
  "epoch": 4.85,
1968
  "learning_rate": 1.5354517429452217e-06,
1969
+ "loss": 1.607,
1970
  "step": 163500
1971
  },
1972
  {
1973
  "epoch": 4.86,
1974
  "learning_rate": 1.3872421152478065e-06,
1975
+ "loss": 1.591,
1976
  "step": 164000
1977
  },
1978
  {
1979
  "epoch": 4.88,
1980
  "learning_rate": 1.2390324875503913e-06,
1981
+ "loss": 1.6106,
1982
  "step": 164500
1983
  },
1984
  {
1985
  "epoch": 4.89,
1986
  "learning_rate": 1.0908228598529762e-06,
1987
+ "loss": 1.6029,
1988
  "step": 165000
1989
  },
1990
  {
1991
+ "epoch": 4.91,
1992
+ "learning_rate": 9.426132321555609e-07,
1993
+ "loss": 1.6054,
1994
  "step": 165500
1995
  },
1996
  {
1997
+ "epoch": 4.92,
1998
+ "learning_rate": 7.944036044581456e-07,
1999
+ "loss": 1.6038,
2000
  "step": 166000
2001
  },
2002
  {
2003
+ "epoch": 4.94,
2004
+ "learning_rate": 6.461939767607305e-07,
2005
+ "loss": 1.6129,
2006
  "step": 166500
2007
  },
2008
  {
2009
+ "epoch": 4.95,
2010
+ "learning_rate": 4.979843490633152e-07,
2011
+ "loss": 1.603,
2012
  "step": 167000
2013
  },
2014
  {
2015
+ "epoch": 4.97,
2016
+ "learning_rate": 3.4977472136589993e-07,
2017
+ "loss": 1.5922,
2018
  "step": 167500
2019
  },
2020
  {
2021
+ "epoch": 4.98,
2022
+ "learning_rate": 2.0156509366848474e-07,
2023
+ "loss": 1.6177,
2024
  "step": 168000
2025
  },
2026
  {
2027
+ "epoch": 4.99,
2028
+ "learning_rate": 5.3355465971069484e-08,
2029
+ "loss": 1.5988,
2030
  "step": 168500
2031
  },
2032
  {
2033
+ "epoch": 5.0,
2034
+ "step": 168680,
2035
+ "total_flos": 6.266036456049869e+17,
2036
+ "train_loss": 1.6101340950849867,
2037
+ "train_runtime": 65579.3531,
2038
+ "train_samples_per_second": 10.288,
2039
+ "train_steps_per_second": 2.572
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2040
  }
2041
  ],
2042
+ "max_steps": 168680,
2043
+ "num_train_epochs": 5,
2044
+ "total_flos": 6.266036456049869e+17,
2045
  "trial_name": null,
2046
  "trial_params": null
2047
  }