de-Rodrigo commited on
Commit
67893cd
·
1 Parent(s): 1e5ab1d

Upload 8 files

Browse files
Files changed (5) hide show
  1. optimizer.pt +1 -1
  2. rng_state.pth +1 -1
  3. scheduler.pt +1 -1
  4. trainer_state.json +380 -932
  5. training_args.bin +1 -1
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40e486e534b16c41b45d08413ceb5f443c0991730446ff950bd02f1ea93d3a71
3
  size 33661637
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c799f21594ab06d6b91129c311773ce342760d0c02360ffb01ea6208e4faace8
3
  size 33661637
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd771d4fdcc07b3c7d3128e34406ab567381117cf663ec63f46bc1ee9385a49b
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f329b74b9ae80cb5e6b64ce551d508321a3549bc3d469325a37753fa7dd59f60
3
  size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fdf9887d830501ed44fcfc9b1240b3894b0a39f156a92a6534273c610fd7f49b
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57ad5ba60863d9ee4b493aebecbb69b4c26f9bad3b815096052e17626ba0bb35
3
  size 627
trainer_state.json CHANGED
@@ -1,1248 +1,696 @@
1
  {
2
- "best_metric": 0.43608614802360535,
3
- "best_model_checkpoint": "experiments/checkpoint-1000",
4
- "epoch": 16.0,
5
- "global_step": 1000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.08,
12
- "learning_rate": 2.9999999999999997e-06,
13
- "loss": 1.5328,
14
  "step": 5
15
  },
16
  {
17
- "epoch": 0.16,
18
- "learning_rate": 5.999999999999999e-06,
19
- "loss": 1.5296,
20
  "step": 10
21
  },
22
  {
23
- "epoch": 0.24,
24
- "learning_rate": 8.999999999999999e-06,
25
- "loss": 1.5193,
26
  "step": 15
27
  },
28
  {
29
- "epoch": 0.32,
30
- "learning_rate": 1.1999999999999999e-05,
31
- "loss": 1.5027,
32
  "step": 20
33
  },
34
  {
35
- "epoch": 0.4,
36
- "learning_rate": 1.4999999999999999e-05,
37
- "loss": 1.5015,
38
  "step": 25
39
  },
40
  {
41
- "epoch": 0.48,
42
- "learning_rate": 1.7999999999999997e-05,
43
- "loss": 1.4799,
44
  "step": 30
45
  },
46
  {
47
- "epoch": 0.56,
48
- "learning_rate": 2.1e-05,
49
- "loss": 1.4705,
50
  "step": 35
51
  },
52
  {
53
- "epoch": 0.64,
54
- "learning_rate": 2.3999999999999997e-05,
55
- "loss": 1.4524,
56
  "step": 40
57
  },
58
  {
59
- "epoch": 0.72,
60
- "learning_rate": 2.6999999999999996e-05,
61
- "loss": 1.4231,
62
  "step": 45
63
  },
64
  {
65
- "epoch": 0.8,
66
- "learning_rate": 2.9999999999999997e-05,
67
- "loss": 1.3934,
 
 
 
 
 
 
 
 
68
  "step": 50
69
  },
70
  {
71
- "epoch": 0.88,
72
- "learning_rate": 3.2999999999999996e-05,
73
- "loss": 1.3408,
74
  "step": 55
75
  },
76
  {
77
- "epoch": 0.96,
78
- "learning_rate": 3.5999999999999994e-05,
79
- "loss": 1.2911,
80
  "step": 60
81
  },
82
  {
83
- "epoch": 1.04,
84
- "learning_rate": 3.9e-05,
85
- "loss": 1.2097,
86
  "step": 65
87
  },
88
  {
89
- "epoch": 1.12,
90
- "learning_rate": 4.2e-05,
91
- "loss": 1.121,
92
  "step": 70
93
  },
94
  {
95
- "epoch": 1.2,
96
- "learning_rate": 4.4999999999999996e-05,
97
- "loss": 1.0317,
98
  "step": 75
99
  },
100
  {
101
- "epoch": 1.28,
102
- "learning_rate": 4.7999999999999994e-05,
103
- "loss": 0.9421,
104
  "step": 80
105
  },
106
  {
107
- "epoch": 1.36,
108
- "learning_rate": 5.1e-05,
109
- "loss": 0.883,
110
  "step": 85
111
  },
112
  {
113
- "epoch": 1.44,
114
- "learning_rate": 5.399999999999999e-05,
115
- "loss": 0.8388,
116
  "step": 90
117
  },
118
  {
119
- "epoch": 1.52,
120
- "learning_rate": 5.6999999999999996e-05,
121
- "loss": 0.8171,
122
  "step": 95
123
  },
124
  {
125
- "epoch": 1.6,
126
- "learning_rate": 5.9999999999999995e-05,
127
- "loss": 0.7882,
128
  "step": 100
129
  },
130
  {
131
- "epoch": 1.68,
132
- "learning_rate": 6.299999999999999e-05,
133
- "loss": 0.7571,
 
 
 
 
 
 
 
 
134
  "step": 105
135
  },
136
  {
137
- "epoch": 1.76,
138
- "learning_rate": 6.599999999999999e-05,
139
- "loss": 0.7267,
140
  "step": 110
141
  },
142
  {
143
- "epoch": 1.84,
144
- "learning_rate": 6.9e-05,
145
- "loss": 0.7099,
146
  "step": 115
147
  },
148
  {
149
- "epoch": 1.92,
150
- "learning_rate": 7.199999999999999e-05,
151
- "loss": 0.69,
152
  "step": 120
153
  },
154
  {
155
- "epoch": 2.0,
156
- "learning_rate": 7.5e-05,
157
- "loss": 0.6809,
158
  "step": 125
159
  },
160
  {
161
- "epoch": 2.08,
162
- "learning_rate": 7.8e-05,
163
- "loss": 0.6636,
164
  "step": 130
165
  },
166
  {
167
- "epoch": 2.16,
168
- "learning_rate": 8.1e-05,
169
- "loss": 0.6522,
170
  "step": 135
171
  },
172
  {
173
- "epoch": 2.24,
174
- "learning_rate": 8.4e-05,
175
- "loss": 0.6398,
176
  "step": 140
177
  },
178
  {
179
- "epoch": 2.32,
180
- "learning_rate": 8.699999999999999e-05,
181
- "loss": 0.6322,
182
  "step": 145
183
  },
184
  {
185
- "epoch": 2.4,
186
- "learning_rate": 8.999999999999999e-05,
187
- "loss": 0.6268,
188
  "step": 150
189
  },
190
  {
191
- "epoch": 2.48,
192
- "learning_rate": 9.3e-05,
193
- "loss": 0.6159,
 
 
 
 
 
 
 
 
194
  "step": 155
195
  },
196
  {
197
- "epoch": 2.56,
198
- "learning_rate": 9.599999999999999e-05,
199
- "loss": 0.6111,
200
  "step": 160
201
  },
202
  {
203
- "epoch": 2.64,
204
- "learning_rate": 9.9e-05,
205
- "loss": 0.5995,
206
  "step": 165
207
  },
208
  {
209
- "epoch": 2.72,
210
- "learning_rate": 0.000102,
211
- "loss": 0.591,
212
  "step": 170
213
  },
214
  {
215
- "epoch": 2.8,
216
- "learning_rate": 0.00010499999999999999,
217
- "loss": 0.5885,
218
  "step": 175
219
  },
220
  {
221
- "epoch": 2.88,
222
- "learning_rate": 0.00010799999999999998,
223
- "loss": 0.5779,
224
  "step": 180
225
  },
226
  {
227
- "epoch": 2.96,
228
- "learning_rate": 0.00011099999999999999,
229
- "loss": 0.5663,
230
  "step": 185
231
  },
232
  {
233
- "epoch": 3.04,
234
- "learning_rate": 0.00011399999999999999,
235
- "loss": 0.5686,
236
  "step": 190
237
  },
238
  {
239
- "epoch": 3.12,
240
- "learning_rate": 0.000117,
241
- "loss": 0.558,
242
  "step": 195
243
  },
244
  {
245
- "epoch": 3.2,
246
- "learning_rate": 0.00011999999999999999,
247
- "loss": 0.5568,
 
 
 
 
 
 
 
 
248
  "step": 200
249
  },
250
  {
251
- "epoch": 3.28,
252
- "learning_rate": 0.00012299999999999998,
253
- "loss": 0.5528,
254
  "step": 205
255
  },
256
  {
257
- "epoch": 3.36,
258
- "learning_rate": 0.00012599999999999997,
259
- "loss": 0.5469,
260
  "step": 210
261
  },
262
  {
263
- "epoch": 3.44,
264
- "learning_rate": 0.000129,
265
- "loss": 0.545,
266
  "step": 215
267
  },
268
  {
269
- "epoch": 3.52,
270
- "learning_rate": 0.00013199999999999998,
271
- "loss": 0.5373,
272
  "step": 220
273
  },
274
  {
275
- "epoch": 3.6,
276
- "learning_rate": 0.000135,
277
- "loss": 0.5376,
278
  "step": 225
279
  },
280
  {
281
- "epoch": 3.68,
282
- "learning_rate": 0.000138,
283
- "loss": 0.5373,
284
  "step": 230
285
  },
286
  {
287
- "epoch": 3.76,
288
- "learning_rate": 0.00014099999999999998,
289
- "loss": 0.5241,
290
  "step": 235
291
  },
292
  {
293
- "epoch": 3.84,
294
- "learning_rate": 0.00014399999999999998,
295
- "loss": 0.5221,
296
  "step": 240
297
  },
298
  {
299
- "epoch": 3.92,
300
- "learning_rate": 0.000147,
301
- "loss": 0.5229,
302
  "step": 245
303
  },
304
  {
305
- "epoch": 4.0,
306
- "learning_rate": 0.00015,
307
- "loss": 0.5199,
308
  "step": 250
309
  },
310
  {
311
- "epoch": 4.0,
312
- "eval_loss": 0.5244991183280945,
313
- "eval_runtime": 265.103,
314
- "eval_samples_per_second": 7.548,
315
- "eval_steps_per_second": 0.947,
316
  "step": 250
317
  },
318
  {
319
- "epoch": 4.08,
320
- "learning_rate": 0.00015299999999999998,
321
- "loss": 0.5141,
322
  "step": 255
323
  },
324
  {
325
- "epoch": 4.16,
326
- "learning_rate": 0.000156,
327
- "loss": 0.5086,
328
  "step": 260
329
  },
330
  {
331
- "epoch": 4.24,
332
- "learning_rate": 0.000159,
333
- "loss": 0.5129,
334
  "step": 265
335
  },
336
  {
337
- "epoch": 4.32,
338
- "learning_rate": 0.000162,
339
- "loss": 0.5156,
340
  "step": 270
341
  },
342
  {
343
- "epoch": 4.4,
344
- "learning_rate": 0.000165,
345
- "loss": 0.5053,
346
  "step": 275
347
  },
348
  {
349
- "epoch": 4.48,
350
- "learning_rate": 0.000168,
351
- "loss": 0.5024,
352
  "step": 280
353
  },
354
  {
355
- "epoch": 4.56,
356
- "learning_rate": 0.00017099999999999998,
357
- "loss": 0.5048,
358
  "step": 285
359
  },
360
  {
361
- "epoch": 4.64,
362
- "learning_rate": 0.00017399999999999997,
363
- "loss": 0.5001,
364
  "step": 290
365
  },
366
  {
367
- "epoch": 4.72,
368
- "learning_rate": 0.00017699999999999997,
369
- "loss": 0.497,
370
  "step": 295
371
  },
372
  {
373
- "epoch": 4.8,
374
- "learning_rate": 0.00017999999999999998,
375
- "loss": 0.4929,
376
  "step": 300
377
  },
378
  {
379
- "epoch": 4.88,
380
- "learning_rate": 0.00018299999999999998,
381
- "loss": 0.4962,
 
 
 
 
 
 
 
 
382
  "step": 305
383
  },
384
  {
385
- "epoch": 4.96,
386
- "learning_rate": 0.000186,
387
- "loss": 0.4941,
388
  "step": 310
389
  },
390
  {
391
- "epoch": 5.04,
392
- "learning_rate": 0.00018899999999999999,
393
- "loss": 0.4927,
394
  "step": 315
395
  },
396
  {
397
- "epoch": 5.12,
398
- "learning_rate": 0.00019199999999999998,
399
- "loss": 0.4879,
400
  "step": 320
401
  },
402
  {
403
- "epoch": 5.2,
404
- "learning_rate": 0.000195,
405
- "loss": 0.4825,
406
  "step": 325
407
  },
408
  {
409
- "epoch": 5.28,
410
- "learning_rate": 0.000198,
411
- "loss": 0.4841,
412
  "step": 330
413
  },
414
  {
415
- "epoch": 5.36,
416
- "learning_rate": 0.000201,
417
- "loss": 0.484,
418
  "step": 335
419
  },
420
  {
421
- "epoch": 5.44,
422
- "learning_rate": 0.000204,
423
- "loss": 0.4777,
424
  "step": 340
425
  },
426
  {
427
- "epoch": 5.52,
428
- "learning_rate": 0.00020699999999999996,
429
- "loss": 0.4777,
430
  "step": 345
431
  },
432
  {
433
- "epoch": 5.6,
434
- "learning_rate": 0.00020999999999999998,
435
- "loss": 0.4834,
 
 
 
 
 
 
 
 
436
  "step": 350
437
  },
438
  {
439
- "epoch": 5.68,
440
- "learning_rate": 0.00021299999999999997,
441
- "loss": 0.4724,
442
  "step": 355
443
  },
444
  {
445
- "epoch": 5.76,
446
- "learning_rate": 0.00021599999999999996,
447
- "loss": 0.4777,
448
  "step": 360
449
  },
450
  {
451
- "epoch": 5.84,
452
- "learning_rate": 0.00021899999999999998,
453
- "loss": 0.4799,
454
  "step": 365
455
  },
456
  {
457
- "epoch": 5.92,
458
- "learning_rate": 0.00022199999999999998,
459
- "loss": 0.4771,
460
  "step": 370
461
  },
462
  {
463
- "epoch": 6.0,
464
- "learning_rate": 0.000225,
465
- "loss": 0.4771,
466
  "step": 375
467
  },
468
  {
469
- "epoch": 6.08,
470
- "learning_rate": 0.00022799999999999999,
471
- "loss": 0.4671,
472
  "step": 380
473
  },
474
  {
475
- "epoch": 6.16,
476
- "learning_rate": 0.00023099999999999998,
477
- "loss": 0.465,
478
  "step": 385
479
  },
480
  {
481
- "epoch": 6.24,
482
- "learning_rate": 0.000234,
483
- "loss": 0.4634,
484
  "step": 390
485
  },
486
  {
487
- "epoch": 6.32,
488
- "learning_rate": 0.000237,
489
- "loss": 0.4656,
490
  "step": 395
491
  },
492
  {
493
- "epoch": 6.4,
494
- "learning_rate": 0.00023999999999999998,
495
- "loss": 0.4726,
496
  "step": 400
497
  },
498
  {
499
- "epoch": 6.48,
500
- "learning_rate": 0.000243,
501
- "loss": 0.4679,
 
 
 
 
 
 
 
 
502
  "step": 405
503
  },
504
  {
505
- "epoch": 6.56,
506
- "learning_rate": 0.00024599999999999996,
507
- "loss": 0.467,
508
  "step": 410
509
  },
510
  {
511
- "epoch": 6.64,
512
- "learning_rate": 0.000249,
513
- "loss": 0.4642,
514
  "step": 415
515
  },
516
  {
517
- "epoch": 6.72,
518
- "learning_rate": 0.00025199999999999995,
519
- "loss": 0.462,
520
  "step": 420
521
  },
522
  {
523
- "epoch": 6.8,
524
- "learning_rate": 0.00025499999999999996,
525
- "loss": 0.4599,
526
  "step": 425
527
  },
528
  {
529
- "epoch": 6.88,
530
- "learning_rate": 0.000258,
531
- "loss": 0.4569,
532
  "step": 430
533
  },
534
  {
535
- "epoch": 6.96,
536
- "learning_rate": 0.000261,
537
- "loss": 0.4644,
538
  "step": 435
539
  },
540
  {
541
- "epoch": 7.04,
542
- "learning_rate": 0.00026399999999999997,
543
- "loss": 0.4594,
544
  "step": 440
545
  },
546
  {
547
- "epoch": 7.12,
548
- "learning_rate": 0.000267,
549
- "loss": 0.4524,
550
  "step": 445
551
  },
552
  {
553
- "epoch": 7.2,
554
- "learning_rate": 0.00027,
555
- "loss": 0.4547,
 
 
 
 
 
 
 
 
556
  "step": 450
557
  },
558
  {
559
- "epoch": 7.28,
560
- "learning_rate": 0.00027299999999999997,
561
- "loss": 0.4565,
562
  "step": 455
563
  },
564
  {
565
- "epoch": 7.36,
566
- "learning_rate": 0.000276,
567
- "loss": 0.4521,
568
  "step": 460
569
  },
570
  {
571
- "epoch": 7.44,
572
- "learning_rate": 0.000279,
573
- "loss": 0.4557,
574
  "step": 465
575
  },
576
  {
577
- "epoch": 7.52,
578
- "learning_rate": 0.00028199999999999997,
579
- "loss": 0.4541,
580
  "step": 470
581
  },
582
  {
583
- "epoch": 7.6,
584
- "learning_rate": 0.000285,
585
- "loss": 0.4537,
586
  "step": 475
587
  },
588
  {
589
- "epoch": 7.68,
590
- "learning_rate": 0.00028799999999999995,
591
- "loss": 0.4516,
592
  "step": 480
593
  },
594
  {
595
- "epoch": 7.76,
596
- "learning_rate": 0.00029099999999999997,
597
- "loss": 0.4423,
598
  "step": 485
599
  },
600
  {
601
- "epoch": 7.84,
602
- "learning_rate": 0.000294,
603
- "loss": 0.4479,
604
  "step": 490
605
  },
606
  {
607
- "epoch": 7.92,
608
- "learning_rate": 0.00029699999999999996,
609
- "loss": 0.4484,
610
  "step": 495
611
  },
612
  {
613
- "epoch": 8.0,
614
- "learning_rate": 0.0003,
615
- "loss": 0.4589,
616
  "step": 500
617
  },
618
  {
619
- "epoch": 8.0,
620
- "eval_loss": 0.4642273485660553,
621
- "eval_runtime": 264.6645,
622
- "eval_samples_per_second": 7.561,
623
- "eval_steps_per_second": 0.948,
624
  "step": 500
625
- },
626
- {
627
- "epoch": 8.08,
628
- "learning_rate": 0.00029984210526315787,
629
- "loss": 0.441,
630
- "step": 505
631
- },
632
- {
633
- "epoch": 8.16,
634
- "learning_rate": 0.00029968421052631577,
635
- "loss": 0.4404,
636
- "step": 510
637
- },
638
- {
639
- "epoch": 8.24,
640
- "learning_rate": 0.00029952631578947366,
641
- "loss": 0.4452,
642
- "step": 515
643
- },
644
- {
645
- "epoch": 8.32,
646
- "learning_rate": 0.00029936842105263156,
647
- "loss": 0.4385,
648
- "step": 520
649
- },
650
- {
651
- "epoch": 8.4,
652
- "learning_rate": 0.00029921052631578946,
653
- "loss": 0.4457,
654
- "step": 525
655
- },
656
- {
657
- "epoch": 8.48,
658
- "learning_rate": 0.00029905263157894735,
659
- "loss": 0.4407,
660
- "step": 530
661
- },
662
- {
663
- "epoch": 8.56,
664
- "learning_rate": 0.00029889473684210525,
665
- "loss": 0.4432,
666
- "step": 535
667
- },
668
- {
669
- "epoch": 8.64,
670
- "learning_rate": 0.00029873684210526315,
671
- "loss": 0.4408,
672
- "step": 540
673
- },
674
- {
675
- "epoch": 8.72,
676
- "learning_rate": 0.00029857894736842104,
677
- "loss": 0.4431,
678
- "step": 545
679
- },
680
- {
681
- "epoch": 8.8,
682
- "learning_rate": 0.00029842105263157894,
683
- "loss": 0.4403,
684
- "step": 550
685
- },
686
- {
687
- "epoch": 8.88,
688
- "learning_rate": 0.0002982631578947368,
689
- "loss": 0.4357,
690
- "step": 555
691
- },
692
- {
693
- "epoch": 8.96,
694
- "learning_rate": 0.00029810526315789473,
695
- "loss": 0.4411,
696
- "step": 560
697
- },
698
- {
699
- "epoch": 9.04,
700
- "learning_rate": 0.00029794736842105263,
701
- "loss": 0.4347,
702
- "step": 565
703
- },
704
- {
705
- "epoch": 9.12,
706
- "learning_rate": 0.0002977894736842105,
707
- "loss": 0.4317,
708
- "step": 570
709
- },
710
- {
711
- "epoch": 9.2,
712
- "learning_rate": 0.00029763157894736837,
713
- "loss": 0.4332,
714
- "step": 575
715
- },
716
- {
717
- "epoch": 9.28,
718
- "learning_rate": 0.00029747368421052627,
719
- "loss": 0.4261,
720
- "step": 580
721
- },
722
- {
723
- "epoch": 9.36,
724
- "learning_rate": 0.0002973157894736842,
725
- "loss": 0.4348,
726
- "step": 585
727
- },
728
- {
729
- "epoch": 9.44,
730
- "learning_rate": 0.00029715789473684206,
731
- "loss": 0.434,
732
- "step": 590
733
- },
734
- {
735
- "epoch": 9.52,
736
- "learning_rate": 0.00029699999999999996,
737
- "loss": 0.4334,
738
- "step": 595
739
- },
740
- {
741
- "epoch": 9.6,
742
- "learning_rate": 0.00029684210526315785,
743
- "loss": 0.4348,
744
- "step": 600
745
- },
746
- {
747
- "epoch": 9.68,
748
- "learning_rate": 0.0002966842105263158,
749
- "loss": 0.4292,
750
- "step": 605
751
- },
752
- {
753
- "epoch": 9.76,
754
- "learning_rate": 0.00029652631578947364,
755
- "loss": 0.4332,
756
- "step": 610
757
- },
758
- {
759
- "epoch": 9.84,
760
- "learning_rate": 0.00029636842105263154,
761
- "loss": 0.4302,
762
- "step": 615
763
- },
764
- {
765
- "epoch": 9.92,
766
- "learning_rate": 0.00029621052631578944,
767
- "loss": 0.4303,
768
- "step": 620
769
- },
770
- {
771
- "epoch": 10.0,
772
- "learning_rate": 0.00029605263157894733,
773
- "loss": 0.427,
774
- "step": 625
775
- },
776
- {
777
- "epoch": 10.08,
778
- "learning_rate": 0.00029589473684210523,
779
- "loss": 0.4234,
780
- "step": 630
781
- },
782
- {
783
- "epoch": 10.16,
784
- "learning_rate": 0.00029573684210526313,
785
- "loss": 0.4251,
786
- "step": 635
787
- },
788
- {
789
- "epoch": 10.24,
790
- "learning_rate": 0.000295578947368421,
791
- "loss": 0.4208,
792
- "step": 640
793
- },
794
- {
795
- "epoch": 10.32,
796
- "learning_rate": 0.0002954210526315789,
797
- "loss": 0.4252,
798
- "step": 645
799
- },
800
- {
801
- "epoch": 10.4,
802
- "learning_rate": 0.0002952631578947368,
803
- "loss": 0.4263,
804
- "step": 650
805
- },
806
- {
807
- "epoch": 10.48,
808
- "learning_rate": 0.0002951052631578947,
809
- "loss": 0.4221,
810
- "step": 655
811
- },
812
- {
813
- "epoch": 10.56,
814
- "learning_rate": 0.0002949473684210526,
815
- "loss": 0.4169,
816
- "step": 660
817
- },
818
- {
819
- "epoch": 10.64,
820
- "learning_rate": 0.0002947894736842105,
821
- "loss": 0.4282,
822
- "step": 665
823
- },
824
- {
825
- "epoch": 10.72,
826
- "learning_rate": 0.0002946315789473684,
827
- "loss": 0.4207,
828
- "step": 670
829
- },
830
- {
831
- "epoch": 10.8,
832
- "learning_rate": 0.0002944736842105263,
833
- "loss": 0.4257,
834
- "step": 675
835
- },
836
- {
837
- "epoch": 10.88,
838
- "learning_rate": 0.0002943157894736842,
839
- "loss": 0.4207,
840
- "step": 680
841
- },
842
- {
843
- "epoch": 10.96,
844
- "learning_rate": 0.0002941578947368421,
845
- "loss": 0.4171,
846
- "step": 685
847
- },
848
- {
849
- "epoch": 11.04,
850
- "learning_rate": 0.000294,
851
- "loss": 0.4215,
852
- "step": 690
853
- },
854
- {
855
- "epoch": 11.12,
856
- "learning_rate": 0.00029384210526315783,
857
- "loss": 0.4112,
858
- "step": 695
859
- },
860
- {
861
- "epoch": 11.2,
862
- "learning_rate": 0.0002936842105263158,
863
- "loss": 0.4127,
864
- "step": 700
865
- },
866
- {
867
- "epoch": 11.28,
868
- "learning_rate": 0.0002935263157894737,
869
- "loss": 0.4125,
870
- "step": 705
871
- },
872
- {
873
- "epoch": 11.36,
874
- "learning_rate": 0.0002933684210526316,
875
- "loss": 0.4149,
876
- "step": 710
877
- },
878
- {
879
- "epoch": 11.44,
880
- "learning_rate": 0.0002932105263157894,
881
- "loss": 0.4144,
882
- "step": 715
883
- },
884
- {
885
- "epoch": 11.52,
886
- "learning_rate": 0.0002930526315789473,
887
- "loss": 0.4157,
888
- "step": 720
889
- },
890
- {
891
- "epoch": 11.6,
892
- "learning_rate": 0.00029289473684210527,
893
- "loss": 0.4087,
894
- "step": 725
895
- },
896
- {
897
- "epoch": 11.68,
898
- "learning_rate": 0.00029273684210526316,
899
- "loss": 0.4162,
900
- "step": 730
901
- },
902
- {
903
- "epoch": 11.76,
904
- "learning_rate": 0.000292578947368421,
905
- "loss": 0.4104,
906
- "step": 735
907
- },
908
- {
909
- "epoch": 11.84,
910
- "learning_rate": 0.0002924210526315789,
911
- "loss": 0.414,
912
- "step": 740
913
- },
914
- {
915
- "epoch": 11.92,
916
- "learning_rate": 0.00029226315789473685,
917
- "loss": 0.4132,
918
- "step": 745
919
- },
920
- {
921
- "epoch": 12.0,
922
- "learning_rate": 0.0002921052631578947,
923
- "loss": 0.4223,
924
- "step": 750
925
- },
926
- {
927
- "epoch": 12.0,
928
- "eval_loss": 0.44360578060150146,
929
- "eval_runtime": 265.1184,
930
- "eval_samples_per_second": 7.548,
931
- "eval_steps_per_second": 0.947,
932
- "step": 750
933
- },
934
- {
935
- "epoch": 12.08,
936
- "learning_rate": 0.0002919473684210526,
937
- "loss": 0.4084,
938
- "step": 755
939
- },
940
- {
941
- "epoch": 12.16,
942
- "learning_rate": 0.0002917894736842105,
943
- "loss": 0.4045,
944
- "step": 760
945
- },
946
- {
947
- "epoch": 12.24,
948
- "learning_rate": 0.0002916315789473684,
949
- "loss": 0.4084,
950
- "step": 765
951
- },
952
- {
953
- "epoch": 12.32,
954
- "learning_rate": 0.0002914736842105263,
955
- "loss": 0.4053,
956
- "step": 770
957
- },
958
- {
959
- "epoch": 12.4,
960
- "learning_rate": 0.0002913157894736842,
961
- "loss": 0.4079,
962
- "step": 775
963
- },
964
- {
965
- "epoch": 12.48,
966
- "learning_rate": 0.0002911578947368421,
967
- "loss": 0.4045,
968
- "step": 780
969
- },
970
- {
971
- "epoch": 12.56,
972
- "learning_rate": 0.00029099999999999997,
973
- "loss": 0.4009,
974
- "step": 785
975
- },
976
- {
977
- "epoch": 12.64,
978
- "learning_rate": 0.00029084210526315787,
979
- "loss": 0.4064,
980
- "step": 790
981
- },
982
- {
983
- "epoch": 12.72,
984
- "learning_rate": 0.00029068421052631577,
985
- "loss": 0.4104,
986
- "step": 795
987
- },
988
- {
989
- "epoch": 12.8,
990
- "learning_rate": 0.00029052631578947366,
991
- "loss": 0.4121,
992
- "step": 800
993
- },
994
- {
995
- "epoch": 12.88,
996
- "learning_rate": 0.00029036842105263156,
997
- "loss": 0.4064,
998
- "step": 805
999
- },
1000
- {
1001
- "epoch": 12.96,
1002
- "learning_rate": 0.00029021052631578945,
1003
- "loss": 0.4126,
1004
- "step": 810
1005
- },
1006
- {
1007
- "epoch": 13.04,
1008
- "learning_rate": 0.00029005263157894735,
1009
- "loss": 0.4028,
1010
- "step": 815
1011
- },
1012
- {
1013
- "epoch": 13.12,
1014
- "learning_rate": 0.00028989473684210525,
1015
- "loss": 0.3973,
1016
- "step": 820
1017
- },
1018
- {
1019
- "epoch": 13.2,
1020
- "learning_rate": 0.00028973684210526314,
1021
- "loss": 0.3977,
1022
- "step": 825
1023
- },
1024
- {
1025
- "epoch": 13.28,
1026
- "learning_rate": 0.00028957894736842104,
1027
- "loss": 0.3989,
1028
- "step": 830
1029
- },
1030
- {
1031
- "epoch": 13.36,
1032
- "learning_rate": 0.0002894210526315789,
1033
- "loss": 0.4025,
1034
- "step": 835
1035
- },
1036
- {
1037
- "epoch": 13.44,
1038
- "learning_rate": 0.00028926315789473683,
1039
- "loss": 0.4013,
1040
- "step": 840
1041
- },
1042
- {
1043
- "epoch": 13.52,
1044
- "learning_rate": 0.00028910526315789473,
1045
- "loss": 0.4044,
1046
- "step": 845
1047
- },
1048
- {
1049
- "epoch": 13.6,
1050
- "learning_rate": 0.00028894736842105263,
1051
- "loss": 0.4037,
1052
- "step": 850
1053
- },
1054
- {
1055
- "epoch": 13.68,
1056
- "learning_rate": 0.00028878947368421047,
1057
- "loss": 0.4,
1058
- "step": 855
1059
- },
1060
- {
1061
- "epoch": 13.76,
1062
- "learning_rate": 0.00028863157894736837,
1063
- "loss": 0.4023,
1064
- "step": 860
1065
- },
1066
- {
1067
- "epoch": 13.84,
1068
- "learning_rate": 0.0002884736842105263,
1069
- "loss": 0.402,
1070
- "step": 865
1071
- },
1072
- {
1073
- "epoch": 13.92,
1074
- "learning_rate": 0.0002883157894736842,
1075
- "loss": 0.4033,
1076
- "step": 870
1077
- },
1078
- {
1079
- "epoch": 14.0,
1080
- "learning_rate": 0.00028815789473684206,
1081
- "loss": 0.4009,
1082
- "step": 875
1083
- },
1084
- {
1085
- "epoch": 14.08,
1086
- "learning_rate": 0.00028799999999999995,
1087
- "loss": 0.3952,
1088
- "step": 880
1089
- },
1090
- {
1091
- "epoch": 14.16,
1092
- "learning_rate": 0.0002878421052631579,
1093
- "loss": 0.3911,
1094
- "step": 885
1095
- },
1096
- {
1097
- "epoch": 14.24,
1098
- "learning_rate": 0.00028768421052631575,
1099
- "loss": 0.392,
1100
- "step": 890
1101
- },
1102
- {
1103
- "epoch": 14.32,
1104
- "learning_rate": 0.00028752631578947364,
1105
- "loss": 0.3912,
1106
- "step": 895
1107
- },
1108
- {
1109
- "epoch": 14.4,
1110
- "learning_rate": 0.00028736842105263154,
1111
- "loss": 0.3918,
1112
- "step": 900
1113
- },
1114
- {
1115
- "epoch": 14.48,
1116
- "learning_rate": 0.00028721052631578944,
1117
- "loss": 0.3955,
1118
- "step": 905
1119
- },
1120
- {
1121
- "epoch": 14.56,
1122
- "learning_rate": 0.00028705263157894733,
1123
- "loss": 0.3979,
1124
- "step": 910
1125
- },
1126
- {
1127
- "epoch": 14.64,
1128
- "learning_rate": 0.00028689473684210523,
1129
- "loss": 0.396,
1130
- "step": 915
1131
- },
1132
- {
1133
- "epoch": 14.72,
1134
- "learning_rate": 0.0002867368421052631,
1135
- "loss": 0.3957,
1136
- "step": 920
1137
- },
1138
- {
1139
- "epoch": 14.8,
1140
- "learning_rate": 0.000286578947368421,
1141
- "loss": 0.3975,
1142
- "step": 925
1143
- },
1144
- {
1145
- "epoch": 14.88,
1146
- "learning_rate": 0.0002864210526315789,
1147
- "loss": 0.398,
1148
- "step": 930
1149
- },
1150
- {
1151
- "epoch": 14.96,
1152
- "learning_rate": 0.0002862631578947368,
1153
- "loss": 0.3965,
1154
- "step": 935
1155
- },
1156
- {
1157
- "epoch": 15.04,
1158
- "learning_rate": 0.0002861052631578947,
1159
- "loss": 0.3906,
1160
- "step": 940
1161
- },
1162
- {
1163
- "epoch": 15.12,
1164
- "learning_rate": 0.0002859473684210526,
1165
- "loss": 0.3852,
1166
- "step": 945
1167
- },
1168
- {
1169
- "epoch": 15.2,
1170
- "learning_rate": 0.0002857894736842105,
1171
- "loss": 0.3933,
1172
- "step": 950
1173
- },
1174
- {
1175
- "epoch": 15.28,
1176
- "learning_rate": 0.0002856315789473684,
1177
- "loss": 0.3857,
1178
- "step": 955
1179
- },
1180
- {
1181
- "epoch": 15.36,
1182
- "learning_rate": 0.0002854736842105263,
1183
- "loss": 0.3897,
1184
- "step": 960
1185
- },
1186
- {
1187
- "epoch": 15.44,
1188
- "learning_rate": 0.0002853157894736842,
1189
- "loss": 0.3885,
1190
- "step": 965
1191
- },
1192
- {
1193
- "epoch": 15.52,
1194
- "learning_rate": 0.0002851578947368421,
1195
- "loss": 0.3882,
1196
- "step": 970
1197
- },
1198
- {
1199
- "epoch": 15.6,
1200
- "learning_rate": 0.000285,
1201
- "loss": 0.3961,
1202
- "step": 975
1203
- },
1204
- {
1205
- "epoch": 15.68,
1206
- "learning_rate": 0.0002848421052631579,
1207
- "loss": 0.3886,
1208
- "step": 980
1209
- },
1210
- {
1211
- "epoch": 15.76,
1212
- "learning_rate": 0.0002846842105263158,
1213
- "loss": 0.391,
1214
- "step": 985
1215
- },
1216
- {
1217
- "epoch": 15.84,
1218
- "learning_rate": 0.0002845263157894737,
1219
- "loss": 0.391,
1220
- "step": 990
1221
- },
1222
- {
1223
- "epoch": 15.92,
1224
- "learning_rate": 0.0002843684210526315,
1225
- "loss": 0.3901,
1226
- "step": 995
1227
- },
1228
- {
1229
- "epoch": 16.0,
1230
- "learning_rate": 0.0002842105263157894,
1231
- "loss": 0.3869,
1232
- "step": 1000
1233
- },
1234
- {
1235
- "epoch": 16.0,
1236
- "eval_loss": 0.43608614802360535,
1237
- "eval_runtime": 265.3444,
1238
- "eval_samples_per_second": 7.541,
1239
- "eval_steps_per_second": 0.946,
1240
- "step": 1000
1241
  }
1242
  ],
1243
  "max_steps": 10000,
1244
- "num_train_epochs": 162,
1245
- "total_flos": 2.599744722936791e+18,
1246
  "trial_name": null,
1247
  "trial_params": null
1248
  }
 
1
  {
2
+ "best_metric": 1.9192386865615845,
3
+ "best_model_checkpoint": "experiments/checkpoint-500",
4
+ "epoch": 100.0,
5
+ "global_step": 500,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 1.0,
12
+ "learning_rate": 1.8e-06,
13
+ "loss": 1.7688,
14
  "step": 5
15
  },
16
  {
17
+ "epoch": 2.0,
18
+ "learning_rate": 4.8e-06,
19
+ "loss": 1.7684,
20
  "step": 10
21
  },
22
  {
23
+ "epoch": 3.0,
24
+ "learning_rate": 7.799999999999998e-06,
25
+ "loss": 1.7564,
26
  "step": 15
27
  },
28
  {
29
+ "epoch": 4.0,
30
+ "learning_rate": 1.0799999999999998e-05,
31
+ "loss": 1.7424,
32
  "step": 20
33
  },
34
  {
35
+ "epoch": 5.0,
36
+ "learning_rate": 1.3799999999999998e-05,
37
+ "loss": 1.727,
38
  "step": 25
39
  },
40
  {
41
+ "epoch": 6.0,
42
+ "learning_rate": 1.68e-05,
43
+ "loss": 1.7135,
44
  "step": 30
45
  },
46
  {
47
+ "epoch": 7.0,
48
+ "learning_rate": 1.98e-05,
49
+ "loss": 1.701,
50
  "step": 35
51
  },
52
  {
53
+ "epoch": 8.0,
54
+ "learning_rate": 2.28e-05,
55
+ "loss": 1.6797,
56
  "step": 40
57
  },
58
  {
59
+ "epoch": 9.0,
60
+ "learning_rate": 2.5799999999999997e-05,
61
+ "loss": 1.6547,
62
  "step": 45
63
  },
64
  {
65
+ "epoch": 10.0,
66
+ "learning_rate": 2.88e-05,
67
+ "loss": 1.6245,
68
+ "step": 50
69
+ },
70
+ {
71
+ "epoch": 10.0,
72
+ "eval_loss": 1.5684312582015991,
73
+ "eval_runtime": 1.4166,
74
+ "eval_samples_per_second": 7.059,
75
+ "eval_steps_per_second": 1.412,
76
  "step": 50
77
  },
78
  {
79
+ "epoch": 11.0,
80
+ "learning_rate": 3.1799999999999994e-05,
81
+ "loss": 1.5841,
82
  "step": 55
83
  },
84
  {
85
+ "epoch": 12.0,
86
+ "learning_rate": 3.48e-05,
87
+ "loss": 1.5316,
88
  "step": 60
89
  },
90
  {
91
+ "epoch": 13.0,
92
+ "learning_rate": 3.78e-05,
93
+ "loss": 1.4644,
94
  "step": 65
95
  },
96
  {
97
+ "epoch": 14.0,
98
+ "learning_rate": 4.08e-05,
99
+ "loss": 1.3728,
100
  "step": 70
101
  },
102
  {
103
+ "epoch": 15.0,
104
+ "learning_rate": 4.3799999999999994e-05,
105
+ "loss": 1.2692,
106
  "step": 75
107
  },
108
  {
109
+ "epoch": 16.0,
110
+ "learning_rate": 4.56e-05,
111
+ "loss": 1.1998,
112
  "step": 80
113
  },
114
  {
115
+ "epoch": 17.0,
116
+ "learning_rate": 4.8599999999999995e-05,
117
+ "loss": 1.1159,
118
  "step": 85
119
  },
120
  {
121
+ "epoch": 18.0,
122
+ "learning_rate": 5.1599999999999994e-05,
123
+ "loss": 1.0442,
124
  "step": 90
125
  },
126
  {
127
+ "epoch": 19.0,
128
+ "learning_rate": 5.459999999999999e-05,
129
+ "loss": 0.9944,
130
  "step": 95
131
  },
132
  {
133
+ "epoch": 20.0,
134
+ "learning_rate": 5.76e-05,
135
+ "loss": 0.9518,
136
  "step": 100
137
  },
138
  {
139
+ "epoch": 20.0,
140
+ "eval_loss": 0.9818406105041504,
141
+ "eval_runtime": 1.3835,
142
+ "eval_samples_per_second": 7.228,
143
+ "eval_steps_per_second": 1.446,
144
+ "step": 100
145
+ },
146
+ {
147
+ "epoch": 21.0,
148
+ "learning_rate": 6.0599999999999996e-05,
149
+ "loss": 0.908,
150
  "step": 105
151
  },
152
  {
153
+ "epoch": 22.0,
154
+ "learning_rate": 6.359999999999999e-05,
155
+ "loss": 0.8678,
156
  "step": 110
157
  },
158
  {
159
+ "epoch": 23.0,
160
+ "learning_rate": 6.659999999999999e-05,
161
+ "loss": 0.8303,
162
  "step": 115
163
  },
164
  {
165
+ "epoch": 24.0,
166
+ "learning_rate": 6.96e-05,
167
+ "loss": 0.7928,
168
  "step": 120
169
  },
170
  {
171
+ "epoch": 25.0,
172
+ "learning_rate": 7.259999999999999e-05,
173
+ "loss": 0.7594,
174
  "step": 125
175
  },
176
  {
177
+ "epoch": 26.0,
178
+ "learning_rate": 7.56e-05,
179
+ "loss": 0.73,
180
  "step": 130
181
  },
182
  {
183
+ "epoch": 27.0,
184
+ "learning_rate": 7.86e-05,
185
+ "loss": 0.7034,
186
  "step": 135
187
  },
188
  {
189
+ "epoch": 28.0,
190
+ "learning_rate": 8.16e-05,
191
+ "loss": 0.6777,
192
  "step": 140
193
  },
194
  {
195
+ "epoch": 29.0,
196
+ "learning_rate": 8.459999999999998e-05,
197
+ "loss": 0.6493,
198
  "step": 145
199
  },
200
  {
201
+ "epoch": 30.0,
202
+ "learning_rate": 8.759999999999999e-05,
203
+ "loss": 0.6249,
204
  "step": 150
205
  },
206
  {
207
+ "epoch": 30.0,
208
+ "eval_loss": 0.8599645495414734,
209
+ "eval_runtime": 1.4099,
210
+ "eval_samples_per_second": 7.093,
211
+ "eval_steps_per_second": 1.419,
212
+ "step": 150
213
+ },
214
+ {
215
+ "epoch": 31.0,
216
+ "learning_rate": 9.059999999999999e-05,
217
+ "loss": 0.6007,
218
  "step": 155
219
  },
220
  {
221
+ "epoch": 32.0,
222
+ "learning_rate": 9.36e-05,
223
+ "loss": 0.5716,
224
  "step": 160
225
  },
226
  {
227
+ "epoch": 33.0,
228
+ "learning_rate": 9.659999999999999e-05,
229
+ "loss": 0.5465,
230
  "step": 165
231
  },
232
  {
233
+ "epoch": 34.0,
234
+ "learning_rate": 9.96e-05,
235
+ "loss": 0.5191,
236
  "step": 170
237
  },
238
  {
239
+ "epoch": 35.0,
240
+ "learning_rate": 0.0001026,
241
+ "loss": 0.4947,
242
  "step": 175
243
  },
244
  {
245
+ "epoch": 36.0,
246
+ "learning_rate": 0.00010559999999999998,
247
+ "loss": 0.4681,
248
  "step": 180
249
  },
250
  {
251
+ "epoch": 37.0,
252
+ "learning_rate": 0.00010859999999999998,
253
+ "loss": 0.4417,
254
  "step": 185
255
  },
256
  {
257
+ "epoch": 38.0,
258
+ "learning_rate": 0.00011159999999999999,
259
+ "loss": 0.4116,
260
  "step": 190
261
  },
262
  {
263
+ "epoch": 39.0,
264
+ "learning_rate": 0.0001146,
265
+ "loss": 0.3804,
266
  "step": 195
267
  },
268
  {
269
+ "epoch": 40.0,
270
+ "learning_rate": 0.0001176,
271
+ "loss": 0.3544,
272
+ "step": 200
273
+ },
274
+ {
275
+ "epoch": 40.0,
276
+ "eval_loss": 1.1081293821334839,
277
+ "eval_runtime": 1.3764,
278
+ "eval_samples_per_second": 7.265,
279
+ "eval_steps_per_second": 1.453,
280
  "step": 200
281
  },
282
  {
283
+ "epoch": 41.0,
284
+ "learning_rate": 0.00012059999999999999,
285
+ "loss": 0.3248,
286
  "step": 205
287
  },
288
  {
289
+ "epoch": 42.0,
290
+ "learning_rate": 0.0001236,
291
+ "loss": 0.2931,
292
  "step": 210
293
  },
294
  {
295
+ "epoch": 43.0,
296
+ "learning_rate": 0.0001266,
297
+ "loss": 0.2677,
298
  "step": 215
299
  },
300
  {
301
+ "epoch": 44.0,
302
+ "learning_rate": 0.00012959999999999998,
303
+ "loss": 0.2386,
304
  "step": 220
305
  },
306
  {
307
+ "epoch": 45.0,
308
+ "learning_rate": 0.0001326,
309
+ "loss": 0.2142,
310
  "step": 225
311
  },
312
  {
313
+ "epoch": 46.0,
314
+ "learning_rate": 0.0001356,
315
+ "loss": 0.1932,
316
  "step": 230
317
  },
318
  {
319
+ "epoch": 47.0,
320
+ "learning_rate": 0.0001386,
321
+ "loss": 0.1709,
322
  "step": 235
323
  },
324
  {
325
+ "epoch": 48.0,
326
+ "learning_rate": 0.00014159999999999997,
327
+ "loss": 0.1571,
328
  "step": 240
329
  },
330
  {
331
+ "epoch": 49.0,
332
+ "learning_rate": 0.0001446,
333
+ "loss": 0.1417,
334
  "step": 245
335
  },
336
  {
337
+ "epoch": 50.0,
338
+ "learning_rate": 0.00014759999999999998,
339
+ "loss": 0.1184,
340
  "step": 250
341
  },
342
  {
343
+ "epoch": 50.0,
344
+ "eval_loss": 1.5212451219558716,
345
+ "eval_runtime": 1.3827,
346
+ "eval_samples_per_second": 7.232,
347
+ "eval_steps_per_second": 1.446,
348
  "step": 250
349
  },
350
  {
351
+ "epoch": 51.0,
352
+ "learning_rate": 0.00015059999999999997,
353
+ "loss": 0.1096,
354
  "step": 255
355
  },
356
  {
357
+ "epoch": 52.0,
358
+ "learning_rate": 0.0001536,
359
+ "loss": 0.1037,
360
  "step": 260
361
  },
362
  {
363
+ "epoch": 53.0,
364
+ "learning_rate": 0.00015659999999999998,
365
+ "loss": 0.095,
366
  "step": 265
367
  },
368
  {
369
+ "epoch": 54.0,
370
+ "learning_rate": 0.0001596,
371
+ "loss": 0.0865,
372
  "step": 270
373
  },
374
  {
375
+ "epoch": 55.0,
376
+ "learning_rate": 0.0001626,
377
+ "loss": 0.0808,
378
  "step": 275
379
  },
380
  {
381
+ "epoch": 56.0,
382
+ "learning_rate": 0.0001656,
383
+ "loss": 0.0794,
384
  "step": 280
385
  },
386
  {
387
+ "epoch": 57.0,
388
+ "learning_rate": 0.0001686,
389
+ "loss": 0.075,
390
  "step": 285
391
  },
392
  {
393
+ "epoch": 58.0,
394
+ "learning_rate": 0.00017159999999999997,
395
+ "loss": 0.0726,
396
  "step": 290
397
  },
398
  {
399
+ "epoch": 59.0,
400
+ "learning_rate": 0.00017459999999999996,
401
+ "loss": 0.0696,
402
  "step": 295
403
  },
404
  {
405
+ "epoch": 60.0,
406
+ "learning_rate": 0.00017759999999999998,
407
+ "loss": 0.0665,
408
  "step": 300
409
  },
410
  {
411
+ "epoch": 60.0,
412
+ "eval_loss": 1.7048699855804443,
413
+ "eval_runtime": 1.3753,
414
+ "eval_samples_per_second": 7.271,
415
+ "eval_steps_per_second": 1.454,
416
+ "step": 300
417
+ },
418
+ {
419
+ "epoch": 61.0,
420
+ "learning_rate": 0.00018059999999999997,
421
+ "loss": 0.065,
422
  "step": 305
423
  },
424
  {
425
+ "epoch": 62.0,
426
+ "learning_rate": 0.0001836,
427
+ "loss": 0.0623,
428
  "step": 310
429
  },
430
  {
431
+ "epoch": 63.0,
432
+ "learning_rate": 0.00018659999999999998,
433
+ "loss": 0.0574,
434
  "step": 315
435
  },
436
  {
437
+ "epoch": 64.0,
438
+ "learning_rate": 0.00018959999999999997,
439
+ "loss": 0.0577,
440
  "step": 320
441
  },
442
  {
443
+ "epoch": 65.0,
444
+ "learning_rate": 0.0001926,
445
+ "loss": 0.0597,
446
  "step": 325
447
  },
448
  {
449
+ "epoch": 66.0,
450
+ "learning_rate": 0.00019559999999999998,
451
+ "loss": 0.0546,
452
  "step": 330
453
  },
454
  {
455
+ "epoch": 67.0,
456
+ "learning_rate": 0.0001986,
457
+ "loss": 0.0603,
458
  "step": 335
459
  },
460
  {
461
+ "epoch": 68.0,
462
+ "learning_rate": 0.0002016,
463
+ "loss": 0.0555,
464
  "step": 340
465
  },
466
  {
467
+ "epoch": 69.0,
468
+ "learning_rate": 0.00020459999999999999,
469
+ "loss": 0.0551,
470
  "step": 345
471
  },
472
  {
473
+ "epoch": 70.0,
474
+ "learning_rate": 0.00020759999999999998,
475
+ "loss": 0.0529,
476
+ "step": 350
477
+ },
478
+ {
479
+ "epoch": 70.0,
480
+ "eval_loss": 1.901442527770996,
481
+ "eval_runtime": 1.3785,
482
+ "eval_samples_per_second": 7.254,
483
+ "eval_steps_per_second": 1.451,
484
  "step": 350
485
  },
486
  {
487
+ "epoch": 71.0,
488
+ "learning_rate": 0.00021059999999999997,
489
+ "loss": 0.051,
490
  "step": 355
491
  },
492
  {
493
+ "epoch": 72.0,
494
+ "learning_rate": 0.00021359999999999996,
495
+ "loss": 0.05,
496
  "step": 360
497
  },
498
  {
499
+ "epoch": 73.0,
500
+ "learning_rate": 0.00021659999999999998,
501
+ "loss": 0.0493,
502
  "step": 365
503
  },
504
  {
505
+ "epoch": 74.0,
506
+ "learning_rate": 0.00021959999999999997,
507
+ "loss": 0.0465,
508
  "step": 370
509
  },
510
  {
511
+ "epoch": 75.0,
512
+ "learning_rate": 0.0002226,
513
+ "loss": 0.0504,
514
  "step": 375
515
  },
516
  {
517
+ "epoch": 76.0,
518
+ "learning_rate": 0.00022559999999999998,
519
+ "loss": 0.0491,
520
  "step": 380
521
  },
522
  {
523
+ "epoch": 77.0,
524
+ "learning_rate": 0.00022859999999999997,
525
+ "loss": 0.0485,
526
  "step": 385
527
  },
528
  {
529
+ "epoch": 78.0,
530
+ "learning_rate": 0.0002316,
531
+ "loss": 0.0451,
532
  "step": 390
533
  },
534
  {
535
+ "epoch": 79.0,
536
+ "learning_rate": 0.00023459999999999998,
537
+ "loss": 0.0478,
538
  "step": 395
539
  },
540
  {
541
+ "epoch": 80.0,
542
+ "learning_rate": 0.0002376,
543
+ "loss": 0.0435,
544
  "step": 400
545
  },
546
  {
547
+ "epoch": 80.0,
548
+ "eval_loss": 1.9840269088745117,
549
+ "eval_runtime": 1.3787,
550
+ "eval_samples_per_second": 7.253,
551
+ "eval_steps_per_second": 1.451,
552
+ "step": 400
553
+ },
554
+ {
555
+ "epoch": 81.0,
556
+ "learning_rate": 0.0002406,
557
+ "loss": 0.0429,
558
  "step": 405
559
  },
560
  {
561
+ "epoch": 82.0,
562
+ "learning_rate": 0.00024359999999999999,
563
+ "loss": 0.0464,
564
  "step": 410
565
  },
566
  {
567
+ "epoch": 83.0,
568
+ "learning_rate": 0.0002466,
569
+ "loss": 0.0458,
570
  "step": 415
571
  },
572
  {
573
+ "epoch": 84.0,
574
+ "learning_rate": 0.00024959999999999994,
575
+ "loss": 0.0441,
576
  "step": 420
577
  },
578
  {
579
+ "epoch": 85.0,
580
+ "learning_rate": 0.00025259999999999996,
581
+ "loss": 0.0421,
582
  "step": 425
583
  },
584
  {
585
+ "epoch": 86.0,
586
+ "learning_rate": 0.0002556,
587
+ "loss": 0.0433,
588
  "step": 430
589
  },
590
  {
591
+ "epoch": 87.0,
592
+ "learning_rate": 0.0002586,
593
+ "loss": 0.0444,
594
  "step": 435
595
  },
596
  {
597
+ "epoch": 88.0,
598
+ "learning_rate": 0.00026159999999999996,
599
+ "loss": 0.0472,
600
  "step": 440
601
  },
602
  {
603
+ "epoch": 89.0,
604
+ "learning_rate": 0.0002646,
605
+ "loss": 0.0442,
606
  "step": 445
607
  },
608
  {
609
+ "epoch": 90.0,
610
+ "learning_rate": 0.0002676,
611
+ "loss": 0.0431,
612
+ "step": 450
613
+ },
614
+ {
615
+ "epoch": 90.0,
616
+ "eval_loss": 1.839582085609436,
617
+ "eval_runtime": 1.3848,
618
+ "eval_samples_per_second": 7.221,
619
+ "eval_steps_per_second": 1.444,
620
  "step": 450
621
  },
622
  {
623
+ "epoch": 91.0,
624
+ "learning_rate": 0.00027059999999999996,
625
+ "loss": 0.0431,
626
  "step": 455
627
  },
628
  {
629
+ "epoch": 92.0,
630
+ "learning_rate": 0.0002736,
631
+ "loss": 0.044,
632
  "step": 460
633
  },
634
  {
635
+ "epoch": 93.0,
636
+ "learning_rate": 0.0002766,
637
+ "loss": 0.0429,
638
  "step": 465
639
  },
640
  {
641
+ "epoch": 94.0,
642
+ "learning_rate": 0.00027959999999999997,
643
+ "loss": 0.042,
644
  "step": 470
645
  },
646
  {
647
+ "epoch": 95.0,
648
+ "learning_rate": 0.0002826,
649
+ "loss": 0.0415,
650
  "step": 475
651
  },
652
  {
653
+ "epoch": 96.0,
654
+ "learning_rate": 0.00028559999999999995,
655
+ "loss": 0.0433,
656
  "step": 480
657
  },
658
  {
659
+ "epoch": 97.0,
660
+ "learning_rate": 0.00028859999999999997,
661
+ "loss": 0.0405,
662
  "step": 485
663
  },
664
  {
665
+ "epoch": 98.0,
666
+ "learning_rate": 0.0002916,
667
+ "loss": 0.0416,
668
  "step": 490
669
  },
670
  {
671
+ "epoch": 99.0,
672
+ "learning_rate": 0.00029459999999999995,
673
+ "loss": 0.0378,
674
  "step": 495
675
  },
676
  {
677
+ "epoch": 100.0,
678
+ "learning_rate": 0.00029759999999999997,
679
+ "loss": 0.0379,
680
  "step": 500
681
  },
682
  {
683
+ "epoch": 100.0,
684
+ "eval_loss": 1.9192386865615845,
685
+ "eval_runtime": 1.3756,
686
+ "eval_samples_per_second": 7.269,
687
+ "eval_steps_per_second": 1.454,
688
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
689
  }
690
  ],
691
  "max_steps": 10000,
692
+ "num_train_epochs": 2000,
693
+ "total_flos": 8.0956959522816e+16,
694
  "trial_name": null,
695
  "trial_params": null
696
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:938e3a85e1584e1669ed7b89d0c0c72ab3ee7f45a24587d91ef691e943ee42d6
3
  size 3963
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccf3cbed56dd6bc8e51a2fe8aa833170620c235dac2584c4369f03afd0d2c443
3
  size 3963