sekarmulyani commited on
Commit
8d53a12
1 Parent(s): b97fda3

Upload 8 files

Browse files
Files changed (6) hide show
  1. optimizer.pt +1 -1
  2. pytorch_model.bin +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +231 -631
  6. training_args.bin +1 -1
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a1e9a60b91ab1ff423e42b69a9585990dbff8f041adeb2669137349ce385df1
3
  size 995641861
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff010bc66fd61f1d34710e22fc453cbd4986ef52cbca0f808e71d2287359c01a
3
  size 995641861
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdceaa2fa53da129ce343dd96a07c92e444e7dddec6dba851b45a46e5a39cf91
3
  size 497807197
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86bf6ab09b64321efc94f3bc379e531d0a1338c8c5fa3b38c7c52464847c79d8
3
  size 497807197
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:370c3a07f37a8aae6ea141b54ca992b21699546baf7407eb587b6056f787333b
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dd3a816ab8628e6038ecf426e93a907752049203fbc39b63fcde557182a866f
3
  size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39651c15c8edcba0e527a13c5e91b60df7995ee89991b270f951b1ffc793ec92
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41b18054e524341e87a895cc798ffc44bc6c3d095dc41640d72b87475609e792
3
  size 627
trainer_state.json CHANGED
@@ -1,851 +1,451 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 8.0,
5
  "eval_steps": 500,
6
- "global_step": 64056,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.06,
13
- "learning_rate": 9.968777319845137e-06,
14
- "loss": 3.8481,
15
  "step": 500
16
  },
17
  {
18
- "epoch": 0.12,
19
- "learning_rate": 9.937554639690272e-06,
20
- "loss": 3.0215,
21
  "step": 1000
22
  },
23
  {
24
- "epoch": 0.19,
25
- "learning_rate": 9.906331959535406e-06,
26
- "loss": 2.564,
27
  "step": 1500
28
  },
29
  {
30
- "epoch": 0.25,
31
- "learning_rate": 9.875109279380542e-06,
32
- "loss": 2.3377,
33
  "step": 2000
34
  },
35
  {
36
- "epoch": 0.31,
37
- "learning_rate": 9.843886599225678e-06,
38
- "loss": 2.2458,
39
  "step": 2500
40
  },
41
  {
42
- "epoch": 0.37,
43
- "learning_rate": 9.812663919070815e-06,
44
- "loss": 2.1878,
45
  "step": 3000
46
  },
47
  {
48
- "epoch": 0.44,
49
- "learning_rate": 9.78144123891595e-06,
50
- "loss": 2.143,
51
  "step": 3500
52
  },
53
  {
54
- "epoch": 0.5,
55
- "learning_rate": 9.750218558761085e-06,
56
- "loss": 2.113,
57
  "step": 4000
58
  },
59
  {
60
- "epoch": 0.56,
61
- "learning_rate": 9.71899587860622e-06,
62
- "loss": 2.0957,
63
  "step": 4500
64
  },
65
  {
66
- "epoch": 0.62,
67
- "learning_rate": 9.687773198451356e-06,
68
- "loss": 2.0639,
69
  "step": 5000
70
  },
71
  {
72
- "epoch": 0.69,
73
- "learning_rate": 9.65655051829649e-06,
74
- "loss": 2.0461,
75
  "step": 5500
76
  },
77
  {
78
- "epoch": 0.75,
79
- "learning_rate": 9.625327838141627e-06,
80
- "loss": 2.0174,
81
  "step": 6000
82
  },
83
  {
84
- "epoch": 0.81,
85
- "learning_rate": 9.594105157986763e-06,
86
- "loss": 2.0111,
87
  "step": 6500
88
  },
89
  {
90
- "epoch": 0.87,
91
- "learning_rate": 9.562882477831898e-06,
92
- "loss": 1.9898,
93
  "step": 7000
94
  },
95
  {
96
- "epoch": 0.94,
97
- "learning_rate": 9.531659797677034e-06,
98
- "loss": 1.98,
99
  "step": 7500
100
  },
101
  {
102
- "epoch": 1.0,
103
- "learning_rate": 9.500437117522168e-06,
104
- "loss": 1.9702,
105
  "step": 8000
106
  },
107
  {
108
- "epoch": 1.0,
109
- "eval_loss": 1.9547929763793945,
110
- "eval_runtime": 33.4979,
111
- "eval_samples_per_second": 42.421,
112
- "eval_steps_per_second": 5.314,
113
- "step": 8007
114
- },
115
- {
116
- "epoch": 1.06,
117
- "learning_rate": 9.469214437367304e-06,
118
- "loss": 1.9463,
119
  "step": 8500
120
  },
121
  {
122
- "epoch": 1.12,
123
- "learning_rate": 9.43799175721244e-06,
124
- "loss": 1.9336,
125
  "step": 9000
126
  },
127
  {
128
- "epoch": 1.19,
129
- "learning_rate": 9.406769077057575e-06,
130
- "loss": 1.9187,
131
  "step": 9500
132
  },
133
  {
134
- "epoch": 1.25,
135
- "learning_rate": 9.37554639690271e-06,
136
- "loss": 1.9095,
137
  "step": 10000
138
  },
139
  {
140
- "epoch": 1.31,
141
- "learning_rate": 9.344323716747846e-06,
142
- "loss": 1.903,
143
  "step": 10500
144
  },
145
  {
146
- "epoch": 1.37,
147
- "learning_rate": 9.313101036592982e-06,
148
- "loss": 1.8976,
149
  "step": 11000
150
  },
151
  {
152
- "epoch": 1.44,
153
- "learning_rate": 9.281878356438118e-06,
154
- "loss": 1.8823,
 
 
 
 
 
 
 
 
155
  "step": 11500
156
  },
157
  {
158
- "epoch": 1.5,
159
- "learning_rate": 9.250655676283253e-06,
160
- "loss": 1.8908,
161
  "step": 12000
162
  },
163
  {
164
- "epoch": 1.56,
165
- "learning_rate": 9.219432996128387e-06,
166
- "loss": 1.872,
167
  "step": 12500
168
  },
169
  {
170
- "epoch": 1.62,
171
- "learning_rate": 9.188210315973524e-06,
172
- "loss": 1.8637,
173
  "step": 13000
174
  },
175
  {
176
- "epoch": 1.69,
177
- "learning_rate": 9.15698763581866e-06,
178
- "loss": 1.8651,
179
  "step": 13500
180
  },
181
  {
182
- "epoch": 1.75,
183
- "learning_rate": 9.125764955663794e-06,
184
- "loss": 1.8496,
185
  "step": 14000
186
  },
187
  {
188
- "epoch": 1.81,
189
- "learning_rate": 9.09454227550893e-06,
190
- "loss": 1.8411,
191
  "step": 14500
192
  },
193
  {
194
- "epoch": 1.87,
195
- "learning_rate": 9.063319595354067e-06,
196
- "loss": 1.8354,
197
  "step": 15000
198
  },
199
  {
200
- "epoch": 1.94,
201
- "learning_rate": 9.032096915199201e-06,
202
- "loss": 1.8307,
203
  "step": 15500
204
  },
205
  {
206
- "epoch": 2.0,
207
- "learning_rate": 9.000874235044337e-06,
208
- "loss": 1.8301,
209
  "step": 16000
210
  },
211
  {
212
- "epoch": 2.0,
213
- "eval_loss": 1.8236933946609497,
214
- "eval_runtime": 33.5205,
215
- "eval_samples_per_second": 42.392,
216
- "eval_steps_per_second": 5.31,
217
- "step": 16014
218
- },
219
- {
220
- "epoch": 2.06,
221
- "learning_rate": 8.969651554889472e-06,
222
- "loss": 1.8129,
223
  "step": 16500
224
  },
225
  {
226
- "epoch": 2.12,
227
- "learning_rate": 8.938428874734608e-06,
228
- "loss": 1.8119,
229
  "step": 17000
230
  },
231
  {
232
- "epoch": 2.19,
233
- "learning_rate": 8.907206194579744e-06,
234
- "loss": 1.797,
235
  "step": 17500
236
  },
237
  {
238
- "epoch": 2.25,
239
- "learning_rate": 8.875983514424879e-06,
240
- "loss": 1.8038,
241
  "step": 18000
242
  },
243
  {
244
- "epoch": 2.31,
245
- "learning_rate": 8.844760834270013e-06,
246
- "loss": 1.8043,
247
  "step": 18500
248
  },
249
  {
250
- "epoch": 2.37,
251
- "learning_rate": 8.81353815411515e-06,
252
- "loss": 1.7932,
253
  "step": 19000
254
  },
255
  {
256
- "epoch": 2.44,
257
- "learning_rate": 8.782315473960286e-06,
258
- "loss": 1.7881,
259
  "step": 19500
260
  },
261
  {
262
- "epoch": 2.5,
263
- "learning_rate": 8.751092793805422e-06,
264
- "loss": 1.7729,
265
  "step": 20000
266
  },
267
  {
268
- "epoch": 2.56,
269
- "learning_rate": 8.719870113650557e-06,
270
- "loss": 1.7718,
271
  "step": 20500
272
  },
273
  {
274
- "epoch": 2.62,
275
- "learning_rate": 8.688647433495691e-06,
276
- "loss": 1.7715,
277
  "step": 21000
278
  },
279
  {
280
- "epoch": 2.69,
281
- "learning_rate": 8.657424753340827e-06,
282
- "loss": 1.7664,
283
  "step": 21500
284
  },
285
  {
286
- "epoch": 2.75,
287
- "learning_rate": 8.626202073185963e-06,
288
- "loss": 1.7619,
289
  "step": 22000
290
  },
291
  {
292
- "epoch": 2.81,
293
- "learning_rate": 8.594979393031098e-06,
294
- "loss": 1.7563,
295
  "step": 22500
296
  },
297
  {
298
- "epoch": 2.87,
299
- "learning_rate": 8.563756712876234e-06,
300
- "loss": 1.7621,
 
 
 
 
 
 
 
 
301
  "step": 23000
302
  },
303
  {
304
- "epoch": 2.93,
305
- "learning_rate": 8.532534032721369e-06,
306
- "loss": 1.7578,
307
  "step": 23500
308
  },
309
  {
310
- "epoch": 3.0,
311
- "learning_rate": 8.501311352566505e-06,
312
- "loss": 1.7502,
313
  "step": 24000
314
  },
315
  {
316
- "epoch": 3.0,
317
- "eval_loss": 1.7523757219314575,
318
- "eval_runtime": 33.535,
319
- "eval_samples_per_second": 42.374,
320
- "eval_steps_per_second": 5.308,
321
- "step": 24021
322
- },
323
- {
324
- "epoch": 3.06,
325
- "learning_rate": 8.470088672411641e-06,
326
- "loss": 1.7503,
327
  "step": 24500
328
  },
329
  {
330
- "epoch": 3.12,
331
- "learning_rate": 8.438865992256776e-06,
332
- "loss": 1.7381,
333
  "step": 25000
334
  },
335
  {
336
- "epoch": 3.18,
337
- "learning_rate": 8.407643312101912e-06,
338
- "loss": 1.7336,
339
  "step": 25500
340
  },
341
  {
342
- "epoch": 3.25,
343
- "learning_rate": 8.376420631947046e-06,
344
- "loss": 1.7252,
345
  "step": 26000
346
  },
347
  {
348
- "epoch": 3.31,
349
- "learning_rate": 8.345197951792183e-06,
350
- "loss": 1.7312,
351
  "step": 26500
352
  },
353
  {
354
- "epoch": 3.37,
355
- "learning_rate": 8.313975271637319e-06,
356
- "loss": 1.7271,
357
  "step": 27000
358
  },
359
  {
360
- "epoch": 3.43,
361
- "learning_rate": 8.282752591482453e-06,
362
- "loss": 1.7336,
363
  "step": 27500
364
  },
365
  {
366
- "epoch": 3.5,
367
- "learning_rate": 8.25152991132759e-06,
368
- "loss": 1.7251,
369
  "step": 28000
370
  },
371
  {
372
- "epoch": 3.56,
373
- "learning_rate": 8.220307231172726e-06,
374
- "loss": 1.7162,
375
  "step": 28500
376
  },
377
  {
378
- "epoch": 3.62,
379
- "learning_rate": 8.18908455101786e-06,
380
- "loss": 1.7067,
381
  "step": 29000
382
  },
383
  {
384
- "epoch": 3.68,
385
- "learning_rate": 8.157861870862995e-06,
386
- "loss": 1.7104,
387
  "step": 29500
388
  },
389
  {
390
- "epoch": 3.75,
391
- "learning_rate": 8.126639190708131e-06,
392
- "loss": 1.7089,
393
  "step": 30000
394
  },
395
  {
396
- "epoch": 3.81,
397
- "learning_rate": 8.095416510553267e-06,
398
- "loss": 1.7087,
399
  "step": 30500
400
  },
401
  {
402
- "epoch": 3.87,
403
- "learning_rate": 8.064193830398402e-06,
404
- "loss": 1.6976,
405
  "step": 31000
406
  },
407
  {
408
- "epoch": 3.93,
409
- "learning_rate": 8.032971150243538e-06,
410
- "loss": 1.6907,
411
  "step": 31500
412
  },
413
  {
414
- "epoch": 4.0,
415
- "learning_rate": 8.001748470088672e-06,
416
- "loss": 1.6994,
417
  "step": 32000
418
  },
419
  {
420
- "epoch": 4.0,
421
- "eval_loss": 1.705617070198059,
422
- "eval_runtime": 33.5302,
423
- "eval_samples_per_second": 42.38,
424
- "eval_steps_per_second": 5.309,
425
- "step": 32028
426
- },
427
- {
428
- "epoch": 4.06,
429
- "learning_rate": 7.970525789933809e-06,
430
- "loss": 1.6913,
431
  "step": 32500
432
  },
433
  {
434
- "epoch": 4.12,
435
- "learning_rate": 7.939303109778945e-06,
436
- "loss": 1.6853,
437
  "step": 33000
438
  },
439
  {
440
- "epoch": 4.18,
441
- "learning_rate": 7.90808042962408e-06,
442
- "loss": 1.6854,
443
  "step": 33500
444
  },
445
  {
446
- "epoch": 4.25,
447
- "learning_rate": 7.876857749469215e-06,
448
- "loss": 1.6884,
449
  "step": 34000
450
  },
451
  {
452
- "epoch": 4.31,
453
- "learning_rate": 7.84563506931435e-06,
454
- "loss": 1.6813,
455
- "step": 34500
456
- },
457
- {
458
- "epoch": 4.37,
459
- "learning_rate": 7.814412389159486e-06,
460
- "loss": 1.6834,
461
- "step": 35000
462
- },
463
- {
464
- "epoch": 4.43,
465
- "learning_rate": 7.783189709004622e-06,
466
- "loss": 1.6717,
467
- "step": 35500
468
- },
469
- {
470
- "epoch": 4.5,
471
- "learning_rate": 7.751967028849757e-06,
472
- "loss": 1.6712,
473
- "step": 36000
474
- },
475
- {
476
- "epoch": 4.56,
477
- "learning_rate": 7.720744348694893e-06,
478
- "loss": 1.6873,
479
- "step": 36500
480
- },
481
- {
482
- "epoch": 4.62,
483
- "learning_rate": 7.689521668540028e-06,
484
- "loss": 1.6688,
485
- "step": 37000
486
- },
487
- {
488
- "epoch": 4.68,
489
- "learning_rate": 7.658298988385164e-06,
490
- "loss": 1.6589,
491
- "step": 37500
492
- },
493
- {
494
- "epoch": 4.75,
495
- "learning_rate": 7.627076308230299e-06,
496
- "loss": 1.6668,
497
- "step": 38000
498
- },
499
- {
500
- "epoch": 4.81,
501
- "learning_rate": 7.5958536280754345e-06,
502
- "loss": 1.6665,
503
- "step": 38500
504
- },
505
- {
506
- "epoch": 4.87,
507
- "learning_rate": 7.564630947920571e-06,
508
- "loss": 1.6679,
509
- "step": 39000
510
- },
511
- {
512
- "epoch": 4.93,
513
- "learning_rate": 7.533408267765706e-06,
514
- "loss": 1.6624,
515
- "step": 39500
516
- },
517
- {
518
- "epoch": 5.0,
519
- "learning_rate": 7.502185587610841e-06,
520
- "loss": 1.6621,
521
- "step": 40000
522
- },
523
- {
524
- "epoch": 5.0,
525
- "eval_loss": 1.6710957288742065,
526
- "eval_runtime": 33.5526,
527
- "eval_samples_per_second": 42.351,
528
- "eval_steps_per_second": 5.305,
529
- "step": 40035
530
- },
531
- {
532
- "epoch": 5.06,
533
- "learning_rate": 7.470962907455977e-06,
534
- "loss": 1.6497,
535
- "step": 40500
536
- },
537
- {
538
- "epoch": 5.12,
539
- "learning_rate": 7.439740227301112e-06,
540
- "loss": 1.6487,
541
- "step": 41000
542
- },
543
- {
544
- "epoch": 5.18,
545
- "learning_rate": 7.4085175471462475e-06,
546
- "loss": 1.6439,
547
- "step": 41500
548
- },
549
- {
550
- "epoch": 5.25,
551
- "learning_rate": 7.377294866991384e-06,
552
- "loss": 1.6552,
553
- "step": 42000
554
- },
555
- {
556
- "epoch": 5.31,
557
- "learning_rate": 7.346072186836518e-06,
558
- "loss": 1.6473,
559
- "step": 42500
560
- },
561
- {
562
- "epoch": 5.37,
563
- "learning_rate": 7.314849506681654e-06,
564
- "loss": 1.6447,
565
- "step": 43000
566
- },
567
- {
568
- "epoch": 5.43,
569
- "learning_rate": 7.28362682652679e-06,
570
- "loss": 1.6463,
571
- "step": 43500
572
- },
573
- {
574
- "epoch": 5.5,
575
- "learning_rate": 7.252404146371925e-06,
576
- "loss": 1.6493,
577
- "step": 44000
578
- },
579
- {
580
- "epoch": 5.56,
581
- "learning_rate": 7.2211814662170606e-06,
582
- "loss": 1.6433,
583
- "step": 44500
584
- },
585
- {
586
- "epoch": 5.62,
587
- "learning_rate": 7.189958786062197e-06,
588
- "loss": 1.6483,
589
- "step": 45000
590
- },
591
- {
592
- "epoch": 5.68,
593
- "learning_rate": 7.158736105907331e-06,
594
- "loss": 1.6347,
595
- "step": 45500
596
- },
597
- {
598
- "epoch": 5.74,
599
- "learning_rate": 7.127513425752467e-06,
600
- "loss": 1.6363,
601
- "step": 46000
602
- },
603
- {
604
- "epoch": 5.81,
605
- "learning_rate": 7.096290745597603e-06,
606
- "loss": 1.6284,
607
- "step": 46500
608
- },
609
- {
610
- "epoch": 5.87,
611
- "learning_rate": 7.065068065442738e-06,
612
- "loss": 1.6268,
613
- "step": 47000
614
- },
615
- {
616
- "epoch": 5.93,
617
- "learning_rate": 7.033845385287874e-06,
618
- "loss": 1.633,
619
- "step": 47500
620
- },
621
- {
622
- "epoch": 5.99,
623
- "learning_rate": 7.002622705133009e-06,
624
- "loss": 1.6313,
625
- "step": 48000
626
- },
627
- {
628
- "epoch": 6.0,
629
- "eval_loss": 1.644548773765564,
630
- "eval_runtime": 33.5436,
631
- "eval_samples_per_second": 42.363,
632
- "eval_steps_per_second": 5.307,
633
- "step": 48042
634
- },
635
- {
636
- "epoch": 6.06,
637
- "learning_rate": 6.971400024978144e-06,
638
- "loss": 1.628,
639
- "step": 48500
640
- },
641
- {
642
- "epoch": 6.12,
643
- "learning_rate": 6.9401773448232805e-06,
644
- "loss": 1.6263,
645
- "step": 49000
646
- },
647
- {
648
- "epoch": 6.18,
649
- "learning_rate": 6.908954664668416e-06,
650
- "loss": 1.6154,
651
- "step": 49500
652
- },
653
- {
654
- "epoch": 6.24,
655
- "learning_rate": 6.877731984513551e-06,
656
- "loss": 1.6141,
657
- "step": 50000
658
- },
659
- {
660
- "epoch": 6.31,
661
- "learning_rate": 6.846509304358687e-06,
662
- "loss": 1.6198,
663
- "step": 50500
664
- },
665
- {
666
- "epoch": 6.37,
667
- "learning_rate": 6.815286624203822e-06,
668
- "loss": 1.6131,
669
- "step": 51000
670
- },
671
- {
672
- "epoch": 6.43,
673
- "learning_rate": 6.784063944048957e-06,
674
- "loss": 1.6178,
675
- "step": 51500
676
- },
677
- {
678
- "epoch": 6.49,
679
- "learning_rate": 6.7528412638940935e-06,
680
- "loss": 1.6108,
681
- "step": 52000
682
- },
683
- {
684
- "epoch": 6.56,
685
- "learning_rate": 6.721618583739229e-06,
686
- "loss": 1.619,
687
- "step": 52500
688
- },
689
- {
690
- "epoch": 6.62,
691
- "learning_rate": 6.690395903584364e-06,
692
- "loss": 1.6167,
693
- "step": 53000
694
- },
695
- {
696
- "epoch": 6.68,
697
- "learning_rate": 6.6591732234294996e-06,
698
- "loss": 1.6051,
699
- "step": 53500
700
- },
701
- {
702
- "epoch": 6.74,
703
- "learning_rate": 6.627950543274635e-06,
704
- "loss": 1.6156,
705
- "step": 54000
706
- },
707
- {
708
- "epoch": 6.81,
709
- "learning_rate": 6.59672786311977e-06,
710
- "loss": 1.6155,
711
- "step": 54500
712
- },
713
- {
714
- "epoch": 6.87,
715
- "learning_rate": 6.5655051829649065e-06,
716
- "loss": 1.6074,
717
- "step": 55000
718
- },
719
- {
720
- "epoch": 6.93,
721
- "learning_rate": 6.534282502810042e-06,
722
- "loss": 1.6082,
723
- "step": 55500
724
- },
725
- {
726
- "epoch": 6.99,
727
- "learning_rate": 6.503059822655178e-06,
728
- "loss": 1.6009,
729
- "step": 56000
730
- },
731
- {
732
- "epoch": 7.0,
733
- "eval_loss": 1.6238889694213867,
734
- "eval_runtime": 33.6415,
735
- "eval_samples_per_second": 42.24,
736
- "eval_steps_per_second": 5.291,
737
- "step": 56049
738
- },
739
- {
740
- "epoch": 7.06,
741
- "learning_rate": 6.471837142500313e-06,
742
- "loss": 1.6086,
743
- "step": 56500
744
- },
745
- {
746
- "epoch": 7.12,
747
- "learning_rate": 6.440614462345448e-06,
748
- "loss": 1.597,
749
- "step": 57000
750
- },
751
- {
752
- "epoch": 7.18,
753
- "learning_rate": 6.409391782190584e-06,
754
- "loss": 1.5983,
755
- "step": 57500
756
- },
757
- {
758
- "epoch": 7.24,
759
- "learning_rate": 6.3781691020357195e-06,
760
- "loss": 1.6085,
761
- "step": 58000
762
- },
763
- {
764
- "epoch": 7.31,
765
- "learning_rate": 6.346946421880855e-06,
766
- "loss": 1.5866,
767
- "step": 58500
768
- },
769
- {
770
- "epoch": 7.37,
771
- "learning_rate": 6.315723741725989e-06,
772
- "loss": 1.5995,
773
- "step": 59000
774
- },
775
- {
776
- "epoch": 7.43,
777
- "learning_rate": 6.284501061571126e-06,
778
- "loss": 1.5943,
779
- "step": 59500
780
- },
781
- {
782
- "epoch": 7.49,
783
- "learning_rate": 6.253278381416261e-06,
784
- "loss": 1.5994,
785
- "step": 60000
786
- },
787
- {
788
- "epoch": 7.56,
789
- "learning_rate": 6.222055701261397e-06,
790
- "loss": 1.5913,
791
- "step": 60500
792
- },
793
- {
794
- "epoch": 7.62,
795
- "learning_rate": 6.1908330211065325e-06,
796
- "loss": 1.6005,
797
- "step": 61000
798
- },
799
- {
800
- "epoch": 7.68,
801
- "learning_rate": 6.159610340951669e-06,
802
- "loss": 1.5857,
803
- "step": 61500
804
- },
805
- {
806
- "epoch": 7.74,
807
- "learning_rate": 6.128387660796803e-06,
808
- "loss": 1.5808,
809
- "step": 62000
810
- },
811
- {
812
- "epoch": 7.81,
813
- "learning_rate": 6.097164980641939e-06,
814
- "loss": 1.582,
815
- "step": 62500
816
- },
817
- {
818
- "epoch": 7.87,
819
- "learning_rate": 6.065942300487074e-06,
820
- "loss": 1.587,
821
- "step": 63000
822
- },
823
- {
824
- "epoch": 7.93,
825
- "learning_rate": 6.03471962033221e-06,
826
- "loss": 1.5858,
827
- "step": 63500
828
- },
829
- {
830
- "epoch": 7.99,
831
- "learning_rate": 6.0034969401773455e-06,
832
- "loss": 1.5835,
833
- "step": 64000
834
- },
835
- {
836
- "epoch": 8.0,
837
- "eval_loss": 1.6067923307418823,
838
- "eval_runtime": 33.5833,
839
- "eval_samples_per_second": 42.313,
840
- "eval_steps_per_second": 5.3,
841
- "step": 64056
842
  }
843
  ],
844
  "logging_steps": 500,
845
- "max_steps": 160140,
846
- "num_train_epochs": 20,
847
  "save_steps": 500,
848
- "total_flos": 1.00412960145408e+17,
849
  "trial_name": null,
850
  "trial_params": null
851
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
  "eval_steps": 500,
6
+ "global_step": 34431,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.04,
13
+ "learning_rate": 9.945543260433913e-06,
14
+ "loss": 3.7697,
15
  "step": 500
16
  },
17
  {
18
+ "epoch": 0.09,
19
+ "learning_rate": 9.891086520867823e-06,
20
+ "loss": 2.9544,
21
  "step": 1000
22
  },
23
  {
24
+ "epoch": 0.13,
25
+ "learning_rate": 9.836629781301735e-06,
26
+ "loss": 2.4309,
27
  "step": 1500
28
  },
29
  {
30
+ "epoch": 0.17,
31
+ "learning_rate": 9.782173041735646e-06,
32
+ "loss": 2.1416,
33
  "step": 2000
34
  },
35
  {
36
+ "epoch": 0.22,
37
+ "learning_rate": 9.727716302169558e-06,
38
+ "loss": 2.0346,
39
  "step": 2500
40
  },
41
  {
42
+ "epoch": 0.26,
43
+ "learning_rate": 9.673259562603468e-06,
44
+ "loss": 1.9859,
45
  "step": 3000
46
  },
47
  {
48
+ "epoch": 0.3,
49
+ "learning_rate": 9.61880282303738e-06,
50
+ "loss": 1.9512,
51
  "step": 3500
52
  },
53
  {
54
+ "epoch": 0.35,
55
+ "learning_rate": 9.56434608347129e-06,
56
+ "loss": 1.9171,
57
  "step": 4000
58
  },
59
  {
60
+ "epoch": 0.39,
61
+ "learning_rate": 9.509889343905202e-06,
62
+ "loss": 1.8989,
63
  "step": 4500
64
  },
65
  {
66
+ "epoch": 0.44,
67
+ "learning_rate": 9.455432604339114e-06,
68
+ "loss": 1.868,
69
  "step": 5000
70
  },
71
  {
72
+ "epoch": 0.48,
73
+ "learning_rate": 9.400975864773026e-06,
74
+ "loss": 1.8423,
75
  "step": 5500
76
  },
77
  {
78
+ "epoch": 0.52,
79
+ "learning_rate": 9.346519125206936e-06,
80
+ "loss": 1.8311,
81
  "step": 6000
82
  },
83
  {
84
+ "epoch": 0.57,
85
+ "learning_rate": 9.292062385640848e-06,
86
+ "loss": 1.8139,
87
  "step": 6500
88
  },
89
  {
90
+ "epoch": 0.61,
91
+ "learning_rate": 9.237605646074758e-06,
92
+ "loss": 1.809,
93
  "step": 7000
94
  },
95
  {
96
+ "epoch": 0.65,
97
+ "learning_rate": 9.18314890650867e-06,
98
+ "loss": 1.7932,
99
  "step": 7500
100
  },
101
  {
102
+ "epoch": 0.7,
103
+ "learning_rate": 9.128692166942582e-06,
104
+ "loss": 1.7807,
105
  "step": 8000
106
  },
107
  {
108
+ "epoch": 0.74,
109
+ "learning_rate": 9.074235427376494e-06,
110
+ "loss": 1.7729,
 
 
 
 
 
 
 
 
111
  "step": 8500
112
  },
113
  {
114
+ "epoch": 0.78,
115
+ "learning_rate": 9.019778687810404e-06,
116
+ "loss": 1.7695,
117
  "step": 9000
118
  },
119
  {
120
+ "epoch": 0.83,
121
+ "learning_rate": 8.965321948244316e-06,
122
+ "loss": 1.7464,
123
  "step": 9500
124
  },
125
  {
126
+ "epoch": 0.87,
127
+ "learning_rate": 8.910865208678226e-06,
128
+ "loss": 1.7436,
129
  "step": 10000
130
  },
131
  {
132
+ "epoch": 0.91,
133
+ "learning_rate": 8.856408469112138e-06,
134
+ "loss": 1.736,
135
  "step": 10500
136
  },
137
  {
138
+ "epoch": 0.96,
139
+ "learning_rate": 8.80195172954605e-06,
140
+ "loss": 1.729,
141
  "step": 11000
142
  },
143
  {
144
+ "epoch": 1.0,
145
+ "eval_loss": 1.7001079320907593,
146
+ "eval_runtime": 3.4215,
147
+ "eval_samples_per_second": 39.748,
148
+ "eval_steps_per_second": 4.969,
149
+ "step": 11477
150
+ },
151
+ {
152
+ "epoch": 1.0,
153
+ "learning_rate": 8.747494989979961e-06,
154
+ "loss": 1.7264,
155
  "step": 11500
156
  },
157
  {
158
+ "epoch": 1.05,
159
+ "learning_rate": 8.693038250413871e-06,
160
+ "loss": 1.708,
161
  "step": 12000
162
  },
163
  {
164
+ "epoch": 1.09,
165
+ "learning_rate": 8.638581510847783e-06,
166
+ "loss": 1.6948,
167
  "step": 12500
168
  },
169
  {
170
+ "epoch": 1.13,
171
+ "learning_rate": 8.584124771281695e-06,
172
+ "loss": 1.698,
173
  "step": 13000
174
  },
175
  {
176
+ "epoch": 1.18,
177
+ "learning_rate": 8.529668031715605e-06,
178
+ "loss": 1.692,
179
  "step": 13500
180
  },
181
  {
182
+ "epoch": 1.22,
183
+ "learning_rate": 8.475211292149517e-06,
184
+ "loss": 1.684,
185
  "step": 14000
186
  },
187
  {
188
+ "epoch": 1.26,
189
+ "learning_rate": 8.420754552583429e-06,
190
+ "loss": 1.6879,
191
  "step": 14500
192
  },
193
  {
194
+ "epoch": 1.31,
195
+ "learning_rate": 8.36629781301734e-06,
196
+ "loss": 1.6804,
197
  "step": 15000
198
  },
199
  {
200
+ "epoch": 1.35,
201
+ "learning_rate": 8.311841073451251e-06,
202
+ "loss": 1.6713,
203
  "step": 15500
204
  },
205
  {
206
+ "epoch": 1.39,
207
+ "learning_rate": 8.257384333885163e-06,
208
+ "loss": 1.6703,
209
  "step": 16000
210
  },
211
  {
212
+ "epoch": 1.44,
213
+ "learning_rate": 8.202927594319073e-06,
214
+ "loss": 1.6646,
 
 
 
 
 
 
 
 
215
  "step": 16500
216
  },
217
  {
218
+ "epoch": 1.48,
219
+ "learning_rate": 8.148470854752985e-06,
220
+ "loss": 1.651,
221
  "step": 17000
222
  },
223
  {
224
+ "epoch": 1.52,
225
+ "learning_rate": 8.094014115186897e-06,
226
+ "loss": 1.6488,
227
  "step": 17500
228
  },
229
  {
230
+ "epoch": 1.57,
231
+ "learning_rate": 8.039557375620808e-06,
232
+ "loss": 1.6452,
233
  "step": 18000
234
  },
235
  {
236
+ "epoch": 1.61,
237
+ "learning_rate": 7.985100636054719e-06,
238
+ "loss": 1.6386,
239
  "step": 18500
240
  },
241
  {
242
+ "epoch": 1.66,
243
+ "learning_rate": 7.93064389648863e-06,
244
+ "loss": 1.6349,
245
  "step": 19000
246
  },
247
  {
248
+ "epoch": 1.7,
249
+ "learning_rate": 7.87618715692254e-06,
250
+ "loss": 1.6345,
251
  "step": 19500
252
  },
253
  {
254
+ "epoch": 1.74,
255
+ "learning_rate": 7.821730417356452e-06,
256
+ "loss": 1.6294,
257
  "step": 20000
258
  },
259
  {
260
+ "epoch": 1.79,
261
+ "learning_rate": 7.767273677790364e-06,
262
+ "loss": 1.631,
263
  "step": 20500
264
  },
265
  {
266
+ "epoch": 1.83,
267
+ "learning_rate": 7.712816938224276e-06,
268
+ "loss": 1.6261,
269
  "step": 21000
270
  },
271
  {
272
+ "epoch": 1.87,
273
+ "learning_rate": 7.658360198658186e-06,
274
+ "loss": 1.6281,
275
  "step": 21500
276
  },
277
  {
278
+ "epoch": 1.92,
279
+ "learning_rate": 7.603903459092098e-06,
280
+ "loss": 1.611,
281
  "step": 22000
282
  },
283
  {
284
+ "epoch": 1.96,
285
+ "learning_rate": 7.549446719526009e-06,
286
+ "loss": 1.6155,
287
  "step": 22500
288
  },
289
  {
290
+ "epoch": 2.0,
291
+ "eval_loss": 1.596663475036621,
292
+ "eval_runtime": 3.4296,
293
+ "eval_samples_per_second": 39.655,
294
+ "eval_steps_per_second": 4.957,
295
+ "step": 22954
296
+ },
297
+ {
298
+ "epoch": 2.0,
299
+ "learning_rate": 7.49498997995992e-06,
300
+ "loss": 1.6029,
301
  "step": 23000
302
  },
303
  {
304
+ "epoch": 2.05,
305
+ "learning_rate": 7.440533240393831e-06,
306
+ "loss": 1.607,
307
  "step": 23500
308
  },
309
  {
310
+ "epoch": 2.09,
311
+ "learning_rate": 7.386076500827744e-06,
312
+ "loss": 1.5977,
313
  "step": 24000
314
  },
315
  {
316
+ "epoch": 2.13,
317
+ "learning_rate": 7.331619761261655e-06,
318
+ "loss": 1.5922,
 
 
 
 
 
 
 
 
319
  "step": 24500
320
  },
321
  {
322
+ "epoch": 2.18,
323
+ "learning_rate": 7.277163021695566e-06,
324
+ "loss": 1.5956,
325
  "step": 25000
326
  },
327
  {
328
+ "epoch": 2.22,
329
+ "learning_rate": 7.222706282129477e-06,
330
+ "loss": 1.5855,
331
  "step": 25500
332
  },
333
  {
334
+ "epoch": 2.27,
335
+ "learning_rate": 7.168249542563388e-06,
336
+ "loss": 1.5826,
337
  "step": 26000
338
  },
339
  {
340
+ "epoch": 2.31,
341
+ "learning_rate": 7.1137928029972995e-06,
342
+ "loss": 1.5846,
343
  "step": 26500
344
  },
345
  {
346
+ "epoch": 2.35,
347
+ "learning_rate": 7.059336063431211e-06,
348
+ "loss": 1.5899,
349
  "step": 27000
350
  },
351
  {
352
+ "epoch": 2.4,
353
+ "learning_rate": 7.004879323865122e-06,
354
+ "loss": 1.5828,
355
  "step": 27500
356
  },
357
  {
358
+ "epoch": 2.44,
359
+ "learning_rate": 6.950422584299033e-06,
360
+ "loss": 1.5762,
361
  "step": 28000
362
  },
363
  {
364
+ "epoch": 2.48,
365
+ "learning_rate": 6.895965844732945e-06,
366
+ "loss": 1.5739,
367
  "step": 28500
368
  },
369
  {
370
+ "epoch": 2.53,
371
+ "learning_rate": 6.841509105166856e-06,
372
+ "loss": 1.574,
373
  "step": 29000
374
  },
375
  {
376
+ "epoch": 2.57,
377
+ "learning_rate": 6.787052365600767e-06,
378
+ "loss": 1.5759,
379
  "step": 29500
380
  },
381
  {
382
+ "epoch": 2.61,
383
+ "learning_rate": 6.732595626034678e-06,
384
+ "loss": 1.5737,
385
  "step": 30000
386
  },
387
  {
388
+ "epoch": 2.66,
389
+ "learning_rate": 6.67813888646859e-06,
390
+ "loss": 1.5637,
391
  "step": 30500
392
  },
393
  {
394
+ "epoch": 2.7,
395
+ "learning_rate": 6.623682146902502e-06,
396
+ "loss": 1.5635,
397
  "step": 31000
398
  },
399
  {
400
+ "epoch": 2.74,
401
+ "learning_rate": 6.569225407336413e-06,
402
+ "loss": 1.5641,
403
  "step": 31500
404
  },
405
  {
406
+ "epoch": 2.79,
407
+ "learning_rate": 6.514768667770324e-06,
408
+ "loss": 1.553,
409
  "step": 32000
410
  },
411
  {
412
+ "epoch": 2.83,
413
+ "learning_rate": 6.460311928204235e-06,
414
+ "loss": 1.5699,
 
 
 
 
 
 
 
 
415
  "step": 32500
416
  },
417
  {
418
+ "epoch": 2.88,
419
+ "learning_rate": 6.405855188638146e-06,
420
+ "loss": 1.5695,
421
  "step": 33000
422
  },
423
  {
424
+ "epoch": 2.92,
425
+ "learning_rate": 6.3513984490720584e-06,
426
+ "loss": 1.5665,
427
  "step": 33500
428
  },
429
  {
430
+ "epoch": 2.96,
431
+ "learning_rate": 6.296941709505969e-06,
432
+ "loss": 1.5527,
433
  "step": 34000
434
  },
435
  {
436
+ "epoch": 3.0,
437
+ "eval_loss": 1.5436657667160034,
438
+ "eval_runtime": 3.2624,
439
+ "eval_samples_per_second": 41.687,
440
+ "eval_steps_per_second": 5.211,
441
+ "step": 34431
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
  }
443
  ],
444
  "logging_steps": 500,
445
+ "max_steps": 91816,
446
+ "num_train_epochs": 8,
447
  "save_steps": 500,
448
+ "total_flos": 5.397574828032e+16,
449
  "trial_name": null,
450
  "trial_params": null
451
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:571766107e6fb04c3fc5b250f343c7485c50c5a9f3e7aaf19f68a994ad56346d
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed2cc3807546f2cb55ecbce521c3690c744d9469e27b3404476816476ca082c6
3
  size 4027