edanigoben commited on
Commit
c048cb0
·
1 Parent(s): 287b37a

Classification tuned over mask lm 50 epochs 32 batch size

Browse files
Files changed (8) hide show
  1. config.json +1 -1
  2. optimizer.pt +2 -2
  3. pytorch_model.bin +1 -1
  4. rng_state.pth +1 -1
  5. scaler.pt +1 -1
  6. scheduler.pt +1 -1
  7. trainer_state.json +336 -544
  8. training_args.bin +1 -1
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "google/electra-small-generator",
3
  "architectures": [
4
  "ElectraForMaskedLM"
5
  ],
 
1
  {
2
+ "_name_or_path": "factored/electra-fr-explorer-mlm",
3
  "architectures": [
4
  "ElectraForMaskedLM"
5
  ],
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a41f3b11c84d7da4a9743ce8310e19901dc457aa737f4fc4e0dbeafe07c03c7
3
- size 108493381
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c1eb0a9ede330f1a2ae441653c5d937ad29a8203bd9ca40964757dcfd7a8626
3
+ size 108492997
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39e43fb05f5efb970cf90a967a7751e994326d5f2685efd9a3b1e85036634221
3
  size 54261249
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70cc7c05cec640a561b87f47167d3df540a2e905e84cd75dc591796f7377b6c9
3
  size 54261249
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cea533a89b2544a3dd1251891c69b80a38083ac6e4a328a74fb9dcf37f83a3d6
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57398a737a50b74774b4d2844b3914f0610d12c7b222d5be2a9fcf7797cb99b8
3
  size 14575
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50aea151c5f20e32891d48e17d25cb58a62ab96c29ad66fed893801a78352bd2
3
  size 557
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16972d715adf496be8933e6bd369a2b9ee09d5563bc0a104babf5df390e74ec5
3
  size 557
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a68a014ef7352c80af8f4bee88886be9fab2aaa62e4867fa274fd40142f3442
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e843c34009ed217f8b4ed69dfa5bbf79f39670df24945af9fa5f6a41852cef2
3
  size 627
trainer_state.json CHANGED
@@ -1,702 +1,494 @@
1
  {
2
- "best_metric": 1.3500477075576782,
3
- "best_model_checkpoint": "./output_c/checkpoint-842163",
4
- "epoch": 49.0,
5
- "global_step": 842163,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 1.0,
12
- "learning_rate": 1.960020946063886e-05,
13
- "loss": 2.4473,
14
- "step": 17186
15
- },
16
- {
17
- "epoch": 1.0,
18
- "eval_loss": 2.0222833156585693,
19
- "eval_runtime": 62.1889,
20
- "eval_samples_per_second": 1104.941,
21
- "eval_steps_per_second": 34.54,
22
- "step": 17187
23
- },
24
- {
25
- "epoch": 2.0,
26
- "learning_rate": 1.920032582766044e-05,
27
- "loss": 2.0795,
28
- "step": 34372
29
  },
30
  {
31
  "epoch": 2.0,
32
- "eval_loss": 1.871239185333252,
33
- "eval_runtime": 58.1601,
34
- "eval_samples_per_second": 1181.481,
35
- "eval_steps_per_second": 36.933,
36
- "step": 34374
37
  },
38
  {
39
  "epoch": 3.0,
40
- "learning_rate": 1.880051201489498e-05,
41
- "loss": 1.958,
42
- "step": 51558
43
- },
44
- {
45
- "epoch": 3.0,
46
- "eval_loss": 1.781246542930603,
47
- "eval_runtime": 54.7184,
48
- "eval_samples_per_second": 1255.793,
49
- "eval_steps_per_second": 39.256,
50
- "step": 51561
51
  },
52
  {
53
  "epoch": 4.0,
54
- "learning_rate": 1.8400698202129518e-05,
55
- "loss": 1.883,
56
- "step": 68744
57
- },
58
- {
59
- "epoch": 4.0,
60
- "eval_loss": 1.7132632732391357,
61
- "eval_runtime": 53.9408,
62
- "eval_samples_per_second": 1273.896,
63
- "eval_steps_per_second": 39.821,
64
- "step": 68748
65
- },
66
- {
67
- "epoch": 5.0,
68
- "learning_rate": 1.8000884389364056e-05,
69
- "loss": 1.8308,
70
- "step": 85930
71
  },
72
  {
73
  "epoch": 5.0,
74
- "eval_loss": 1.6726739406585693,
75
- "eval_runtime": 54.0625,
76
- "eval_samples_per_second": 1271.03,
77
- "eval_steps_per_second": 39.732,
78
- "step": 85935
79
  },
80
  {
81
  "epoch": 6.0,
82
- "learning_rate": 1.7601047303194276e-05,
83
- "loss": 1.7889,
84
- "step": 103116
85
- },
86
- {
87
- "epoch": 6.0,
88
- "eval_loss": 1.6451952457427979,
89
- "eval_runtime": 56.4005,
90
- "eval_samples_per_second": 1218.341,
91
- "eval_steps_per_second": 38.085,
92
- "step": 103122
93
  },
94
  {
95
  "epoch": 7.0,
96
- "learning_rate": 1.7201256763833133e-05,
97
- "loss": 1.7562,
98
- "step": 120302
99
- },
100
- {
101
- "epoch": 7.0,
102
- "eval_loss": 1.6122530698776245,
103
- "eval_runtime": 56.1664,
104
- "eval_samples_per_second": 1223.419,
105
- "eval_steps_per_second": 38.244,
106
- "step": 120309
107
- },
108
- {
109
- "epoch": 8.0,
110
- "learning_rate": 1.6801442951067668e-05,
111
- "loss": 1.7289,
112
- "step": 137488
113
  },
114
  {
115
  "epoch": 8.0,
116
- "eval_loss": 1.5861868858337402,
117
- "eval_runtime": 59.6521,
118
- "eval_samples_per_second": 1151.929,
119
- "eval_steps_per_second": 36.009,
120
- "step": 137496
121
  },
122
  {
123
  "epoch": 9.0,
124
- "learning_rate": 1.6401652411706525e-05,
125
- "loss": 1.7051,
126
- "step": 154674
127
- },
128
- {
129
- "epoch": 9.0,
130
- "eval_loss": 1.5660465955734253,
131
- "eval_runtime": 55.3868,
132
- "eval_samples_per_second": 1240.638,
133
- "eval_steps_per_second": 38.782,
134
- "step": 154683
135
- },
136
- {
137
- "epoch": 10.0,
138
- "learning_rate": 1.6001838598941063e-05,
139
- "loss": 1.6828,
140
- "step": 171860
141
  },
142
  {
143
  "epoch": 10.0,
144
- "eval_loss": 1.551895022392273,
145
- "eval_runtime": 56.8329,
146
- "eval_samples_per_second": 1209.07,
147
- "eval_steps_per_second": 37.795,
148
- "step": 171870
149
- },
150
- {
151
- "epoch": 11.0,
152
- "learning_rate": 1.5602048059579917e-05,
153
- "loss": 1.666,
154
- "step": 189046
155
  },
156
  {
157
  "epoch": 11.0,
158
- "eval_loss": 1.5311487913131714,
159
- "eval_runtime": 55.1299,
160
- "eval_samples_per_second": 1246.419,
161
- "eval_steps_per_second": 38.962,
162
- "step": 189057
163
  },
164
  {
165
  "epoch": 12.0,
166
- "learning_rate": 1.5202234246814453e-05,
167
- "loss": 1.65,
168
- "step": 206232
169
- },
170
- {
171
- "epoch": 12.0,
172
- "eval_loss": 1.521924376487732,
173
- "eval_runtime": 54.0055,
174
- "eval_samples_per_second": 1272.369,
175
- "eval_steps_per_second": 39.774,
176
- "step": 206244
177
  },
178
  {
179
  "epoch": 13.0,
180
- "learning_rate": 1.4802443707453309e-05,
181
- "loss": 1.6347,
182
- "step": 223418
183
- },
184
- {
185
- "epoch": 13.0,
186
- "eval_loss": 1.5064960718154907,
187
- "eval_runtime": 53.5728,
188
- "eval_samples_per_second": 1282.647,
189
- "eval_steps_per_second": 40.095,
190
- "step": 223431
191
- },
192
- {
193
- "epoch": 14.0,
194
- "learning_rate": 1.4402629894687847e-05,
195
- "loss": 1.6208,
196
- "step": 240604
197
  },
198
  {
199
  "epoch": 14.0,
200
- "eval_loss": 1.4912291765213013,
201
- "eval_runtime": 55.9464,
202
- "eval_samples_per_second": 1228.23,
203
- "eval_steps_per_second": 38.394,
204
- "step": 240618
205
  },
206
  {
207
  "epoch": 15.0,
208
- "learning_rate": 1.4002816081922384e-05,
209
- "loss": 1.6095,
210
- "step": 257790
211
- },
212
- {
213
- "epoch": 15.0,
214
- "eval_loss": 1.4807928800582886,
215
- "eval_runtime": 57.4004,
216
- "eval_samples_per_second": 1197.116,
217
- "eval_steps_per_second": 37.421,
218
- "step": 257805
219
- },
220
- {
221
- "epoch": 16.0,
222
- "learning_rate": 1.3603002269156922e-05,
223
- "loss": 1.6,
224
- "step": 274976
225
  },
226
  {
227
  "epoch": 16.0,
228
- "eval_loss": 1.4743348360061646,
229
- "eval_runtime": 56.4585,
230
- "eval_samples_per_second": 1217.089,
231
- "eval_steps_per_second": 38.046,
232
- "step": 274992
233
- },
234
- {
235
- "epoch": 17.0,
236
- "learning_rate": 1.3203235003200094e-05,
237
- "loss": 1.5898,
238
- "step": 292162
239
  },
240
  {
241
  "epoch": 17.0,
242
- "eval_loss": 1.460829734802246,
243
- "eval_runtime": 55.6267,
244
- "eval_samples_per_second": 1235.289,
245
- "eval_steps_per_second": 38.615,
246
- "step": 292179
247
  },
248
  {
249
  "epoch": 18.0,
250
- "learning_rate": 1.2803397917030315e-05,
251
- "loss": 1.5802,
252
- "step": 309348
253
- },
254
- {
255
- "epoch": 18.0,
256
- "eval_loss": 1.455512523651123,
257
- "eval_runtime": 55.4831,
258
- "eval_samples_per_second": 1238.486,
259
- "eval_steps_per_second": 38.715,
260
- "step": 309366
261
- },
262
- {
263
- "epoch": 19.0,
264
- "learning_rate": 1.2403584104264852e-05,
265
- "loss": 1.5711,
266
- "step": 326534
267
  },
268
  {
269
  "epoch": 19.0,
270
- "eval_loss": 1.4483146667480469,
271
- "eval_runtime": 55.2066,
272
- "eval_samples_per_second": 1244.689,
273
- "eval_steps_per_second": 38.908,
274
- "step": 326553
275
- },
276
- {
277
- "epoch": 20.0,
278
- "learning_rate": 1.2003793564903707e-05,
279
- "loss": 1.5638,
280
- "step": 343720
281
  },
282
  {
283
  "epoch": 20.0,
284
- "eval_loss": 1.4421828985214233,
285
- "eval_runtime": 53.3823,
286
- "eval_samples_per_second": 1287.225,
287
- "eval_steps_per_second": 40.238,
288
- "step": 343740
289
  },
290
  {
291
  "epoch": 21.0,
292
- "learning_rate": 1.1603956478733929e-05,
293
- "loss": 1.5577,
294
- "step": 360906
295
- },
296
- {
297
- "epoch": 21.0,
298
- "eval_loss": 1.4301999807357788,
299
- "eval_runtime": 52.0928,
300
- "eval_samples_per_second": 1319.088,
301
- "eval_steps_per_second": 41.234,
302
- "step": 360927
303
- },
304
- {
305
- "epoch": 22.0,
306
- "learning_rate": 1.1204165939372782e-05,
307
- "loss": 1.5494,
308
- "step": 378092
309
  },
310
  {
311
  "epoch": 22.0,
312
- "eval_loss": 1.4282857179641724,
313
- "eval_runtime": 51.7394,
314
- "eval_samples_per_second": 1328.098,
315
- "eval_steps_per_second": 41.516,
316
- "step": 378114
317
- },
318
- {
319
- "epoch": 23.0,
320
- "learning_rate": 1.0804328853203004e-05,
321
- "loss": 1.5437,
322
- "step": 395278
323
  },
324
  {
325
  "epoch": 23.0,
326
- "eval_loss": 1.4197551012039185,
327
- "eval_runtime": 56.4255,
328
- "eval_samples_per_second": 1217.801,
329
- "eval_steps_per_second": 38.068,
330
- "step": 395301
331
  },
332
  {
333
  "epoch": 24.0,
334
- "learning_rate": 1.0404515040437542e-05,
335
- "loss": 1.5377,
336
- "step": 412464
337
- },
338
- {
339
- "epoch": 24.0,
340
- "eval_loss": 1.4187239408493042,
341
- "eval_runtime": 57.0967,
342
- "eval_samples_per_second": 1203.485,
343
- "eval_steps_per_second": 37.62,
344
- "step": 412488
345
  },
346
  {
347
  "epoch": 25.0,
348
- "learning_rate": 1.0004701227672077e-05,
349
- "loss": 1.532,
350
- "step": 429650
351
- },
352
- {
353
- "epoch": 25.0,
354
- "eval_loss": 1.407697319984436,
355
- "eval_runtime": 57.5147,
356
- "eval_samples_per_second": 1194.737,
357
- "eval_steps_per_second": 37.347,
358
- "step": 429675
359
- },
360
- {
361
- "epoch": 26.0,
362
- "learning_rate": 9.604887414906617e-06,
363
- "loss": 1.5259,
364
- "step": 446836
365
  },
366
  {
367
  "epoch": 26.0,
368
- "eval_loss": 1.406160593032837,
369
- "eval_runtime": 57.387,
370
- "eval_samples_per_second": 1197.396,
371
- "eval_steps_per_second": 37.43,
372
- "step": 446862
373
  },
374
  {
375
  "epoch": 27.0,
376
- "learning_rate": 9.205073602141154e-06,
377
- "loss": 1.5202,
378
- "step": 464022
379
- },
380
- {
381
- "epoch": 27.0,
382
- "eval_loss": 1.4006047248840332,
383
- "eval_runtime": 57.8577,
384
- "eval_samples_per_second": 1187.656,
385
- "eval_steps_per_second": 37.126,
386
- "step": 464049
387
- },
388
- {
389
- "epoch": 28.0,
390
- "learning_rate": 8.805283062780009e-06,
391
- "loss": 1.5165,
392
- "step": 481208
393
  },
394
  {
395
  "epoch": 28.0,
396
- "eval_loss": 1.401613473892212,
397
- "eval_runtime": 57.2222,
398
- "eval_samples_per_second": 1200.846,
399
- "eval_steps_per_second": 37.538,
400
- "step": 481236
401
- },
402
- {
403
- "epoch": 29.0,
404
- "learning_rate": 8.405469250014546e-06,
405
- "loss": 1.5118,
406
- "step": 498394
407
  },
408
  {
409
  "epoch": 29.0,
410
- "eval_loss": 1.3900607824325562,
411
- "eval_runtime": 57.1887,
412
- "eval_samples_per_second": 1201.549,
413
- "eval_steps_per_second": 37.56,
414
- "step": 498423
415
  },
416
  {
417
  "epoch": 30.0,
418
- "learning_rate": 8.005678710653402e-06,
419
- "loss": 1.5079,
420
- "step": 515580
421
- },
422
- {
423
- "epoch": 30.0,
424
- "eval_loss": 1.3925185203552246,
425
- "eval_runtime": 57.0206,
426
- "eval_samples_per_second": 1205.091,
427
- "eval_steps_per_second": 37.671,
428
- "step": 515610
429
- },
430
- {
431
- "epoch": 31.0,
432
- "learning_rate": 7.605864897887939e-06,
433
- "loss": 1.5037,
434
- "step": 532766
435
  },
436
  {
437
  "epoch": 31.0,
438
- "eval_loss": 1.3832355737686157,
439
- "eval_runtime": 57.7842,
440
- "eval_samples_per_second": 1189.166,
441
- "eval_steps_per_second": 37.173,
442
- "step": 532797
443
- },
444
- {
445
- "epoch": 32.0,
446
- "learning_rate": 7.206027811718159e-06,
447
- "loss": 1.4998,
448
- "step": 549952
449
  },
450
  {
451
  "epoch": 32.0,
452
- "eval_loss": 1.3857349157333374,
453
- "eval_runtime": 57.6923,
454
- "eval_samples_per_second": 1191.061,
455
- "eval_steps_per_second": 37.232,
456
- "step": 549984
457
  },
458
  {
459
  "epoch": 33.0,
460
- "learning_rate": 6.806213998952697e-06,
461
- "loss": 1.4953,
462
- "step": 567138
463
- },
464
- {
465
- "epoch": 33.0,
466
- "eval_loss": 1.3751457929611206,
467
- "eval_runtime": 56.0689,
468
- "eval_samples_per_second": 1225.545,
469
- "eval_steps_per_second": 38.31,
470
- "step": 567171
471
- },
472
- {
473
- "epoch": 34.0,
474
- "learning_rate": 6.406423459591552e-06,
475
- "loss": 1.4924,
476
- "step": 584324
477
  },
478
  {
479
  "epoch": 34.0,
480
- "eval_loss": 1.3691054582595825,
481
- "eval_runtime": 57.9405,
482
- "eval_samples_per_second": 1185.957,
483
- "eval_steps_per_second": 37.072,
484
- "step": 584358
485
- },
486
- {
487
- "epoch": 35.0,
488
- "learning_rate": 6.006586373421773e-06,
489
- "loss": 1.489,
490
- "step": 601510
491
  },
492
  {
493
  "epoch": 35.0,
494
- "eval_loss": 1.3761118650436401,
495
- "eval_runtime": 57.9115,
496
- "eval_samples_per_second": 1186.551,
497
- "eval_steps_per_second": 37.091,
498
- "step": 601545
499
  },
500
  {
501
  "epoch": 36.0,
502
- "learning_rate": 5.606749287251994e-06,
503
- "loss": 1.4857,
504
- "step": 618696
505
- },
506
- {
507
- "epoch": 36.0,
508
- "eval_loss": 1.3728961944580078,
509
- "eval_runtime": 57.678,
510
- "eval_samples_per_second": 1191.356,
511
- "eval_steps_per_second": 37.241,
512
- "step": 618732
513
- },
514
- {
515
- "epoch": 37.0,
516
- "learning_rate": 5.2069354744865304e-06,
517
- "loss": 1.4836,
518
- "step": 635882
519
  },
520
  {
521
  "epoch": 37.0,
522
- "eval_loss": 1.3688822984695435,
523
- "eval_runtime": 57.411,
524
- "eval_samples_per_second": 1196.896,
525
- "eval_steps_per_second": 37.414,
526
- "step": 635919
527
- },
528
- {
529
- "epoch": 38.0,
530
- "learning_rate": 4.807121661721069e-06,
531
- "loss": 1.4814,
532
- "step": 653068
533
  },
534
  {
535
  "epoch": 38.0,
536
- "eval_loss": 1.3645319938659668,
537
- "eval_runtime": 57.5288,
538
- "eval_samples_per_second": 1194.446,
539
- "eval_steps_per_second": 37.338,
540
- "step": 653106
541
  },
542
  {
543
  "epoch": 39.0,
544
- "learning_rate": 4.407284575551289e-06,
545
- "loss": 1.4804,
546
- "step": 670254
547
- },
548
- {
549
- "epoch": 39.0,
550
- "eval_loss": 1.3667703866958618,
551
- "eval_runtime": 57.9227,
552
- "eval_samples_per_second": 1186.322,
553
- "eval_steps_per_second": 37.084,
554
- "step": 670293
555
- },
556
- {
557
- "epoch": 40.0,
558
- "learning_rate": 4.007447489381509e-06,
559
- "loss": 1.4773,
560
- "step": 687440
561
  },
562
  {
563
  "epoch": 40.0,
564
- "eval_loss": 1.3591846227645874,
565
- "eval_runtime": 57.728,
566
- "eval_samples_per_second": 1190.324,
567
- "eval_steps_per_second": 37.209,
568
- "step": 687480
569
  },
570
  {
571
- "epoch": 41.0,
572
- "learning_rate": 3.6076569500203645e-06,
573
- "loss": 1.4735,
574
- "step": 704626
575
  },
576
  {
577
  "epoch": 41.0,
578
- "eval_loss": 1.3576593399047852,
579
- "eval_runtime": 57.8776,
580
- "eval_samples_per_second": 1187.247,
581
- "eval_steps_per_second": 37.113,
582
- "step": 704667
583
  },
584
  {
585
  "epoch": 42.0,
586
- "learning_rate": 3.2078664106592196e-06,
587
- "loss": 1.473,
588
- "step": 721812
589
- },
590
- {
591
- "epoch": 42.0,
592
- "eval_loss": 1.3613665103912354,
593
- "eval_runtime": 57.6832,
594
- "eval_samples_per_second": 1191.247,
595
- "eval_steps_per_second": 37.238,
596
- "step": 721854
597
- },
598
- {
599
- "epoch": 43.0,
600
- "learning_rate": 2.8080758712980744e-06,
601
- "loss": 1.4715,
602
- "step": 738998
603
  },
604
  {
605
  "epoch": 43.0,
606
- "eval_loss": 1.3575751781463623,
607
- "eval_runtime": 58.5415,
608
- "eval_samples_per_second": 1173.783,
609
- "eval_steps_per_second": 36.692,
610
- "step": 739041
611
  },
612
  {
613
  "epoch": 44.0,
614
- "learning_rate": 2.4082620585326123e-06,
615
- "loss": 1.4696,
616
- "step": 756184
617
- },
618
- {
619
- "epoch": 44.0,
620
- "eval_loss": 1.3553742170333862,
621
- "eval_runtime": 57.9841,
622
- "eval_samples_per_second": 1185.066,
623
- "eval_steps_per_second": 37.045,
624
- "step": 756228
625
  },
626
  {
627
  "epoch": 45.0,
628
- "learning_rate": 2.00844824576715e-06,
629
- "loss": 1.4681,
630
- "step": 773370
631
- },
632
- {
633
- "epoch": 45.0,
634
- "eval_loss": 1.3520816564559937,
635
- "eval_runtime": 57.2264,
636
- "eval_samples_per_second": 1200.757,
637
- "eval_steps_per_second": 37.535,
638
- "step": 773415
639
- },
640
- {
641
- "epoch": 46.0,
642
- "learning_rate": 1.6086111595973703e-06,
643
- "loss": 1.4667,
644
- "step": 790556
645
  },
646
  {
647
  "epoch": 46.0,
648
- "eval_loss": 1.3518463373184204,
649
- "eval_runtime": 57.2812,
650
- "eval_samples_per_second": 1199.609,
651
- "eval_steps_per_second": 37.499,
652
- "step": 790602
653
  },
654
  {
655
  "epoch": 47.0,
656
- "learning_rate": 1.2088206202362252e-06,
657
- "loss": 1.465,
658
- "step": 807742
659
- },
660
- {
661
- "epoch": 47.0,
662
- "eval_loss": 1.353028416633606,
663
- "eval_runtime": 57.8145,
664
- "eval_samples_per_second": 1188.543,
665
- "eval_steps_per_second": 37.153,
666
- "step": 807789
667
  },
668
  {
669
  "epoch": 48.0,
670
- "learning_rate": 8.090300808750801e-07,
671
- "loss": 1.4637,
672
- "step": 824928
673
- },
674
- {
675
- "epoch": 48.0,
676
- "eval_loss": 1.3512929677963257,
677
- "eval_runtime": 57.6297,
678
- "eval_samples_per_second": 1192.354,
679
- "eval_steps_per_second": 37.272,
680
- "step": 824976
681
- },
682
- {
683
- "epoch": 49.0,
684
- "learning_rate": 4.092162681096178e-07,
685
- "loss": 1.462,
686
- "step": 842114
687
  },
688
  {
689
  "epoch": 49.0,
690
- "eval_loss": 1.3500477075576782,
691
- "eval_runtime": 58.0134,
692
- "eval_samples_per_second": 1184.467,
693
- "eval_steps_per_second": 37.026,
694
- "step": 842163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
695
  }
696
  ],
697
- "max_steps": 859350,
698
- "num_train_epochs": 50,
699
- "total_flos": 1.9815098644202803e+17,
700
  "trial_name": null,
701
  "trial_params": null
702
  }
 
1
  {
2
+ "best_metric": 2.190300464630127,
3
+ "best_model_checkpoint": "./output_c/checkpoint-615",
4
+ "epoch": 59.0,
5
+ "global_step": 885,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 1.0,
12
+ "eval_loss": 3.077622652053833,
13
+ "eval_runtime": 0.2951,
14
+ "eval_samples_per_second": 494.804,
15
+ "eval_steps_per_second": 16.945,
16
+ "step": 15
 
 
 
 
 
 
 
 
 
 
 
 
17
  },
18
  {
19
  "epoch": 2.0,
20
+ "eval_loss": 2.9188127517700195,
21
+ "eval_runtime": 0.3714,
22
+ "eval_samples_per_second": 393.09,
23
+ "eval_steps_per_second": 13.462,
24
+ "step": 30
25
  },
26
  {
27
  "epoch": 3.0,
28
+ "eval_loss": 2.8426482677459717,
29
+ "eval_runtime": 0.2609,
30
+ "eval_samples_per_second": 559.557,
31
+ "eval_steps_per_second": 19.163,
32
+ "step": 45
 
 
 
 
 
 
33
  },
34
  {
35
  "epoch": 4.0,
36
+ "eval_loss": 2.6221985816955566,
37
+ "eval_runtime": 0.2726,
38
+ "eval_samples_per_second": 535.638,
39
+ "eval_steps_per_second": 18.344,
40
+ "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
41
  },
42
  {
43
  "epoch": 5.0,
44
+ "eval_loss": 2.6243655681610107,
45
+ "eval_runtime": 0.2669,
46
+ "eval_samples_per_second": 546.955,
47
+ "eval_steps_per_second": 18.731,
48
+ "step": 75
49
  },
50
  {
51
  "epoch": 6.0,
52
+ "eval_loss": 2.6885011196136475,
53
+ "eval_runtime": 0.2608,
54
+ "eval_samples_per_second": 559.912,
55
+ "eval_steps_per_second": 19.175,
56
+ "step": 90
 
 
 
 
 
 
57
  },
58
  {
59
  "epoch": 7.0,
60
+ "eval_loss": 2.4477977752685547,
61
+ "eval_runtime": 0.266,
62
+ "eval_samples_per_second": 548.901,
63
+ "eval_steps_per_second": 18.798,
64
+ "step": 105
 
 
 
 
 
 
 
 
 
 
 
 
65
  },
66
  {
67
  "epoch": 8.0,
68
+ "eval_loss": 2.588456153869629,
69
+ "eval_runtime": 0.3103,
70
+ "eval_samples_per_second": 470.587,
71
+ "eval_steps_per_second": 16.116,
72
+ "step": 120
73
  },
74
  {
75
  "epoch": 9.0,
76
+ "eval_loss": 2.4026825428009033,
77
+ "eval_runtime": 0.2649,
78
+ "eval_samples_per_second": 551.053,
79
+ "eval_steps_per_second": 18.872,
80
+ "step": 135
 
 
 
 
 
 
 
 
 
 
 
 
81
  },
82
  {
83
  "epoch": 10.0,
84
+ "eval_loss": 2.3897533416748047,
85
+ "eval_runtime": 0.3189,
86
+ "eval_samples_per_second": 457.83,
87
+ "eval_steps_per_second": 15.679,
88
+ "step": 150
 
 
 
 
 
 
89
  },
90
  {
91
  "epoch": 11.0,
92
+ "eval_loss": 2.473085641860962,
93
+ "eval_runtime": 0.3415,
94
+ "eval_samples_per_second": 427.488,
95
+ "eval_steps_per_second": 14.64,
96
+ "step": 165
97
  },
98
  {
99
  "epoch": 12.0,
100
+ "eval_loss": 2.448983907699585,
101
+ "eval_runtime": 0.2684,
102
+ "eval_samples_per_second": 543.977,
103
+ "eval_steps_per_second": 18.629,
104
+ "step": 180
 
 
 
 
 
 
105
  },
106
  {
107
  "epoch": 13.0,
108
+ "eval_loss": 2.3829126358032227,
109
+ "eval_runtime": 0.2785,
110
+ "eval_samples_per_second": 524.252,
111
+ "eval_steps_per_second": 17.954,
112
+ "step": 195
 
 
 
 
 
 
 
 
 
 
 
 
113
  },
114
  {
115
  "epoch": 14.0,
116
+ "eval_loss": 2.501011610031128,
117
+ "eval_runtime": 0.2671,
118
+ "eval_samples_per_second": 546.582,
119
+ "eval_steps_per_second": 18.719,
120
+ "step": 210
121
  },
122
  {
123
  "epoch": 15.0,
124
+ "eval_loss": 2.427177906036377,
125
+ "eval_runtime": 0.2718,
126
+ "eval_samples_per_second": 537.134,
127
+ "eval_steps_per_second": 18.395,
128
+ "step": 225
 
 
 
 
 
 
 
 
 
 
 
 
129
  },
130
  {
131
  "epoch": 16.0,
132
+ "eval_loss": 2.421048879623413,
133
+ "eval_runtime": 0.2742,
134
+ "eval_samples_per_second": 532.533,
135
+ "eval_steps_per_second": 18.237,
136
+ "step": 240
 
 
 
 
 
 
137
  },
138
  {
139
  "epoch": 17.0,
140
+ "eval_loss": 2.342533826828003,
141
+ "eval_runtime": 0.2891,
142
+ "eval_samples_per_second": 505.042,
143
+ "eval_steps_per_second": 17.296,
144
+ "step": 255
145
  },
146
  {
147
  "epoch": 18.0,
148
+ "eval_loss": 2.398080348968506,
149
+ "eval_runtime": 0.2794,
150
+ "eval_samples_per_second": 522.517,
151
+ "eval_steps_per_second": 17.894,
152
+ "step": 270
 
 
 
 
 
 
 
 
 
 
 
 
153
  },
154
  {
155
  "epoch": 19.0,
156
+ "eval_loss": 2.3011465072631836,
157
+ "eval_runtime": 0.2774,
158
+ "eval_samples_per_second": 526.407,
159
+ "eval_steps_per_second": 18.028,
160
+ "step": 285
 
 
 
 
 
 
161
  },
162
  {
163
  "epoch": 20.0,
164
+ "eval_loss": 2.5109691619873047,
165
+ "eval_runtime": 0.2617,
166
+ "eval_samples_per_second": 557.964,
167
+ "eval_steps_per_second": 19.108,
168
+ "step": 300
169
  },
170
  {
171
  "epoch": 21.0,
172
+ "eval_loss": 2.381415843963623,
173
+ "eval_runtime": 0.2801,
174
+ "eval_samples_per_second": 521.25,
175
+ "eval_steps_per_second": 17.851,
176
+ "step": 315
 
 
 
 
 
 
 
 
 
 
 
 
177
  },
178
  {
179
  "epoch": 22.0,
180
+ "eval_loss": 2.3828046321868896,
181
+ "eval_runtime": 0.2743,
182
+ "eval_samples_per_second": 532.299,
183
+ "eval_steps_per_second": 18.229,
184
+ "step": 330
 
 
 
 
 
 
185
  },
186
  {
187
  "epoch": 23.0,
188
+ "eval_loss": 2.353680372238159,
189
+ "eval_runtime": 0.2779,
190
+ "eval_samples_per_second": 525.432,
191
+ "eval_steps_per_second": 17.994,
192
+ "step": 345
193
  },
194
  {
195
  "epoch": 24.0,
196
+ "eval_loss": 2.29482364654541,
197
+ "eval_runtime": 0.2832,
198
+ "eval_samples_per_second": 515.574,
199
+ "eval_steps_per_second": 17.657,
200
+ "step": 360
 
 
 
 
 
 
201
  },
202
  {
203
  "epoch": 25.0,
204
+ "eval_loss": 2.3079590797424316,
205
+ "eval_runtime": 0.3243,
206
+ "eval_samples_per_second": 450.187,
207
+ "eval_steps_per_second": 15.417,
208
+ "step": 375
 
 
 
 
 
 
 
 
 
 
 
 
209
  },
210
  {
211
  "epoch": 26.0,
212
+ "eval_loss": 2.379464864730835,
213
+ "eval_runtime": 0.2934,
214
+ "eval_samples_per_second": 497.692,
215
+ "eval_steps_per_second": 17.044,
216
+ "step": 390
217
  },
218
  {
219
  "epoch": 27.0,
220
+ "eval_loss": 2.37467885017395,
221
+ "eval_runtime": 0.2869,
222
+ "eval_samples_per_second": 508.855,
223
+ "eval_steps_per_second": 17.427,
224
+ "step": 405
 
 
 
 
 
 
 
 
 
 
 
 
225
  },
226
  {
227
  "epoch": 28.0,
228
+ "eval_loss": 2.3926637172698975,
229
+ "eval_runtime": 0.2792,
230
+ "eval_samples_per_second": 523.011,
231
+ "eval_steps_per_second": 17.911,
232
+ "step": 420
 
 
 
 
 
 
233
  },
234
  {
235
  "epoch": 29.0,
236
+ "eval_loss": 2.2542331218719482,
237
+ "eval_runtime": 0.2705,
238
+ "eval_samples_per_second": 539.84,
239
+ "eval_steps_per_second": 18.488,
240
+ "step": 435
241
  },
242
  {
243
  "epoch": 30.0,
244
+ "eval_loss": 2.312037944793701,
245
+ "eval_runtime": 0.2823,
246
+ "eval_samples_per_second": 517.14,
247
+ "eval_steps_per_second": 17.71,
248
+ "step": 450
 
 
 
 
 
 
 
 
 
 
 
 
249
  },
250
  {
251
  "epoch": 31.0,
252
+ "eval_loss": 2.2595930099487305,
253
+ "eval_runtime": 0.2709,
254
+ "eval_samples_per_second": 538.869,
255
+ "eval_steps_per_second": 18.454,
256
+ "step": 465
 
 
 
 
 
 
257
  },
258
  {
259
  "epoch": 32.0,
260
+ "eval_loss": 2.3319013118743896,
261
+ "eval_runtime": 0.2718,
262
+ "eval_samples_per_second": 537.135,
263
+ "eval_steps_per_second": 18.395,
264
+ "step": 480
265
  },
266
  {
267
  "epoch": 33.0,
268
+ "eval_loss": 2.321133852005005,
269
+ "eval_runtime": 0.3056,
270
+ "eval_samples_per_second": 477.789,
271
+ "eval_steps_per_second": 16.363,
272
+ "step": 495
 
 
 
 
 
 
 
 
 
 
 
 
273
  },
274
  {
275
  "epoch": 34.0,
276
+ "eval_loss": 2.3662209510803223,
277
+ "eval_runtime": 0.2727,
278
+ "eval_samples_per_second": 535.325,
279
+ "eval_steps_per_second": 18.333,
280
+ "step": 510
 
 
 
 
 
 
281
  },
282
  {
283
  "epoch": 35.0,
284
+ "eval_loss": 2.3607561588287354,
285
+ "eval_runtime": 0.2769,
286
+ "eval_samples_per_second": 527.269,
287
+ "eval_steps_per_second": 18.057,
288
+ "step": 525
289
  },
290
  {
291
  "epoch": 36.0,
292
+ "eval_loss": 2.2733652591705322,
293
+ "eval_runtime": 0.3144,
294
+ "eval_samples_per_second": 464.449,
295
+ "eval_steps_per_second": 15.906,
296
+ "step": 540
 
 
 
 
 
 
 
 
 
 
 
 
297
  },
298
  {
299
  "epoch": 37.0,
300
+ "eval_loss": 2.332275390625,
301
+ "eval_runtime": 0.2823,
302
+ "eval_samples_per_second": 517.096,
303
+ "eval_steps_per_second": 17.709,
304
+ "step": 555
 
 
 
 
 
 
305
  },
306
  {
307
  "epoch": 38.0,
308
+ "eval_loss": 2.3226001262664795,
309
+ "eval_runtime": 0.2722,
310
+ "eval_samples_per_second": 536.297,
311
+ "eval_steps_per_second": 18.366,
312
+ "step": 570
313
  },
314
  {
315
  "epoch": 39.0,
316
+ "eval_loss": 2.2499899864196777,
317
+ "eval_runtime": 0.2823,
318
+ "eval_samples_per_second": 517.269,
319
+ "eval_steps_per_second": 17.715,
320
+ "step": 585
 
 
 
 
 
 
 
 
 
 
 
 
321
  },
322
  {
323
  "epoch": 40.0,
324
+ "eval_loss": 2.3148353099823,
325
+ "eval_runtime": 0.2735,
326
+ "eval_samples_per_second": 533.755,
327
+ "eval_steps_per_second": 18.279,
328
+ "step": 600
329
  },
330
  {
331
+ "epoch": 40.73,
332
+ "learning_rate": 6.444444444444445e-06,
333
+ "loss": 2.4323,
334
+ "step": 611
335
  },
336
  {
337
  "epoch": 41.0,
338
+ "eval_loss": 2.190300464630127,
339
+ "eval_runtime": 0.279,
340
+ "eval_samples_per_second": 523.306,
341
+ "eval_steps_per_second": 17.921,
342
+ "step": 615
343
  },
344
  {
345
  "epoch": 42.0,
346
+ "eval_loss": 2.2688183784484863,
347
+ "eval_runtime": 0.2786,
348
+ "eval_samples_per_second": 524.054,
349
+ "eval_steps_per_second": 17.947,
350
+ "step": 630
 
 
 
 
 
 
 
 
 
 
 
 
351
  },
352
  {
353
  "epoch": 43.0,
354
+ "eval_loss": 2.3206570148468018,
355
+ "eval_runtime": 0.3166,
356
+ "eval_samples_per_second": 461.08,
357
+ "eval_steps_per_second": 15.79,
358
+ "step": 645
359
  },
360
  {
361
  "epoch": 44.0,
362
+ "eval_loss": 2.398860454559326,
363
+ "eval_runtime": 0.2802,
364
+ "eval_samples_per_second": 521.028,
365
+ "eval_steps_per_second": 17.843,
366
+ "step": 660
 
 
 
 
 
 
367
  },
368
  {
369
  "epoch": 45.0,
370
+ "eval_loss": 2.329181432723999,
371
+ "eval_runtime": 0.2845,
372
+ "eval_samples_per_second": 513.222,
373
+ "eval_steps_per_second": 17.576,
374
+ "step": 675
 
 
 
 
 
 
 
 
 
 
 
 
375
  },
376
  {
377
  "epoch": 46.0,
378
+ "eval_loss": 2.301910877227783,
379
+ "eval_runtime": 0.2825,
380
+ "eval_samples_per_second": 516.788,
381
+ "eval_steps_per_second": 17.698,
382
+ "step": 690
383
  },
384
  {
385
  "epoch": 47.0,
386
+ "eval_loss": 2.286062002182007,
387
+ "eval_runtime": 0.2774,
388
+ "eval_samples_per_second": 526.332,
389
+ "eval_steps_per_second": 18.025,
390
+ "step": 705
 
 
 
 
 
 
391
  },
392
  {
393
  "epoch": 48.0,
394
+ "eval_loss": 2.2627930641174316,
395
+ "eval_runtime": 0.3491,
396
+ "eval_samples_per_second": 418.174,
397
+ "eval_steps_per_second": 14.321,
398
+ "step": 720
 
 
 
 
 
 
 
 
 
 
 
 
399
  },
400
  {
401
  "epoch": 49.0,
402
+ "eval_loss": 2.3683576583862305,
403
+ "eval_runtime": 0.2828,
404
+ "eval_samples_per_second": 516.226,
405
+ "eval_steps_per_second": 17.679,
406
+ "step": 735
407
+ },
408
+ {
409
+ "epoch": 50.0,
410
+ "eval_loss": 2.3841121196746826,
411
+ "eval_runtime": 0.2738,
412
+ "eval_samples_per_second": 533.204,
413
+ "eval_steps_per_second": 18.26,
414
+ "step": 750
415
+ },
416
+ {
417
+ "epoch": 51.0,
418
+ "eval_loss": 2.3427212238311768,
419
+ "eval_runtime": 0.2841,
420
+ "eval_samples_per_second": 513.874,
421
+ "eval_steps_per_second": 17.598,
422
+ "step": 765
423
+ },
424
+ {
425
+ "epoch": 52.0,
426
+ "eval_loss": 2.3786392211914062,
427
+ "eval_runtime": 0.2882,
428
+ "eval_samples_per_second": 506.555,
429
+ "eval_steps_per_second": 17.348,
430
+ "step": 780
431
+ },
432
+ {
433
+ "epoch": 53.0,
434
+ "eval_loss": 2.3314857482910156,
435
+ "eval_runtime": 0.3054,
436
+ "eval_samples_per_second": 478.014,
437
+ "eval_steps_per_second": 16.37,
438
+ "step": 795
439
+ },
440
+ {
441
+ "epoch": 54.0,
442
+ "eval_loss": 2.4228127002716064,
443
+ "eval_runtime": 0.2752,
444
+ "eval_samples_per_second": 530.586,
445
+ "eval_steps_per_second": 18.171,
446
+ "step": 810
447
+ },
448
+ {
449
+ "epoch": 55.0,
450
+ "eval_loss": 2.2979846000671387,
451
+ "eval_runtime": 0.2804,
452
+ "eval_samples_per_second": 520.739,
453
+ "eval_steps_per_second": 17.834,
454
+ "step": 825
455
+ },
456
+ {
457
+ "epoch": 56.0,
458
+ "eval_loss": 2.288037061691284,
459
+ "eval_runtime": 0.2706,
460
+ "eval_samples_per_second": 539.536,
461
+ "eval_steps_per_second": 18.477,
462
+ "step": 840
463
+ },
464
+ {
465
+ "epoch": 57.0,
466
+ "eval_loss": 2.375304698944092,
467
+ "eval_runtime": 0.2754,
468
+ "eval_samples_per_second": 530.218,
469
+ "eval_steps_per_second": 18.158,
470
+ "step": 855
471
+ },
472
+ {
473
+ "epoch": 58.0,
474
+ "eval_loss": 2.302351474761963,
475
+ "eval_runtime": 0.2976,
476
+ "eval_samples_per_second": 490.644,
477
+ "eval_steps_per_second": 16.803,
478
+ "step": 870
479
+ },
480
+ {
481
+ "epoch": 59.0,
482
+ "eval_loss": 2.2706165313720703,
483
+ "eval_runtime": 0.3674,
484
+ "eval_samples_per_second": 397.413,
485
+ "eval_steps_per_second": 13.61,
486
+ "step": 885
487
  }
488
  ],
489
+ "max_steps": 900,
490
+ "num_train_epochs": 60,
491
+ "total_flos": 196520659812864.0,
492
  "trial_name": null,
493
  "trial_params": null
494
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:89ca1ca3557d4150cb0d838160870f5330797f6b3f02ab04d02a6520f1b7cfed
3
  size 3643
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c01f1cfb649a682bee068d5207deba93cefefbe24e4ab9cc31a8c66c0b3e205d
3
  size 3643