AlekseyKorshuk commited on
Commit
28aecb0
1 Parent(s): d800da3

huggingartists

Browse files
README.md CHANGED
@@ -45,15 +45,15 @@ from datasets import load_dataset
45
  dataset = load_dataset("huggingartists/eminem")
46
  ```
47
 
48
- [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/ofa47ov0/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Eminem's lyrics.
53
 
54
- Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/11x8tlht) for full transparency and reproducibility.
55
 
56
- At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/11x8tlht/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
 
45
  dataset = load_dataset("huggingartists/eminem")
46
  ```
47
 
48
+ [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/2et9dwlh/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Eminem's lyrics.
53
 
54
+ Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/22e6ddab) for full transparency and reproducibility.
55
 
56
+ At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/22e6ddab/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
evaluation.txt CHANGED
@@ -1 +1 @@
1
- {"eval_loss": 0.31595703959465027, "eval_runtime": 694.9722, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.112, "epoch": 3.0}
 
1
+ {"eval_loss": 0.3614741861820221, "eval_runtime": 29.4359, "eval_samples_per_second": 21.878, "eval_steps_per_second": 2.752, "epoch": 3.0}
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:051a2aa5612a00aefd0743508426504ceff0614f418bc3ffd2fae00bc7e2e01a
3
  size 497764120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d821cc38972b02052486350836768147963aaeb46cf69d3f41e17bfd52a8d71d
3
  size 497764120
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:593f24adf357165ae7c067bfdec9e2e06abf005b482f2df7913d65b234eb4c23
3
- size 995599857
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35bd2c569b3675430dd87749d2e9e88b6c22f4d1277083f1eb323e5a78ccb1b7
3
+ size 995604017
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4f3731cebd57313c0396ec7ec7278cbf2f4370924b703a6aa6ddad82d193a89
3
- size 510401385
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58d89bc75fcb0d32ac9fa6dcb15dddc93183abf0fb0e94b1c0e81fa3acad906e
3
+ size 510403817
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a26923031d40478a72fa6bcd5409ab432e85028c059b7b08da05ecaa1c93850
3
- size 13547
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:049b9bcbcaee102a87fbbdf3ad0f08b27c66519b58ffa7a05152155bdbc8fb60
3
+ size 14503
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2587493ffef7a83ac744fa257014e43dd335fe886fa10856cabfbe630d057f3b
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02117a90c598eeedf1944bd81ff7344fb6927e0e3a13871b9e0ceb3095f3a8d3
3
  size 623
trainer_state.json CHANGED
@@ -1,1138 +1,1124 @@
1
  {
2
- "best_metric": 0.31595703959465027,
3
- "best_model_checkpoint": "output/eminem/checkpoint-916",
4
  "epoch": 2.0,
5
- "global_step": 916,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
- "learning_rate": 9.668898730225426e-06,
13
- "loss": 0.6451,
14
  "step": 5
15
  },
16
  {
17
  "epoch": 0.02,
18
- "learning_rate": 8.494471426523322e-06,
19
- "loss": 0.6405,
20
  "step": 10
21
  },
22
  {
23
  "epoch": 0.03,
24
- "learning_rate": 7.391359230778123e-06,
25
- "loss": 0.6683,
26
  "step": 15
27
  },
28
  {
29
  "epoch": 0.04,
30
- "learning_rate": 6.360870983745512e-06,
31
- "loss": 0.6156,
32
  "step": 20
33
  },
34
  {
35
  "epoch": 0.05,
36
- "learning_rate": 5.404229357984623e-06,
37
- "loss": 0.7032,
38
  "step": 25
39
  },
40
  {
41
  "epoch": 0.07,
42
- "learning_rate": 4.522569407168996e-06,
43
- "loss": 0.6814,
44
  "step": 30
45
  },
46
  {
47
  "epoch": 0.08,
48
- "learning_rate": 3.7169372193369285e-06,
49
- "loss": 0.7428,
50
  "step": 35
51
  },
52
  {
53
  "epoch": 0.09,
54
- "learning_rate": 2.988288675715626e-06,
55
- "loss": 0.6695,
56
  "step": 40
57
  },
58
  {
59
  "epoch": 0.1,
60
- "learning_rate": 2.3374883165699825e-06,
61
- "loss": 0.6716,
62
  "step": 45
63
  },
64
  {
65
  "epoch": 0.11,
66
- "learning_rate": 1.7653083154249965e-06,
67
- "loss": 0.6442,
68
  "step": 50
69
  },
70
  {
71
  "epoch": 0.12,
72
- "learning_rate": 1.2724275628843243e-06,
73
- "loss": 0.7111,
74
  "step": 55
75
  },
76
  {
77
  "epoch": 0.13,
78
- "learning_rate": 8.594308611261225e-07,
79
- "loss": 0.717,
80
  "step": 60
81
  },
82
  {
83
  "epoch": 0.14,
84
- "learning_rate": 5.26808230038689e-07,
85
- "loss": 0.6728,
86
  "step": 65
87
  },
88
  {
89
  "epoch": 0.15,
90
- "learning_rate": 2.7495432581104424e-07,
91
- "loss": 0.5893,
92
  "step": 70
93
  },
94
  {
95
  "epoch": 0.16,
96
- "learning_rate": 1.0416797267454092e-07,
97
- "loss": 0.6088,
98
  "step": 75
99
  },
100
  {
101
  "epoch": 0.18,
102
- "learning_rate": 1.465180834793276e-08,
103
- "loss": 0.6029,
104
  "step": 80
105
  },
106
  {
107
  "epoch": 0.19,
108
- "learning_rate": 6.5120436089167734e-09,
109
- "loss": 0.6099,
110
  "step": 85
111
  },
112
  {
113
  "epoch": 0.2,
114
- "learning_rate": 7.975833627447873e-08,
115
- "loss": 0.5843,
116
  "step": 90
117
  },
118
  {
119
  "epoch": 0.21,
120
- "learning_rate": 2.3430377974243327e-07,
121
- "loss": 0.6587,
122
  "step": 95
123
  },
124
  {
125
  "epoch": 0.22,
126
- "learning_rate": 4.699650061053238e-07,
127
- "loss": 0.5792,
128
  "step": 100
129
  },
130
  {
131
  "epoch": 0.23,
132
- "learning_rate": 7.864624037171852e-07,
133
- "loss": 0.6884,
134
  "step": 105
135
  },
136
  {
137
  "epoch": 0.24,
138
- "learning_rate": 1.1834204489510753e-06,
139
- "loss": 0.6179,
140
  "step": 110
141
  },
142
  {
143
  "epoch": 0.25,
144
- "learning_rate": 1.6603681517579621e-06,
145
- "loss": 0.6588,
146
  "step": 115
147
  },
148
  {
149
  "epoch": 0.26,
150
- "learning_rate": 2.2167396144949356e-06,
151
- "loss": 0.5756,
152
  "step": 120
153
  },
154
  {
155
  "epoch": 0.27,
156
- "learning_rate": 2.851874703363704e-06,
157
- "loss": 0.7218,
158
  "step": 125
159
  },
160
  {
161
  "epoch": 0.29,
162
- "learning_rate": 3.565019831655522e-06,
163
- "loss": 0.6423,
164
  "step": 130
165
  },
166
  {
167
  "epoch": 0.3,
168
- "learning_rate": 4.3553288538822644e-06,
169
- "loss": 0.7181,
170
  "step": 135
171
  },
172
  {
173
  "epoch": 0.31,
174
- "learning_rate": 5.221864069725448e-06,
175
- "loss": 0.7147,
176
  "step": 140
177
  },
178
  {
179
  "epoch": 0.32,
180
- "learning_rate": 6.163597336619036e-06,
181
- "loss": 0.6411,
182
  "step": 145
183
  },
184
  {
185
  "epoch": 0.33,
186
- "learning_rate": 7.179411289634524e-06,
187
- "loss": 0.6499,
188
  "step": 150
189
  },
190
  {
191
  "epoch": 0.34,
192
- "learning_rate": 8.268100667234415e-06,
193
- "loss": 0.6962,
194
  "step": 155
195
  },
196
  {
197
  "epoch": 0.35,
198
- "learning_rate": 9.428373741315982e-06,
199
- "loss": 0.6569,
200
  "step": 160
201
  },
202
  {
203
  "epoch": 0.36,
204
- "learning_rate": 1.0658853849838009e-05,
205
- "loss": 0.6364,
206
  "step": 165
207
  },
208
  {
209
  "epoch": 0.37,
210
- "learning_rate": 1.1958081030236758e-05,
211
- "loss": 0.6697,
212
  "step": 170
213
  },
214
  {
215
  "epoch": 0.38,
216
- "learning_rate": 1.3324513751656602e-05,
217
- "loss": 0.6964,
218
  "step": 175
219
  },
220
  {
221
- "epoch": 0.39,
222
- "learning_rate": 1.47565307439888e-05,
223
- "loss": 0.6235,
224
  "step": 180
225
  },
226
  {
227
  "epoch": 0.41,
228
- "learning_rate": 1.6252432921492454e-05,
229
- "loss": 0.6185,
230
  "step": 185
231
  },
232
  {
233
  "epoch": 0.42,
234
- "learning_rate": 1.7810445398767395e-05,
235
- "loss": 0.7024,
236
  "step": 190
237
  },
238
  {
239
  "epoch": 0.43,
240
- "learning_rate": 1.9428719596644185e-05,
241
- "loss": 0.6126,
242
  "step": 195
243
  },
244
  {
245
  "epoch": 0.44,
246
- "learning_rate": 2.1105335435523642e-05,
247
- "loss": 0.6189,
248
  "step": 200
249
  },
250
  {
251
  "epoch": 0.45,
252
- "learning_rate": 2.283830361355242e-05,
253
- "loss": 0.5719,
254
  "step": 205
255
  },
256
  {
257
  "epoch": 0.46,
258
- "learning_rate": 2.4625567966918e-05,
259
- "loss": 0.7211,
260
  "step": 210
261
  },
262
  {
263
  "epoch": 0.47,
264
- "learning_rate": 2.6465007909489048e-05,
265
- "loss": 0.6735,
266
  "step": 215
267
  },
268
  {
269
  "epoch": 0.48,
270
- "learning_rate": 2.8354440948884326e-05,
271
- "loss": 0.5916,
272
  "step": 220
273
  },
274
  {
275
  "epoch": 0.49,
276
- "learning_rate": 3.0291625276002506e-05,
277
- "loss": 0.5753,
278
  "step": 225
279
  },
280
  {
281
- "epoch": 0.5,
282
- "learning_rate": 3.2274262424915514e-05,
283
- "loss": 0.6077,
284
  "step": 230
285
  },
286
  {
287
  "epoch": 0.52,
288
- "learning_rate": 3.430000000000064e-05,
289
- "loss": 0.7398,
290
  "step": 235
291
  },
292
  {
293
  "epoch": 0.53,
294
- "learning_rate": 3.636643446705075e-05,
295
- "loss": 0.7589,
296
  "step": 240
297
  },
298
  {
299
  "epoch": 0.54,
300
- "learning_rate": 3.847111400507036e-05,
301
- "loss": 0.7287,
302
  "step": 245
303
  },
304
  {
305
  "epoch": 0.55,
306
- "learning_rate": 4.0611541415345134e-05,
307
- "loss": 0.7015,
308
  "step": 250
309
  },
310
  {
311
  "epoch": 0.56,
312
- "learning_rate": 4.2785177084377745e-05,
313
- "loss": 0.6185,
314
  "step": 255
315
  },
316
  {
317
  "epoch": 0.57,
318
- "learning_rate": 4.49894419971228e-05,
319
- "loss": 0.6638,
320
  "step": 260
321
  },
322
  {
323
  "epoch": 0.58,
324
- "learning_rate": 4.7221720796988937e-05,
325
- "loss": 0.6769,
326
  "step": 265
327
  },
328
  {
329
  "epoch": 0.59,
330
- "learning_rate": 4.947936488896066e-05,
331
- "loss": 0.7237,
332
  "step": 270
333
  },
334
  {
335
  "epoch": 0.6,
336
- "learning_rate": 5.1759695582140806e-05,
337
- "loss": 0.7138,
338
  "step": 275
339
  },
340
  {
341
- "epoch": 0.61,
342
- "learning_rate": 5.4060007268020076e-05,
343
- "loss": 0.655,
344
  "step": 280
345
  },
346
  {
347
- "epoch": 0.62,
348
- "learning_rate": 5.637757063067322e-05,
349
- "loss": 0.7932,
350
  "step": 285
351
  },
352
  {
353
  "epoch": 0.64,
354
- "learning_rate": 5.870963588509545e-05,
355
- "loss": 0.6143,
356
  "step": 290
357
  },
358
  {
359
  "epoch": 0.65,
360
- "learning_rate": 6.105343603980619e-05,
361
- "loss": 0.6686,
362
  "step": 295
363
  },
364
  {
365
  "epoch": 0.66,
366
- "learning_rate": 6.34061901798874e-05,
367
- "loss": 0.7298,
368
  "step": 300
369
  },
370
  {
371
  "epoch": 0.67,
372
- "learning_rate": 6.576510676653068e-05,
373
- "loss": 0.6932,
374
  "step": 305
375
  },
376
  {
377
  "epoch": 0.68,
378
- "learning_rate": 6.812738694920095e-05,
379
- "loss": 0.7121,
380
  "step": 310
381
  },
382
  {
383
  "epoch": 0.69,
384
- "learning_rate": 7.049022788645635e-05,
385
- "loss": 0.668,
386
  "step": 315
387
  },
388
  {
389
  "epoch": 0.7,
390
- "learning_rate": 7.285082607152067e-05,
391
- "loss": 0.6626,
392
  "step": 320
393
  },
394
  {
395
  "epoch": 0.71,
396
- "learning_rate": 7.52063806586452e-05,
397
- "loss": 0.7163,
398
  "step": 325
399
  },
400
  {
401
- "epoch": 0.72,
402
- "learning_rate": 7.755409678629386e-05,
403
- "loss": 0.753,
404
  "step": 330
405
  },
406
  {
407
- "epoch": 0.73,
408
- "learning_rate": 7.989118889325919e-05,
409
- "loss": 0.6771,
410
  "step": 335
411
  },
412
  {
413
  "epoch": 0.75,
414
- "learning_rate": 8.221488402370275e-05,
415
- "loss": 0.6505,
416
  "step": 340
417
  },
418
  {
419
  "epoch": 0.76,
420
- "learning_rate": 8.452242511728957e-05,
421
- "loss": 0.6098,
422
  "step": 345
423
  },
424
  {
425
  "epoch": 0.77,
426
- "learning_rate": 8.681107428041253e-05,
427
- "loss": 0.6978,
428
  "step": 350
429
  },
430
  {
431
  "epoch": 0.78,
432
- "learning_rate": 8.907811603471558e-05,
433
- "loss": 0.6606,
434
  "step": 355
435
  },
436
  {
437
  "epoch": 0.79,
438
- "learning_rate": 9.132086053899106e-05,
439
- "loss": 0.788,
440
  "step": 360
441
  },
442
  {
443
  "epoch": 0.8,
444
- "learning_rate": 9.353664678067919e-05,
445
- "loss": 0.7524,
446
  "step": 365
447
  },
448
  {
449
  "epoch": 0.81,
450
- "learning_rate": 9.572284573315139e-05,
451
- "loss": 0.6861,
452
  "step": 370
453
  },
454
  {
455
  "epoch": 0.82,
456
- "learning_rate": 9.787686347505305e-05,
457
- "loss": 0.6812,
458
  "step": 375
459
  },
460
  {
461
- "epoch": 0.83,
462
- "learning_rate": 9.999614426797605e-05,
463
- "loss": 0.6453,
464
  "step": 380
465
  },
466
  {
467
- "epoch": 0.84,
468
- "learning_rate": 0.00010207817358884356,
469
- "loss": 0.7003,
470
  "step": 385
471
  },
472
  {
473
  "epoch": 0.86,
474
- "learning_rate": 0.00010412048111339246,
475
- "loss": 0.714,
476
  "step": 390
477
  },
478
  {
479
  "epoch": 0.87,
480
- "learning_rate": 0.00010612064364719859,
481
- "loss": 0.7008,
482
  "step": 395
483
  },
484
  {
485
  "epoch": 0.88,
486
- "learning_rate": 0.00010807628800079975,
487
- "loss": 0.6857,
488
  "step": 400
489
  },
490
  {
491
  "epoch": 0.89,
492
- "learning_rate": 0.00010998509380547761,
493
- "loss": 0.7609,
494
  "step": 405
495
  },
496
  {
497
  "epoch": 0.9,
498
- "learning_rate": 0.00011184479626637769,
499
- "loss": 0.7942,
500
  "step": 410
501
  },
502
  {
503
  "epoch": 0.91,
504
- "learning_rate": 0.00011365318884967352,
505
- "loss": 0.6518,
506
  "step": 415
507
  },
508
  {
509
  "epoch": 0.92,
510
- "learning_rate": 0.00011540812590062915,
511
- "loss": 0.7253,
512
  "step": 420
513
  },
514
  {
515
  "epoch": 0.93,
516
- "learning_rate": 0.000117107525189397,
517
- "loss": 0.7335,
518
  "step": 425
519
  },
520
  {
521
- "epoch": 0.94,
522
- "learning_rate": 0.00011874937038159925,
523
- "loss": 0.7592,
524
  "step": 430
525
  },
526
  {
527
- "epoch": 0.95,
528
- "learning_rate": 0.00012033171343069444,
529
- "loss": 0.7893,
530
  "step": 435
531
  },
532
  {
533
- "epoch": 0.96,
534
- "learning_rate": 0.00012185267688933072,
535
- "loss": 0.7095,
536
  "step": 440
537
  },
538
  {
539
  "epoch": 0.98,
540
- "learning_rate": 0.00012331045613693775,
541
- "loss": 0.7761,
542
  "step": 445
543
  },
544
  {
545
  "epoch": 0.99,
546
- "learning_rate": 0.0001247033215209036,
547
- "loss": 0.6766,
548
  "step": 450
549
  },
550
  {
551
  "epoch": 1.0,
552
- "learning_rate": 0.00012602962040880915,
553
- "loss": 0.7222,
554
  "step": 455
555
  },
556
  {
557
  "epoch": 1.0,
558
- "eval_loss": 0.41102197766304016,
559
- "eval_runtime": 15.2365,
560
- "eval_samples_per_second": 41.742,
561
- "eval_steps_per_second": 5.251,
562
- "step": 456
563
  },
564
  {
565
- "epoch": 1.02,
566
- "learning_rate": 0.00011710752518939759,
567
- "loss": 0.6694,
568
  "step": 460
569
  },
570
  {
571
- "epoch": 1.03,
572
- "learning_rate": 0.00011539283602861273,
573
- "loss": 0.6712,
574
  "step": 465
575
  },
576
  {
577
- "epoch": 1.04,
578
- "learning_rate": 0.00011362164035023001,
579
- "loss": 0.7083,
580
  "step": 470
581
  },
582
  {
583
- "epoch": 1.05,
584
- "learning_rate": 0.00011179607703080105,
585
- "loss": 0.6899,
586
  "step": 475
587
  },
588
  {
589
- "epoch": 1.06,
590
- "learning_rate": 0.000109918350600647,
591
- "loss": 0.7508,
592
  "step": 480
593
  },
594
  {
595
  "epoch": 1.07,
596
- "learning_rate": 0.0001079907285816916,
597
- "loss": 0.6351,
598
  "step": 485
599
  },
600
  {
601
  "epoch": 1.08,
602
- "learning_rate": 0.00010601553874922655,
603
- "loss": 0.7873,
604
  "step": 490
605
  },
606
  {
607
- "epoch": 1.1,
608
- "learning_rate": 0.0001039951663209155,
609
- "loss": 0.7405,
610
  "step": 495
611
  },
612
  {
613
- "epoch": 1.11,
614
- "learning_rate": 0.00010193205107643125,
615
- "loss": 0.8115,
616
  "step": 500
617
  },
618
  {
619
- "epoch": 1.12,
620
- "learning_rate": 9.982868441120446e-05,
621
- "loss": 0.8309,
622
  "step": 505
623
  },
624
  {
625
- "epoch": 1.13,
626
- "learning_rate": 9.768760632784171e-05,
627
- "loss": 0.7548,
628
  "step": 510
629
  },
630
  {
631
- "epoch": 1.14,
632
- "learning_rate": 9.551140236884618e-05,
633
- "loss": 0.7107,
634
  "step": 515
635
  },
636
  {
637
- "epoch": 1.15,
638
- "learning_rate": 9.330270049434476e-05,
639
- "loss": 0.6035,
640
  "step": 520
641
  },
642
  {
643
- "epoch": 1.16,
644
- "learning_rate": 9.1064167908592e-05,
645
- "loss": 0.724,
646
  "step": 525
647
  },
648
  {
649
- "epoch": 1.17,
650
- "learning_rate": 8.879850783908354e-05,
651
- "loss": 0.6642,
652
  "step": 530
653
  },
654
  {
655
  "epoch": 1.18,
656
- "learning_rate": 8.650845627216803e-05,
657
- "loss": 0.6811,
658
  "step": 535
659
  },
660
  {
661
  "epoch": 1.19,
662
- "learning_rate": 8.41967786490999e-05,
663
- "loss": 0.6505,
664
  "step": 540
665
  },
666
  {
667
- "epoch": 1.21,
668
- "learning_rate": 8.186626652652275e-05,
669
- "loss": 0.7759,
670
  "step": 545
671
  },
672
  {
673
- "epoch": 1.22,
674
- "learning_rate": 7.951973420541605e-05,
675
- "loss": 0.6233,
676
  "step": 550
677
  },
678
  {
679
- "epoch": 1.23,
680
- "learning_rate": 7.716001533257597e-05,
681
- "loss": 0.7224,
682
  "step": 555
683
  },
684
  {
685
- "epoch": 1.24,
686
- "learning_rate": 7.478995947873449e-05,
687
- "loss": 0.7252,
688
  "step": 560
689
  },
690
  {
691
- "epoch": 1.25,
692
- "learning_rate": 7.241242869744897e-05,
693
- "loss": 0.6465,
694
  "step": 565
695
  },
696
  {
697
- "epoch": 1.26,
698
- "learning_rate": 7.003029406891743e-05,
699
- "loss": 0.7505,
700
  "step": 570
701
  },
702
  {
703
- "epoch": 1.27,
704
- "learning_rate": 6.764643223289357e-05,
705
- "loss": 0.7117,
706
  "step": 575
707
  },
708
  {
709
- "epoch": 1.28,
710
- "learning_rate": 6.526372191488798e-05,
711
- "loss": 0.7289,
712
  "step": 580
713
  },
714
  {
715
  "epoch": 1.29,
716
- "learning_rate": 6.288504044985065e-05,
717
- "loss": 0.7069,
718
  "step": 585
719
  },
720
  {
721
- "epoch": 1.31,
722
- "learning_rate": 6.05132603075328e-05,
723
- "loss": 0.7925,
724
  "step": 590
725
  },
726
  {
727
- "epoch": 1.32,
728
- "learning_rate": 5.8151245623725575e-05,
729
- "loss": 0.7614,
730
  "step": 595
731
  },
732
  {
733
- "epoch": 1.33,
734
- "learning_rate": 5.5801848741551456e-05,
735
- "loss": 0.6241,
736
  "step": 600
737
  },
738
  {
739
- "epoch": 1.34,
740
- "learning_rate": 5.3467906767025034e-05,
741
- "loss": 0.6841,
742
  "step": 605
743
  },
744
  {
745
- "epoch": 1.35,
746
- "learning_rate": 5.115223814297748e-05,
747
- "loss": 0.7002,
748
  "step": 610
749
  },
750
  {
751
- "epoch": 1.36,
752
- "learning_rate": 4.885763924553591e-05,
753
- "loss": 0.7048,
754
  "step": 615
755
  },
756
  {
757
- "epoch": 1.37,
758
- "learning_rate": 4.65868810072671e-05,
759
- "loss": 0.6762,
760
  "step": 620
761
  },
762
  {
763
- "epoch": 1.38,
764
- "learning_rate": 4.434270557101253e-05,
765
- "loss": 0.6848,
766
  "step": 625
767
  },
768
  {
769
- "epoch": 1.39,
770
- "learning_rate": 4.2127822978506955e-05,
771
- "loss": 0.7911,
772
  "step": 630
773
  },
774
  {
775
  "epoch": 1.4,
776
- "learning_rate": 3.9944907897778e-05,
777
- "loss": 0.767,
778
  "step": 635
779
  },
780
  {
781
- "epoch": 1.42,
782
- "learning_rate": 3.779659639322971e-05,
783
- "loss": 0.6907,
784
  "step": 640
785
  },
786
  {
787
- "epoch": 1.43,
788
- "learning_rate": 3.568548274236045e-05,
789
- "loss": 0.7518,
790
  "step": 645
791
  },
792
  {
793
- "epoch": 1.44,
794
- "learning_rate": 3.361411630295728e-05,
795
- "loss": 0.6918,
796
  "step": 650
797
  },
798
  {
799
- "epoch": 1.45,
800
- "learning_rate": 3.158499843450359e-05,
801
- "loss": 0.7341,
802
  "step": 655
803
  },
804
  {
805
- "epoch": 1.46,
806
- "learning_rate": 2.9600579477565238e-05,
807
- "loss": 0.6637,
808
  "step": 660
809
  },
810
  {
811
- "epoch": 1.47,
812
- "learning_rate": 2.7663255794802226e-05,
813
- "loss": 0.7897,
814
  "step": 665
815
  },
816
  {
817
- "epoch": 1.48,
818
- "learning_rate": 2.5775366877124698e-05,
819
- "loss": 0.5947,
820
  "step": 670
821
  },
822
  {
823
- "epoch": 1.49,
824
- "learning_rate": 2.393919251856543e-05,
825
- "loss": 0.6711,
826
  "step": 675
827
  },
828
  {
829
- "epoch": 1.5,
830
- "learning_rate": 2.21569500632249e-05,
831
- "loss": 0.789,
832
  "step": 680
833
  },
834
  {
835
- "epoch": 1.52,
836
- "learning_rate": 2.043079172761756e-05,
837
- "loss": 0.6784,
838
  "step": 685
839
  },
840
  {
841
- "epoch": 1.53,
842
- "learning_rate": 1.876280200169214e-05,
843
- "loss": 0.648,
844
  "step": 690
845
  },
846
  {
847
- "epoch": 1.54,
848
- "learning_rate": 1.7154995131623676e-05,
849
- "loss": 0.8539,
850
  "step": 695
851
  },
852
  {
853
- "epoch": 1.55,
854
- "learning_rate": 1.5609312687419874e-05,
855
- "loss": 0.8157,
856
  "step": 700
857
  },
858
  {
859
- "epoch": 1.56,
860
- "learning_rate": 1.4127621218314657e-05,
861
- "loss": 0.6428,
862
  "step": 705
863
  },
864
  {
865
- "epoch": 1.57,
866
- "learning_rate": 1.2711709998742864e-05,
867
- "loss": 0.6836,
868
  "step": 710
869
  },
870
  {
871
- "epoch": 1.58,
872
- "learning_rate": 1.1363288867621515e-05,
873
- "loss": 0.6848,
874
  "step": 715
875
  },
876
  {
877
- "epoch": 1.59,
878
- "learning_rate": 1.0083986163577838e-05,
879
- "loss": 0.687,
880
  "step": 720
881
  },
882
  {
883
- "epoch": 1.6,
884
- "learning_rate": 8.875346758584498e-06,
885
- "loss": 0.6913,
886
  "step": 725
887
  },
888
  {
889
- "epoch": 1.62,
890
- "learning_rate": 7.738830192380085e-06,
891
- "loss": 0.7138,
892
  "step": 730
893
  },
894
  {
895
- "epoch": 1.63,
896
- "learning_rate": 6.675808909954194e-06,
897
- "loss": 0.7293,
898
  "step": 735
899
  },
900
  {
901
- "epoch": 1.64,
902
- "learning_rate": 5.687566604196989e-06,
903
- "loss": 0.7797,
904
  "step": 740
905
  },
906
  {
907
- "epoch": 1.65,
908
- "learning_rate": 4.7752966657182915e-06,
909
- "loss": 0.6821,
910
  "step": 745
911
  },
912
  {
913
- "epoch": 1.66,
914
- "learning_rate": 3.940100741730516e-06,
915
- "loss": 0.5752,
916
  "step": 750
917
  },
918
  {
919
- "epoch": 1.67,
920
- "learning_rate": 3.182987405707336e-06,
921
- "loss": 0.7456,
922
  "step": 755
923
  },
924
  {
925
- "epoch": 1.68,
926
- "learning_rate": 2.5048709394422254e-06,
927
- "loss": 0.694,
928
  "step": 760
929
  },
930
  {
931
- "epoch": 1.69,
932
- "learning_rate": 1.9065702289717339e-06,
933
- "loss": 0.6621,
934
  "step": 765
935
  },
936
  {
937
- "epoch": 1.7,
938
- "learning_rate": 1.3888077756976535e-06,
939
- "loss": 0.706,
940
  "step": 770
941
  },
942
  {
943
- "epoch": 1.71,
944
- "learning_rate": 9.522088239022507e-07,
945
- "loss": 0.7326,
946
  "step": 775
947
  },
948
  {
949
- "epoch": 1.73,
950
- "learning_rate": 5.973006057100785e-07,
951
- "loss": 0.6361,
952
  "step": 780
953
  },
954
  {
955
- "epoch": 1.74,
956
- "learning_rate": 3.245117044082019e-07,
957
- "loss": 0.647,
958
  "step": 785
959
  },
960
  {
961
- "epoch": 1.75,
962
- "learning_rate": 1.3417153689374065e-07,
963
- "loss": 0.7488,
964
  "step": 790
965
  },
966
  {
967
- "epoch": 1.76,
968
- "learning_rate": 2.6509955873614998e-08,
969
- "loss": 0.7149,
970
  "step": 795
971
  },
972
  {
973
- "epoch": 1.77,
974
- "learning_rate": 1.6569722969137058e-09,
975
- "loss": 0.6775,
976
  "step": 800
977
  },
978
  {
979
- "epoch": 1.78,
980
- "learning_rate": 5.964259835508488e-08,
981
- "loss": 0.7746,
982
  "step": 805
983
  },
984
  {
985
- "epoch": 1.79,
986
- "learning_rate": 2.003968112395687e-07,
987
- "loss": 0.6598,
988
  "step": 810
989
  },
990
  {
991
- "epoch": 1.8,
992
- "learning_rate": 4.2374963770057325e-07,
993
- "loss": 0.6891,
994
  "step": 815
995
  },
996
  {
997
- "epoch": 1.81,
998
- "learning_rate": 7.294313593051582e-07,
999
- "loss": 0.6988,
1000
  "step": 820
1001
  },
1002
  {
1003
- "epoch": 1.83,
1004
- "learning_rate": 1.1170728381449087e-06,
1005
- "loss": 0.6761,
1006
  "step": 825
1007
  },
1008
  {
1009
- "epoch": 1.84,
1010
- "learning_rate": 1.5862059626056298e-06,
1011
- "loss": 0.7306,
1012
  "step": 830
1013
  },
1014
  {
1015
- "epoch": 1.85,
1016
- "learning_rate": 2.1362642126509973e-06,
1017
- "loss": 0.6297,
1018
  "step": 835
1019
  },
1020
  {
1021
- "epoch": 1.86,
1022
- "learning_rate": 2.766583343947401e-06,
1023
- "loss": 0.7385,
1024
  "step": 840
1025
  },
1026
  {
1027
- "epoch": 1.87,
1028
- "learning_rate": 3.4764021899978206e-06,
1029
- "loss": 0.7045,
1030
  "step": 845
1031
  },
1032
  {
1033
- "epoch": 1.88,
1034
- "learning_rate": 4.264863581318223e-06,
1035
- "loss": 0.6952,
1036
  "step": 850
1037
  },
1038
  {
1039
- "epoch": 1.89,
1040
- "learning_rate": 5.131015380546021e-06,
1041
- "loss": 0.7021,
1042
  "step": 855
1043
  },
1044
  {
1045
- "epoch": 1.9,
1046
- "learning_rate": 6.073811632230782e-06,
1047
- "loss": 0.6392,
1048
  "step": 860
1049
  },
1050
  {
1051
- "epoch": 1.91,
1052
- "learning_rate": 7.09211382591858e-06,
1053
- "loss": 0.694,
1054
  "step": 865
1055
  },
1056
  {
1057
- "epoch": 1.92,
1058
- "learning_rate": 8.18469227100481e-06,
1059
- "loss": 0.6864,
1060
  "step": 870
1061
  },
1062
  {
1063
- "epoch": 1.94,
1064
- "learning_rate": 9.350227581695092e-06,
1065
- "loss": 0.7318,
1066
  "step": 875
1067
  },
1068
  {
1069
- "epoch": 1.95,
1070
- "learning_rate": 1.0587312270281189e-05,
1071
- "loss": 0.6326,
1072
  "step": 880
1073
  },
1074
  {
1075
- "epoch": 1.96,
1076
- "learning_rate": 1.1894452446807735e-05,
1077
- "loss": 0.7141,
1078
  "step": 885
1079
  },
1080
  {
1081
- "epoch": 1.97,
1082
- "learning_rate": 1.3270069623077462e-05,
1083
- "loss": 0.713,
1084
  "step": 890
1085
  },
1086
  {
1087
- "epoch": 1.98,
1088
- "learning_rate": 1.47125026188163e-05,
1089
- "loss": 0.7505,
1090
  "step": 895
1091
  },
1092
- {
1093
- "epoch": 1.99,
1094
- "learning_rate": 1.6220009567696506e-05,
1095
- "loss": 0.7118,
1096
- "step": 900
1097
- },
1098
- {
1099
- "epoch": 2.0,
1100
- "eval_loss": 0.3754318356513977,
1101
- "eval_runtime": 8.7593,
1102
- "eval_samples_per_second": 76.147,
1103
- "eval_steps_per_second": 9.59,
1104
- "step": 904
1105
- },
1106
  {
1107
  "epoch": 1.98,
1108
- "learning_rate": 4.956064306819408e-05,
1109
- "loss": 0.7479,
1110
- "step": 905
1111
  },
1112
  {
1113
  "epoch": 1.99,
1114
- "learning_rate": 4.7311949670299134e-05,
1115
- "loss": 0.7304,
1116
- "step": 910
1117
  },
1118
  {
1119
  "epoch": 2.0,
1120
- "learning_rate": 4.508829441538651e-05,
1121
- "loss": 0.666,
1122
- "step": 915
1123
  },
1124
  {
1125
  "epoch": 2.0,
1126
- "eval_loss": 0.31595703959465027,
1127
- "eval_runtime": 690.3622,
1128
- "eval_samples_per_second": 0.898,
1129
- "eval_steps_per_second": 0.113,
1130
- "step": 916
1131
  }
1132
  ],
1133
- "max_steps": 1374,
1134
  "num_train_epochs": 3,
1135
- "total_flos": 955022376960000.0,
1136
  "trial_name": null,
1137
  "trial_params": null
1138
  }
 
1
  {
2
+ "best_metric": 0.3614741861820221,
3
+ "best_model_checkpoint": "output/eminem/checkpoint-910",
4
  "epoch": 2.0,
5
+ "global_step": 910,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
+ "learning_rate": 1.0194653534426477e-06,
13
+ "loss": 0.6762,
14
  "step": 5
15
  },
16
  {
17
  "epoch": 0.02,
18
+ "learning_rate": 6.530415424531046e-07,
19
+ "loss": 0.6579,
20
  "step": 10
21
  },
22
  {
23
  "epoch": 0.03,
24
+ "learning_rate": 3.675914059099763e-07,
25
+ "loss": 0.7016,
26
  "step": 15
27
  },
28
  {
29
  "epoch": 0.04,
30
+ "learning_rate": 1.6345512013444254e-07,
31
+ "loss": 0.6176,
32
  "step": 20
33
  },
34
  {
35
  "epoch": 0.05,
36
+ "learning_rate": 4.087595819659287e-08,
37
+ "loss": 0.563,
38
  "step": 25
39
  },
40
  {
41
  "epoch": 0.07,
42
+ "learning_rate": 0.0,
43
+ "loss": 0.689,
44
  "step": 30
45
  },
46
  {
47
  "epoch": 0.08,
48
+ "learning_rate": 4.0875958196577634e-08,
49
+ "loss": 0.6532,
50
  "step": 35
51
  },
52
  {
53
  "epoch": 0.09,
54
+ "learning_rate": 1.634551201344197e-07,
55
+ "loss": 0.5922,
56
  "step": 40
57
  },
58
  {
59
  "epoch": 0.1,
60
+ "learning_rate": 3.6759140590974026e-07,
61
+ "loss": 0.6565,
62
  "step": 45
63
  },
64
  {
65
  "epoch": 0.11,
66
+ "learning_rate": 6.530415424530588e-07,
67
+ "loss": 0.7043,
68
  "step": 50
69
  },
70
  {
71
  "epoch": 0.12,
72
+ "learning_rate": 1.0194653534425943e-06,
73
+ "loss": 0.597,
74
  "step": 55
75
  },
76
  {
77
  "epoch": 0.13,
78
+ "learning_rate": 1.4664261646975495e-06,
79
+ "loss": 0.7246,
80
  "step": 60
81
  },
82
  {
83
  "epoch": 0.14,
84
+ "learning_rate": 1.993391324572832e-06,
85
+ "loss": 0.639,
86
  "step": 65
87
  },
88
  {
89
  "epoch": 0.15,
90
+ "learning_rate": 2.5997328387288936e-06,
91
+ "loss": 0.6834,
92
  "step": 70
93
  },
94
  {
95
  "epoch": 0.16,
96
+ "learning_rate": 3.2847281185253694e-06,
97
+ "loss": 0.6745,
98
  "step": 75
99
  },
100
  {
101
  "epoch": 0.18,
102
+ "learning_rate": 4.0475608421405796e-06,
103
+ "loss": 0.5851,
104
  "step": 80
105
  },
106
  {
107
  "epoch": 0.19,
108
+ "learning_rate": 4.887321927404397e-06,
109
+ "loss": 0.6805,
110
  "step": 85
111
  },
112
  {
113
  "epoch": 0.2,
114
+ "learning_rate": 5.803010615159864e-06,
115
+ "loss": 0.6536,
116
  "step": 90
117
  },
118
  {
119
  "epoch": 0.21,
120
+ "learning_rate": 6.793535661893871e-06,
121
+ "loss": 0.6119,
122
  "step": 95
123
  },
124
  {
125
  "epoch": 0.22,
126
+ "learning_rate": 7.857716640189427e-06,
127
+ "loss": 0.6954,
128
  "step": 100
129
  },
130
  {
131
  "epoch": 0.23,
132
+ "learning_rate": 8.994285345464919e-06,
133
+ "loss": 0.7077,
134
  "step": 105
135
  },
136
  {
137
  "epoch": 0.24,
138
+ "learning_rate": 1.0201887307313696e-05,
139
+ "loss": 0.6852,
140
  "step": 110
141
  },
142
  {
143
  "epoch": 0.25,
144
+ "learning_rate": 1.147908340365762e-05,
145
+ "loss": 0.6743,
146
  "step": 115
147
  },
148
  {
149
  "epoch": 0.26,
150
+ "learning_rate": 1.2824351575772418e-05,
151
+ "loss": 0.7093,
152
  "step": 120
153
  },
154
  {
155
  "epoch": 0.27,
156
+ "learning_rate": 1.4236088642155879e-05,
157
+ "loss": 0.696,
158
  "step": 125
159
  },
160
  {
161
  "epoch": 0.29,
162
+ "learning_rate": 1.5712612209063624e-05,
163
+ "loss": 0.6607,
164
  "step": 130
165
  },
166
  {
167
  "epoch": 0.3,
168
+ "learning_rate": 1.7252162675462687e-05,
169
+ "loss": 0.5847,
170
  "step": 135
171
  },
172
  {
173
  "epoch": 0.31,
174
+ "learning_rate": 1.8852905329964338e-05,
175
+ "loss": 0.6638,
176
  "step": 140
177
  },
178
  {
179
  "epoch": 0.32,
180
+ "learning_rate": 2.051293253729783e-05,
181
+ "loss": 0.6221,
182
  "step": 145
183
  },
184
  {
185
  "epoch": 0.33,
186
+ "learning_rate": 2.2230266011669234e-05,
187
+ "loss": 0.5733,
188
  "step": 150
189
  },
190
  {
191
  "epoch": 0.34,
192
+ "learning_rate": 2.4002859174324688e-05,
193
+ "loss": 0.6478,
194
  "step": 155
195
  },
196
  {
197
  "epoch": 0.35,
198
+ "learning_rate": 2.5828599592491126e-05,
199
+ "loss": 0.5237,
200
  "step": 160
201
  },
202
  {
203
  "epoch": 0.36,
204
+ "learning_rate": 2.770531149681108e-05,
205
+ "loss": 0.6559,
206
  "step": 165
207
  },
208
  {
209
  "epoch": 0.37,
210
+ "learning_rate": 2.9630758374242324e-05,
211
+ "loss": 0.6755,
212
  "step": 170
213
  },
214
  {
215
  "epoch": 0.38,
216
+ "learning_rate": 3.1602645633354207e-05,
217
+ "loss": 0.6611,
218
  "step": 175
219
  },
220
  {
221
+ "epoch": 0.4,
222
+ "learning_rate": 3.3618623338835595e-05,
223
+ "loss": 0.651,
224
  "step": 180
225
  },
226
  {
227
  "epoch": 0.41,
228
+ "learning_rate": 3.5676289011958925e-05,
229
+ "loss": 0.7284,
230
  "step": 185
231
  },
232
  {
233
  "epoch": 0.42,
234
+ "learning_rate": 3.7773190493652644e-05,
235
+ "loss": 0.6301,
236
  "step": 190
237
  },
238
  {
239
  "epoch": 0.43,
240
+ "learning_rate": 3.990682886679578e-05,
241
+ "loss": 0.6188,
242
  "step": 195
243
  },
244
  {
245
  "epoch": 0.44,
246
+ "learning_rate": 4.2074661434217846e-05,
247
+ "loss": 0.586,
248
  "step": 200
249
  },
250
  {
251
  "epoch": 0.45,
252
+ "learning_rate": 4.427410474888269e-05,
253
+ "loss": 0.6809,
254
  "step": 205
255
  },
256
  {
257
  "epoch": 0.46,
258
+ "learning_rate": 4.650253769262196e-05,
259
+ "loss": 0.6343,
260
  "step": 210
261
  },
262
  {
263
  "epoch": 0.47,
264
+ "learning_rate": 4.875730459979135e-05,
265
+ "loss": 0.7333,
266
  "step": 215
267
  },
268
  {
269
  "epoch": 0.48,
270
+ "learning_rate": 5.103571842205178e-05,
271
+ "loss": 0.6158,
272
  "step": 220
273
  },
274
  {
275
  "epoch": 0.49,
276
+ "learning_rate": 5.3335063930595955e-05,
277
+ "loss": 0.6216,
278
  "step": 225
279
  },
280
  {
281
+ "epoch": 0.51,
282
+ "learning_rate": 5.565260095192864e-05,
283
+ "loss": 0.7031,
284
  "step": 230
285
  },
286
  {
287
  "epoch": 0.52,
288
+ "learning_rate": 5.7985567633386964e-05,
289
+ "loss": 0.7186,
290
  "step": 235
291
  },
292
  {
293
  "epoch": 0.53,
294
+ "learning_rate": 6.033118373448471e-05,
295
+ "loss": 0.6841,
296
  "step": 240
297
  },
298
  {
299
  "epoch": 0.54,
300
+ "learning_rate": 6.268665394018899e-05,
301
+ "loss": 0.6929,
302
  "step": 245
303
  },
304
  {
305
  "epoch": 0.55,
306
+ "learning_rate": 6.504917119214232e-05,
307
+ "loss": 0.8001,
308
  "step": 250
309
  },
310
  {
311
  "epoch": 0.56,
312
+ "learning_rate": 6.741592003389098e-05,
313
+ "loss": 0.6497,
314
  "step": 255
315
  },
316
  {
317
  "epoch": 0.57,
318
+ "learning_rate": 6.978407996610794e-05,
319
+ "loss": 0.6001,
320
  "step": 260
321
  },
322
  {
323
  "epoch": 0.58,
324
+ "learning_rate": 7.21508288078566e-05,
325
+ "loss": 0.7792,
326
  "step": 265
327
  },
328
  {
329
  "epoch": 0.59,
330
+ "learning_rate": 7.451334605980994e-05,
331
+ "loss": 0.6229,
332
  "step": 270
333
  },
334
  {
335
  "epoch": 0.6,
336
+ "learning_rate": 7.686881626551423e-05,
337
+ "loss": 0.6103,
338
  "step": 275
339
  },
340
  {
341
+ "epoch": 0.62,
342
+ "learning_rate": 7.921443236661197e-05,
343
+ "loss": 0.6509,
344
  "step": 280
345
  },
346
  {
347
+ "epoch": 0.63,
348
+ "learning_rate": 8.15473990480703e-05,
349
+ "loss": 0.7008,
350
  "step": 285
351
  },
352
  {
353
  "epoch": 0.64,
354
+ "learning_rate": 8.3864936069403e-05,
355
+ "loss": 0.6919,
356
  "step": 290
357
  },
358
  {
359
  "epoch": 0.65,
360
+ "learning_rate": 8.616428157794718e-05,
361
+ "loss": 0.7347,
362
  "step": 295
363
  },
364
  {
365
  "epoch": 0.66,
366
+ "learning_rate": 8.844269540020762e-05,
367
+ "loss": 0.7356,
368
  "step": 300
369
  },
370
  {
371
  "epoch": 0.67,
372
+ "learning_rate": 9.069746230737702e-05,
373
+ "loss": 0.7,
374
  "step": 305
375
  },
376
  {
377
  "epoch": 0.68,
378
+ "learning_rate": 9.29258952511163e-05,
379
+ "loss": 0.6359,
380
  "step": 310
381
  },
382
  {
383
  "epoch": 0.69,
384
+ "learning_rate": 9.512533856578116e-05,
385
+ "loss": 0.7165,
386
  "step": 315
387
  },
388
  {
389
  "epoch": 0.7,
390
+ "learning_rate": 9.729317113320324e-05,
391
+ "loss": 0.7637,
392
  "step": 320
393
  },
394
  {
395
  "epoch": 0.71,
396
+ "learning_rate": 9.942680950634639e-05,
397
+ "loss": 0.6223,
398
  "step": 325
399
  },
400
  {
401
+ "epoch": 0.73,
402
+ "learning_rate": 0.00010152371098804014,
403
+ "loss": 0.5555,
404
  "step": 330
405
  },
406
  {
407
+ "epoch": 0.74,
408
+ "learning_rate": 0.00010358137666116348,
409
+ "loss": 0.7219,
410
  "step": 335
411
  },
412
  {
413
  "epoch": 0.75,
414
+ "learning_rate": 0.00010559735436664489,
415
+ "loss": 0.7501,
416
  "step": 340
417
  },
418
  {
419
  "epoch": 0.76,
420
+ "learning_rate": 0.0001075692416257568,
421
+ "loss": 0.7454,
422
  "step": 345
423
  },
424
  {
425
  "epoch": 0.77,
426
+ "learning_rate": 0.00010949468850318805,
427
+ "loss": 0.6956,
428
  "step": 350
429
  },
430
  {
431
  "epoch": 0.78,
432
+ "learning_rate": 0.00011137140040750957,
433
+ "loss": 0.7294,
434
  "step": 355
435
  },
436
  {
437
  "epoch": 0.79,
438
+ "learning_rate": 0.00011319714082567451,
439
+ "loss": 0.7416,
440
  "step": 360
441
  },
442
  {
443
  "epoch": 0.8,
444
+ "learning_rate": 0.00011496973398832998,
445
+ "loss": 0.6926,
446
  "step": 365
447
  },
448
  {
449
  "epoch": 0.81,
450
+ "learning_rate": 0.00011668706746270142,
451
+ "loss": 0.7841,
452
  "step": 370
453
  },
454
  {
455
  "epoch": 0.82,
456
+ "learning_rate": 0.00011834709467003491,
457
+ "loss": 0.7603,
458
  "step": 375
459
  },
460
  {
461
+ "epoch": 0.84,
462
+ "learning_rate": 0.00011994783732453659,
463
+ "loss": 0.7249,
464
  "step": 380
465
  },
466
  {
467
+ "epoch": 0.85,
468
+ "learning_rate": 0.0001214873877909357,
469
+ "loss": 0.7231,
470
  "step": 385
471
  },
472
  {
473
  "epoch": 0.86,
474
+ "learning_rate": 0.00012296391135784465,
475
+ "loss": 0.7101,
476
  "step": 390
477
  },
478
  {
479
  "epoch": 0.87,
480
+ "learning_rate": 0.00012437564842422694,
481
+ "loss": 0.7521,
482
  "step": 395
483
  },
484
  {
485
  "epoch": 0.88,
486
+ "learning_rate": 0.00012572091659634178,
487
+ "loss": 0.6694,
488
  "step": 400
489
  },
490
  {
491
  "epoch": 0.89,
492
+ "learning_rate": 0.00012699811269268675,
493
+ "loss": 0.6763,
494
  "step": 405
495
  },
496
  {
497
  "epoch": 0.9,
498
+ "learning_rate": 0.00012820571465453455,
499
+ "loss": 0.7881,
500
  "step": 410
501
  },
502
  {
503
  "epoch": 0.91,
504
+ "learning_rate": 0.00012934228335981007,
505
+ "loss": 0.7906,
506
  "step": 415
507
  },
508
  {
509
  "epoch": 0.92,
510
+ "learning_rate": 0.00013040646433810568,
511
+ "loss": 0.7662,
512
  "step": 420
513
  },
514
  {
515
  "epoch": 0.93,
516
+ "learning_rate": 0.00013139698938483972,
517
+ "loss": 0.7165,
518
  "step": 425
519
  },
520
  {
521
+ "epoch": 0.95,
522
+ "learning_rate": 0.00013231267807259521,
523
+ "loss": 0.7159,
524
  "step": 430
525
  },
526
  {
527
+ "epoch": 0.96,
528
+ "learning_rate": 0.00013315243915785907,
529
+ "loss": 0.7796,
530
  "step": 435
531
  },
532
  {
533
+ "epoch": 0.97,
534
+ "learning_rate": 0.0001339152718814749,
535
+ "loss": 0.8222,
536
  "step": 440
537
  },
538
  {
539
  "epoch": 0.98,
540
+ "learning_rate": 0.0001346002671612708,
541
+ "loss": 0.7428,
542
  "step": 445
543
  },
544
  {
545
  "epoch": 0.99,
546
+ "learning_rate": 0.00013520660867542692,
547
+ "loss": 0.7597,
548
  "step": 450
549
  },
550
  {
551
  "epoch": 1.0,
552
+ "learning_rate": 0.00013573357383530262,
553
+ "loss": 0.8769,
554
  "step": 455
555
  },
556
  {
557
  "epoch": 1.0,
558
+ "eval_loss": 0.36263224482536316,
559
+ "eval_runtime": 29.2619,
560
+ "eval_samples_per_second": 22.008,
561
+ "eval_steps_per_second": 2.768,
562
+ "step": 455
563
  },
564
  {
565
+ "epoch": 1.01,
566
+ "learning_rate": 0.0001361805346465572,
567
+ "loss": 0.6009,
568
  "step": 460
569
  },
570
  {
571
+ "epoch": 1.02,
572
+ "learning_rate": 0.00013654695845754679,
573
+ "loss": 0.6886,
574
  "step": 465
575
  },
576
  {
577
+ "epoch": 1.03,
578
+ "learning_rate": 0.00013683240859409016,
579
+ "loss": 0.648,
580
  "step": 470
581
  },
582
  {
583
+ "epoch": 1.04,
584
+ "learning_rate": 0.0001370365448798655,
585
+ "loss": 0.6639,
586
  "step": 475
587
  },
588
  {
589
+ "epoch": 1.05,
590
+ "learning_rate": 0.00013715912404180336,
591
+ "loss": 0.6411,
592
  "step": 480
593
  },
594
  {
595
  "epoch": 1.07,
596
+ "learning_rate": 0.0001372,
597
+ "loss": 0.5669,
598
  "step": 485
599
  },
600
  {
601
  "epoch": 1.08,
602
+ "learning_rate": 0.00013715912404180345,
603
+ "loss": 0.7041,
604
  "step": 490
605
  },
606
  {
607
+ "epoch": 1.09,
608
+ "learning_rate": 0.00013703654487986564,
609
+ "loss": 0.7196,
610
  "step": 495
611
  },
612
  {
613
+ "epoch": 1.1,
614
+ "learning_rate": 0.00013683240859409013,
615
+ "loss": 0.6656,
616
  "step": 500
617
  },
618
  {
619
+ "epoch": 1.11,
620
+ "learning_rate": 0.00013654695845754679,
621
+ "loss": 0.7371,
622
  "step": 505
623
  },
624
  {
625
+ "epoch": 1.12,
626
+ "learning_rate": 0.00013618053464655754,
627
+ "loss": 0.6728,
628
  "step": 510
629
  },
630
  {
631
+ "epoch": 1.13,
632
+ "learning_rate": 0.0001357335738353026,
633
+ "loss": 0.663,
634
  "step": 515
635
  },
636
  {
637
+ "epoch": 1.14,
638
+ "learning_rate": 0.00013520660867542687,
639
+ "loss": 0.5962,
640
  "step": 520
641
  },
642
  {
643
+ "epoch": 1.15,
644
+ "learning_rate": 0.0001346002671612713,
645
+ "loss": 0.6391,
646
  "step": 525
647
  },
648
  {
649
+ "epoch": 1.16,
650
+ "learning_rate": 0.00013391527188147485,
651
+ "loss": 0.7195,
652
  "step": 530
653
  },
654
  {
655
  "epoch": 1.18,
656
+ "learning_rate": 0.00013315243915785902,
657
+ "loss": 0.7751,
658
  "step": 535
659
  },
660
  {
661
  "epoch": 1.19,
662
+ "learning_rate": 0.0001323126780725959,
663
+ "loss": 0.6413,
664
  "step": 540
665
  },
666
  {
667
+ "epoch": 1.2,
668
+ "learning_rate": 0.00013139698938484045,
669
+ "loss": 0.7913,
670
  "step": 545
671
  },
672
  {
673
+ "epoch": 1.21,
674
+ "learning_rate": 0.0001304064643381056,
675
+ "loss": 0.7761,
676
  "step": 550
677
  },
678
  {
679
+ "epoch": 1.22,
680
+ "learning_rate": 0.00012934228335981002,
681
+ "loss": 0.6472,
682
  "step": 555
683
  },
684
  {
685
+ "epoch": 1.23,
686
+ "learning_rate": 0.00012820571465453544,
687
+ "loss": 0.6222,
688
  "step": 560
689
  },
690
  {
691
+ "epoch": 1.24,
692
+ "learning_rate": 0.0001269981126926867,
693
+ "loss": 0.6736,
694
  "step": 565
695
  },
696
  {
697
+ "epoch": 1.25,
698
+ "learning_rate": 0.00012572091659634172,
699
+ "loss": 0.73,
700
  "step": 570
701
  },
702
  {
703
+ "epoch": 1.26,
704
+ "learning_rate": 0.000124375648424228,
705
+ "loss": 0.6426,
706
  "step": 575
707
  },
708
  {
709
+ "epoch": 1.27,
710
+ "learning_rate": 0.00012296391135784457,
711
+ "loss": 0.7043,
712
  "step": 580
713
  },
714
  {
715
  "epoch": 1.29,
716
+ "learning_rate": 0.00012148738779093562,
717
+ "loss": 0.7227,
718
  "step": 585
719
  },
720
  {
721
+ "epoch": 1.3,
722
+ "learning_rate": 0.00011994783732453781,
723
+ "loss": 0.7366,
724
  "step": 590
725
  },
726
  {
727
+ "epoch": 1.31,
728
+ "learning_rate": 0.00011834709467003617,
729
+ "loss": 0.7144,
730
  "step": 595
731
  },
732
  {
733
+ "epoch": 1.32,
734
+ "learning_rate": 0.00011668706746270132,
735
+ "loss": 0.6817,
736
  "step": 600
737
  },
738
  {
739
+ "epoch": 1.33,
740
+ "learning_rate": 0.00011496973398833133,
741
+ "loss": 0.6274,
742
  "step": 605
743
  },
744
  {
745
+ "epoch": 1.34,
746
+ "learning_rate": 0.00011319714082567588,
747
+ "loss": 0.6754,
748
  "step": 610
749
  },
750
  {
751
+ "epoch": 1.35,
752
+ "learning_rate": 0.00011137140040750945,
753
+ "loss": 0.6134,
754
  "step": 615
755
  },
756
  {
757
+ "epoch": 1.36,
758
+ "learning_rate": 0.00010949468850318951,
759
+ "loss": 0.698,
760
  "step": 620
761
  },
762
  {
763
+ "epoch": 1.37,
764
+ "learning_rate": 0.00010756924162575829,
765
+ "loss": 0.8638,
766
  "step": 625
767
  },
768
  {
769
+ "epoch": 1.38,
770
+ "learning_rate": 0.00010559735436664478,
771
+ "loss": 0.7123,
772
  "step": 630
773
  },
774
  {
775
  "epoch": 1.4,
776
+ "learning_rate": 0.00010358137666116336,
777
+ "loss": 0.666,
778
  "step": 635
779
  },
780
  {
781
+ "epoch": 1.41,
782
+ "learning_rate": 0.00010152371098804174,
783
+ "loss": 0.6977,
784
  "step": 640
785
  },
786
  {
787
+ "epoch": 1.42,
788
+ "learning_rate": 9.942680950634801e-05,
789
+ "loss": 0.5854,
790
  "step": 645
791
  },
792
  {
793
+ "epoch": 1.43,
794
+ "learning_rate": 9.729317113320311e-05,
795
+ "loss": 0.6888,
796
  "step": 650
797
  },
798
  {
799
+ "epoch": 1.44,
800
+ "learning_rate": 9.512533856578284e-05,
801
+ "loss": 0.6905,
802
  "step": 655
803
  },
804
  {
805
+ "epoch": 1.45,
806
+ "learning_rate": 9.2925895251118e-05,
807
+ "loss": 0.664,
808
  "step": 660
809
  },
810
  {
811
+ "epoch": 1.46,
812
+ "learning_rate": 9.069746230737689e-05,
813
+ "loss": 0.7013,
814
  "step": 665
815
  },
816
  {
817
+ "epoch": 1.47,
818
+ "learning_rate": 8.844269540020936e-05,
819
+ "loss": 0.6946,
820
  "step": 670
821
  },
822
  {
823
+ "epoch": 1.48,
824
+ "learning_rate": 8.616428157794893e-05,
825
+ "loss": 0.7819,
826
  "step": 675
827
  },
828
  {
829
+ "epoch": 1.49,
830
+ "learning_rate": 8.386493606940288e-05,
831
+ "loss": 0.6662,
832
  "step": 680
833
  },
834
  {
835
+ "epoch": 1.51,
836
+ "learning_rate": 8.154739904807017e-05,
837
+ "loss": 0.5889,
838
  "step": 685
839
  },
840
  {
841
+ "epoch": 1.52,
842
+ "learning_rate": 7.921443236661376e-05,
843
+ "loss": 0.7833,
844
  "step": 690
845
  },
846
  {
847
+ "epoch": 1.53,
848
+ "learning_rate": 7.686881626551602e-05,
849
+ "loss": 0.7548,
850
  "step": 695
851
  },
852
  {
853
+ "epoch": 1.54,
854
+ "learning_rate": 7.45133460598098e-05,
855
+ "loss": 0.6886,
856
  "step": 700
857
  },
858
  {
859
+ "epoch": 1.55,
860
+ "learning_rate": 7.215082880785842e-05,
861
+ "loss": 0.7179,
862
  "step": 705
863
  },
864
  {
865
+ "epoch": 1.56,
866
+ "learning_rate": 6.978407996610975e-05,
867
+ "loss": 0.745,
868
  "step": 710
869
  },
870
  {
871
+ "epoch": 1.57,
872
+ "learning_rate": 6.741592003389085e-05,
873
+ "loss": 0.5811,
874
  "step": 715
875
  },
876
  {
877
+ "epoch": 1.58,
878
+ "learning_rate": 6.504917119214413e-05,
879
+ "loss": 0.6962,
880
  "step": 720
881
  },
882
  {
883
+ "epoch": 1.59,
884
+ "learning_rate": 6.268665394019079e-05,
885
+ "loss": 0.6763,
886
  "step": 725
887
  },
888
  {
889
+ "epoch": 1.6,
890
+ "learning_rate": 6.033118373448457e-05,
891
+ "loss": 0.6247,
892
  "step": 730
893
  },
894
  {
895
+ "epoch": 1.62,
896
+ "learning_rate": 5.7985567633386836e-05,
897
+ "loss": 0.7763,
898
  "step": 735
899
  },
900
  {
901
+ "epoch": 1.63,
902
+ "learning_rate": 5.5652600951930425e-05,
903
+ "loss": 0.6239,
904
  "step": 740
905
  },
906
  {
907
+ "epoch": 1.64,
908
+ "learning_rate": 5.333506393059772e-05,
909
+ "loss": 0.7218,
910
  "step": 745
911
  },
912
  {
913
+ "epoch": 1.65,
914
+ "learning_rate": 5.103571842205165e-05,
915
+ "loss": 0.695,
916
  "step": 750
917
  },
918
  {
919
+ "epoch": 1.66,
920
+ "learning_rate": 4.8757304599793096e-05,
921
+ "loss": 0.7938,
922
  "step": 755
923
  },
924
  {
925
+ "epoch": 1.67,
926
+ "learning_rate": 4.650253769262368e-05,
927
+ "loss": 0.7333,
928
  "step": 760
929
  },
930
  {
931
+ "epoch": 1.68,
932
+ "learning_rate": 4.4274104748882565e-05,
933
+ "loss": 0.7092,
934
  "step": 765
935
  },
936
  {
937
+ "epoch": 1.69,
938
+ "learning_rate": 4.2074661434219527e-05,
939
+ "loss": 0.764,
940
  "step": 770
941
  },
942
  {
943
+ "epoch": 1.7,
944
+ "learning_rate": 3.9906828866797437e-05,
945
+ "loss": 0.7929,
946
  "step": 775
947
  },
948
  {
949
+ "epoch": 1.71,
950
+ "learning_rate": 3.777319049365253e-05,
951
+ "loss": 0.6747,
952
  "step": 780
953
  },
954
  {
955
+ "epoch": 1.73,
956
+ "learning_rate": 3.56762890119588e-05,
957
+ "loss": 0.6921,
958
  "step": 785
959
  },
960
  {
961
+ "epoch": 1.74,
962
+ "learning_rate": 3.361862333883716e-05,
963
+ "loss": 0.7049,
964
  "step": 790
965
  },
966
  {
967
+ "epoch": 1.75,
968
+ "learning_rate": 3.160264563335574e-05,
969
+ "loss": 0.778,
970
  "step": 795
971
  },
972
  {
973
+ "epoch": 1.76,
974
+ "learning_rate": 2.9630758374242215e-05,
975
+ "loss": 0.6684,
976
  "step": 800
977
  },
978
  {
979
+ "epoch": 1.77,
980
+ "learning_rate": 2.7705311496812532e-05,
981
+ "loss": 0.6652,
982
  "step": 805
983
  },
984
  {
985
+ "epoch": 1.78,
986
+ "learning_rate": 2.582859959249102e-05,
987
+ "loss": 0.7384,
988
  "step": 810
989
  },
990
  {
991
+ "epoch": 1.79,
992
+ "learning_rate": 2.400285917432458e-05,
993
+ "loss": 0.6104,
994
  "step": 815
995
  },
996
  {
997
+ "epoch": 1.8,
998
+ "learning_rate": 2.2230266011670566e-05,
999
+ "loss": 0.7144,
1000
  "step": 820
1001
  },
1002
  {
1003
+ "epoch": 1.81,
1004
+ "learning_rate": 2.0512932537299123e-05,
1005
+ "loss": 0.5937,
1006
  "step": 825
1007
  },
1008
  {
1009
+ "epoch": 1.82,
1010
+ "learning_rate": 1.885290532996424e-05,
1011
+ "loss": 0.7779,
1012
  "step": 830
1013
  },
1014
  {
1015
+ "epoch": 1.84,
1016
+ "learning_rate": 1.7252162675462595e-05,
1017
+ "loss": 0.7421,
1018
  "step": 835
1019
  },
1020
  {
1021
+ "epoch": 1.85,
1022
+ "learning_rate": 1.571261220906478e-05,
1023
+ "loss": 0.7221,
1024
  "step": 840
1025
  },
1026
  {
1027
+ "epoch": 1.86,
1028
+ "learning_rate": 1.4236088642155802e-05,
1029
+ "loss": 0.757,
1030
  "step": 845
1031
  },
1032
  {
1033
+ "epoch": 1.87,
1034
+ "learning_rate": 1.2824351575772341e-05,
1035
+ "loss": 0.752,
1036
  "step": 850
1037
  },
1038
  {
1039
+ "epoch": 1.88,
1040
+ "learning_rate": 1.1479083403658627e-05,
1041
+ "loss": 0.743,
1042
  "step": 855
1043
  },
1044
  {
1045
+ "epoch": 1.89,
1046
+ "learning_rate": 1.0201887307313627e-05,
1047
+ "loss": 0.5773,
1048
  "step": 860
1049
  },
1050
  {
1051
+ "epoch": 1.9,
1052
+ "learning_rate": 8.994285345464858e-06,
1053
+ "loss": 0.7125,
1054
  "step": 865
1055
  },
1056
  {
1057
+ "epoch": 1.91,
1058
+ "learning_rate": 7.857716640190273e-06,
1059
+ "loss": 0.6377,
1060
  "step": 870
1061
  },
1062
  {
1063
+ "epoch": 1.92,
1064
+ "learning_rate": 6.793535661894656e-06,
1065
+ "loss": 0.7089,
1066
  "step": 875
1067
  },
1068
  {
1069
+ "epoch": 1.93,
1070
+ "learning_rate": 5.803010615159811e-06,
1071
+ "loss": 0.61,
1072
  "step": 880
1073
  },
1074
  {
1075
+ "epoch": 1.95,
1076
+ "learning_rate": 4.887321927404351e-06,
1077
+ "loss": 0.696,
1078
  "step": 885
1079
  },
1080
  {
1081
+ "epoch": 1.96,
1082
+ "learning_rate": 4.047560842141189e-06,
1083
+ "loss": 0.6485,
1084
  "step": 890
1085
  },
1086
  {
1087
+ "epoch": 1.97,
1088
+ "learning_rate": 3.2847281185253237e-06,
1089
+ "loss": 0.6823,
1090
  "step": 895
1091
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1092
  {
1093
  "epoch": 1.98,
1094
+ "learning_rate": 2.599732838728863e-06,
1095
+ "loss": 0.7038,
1096
+ "step": 900
1097
  },
1098
  {
1099
  "epoch": 1.99,
1100
+ "learning_rate": 1.993391324573266e-06,
1101
+ "loss": 0.6816,
1102
+ "step": 905
1103
  },
1104
  {
1105
  "epoch": 2.0,
1106
+ "learning_rate": 1.4664261646975266e-06,
1107
+ "loss": 0.6658,
1108
+ "step": 910
1109
  },
1110
  {
1111
  "epoch": 2.0,
1112
+ "eval_loss": 0.3614741861820221,
1113
+ "eval_runtime": 29.3257,
1114
+ "eval_samples_per_second": 21.96,
1115
+ "eval_steps_per_second": 2.762,
1116
+ "step": 910
1117
  }
1118
  ],
1119
+ "max_steps": 1365,
1120
  "num_train_epochs": 3,
1121
+ "total_flos": 949535244288000.0,
1122
  "trial_name": null,
1123
  "trial_params": null
1124
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98ff7520005bd1b7cd3b14407c7fae7975d1a94564c7ae5c3bb82fb25346e68c
3
  size 3055
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20a154128815e7bbba517b0e4c2b881a6382b44d7ef3cd0aa609175592021c15
3
  size 3055