AlekseyKorshuk commited on
Commit
1559e0a
1 Parent(s): b2b44b1

huggingartists

Browse files
README.md CHANGED
@@ -45,15 +45,15 @@ from datasets import load_dataset
45
  dataset = load_dataset("huggingartists/eminem")
46
  ```
47
 
48
- [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/1hflh7u6/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Eminem's lyrics.
53
 
54
- Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/4gvmxrna) for full transparency and reproducibility.
55
 
56
- At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/4gvmxrna/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
 
45
  dataset = load_dataset("huggingartists/eminem")
46
  ```
47
 
48
+ [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/2mol6j1a/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Eminem's lyrics.
53
 
54
+ Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/35sq6jaq) for full transparency and reproducibility.
55
 
56
+ At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/35sq6jaq/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
evaluation.txt CHANGED
@@ -1 +1 @@
1
- {"eval_loss": 0.17033791542053223, "eval_runtime": 13.3261, "eval_samples_per_second": 45.099, "eval_steps_per_second": 5.703, "epoch": 2.0}
 
1
+ {"eval_loss": 0.200975701212883, "eval_runtime": 14.714, "eval_samples_per_second": 43.224, "eval_steps_per_second": 5.437, "epoch": 2.0}
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84c1e0b5ec78700742efecda3264cb6c05363dcd0cfd2603f9208a76f99a8e57
3
  size 497764120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b69833549ccfd36fe66e772afe6f4a3d07369b965ae5d6a0ecb8bd1a0536b2df
3
  size 497764120
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0dd1f75aa87ace65ff5eabf8bf35c462a5ea435576d30fa3ae79699f3a222ffd
3
  size 995604017
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2d59312be6947afd2ba8011456d4e8908905235d6e1780c6feaddcce691dd0f
3
  size 995604017
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05239515b946d463e5c208905abbc650450660236b44874d528bf2e8ff9404e4
3
  size 510396521
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e59175cb5178adcc41b3c715192961b8befb8944063b8edbaff0c3fe9b07e8e
3
  size 510396521
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a51ff024d36f5b16e406a7ff85689d3f4ab6243d4b5396f02ca94ed03ed314b
3
  size 14567
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94e43063477db1b2b70df331c88c4d6e034ea15f0e3e30c9c64fc77e81e122fc
3
  size 14567
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68dcd37ec9f598b778dcfcdd0e6ff8990fdc855c792ae8971ac9bb4e2c9c2527
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa23e484404e2ec35aa04932ba167a546a0ca5eb25f20a84b410351d432dbc48
3
  size 623
trainer_state.json CHANGED
@@ -1,584 +1,570 @@
1
  {
2
- "best_metric": 0.17033791542053223,
3
- "best_model_checkpoint": "output/eminem/checkpoint-460",
4
  "epoch": 1.0,
5
- "global_step": 460,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
- "learning_rate": 0.00013711989076069754,
13
- "loss": 0.4887,
14
  "step": 5
15
  },
16
  {
17
  "epoch": 0.02,
18
- "learning_rate": 0.00013719345930102362,
19
- "loss": 0.4564,
20
  "step": 10
21
  },
22
  {
23
  "epoch": 0.03,
24
- "learning_rate": 0.00013718528371963108,
25
- "loss": 0.4132,
26
  "step": 15
27
  },
28
  {
29
  "epoch": 0.04,
30
- "learning_rate": 0.00013709537375951485,
31
- "loss": 0.4758,
32
  "step": 20
33
  },
34
  {
35
  "epoch": 0.05,
36
- "learning_rate": 0.00013692383656807334,
37
- "loss": 0.4929,
38
  "step": 25
39
  },
40
  {
41
  "epoch": 0.07,
42
- "learning_rate": 0.00013667087656941957,
43
- "loss": 0.4404,
44
  "step": 30
45
  },
46
  {
47
  "epoch": 0.08,
48
- "learning_rate": 0.0001363367952207642,
49
- "loss": 0.4726,
50
  "step": 35
51
  },
52
  {
53
  "epoch": 0.09,
54
- "learning_rate": 0.00013592199065316386,
55
- "loss": 0.4448,
56
  "step": 40
57
  },
58
  {
59
  "epoch": 0.1,
60
- "learning_rate": 0.00013542695719706024,
61
- "loss": 0.4852,
62
  "step": 45
63
  },
64
  {
65
  "epoch": 0.11,
66
- "learning_rate": 0.00013485228479317936,
67
- "loss": 0.4865,
68
  "step": 50
69
  },
70
  {
71
  "epoch": 0.12,
72
- "learning_rate": 0.00013419865828948315,
73
- "loss": 0.5125,
74
  "step": 55
75
  },
76
  {
77
  "epoch": 0.13,
78
- "learning_rate": 0.00013346685662502967,
79
- "loss": 0.5314,
80
  "step": 60
81
  },
82
  {
83
  "epoch": 0.14,
84
- "learning_rate": 0.00013265775190169146,
85
- "loss": 0.4813,
86
  "step": 65
87
  },
88
  {
89
  "epoch": 0.15,
90
- "learning_rate": 0.00013177230834485767,
91
- "loss": 0.4348,
92
  "step": 70
93
  },
94
  {
95
  "epoch": 0.16,
96
- "learning_rate": 0.00013081158115434713,
97
- "loss": 0.4715,
98
  "step": 75
99
  },
100
  {
101
  "epoch": 0.18,
102
- "learning_rate": 0.00012977671524691005,
103
- "loss": 0.4864,
104
  "step": 80
105
  },
106
  {
107
  "epoch": 0.19,
108
- "learning_rate": 0.0001286689438918043,
109
- "loss": 0.4764,
110
  "step": 85
111
  },
112
  {
113
  "epoch": 0.2,
114
- "learning_rate": 0.0001274895872410883,
115
- "loss": 0.5284,
116
  "step": 90
117
  },
118
  {
119
  "epoch": 0.21,
120
- "learning_rate": 0.0001262400507563689,
121
- "loss": 0.5136,
122
  "step": 95
123
  },
124
  {
125
  "epoch": 0.22,
126
- "learning_rate": 0.00012492182353389172,
127
- "loss": 0.5015,
128
  "step": 100
129
  },
130
  {
131
  "epoch": 0.23,
132
- "learning_rate": 0.00012353647652994513,
133
- "loss": 0.5157,
134
  "step": 105
135
  },
136
  {
137
  "epoch": 0.24,
138
- "learning_rate": 0.00012208566068873714,
139
- "loss": 0.5087,
140
  "step": 110
141
  },
142
  {
143
  "epoch": 0.25,
144
- "learning_rate": 0.00012057110497492192,
145
- "loss": 0.5977,
146
  "step": 115
147
  },
148
  {
149
  "epoch": 0.26,
150
- "learning_rate": 0.00011899461431316551,
151
- "loss": 0.4527,
152
  "step": 120
153
  },
154
  {
155
  "epoch": 0.27,
156
- "learning_rate": 0.00011735806743718042,
157
- "loss": 0.4758,
158
  "step": 125
159
  },
160
  {
161
  "epoch": 0.29,
162
- "learning_rate": 0.00011566341465080868,
163
- "loss": 0.4566,
164
  "step": 130
165
  },
166
  {
167
  "epoch": 0.3,
168
- "learning_rate": 0.00011391267550379983,
169
- "loss": 0.4963,
170
  "step": 135
171
  },
172
  {
173
  "epoch": 0.31,
174
- "learning_rate": 0.00011210793638508056,
175
- "loss": 0.4994,
176
  "step": 140
177
  },
178
  {
179
  "epoch": 0.32,
180
- "learning_rate": 0.00011025134803636436,
181
- "loss": 0.5175,
182
  "step": 145
183
  },
184
  {
185
  "epoch": 0.33,
186
- "learning_rate": 0.00010834512298907191,
187
- "loss": 0.5139,
188
  "step": 150
189
  },
190
  {
191
  "epoch": 0.34,
192
- "learning_rate": 0.00010639153292761406,
193
- "loss": 0.515,
194
  "step": 155
195
  },
196
  {
197
  "epoch": 0.35,
198
- "learning_rate": 0.0001043929059821889,
199
- "loss": 0.4672,
200
  "step": 160
201
  },
202
  {
203
  "epoch": 0.36,
204
- "learning_rate": 0.00010235162395429551,
205
- "loss": 0.5449,
206
  "step": 165
207
  },
208
  {
209
  "epoch": 0.37,
210
- "learning_rate": 0.00010027011947830235,
211
- "loss": 0.5187,
212
  "step": 170
213
  },
214
  {
215
  "epoch": 0.38,
216
- "learning_rate": 9.81508731224277e-05,
217
- "loss": 0.5283,
218
  "step": 175
219
  },
220
  {
221
- "epoch": 0.4,
222
- "learning_rate": 9.599641043260833e-05,
223
- "loss": 0.4869,
224
  "step": 180
225
  },
226
  {
227
  "epoch": 0.41,
228
- "learning_rate": 9.380929892273937e-05,
229
- "loss": 0.4807,
230
  "step": 185
231
  },
232
  {
233
  "epoch": 0.42,
234
- "learning_rate": 9.15921450149442e-05,
235
- "loss": 0.5009,
236
  "step": 190
237
  },
238
  {
239
  "epoch": 0.43,
240
- "learning_rate": 8.93475909334324e-05,
241
- "loss": 0.5661,
242
  "step": 195
243
  },
244
  {
245
  "epoch": 0.44,
246
- "learning_rate": 8.70783115557202e-05,
247
- "loss": 0.5181,
248
  "step": 200
249
  },
250
  {
251
  "epoch": 0.45,
252
- "learning_rate": 8.478701122492464e-05,
253
- "loss": 0.5132,
254
  "step": 205
255
  },
256
  {
257
  "epoch": 0.46,
258
- "learning_rate": 8.247642052695435e-05,
259
- "loss": 0.4738,
260
  "step": 210
261
  },
262
  {
263
  "epoch": 0.47,
264
- "learning_rate": 8.014929303640746e-05,
265
- "loss": 0.4718,
266
  "step": 215
267
  },
268
  {
269
  "epoch": 0.48,
270
- "learning_rate": 7.780840203509217e-05,
271
- "loss": 0.5202,
272
  "step": 220
273
  },
274
  {
275
  "epoch": 0.49,
276
- "learning_rate": 7.545653720705064e-05,
277
- "loss": 0.4643,
278
  "step": 225
279
  },
280
  {
281
- "epoch": 0.51,
282
- "learning_rate": 7.309650131404948e-05,
283
- "loss": 0.4846,
284
  "step": 230
285
  },
286
  {
287
  "epoch": 0.52,
288
- "learning_rate": 7.073110685545424e-05,
289
- "loss": 0.4528,
290
  "step": 235
291
  },
292
  {
293
  "epoch": 0.53,
294
- "learning_rate": 6.836317271654793e-05,
295
- "loss": 0.4695,
296
  "step": 240
297
  },
298
  {
299
  "epoch": 0.54,
300
- "learning_rate": 6.599552080919084e-05,
301
- "loss": 0.4723,
302
  "step": 245
303
  },
304
  {
305
  "epoch": 0.55,
306
- "learning_rate": 6.363097270890452e-05,
307
- "loss": 0.4357,
308
  "step": 250
309
  },
310
  {
311
  "epoch": 0.56,
312
- "learning_rate": 6.127234629234303e-05,
313
- "loss": 0.5521,
314
  "step": 255
315
  },
316
  {
317
  "epoch": 0.57,
318
- "learning_rate": 5.892245237918387e-05,
319
- "loss": 0.484,
320
  "step": 260
321
  },
322
  {
323
  "epoch": 0.58,
324
- "learning_rate": 5.6584091382409376e-05,
325
- "loss": 0.5221,
326
  "step": 265
327
  },
328
  {
329
  "epoch": 0.59,
330
- "learning_rate": 5.426004997100939e-05,
331
- "loss": 0.4865,
332
  "step": 270
333
  },
334
  {
335
  "epoch": 0.6,
336
- "learning_rate": 5.195309774905142e-05,
337
- "loss": 0.4395,
338
  "step": 275
339
  },
340
  {
341
- "epoch": 0.62,
342
- "learning_rate": 4.966598395510049e-05,
343
- "loss": 0.4693,
344
  "step": 280
345
  },
346
  {
347
- "epoch": 0.63,
348
- "learning_rate": 4.7401434185878876e-05,
349
- "loss": 0.5105,
350
  "step": 285
351
  },
352
  {
353
  "epoch": 0.64,
354
- "learning_rate": 4.516214714814733e-05,
355
- "loss": 0.406,
356
  "step": 290
357
  },
358
  {
359
  "epoch": 0.65,
360
- "learning_rate": 4.2950791442584565e-05,
361
- "loss": 0.4573,
362
  "step": 295
363
  },
364
  {
365
  "epoch": 0.66,
366
- "learning_rate": 4.0770002383574794e-05,
367
- "loss": 0.4153,
368
  "step": 300
369
  },
370
  {
371
  "epoch": 0.67,
372
- "learning_rate": 3.862237885865012e-05,
373
- "loss": 0.4713,
374
  "step": 305
375
  },
376
  {
377
  "epoch": 0.68,
378
- "learning_rate": 3.651048023135423e-05,
379
- "loss": 0.4615,
380
  "step": 310
381
  },
382
  {
383
  "epoch": 0.69,
384
- "learning_rate": 3.4436823291189834e-05,
385
- "loss": 0.4731,
386
  "step": 315
387
  },
388
  {
389
  "epoch": 0.7,
390
- "learning_rate": 3.2403879254320225e-05,
391
- "loss": 0.4743,
392
  "step": 320
393
  },
394
  {
395
  "epoch": 0.71,
396
- "learning_rate": 3.0414070818570786e-05,
397
- "loss": 0.4617,
398
  "step": 325
399
  },
400
  {
401
- "epoch": 0.73,
402
- "learning_rate": 2.84697692762624e-05,
403
- "loss": 0.4779,
404
  "step": 330
405
  },
406
  {
407
- "epoch": 0.74,
408
- "learning_rate": 2.657329168827975e-05,
409
- "loss": 0.4796,
410
  "step": 335
411
  },
412
  {
413
  "epoch": 0.75,
414
- "learning_rate": 2.4726898122808552e-05,
415
- "loss": 0.451,
416
  "step": 340
417
  },
418
  {
419
  "epoch": 0.76,
420
- "learning_rate": 2.2932788961952088e-05,
421
- "loss": 0.5316,
422
  "step": 345
423
  },
424
  {
425
  "epoch": 0.77,
426
- "learning_rate": 2.1193102279502168e-05,
427
- "loss": 0.4846,
428
  "step": 350
429
  },
430
  {
431
  "epoch": 0.78,
432
- "learning_rate": 1.9509911292953254e-05,
433
- "loss": 0.4435,
434
  "step": 355
435
  },
436
  {
437
  "epoch": 0.79,
438
- "learning_rate": 1.7885221892815502e-05,
439
- "loss": 0.4859,
440
  "step": 360
441
  },
442
  {
443
  "epoch": 0.8,
444
- "learning_rate": 1.6320970252148786e-05,
445
- "loss": 0.5439,
446
  "step": 365
447
  },
448
  {
449
  "epoch": 0.81,
450
- "learning_rate": 1.48190205191943e-05,
451
- "loss": 0.4491,
452
  "step": 370
453
  },
454
  {
455
  "epoch": 0.82,
456
- "learning_rate": 1.3381162595831284e-05,
457
- "loss": 0.4599,
458
  "step": 375
459
  },
460
  {
461
- "epoch": 0.84,
462
- "learning_rate": 1.2009110004522916e-05,
463
- "loss": 0.4688,
464
  "step": 380
465
  },
466
  {
467
- "epoch": 0.85,
468
- "learning_rate": 1.0704497846266602e-05,
469
- "loss": 0.445,
470
  "step": 385
471
  },
472
  {
473
  "epoch": 0.86,
474
- "learning_rate": 9.468880852029007e-06,
475
- "loss": 0.4632,
476
  "step": 390
477
  },
478
  {
479
  "epoch": 0.87,
480
- "learning_rate": 8.303731529931584e-06,
481
- "loss": 0.4699,
482
  "step": 395
483
  },
484
  {
485
  "epoch": 0.88,
486
- "learning_rate": 7.210438410440295e-06,
487
- "loss": 0.4994,
488
  "step": 400
489
  },
490
  {
491
  "epoch": 0.89,
492
- "learning_rate": 6.190304391625742e-06,
493
- "loss": 0.4727,
494
  "step": 405
495
  },
496
  {
497
  "epoch": 0.9,
498
- "learning_rate": 5.24454518647817e-06,
499
- "loss": 0.4766,
500
  "step": 410
501
  },
502
  {
503
  "epoch": 0.91,
504
- "learning_rate": 4.374287874114302e-06,
505
- "loss": 0.5309,
506
  "step": 415
507
  },
508
  {
509
  "epoch": 0.92,
510
- "learning_rate": 3.5805695566192774e-06,
511
- "loss": 0.4346,
512
  "step": 420
513
  },
514
  {
515
  "epoch": 0.93,
516
- "learning_rate": 2.8643361231110826e-06,
517
- "loss": 0.4458,
518
  "step": 425
519
  },
520
  {
521
- "epoch": 0.95,
522
- "learning_rate": 2.2264411225093805e-06,
523
- "loss": 0.416,
524
  "step": 430
525
  },
526
  {
527
- "epoch": 0.96,
528
- "learning_rate": 1.6676447463391393e-06,
529
- "loss": 0.524,
530
  "step": 435
531
  },
532
  {
533
- "epoch": 0.97,
534
- "learning_rate": 1.1886129228033631e-06,
535
- "loss": 0.4421,
536
  "step": 440
537
  },
538
  {
539
  "epoch": 0.98,
540
- "learning_rate": 7.89916523178351e-07,
541
- "loss": 0.4325,
542
  "step": 445
543
  },
544
  {
545
  "epoch": 0.99,
546
- "learning_rate": 4.7203068149803844e-07,
547
- "loss": 0.476,
548
  "step": 450
549
  },
550
  {
551
  "epoch": 1.0,
552
- "learning_rate": 2.3533422832716514e-07,
553
- "loss": 0.4506,
554
  "step": 455
555
  },
556
  {
557
  "epoch": 1.0,
558
- "eval_loss": 0.22213919460773468,
559
- "eval_runtime": 29.39,
560
- "eval_samples_per_second": 21.946,
561
- "eval_steps_per_second": 2.756,
562
- "step": 455
563
- },
564
- {
565
- "epoch": 1.0,
566
- "learning_rate": 0.00010892206830726497,
567
- "loss": 0.4359,
568
- "step": 460
569
- },
570
- {
571
- "epoch": 1.0,
572
- "eval_loss": 0.17033791542053223,
573
- "eval_runtime": 12.2909,
574
- "eval_samples_per_second": 48.898,
575
- "eval_steps_per_second": 6.183,
576
- "step": 460
577
  }
578
  ],
579
- "max_steps": 920,
580
  "num_train_epochs": 2,
581
- "total_flos": 479470878720000.0,
582
  "trial_name": null,
583
  "trial_params": null
584
  }
 
1
  {
2
+ "best_metric": 0.200975701212883,
3
+ "best_model_checkpoint": "output/eminem/checkpoint-456",
4
  "epoch": 1.0,
5
+ "global_step": 456,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
+ "learning_rate": 0.0001295937875943477,
13
+ "loss": 0.4175,
14
  "step": 5
15
  },
16
  {
17
  "epoch": 0.02,
18
+ "learning_rate": 0.0001306389012238537,
19
+ "loss": 0.4452,
20
  "step": 10
21
  },
22
  {
23
  "epoch": 0.03,
24
+ "learning_rate": 0.00013161040580202325,
25
+ "loss": 0.4523,
26
  "step": 15
27
  },
28
  {
29
  "epoch": 0.04,
30
+ "learning_rate": 0.00013250714864031736,
31
+ "loss": 0.4272,
32
  "step": 20
33
  },
34
  {
35
  "epoch": 0.05,
36
+ "learning_rate": 0.00013332806575487712,
37
+ "loss": 0.4701,
38
  "step": 25
39
  },
40
  {
41
  "epoch": 0.07,
42
+ "learning_rate": 0.00013407218312893365,
43
+ "loss": 0.4872,
44
  "step": 30
45
  },
46
  {
47
  "epoch": 0.08,
48
+ "learning_rate": 0.00013473861786848294,
49
+ "loss": 0.4231,
50
  "step": 35
51
  },
52
  {
53
  "epoch": 0.09,
54
+ "learning_rate": 0.00013532657924983333,
55
+ "loss": 0.4708,
56
  "step": 40
57
  },
58
  {
59
  "epoch": 0.1,
60
+ "learning_rate": 0.0001358353696578007,
61
+ "loss": 0.5047,
62
  "step": 45
63
  },
64
  {
65
  "epoch": 0.11,
66
+ "learning_rate": 0.00013626438541342652,
67
+ "loss": 0.4957,
68
  "step": 50
69
  },
70
  {
71
  "epoch": 0.12,
72
+ "learning_rate": 0.00013661311749024328,
73
+ "loss": 0.4333,
74
  "step": 55
75
  },
76
  {
77
  "epoch": 0.13,
78
+ "learning_rate": 0.0001368811521182315,
79
+ "loss": 0.4417,
80
  "step": 60
81
  },
82
  {
83
  "epoch": 0.14,
84
+ "learning_rate": 0.00013706817127475857,
85
+ "loss": 0.4644,
86
  "step": 65
87
  },
88
  {
89
  "epoch": 0.15,
90
+ "learning_rate": 0.00013717395306191163,
91
+ "loss": 0.5235,
92
  "step": 70
93
  },
94
  {
95
  "epoch": 0.16,
96
+ "learning_rate": 0.00013719837196977938,
97
+ "loss": 0.4143,
98
  "step": 75
99
  },
100
  {
101
  "epoch": 0.18,
102
+ "learning_rate": 0.00013714139902536895,
103
+ "loss": 0.4418,
104
  "step": 80
105
  },
106
  {
107
  "epoch": 0.19,
108
+ "learning_rate": 0.00013700310182698214,
109
+ "loss": 0.4862,
110
  "step": 85
111
  },
112
  {
113
  "epoch": 0.2,
114
+ "learning_rate": 0.0001367836444640114,
115
+ "loss": 0.5152,
116
  "step": 90
117
  },
118
  {
119
  "epoch": 0.21,
120
+ "learning_rate": 0.00013648328732224639,
121
+ "loss": 0.4401,
122
  "step": 95
123
  },
124
  {
125
  "epoch": 0.22,
126
+ "learning_rate": 0.00013610238677492728,
127
+ "loss": 0.4883,
128
  "step": 100
129
  },
130
  {
131
  "epoch": 0.23,
132
+ "learning_rate": 0.00013564139475990883,
133
+ "loss": 0.475,
134
  "step": 105
135
  },
136
  {
137
  "epoch": 0.24,
138
+ "learning_rate": 0.0001351008582434381,
139
+ "loss": 0.4708,
140
  "step": 110
141
  },
142
  {
143
  "epoch": 0.25,
144
+ "learning_rate": 0.00013448141857117668,
145
+ "loss": 0.5114,
146
  "step": 115
147
  },
148
  {
149
  "epoch": 0.26,
150
+ "learning_rate": 0.000133783810707247,
151
+ "loss": 0.4598,
152
  "step": 120
153
  },
154
  {
155
  "epoch": 0.27,
156
+ "learning_rate": 0.00013300886236219912,
157
+ "loss": 0.5016,
158
  "step": 125
159
  },
160
  {
161
  "epoch": 0.29,
162
+ "learning_rate": 0.00013215749301093531,
163
+ "loss": 0.5246,
164
  "step": 130
165
  },
166
  {
167
  "epoch": 0.3,
168
+ "learning_rate": 0.0001312307128017492,
169
+ "loss": 0.4599,
170
  "step": 135
171
  },
172
  {
173
  "epoch": 0.31,
174
+ "learning_rate": 0.00013022962135779,
175
+ "loss": 0.5193,
176
  "step": 140
177
  },
178
  {
179
  "epoch": 0.32,
180
+ "learning_rate": 0.0001291554064723639,
181
+ "loss": 0.4855,
182
  "step": 145
183
  },
184
  {
185
  "epoch": 0.33,
186
+ "learning_rate": 0.00012800934269961218,
187
+ "loss": 0.4923,
188
  "step": 150
189
  },
190
  {
191
  "epoch": 0.34,
192
+ "learning_rate": 0.00012679278984226595,
193
+ "loss": 0.5141,
194
  "step": 155
195
  },
196
  {
197
  "epoch": 0.35,
198
+ "learning_rate": 0.00012550719133822919,
199
+ "loss": 0.4847,
200
  "step": 160
201
  },
202
  {
203
  "epoch": 0.36,
204
+ "learning_rate": 0.0001241540725479539,
205
+ "loss": 0.4419,
206
  "step": 165
207
  },
208
  {
209
  "epoch": 0.37,
210
+ "learning_rate": 0.00012273503894459195,
211
+ "loss": 0.5324,
212
  "step": 170
213
  },
214
  {
215
  "epoch": 0.38,
216
+ "learning_rate": 0.00012125177420911749,
217
+ "loss": 0.4099,
218
  "step": 175
219
  },
220
  {
221
+ "epoch": 0.39,
222
+ "learning_rate": 0.00011970603823262598,
223
+ "loss": 0.4894,
224
  "step": 180
225
  },
226
  {
227
  "epoch": 0.41,
228
+ "learning_rate": 0.00011809966502824082,
229
+ "loss": 0.5617,
230
  "step": 185
231
  },
232
  {
233
  "epoch": 0.42,
234
+ "learning_rate": 0.00011643456055504982,
235
+ "loss": 0.5006,
236
  "step": 190
237
  },
238
  {
239
  "epoch": 0.43,
240
+ "learning_rate": 0.00011471270045669035,
241
+ "loss": 0.4947,
242
  "step": 195
243
  },
244
  {
245
  "epoch": 0.44,
246
+ "learning_rate": 0.00011293612771726151,
247
+ "loss": 0.5112,
248
  "step": 200
249
  },
250
  {
251
  "epoch": 0.45,
252
+ "learning_rate": 0.00011110695023730843,
253
+ "loss": 0.4745,
254
  "step": 205
255
  },
256
  {
257
  "epoch": 0.46,
258
+ "learning_rate": 0.00010922733833281926,
259
+ "loss": 0.4961,
260
  "step": 210
261
  },
262
  {
263
  "epoch": 0.47,
264
+ "learning_rate": 0.0001072995221601338,
265
+ "loss": 0.5159,
266
  "step": 215
267
  },
268
  {
269
  "epoch": 0.48,
270
+ "learning_rate": 0.00010532578906988555,
271
+ "loss": 0.4521,
272
  "step": 220
273
  },
274
  {
275
  "epoch": 0.49,
276
+ "learning_rate": 0.00010330848089304184,
277
+ "loss": 0.4683,
278
  "step": 225
279
  },
280
  {
281
+ "epoch": 0.5,
282
+ "learning_rate": 0.00010124999116234466,
283
+ "loss": 0.4694,
284
  "step": 230
285
  },
286
  {
287
  "epoch": 0.52,
288
+ "learning_rate": 9.915276227237154e-05,
289
+ "loss": 0.4838,
290
  "step": 235
291
  },
292
  {
293
  "epoch": 0.53,
294
+ "learning_rate": 9.701928258165896e-05,
295
+ "loss": 0.4934,
296
  "step": 240
297
  },
298
  {
299
  "epoch": 0.54,
300
+ "learning_rate": 9.485208346024501e-05,
301
+ "loss": 0.4964,
302
  "step": 245
303
  },
304
  {
305
  "epoch": 0.55,
306
+ "learning_rate": 9.265373628622407e-05,
307
+ "loss": 0.478,
308
  "step": 250
309
  },
310
  {
311
  "epoch": 0.56,
312
+ "learning_rate": 9.04268493947969e-05,
313
+ "loss": 0.4836,
314
  "step": 255
315
  },
316
  {
317
  "epoch": 0.57,
318
+ "learning_rate": 8.817406498348864e-05,
319
+ "loss": 0.4783,
320
  "step": 260
321
  },
322
  {
323
  "epoch": 0.58,
324
+ "learning_rate": 8.589805597719735e-05,
325
+ "loss": 0.5033,
326
  "step": 265
327
  },
328
  {
329
  "epoch": 0.59,
330
+ "learning_rate": 8.360152285675815e-05,
331
+ "loss": 0.4933,
332
  "step": 270
333
  },
334
  {
335
  "epoch": 0.6,
336
+ "learning_rate": 8.128719045483102e-05,
337
+ "loss": 0.4802,
338
  "step": 275
339
  },
340
  {
341
+ "epoch": 0.61,
342
+ "learning_rate": 7.895780472289125e-05,
343
+ "loss": 0.4608,
344
  "step": 280
345
  },
346
  {
347
+ "epoch": 0.62,
348
+ "learning_rate": 7.661612947317637e-05,
349
+ "loss": 0.451,
350
  "step": 285
351
  },
352
  {
353
  "epoch": 0.64,
354
+ "learning_rate": 7.426494309940237e-05,
355
+ "loss": 0.452,
356
  "step": 290
357
  },
358
  {
359
  "epoch": 0.65,
360
+ "learning_rate": 7.190703528022759e-05,
361
+ "loss": 0.4496,
362
  "step": 295
363
  },
364
  {
365
  "epoch": 0.66,
366
+ "learning_rate": 6.95452036692842e-05,
367
+ "loss": 0.4758,
368
  "step": 300
369
  },
370
  {
371
  "epoch": 0.67,
372
+ "learning_rate": 6.718225057579034e-05,
373
+ "loss": 0.4928,
374
  "step": 305
375
  },
376
  {
377
  "epoch": 0.68,
378
+ "learning_rate": 6.48209796395876e-05,
379
+ "loss": 0.5023,
380
  "step": 310
381
  },
382
  {
383
  "epoch": 0.69,
384
+ "learning_rate": 6.246419250465058e-05,
385
+ "loss": 0.426,
386
  "step": 315
387
  },
388
  {
389
  "epoch": 0.7,
390
+ "learning_rate": 6.011468549492541e-05,
391
+ "loss": 0.4651,
392
  "step": 320
393
  },
394
  {
395
  "epoch": 0.71,
396
+ "learning_rate": 5.777524629650007e-05,
397
+ "loss": 0.5082,
398
  "step": 325
399
  },
400
  {
401
+ "epoch": 0.72,
402
+ "learning_rate": 5.544865065003111e-05,
403
+ "loss": 0.4546,
404
  "step": 330
405
  },
406
  {
407
+ "epoch": 0.73,
408
+ "learning_rate": 5.313765905731657e-05,
409
+ "loss": 0.4512,
410
  "step": 335
411
  },
412
  {
413
  "epoch": 0.75,
414
+ "learning_rate": 5.084501350596927e-05,
415
+ "loss": 0.4794,
416
  "step": 340
417
  },
418
  {
419
  "epoch": 0.76,
420
+ "learning_rate": 4.857343421605311e-05,
421
+ "loss": 0.4781,
422
  "step": 345
423
  },
424
  {
425
  "epoch": 0.77,
426
+ "learning_rate": 4.63256164125579e-05,
427
+ "loss": 0.5233,
428
  "step": 350
429
  },
430
  {
431
  "epoch": 0.78,
432
+ "learning_rate": 4.410422712750424e-05,
433
+ "loss": 0.4695,
434
  "step": 355
435
  },
436
  {
437
  "epoch": 0.79,
438
+ "learning_rate": 4.191190203551854e-05,
439
+ "loss": 0.4788,
440
  "step": 360
441
  },
442
  {
443
  "epoch": 0.8,
444
+ "learning_rate": 3.975124232661141e-05,
445
+ "loss": 0.4318,
446
  "step": 365
447
  },
448
  {
449
  "epoch": 0.81,
450
+ "learning_rate": 3.762481161987185e-05,
451
+ "loss": 0.4609,
452
  "step": 370
453
  },
454
  {
455
  "epoch": 0.82,
456
+ "learning_rate": 3.553513292174085e-05,
457
+ "loss": 0.4854,
458
  "step": 375
459
  },
460
  {
461
+ "epoch": 0.83,
462
+ "learning_rate": 3.348468563245461e-05,
463
+ "loss": 0.4337,
464
  "step": 380
465
  },
466
  {
467
+ "epoch": 0.84,
468
+ "learning_rate": 3.1475902604251e-05,
469
+ "loss": 0.4707,
470
  "step": 385
471
  },
472
  {
473
  "epoch": 0.86,
474
+ "learning_rate": 2.951116725479596e-05,
475
+ "loss": 0.4394,
476
  "step": 390
477
  },
478
  {
479
  "epoch": 0.87,
480
+ "learning_rate": 2.7592810739257415e-05,
481
+ "loss": 0.5088,
482
  "step": 395
483
  },
484
  {
485
  "epoch": 0.88,
486
+ "learning_rate": 2.572310918439686e-05,
487
+ "loss": 0.4753,
488
  "step": 400
489
  },
490
  {
491
  "epoch": 0.89,
492
+ "learning_rate": 2.3904280987944108e-05,
493
+ "loss": 0.4626,
494
  "step": 405
495
  },
496
  {
497
  "epoch": 0.9,
498
+ "learning_rate": 2.2138484186474054e-05,
499
+ "loss": 0.4473,
500
  "step": 410
501
  },
502
  {
503
  "epoch": 0.91,
504
+ "learning_rate": 2.0427813894908452e-05,
505
+ "loss": 0.4662,
506
  "step": 415
507
  },
508
  {
509
  "epoch": 0.92,
510
+ "learning_rate": 1.877429982065378e-05,
511
+ "loss": 0.4383,
512
  "step": 420
513
  },
514
  {
515
  "epoch": 0.93,
516
+ "learning_rate": 1.7179903855360063e-05,
517
+ "loss": 0.4584,
518
  "step": 425
519
  },
520
  {
521
+ "epoch": 0.94,
522
+ "learning_rate": 1.564651774714127e-05,
523
+ "loss": 0.4932,
524
  "step": 430
525
  },
526
  {
527
+ "epoch": 0.95,
528
+ "learning_rate": 1.4175960856020567e-05,
529
+ "loss": 0.4168,
530
  "step": 435
531
  },
532
  {
533
+ "epoch": 0.96,
534
+ "learning_rate": 1.2769977995264743e-05,
535
+ "loss": 0.5093,
536
  "step": 440
537
  },
538
  {
539
  "epoch": 0.98,
540
+ "learning_rate": 1.1430237361156786e-05,
541
+ "loss": 0.486,
542
  "step": 445
543
  },
544
  {
545
  "epoch": 0.99,
546
+ "learning_rate": 1.0158328553691274e-05,
547
+ "loss": 0.4456,
548
  "step": 450
549
  },
550
  {
551
  "epoch": 1.0,
552
+ "learning_rate": 8.95576069051646e-06,
553
+ "loss": 0.4546,
554
  "step": 455
555
  },
556
  {
557
  "epoch": 1.0,
558
+ "eval_loss": 0.200975701212883,
559
+ "eval_runtime": 14.7931,
560
+ "eval_samples_per_second": 42.993,
561
+ "eval_steps_per_second": 5.408,
562
+ "step": 456
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  }
564
  ],
565
+ "max_steps": 912,
566
  "num_train_epochs": 2,
567
+ "total_flos": 475682144256000.0,
568
  "trial_name": null,
569
  "trial_params": null
570
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2478f6ac251ccbd0eb17112df1fdac929c3ed4c313a16aa904d226b8a9b10b5
3
  size 3247
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b43ec02b3eaa28a980d2e5f3e7ecf8dc4639ef71a3a66137055cf261993e8b5
3
  size 3247