hllj commited on
Commit
c04ad42
1 Parent(s): 58a4b9e

Model save

Browse files
README.md CHANGED
@@ -14,7 +14,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  This model is a fine-tuned version of [hllj/mistral-vi-math](https://huggingface.co/hllj/mistral-vi-math) on an unknown dataset.
16
  It achieves the following results on the evaluation set:
17
- - Loss: 0.5125
18
 
19
  ## Model description
20
 
@@ -48,7 +48,7 @@ The following hyperparameters were used during training:
48
 
49
  | Training Loss | Epoch | Step | Validation Loss |
50
  |:-------------:|:-----:|:----:|:---------------:|
51
- | 0.312 | 1.02 | 1000 | 0.5086 |
52
 
53
 
54
  ### Framework versions
 
14
 
15
  This model is a fine-tuned version of [hllj/mistral-vi-math](https://huggingface.co/hllj/mistral-vi-math) on an unknown dataset.
16
  It achieves the following results on the evaluation set:
17
+ - Loss: 0.4106
18
 
19
  ## Model description
20
 
 
48
 
49
  | Training Loss | Epoch | Step | Validation Loss |
50
  |:-------------:|:-----:|:----:|:---------------:|
51
+ | 0.2982 | 1.02 | 1000 | 0.4478 |
52
 
53
 
54
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c2113d68bd5d5b38d9522e594a4df207d0456286a567467bf198c8a09ee501e
3
  size 872450448
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a660c4ff8e831008249c3261c350bab867924823e6cbb2490b8294518e13a60
3
  size 872450448
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 1.45,
3
- "eval_loss": 0.5124574303627014,
4
- "eval_runtime": 6.883,
5
  "eval_samples": 140,
6
- "eval_samples_per_second": 20.34,
7
- "eval_steps_per_second": 5.085,
8
- "train_loss": 0.32689529290324765,
9
- "train_runtime": 3915.3355,
10
- "train_samples": 8517,
11
- "train_samples_per_second": 4.351,
12
- "train_steps_per_second": 1.088
13
  }
 
1
  {
2
+ "epoch": 1.44,
3
+ "eval_loss": 0.41060009598731995,
4
+ "eval_runtime": 6.7722,
5
  "eval_samples": 140,
6
+ "eval_samples_per_second": 20.673,
7
+ "eval_steps_per_second": 5.168,
8
+ "train_loss": 0.3275021193364904,
9
+ "train_runtime": 3966.2406,
10
+ "train_samples": 8657,
11
+ "train_samples_per_second": 4.365,
12
+ "train_steps_per_second": 1.092
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.45,
3
- "eval_loss": 0.5124574303627014,
4
- "eval_runtime": 6.883,
5
  "eval_samples": 140,
6
- "eval_samples_per_second": 20.34,
7
- "eval_steps_per_second": 5.085
8
  }
 
1
  {
2
+ "epoch": 1.44,
3
+ "eval_loss": 0.41060009598731995,
4
+ "eval_runtime": 6.7722,
5
  "eval_samples": 140,
6
+ "eval_samples_per_second": 20.673,
7
+ "eval_steps_per_second": 5.168
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.45,
3
- "train_loss": 0.32689529290324765,
4
- "train_runtime": 3915.3355,
5
- "train_samples": 8517,
6
- "train_samples_per_second": 4.351,
7
- "train_steps_per_second": 1.088
8
  }
 
1
  {
2
+ "epoch": 1.44,
3
+ "train_loss": 0.3275021193364904,
4
+ "train_runtime": 3966.2406,
5
+ "train_samples": 8657,
6
+ "train_samples_per_second": 4.365,
7
+ "train_steps_per_second": 1.092
8
  }
trainer_state.json CHANGED
@@ -1,1182 +1,1194 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.4460093896713615,
5
  "eval_steps": 1000,
6
- "global_step": 1900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 1.4084507042253522e-07,
14
- "loss": 0.6105,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.0,
19
- "learning_rate": 1.4084507042253521e-06,
20
- "loss": 0.7205,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.01,
25
- "learning_rate": 2.8169014084507042e-06,
26
- "loss": 0.694,
27
  "step": 20
28
  },
29
  {
30
  "epoch": 0.01,
31
- "learning_rate": 4.225352112676057e-06,
32
- "loss": 0.6913,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.02,
37
- "learning_rate": 5.6338028169014084e-06,
38
- "loss": 0.5948,
39
  "step": 40
40
  },
41
  {
42
  "epoch": 0.02,
43
- "learning_rate": 7.042253521126761e-06,
44
- "loss": 0.5666,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.03,
49
- "learning_rate": 8.450704225352114e-06,
50
- "loss": 0.4994,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.03,
55
- "learning_rate": 9.859154929577466e-06,
56
- "loss": 0.4599,
57
  "step": 70
58
  },
59
  {
60
  "epoch": 0.04,
61
- "learning_rate": 1.1267605633802817e-05,
62
- "loss": 0.4061,
63
  "step": 80
64
  },
65
  {
66
  "epoch": 0.04,
67
- "learning_rate": 1.267605633802817e-05,
68
- "loss": 0.4111,
69
  "step": 90
70
  },
71
  {
72
  "epoch": 0.05,
73
- "learning_rate": 1.4084507042253522e-05,
74
- "loss": 0.3984,
75
  "step": 100
76
  },
77
  {
78
  "epoch": 0.05,
79
- "learning_rate": 1.5492957746478876e-05,
80
- "loss": 0.3893,
81
  "step": 110
82
  },
83
  {
84
  "epoch": 0.06,
85
- "learning_rate": 1.6901408450704228e-05,
86
- "loss": 0.4009,
87
  "step": 120
88
  },
89
  {
90
  "epoch": 0.06,
91
- "learning_rate": 1.830985915492958e-05,
92
- "loss": 0.4021,
93
  "step": 130
94
  },
95
  {
96
- "epoch": 0.07,
97
- "learning_rate": 1.9718309859154933e-05,
98
- "loss": 0.4013,
99
  "step": 140
100
  },
101
  {
102
  "epoch": 0.07,
103
- "learning_rate": 2.112676056338028e-05,
104
- "loss": 0.3468,
105
  "step": 150
106
  },
107
  {
108
- "epoch": 0.08,
109
- "learning_rate": 2.2535211267605634e-05,
110
- "loss": 0.3506,
111
  "step": 160
112
  },
113
  {
114
  "epoch": 0.08,
115
- "learning_rate": 2.3943661971830986e-05,
116
- "loss": 0.3765,
117
  "step": 170
118
  },
119
  {
120
  "epoch": 0.08,
121
- "learning_rate": 2.535211267605634e-05,
122
- "loss": 0.3766,
123
  "step": 180
124
  },
125
  {
126
  "epoch": 0.09,
127
- "learning_rate": 2.676056338028169e-05,
128
- "loss": 0.3583,
129
  "step": 190
130
  },
131
  {
132
  "epoch": 0.09,
133
- "learning_rate": 2.8169014084507043e-05,
134
- "loss": 0.3613,
135
  "step": 200
136
  },
137
  {
138
  "epoch": 0.1,
139
- "learning_rate": 2.9577464788732395e-05,
140
- "loss": 0.3604,
141
  "step": 210
142
  },
143
  {
144
  "epoch": 0.1,
145
- "learning_rate": 2.9999778542898527e-05,
146
- "loss": 0.3396,
147
  "step": 220
148
  },
149
  {
150
  "epoch": 0.11,
151
- "learning_rate": 2.9998693870796316e-05,
152
- "loss": 0.3655,
153
  "step": 230
154
  },
155
  {
156
  "epoch": 0.11,
157
- "learning_rate": 2.9996705373180166e-05,
158
- "loss": 0.3373,
159
  "step": 240
160
  },
161
  {
162
  "epoch": 0.12,
163
- "learning_rate": 2.9993813169877495e-05,
164
- "loss": 0.3397,
165
  "step": 250
166
  },
167
  {
168
  "epoch": 0.12,
169
- "learning_rate": 2.9990017435173293e-05,
170
- "loss": 0.3497,
171
  "step": 260
172
  },
173
  {
174
- "epoch": 0.13,
175
- "learning_rate": 2.9985318397799606e-05,
176
- "loss": 0.3548,
177
  "step": 270
178
  },
179
  {
180
  "epoch": 0.13,
181
- "learning_rate": 2.9979716340921736e-05,
182
- "loss": 0.3584,
183
  "step": 280
184
  },
185
  {
186
- "epoch": 0.14,
187
- "learning_rate": 2.997321160212122e-05,
188
- "loss": 0.3477,
189
  "step": 290
190
  },
191
  {
192
  "epoch": 0.14,
193
- "learning_rate": 2.996580457337544e-05,
194
- "loss": 0.3731,
195
  "step": 300
196
  },
197
  {
198
- "epoch": 0.15,
199
- "learning_rate": 2.9957495701034037e-05,
200
- "loss": 0.3688,
201
  "step": 310
202
  },
203
  {
204
  "epoch": 0.15,
205
- "learning_rate": 2.9948285485792e-05,
206
- "loss": 0.3399,
207
  "step": 320
208
  },
209
  {
210
  "epoch": 0.15,
211
- "learning_rate": 2.993817448265948e-05,
212
- "loss": 0.3419,
213
  "step": 330
214
  },
215
  {
216
  "epoch": 0.16,
217
- "learning_rate": 2.992716330092839e-05,
218
- "loss": 0.3663,
219
  "step": 340
220
  },
221
  {
222
  "epoch": 0.16,
223
- "learning_rate": 2.9915252604135618e-05,
224
- "loss": 0.358,
225
  "step": 350
226
  },
227
  {
228
  "epoch": 0.17,
229
- "learning_rate": 2.9902443110023127e-05,
230
- "loss": 0.3332,
231
  "step": 360
232
  },
233
  {
234
  "epoch": 0.17,
235
- "learning_rate": 2.9888735590494616e-05,
236
- "loss": 0.3315,
237
  "step": 370
238
  },
239
  {
240
  "epoch": 0.18,
241
- "learning_rate": 2.9874130871569087e-05,
242
- "loss": 0.3519,
243
  "step": 380
244
  },
245
  {
246
  "epoch": 0.18,
247
- "learning_rate": 2.9858629833331002e-05,
248
- "loss": 0.3876,
249
  "step": 390
250
  },
251
  {
252
- "epoch": 0.19,
253
- "learning_rate": 2.9842233409877296e-05,
254
- "loss": 0.3219,
255
  "step": 400
256
  },
257
  {
258
  "epoch": 0.19,
259
- "learning_rate": 2.9824942589261053e-05,
260
- "loss": 0.3465,
261
  "step": 410
262
  },
263
  {
264
- "epoch": 0.2,
265
- "learning_rate": 2.9806758413431997e-05,
266
- "loss": 0.3564,
267
  "step": 420
268
  },
269
  {
270
  "epoch": 0.2,
271
- "learning_rate": 2.978768197817368e-05,
272
- "loss": 0.3719,
273
  "step": 430
274
  },
275
  {
276
- "epoch": 0.21,
277
- "learning_rate": 2.976771443303745e-05,
278
- "loss": 0.345,
279
  "step": 440
280
  },
281
  {
282
  "epoch": 0.21,
283
- "learning_rate": 2.974685698127321e-05,
284
- "loss": 0.325,
285
  "step": 450
286
  },
287
  {
288
- "epoch": 0.22,
289
- "learning_rate": 2.9725110879756868e-05,
290
- "loss": 0.3461,
291
  "step": 460
292
  },
293
  {
294
  "epoch": 0.22,
295
- "learning_rate": 2.9702477438914617e-05,
296
- "loss": 0.3338,
297
  "step": 470
298
  },
299
  {
300
- "epoch": 0.23,
301
- "learning_rate": 2.9678958022643983e-05,
302
- "loss": 0.3533,
303
  "step": 480
304
  },
305
  {
306
  "epoch": 0.23,
307
- "learning_rate": 2.9654554048231597e-05,
308
- "loss": 0.3274,
309
  "step": 490
310
  },
311
  {
312
  "epoch": 0.23,
313
- "learning_rate": 2.9629266986267835e-05,
314
- "loss": 0.3423,
315
  "step": 500
316
  },
317
  {
318
  "epoch": 0.24,
319
- "learning_rate": 2.9603098360558167e-05,
320
- "loss": 0.3256,
321
  "step": 510
322
  },
323
  {
324
  "epoch": 0.24,
325
- "learning_rate": 2.957604974803134e-05,
326
- "loss": 0.33,
327
  "step": 520
328
  },
329
  {
330
- "epoch": 0.25,
331
- "learning_rate": 2.9548122778644357e-05,
332
- "loss": 0.3108,
333
  "step": 530
334
  },
335
  {
336
  "epoch": 0.25,
337
- "learning_rate": 2.9519319135284252e-05,
338
- "loss": 0.3473,
339
  "step": 540
340
  },
341
  {
342
- "epoch": 0.26,
343
- "learning_rate": 2.9489640553666687e-05,
344
- "loss": 0.3242,
345
  "step": 550
346
  },
347
  {
348
  "epoch": 0.26,
349
- "learning_rate": 2.945908882223134e-05,
350
- "loss": 0.3455,
351
  "step": 560
352
  },
353
  {
354
- "epoch": 0.27,
355
- "learning_rate": 2.9427665782034143e-05,
356
- "loss": 0.3417,
357
  "step": 570
358
  },
359
  {
360
  "epoch": 0.27,
361
- "learning_rate": 2.9395373326636344e-05,
362
- "loss": 0.3262,
363
  "step": 580
364
  },
365
  {
366
- "epoch": 0.28,
367
- "learning_rate": 2.9362213401990395e-05,
368
- "loss": 0.3507,
369
  "step": 590
370
  },
371
  {
372
  "epoch": 0.28,
373
- "learning_rate": 2.9328188006322693e-05,
374
- "loss": 0.3396,
375
  "step": 600
376
  },
377
  {
378
- "epoch": 0.29,
379
- "learning_rate": 2.9293299190013143e-05,
380
- "loss": 0.3283,
381
  "step": 610
382
  },
383
  {
384
  "epoch": 0.29,
385
- "learning_rate": 2.9257549055471645e-05,
386
- "loss": 0.3119,
387
  "step": 620
388
  },
389
  {
390
- "epoch": 0.3,
391
- "learning_rate": 2.9220939757011366e-05,
392
- "loss": 0.338,
393
  "step": 630
394
  },
395
  {
396
  "epoch": 0.3,
397
- "learning_rate": 2.9183473500718938e-05,
398
- "loss": 0.3285,
399
  "step": 640
400
  },
401
  {
402
- "epoch": 0.31,
403
- "learning_rate": 2.9145152544321504e-05,
404
- "loss": 0.3293,
405
  "step": 650
406
  },
407
  {
408
- "epoch": 0.31,
409
- "learning_rate": 2.9105979197050683e-05,
410
- "loss": 0.3528,
411
  "step": 660
412
  },
413
  {
414
  "epoch": 0.31,
415
- "learning_rate": 2.906595581950341e-05,
416
- "loss": 0.3183,
417
  "step": 670
418
  },
419
  {
420
- "epoch": 0.32,
421
- "learning_rate": 2.902508482349968e-05,
422
- "loss": 0.3593,
423
  "step": 680
424
  },
425
  {
426
  "epoch": 0.32,
427
- "learning_rate": 2.898336867193721e-05,
428
- "loss": 0.3371,
429
  "step": 690
430
  },
431
  {
432
- "epoch": 0.33,
433
- "learning_rate": 2.8940809878643038e-05,
434
- "loss": 0.3313,
435
  "step": 700
436
  },
437
  {
438
  "epoch": 0.33,
439
- "learning_rate": 2.8897411008222026e-05,
440
- "loss": 0.3298,
441
  "step": 710
442
  },
443
  {
444
- "epoch": 0.34,
445
- "learning_rate": 2.8853174675902323e-05,
446
- "loss": 0.3366,
447
  "step": 720
448
  },
449
  {
450
  "epoch": 0.34,
451
- "learning_rate": 2.8808103547377754e-05,
452
- "loss": 0.3408,
453
  "step": 730
454
  },
455
  {
456
- "epoch": 0.35,
457
- "learning_rate": 2.8762200338647222e-05,
458
- "loss": 0.3328,
459
  "step": 740
460
  },
461
  {
462
  "epoch": 0.35,
463
- "learning_rate": 2.8715467815850994e-05,
464
- "loss": 0.3235,
465
  "step": 750
466
  },
467
  {
468
- "epoch": 0.36,
469
- "learning_rate": 2.8667908795104053e-05,
470
- "loss": 0.3455,
471
  "step": 760
472
  },
473
  {
474
  "epoch": 0.36,
475
- "learning_rate": 2.8619526142326367e-05,
476
- "loss": 0.322,
477
  "step": 770
478
  },
479
  {
480
- "epoch": 0.37,
481
- "learning_rate": 2.8570322773070217e-05,
482
- "loss": 0.3367,
483
  "step": 780
484
  },
485
  {
486
- "epoch": 0.37,
487
- "learning_rate": 2.8520301652344476e-05,
488
- "loss": 0.3576,
489
  "step": 790
490
  },
491
  {
492
- "epoch": 0.38,
493
- "learning_rate": 2.8469465794435965e-05,
494
- "loss": 0.332,
495
  "step": 800
496
  },
497
  {
498
- "epoch": 0.38,
499
- "learning_rate": 2.8417818262727784e-05,
500
- "loss": 0.3156,
501
  "step": 810
502
  },
503
  {
504
  "epoch": 0.38,
505
- "learning_rate": 2.8365362169514726e-05,
506
- "loss": 0.3305,
507
  "step": 820
508
  },
509
  {
510
- "epoch": 0.39,
511
- "learning_rate": 2.8312100675815736e-05,
512
- "loss": 0.3238,
513
  "step": 830
514
  },
515
  {
516
  "epoch": 0.39,
517
- "learning_rate": 2.8258036991183414e-05,
518
- "loss": 0.3092,
519
  "step": 840
520
  },
521
  {
522
- "epoch": 0.4,
523
- "learning_rate": 2.8203174373510617e-05,
524
- "loss": 0.3503,
525
  "step": 850
526
  },
527
  {
528
  "epoch": 0.4,
529
- "learning_rate": 2.8147516128834116e-05,
530
- "loss": 0.3112,
531
  "step": 860
532
  },
533
  {
534
- "epoch": 0.41,
535
- "learning_rate": 2.809106561113541e-05,
536
- "loss": 0.3307,
537
  "step": 870
538
  },
539
  {
540
  "epoch": 0.41,
541
- "learning_rate": 2.803382622213857e-05,
542
- "loss": 0.3317,
543
  "step": 880
544
  },
545
  {
546
- "epoch": 0.42,
547
- "learning_rate": 2.7975801411105307e-05,
548
- "loss": 0.3328,
549
  "step": 890
550
  },
551
  {
552
  "epoch": 0.42,
553
- "learning_rate": 2.7916994674627045e-05,
554
- "loss": 0.3301,
555
  "step": 900
556
  },
557
  {
558
- "epoch": 0.43,
559
- "learning_rate": 2.7857409556414283e-05,
560
- "loss": 0.3271,
561
  "step": 910
562
  },
563
  {
564
- "epoch": 0.43,
565
- "learning_rate": 2.7797049647083016e-05,
566
- "loss": 0.3154,
567
  "step": 920
568
  },
569
  {
570
- "epoch": 0.44,
571
- "learning_rate": 2.7735918583938363e-05,
572
- "loss": 0.328,
573
  "step": 930
574
  },
575
  {
576
- "epoch": 0.44,
577
- "learning_rate": 2.76740200507554e-05,
578
- "loss": 0.3193,
579
  "step": 940
580
  },
581
  {
582
- "epoch": 0.45,
583
- "learning_rate": 2.761135777755715e-05,
584
- "loss": 0.3222,
585
  "step": 950
586
  },
587
  {
588
- "epoch": 1.0,
589
- "learning_rate": 2.7547935540389843e-05,
590
- "loss": 0.3022,
591
  "step": 960
592
  },
593
  {
594
- "epoch": 1.01,
595
- "learning_rate": 2.748375716109533e-05,
596
- "loss": 0.3119,
597
  "step": 970
598
  },
599
  {
600
  "epoch": 1.01,
601
- "learning_rate": 2.7418826507080818e-05,
602
- "loss": 0.3019,
603
  "step": 980
604
  },
605
  {
606
- "epoch": 1.02,
607
- "learning_rate": 2.7353147491085785e-05,
608
- "loss": 0.3181,
609
  "step": 990
610
  },
611
  {
612
  "epoch": 1.02,
613
- "learning_rate": 2.728672407094622e-05,
614
- "loss": 0.312,
615
  "step": 1000
616
  },
617
  {
618
  "epoch": 1.02,
619
- "eval_loss": 0.5085553526878357,
620
- "eval_runtime": 6.8207,
621
- "eval_samples_per_second": 20.526,
622
- "eval_steps_per_second": 5.131,
623
  "step": 1000
624
  },
625
  {
626
- "epoch": 1.03,
627
- "learning_rate": 2.7219560249356125e-05,
628
- "loss": 0.2941,
629
  "step": 1010
630
  },
631
  {
632
  "epoch": 1.03,
633
- "learning_rate": 2.7151660073626283e-05,
634
- "loss": 0.2852,
635
  "step": 1020
636
  },
637
  {
638
- "epoch": 1.04,
639
- "learning_rate": 2.7083027635440392e-05,
640
- "loss": 0.3113,
641
  "step": 1030
642
  },
643
  {
644
  "epoch": 1.04,
645
- "learning_rate": 2.7013667070608502e-05,
646
- "loss": 0.2969,
647
  "step": 1040
648
  },
649
  {
650
- "epoch": 1.05,
651
- "learning_rate": 2.6943582558817764e-05,
652
- "loss": 0.2912,
653
  "step": 1050
654
  },
655
  {
656
  "epoch": 1.05,
657
- "learning_rate": 2.6872778323380585e-05,
658
- "loss": 0.2851,
659
  "step": 1060
660
  },
661
  {
662
- "epoch": 1.06,
663
- "learning_rate": 2.6801258630980117e-05,
664
- "loss": 0.3045,
665
  "step": 1070
666
  },
667
  {
668
- "epoch": 1.06,
669
- "learning_rate": 2.6729027791413154e-05,
670
- "loss": 0.3157,
671
  "step": 1080
672
  },
673
  {
674
- "epoch": 1.07,
675
- "learning_rate": 2.6656090157330424e-05,
676
- "loss": 0.2968,
677
  "step": 1090
678
  },
679
  {
680
- "epoch": 1.07,
681
- "learning_rate": 2.6582450123974278e-05,
682
- "loss": 0.2832,
683
  "step": 1100
684
  },
685
  {
686
- "epoch": 1.08,
687
- "learning_rate": 2.650811212891385e-05,
688
- "loss": 0.32,
689
  "step": 1110
690
  },
691
  {
692
- "epoch": 1.08,
693
- "learning_rate": 2.6433080651777655e-05,
694
- "loss": 0.2936,
695
  "step": 1120
696
  },
697
  {
698
  "epoch": 1.08,
699
- "learning_rate": 2.635736021398361e-05,
700
- "loss": 0.3094,
701
  "step": 1130
702
  },
703
  {
704
- "epoch": 1.09,
705
- "learning_rate": 2.628095537846661e-05,
706
- "loss": 0.3073,
707
  "step": 1140
708
  },
709
  {
710
  "epoch": 1.09,
711
- "learning_rate": 2.6203870749403553e-05,
712
- "loss": 0.3067,
713
  "step": 1150
714
  },
715
  {
716
- "epoch": 1.1,
717
- "learning_rate": 2.6126110971935878e-05,
718
- "loss": 0.309,
719
  "step": 1160
720
  },
721
  {
722
  "epoch": 1.1,
723
- "learning_rate": 2.604768073188966e-05,
724
- "loss": 0.2851,
725
  "step": 1170
726
  },
727
  {
728
- "epoch": 1.11,
729
- "learning_rate": 2.5968584755493233e-05,
730
- "loss": 0.3074,
731
  "step": 1180
732
  },
733
  {
734
  "epoch": 1.11,
735
- "learning_rate": 2.5888827809092406e-05,
736
- "loss": 0.3012,
737
  "step": 1190
738
  },
739
  {
740
- "epoch": 1.12,
741
- "learning_rate": 2.5808414698863205e-05,
742
- "loss": 0.3042,
743
  "step": 1200
744
  },
745
  {
746
  "epoch": 1.12,
747
- "learning_rate": 2.5727350270522293e-05,
748
- "loss": 0.3072,
749
  "step": 1210
750
  },
751
  {
752
- "epoch": 1.13,
753
- "learning_rate": 2.5645639409034935e-05,
754
- "loss": 0.2948,
755
  "step": 1220
756
  },
757
  {
758
- "epoch": 1.13,
759
- "learning_rate": 2.5563287038320635e-05,
760
- "loss": 0.3042,
761
  "step": 1230
762
  },
763
  {
764
- "epoch": 1.14,
765
- "learning_rate": 2.548029812095644e-05,
766
- "loss": 0.3112,
767
  "step": 1240
768
  },
769
  {
770
- "epoch": 1.14,
771
- "learning_rate": 2.539667765787786e-05,
772
- "loss": 0.3213,
773
  "step": 1250
774
  },
775
  {
776
- "epoch": 1.15,
777
- "learning_rate": 2.531243068807754e-05,
778
- "loss": 0.2931,
779
  "step": 1260
780
  },
781
  {
782
- "epoch": 1.15,
783
- "learning_rate": 2.522756228830158e-05,
784
- "loss": 0.2802,
785
  "step": 1270
786
  },
787
  {
788
  "epoch": 1.15,
789
- "learning_rate": 2.5142077572743643e-05,
790
- "loss": 0.3049,
791
  "step": 1280
792
  },
793
  {
794
- "epoch": 1.16,
795
- "learning_rate": 2.5055981692736758e-05,
796
- "loss": 0.3234,
797
  "step": 1290
798
  },
799
  {
800
  "epoch": 1.16,
801
- "learning_rate": 2.4969279836442868e-05,
802
- "loss": 0.286,
803
  "step": 1300
804
  },
805
  {
806
- "epoch": 1.17,
807
- "learning_rate": 2.4881977228540243e-05,
808
- "loss": 0.3099,
809
  "step": 1310
810
  },
811
  {
812
  "epoch": 1.17,
813
- "learning_rate": 2.4794079129908606e-05,
814
- "loss": 0.2811,
815
  "step": 1320
816
  },
817
  {
818
- "epoch": 1.18,
819
- "learning_rate": 2.470559083731212e-05,
820
- "loss": 0.3202,
821
  "step": 1330
822
  },
823
  {
824
  "epoch": 1.18,
825
- "learning_rate": 2.4616517683080197e-05,
826
- "loss": 0.3031,
827
  "step": 1340
828
  },
829
  {
830
- "epoch": 1.19,
831
- "learning_rate": 2.4526865034786184e-05,
832
- "loss": 0.2663,
833
  "step": 1350
834
  },
835
  {
836
- "epoch": 1.19,
837
- "learning_rate": 2.4436638294923902e-05,
838
- "loss": 0.2946,
839
  "step": 1360
840
  },
841
  {
842
- "epoch": 1.2,
843
- "learning_rate": 2.4345842900582084e-05,
844
- "loss": 0.2625,
845
  "step": 1370
846
  },
847
  {
848
- "epoch": 1.2,
849
- "learning_rate": 2.4254484323116746e-05,
850
- "loss": 0.2953,
851
  "step": 1380
852
  },
853
  {
854
- "epoch": 1.21,
855
- "learning_rate": 2.4162568067821478e-05,
856
- "loss": 0.3124,
857
  "step": 1390
858
  },
859
  {
860
- "epoch": 1.21,
861
- "learning_rate": 2.4070099673595696e-05,
862
- "loss": 0.3166,
863
  "step": 1400
864
  },
865
  {
866
- "epoch": 1.22,
867
- "learning_rate": 2.3977084712610862e-05,
868
- "loss": 0.3096,
869
  "step": 1410
870
  },
871
  {
872
- "epoch": 1.22,
873
- "learning_rate": 2.3883528789974703e-05,
874
- "loss": 0.3054,
875
  "step": 1420
876
  },
877
  {
878
- "epoch": 1.23,
879
- "learning_rate": 2.3789437543393446e-05,
880
- "loss": 0.3024,
881
  "step": 1430
882
  },
883
  {
884
- "epoch": 1.23,
885
- "learning_rate": 2.3694816642832087e-05,
886
- "loss": 0.2855,
887
  "step": 1440
888
  },
889
  {
890
  "epoch": 1.23,
891
- "learning_rate": 2.3599671790172738e-05,
892
- "loss": 0.2768,
893
  "step": 1450
894
  },
895
  {
896
- "epoch": 1.24,
897
- "learning_rate": 2.3504008718870983e-05,
898
- "loss": 0.289,
899
  "step": 1460
900
  },
901
  {
902
  "epoch": 1.24,
903
- "learning_rate": 2.3407833193610427e-05,
904
- "loss": 0.2805,
905
  "step": 1470
906
  },
907
  {
908
- "epoch": 1.25,
909
- "learning_rate": 2.3311151009955297e-05,
910
- "loss": 0.2729,
911
  "step": 1480
912
  },
913
  {
914
- "epoch": 1.25,
915
- "learning_rate": 2.3213967994001185e-05,
916
- "loss": 0.2649,
917
  "step": 1490
918
  },
919
  {
920
- "epoch": 1.26,
921
- "learning_rate": 2.3116290002023982e-05,
922
- "loss": 0.2858,
923
  "step": 1500
924
  },
925
  {
926
- "epoch": 1.26,
927
- "learning_rate": 2.301812292012698e-05,
928
- "loss": 0.2785,
929
  "step": 1510
930
  },
931
  {
932
- "epoch": 1.27,
933
- "learning_rate": 2.291947266388616e-05,
934
- "loss": 0.2951,
935
  "step": 1520
936
  },
937
  {
938
- "epoch": 1.27,
939
- "learning_rate": 2.2820345177993727e-05,
940
- "loss": 0.2612,
941
  "step": 1530
942
  },
943
  {
944
- "epoch": 1.28,
945
- "learning_rate": 2.272074643589988e-05,
946
- "loss": 0.2873,
947
  "step": 1540
948
  },
949
  {
950
- "epoch": 1.28,
951
- "learning_rate": 2.262068243945285e-05,
952
- "loss": 0.276,
953
  "step": 1550
954
  },
955
  {
956
- "epoch": 1.29,
957
- "learning_rate": 2.252015921853723e-05,
958
- "loss": 0.2888,
959
  "step": 1560
960
  },
961
  {
962
- "epoch": 1.29,
963
- "learning_rate": 2.2419182830710593e-05,
964
- "loss": 0.2721,
965
  "step": 1570
966
  },
967
  {
968
- "epoch": 1.3,
969
- "learning_rate": 2.23177593608385e-05,
970
- "loss": 0.2805,
971
  "step": 1580
972
  },
973
  {
974
- "epoch": 1.3,
975
- "learning_rate": 2.221589492072778e-05,
976
- "loss": 0.2719,
977
  "step": 1590
978
  },
979
  {
980
- "epoch": 1.31,
981
- "learning_rate": 2.2113595648758273e-05,
982
- "loss": 0.2703,
983
  "step": 1600
984
  },
985
  {
986
- "epoch": 1.31,
987
- "learning_rate": 2.2010867709512895e-05,
988
- "loss": 0.2661,
989
  "step": 1610
990
  },
991
  {
992
- "epoch": 1.31,
993
- "learning_rate": 2.1907717293406175e-05,
994
- "loss": 0.2665,
995
  "step": 1620
996
  },
997
  {
998
- "epoch": 1.32,
999
- "learning_rate": 2.1804150616311222e-05,
1000
- "loss": 0.2791,
1001
  "step": 1630
1002
  },
1003
  {
1004
- "epoch": 1.32,
1005
- "learning_rate": 2.1700173919185144e-05,
1006
- "loss": 0.2549,
1007
  "step": 1640
1008
  },
1009
  {
1010
- "epoch": 1.33,
1011
- "learning_rate": 2.1595793467692967e-05,
1012
- "loss": 0.2934,
1013
  "step": 1650
1014
  },
1015
  {
1016
- "epoch": 1.33,
1017
- "learning_rate": 2.149101555183009e-05,
1018
- "loss": 0.2666,
1019
  "step": 1660
1020
  },
1021
  {
1022
- "epoch": 1.34,
1023
- "learning_rate": 2.1385846485543202e-05,
1024
- "loss": 0.3041,
1025
  "step": 1670
1026
  },
1027
  {
1028
- "epoch": 1.34,
1029
- "learning_rate": 2.1280292606349838e-05,
1030
- "loss": 0.2651,
1031
  "step": 1680
1032
  },
1033
  {
1034
- "epoch": 1.35,
1035
- "learning_rate": 2.117436027495647e-05,
1036
- "loss": 0.2718,
1037
  "step": 1690
1038
  },
1039
  {
1040
- "epoch": 1.35,
1041
- "learning_rate": 2.106805587487519e-05,
1042
- "loss": 0.2625,
1043
  "step": 1700
1044
  },
1045
  {
1046
- "epoch": 1.36,
1047
- "learning_rate": 2.096138581203908e-05,
1048
- "loss": 0.284,
1049
  "step": 1710
1050
  },
1051
  {
1052
- "epoch": 1.36,
1053
- "learning_rate": 2.0854356514416144e-05,
1054
- "loss": 0.2865,
1055
  "step": 1720
1056
  },
1057
  {
1058
- "epoch": 1.37,
1059
- "learning_rate": 2.0746974431621968e-05,
1060
- "loss": 0.288,
1061
  "step": 1730
1062
  },
1063
  {
1064
- "epoch": 1.37,
1065
- "learning_rate": 2.06392460345311e-05,
1066
- "loss": 0.2704,
1067
  "step": 1740
1068
  },
1069
  {
1070
- "epoch": 1.38,
1071
- "learning_rate": 2.053117781488706e-05,
1072
- "loss": 0.2896,
1073
  "step": 1750
1074
  },
1075
  {
1076
- "epoch": 1.38,
1077
- "learning_rate": 2.0422776284911175e-05,
1078
- "loss": 0.2813,
1079
  "step": 1760
1080
  },
1081
  {
1082
- "epoch": 1.38,
1083
- "learning_rate": 2.031404797691016e-05,
1084
- "loss": 0.2857,
1085
  "step": 1770
1086
  },
1087
  {
1088
- "epoch": 1.39,
1089
- "learning_rate": 2.0204999442882447e-05,
1090
- "loss": 0.3063,
1091
  "step": 1780
1092
  },
1093
  {
1094
- "epoch": 1.39,
1095
- "learning_rate": 2.0095637254123392e-05,
1096
- "loss": 0.2837,
1097
  "step": 1790
1098
  },
1099
  {
1100
- "epoch": 1.4,
1101
- "learning_rate": 1.998596800082927e-05,
1102
- "loss": 0.2851,
1103
  "step": 1800
1104
  },
1105
  {
1106
- "epoch": 1.4,
1107
- "learning_rate": 1.9875998291700148e-05,
1108
- "loss": 0.2852,
1109
  "step": 1810
1110
  },
1111
  {
1112
- "epoch": 1.41,
1113
- "learning_rate": 1.976573475354165e-05,
1114
- "loss": 0.2651,
1115
  "step": 1820
1116
  },
1117
  {
1118
- "epoch": 1.41,
1119
- "learning_rate": 1.9655184030865617e-05,
1120
- "loss": 0.2655,
1121
  "step": 1830
1122
  },
1123
  {
1124
- "epoch": 1.42,
1125
- "learning_rate": 1.9544352785489706e-05,
1126
- "loss": 0.2758,
1127
  "step": 1840
1128
  },
1129
  {
1130
- "epoch": 1.42,
1131
- "learning_rate": 1.9433247696135967e-05,
1132
- "loss": 0.2698,
1133
  "step": 1850
1134
  },
1135
  {
1136
- "epoch": 1.43,
1137
- "learning_rate": 1.9321875458028347e-05,
1138
- "loss": 0.2988,
1139
  "step": 1860
1140
  },
1141
  {
1142
- "epoch": 1.43,
1143
- "learning_rate": 1.9210242782489266e-05,
1144
- "loss": 0.2723,
1145
  "step": 1870
1146
  },
1147
  {
1148
- "epoch": 1.44,
1149
- "learning_rate": 1.9098356396535167e-05,
1150
- "loss": 0.2726,
1151
  "step": 1880
1152
  },
1153
  {
1154
- "epoch": 1.44,
1155
- "learning_rate": 1.8986223042471144e-05,
1156
- "loss": 0.2541,
1157
  "step": 1890
1158
  },
1159
  {
1160
- "epoch": 1.45,
1161
- "learning_rate": 1.8873849477484696e-05,
1162
- "loss": 0.2822,
1163
  "step": 1900
1164
  },
1165
  {
1166
- "epoch": 1.45,
1167
- "step": 1900,
1168
- "total_flos": 3.4203213408659046e+17,
1169
- "train_loss": 0.32689529290324765,
1170
- "train_runtime": 3915.3355,
1171
- "train_samples_per_second": 4.351,
1172
- "train_steps_per_second": 1.088
 
 
 
 
 
 
 
 
 
 
 
 
1173
  }
1174
  ],
1175
  "logging_steps": 10,
1176
- "max_steps": 4260,
1177
  "num_train_epochs": 2,
1178
  "save_steps": 1000,
1179
- "total_flos": 3.4203213408659046e+17,
1180
  "trial_name": null,
1181
  "trial_params": null
1182
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.4438799076212472,
5
  "eval_steps": 1000,
6
+ "global_step": 1922,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 1.3824884792626728e-07,
14
+ "loss": 0.7351,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.0,
19
+ "learning_rate": 1.3824884792626729e-06,
20
+ "loss": 0.7455,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.01,
25
+ "learning_rate": 2.7649769585253458e-06,
26
+ "loss": 0.7061,
27
  "step": 20
28
  },
29
  {
30
  "epoch": 0.01,
31
+ "learning_rate": 4.147465437788019e-06,
32
+ "loss": 0.6593,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.02,
37
+ "learning_rate": 5.5299539170506915e-06,
38
+ "loss": 0.6177,
39
  "step": 40
40
  },
41
  {
42
  "epoch": 0.02,
43
+ "learning_rate": 6.912442396313364e-06,
44
+ "loss": 0.5817,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.03,
49
+ "learning_rate": 8.294930875576038e-06,
50
+ "loss": 0.5095,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.03,
55
+ "learning_rate": 9.67741935483871e-06,
56
+ "loss": 0.4615,
57
  "step": 70
58
  },
59
  {
60
  "epoch": 0.04,
61
+ "learning_rate": 1.1059907834101383e-05,
62
+ "loss": 0.4126,
63
  "step": 80
64
  },
65
  {
66
  "epoch": 0.04,
67
+ "learning_rate": 1.2442396313364056e-05,
68
+ "loss": 0.4067,
69
  "step": 90
70
  },
71
  {
72
  "epoch": 0.05,
73
+ "learning_rate": 1.3824884792626728e-05,
74
+ "loss": 0.3974,
75
  "step": 100
76
  },
77
  {
78
  "epoch": 0.05,
79
+ "learning_rate": 1.5207373271889403e-05,
80
+ "loss": 0.3864,
81
  "step": 110
82
  },
83
  {
84
  "epoch": 0.06,
85
+ "learning_rate": 1.6589861751152075e-05,
86
+ "loss": 0.4016,
87
  "step": 120
88
  },
89
  {
90
  "epoch": 0.06,
91
+ "learning_rate": 1.7972350230414745e-05,
92
+ "loss": 0.3873,
93
  "step": 130
94
  },
95
  {
96
+ "epoch": 0.06,
97
+ "learning_rate": 1.935483870967742e-05,
98
+ "loss": 0.3837,
99
  "step": 140
100
  },
101
  {
102
  "epoch": 0.07,
103
+ "learning_rate": 2.0737327188940094e-05,
104
+ "loss": 0.3868,
105
  "step": 150
106
  },
107
  {
108
+ "epoch": 0.07,
109
+ "learning_rate": 2.2119815668202766e-05,
110
+ "loss": 0.3817,
111
  "step": 160
112
  },
113
  {
114
  "epoch": 0.08,
115
+ "learning_rate": 2.350230414746544e-05,
116
+ "loss": 0.3593,
117
  "step": 170
118
  },
119
  {
120
  "epoch": 0.08,
121
+ "learning_rate": 2.488479262672811e-05,
122
+ "loss": 0.3641,
123
  "step": 180
124
  },
125
  {
126
  "epoch": 0.09,
127
+ "learning_rate": 2.6267281105990784e-05,
128
+ "loss": 0.3497,
129
  "step": 190
130
  },
131
  {
132
  "epoch": 0.09,
133
+ "learning_rate": 2.7649769585253457e-05,
134
+ "loss": 0.3679,
135
  "step": 200
136
  },
137
  {
138
  "epoch": 0.1,
139
+ "learning_rate": 2.903225806451613e-05,
140
+ "loss": 0.3751,
141
  "step": 210
142
  },
143
  {
144
  "epoch": 0.1,
145
+ "learning_rate": 2.9999960619075335e-05,
146
+ "loss": 0.3828,
147
  "step": 220
148
  },
149
  {
150
  "epoch": 0.11,
151
+ "learning_rate": 2.9999260519500367e-05,
152
+ "loss": 0.3763,
153
  "step": 230
154
  },
155
  {
156
  "epoch": 0.11,
157
+ "learning_rate": 2.9997685335280646e-05,
158
+ "loss": 0.3553,
159
  "step": 240
160
  },
161
  {
162
  "epoch": 0.12,
163
+ "learning_rate": 2.9995235158315353e-05,
164
+ "loss": 0.3589,
165
  "step": 250
166
  },
167
  {
168
  "epoch": 0.12,
169
+ "learning_rate": 2.999191013155234e-05,
170
+ "loss": 0.3585,
171
  "step": 260
172
  },
173
  {
174
+ "epoch": 0.12,
175
+ "learning_rate": 2.998771044897983e-05,
176
+ "loss": 0.3529,
177
  "step": 270
178
  },
179
  {
180
  "epoch": 0.13,
181
+ "learning_rate": 2.9982636355615092e-05,
182
+ "loss": 0.3303,
183
  "step": 280
184
  },
185
  {
186
+ "epoch": 0.13,
187
+ "learning_rate": 2.997668814749012e-05,
188
+ "loss": 0.3696,
189
  "step": 290
190
  },
191
  {
192
  "epoch": 0.14,
193
+ "learning_rate": 2.99698661716344e-05,
194
+ "loss": 0.3353,
195
  "step": 300
196
  },
197
  {
198
+ "epoch": 0.14,
199
+ "learning_rate": 2.9962170826054645e-05,
200
+ "loss": 0.3562,
201
  "step": 310
202
  },
203
  {
204
  "epoch": 0.15,
205
+ "learning_rate": 2.995360255971157e-05,
206
+ "loss": 0.3652,
207
  "step": 320
208
  },
209
  {
210
  "epoch": 0.15,
211
+ "learning_rate": 2.994416187249371e-05,
212
+ "loss": 0.3522,
213
  "step": 330
214
  },
215
  {
216
  "epoch": 0.16,
217
+ "learning_rate": 2.9933849315188233e-05,
218
+ "loss": 0.3754,
219
  "step": 340
220
  },
221
  {
222
  "epoch": 0.16,
223
+ "learning_rate": 2.992266548944885e-05,
224
+ "loss": 0.3348,
225
  "step": 350
226
  },
227
  {
228
  "epoch": 0.17,
229
+ "learning_rate": 2.991061104776067e-05,
230
+ "loss": 0.3513,
231
  "step": 360
232
  },
233
  {
234
  "epoch": 0.17,
235
+ "learning_rate": 2.9897686693402138e-05,
236
+ "loss": 0.3214,
237
  "step": 370
238
  },
239
  {
240
  "epoch": 0.18,
241
+ "learning_rate": 2.9883893180404046e-05,
242
+ "loss": 0.3451,
243
  "step": 380
244
  },
245
  {
246
  "epoch": 0.18,
247
+ "learning_rate": 2.986923131350549e-05,
248
+ "loss": 0.3579,
249
  "step": 390
250
  },
251
  {
252
+ "epoch": 0.18,
253
+ "learning_rate": 2.9853701948106944e-05,
254
+ "loss": 0.3353,
255
  "step": 400
256
  },
257
  {
258
  "epoch": 0.19,
259
+ "learning_rate": 2.9837305990220357e-05,
260
+ "loss": 0.3374,
261
  "step": 410
262
  },
263
  {
264
+ "epoch": 0.19,
265
+ "learning_rate": 2.982004439641628e-05,
266
+ "loss": 0.3544,
267
  "step": 420
268
  },
269
  {
270
  "epoch": 0.2,
271
+ "learning_rate": 2.980191817376808e-05,
272
+ "loss": 0.3313,
273
  "step": 430
274
  },
275
  {
276
+ "epoch": 0.2,
277
+ "learning_rate": 2.9782928379793154e-05,
278
+ "loss": 0.3488,
279
  "step": 440
280
  },
281
  {
282
  "epoch": 0.21,
283
+ "learning_rate": 2.976307612239127e-05,
284
+ "loss": 0.3384,
285
  "step": 450
286
  },
287
  {
288
+ "epoch": 0.21,
289
+ "learning_rate": 2.97423625597799e-05,
290
+ "loss": 0.3359,
291
  "step": 460
292
  },
293
  {
294
  "epoch": 0.22,
295
+ "learning_rate": 2.9720788900426657e-05,
296
+ "loss": 0.3353,
297
  "step": 470
298
  },
299
  {
300
+ "epoch": 0.22,
301
+ "learning_rate": 2.969835640297879e-05,
302
+ "loss": 0.356,
303
  "step": 480
304
  },
305
  {
306
  "epoch": 0.23,
307
+ "learning_rate": 2.967506637618976e-05,
308
+ "loss": 0.3575,
309
  "step": 490
310
  },
311
  {
312
  "epoch": 0.23,
313
+ "learning_rate": 2.9650920178842874e-05,
314
+ "loss": 0.3163,
315
  "step": 500
316
  },
317
  {
318
  "epoch": 0.24,
319
+ "learning_rate": 2.9625919219672017e-05,
320
+ "loss": 0.3277,
321
  "step": 510
322
  },
323
  {
324
  "epoch": 0.24,
325
+ "learning_rate": 2.960006495727946e-05,
326
+ "loss": 0.3627,
327
  "step": 520
328
  },
329
  {
330
+ "epoch": 0.24,
331
+ "learning_rate": 2.9573358900050764e-05,
332
+ "loss": 0.3412,
333
  "step": 530
334
  },
335
  {
336
  "epoch": 0.25,
337
+ "learning_rate": 2.9545802606066778e-05,
338
+ "loss": 0.3464,
339
  "step": 540
340
  },
341
  {
342
+ "epoch": 0.25,
343
+ "learning_rate": 2.9517397683012747e-05,
344
+ "loss": 0.3417,
345
  "step": 550
346
  },
347
  {
348
  "epoch": 0.26,
349
+ "learning_rate": 2.9488145788084502e-05,
350
+ "loss": 0.349,
351
  "step": 560
352
  },
353
  {
354
+ "epoch": 0.26,
355
+ "learning_rate": 2.945804862789178e-05,
356
+ "loss": 0.3497,
357
  "step": 570
358
  },
359
  {
360
  "epoch": 0.27,
361
+ "learning_rate": 2.942710795835866e-05,
362
+ "loss": 0.3318,
363
  "step": 580
364
  },
365
  {
366
+ "epoch": 0.27,
367
+ "learning_rate": 2.9395325584621122e-05,
368
+ "loss": 0.3249,
369
  "step": 590
370
  },
371
  {
372
  "epoch": 0.28,
373
+ "learning_rate": 2.9362703360921722e-05,
374
+ "loss": 0.3346,
375
  "step": 600
376
  },
377
  {
378
+ "epoch": 0.28,
379
+ "learning_rate": 2.932924319050143e-05,
380
+ "loss": 0.3401,
381
  "step": 610
382
  },
383
  {
384
  "epoch": 0.29,
385
+ "learning_rate": 2.9294947025488568e-05,
386
+ "loss": 0.327,
387
  "step": 620
388
  },
389
  {
390
+ "epoch": 0.29,
391
+ "learning_rate": 2.925981686678494e-05,
392
+ "loss": 0.3185,
393
  "step": 630
394
  },
395
  {
396
  "epoch": 0.3,
397
+ "learning_rate": 2.9223854763949082e-05,
398
+ "loss": 0.3411,
399
  "step": 640
400
  },
401
  {
402
+ "epoch": 0.3,
403
+ "learning_rate": 2.9187062815076688e-05,
404
+ "loss": 0.3401,
405
  "step": 650
406
  },
407
  {
408
+ "epoch": 0.3,
409
+ "learning_rate": 2.914944316667822e-05,
410
+ "loss": 0.3443,
411
  "step": 660
412
  },
413
  {
414
  "epoch": 0.31,
415
+ "learning_rate": 2.9110998013553653e-05,
416
+ "loss": 0.3336,
417
  "step": 670
418
  },
419
  {
420
+ "epoch": 0.31,
421
+ "learning_rate": 2.9071729598664433e-05,
422
+ "loss": 0.3271,
423
  "step": 680
424
  },
425
  {
426
  "epoch": 0.32,
427
+ "learning_rate": 2.9031640213002638e-05,
428
+ "loss": 0.3233,
429
  "step": 690
430
  },
431
  {
432
+ "epoch": 0.32,
433
+ "learning_rate": 2.899073219545729e-05,
434
+ "loss": 0.3548,
435
  "step": 700
436
  },
437
  {
438
  "epoch": 0.33,
439
+ "learning_rate": 2.8949007932677915e-05,
440
+ "loss": 0.3243,
441
  "step": 710
442
  },
443
  {
444
+ "epoch": 0.33,
445
+ "learning_rate": 2.89064698589353e-05,
446
+ "loss": 0.3151,
447
  "step": 720
448
  },
449
  {
450
  "epoch": 0.34,
451
+ "learning_rate": 2.8863120455979458e-05,
452
+ "loss": 0.325,
453
  "step": 730
454
  },
455
  {
456
+ "epoch": 0.34,
457
+ "learning_rate": 2.8818962252894872e-05,
458
+ "loss": 0.3406,
459
  "step": 740
460
  },
461
  {
462
  "epoch": 0.35,
463
+ "learning_rate": 2.8773997825952914e-05,
464
+ "loss": 0.3059,
465
  "step": 750
466
  },
467
  {
468
+ "epoch": 0.35,
469
+ "learning_rate": 2.872822979846154e-05,
470
+ "loss": 0.3236,
471
  "step": 760
472
  },
473
  {
474
  "epoch": 0.36,
475
+ "learning_rate": 2.8681660840612262e-05,
476
+ "loss": 0.3465,
477
  "step": 770
478
  },
479
  {
480
+ "epoch": 0.36,
481
+ "learning_rate": 2.8634293669324353e-05,
482
+ "loss": 0.34,
483
  "step": 780
484
  },
485
  {
486
+ "epoch": 0.36,
487
+ "learning_rate": 2.8586131048086334e-05,
488
+ "loss": 0.3319,
489
  "step": 790
490
  },
491
  {
492
+ "epoch": 0.37,
493
+ "learning_rate": 2.853717578679474e-05,
494
+ "loss": 0.3322,
495
  "step": 800
496
  },
497
  {
498
+ "epoch": 0.37,
499
+ "learning_rate": 2.848743074159021e-05,
500
+ "loss": 0.3125,
501
  "step": 810
502
  },
503
  {
504
  "epoch": 0.38,
505
+ "learning_rate": 2.8436898814690837e-05,
506
+ "loss": 0.3411,
507
  "step": 820
508
  },
509
  {
510
+ "epoch": 0.38,
511
+ "learning_rate": 2.838558295422284e-05,
512
+ "loss": 0.3371,
513
  "step": 830
514
  },
515
  {
516
  "epoch": 0.39,
517
+ "learning_rate": 2.833348615404859e-05,
518
+ "loss": 0.3336,
519
  "step": 840
520
  },
521
  {
522
+ "epoch": 0.39,
523
+ "learning_rate": 2.8280611453591908e-05,
524
+ "loss": 0.3335,
525
  "step": 850
526
  },
527
  {
528
  "epoch": 0.4,
529
+ "learning_rate": 2.8226961937660773e-05,
530
+ "loss": 0.3131,
531
  "step": 860
532
  },
533
  {
534
+ "epoch": 0.4,
535
+ "learning_rate": 2.817254073626733e-05,
536
+ "loss": 0.3202,
537
  "step": 870
538
  },
539
  {
540
  "epoch": 0.41,
541
+ "learning_rate": 2.811735102444528e-05,
542
+ "loss": 0.3265,
543
  "step": 880
544
  },
545
  {
546
+ "epoch": 0.41,
547
+ "learning_rate": 2.8061396022064657e-05,
548
+ "loss": 0.3162,
549
  "step": 890
550
  },
551
  {
552
  "epoch": 0.42,
553
+ "learning_rate": 2.8004678993643952e-05,
554
+ "loss": 0.3124,
555
  "step": 900
556
  },
557
  {
558
+ "epoch": 0.42,
559
+ "learning_rate": 2.7947203248159665e-05,
560
+ "loss": 0.3289,
561
  "step": 910
562
  },
563
  {
564
+ "epoch": 0.42,
565
+ "learning_rate": 2.788897213885327e-05,
566
+ "loss": 0.3242,
567
  "step": 920
568
  },
569
  {
570
+ "epoch": 0.43,
571
+ "learning_rate": 2.782998906303555e-05,
572
+ "loss": 0.3038,
573
  "step": 930
574
  },
575
  {
576
+ "epoch": 0.43,
577
+ "learning_rate": 2.777025746188842e-05,
578
+ "loss": 0.3319,
579
  "step": 940
580
  },
581
  {
582
+ "epoch": 0.44,
583
+ "learning_rate": 2.7709780820264147e-05,
584
+ "loss": 0.3372,
585
  "step": 950
586
  },
587
  {
588
+ "epoch": 0.44,
589
+ "learning_rate": 2.764856266648202e-05,
590
+ "loss": 0.306,
591
  "step": 960
592
  },
593
  {
594
+ "epoch": 1.0,
595
+ "learning_rate": 2.758660657212255e-05,
596
+ "loss": 0.3001,
597
  "step": 970
598
  },
599
  {
600
  "epoch": 1.01,
601
+ "learning_rate": 2.7523916151819048e-05,
602
+ "loss": 0.3044,
603
  "step": 980
604
  },
605
  {
606
+ "epoch": 1.01,
607
+ "learning_rate": 2.746049506304678e-05,
608
+ "loss": 0.3114,
609
  "step": 990
610
  },
611
  {
612
  "epoch": 1.02,
613
+ "learning_rate": 2.7396347005909535e-05,
614
+ "loss": 0.2982,
615
  "step": 1000
616
  },
617
  {
618
  "epoch": 1.02,
619
+ "eval_loss": 0.44782933592796326,
620
+ "eval_runtime": 6.8987,
621
+ "eval_samples_per_second": 20.294,
622
+ "eval_steps_per_second": 5.073,
623
  "step": 1000
624
  },
625
  {
626
+ "epoch": 1.02,
627
+ "learning_rate": 2.733147572292381e-05,
628
+ "loss": 0.3091,
629
  "step": 1010
630
  },
631
  {
632
  "epoch": 1.03,
633
+ "learning_rate": 2.7265884998800434e-05,
634
+ "loss": 0.3011,
635
  "step": 1020
636
  },
637
  {
638
+ "epoch": 1.03,
639
+ "learning_rate": 2.7199578660223743e-05,
640
+ "loss": 0.2794,
641
  "step": 1030
642
  },
643
  {
644
  "epoch": 1.04,
645
+ "learning_rate": 2.7132560575628377e-05,
646
+ "loss": 0.2956,
647
  "step": 1040
648
  },
649
  {
650
+ "epoch": 1.04,
651
+ "learning_rate": 2.7064834654973534e-05,
652
+ "loss": 0.3098,
653
  "step": 1050
654
  },
655
  {
656
  "epoch": 1.05,
657
+ "learning_rate": 2.6996404849514885e-05,
658
+ "loss": 0.3315,
659
  "step": 1060
660
  },
661
  {
662
+ "epoch": 1.05,
663
+ "learning_rate": 2.6927275151574053e-05,
664
+ "loss": 0.305,
665
  "step": 1070
666
  },
667
  {
668
+ "epoch": 1.05,
669
+ "learning_rate": 2.6857449594305674e-05,
670
+ "loss": 0.2986,
671
  "step": 1080
672
  },
673
  {
674
+ "epoch": 1.06,
675
+ "learning_rate": 2.678693225146211e-05,
676
+ "loss": 0.269,
677
  "step": 1090
678
  },
679
  {
680
+ "epoch": 1.06,
681
+ "learning_rate": 2.6715727237155777e-05,
682
+ "loss": 0.3124,
683
  "step": 1100
684
  },
685
  {
686
+ "epoch": 1.07,
687
+ "learning_rate": 2.6643838705619117e-05,
688
+ "loss": 0.303,
689
  "step": 1110
690
  },
691
  {
692
+ "epoch": 1.07,
693
+ "learning_rate": 2.6571270850962234e-05,
694
+ "loss": 0.3084,
695
  "step": 1120
696
  },
697
  {
698
  "epoch": 1.08,
699
+ "learning_rate": 2.6498027906928195e-05,
700
+ "loss": 0.31,
701
  "step": 1130
702
  },
703
  {
704
+ "epoch": 1.08,
705
+ "learning_rate": 2.6424114146646043e-05,
706
+ "loss": 0.3203,
707
  "step": 1140
708
  },
709
  {
710
  "epoch": 1.09,
711
+ "learning_rate": 2.6349533882381475e-05,
712
+ "loss": 0.2963,
713
  "step": 1150
714
  },
715
  {
716
+ "epoch": 1.09,
717
+ "learning_rate": 2.6274291465285266e-05,
718
+ "loss": 0.3042,
719
  "step": 1160
720
  },
721
  {
722
  "epoch": 1.1,
723
+ "learning_rate": 2.6198391285139417e-05,
724
+ "loss": 0.314,
725
  "step": 1170
726
  },
727
  {
728
+ "epoch": 1.1,
729
+ "learning_rate": 2.612183777010104e-05,
730
+ "loss": 0.3144,
731
  "step": 1180
732
  },
733
  {
734
  "epoch": 1.11,
735
+ "learning_rate": 2.6044635386444024e-05,
736
+ "loss": 0.3157,
737
  "step": 1190
738
  },
739
  {
740
+ "epoch": 1.11,
741
+ "learning_rate": 2.5966788638298443e-05,
742
+ "loss": 0.2958,
743
  "step": 1200
744
  },
745
  {
746
  "epoch": 1.12,
747
+ "learning_rate": 2.5888302067387793e-05,
748
+ "loss": 0.2867,
749
  "step": 1210
750
  },
751
  {
752
+ "epoch": 1.12,
753
+ "learning_rate": 2.5809180252764022e-05,
754
+ "loss": 0.2999,
755
  "step": 1220
756
  },
757
  {
758
+ "epoch": 1.12,
759
+ "learning_rate": 2.572942781054036e-05,
760
+ "loss": 0.2888,
761
  "step": 1230
762
  },
763
  {
764
+ "epoch": 1.13,
765
+ "learning_rate": 2.564904939362204e-05,
766
+ "loss": 0.3036,
767
  "step": 1240
768
  },
769
  {
770
+ "epoch": 1.13,
771
+ "learning_rate": 2.5568049691434794e-05,
772
+ "loss": 0.3127,
773
  "step": 1250
774
  },
775
  {
776
+ "epoch": 1.14,
777
+ "learning_rate": 2.5486433429651304e-05,
778
+ "loss": 0.2939,
779
  "step": 1260
780
  },
781
  {
782
+ "epoch": 1.14,
783
+ "learning_rate": 2.5404205369915473e-05,
784
+ "loss": 0.3096,
785
  "step": 1270
786
  },
787
  {
788
  "epoch": 1.15,
789
+ "learning_rate": 2.532137030956464e-05,
790
+ "loss": 0.3039,
791
  "step": 1280
792
  },
793
  {
794
+ "epoch": 1.15,
795
+ "learning_rate": 2.523793308134967e-05,
796
+ "loss": 0.2948,
797
  "step": 1290
798
  },
799
  {
800
  "epoch": 1.16,
801
+ "learning_rate": 2.5153898553153024e-05,
802
+ "loss": 0.3053,
803
  "step": 1300
804
  },
805
  {
806
+ "epoch": 1.16,
807
+ "learning_rate": 2.506927162770475e-05,
808
+ "loss": 0.2962,
809
  "step": 1310
810
  },
811
  {
812
  "epoch": 1.17,
813
+ "learning_rate": 2.4984057242296464e-05,
814
+ "loss": 0.3027,
815
  "step": 1320
816
  },
817
  {
818
+ "epoch": 1.17,
819
+ "learning_rate": 2.489826036849325e-05,
820
+ "loss": 0.3051,
821
  "step": 1330
822
  },
823
  {
824
  "epoch": 1.18,
825
+ "learning_rate": 2.4811886011843673e-05,
826
+ "loss": 0.3214,
827
  "step": 1340
828
  },
829
  {
830
+ "epoch": 1.18,
831
+ "learning_rate": 2.4724939211587706e-05,
832
+ "loss": 0.3002,
833
  "step": 1350
834
  },
835
  {
836
+ "epoch": 1.18,
837
+ "learning_rate": 2.4637425040362744e-05,
838
+ "loss": 0.3177,
839
  "step": 1360
840
  },
841
  {
842
+ "epoch": 1.19,
843
+ "learning_rate": 2.4549348603907658e-05,
844
+ "loss": 0.3141,
845
  "step": 1370
846
  },
847
  {
848
+ "epoch": 1.19,
849
+ "learning_rate": 2.4460715040764916e-05,
850
+ "loss": 0.3191,
851
  "step": 1380
852
  },
853
  {
854
+ "epoch": 1.2,
855
+ "learning_rate": 2.4371529521980775e-05,
856
+ "loss": 0.2736,
857
  "step": 1390
858
  },
859
  {
860
+ "epoch": 1.2,
861
+ "learning_rate": 2.428179725080362e-05,
862
+ "loss": 0.3063,
863
  "step": 1400
864
  },
865
  {
866
+ "epoch": 1.21,
867
+ "learning_rate": 2.419152346238038e-05,
868
+ "loss": 0.2965,
869
  "step": 1410
870
  },
871
  {
872
+ "epoch": 1.21,
873
+ "learning_rate": 2.410071342345111e-05,
874
+ "loss": 0.2942,
875
  "step": 1420
876
  },
877
  {
878
+ "epoch": 1.22,
879
+ "learning_rate": 2.4009372432041702e-05,
880
+ "loss": 0.2895,
881
  "step": 1430
882
  },
883
  {
884
+ "epoch": 1.22,
885
+ "learning_rate": 2.3917505817154795e-05,
886
+ "loss": 0.3084,
887
  "step": 1440
888
  },
889
  {
890
  "epoch": 1.23,
891
+ "learning_rate": 2.3825118938458894e-05,
892
+ "loss": 0.2849,
893
  "step": 1450
894
  },
895
  {
896
+ "epoch": 1.23,
897
+ "learning_rate": 2.373221718597564e-05,
898
+ "loss": 0.2923,
899
  "step": 1460
900
  },
901
  {
902
  "epoch": 1.24,
903
+ "learning_rate": 2.3638805979765387e-05,
904
+ "loss": 0.3062,
905
  "step": 1470
906
  },
907
  {
908
+ "epoch": 1.24,
909
+ "learning_rate": 2.3544890769610936e-05,
910
+ "loss": 0.2832,
911
  "step": 1480
912
  },
913
  {
914
+ "epoch": 1.24,
915
+ "learning_rate": 2.3450477034699632e-05,
916
+ "loss": 0.2823,
917
  "step": 1490
918
  },
919
  {
920
+ "epoch": 1.25,
921
+ "learning_rate": 2.335557028330366e-05,
922
+ "loss": 0.2822,
923
  "step": 1500
924
  },
925
  {
926
+ "epoch": 1.25,
927
+ "learning_rate": 2.326017605245872e-05,
928
+ "loss": 0.2885,
929
  "step": 1510
930
  },
931
  {
932
+ "epoch": 1.26,
933
+ "learning_rate": 2.3164299907640955e-05,
934
+ "loss": 0.2852,
935
  "step": 1520
936
  },
937
  {
938
+ "epoch": 1.26,
939
+ "learning_rate": 2.3067947442442264e-05,
940
+ "loss": 0.3022,
941
  "step": 1530
942
  },
943
  {
944
+ "epoch": 1.27,
945
+ "learning_rate": 2.2971124278243957e-05,
946
+ "loss": 0.2666,
947
  "step": 1540
948
  },
949
  {
950
+ "epoch": 1.27,
951
+ "learning_rate": 2.28738360638888e-05,
952
+ "loss": 0.2892,
953
  "step": 1550
954
  },
955
  {
956
+ "epoch": 1.28,
957
+ "learning_rate": 2.2776088475351445e-05,
958
+ "loss": 0.2689,
959
  "step": 1560
960
  },
961
  {
962
+ "epoch": 1.28,
963
+ "learning_rate": 2.2677887215407278e-05,
964
+ "loss": 0.2864,
965
  "step": 1570
966
  },
967
  {
968
+ "epoch": 1.29,
969
+ "learning_rate": 2.257923801329973e-05,
970
+ "loss": 0.281,
971
  "step": 1580
972
  },
973
  {
974
+ "epoch": 1.29,
975
+ "learning_rate": 2.248014662440599e-05,
976
+ "loss": 0.3068,
977
  "step": 1590
978
  },
979
  {
980
+ "epoch": 1.3,
981
+ "learning_rate": 2.238061882990126e-05,
982
+ "loss": 0.2753,
983
  "step": 1600
984
  },
985
  {
986
+ "epoch": 1.3,
987
+ "learning_rate": 2.2280660436421443e-05,
988
+ "loss": 0.2701,
989
  "step": 1610
990
  },
991
  {
992
+ "epoch": 1.3,
993
+ "learning_rate": 2.2180277275724385e-05,
994
+ "loss": 0.2891,
995
  "step": 1620
996
  },
997
  {
998
+ "epoch": 1.31,
999
+ "learning_rate": 2.2079475204349645e-05,
1000
+ "loss": 0.2691,
1001
  "step": 1630
1002
  },
1003
  {
1004
+ "epoch": 1.31,
1005
+ "learning_rate": 2.1978260103276796e-05,
1006
+ "loss": 0.2969,
1007
  "step": 1640
1008
  },
1009
  {
1010
+ "epoch": 1.32,
1011
+ "learning_rate": 2.187663787758234e-05,
1012
+ "loss": 0.2629,
1013
  "step": 1650
1014
  },
1015
  {
1016
+ "epoch": 1.32,
1017
+ "learning_rate": 2.177461445609518e-05,
1018
+ "loss": 0.2746,
1019
  "step": 1660
1020
  },
1021
  {
1022
+ "epoch": 1.33,
1023
+ "learning_rate": 2.1672195791050712e-05,
1024
+ "loss": 0.2775,
1025
  "step": 1670
1026
  },
1027
  {
1028
+ "epoch": 1.33,
1029
+ "learning_rate": 2.1569387857743596e-05,
1030
+ "loss": 0.2654,
1031
  "step": 1680
1032
  },
1033
  {
1034
+ "epoch": 1.34,
1035
+ "learning_rate": 2.1466196654179107e-05,
1036
+ "loss": 0.2678,
1037
  "step": 1690
1038
  },
1039
  {
1040
+ "epoch": 1.34,
1041
+ "learning_rate": 2.1362628200723228e-05,
1042
+ "loss": 0.2606,
1043
  "step": 1700
1044
  },
1045
  {
1046
+ "epoch": 1.35,
1047
+ "learning_rate": 2.1258688539751387e-05,
1048
+ "loss": 0.2886,
1049
  "step": 1710
1050
  },
1051
  {
1052
+ "epoch": 1.35,
1053
+ "learning_rate": 2.115438373529596e-05,
1054
+ "loss": 0.2799,
1055
  "step": 1720
1056
  },
1057
  {
1058
+ "epoch": 1.36,
1059
+ "learning_rate": 2.104971987269245e-05,
1060
+ "loss": 0.2804,
1061
  "step": 1730
1062
  },
1063
  {
1064
+ "epoch": 1.36,
1065
+ "learning_rate": 2.0944703058224504e-05,
1066
+ "loss": 0.2583,
1067
  "step": 1740
1068
  },
1069
  {
1070
+ "epoch": 1.36,
1071
+ "learning_rate": 2.0839339418767616e-05,
1072
+ "loss": 0.2857,
1073
  "step": 1750
1074
  },
1075
  {
1076
+ "epoch": 1.37,
1077
+ "learning_rate": 2.0733635101431694e-05,
1078
+ "loss": 0.2692,
1079
  "step": 1760
1080
  },
1081
  {
1082
+ "epoch": 1.37,
1083
+ "learning_rate": 2.0627596273202435e-05,
1084
+ "loss": 0.2759,
1085
  "step": 1770
1086
  },
1087
  {
1088
+ "epoch": 1.38,
1089
+ "learning_rate": 2.05212291205815e-05,
1090
+ "loss": 0.2685,
1091
  "step": 1780
1092
  },
1093
  {
1094
+ "epoch": 1.38,
1095
+ "learning_rate": 2.0414539849225637e-05,
1096
+ "loss": 0.2511,
1097
  "step": 1790
1098
  },
1099
  {
1100
+ "epoch": 1.39,
1101
+ "learning_rate": 2.0307534683584565e-05,
1102
+ "loss": 0.2764,
1103
  "step": 1800
1104
  },
1105
  {
1106
+ "epoch": 1.39,
1107
+ "learning_rate": 2.0200219866537882e-05,
1108
+ "loss": 0.2763,
1109
  "step": 1810
1110
  },
1111
  {
1112
+ "epoch": 1.4,
1113
+ "learning_rate": 2.0092601659030807e-05,
1114
+ "loss": 0.2731,
1115
  "step": 1820
1116
  },
1117
  {
1118
+ "epoch": 1.4,
1119
+ "learning_rate": 1.9984686339708927e-05,
1120
+ "loss": 0.2803,
1121
  "step": 1830
1122
  },
1123
  {
1124
+ "epoch": 1.41,
1125
+ "learning_rate": 1.9876480204551894e-05,
1126
+ "loss": 0.281,
1127
  "step": 1840
1128
  },
1129
  {
1130
+ "epoch": 1.41,
1131
+ "learning_rate": 1.976798956650607e-05,
1132
+ "loss": 0.2802,
1133
  "step": 1850
1134
  },
1135
  {
1136
+ "epoch": 1.42,
1137
+ "learning_rate": 1.9659220755116277e-05,
1138
+ "loss": 0.2743,
1139
  "step": 1860
1140
  },
1141
  {
1142
+ "epoch": 1.42,
1143
+ "learning_rate": 1.9550180116156447e-05,
1144
+ "loss": 0.2579,
1145
  "step": 1870
1146
  },
1147
  {
1148
+ "epoch": 1.42,
1149
+ "learning_rate": 1.9440874011259458e-05,
1150
+ "loss": 0.2721,
1151
  "step": 1880
1152
  },
1153
  {
1154
+ "epoch": 1.43,
1155
+ "learning_rate": 1.9331308817545963e-05,
1156
+ "loss": 0.2713,
1157
  "step": 1890
1158
  },
1159
  {
1160
+ "epoch": 1.43,
1161
+ "learning_rate": 1.922149092725233e-05,
1162
+ "loss": 0.2874,
1163
  "step": 1900
1164
  },
1165
  {
1166
+ "epoch": 1.44,
1167
+ "learning_rate": 1.911142674735771e-05,
1168
+ "loss": 0.2766,
1169
+ "step": 1910
1170
+ },
1171
+ {
1172
+ "epoch": 1.44,
1173
+ "learning_rate": 1.900112269921026e-05,
1174
+ "loss": 0.2695,
1175
+ "step": 1920
1176
+ },
1177
+ {
1178
+ "epoch": 1.44,
1179
+ "step": 1922,
1180
+ "total_flos": 3.4599459174219776e+17,
1181
+ "train_loss": 0.3275021193364904,
1182
+ "train_runtime": 3966.2406,
1183
+ "train_samples_per_second": 4.365,
1184
+ "train_steps_per_second": 1.092
1185
  }
1186
  ],
1187
  "logging_steps": 10,
1188
+ "max_steps": 4330,
1189
  "num_train_epochs": 2,
1190
  "save_steps": 1000,
1191
+ "total_flos": 3.4599459174219776e+17,
1192
  "trial_name": null,
1193
  "trial_params": null
1194
  }