Safetensors
llama
AALF commited on
Commit
a53c77c
·
verified ·
1 Parent(s): 733930c

Delete trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +0 -1589
trainer_state.json DELETED
@@ -1,1589 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
- "eval_steps": 500,
6
- "global_step": 2217,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.013531799729364006,
13
- "grad_norm": 3.034516595682754,
14
- "learning_rate": 2.2522522522522524e-07,
15
- "loss": 0.6411,
16
- "step": 10
17
- },
18
- {
19
- "epoch": 0.02706359945872801,
20
- "grad_norm": 2.7841028225351487,
21
- "learning_rate": 4.504504504504505e-07,
22
- "loss": 0.6332,
23
- "step": 20
24
- },
25
- {
26
- "epoch": 0.04059539918809202,
27
- "grad_norm": 1.8710512230715446,
28
- "learning_rate": 6.756756756756758e-07,
29
- "loss": 0.6135,
30
- "step": 30
31
- },
32
- {
33
- "epoch": 0.05412719891745602,
34
- "grad_norm": 1.5248415557438793,
35
- "learning_rate": 9.00900900900901e-07,
36
- "loss": 0.6095,
37
- "step": 40
38
- },
39
- {
40
- "epoch": 0.06765899864682003,
41
- "grad_norm": 1.330731637337357,
42
- "learning_rate": 1.1261261261261262e-06,
43
- "loss": 0.5672,
44
- "step": 50
45
- },
46
- {
47
- "epoch": 0.08119079837618404,
48
- "grad_norm": 1.1702536060306041,
49
- "learning_rate": 1.3513513513513515e-06,
50
- "loss": 0.5592,
51
- "step": 60
52
- },
53
- {
54
- "epoch": 0.09472259810554803,
55
- "grad_norm": 1.2129816143771526,
56
- "learning_rate": 1.5765765765765766e-06,
57
- "loss": 0.527,
58
- "step": 70
59
- },
60
- {
61
- "epoch": 0.10825439783491204,
62
- "grad_norm": 1.234538155408458,
63
- "learning_rate": 1.801801801801802e-06,
64
- "loss": 0.5421,
65
- "step": 80
66
- },
67
- {
68
- "epoch": 0.12178619756427606,
69
- "grad_norm": 1.1711228120854185,
70
- "learning_rate": 2.0270270270270273e-06,
71
- "loss": 0.5292,
72
- "step": 90
73
- },
74
- {
75
- "epoch": 0.13531799729364005,
76
- "grad_norm": 1.2936648719718322,
77
- "learning_rate": 2.2522522522522524e-06,
78
- "loss": 0.5479,
79
- "step": 100
80
- },
81
- {
82
- "epoch": 0.14884979702300405,
83
- "grad_norm": 1.291284989397587,
84
- "learning_rate": 2.4774774774774775e-06,
85
- "loss": 0.5416,
86
- "step": 110
87
- },
88
- {
89
- "epoch": 0.16238159675236807,
90
- "grad_norm": 1.34797590127609,
91
- "learning_rate": 2.702702702702703e-06,
92
- "loss": 0.5152,
93
- "step": 120
94
- },
95
- {
96
- "epoch": 0.17591339648173207,
97
- "grad_norm": 1.360973654533329,
98
- "learning_rate": 2.927927927927928e-06,
99
- "loss": 0.5129,
100
- "step": 130
101
- },
102
- {
103
- "epoch": 0.18944519621109607,
104
- "grad_norm": 1.2536822051142262,
105
- "learning_rate": 3.1531531531531532e-06,
106
- "loss": 0.5281,
107
- "step": 140
108
- },
109
- {
110
- "epoch": 0.2029769959404601,
111
- "grad_norm": 1.3088135261215759,
112
- "learning_rate": 3.3783783783783788e-06,
113
- "loss": 0.5209,
114
- "step": 150
115
- },
116
- {
117
- "epoch": 0.2165087956698241,
118
- "grad_norm": 1.288765418316905,
119
- "learning_rate": 3.603603603603604e-06,
120
- "loss": 0.5174,
121
- "step": 160
122
- },
123
- {
124
- "epoch": 0.23004059539918809,
125
- "grad_norm": 1.393215959864774,
126
- "learning_rate": 3.828828828828829e-06,
127
- "loss": 0.5207,
128
- "step": 170
129
- },
130
- {
131
- "epoch": 0.2435723951285521,
132
- "grad_norm": 1.2564229102790085,
133
- "learning_rate": 4.0540540540540545e-06,
134
- "loss": 0.5273,
135
- "step": 180
136
- },
137
- {
138
- "epoch": 0.2571041948579161,
139
- "grad_norm": 1.2582309298347993,
140
- "learning_rate": 4.27927927927928e-06,
141
- "loss": 0.4934,
142
- "step": 190
143
- },
144
- {
145
- "epoch": 0.2706359945872801,
146
- "grad_norm": 1.227173613438421,
147
- "learning_rate": 4.504504504504505e-06,
148
- "loss": 0.5112,
149
- "step": 200
150
- },
151
- {
152
- "epoch": 0.28416779431664413,
153
- "grad_norm": 1.3112186763708722,
154
- "learning_rate": 4.72972972972973e-06,
155
- "loss": 0.4982,
156
- "step": 210
157
- },
158
- {
159
- "epoch": 0.2976995940460081,
160
- "grad_norm": 1.3637637039109458,
161
- "learning_rate": 4.954954954954955e-06,
162
- "loss": 0.5229,
163
- "step": 220
164
- },
165
- {
166
- "epoch": 0.3112313937753721,
167
- "grad_norm": 1.375489718620912,
168
- "learning_rate": 4.999801619861762e-06,
169
- "loss": 0.5042,
170
- "step": 230
171
- },
172
- {
173
- "epoch": 0.32476319350473615,
174
- "grad_norm": 1.2608972862462173,
175
- "learning_rate": 4.99899575450882e-06,
176
- "loss": 0.5127,
177
- "step": 240
178
- },
179
- {
180
- "epoch": 0.3382949932341001,
181
- "grad_norm": 1.2911890277318863,
182
- "learning_rate": 4.9975702048619155e-06,
183
- "loss": 0.5033,
184
- "step": 250
185
- },
186
- {
187
- "epoch": 0.35182679296346414,
188
- "grad_norm": 1.3078317427998958,
189
- "learning_rate": 4.995525324419338e-06,
190
- "loss": 0.5002,
191
- "step": 260
192
- },
193
- {
194
- "epoch": 0.36535859269282817,
195
- "grad_norm": 1.3588495282283148,
196
- "learning_rate": 4.992861620256898e-06,
197
- "loss": 0.5065,
198
- "step": 270
199
- },
200
- {
201
- "epoch": 0.37889039242219213,
202
- "grad_norm": 1.3626322380792617,
203
- "learning_rate": 4.98957975290218e-06,
204
- "loss": 0.511,
205
- "step": 280
206
- },
207
- {
208
- "epoch": 0.39242219215155616,
209
- "grad_norm": 1.3701159835791592,
210
- "learning_rate": 4.985680536170754e-06,
211
- "loss": 0.5137,
212
- "step": 290
213
- },
214
- {
215
- "epoch": 0.4059539918809202,
216
- "grad_norm": 1.2997955232042016,
217
- "learning_rate": 4.981164936964371e-06,
218
- "loss": 0.4965,
219
- "step": 300
220
- },
221
- {
222
- "epoch": 0.41948579161028415,
223
- "grad_norm": 1.3519542829203703,
224
- "learning_rate": 4.976034075031193e-06,
225
- "loss": 0.512,
226
- "step": 310
227
- },
228
- {
229
- "epoch": 0.4330175913396482,
230
- "grad_norm": 1.2630076374904904,
231
- "learning_rate": 4.970289222688129e-06,
232
- "loss": 0.5028,
233
- "step": 320
234
- },
235
- {
236
- "epoch": 0.4465493910690122,
237
- "grad_norm": 1.2707537109175924,
238
- "learning_rate": 4.963931804505335e-06,
239
- "loss": 0.5032,
240
- "step": 330
241
- },
242
- {
243
- "epoch": 0.46008119079837617,
244
- "grad_norm": 1.2589722075840526,
245
- "learning_rate": 4.956963396952954e-06,
246
- "loss": 0.4913,
247
- "step": 340
248
- },
249
- {
250
- "epoch": 0.4736129905277402,
251
- "grad_norm": 1.250431061653177,
252
- "learning_rate": 4.949385728010199e-06,
253
- "loss": 0.4805,
254
- "step": 350
255
- },
256
- {
257
- "epoch": 0.4871447902571042,
258
- "grad_norm": 1.3372965532968473,
259
- "learning_rate": 4.941200676736856e-06,
260
- "loss": 0.4991,
261
- "step": 360
262
- },
263
- {
264
- "epoch": 0.5006765899864682,
265
- "grad_norm": 1.342662150633839,
266
- "learning_rate": 4.932410272807328e-06,
267
- "loss": 0.5066,
268
- "step": 370
269
- },
270
- {
271
- "epoch": 0.5142083897158322,
272
- "grad_norm": 1.2238441786277905,
273
- "learning_rate": 4.9230166960073325e-06,
274
- "loss": 0.4863,
275
- "step": 380
276
- },
277
- {
278
- "epoch": 0.5277401894451962,
279
- "grad_norm": 1.359535295509501,
280
- "learning_rate": 4.913022275693372e-06,
281
- "loss": 0.4949,
282
- "step": 390
283
- },
284
- {
285
- "epoch": 0.5412719891745602,
286
- "grad_norm": 1.2168352667558966,
287
- "learning_rate": 4.902429490215112e-06,
288
- "loss": 0.4887,
289
- "step": 400
290
- },
291
- {
292
- "epoch": 0.5548037889039242,
293
- "grad_norm": 1.4624621853482647,
294
- "learning_rate": 4.891240966300822e-06,
295
- "loss": 0.4969,
296
- "step": 410
297
- },
298
- {
299
- "epoch": 0.5683355886332883,
300
- "grad_norm": 1.238588018136266,
301
- "learning_rate": 4.879459478406012e-06,
302
- "loss": 0.5067,
303
- "step": 420
304
- },
305
- {
306
- "epoch": 0.5818673883626523,
307
- "grad_norm": 1.3181900309459351,
308
- "learning_rate": 4.867087948025444e-06,
309
- "loss": 0.4791,
310
- "step": 430
311
- },
312
- {
313
- "epoch": 0.5953991880920162,
314
- "grad_norm": 1.2954274899523053,
315
- "learning_rate": 4.854129442968679e-06,
316
- "loss": 0.4927,
317
- "step": 440
318
- },
319
- {
320
- "epoch": 0.6089309878213802,
321
- "grad_norm": 1.4125967606025722,
322
- "learning_rate": 4.8405871765993435e-06,
323
- "loss": 0.4879,
324
- "step": 450
325
- },
326
- {
327
- "epoch": 0.6224627875507442,
328
- "grad_norm": 1.3930229082121182,
329
- "learning_rate": 4.8264645070382964e-06,
330
- "loss": 0.5073,
331
- "step": 460
332
- },
333
- {
334
- "epoch": 0.6359945872801083,
335
- "grad_norm": 1.0709300883023405,
336
- "learning_rate": 4.8117649363309105e-06,
337
- "loss": 0.4729,
338
- "step": 470
339
- },
340
- {
341
- "epoch": 0.6495263870094723,
342
- "grad_norm": 1.2926318844200286,
343
- "learning_rate": 4.796492109578655e-06,
344
- "loss": 0.4956,
345
- "step": 480
346
- },
347
- {
348
- "epoch": 0.6630581867388363,
349
- "grad_norm": 1.2313258024791178,
350
- "learning_rate": 4.780649814035205e-06,
351
- "loss": 0.5152,
352
- "step": 490
353
- },
354
- {
355
- "epoch": 0.6765899864682002,
356
- "grad_norm": 1.178544127732,
357
- "learning_rate": 4.764241978167314e-06,
358
- "loss": 0.4859,
359
- "step": 500
360
- },
361
- {
362
- "epoch": 0.6901217861975643,
363
- "grad_norm": 1.3295118207561514,
364
- "learning_rate": 4.747272670680646e-06,
365
- "loss": 0.4676,
366
- "step": 510
367
- },
368
- {
369
- "epoch": 0.7036535859269283,
370
- "grad_norm": 1.2304742068887151,
371
- "learning_rate": 4.729746099510853e-06,
372
- "loss": 0.4678,
373
- "step": 520
374
- },
375
- {
376
- "epoch": 0.7171853856562923,
377
- "grad_norm": 1.3372876193995562,
378
- "learning_rate": 4.711666610780115e-06,
379
- "loss": 0.4831,
380
- "step": 530
381
- },
382
- {
383
- "epoch": 0.7307171853856563,
384
- "grad_norm": 1.1886552687072083,
385
- "learning_rate": 4.693038687719424e-06,
386
- "loss": 0.4767,
387
- "step": 540
388
- },
389
- {
390
- "epoch": 0.7442489851150202,
391
- "grad_norm": 1.1599254136343242,
392
- "learning_rate": 4.673866949556854e-06,
393
- "loss": 0.4936,
394
- "step": 550
395
- },
396
- {
397
- "epoch": 0.7577807848443843,
398
- "grad_norm": 1.3619743128031154,
399
- "learning_rate": 4.654156150372123e-06,
400
- "loss": 0.4932,
401
- "step": 560
402
- },
403
- {
404
- "epoch": 0.7713125845737483,
405
- "grad_norm": 1.2317704190385679,
406
- "learning_rate": 4.633911177917701e-06,
407
- "loss": 0.4823,
408
- "step": 570
409
- },
410
- {
411
- "epoch": 0.7848443843031123,
412
- "grad_norm": 1.1895331578985897,
413
- "learning_rate": 4.613137052406783e-06,
414
- "loss": 0.4828,
415
- "step": 580
416
- },
417
- {
418
- "epoch": 0.7983761840324763,
419
- "grad_norm": 1.2879457390422215,
420
- "learning_rate": 4.5918389252684115e-06,
421
- "loss": 0.4779,
422
- "step": 590
423
- },
424
- {
425
- "epoch": 0.8119079837618404,
426
- "grad_norm": 1.2202436257043876,
427
- "learning_rate": 4.570022077870051e-06,
428
- "loss": 0.4875,
429
- "step": 600
430
- },
431
- {
432
- "epoch": 0.8254397834912043,
433
- "grad_norm": 1.191014968745712,
434
- "learning_rate": 4.547691920207958e-06,
435
- "loss": 0.4726,
436
- "step": 610
437
- },
438
- {
439
- "epoch": 0.8389715832205683,
440
- "grad_norm": 1.3456732382536576,
441
- "learning_rate": 4.524853989565644e-06,
442
- "loss": 0.4716,
443
- "step": 620
444
- },
445
- {
446
- "epoch": 0.8525033829499323,
447
- "grad_norm": 1.1703319949253694,
448
- "learning_rate": 4.501513949140776e-06,
449
- "loss": 0.4715,
450
- "step": 630
451
- },
452
- {
453
- "epoch": 0.8660351826792964,
454
- "grad_norm": 1.2604743747219656,
455
- "learning_rate": 4.477677586640854e-06,
456
- "loss": 0.483,
457
- "step": 640
458
- },
459
- {
460
- "epoch": 0.8795669824086604,
461
- "grad_norm": 1.2589734283703884,
462
- "learning_rate": 4.453350812848014e-06,
463
- "loss": 0.4788,
464
- "step": 650
465
- },
466
- {
467
- "epoch": 0.8930987821380244,
468
- "grad_norm": 1.3165298623822046,
469
- "learning_rate": 4.428539660153315e-06,
470
- "loss": 0.4781,
471
- "step": 660
472
- },
473
- {
474
- "epoch": 0.9066305818673883,
475
- "grad_norm": 1.2879141623482424,
476
- "learning_rate": 4.403250281060862e-06,
477
- "loss": 0.4663,
478
- "step": 670
479
- },
480
- {
481
- "epoch": 0.9201623815967523,
482
- "grad_norm": 1.2076038888483498,
483
- "learning_rate": 4.377488946662152e-06,
484
- "loss": 0.4808,
485
- "step": 680
486
- },
487
- {
488
- "epoch": 0.9336941813261164,
489
- "grad_norm": 1.1851588037695613,
490
- "learning_rate": 4.3512620450810115e-06,
491
- "loss": 0.4793,
492
- "step": 690
493
- },
494
- {
495
- "epoch": 0.9472259810554804,
496
- "grad_norm": 1.2950814486817164,
497
- "learning_rate": 4.324576079889508e-06,
498
- "loss": 0.4922,
499
- "step": 700
500
- },
501
- {
502
- "epoch": 0.9607577807848444,
503
- "grad_norm": 1.201177645302697,
504
- "learning_rate": 4.297437668495241e-06,
505
- "loss": 0.4789,
506
- "step": 710
507
- },
508
- {
509
- "epoch": 0.9742895805142084,
510
- "grad_norm": 1.217638325880889,
511
- "learning_rate": 4.269853540500404e-06,
512
- "loss": 0.4777,
513
- "step": 720
514
- },
515
- {
516
- "epoch": 0.9878213802435724,
517
- "grad_norm": 1.2410357155495917,
518
- "learning_rate": 4.2418305360330135e-06,
519
- "loss": 0.4677,
520
- "step": 730
521
- },
522
- {
523
- "epoch": 1.0013531799729365,
524
- "grad_norm": 1.4640185730269493,
525
- "learning_rate": 4.21337560405075e-06,
526
- "loss": 0.4706,
527
- "step": 740
528
- },
529
- {
530
- "epoch": 1.0148849797023005,
531
- "grad_norm": 1.38525301927104,
532
- "learning_rate": 4.184495800617795e-06,
533
- "loss": 0.3663,
534
- "step": 750
535
- },
536
- {
537
- "epoch": 1.0284167794316643,
538
- "grad_norm": 1.3150399206192376,
539
- "learning_rate": 4.1551982871551195e-06,
540
- "loss": 0.3446,
541
- "step": 760
542
- },
543
- {
544
- "epoch": 1.0419485791610283,
545
- "grad_norm": 1.2508231428791732,
546
- "learning_rate": 4.125490328664639e-06,
547
- "loss": 0.3433,
548
- "step": 770
549
- },
550
- {
551
- "epoch": 1.0554803788903924,
552
- "grad_norm": 1.1828710966111038,
553
- "learning_rate": 4.095379291927689e-06,
554
- "loss": 0.3372,
555
- "step": 780
556
- },
557
- {
558
- "epoch": 1.0690121786197564,
559
- "grad_norm": 1.3234842465450887,
560
- "learning_rate": 4.064872643678261e-06,
561
- "loss": 0.3492,
562
- "step": 790
563
- },
564
- {
565
- "epoch": 1.0825439783491204,
566
- "grad_norm": 1.3510823566344805,
567
- "learning_rate": 4.033977948751445e-06,
568
- "loss": 0.3538,
569
- "step": 800
570
- },
571
- {
572
- "epoch": 1.0960757780784844,
573
- "grad_norm": 1.162520339907919,
574
- "learning_rate": 4.002702868207563e-06,
575
- "loss": 0.3462,
576
- "step": 810
577
- },
578
- {
579
- "epoch": 1.1096075778078485,
580
- "grad_norm": 1.2976046378570087,
581
- "learning_rate": 3.971055157432421e-06,
582
- "loss": 0.3474,
583
- "step": 820
584
- },
585
- {
586
- "epoch": 1.1231393775372125,
587
- "grad_norm": 1.2857491708515112,
588
- "learning_rate": 3.939042664214185e-06,
589
- "loss": 0.3365,
590
- "step": 830
591
- },
592
- {
593
- "epoch": 1.1366711772665765,
594
- "grad_norm": 1.2956865515811835,
595
- "learning_rate": 3.9066733267973335e-06,
596
- "loss": 0.3483,
597
- "step": 840
598
- },
599
- {
600
- "epoch": 1.1502029769959405,
601
- "grad_norm": 1.2863826250955843,
602
- "learning_rate": 3.873955171914196e-06,
603
- "loss": 0.3515,
604
- "step": 850
605
- },
606
- {
607
- "epoch": 1.1637347767253043,
608
- "grad_norm": 1.48236681103959,
609
- "learning_rate": 3.840896312794523e-06,
610
- "loss": 0.3502,
611
- "step": 860
612
- },
613
- {
614
- "epoch": 1.1772665764546684,
615
- "grad_norm": 1.333493280950809,
616
- "learning_rate": 3.8075049471536317e-06,
617
- "loss": 0.3402,
618
- "step": 870
619
- },
620
- {
621
- "epoch": 1.1907983761840324,
622
- "grad_norm": 1.3845427764767562,
623
- "learning_rate": 3.773789355159587e-06,
624
- "loss": 0.3634,
625
- "step": 880
626
- },
627
- {
628
- "epoch": 1.2043301759133964,
629
- "grad_norm": 1.1768813996050986,
630
- "learning_rate": 3.7397578973799432e-06,
631
- "loss": 0.3426,
632
- "step": 890
633
- },
634
- {
635
- "epoch": 1.2178619756427604,
636
- "grad_norm": 1.2400576449491367,
637
- "learning_rate": 3.7054190127085414e-06,
638
- "loss": 0.3643,
639
- "step": 900
640
- },
641
- {
642
- "epoch": 1.2313937753721245,
643
- "grad_norm": 1.3771269453073722,
644
- "learning_rate": 3.6707812162728963e-06,
645
- "loss": 0.3618,
646
- "step": 910
647
- },
648
- {
649
- "epoch": 1.2449255751014885,
650
- "grad_norm": 1.342260391925942,
651
- "learning_rate": 3.6358530973226634e-06,
652
- "loss": 0.3619,
653
- "step": 920
654
- },
655
- {
656
- "epoch": 1.2584573748308525,
657
- "grad_norm": 1.305612405058302,
658
- "learning_rate": 3.600643317099742e-06,
659
- "loss": 0.343,
660
- "step": 930
661
- },
662
- {
663
- "epoch": 1.2719891745602165,
664
- "grad_norm": 1.2499532104940625,
665
- "learning_rate": 3.5651606066905125e-06,
666
- "loss": 0.3469,
667
- "step": 940
668
- },
669
- {
670
- "epoch": 1.2855209742895806,
671
- "grad_norm": 1.3139038237546603,
672
- "learning_rate": 3.529413764860763e-06,
673
- "loss": 0.3547,
674
- "step": 950
675
- },
676
- {
677
- "epoch": 1.2990527740189446,
678
- "grad_norm": 1.3891975036748492,
679
- "learning_rate": 3.493411655873826e-06,
680
- "loss": 0.3565,
681
- "step": 960
682
- },
683
- {
684
- "epoch": 1.3125845737483086,
685
- "grad_norm": 1.31663081113206,
686
- "learning_rate": 3.4571632072924853e-06,
687
- "loss": 0.3459,
688
- "step": 970
689
- },
690
- {
691
- "epoch": 1.3261163734776726,
692
- "grad_norm": 1.4316096903150302,
693
- "learning_rate": 3.4206774077651706e-06,
694
- "loss": 0.3435,
695
- "step": 980
696
- },
697
- {
698
- "epoch": 1.3396481732070367,
699
- "grad_norm": 1.2148334230866564,
700
- "learning_rate": 3.383963304797016e-06,
701
- "loss": 0.3505,
702
- "step": 990
703
- },
704
- {
705
- "epoch": 1.3531799729364005,
706
- "grad_norm": 1.371605209380619,
707
- "learning_rate": 3.347030002506321e-06,
708
- "loss": 0.3367,
709
- "step": 1000
710
- },
711
- {
712
- "epoch": 1.3667117726657645,
713
- "grad_norm": 1.2492881761277035,
714
- "learning_rate": 3.309886659366967e-06,
715
- "loss": 0.3409,
716
- "step": 1010
717
- },
718
- {
719
- "epoch": 1.3802435723951285,
720
- "grad_norm": 1.3599908217112529,
721
- "learning_rate": 3.272542485937369e-06,
722
- "loss": 0.3485,
723
- "step": 1020
724
- },
725
- {
726
- "epoch": 1.3937753721244925,
727
- "grad_norm": 1.3587500825976617,
728
- "learning_rate": 3.2350067425764932e-06,
729
- "loss": 0.3564,
730
- "step": 1030
731
- },
732
- {
733
- "epoch": 1.4073071718538566,
734
- "grad_norm": 1.286120602039562,
735
- "learning_rate": 3.1972887371475404e-06,
736
- "loss": 0.3445,
737
- "step": 1040
738
- },
739
- {
740
- "epoch": 1.4208389715832206,
741
- "grad_norm": 1.3034585004391024,
742
- "learning_rate": 3.1593978227098442e-06,
743
- "loss": 0.3499,
744
- "step": 1050
745
- },
746
- {
747
- "epoch": 1.4343707713125846,
748
- "grad_norm": 1.4093897542299632,
749
- "learning_rate": 3.1213433951995585e-06,
750
- "loss": 0.3476,
751
- "step": 1060
752
- },
753
- {
754
- "epoch": 1.4479025710419486,
755
- "grad_norm": 1.2311850270587112,
756
- "learning_rate": 3.0831348910997206e-06,
757
- "loss": 0.3364,
758
- "step": 1070
759
- },
760
- {
761
- "epoch": 1.4614343707713127,
762
- "grad_norm": 1.222347605205662,
763
- "learning_rate": 3.0447817851002493e-06,
764
- "loss": 0.3479,
765
- "step": 1080
766
- },
767
- {
768
- "epoch": 1.4749661705006765,
769
- "grad_norm": 1.296829055261883,
770
- "learning_rate": 3.0062935877484807e-06,
771
- "loss": 0.3483,
772
- "step": 1090
773
- },
774
- {
775
- "epoch": 1.4884979702300405,
776
- "grad_norm": 1.4143686463454304,
777
- "learning_rate": 2.9676798430908e-06,
778
- "loss": 0.3468,
779
- "step": 1100
780
- },
781
- {
782
- "epoch": 1.5020297699594045,
783
- "grad_norm": 1.3314404969152827,
784
- "learning_rate": 2.9289501263059796e-06,
785
- "loss": 0.3526,
786
- "step": 1110
787
- },
788
- {
789
- "epoch": 1.5155615696887685,
790
- "grad_norm": 1.2209137417793836,
791
- "learning_rate": 2.890114041330782e-06,
792
- "loss": 0.3527,
793
- "step": 1120
794
- },
795
- {
796
- "epoch": 1.5290933694181326,
797
- "grad_norm": 1.3653030646734905,
798
- "learning_rate": 2.8511812184784476e-06,
799
- "loss": 0.3486,
800
- "step": 1130
801
- },
802
- {
803
- "epoch": 1.5426251691474966,
804
- "grad_norm": 1.3012446208990969,
805
- "learning_rate": 2.8121613120506274e-06,
806
- "loss": 0.3541,
807
- "step": 1140
808
- },
809
- {
810
- "epoch": 1.5561569688768606,
811
- "grad_norm": 1.1960147547589701,
812
- "learning_rate": 2.7730639979433778e-06,
813
- "loss": 0.3473,
814
- "step": 1150
815
- },
816
- {
817
- "epoch": 1.5696887686062246,
818
- "grad_norm": 1.4960354983589137,
819
- "learning_rate": 2.733898971247795e-06,
820
- "loss": 0.3456,
821
- "step": 1160
822
- },
823
- {
824
- "epoch": 1.5832205683355887,
825
- "grad_norm": 1.2152217200297626,
826
- "learning_rate": 2.6946759438458898e-06,
827
- "loss": 0.3433,
828
- "step": 1170
829
- },
830
- {
831
- "epoch": 1.5967523680649527,
832
- "grad_norm": 1.3453974757862506,
833
- "learning_rate": 2.655404642002304e-06,
834
- "loss": 0.3409,
835
- "step": 1180
836
- },
837
- {
838
- "epoch": 1.6102841677943167,
839
- "grad_norm": 1.3428793347679469,
840
- "learning_rate": 2.6160948039524497e-06,
841
- "loss": 0.3419,
842
- "step": 1190
843
- },
844
- {
845
- "epoch": 1.6238159675236807,
846
- "grad_norm": 1.2285874250356115,
847
- "learning_rate": 2.576756177487694e-06,
848
- "loss": 0.3508,
849
- "step": 1200
850
- },
851
- {
852
- "epoch": 1.6373477672530448,
853
- "grad_norm": 1.3449344602808873,
854
- "learning_rate": 2.5373985175381595e-06,
855
- "loss": 0.3503,
856
- "step": 1210
857
- },
858
- {
859
- "epoch": 1.6508795669824088,
860
- "grad_norm": 1.296420958776628,
861
- "learning_rate": 2.4980315837537682e-06,
862
- "loss": 0.3562,
863
- "step": 1220
864
- },
865
- {
866
- "epoch": 1.6644113667117728,
867
- "grad_norm": 1.3437387415224382,
868
- "learning_rate": 2.458665138084104e-06,
869
- "loss": 0.3405,
870
- "step": 1230
871
- },
872
- {
873
- "epoch": 1.6779431664411368,
874
- "grad_norm": 1.388342667143761,
875
- "learning_rate": 2.4193089423577125e-06,
876
- "loss": 0.3609,
877
- "step": 1240
878
- },
879
- {
880
- "epoch": 1.6914749661705006,
881
- "grad_norm": 1.2780103853967297,
882
- "learning_rate": 2.379972755861427e-06,
883
- "loss": 0.3464,
884
- "step": 1250
885
- },
886
- {
887
- "epoch": 1.7050067658998647,
888
- "grad_norm": 1.3272035289197703,
889
- "learning_rate": 2.3406663329203235e-06,
890
- "loss": 0.3495,
891
- "step": 1260
892
- },
893
- {
894
- "epoch": 1.7185385656292287,
895
- "grad_norm": 1.329132748454747,
896
- "learning_rate": 2.3013994204789125e-06,
897
- "loss": 0.3445,
898
- "step": 1270
899
- },
900
- {
901
- "epoch": 1.7320703653585927,
902
- "grad_norm": 1.398385700455576,
903
- "learning_rate": 2.262181755684152e-06,
904
- "loss": 0.3525,
905
- "step": 1280
906
- },
907
- {
908
- "epoch": 1.7456021650879567,
909
- "grad_norm": 1.3017040678134042,
910
- "learning_rate": 2.2230230634709004e-06,
911
- "loss": 0.3437,
912
- "step": 1290
913
- },
914
- {
915
- "epoch": 1.7591339648173205,
916
- "grad_norm": 1.2733756916822523,
917
- "learning_rate": 2.1839330541503846e-06,
918
- "loss": 0.3507,
919
- "step": 1300
920
- },
921
- {
922
- "epoch": 1.7726657645466846,
923
- "grad_norm": 1.2687781856621299,
924
- "learning_rate": 2.14492142100231e-06,
925
- "loss": 0.343,
926
- "step": 1310
927
- },
928
- {
929
- "epoch": 1.7861975642760486,
930
- "grad_norm": 1.2837756754284302,
931
- "learning_rate": 2.105997837871179e-06,
932
- "loss": 0.3567,
933
- "step": 1320
934
- },
935
- {
936
- "epoch": 1.7997293640054126,
937
- "grad_norm": 1.2388730929923404,
938
- "learning_rate": 2.0671719567674396e-06,
939
- "loss": 0.344,
940
- "step": 1330
941
- },
942
- {
943
- "epoch": 1.8132611637347766,
944
- "grad_norm": 1.2901053029094909,
945
- "learning_rate": 2.028453405474043e-06,
946
- "loss": 0.3517,
947
- "step": 1340
948
- },
949
- {
950
- "epoch": 1.8267929634641407,
951
- "grad_norm": 1.2731165360362213,
952
- "learning_rate": 1.9898517851590085e-06,
953
- "loss": 0.3386,
954
- "step": 1350
955
- },
956
- {
957
- "epoch": 1.8403247631935047,
958
- "grad_norm": 1.3976657362615021,
959
- "learning_rate": 1.951376667994594e-06,
960
- "loss": 0.341,
961
- "step": 1360
962
- },
963
- {
964
- "epoch": 1.8538565629228687,
965
- "grad_norm": 1.3707046787252906,
966
- "learning_rate": 1.913037594783648e-06,
967
- "loss": 0.3537,
968
- "step": 1370
969
- },
970
- {
971
- "epoch": 1.8673883626522327,
972
- "grad_norm": 1.2631835198560484,
973
- "learning_rate": 1.8748440725937485e-06,
974
- "loss": 0.3565,
975
- "step": 1380
976
- },
977
- {
978
- "epoch": 1.8809201623815968,
979
- "grad_norm": 1.2721719565376515,
980
- "learning_rate": 1.8368055723997013e-06,
981
- "loss": 0.3522,
982
- "step": 1390
983
- },
984
- {
985
- "epoch": 1.8944519621109608,
986
- "grad_norm": 1.2577752707995227,
987
- "learning_rate": 1.7989315267349936e-06,
988
- "loss": 0.3454,
989
- "step": 1400
990
- },
991
- {
992
- "epoch": 1.9079837618403248,
993
- "grad_norm": 1.514027269766047,
994
- "learning_rate": 1.7612313273527731e-06,
995
- "loss": 0.3496,
996
- "step": 1410
997
- },
998
- {
999
- "epoch": 1.9215155615696888,
1000
- "grad_norm": 1.292691163793762,
1001
- "learning_rate": 1.7237143228969488e-06,
1002
- "loss": 0.348,
1003
- "step": 1420
1004
- },
1005
- {
1006
- "epoch": 1.9350473612990529,
1007
- "grad_norm": 1.3014087462969852,
1008
- "learning_rate": 1.686389816583973e-06,
1009
- "loss": 0.3577,
1010
- "step": 1430
1011
- },
1012
- {
1013
- "epoch": 1.9485791610284169,
1014
- "grad_norm": 1.214993986220213,
1015
- "learning_rate": 1.6492670638958924e-06,
1016
- "loss": 0.3362,
1017
- "step": 1440
1018
- },
1019
- {
1020
- "epoch": 1.962110960757781,
1021
- "grad_norm": 1.221639454284981,
1022
- "learning_rate": 1.6123552702852375e-06,
1023
- "loss": 0.3347,
1024
- "step": 1450
1025
- },
1026
- {
1027
- "epoch": 1.975642760487145,
1028
- "grad_norm": 1.36951304107227,
1029
- "learning_rate": 1.5756635888923122e-06,
1030
- "loss": 0.3408,
1031
- "step": 1460
1032
- },
1033
- {
1034
- "epoch": 1.989174560216509,
1035
- "grad_norm": 1.291050299529943,
1036
- "learning_rate": 1.539201118275463e-06,
1037
- "loss": 0.3516,
1038
- "step": 1470
1039
- },
1040
- {
1041
- "epoch": 2.002706359945873,
1042
- "grad_norm": 1.5299892777665929,
1043
- "learning_rate": 1.502976900154876e-06,
1044
- "loss": 0.3294,
1045
- "step": 1480
1046
- },
1047
- {
1048
- "epoch": 2.016238159675237,
1049
- "grad_norm": 1.4268912812659167,
1050
- "learning_rate": 1.4669999171704742e-06,
1051
- "loss": 0.2451,
1052
- "step": 1490
1053
- },
1054
- {
1055
- "epoch": 2.029769959404601,
1056
- "grad_norm": 1.3062098172273324,
1057
- "learning_rate": 1.43127909065446e-06,
1058
- "loss": 0.2438,
1059
- "step": 1500
1060
- },
1061
- {
1062
- "epoch": 2.0433017591339646,
1063
- "grad_norm": 1.4093327122429686,
1064
- "learning_rate": 1.395823278419065e-06,
1065
- "loss": 0.2458,
1066
- "step": 1510
1067
- },
1068
- {
1069
- "epoch": 2.0568335588633286,
1070
- "grad_norm": 1.3446778247332223,
1071
- "learning_rate": 1.3606412725600471e-06,
1072
- "loss": 0.2483,
1073
- "step": 1520
1074
- },
1075
- {
1076
- "epoch": 2.0703653585926927,
1077
- "grad_norm": 1.288956301952182,
1078
- "learning_rate": 1.3257417972764853e-06,
1079
- "loss": 0.242,
1080
- "step": 1530
1081
- },
1082
- {
1083
- "epoch": 2.0838971583220567,
1084
- "grad_norm": 1.2722385458217775,
1085
- "learning_rate": 1.2911335067074108e-06,
1086
- "loss": 0.2318,
1087
- "step": 1540
1088
- },
1089
- {
1090
- "epoch": 2.0974289580514207,
1091
- "grad_norm": 1.3902319886251955,
1092
- "learning_rate": 1.2568249827858153e-06,
1093
- "loss": 0.2331,
1094
- "step": 1550
1095
- },
1096
- {
1097
- "epoch": 2.1109607577807847,
1098
- "grad_norm": 1.343020265634266,
1099
- "learning_rate": 1.2228247331105541e-06,
1100
- "loss": 0.242,
1101
- "step": 1560
1102
- },
1103
- {
1104
- "epoch": 2.1244925575101488,
1105
- "grad_norm": 1.3796390105439542,
1106
- "learning_rate": 1.1891411888366933e-06,
1107
- "loss": 0.2494,
1108
- "step": 1570
1109
- },
1110
- {
1111
- "epoch": 2.138024357239513,
1112
- "grad_norm": 1.287377449079076,
1113
- "learning_rate": 1.1557827025848048e-06,
1114
- "loss": 0.2373,
1115
- "step": 1580
1116
- },
1117
- {
1118
- "epoch": 2.151556156968877,
1119
- "grad_norm": 1.2636968613031825,
1120
- "learning_rate": 1.122757546369744e-06,
1121
- "loss": 0.2276,
1122
- "step": 1590
1123
- },
1124
- {
1125
- "epoch": 2.165087956698241,
1126
- "grad_norm": 1.3768480512381478,
1127
- "learning_rate": 1.0900739095494053e-06,
1128
- "loss": 0.2392,
1129
- "step": 1600
1130
- },
1131
- {
1132
- "epoch": 2.178619756427605,
1133
- "grad_norm": 1.3082506088448747,
1134
- "learning_rate": 1.0577398967939824e-06,
1135
- "loss": 0.2361,
1136
- "step": 1610
1137
- },
1138
- {
1139
- "epoch": 2.192151556156969,
1140
- "grad_norm": 1.4034690066329039,
1141
- "learning_rate": 1.0257635260762281e-06,
1142
- "loss": 0.2319,
1143
- "step": 1620
1144
- },
1145
- {
1146
- "epoch": 2.205683355886333,
1147
- "grad_norm": 1.2317759668032917,
1148
- "learning_rate": 9.941527266832064e-07,
1149
- "loss": 0.2493,
1150
- "step": 1630
1151
- },
1152
- {
1153
- "epoch": 2.219215155615697,
1154
- "grad_norm": 1.2971411614293273,
1155
- "learning_rate": 9.62915337250044e-07,
1156
- "loss": 0.2277,
1157
- "step": 1640
1158
- },
1159
- {
1160
- "epoch": 2.232746955345061,
1161
- "grad_norm": 1.3135829424346515,
1162
- "learning_rate": 9.320591038161575e-07,
1163
- "loss": 0.2328,
1164
- "step": 1650
1165
- },
1166
- {
1167
- "epoch": 2.246278755074425,
1168
- "grad_norm": 1.38689896774966,
1169
- "learning_rate": 9.015916779044429e-07,
1170
- "loss": 0.2376,
1171
- "step": 1660
1172
- },
1173
- {
1174
- "epoch": 2.259810554803789,
1175
- "grad_norm": 1.3904871269861285,
1176
- "learning_rate": 8.715206146238989e-07,
1177
- "loss": 0.2365,
1178
- "step": 1670
1179
- },
1180
- {
1181
- "epoch": 2.273342354533153,
1182
- "grad_norm": 1.2344911252878241,
1183
- "learning_rate": 8.418533707961635e-07,
1184
- "loss": 0.2398,
1185
- "step": 1680
1186
- },
1187
- {
1188
- "epoch": 2.286874154262517,
1189
- "grad_norm": 1.404914010406795,
1190
- "learning_rate": 8.125973031064241e-07,
1191
- "loss": 0.2375,
1192
- "step": 1690
1193
- },
1194
- {
1195
- "epoch": 2.300405953991881,
1196
- "grad_norm": 1.4005244602892066,
1197
- "learning_rate": 7.837596662791492e-07,
1198
- "loss": 0.2418,
1199
- "step": 1700
1200
- },
1201
- {
1202
- "epoch": 2.313937753721245,
1203
- "grad_norm": 1.3420607852511892,
1204
- "learning_rate": 7.553476112791156e-07,
1205
- "loss": 0.2332,
1206
- "step": 1710
1207
- },
1208
- {
1209
- "epoch": 2.3274695534506087,
1210
- "grad_norm": 1.3547101946313251,
1211
- "learning_rate": 7.273681835381569e-07,
1212
- "loss": 0.229,
1213
- "step": 1720
1214
- },
1215
- {
1216
- "epoch": 2.3410013531799727,
1217
- "grad_norm": 1.4020705762318206,
1218
- "learning_rate": 6.998283212080873e-07,
1219
- "loss": 0.2328,
1220
- "step": 1730
1221
- },
1222
- {
1223
- "epoch": 2.3545331529093367,
1224
- "grad_norm": 1.3623596919516412,
1225
- "learning_rate": 6.727348534402217e-07,
1226
- "loss": 0.2525,
1227
- "step": 1740
1228
- },
1229
- {
1230
- "epoch": 2.3680649526387008,
1231
- "grad_norm": 1.3512700634608603,
1232
- "learning_rate": 6.460944986919296e-07,
1233
- "loss": 0.2408,
1234
- "step": 1750
1235
- },
1236
- {
1237
- "epoch": 2.381596752368065,
1238
- "grad_norm": 1.3266784606321098,
1239
- "learning_rate": 6.199138630606389e-07,
1240
- "loss": 0.234,
1241
- "step": 1760
1242
- },
1243
- {
1244
- "epoch": 2.395128552097429,
1245
- "grad_norm": 1.3645670227625384,
1246
- "learning_rate": 5.941994386456962e-07,
1247
- "loss": 0.2339,
1248
- "step": 1770
1249
- },
1250
- {
1251
- "epoch": 2.408660351826793,
1252
- "grad_norm": 1.4293775086324754,
1253
- "learning_rate": 5.689576019385015e-07,
1254
- "loss": 0.2386,
1255
- "step": 1780
1256
- },
1257
- {
1258
- "epoch": 2.422192151556157,
1259
- "grad_norm": 1.4511738533160017,
1260
- "learning_rate": 5.441946122413086e-07,
1261
- "loss": 0.2456,
1262
- "step": 1790
1263
- },
1264
- {
1265
- "epoch": 2.435723951285521,
1266
- "grad_norm": 1.4144969812794062,
1267
- "learning_rate": 5.199166101150854e-07,
1268
- "loss": 0.242,
1269
- "step": 1800
1270
- },
1271
- {
1272
- "epoch": 2.449255751014885,
1273
- "grad_norm": 1.3760683860213159,
1274
- "learning_rate": 4.96129615856816e-07,
1275
- "loss": 0.2319,
1276
- "step": 1810
1277
- },
1278
- {
1279
- "epoch": 2.462787550744249,
1280
- "grad_norm": 1.2962323322234228,
1281
- "learning_rate": 4.7283952800663086e-07,
1282
- "loss": 0.239,
1283
- "step": 1820
1284
- },
1285
- {
1286
- "epoch": 2.476319350473613,
1287
- "grad_norm": 1.3785919346369462,
1288
- "learning_rate": 4.500521218851234e-07,
1289
- "loss": 0.2291,
1290
- "step": 1830
1291
- },
1292
- {
1293
- "epoch": 2.489851150202977,
1294
- "grad_norm": 1.3198852846590903,
1295
- "learning_rate": 4.2777304816122744e-07,
1296
- "loss": 0.2294,
1297
- "step": 1840
1298
- },
1299
- {
1300
- "epoch": 2.503382949932341,
1301
- "grad_norm": 1.3791429787991882,
1302
- "learning_rate": 4.0600783145099935e-07,
1303
- "loss": 0.2398,
1304
- "step": 1850
1305
- },
1306
- {
1307
- "epoch": 2.516914749661705,
1308
- "grad_norm": 1.3758327911725443,
1309
- "learning_rate": 3.847618689476612e-07,
1310
- "loss": 0.2311,
1311
- "step": 1860
1312
- },
1313
- {
1314
- "epoch": 2.530446549391069,
1315
- "grad_norm": 1.329309433256462,
1316
- "learning_rate": 3.640404290832433e-07,
1317
- "loss": 0.2322,
1318
- "step": 1870
1319
- },
1320
- {
1321
- "epoch": 2.543978349120433,
1322
- "grad_norm": 1.3667475875891038,
1323
- "learning_rate": 3.438486502221494e-07,
1324
- "loss": 0.2471,
1325
- "step": 1880
1326
- },
1327
- {
1328
- "epoch": 2.557510148849797,
1329
- "grad_norm": 1.4518282018049613,
1330
- "learning_rate": 3.2419153938698295e-07,
1331
- "loss": 0.2346,
1332
- "step": 1890
1333
- },
1334
- {
1335
- "epoch": 2.571041948579161,
1336
- "grad_norm": 1.334115365965958,
1337
- "learning_rate": 3.0507397101693565e-07,
1338
- "loss": 0.2383,
1339
- "step": 1900
1340
- },
1341
- {
1342
- "epoch": 2.584573748308525,
1343
- "grad_norm": 1.368769415739159,
1344
- "learning_rate": 2.865006857590619e-07,
1345
- "loss": 0.2344,
1346
- "step": 1910
1347
- },
1348
- {
1349
- "epoch": 2.598105548037889,
1350
- "grad_norm": 1.2174336094447944,
1351
- "learning_rate": 2.684762892927184e-07,
1352
- "loss": 0.2354,
1353
- "step": 1920
1354
- },
1355
- {
1356
- "epoch": 2.611637347767253,
1357
- "grad_norm": 1.3104470556604222,
1358
- "learning_rate": 2.510052511874822e-07,
1359
- "loss": 0.2297,
1360
- "step": 1930
1361
- },
1362
- {
1363
- "epoch": 2.6251691474966172,
1364
- "grad_norm": 1.3766509507609574,
1365
- "learning_rate": 2.3409190379481723e-07,
1366
- "loss": 0.2294,
1367
- "step": 1940
1368
- },
1369
- {
1370
- "epoch": 2.6387009472259813,
1371
- "grad_norm": 1.2110460902512004,
1372
- "learning_rate": 2.1774044117376407e-07,
1373
- "loss": 0.2273,
1374
- "step": 1950
1375
- },
1376
- {
1377
- "epoch": 2.6522327469553453,
1378
- "grad_norm": 1.294674188348231,
1379
- "learning_rate": 2.019549180509267e-07,
1380
- "loss": 0.2304,
1381
- "step": 1960
1382
- },
1383
- {
1384
- "epoch": 2.6657645466847093,
1385
- "grad_norm": 1.4025491240100805,
1386
- "learning_rate": 1.8673924881500826e-07,
1387
- "loss": 0.2361,
1388
- "step": 1970
1389
- },
1390
- {
1391
- "epoch": 2.6792963464140733,
1392
- "grad_norm": 1.3312582202000482,
1393
- "learning_rate": 1.7209720654614793e-07,
1394
- "loss": 0.2417,
1395
- "step": 1980
1396
- },
1397
- {
1398
- "epoch": 2.6928281461434374,
1399
- "grad_norm": 1.279370227399727,
1400
- "learning_rate": 1.580324220802959e-07,
1401
- "loss": 0.2306,
1402
- "step": 1990
1403
- },
1404
- {
1405
- "epoch": 2.706359945872801,
1406
- "grad_norm": 1.32387777543638,
1407
- "learning_rate": 1.4454838310886427e-07,
1408
- "loss": 0.2293,
1409
- "step": 2000
1410
- },
1411
- {
1412
- "epoch": 2.719891745602165,
1413
- "grad_norm": 1.4245015480114729,
1414
- "learning_rate": 1.3164843331387123e-07,
1415
- "loss": 0.2311,
1416
- "step": 2010
1417
- },
1418
- {
1419
- "epoch": 2.733423545331529,
1420
- "grad_norm": 1.3831803662571336,
1421
- "learning_rate": 1.1933577153879993e-07,
1422
- "loss": 0.2387,
1423
- "step": 2020
1424
- },
1425
- {
1426
- "epoch": 2.746955345060893,
1427
- "grad_norm": 1.341165321378542,
1428
- "learning_rate": 1.0761345099536691e-07,
1429
- "loss": 0.2406,
1430
- "step": 2030
1431
- },
1432
- {
1433
- "epoch": 2.760487144790257,
1434
- "grad_norm": 1.4225949777687765,
1435
- "learning_rate": 9.648437850640929e-08,
1436
- "loss": 0.2334,
1437
- "step": 2040
1438
- },
1439
- {
1440
- "epoch": 2.774018944519621,
1441
- "grad_norm": 1.2903746437594332,
1442
- "learning_rate": 8.595131378507038e-08,
1443
- "loss": 0.2302,
1444
- "step": 2050
1445
- },
1446
- {
1447
- "epoch": 2.787550744248985,
1448
- "grad_norm": 1.4459041867407572,
1449
- "learning_rate": 7.601686875046338e-08,
1450
- "loss": 0.2357,
1451
- "step": 2060
1452
- },
1453
- {
1454
- "epoch": 2.801082543978349,
1455
- "grad_norm": 1.43510005352293,
1456
- "learning_rate": 6.668350687998565e-08,
1457
- "loss": 0.2354,
1458
- "step": 2070
1459
- },
1460
- {
1461
- "epoch": 2.814614343707713,
1462
- "grad_norm": 1.3495037373612448,
1463
- "learning_rate": 5.7953542598441215e-08,
1464
- "loss": 0.2299,
1465
- "step": 2080
1466
- },
1467
- {
1468
- "epoch": 2.828146143437077,
1469
- "grad_norm": 1.2822048194805191,
1470
- "learning_rate": 4.9829140704127644e-08,
1471
- "loss": 0.2304,
1472
- "step": 2090
1473
- },
1474
- {
1475
- "epoch": 2.841677943166441,
1476
- "grad_norm": 1.4152294147048579,
1477
- "learning_rate": 4.231231583201989e-08,
1478
- "loss": 0.2373,
1479
- "step": 2100
1480
- },
1481
- {
1482
- "epoch": 2.855209742895805,
1483
- "grad_norm": 1.4204373591003197,
1484
- "learning_rate": 3.5404931954197696e-08,
1485
- "loss": 0.2359,
1486
- "step": 2110
1487
- },
1488
- {
1489
- "epoch": 2.8687415426251692,
1490
- "grad_norm": 1.3793380734018237,
1491
- "learning_rate": 2.9108701917630043e-08,
1492
- "loss": 0.2447,
1493
- "step": 2120
1494
- },
1495
- {
1496
- "epoch": 2.8822733423545333,
1497
- "grad_norm": 1.2804260757138324,
1498
- "learning_rate": 2.3425187019432415e-08,
1499
- "loss": 0.2446,
1500
- "step": 2130
1501
- },
1502
- {
1503
- "epoch": 2.8958051420838973,
1504
- "grad_norm": 1.3777377768354557,
1505
- "learning_rate": 1.8355796619708988e-08,
1506
- "loss": 0.2367,
1507
- "step": 2140
1508
- },
1509
- {
1510
- "epoch": 2.9093369418132613,
1511
- "grad_norm": 1.3764369341392175,
1512
- "learning_rate": 1.39017877920683e-08,
1513
- "loss": 0.2357,
1514
- "step": 2150
1515
- },
1516
- {
1517
- "epoch": 2.9228687415426253,
1518
- "grad_norm": 1.4731353222848615,
1519
- "learning_rate": 1.006426501190233e-08,
1520
- "loss": 0.2355,
1521
- "step": 2160
1522
- },
1523
- {
1524
- "epoch": 2.936400541271989,
1525
- "grad_norm": 1.3695944357270635,
1526
- "learning_rate": 6.844179882506685e-09,
1527
- "loss": 0.2312,
1528
- "step": 2170
1529
- },
1530
- {
1531
- "epoch": 2.949932341001353,
1532
- "grad_norm": 1.3973969195223515,
1533
- "learning_rate": 4.242330899106861e-09,
1534
- "loss": 0.2403,
1535
- "step": 2180
1536
- },
1537
- {
1538
- "epoch": 2.963464140730717,
1539
- "grad_norm": 1.330953340046642,
1540
- "learning_rate": 2.259363250854685e-09,
1541
- "loss": 0.2264,
1542
- "step": 2190
1543
- },
1544
- {
1545
- "epoch": 2.976995940460081,
1546
- "grad_norm": 1.305324417569301,
1547
- "learning_rate": 8.957686608371263e-10,
1548
- "loss": 0.2328,
1549
- "step": 2200
1550
- },
1551
- {
1552
- "epoch": 2.990527740189445,
1553
- "grad_norm": 1.349805214463894,
1554
- "learning_rate": 1.5188526414244842e-10,
1555
- "loss": 0.2348,
1556
- "step": 2210
1557
- },
1558
- {
1559
- "epoch": 3.0,
1560
- "step": 2217,
1561
- "total_flos": 438250533027840.0,
1562
- "train_loss": 0.3635299830593717,
1563
- "train_runtime": 20239.4479,
1564
- "train_samples_per_second": 14.013,
1565
- "train_steps_per_second": 0.11
1566
- }
1567
- ],
1568
- "logging_steps": 10,
1569
- "max_steps": 2217,
1570
- "num_input_tokens_seen": 0,
1571
- "num_train_epochs": 3,
1572
- "save_steps": 10086,
1573
- "stateful_callbacks": {
1574
- "TrainerControl": {
1575
- "args": {
1576
- "should_epoch_stop": false,
1577
- "should_evaluate": false,
1578
- "should_log": false,
1579
- "should_save": false,
1580
- "should_training_stop": false
1581
- },
1582
- "attributes": {}
1583
- }
1584
- },
1585
- "total_flos": 438250533027840.0,
1586
- "train_batch_size": 8,
1587
- "trial_name": null,
1588
- "trial_params": null
1589
- }