{
"best_metric": 0.6554008152173914,
"best_model_checkpoint": "demo_LID_ntu-spml_distilhubert/checkpoint-6930",
"epoch": 9.99891891891892,
"eval_steps": 500,
"global_step": 6930,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014414414414414415,
"grad_norm": 2.169387102127075,
"learning_rate": 4.329004329004329e-06,
"loss": 15.2197,
"step": 10
},
{
"epoch": 0.02882882882882883,
"grad_norm": 2.4440665245056152,
"learning_rate": 8.658008658008657e-06,
"loss": 15.2046,
"step": 20
},
{
"epoch": 0.043243243243243246,
"grad_norm": 1.9768311977386475,
"learning_rate": 1.2987012987012986e-05,
"loss": 15.2027,
"step": 30
},
{
"epoch": 0.05765765765765766,
"grad_norm": 2.598134994506836,
"learning_rate": 1.7316017316017315e-05,
"loss": 15.1842,
"step": 40
},
{
"epoch": 0.07207207207207207,
"grad_norm": 2.2137622833251953,
"learning_rate": 2.164502164502164e-05,
"loss": 15.1876,
"step": 50
},
{
"epoch": 0.08648648648648649,
"grad_norm": 2.252912759780884,
"learning_rate": 2.5974025974025972e-05,
"loss": 15.172,
"step": 60
},
{
"epoch": 0.1009009009009009,
"grad_norm": 2.699625015258789,
"learning_rate": 3.03030303030303e-05,
"loss": 15.1004,
"step": 70
},
{
"epoch": 0.11531531531531532,
"grad_norm": 2.774757146835327,
"learning_rate": 3.463203463203463e-05,
"loss": 15.0877,
"step": 80
},
{
"epoch": 0.12972972972972974,
"grad_norm": 2.9454381465911865,
"learning_rate": 3.896103896103895e-05,
"loss": 15.0704,
"step": 90
},
{
"epoch": 0.14414414414414414,
"grad_norm": 3.3984997272491455,
"learning_rate": 4.329004329004328e-05,
"loss": 15.0211,
"step": 100
},
{
"epoch": 0.15855855855855855,
"grad_norm": 3.1876633167266846,
"learning_rate": 4.7619047619047614e-05,
"loss": 14.9973,
"step": 110
},
{
"epoch": 0.17297297297297298,
"grad_norm": 3.87903094291687,
"learning_rate": 5.1948051948051944e-05,
"loss": 14.9534,
"step": 120
},
{
"epoch": 0.1873873873873874,
"grad_norm": 4.1114983558654785,
"learning_rate": 5.627705627705627e-05,
"loss": 14.8464,
"step": 130
},
{
"epoch": 0.2018018018018018,
"grad_norm": 3.837207078933716,
"learning_rate": 6.06060606060606e-05,
"loss": 14.8935,
"step": 140
},
{
"epoch": 0.21621621621621623,
"grad_norm": 3.978295087814331,
"learning_rate": 6.493506493506494e-05,
"loss": 14.8274,
"step": 150
},
{
"epoch": 0.23063063063063063,
"grad_norm": 5.2168145179748535,
"learning_rate": 6.926406926406926e-05,
"loss": 14.7065,
"step": 160
},
{
"epoch": 0.24504504504504504,
"grad_norm": 5.752880096435547,
"learning_rate": 7.359307359307358e-05,
"loss": 14.6178,
"step": 170
},
{
"epoch": 0.2594594594594595,
"grad_norm": 6.018016338348389,
"learning_rate": 7.79220779220779e-05,
"loss": 14.4008,
"step": 180
},
{
"epoch": 0.27387387387387385,
"grad_norm": 5.537229537963867,
"learning_rate": 8.225108225108224e-05,
"loss": 14.3105,
"step": 190
},
{
"epoch": 0.2882882882882883,
"grad_norm": 6.358255863189697,
"learning_rate": 8.658008658008657e-05,
"loss": 14.1688,
"step": 200
},
{
"epoch": 0.3027027027027027,
"grad_norm": 6.9536356925964355,
"learning_rate": 9.09090909090909e-05,
"loss": 14.1205,
"step": 210
},
{
"epoch": 0.3171171171171171,
"grad_norm": 8.093494415283203,
"learning_rate": 9.523809523809523e-05,
"loss": 14.1292,
"step": 220
},
{
"epoch": 0.33153153153153153,
"grad_norm": 6.803300380706787,
"learning_rate": 9.956709956709956e-05,
"loss": 13.9276,
"step": 230
},
{
"epoch": 0.34594594594594597,
"grad_norm": 6.665808200836182,
"learning_rate": 0.00010389610389610389,
"loss": 13.9136,
"step": 240
},
{
"epoch": 0.36036036036036034,
"grad_norm": 10.191052436828613,
"learning_rate": 0.00010822510822510823,
"loss": 13.708,
"step": 250
},
{
"epoch": 0.3747747747747748,
"grad_norm": 7.783840656280518,
"learning_rate": 0.00011255411255411254,
"loss": 13.6658,
"step": 260
},
{
"epoch": 0.3891891891891892,
"grad_norm": 11.964157104492188,
"learning_rate": 0.00011688311688311687,
"loss": 13.6014,
"step": 270
},
{
"epoch": 0.4036036036036036,
"grad_norm": 7.828129291534424,
"learning_rate": 0.0001212121212121212,
"loss": 13.3956,
"step": 280
},
{
"epoch": 0.418018018018018,
"grad_norm": 8.642557144165039,
"learning_rate": 0.00012554112554112555,
"loss": 13.4701,
"step": 290
},
{
"epoch": 0.43243243243243246,
"grad_norm": 8.499011993408203,
"learning_rate": 0.00012987012987012987,
"loss": 13.2608,
"step": 300
},
{
"epoch": 0.44684684684684683,
"grad_norm": 9.103832244873047,
"learning_rate": 0.0001341991341991342,
"loss": 12.8141,
"step": 310
},
{
"epoch": 0.46126126126126127,
"grad_norm": 8.243462562561035,
"learning_rate": 0.00013852813852813852,
"loss": 12.8678,
"step": 320
},
{
"epoch": 0.4756756756756757,
"grad_norm": 12.445680618286133,
"learning_rate": 0.00014285714285714284,
"loss": 12.9204,
"step": 330
},
{
"epoch": 0.4900900900900901,
"grad_norm": 10.037951469421387,
"learning_rate": 0.00014718614718614716,
"loss": 12.9456,
"step": 340
},
{
"epoch": 0.5045045045045045,
"grad_norm": 14.364166259765625,
"learning_rate": 0.00015151515151515152,
"loss": 12.5126,
"step": 350
},
{
"epoch": 0.518918918918919,
"grad_norm": 10.338336944580078,
"learning_rate": 0.0001558441558441558,
"loss": 13.2546,
"step": 360
},
{
"epoch": 0.5333333333333333,
"grad_norm": 9.899740219116211,
"learning_rate": 0.00016017316017316016,
"loss": 12.3445,
"step": 370
},
{
"epoch": 0.5477477477477477,
"grad_norm": 11.309089660644531,
"learning_rate": 0.00016450216450216449,
"loss": 12.2799,
"step": 380
},
{
"epoch": 0.5621621621621622,
"grad_norm": 11.268434524536133,
"learning_rate": 0.00016883116883116884,
"loss": 12.1578,
"step": 390
},
{
"epoch": 0.5765765765765766,
"grad_norm": 9.793964385986328,
"learning_rate": 0.00017316017316017313,
"loss": 11.9812,
"step": 400
},
{
"epoch": 0.590990990990991,
"grad_norm": 11.267273902893066,
"learning_rate": 0.00017748917748917746,
"loss": 12.1401,
"step": 410
},
{
"epoch": 0.6054054054054054,
"grad_norm": 10.383160591125488,
"learning_rate": 0.0001818181818181818,
"loss": 12.0603,
"step": 420
},
{
"epoch": 0.6198198198198198,
"grad_norm": 14.343868255615234,
"learning_rate": 0.00018614718614718616,
"loss": 11.2182,
"step": 430
},
{
"epoch": 0.6342342342342342,
"grad_norm": 13.931622505187988,
"learning_rate": 0.00019047619047619045,
"loss": 11.6929,
"step": 440
},
{
"epoch": 0.6486486486486487,
"grad_norm": 12.756230354309082,
"learning_rate": 0.00019480519480519478,
"loss": 11.9651,
"step": 450
},
{
"epoch": 0.6630630630630631,
"grad_norm": 13.018777847290039,
"learning_rate": 0.00019913419913419913,
"loss": 11.6416,
"step": 460
},
{
"epoch": 0.6774774774774774,
"grad_norm": 13.232623100280762,
"learning_rate": 0.00020346320346320345,
"loss": 11.4997,
"step": 470
},
{
"epoch": 0.6918918918918919,
"grad_norm": 12.543861389160156,
"learning_rate": 0.00020779220779220778,
"loss": 11.5597,
"step": 480
},
{
"epoch": 0.7063063063063063,
"grad_norm": 12.517231941223145,
"learning_rate": 0.0002121212121212121,
"loss": 11.1162,
"step": 490
},
{
"epoch": 0.7207207207207207,
"grad_norm": 14.61859130859375,
"learning_rate": 0.00021645021645021645,
"loss": 11.2086,
"step": 500
},
{
"epoch": 0.7351351351351352,
"grad_norm": 14.246715545654297,
"learning_rate": 0.00022077922077922075,
"loss": 11.2519,
"step": 510
},
{
"epoch": 0.7495495495495496,
"grad_norm": 13.88980484008789,
"learning_rate": 0.00022510822510822507,
"loss": 10.9391,
"step": 520
},
{
"epoch": 0.7639639639639639,
"grad_norm": 14.310384750366211,
"learning_rate": 0.00022943722943722942,
"loss": 10.7129,
"step": 530
},
{
"epoch": 0.7783783783783784,
"grad_norm": 13.765666007995605,
"learning_rate": 0.00023376623376623374,
"loss": 11.218,
"step": 540
},
{
"epoch": 0.7927927927927928,
"grad_norm": 17.789613723754883,
"learning_rate": 0.00023809523809523807,
"loss": 10.2992,
"step": 550
},
{
"epoch": 0.8072072072072072,
"grad_norm": 17.212533950805664,
"learning_rate": 0.0002424242424242424,
"loss": 11.1959,
"step": 560
},
{
"epoch": 0.8216216216216217,
"grad_norm": 14.872720718383789,
"learning_rate": 0.00024675324675324674,
"loss": 9.933,
"step": 570
},
{
"epoch": 0.836036036036036,
"grad_norm": 14.751778602600098,
"learning_rate": 0.0002510822510822511,
"loss": 10.2721,
"step": 580
},
{
"epoch": 0.8504504504504504,
"grad_norm": 13.110413551330566,
"learning_rate": 0.0002554112554112554,
"loss": 10.0697,
"step": 590
},
{
"epoch": 0.8648648648648649,
"grad_norm": 14.484004020690918,
"learning_rate": 0.00025974025974025974,
"loss": 10.6599,
"step": 600
},
{
"epoch": 0.8792792792792793,
"grad_norm": 15.150849342346191,
"learning_rate": 0.00026406926406926404,
"loss": 10.3077,
"step": 610
},
{
"epoch": 0.8936936936936937,
"grad_norm": 19.270540237426758,
"learning_rate": 0.0002683982683982684,
"loss": 10.2954,
"step": 620
},
{
"epoch": 0.9081081081081082,
"grad_norm": 17.365564346313477,
"learning_rate": 0.0002727272727272727,
"loss": 10.2966,
"step": 630
},
{
"epoch": 0.9225225225225225,
"grad_norm": 23.610044479370117,
"learning_rate": 0.00027705627705627703,
"loss": 9.4401,
"step": 640
},
{
"epoch": 0.9369369369369369,
"grad_norm": 16.38220977783203,
"learning_rate": 0.0002813852813852814,
"loss": 9.8423,
"step": 650
},
{
"epoch": 0.9513513513513514,
"grad_norm": 18.670101165771484,
"learning_rate": 0.0002857142857142857,
"loss": 10.2396,
"step": 660
},
{
"epoch": 0.9657657657657658,
"grad_norm": 20.733997344970703,
"learning_rate": 0.00029004329004329003,
"loss": 9.3347,
"step": 670
},
{
"epoch": 0.9801801801801802,
"grad_norm": 18.066375732421875,
"learning_rate": 0.00029437229437229433,
"loss": 10.4626,
"step": 680
},
{
"epoch": 0.9945945945945946,
"grad_norm": 18.0963191986084,
"learning_rate": 0.0002987012987012987,
"loss": 9.6557,
"step": 690
},
{
"epoch": 0.9989189189189189,
"eval_accuracy": 0.26137907608695654,
"eval_loss": 2.65486216545105,
"eval_runtime": 541.7254,
"eval_samples_per_second": 10.869,
"eval_steps_per_second": 10.869,
"step": 693
},
{
"epoch": 1.01009009009009,
"grad_norm": 15.17456340789795,
"learning_rate": 0.00029966329966329963,
"loss": 10.1474,
"step": 700
},
{
"epoch": 1.0245045045045045,
"grad_norm": 19.106407165527344,
"learning_rate": 0.00029918229918229916,
"loss": 8.6672,
"step": 710
},
{
"epoch": 1.038918918918919,
"grad_norm": 16.296113967895508,
"learning_rate": 0.0002987012987012987,
"loss": 8.7251,
"step": 720
},
{
"epoch": 1.0533333333333332,
"grad_norm": 22.187761306762695,
"learning_rate": 0.00029826839826839827,
"loss": 9.2252,
"step": 730
},
{
"epoch": 1.0677477477477477,
"grad_norm": 17.774612426757812,
"learning_rate": 0.00029778739778739773,
"loss": 8.3988,
"step": 740
},
{
"epoch": 1.0821621621621622,
"grad_norm": 22.759864807128906,
"learning_rate": 0.0002973063973063973,
"loss": 8.4637,
"step": 750
},
{
"epoch": 1.0965765765765765,
"grad_norm": 22.068397521972656,
"learning_rate": 0.0002968253968253968,
"loss": 9.4532,
"step": 760
},
{
"epoch": 1.110990990990991,
"grad_norm": 22.11869239807129,
"learning_rate": 0.0002963443963443963,
"loss": 8.5823,
"step": 770
},
{
"epoch": 1.1254054054054055,
"grad_norm": 20.577394485473633,
"learning_rate": 0.0002958633958633958,
"loss": 8.8257,
"step": 780
},
{
"epoch": 1.1398198198198197,
"grad_norm": 19.24051856994629,
"learning_rate": 0.00029538239538239535,
"loss": 8.4165,
"step": 790
},
{
"epoch": 1.1542342342342342,
"grad_norm": 18.745025634765625,
"learning_rate": 0.00029490139490139487,
"loss": 8.4419,
"step": 800
},
{
"epoch": 1.1686486486486487,
"grad_norm": 16.836870193481445,
"learning_rate": 0.0002944203944203944,
"loss": 8.2076,
"step": 810
},
{
"epoch": 1.183063063063063,
"grad_norm": 23.824594497680664,
"learning_rate": 0.0002939393939393939,
"loss": 7.8032,
"step": 820
},
{
"epoch": 1.1974774774774775,
"grad_norm": 17.577869415283203,
"learning_rate": 0.00029345839345839344,
"loss": 8.3441,
"step": 830
},
{
"epoch": 1.211891891891892,
"grad_norm": 17.508779525756836,
"learning_rate": 0.00029297739297739296,
"loss": 8.1213,
"step": 840
},
{
"epoch": 1.2263063063063062,
"grad_norm": 16.90478515625,
"learning_rate": 0.0002924963924963925,
"loss": 7.6077,
"step": 850
},
{
"epoch": 1.2407207207207207,
"grad_norm": 20.760663986206055,
"learning_rate": 0.000292015392015392,
"loss": 7.8654,
"step": 860
},
{
"epoch": 1.2551351351351352,
"grad_norm": 20.966073989868164,
"learning_rate": 0.00029153439153439153,
"loss": 7.7627,
"step": 870
},
{
"epoch": 1.2695495495495495,
"grad_norm": 18.766395568847656,
"learning_rate": 0.000291053391053391,
"loss": 7.0404,
"step": 880
},
{
"epoch": 1.283963963963964,
"grad_norm": 20.34043312072754,
"learning_rate": 0.0002905723905723906,
"loss": 8.2117,
"step": 890
},
{
"epoch": 1.2983783783783784,
"grad_norm": 22.05991554260254,
"learning_rate": 0.00029009139009139004,
"loss": 7.5249,
"step": 900
},
{
"epoch": 1.3127927927927927,
"grad_norm": 18.58563232421875,
"learning_rate": 0.00028961038961038956,
"loss": 7.9662,
"step": 910
},
{
"epoch": 1.3272072072072072,
"grad_norm": 18.942352294921875,
"learning_rate": 0.0002891293891293891,
"loss": 7.7609,
"step": 920
},
{
"epoch": 1.3416216216216217,
"grad_norm": 23.675949096679688,
"learning_rate": 0.0002886483886483886,
"loss": 7.4968,
"step": 930
},
{
"epoch": 1.356036036036036,
"grad_norm": 22.53910255432129,
"learning_rate": 0.00028816738816738813,
"loss": 7.9113,
"step": 940
},
{
"epoch": 1.3704504504504504,
"grad_norm": 21.479690551757812,
"learning_rate": 0.00028768638768638766,
"loss": 6.8956,
"step": 950
},
{
"epoch": 1.384864864864865,
"grad_norm": 20.469209671020508,
"learning_rate": 0.0002872053872053872,
"loss": 7.2737,
"step": 960
},
{
"epoch": 1.3992792792792792,
"grad_norm": 17.538774490356445,
"learning_rate": 0.0002867243867243867,
"loss": 7.2458,
"step": 970
},
{
"epoch": 1.4136936936936937,
"grad_norm": 22.793577194213867,
"learning_rate": 0.0002862433862433862,
"loss": 7.2339,
"step": 980
},
{
"epoch": 1.4281081081081082,
"grad_norm": 18.235897064208984,
"learning_rate": 0.00028576238576238575,
"loss": 7.6416,
"step": 990
},
{
"epoch": 1.4425225225225224,
"grad_norm": 24.108549118041992,
"learning_rate": 0.00028528138528138527,
"loss": 7.5449,
"step": 1000
},
{
"epoch": 1.456936936936937,
"grad_norm": 23.248693466186523,
"learning_rate": 0.0002848003848003848,
"loss": 7.0878,
"step": 1010
},
{
"epoch": 1.4713513513513514,
"grad_norm": 20.034454345703125,
"learning_rate": 0.00028431938431938426,
"loss": 7.426,
"step": 1020
},
{
"epoch": 1.4857657657657657,
"grad_norm": 22.129047393798828,
"learning_rate": 0.00028383838383838384,
"loss": 6.9635,
"step": 1030
},
{
"epoch": 1.5001801801801802,
"grad_norm": 20.906335830688477,
"learning_rate": 0.0002833573833573833,
"loss": 7.1704,
"step": 1040
},
{
"epoch": 1.5145945945945947,
"grad_norm": 22.88907814025879,
"learning_rate": 0.0002828763828763829,
"loss": 7.1875,
"step": 1050
},
{
"epoch": 1.529009009009009,
"grad_norm": 23.162479400634766,
"learning_rate": 0.00028239538239538235,
"loss": 7.665,
"step": 1060
},
{
"epoch": 1.5434234234234234,
"grad_norm": 22.069990158081055,
"learning_rate": 0.00028191438191438187,
"loss": 7.0347,
"step": 1070
},
{
"epoch": 1.557837837837838,
"grad_norm": 21.646320343017578,
"learning_rate": 0.0002814333814333814,
"loss": 7.4735,
"step": 1080
},
{
"epoch": 1.5722522522522522,
"grad_norm": 22.21576499938965,
"learning_rate": 0.0002809523809523809,
"loss": 7.3836,
"step": 1090
},
{
"epoch": 1.5866666666666667,
"grad_norm": 17.76190757751465,
"learning_rate": 0.00028047138047138044,
"loss": 7.2981,
"step": 1100
},
{
"epoch": 1.6010810810810812,
"grad_norm": 15.208210945129395,
"learning_rate": 0.00027999037999037996,
"loss": 6.1374,
"step": 1110
},
{
"epoch": 1.6154954954954954,
"grad_norm": 24.096397399902344,
"learning_rate": 0.0002795093795093795,
"loss": 6.3449,
"step": 1120
},
{
"epoch": 1.62990990990991,
"grad_norm": 23.264659881591797,
"learning_rate": 0.000279028379028379,
"loss": 6.9955,
"step": 1130
},
{
"epoch": 1.6443243243243244,
"grad_norm": 23.365312576293945,
"learning_rate": 0.00027854737854737853,
"loss": 6.7135,
"step": 1140
},
{
"epoch": 1.6587387387387387,
"grad_norm": 18.671892166137695,
"learning_rate": 0.00027806637806637805,
"loss": 6.3113,
"step": 1150
},
{
"epoch": 1.6731531531531532,
"grad_norm": 22.89389991760254,
"learning_rate": 0.0002775853775853776,
"loss": 6.6979,
"step": 1160
},
{
"epoch": 1.6875675675675677,
"grad_norm": 22.493839263916016,
"learning_rate": 0.0002771043771043771,
"loss": 5.7641,
"step": 1170
},
{
"epoch": 1.701981981981982,
"grad_norm": 24.027435302734375,
"learning_rate": 0.00027662337662337657,
"loss": 7.2983,
"step": 1180
},
{
"epoch": 1.7163963963963964,
"grad_norm": 19.027225494384766,
"learning_rate": 0.00027614237614237614,
"loss": 6.2111,
"step": 1190
},
{
"epoch": 1.730810810810811,
"grad_norm": 27.56620979309082,
"learning_rate": 0.0002756613756613756,
"loss": 6.7366,
"step": 1200
},
{
"epoch": 1.7452252252252252,
"grad_norm": 16.027616500854492,
"learning_rate": 0.00027518037518037513,
"loss": 6.1943,
"step": 1210
},
{
"epoch": 1.7596396396396397,
"grad_norm": 20.16025161743164,
"learning_rate": 0.0002746993746993747,
"loss": 6.3816,
"step": 1220
},
{
"epoch": 1.7740540540540541,
"grad_norm": 13.574505805969238,
"learning_rate": 0.0002742183742183742,
"loss": 5.9191,
"step": 1230
},
{
"epoch": 1.7884684684684684,
"grad_norm": 19.855785369873047,
"learning_rate": 0.0002737373737373737,
"loss": 6.3663,
"step": 1240
},
{
"epoch": 1.802882882882883,
"grad_norm": 20.211448669433594,
"learning_rate": 0.0002732563732563732,
"loss": 6.4382,
"step": 1250
},
{
"epoch": 1.8172972972972974,
"grad_norm": 21.60570526123047,
"learning_rate": 0.00027277537277537275,
"loss": 6.7056,
"step": 1260
},
{
"epoch": 1.8317117117117117,
"grad_norm": 20.5418758392334,
"learning_rate": 0.00027229437229437227,
"loss": 5.5842,
"step": 1270
},
{
"epoch": 1.8461261261261261,
"grad_norm": 27.491355895996094,
"learning_rate": 0.0002718133718133718,
"loss": 5.9011,
"step": 1280
},
{
"epoch": 1.8605405405405406,
"grad_norm": 23.979827880859375,
"learning_rate": 0.0002713323713323713,
"loss": 6.084,
"step": 1290
},
{
"epoch": 1.874954954954955,
"grad_norm": 18.55582618713379,
"learning_rate": 0.00027085137085137084,
"loss": 6.0097,
"step": 1300
},
{
"epoch": 1.8893693693693694,
"grad_norm": 19.917762756347656,
"learning_rate": 0.00027037037037037036,
"loss": 5.7525,
"step": 1310
},
{
"epoch": 1.9037837837837839,
"grad_norm": 17.546810150146484,
"learning_rate": 0.00026988936988936983,
"loss": 6.3093,
"step": 1320
},
{
"epoch": 1.9181981981981981,
"grad_norm": 26.043676376342773,
"learning_rate": 0.0002694083694083694,
"loss": 5.9062,
"step": 1330
},
{
"epoch": 1.9326126126126126,
"grad_norm": 22.03000831604004,
"learning_rate": 0.00026892736892736893,
"loss": 6.4594,
"step": 1340
},
{
"epoch": 1.9470270270270271,
"grad_norm": 23.965402603149414,
"learning_rate": 0.0002684463684463684,
"loss": 6.3053,
"step": 1350
},
{
"epoch": 1.9614414414414414,
"grad_norm": 21.040790557861328,
"learning_rate": 0.000267965367965368,
"loss": 5.4142,
"step": 1360
},
{
"epoch": 1.9758558558558559,
"grad_norm": 22.65288543701172,
"learning_rate": 0.00026748436748436744,
"loss": 6.5429,
"step": 1370
},
{
"epoch": 1.9902702702702704,
"grad_norm": 19.748960494995117,
"learning_rate": 0.00026700336700336696,
"loss": 6.1707,
"step": 1380
},
{
"epoch": 1.998918918918919,
"eval_accuracy": 0.468070652173913,
"eval_loss": 1.8478443622589111,
"eval_runtime": 536.954,
"eval_samples_per_second": 10.966,
"eval_steps_per_second": 10.966,
"step": 1386
},
{
"epoch": 2.0057657657657657,
"grad_norm": 20.129833221435547,
"learning_rate": 0.0002665223665223665,
"loss": 5.5637,
"step": 1390
},
{
"epoch": 2.02018018018018,
"grad_norm": 18.542203903198242,
"learning_rate": 0.000266041366041366,
"loss": 4.8547,
"step": 1400
},
{
"epoch": 2.0345945945945947,
"grad_norm": 16.80269432067871,
"learning_rate": 0.00026556036556036553,
"loss": 4.9395,
"step": 1410
},
{
"epoch": 2.049009009009009,
"grad_norm": 24.43153953552246,
"learning_rate": 0.00026507936507936506,
"loss": 4.8408,
"step": 1420
},
{
"epoch": 2.063423423423423,
"grad_norm": 20.406522750854492,
"learning_rate": 0.0002645983645983646,
"loss": 4.3663,
"step": 1430
},
{
"epoch": 2.077837837837838,
"grad_norm": 17.540870666503906,
"learning_rate": 0.0002641173641173641,
"loss": 3.6172,
"step": 1440
},
{
"epoch": 2.092252252252252,
"grad_norm": 22.39369773864746,
"learning_rate": 0.0002636363636363636,
"loss": 4.5143,
"step": 1450
},
{
"epoch": 2.1066666666666665,
"grad_norm": 24.582853317260742,
"learning_rate": 0.00026315536315536315,
"loss": 4.4835,
"step": 1460
},
{
"epoch": 2.121081081081081,
"grad_norm": 22.656949996948242,
"learning_rate": 0.00026267436267436267,
"loss": 4.4713,
"step": 1470
},
{
"epoch": 2.1354954954954954,
"grad_norm": 22.375396728515625,
"learning_rate": 0.0002621933621933622,
"loss": 4.4695,
"step": 1480
},
{
"epoch": 2.1499099099099097,
"grad_norm": 17.02708625793457,
"learning_rate": 0.00026171236171236166,
"loss": 3.8927,
"step": 1490
},
{
"epoch": 2.1643243243243244,
"grad_norm": 19.711584091186523,
"learning_rate": 0.00026123136123136124,
"loss": 3.9472,
"step": 1500
},
{
"epoch": 2.1787387387387387,
"grad_norm": 18.87154197692871,
"learning_rate": 0.0002607503607503607,
"loss": 4.8518,
"step": 1510
},
{
"epoch": 2.193153153153153,
"grad_norm": 25.693981170654297,
"learning_rate": 0.0002602693602693603,
"loss": 4.6599,
"step": 1520
},
{
"epoch": 2.2075675675675677,
"grad_norm": 15.880191802978516,
"learning_rate": 0.00025978835978835975,
"loss": 4.1435,
"step": 1530
},
{
"epoch": 2.221981981981982,
"grad_norm": 20.515146255493164,
"learning_rate": 0.00025930735930735927,
"loss": 4.1378,
"step": 1540
},
{
"epoch": 2.236396396396396,
"grad_norm": 23.654556274414062,
"learning_rate": 0.0002588263588263588,
"loss": 4.4749,
"step": 1550
},
{
"epoch": 2.250810810810811,
"grad_norm": 25.85966682434082,
"learning_rate": 0.0002583453583453583,
"loss": 4.2029,
"step": 1560
},
{
"epoch": 2.265225225225225,
"grad_norm": 21.542530059814453,
"learning_rate": 0.00025786435786435784,
"loss": 4.6039,
"step": 1570
},
{
"epoch": 2.2796396396396394,
"grad_norm": 19.57372283935547,
"learning_rate": 0.00025738335738335736,
"loss": 4.0779,
"step": 1580
},
{
"epoch": 2.294054054054054,
"grad_norm": 20.794376373291016,
"learning_rate": 0.0002569023569023569,
"loss": 4.7794,
"step": 1590
},
{
"epoch": 2.3084684684684684,
"grad_norm": 23.753938674926758,
"learning_rate": 0.0002564213564213564,
"loss": 4.8506,
"step": 1600
},
{
"epoch": 2.3228828828828827,
"grad_norm": 19.38469123840332,
"learning_rate": 0.00025594035594035593,
"loss": 3.4325,
"step": 1610
},
{
"epoch": 2.3372972972972974,
"grad_norm": 21.55483627319336,
"learning_rate": 0.00025545935545935545,
"loss": 4.151,
"step": 1620
},
{
"epoch": 2.3517117117117117,
"grad_norm": 24.347623825073242,
"learning_rate": 0.000254978354978355,
"loss": 4.3691,
"step": 1630
},
{
"epoch": 2.366126126126126,
"grad_norm": 22.3781795501709,
"learning_rate": 0.0002544973544973545,
"loss": 4.5897,
"step": 1640
},
{
"epoch": 2.3805405405405407,
"grad_norm": 23.88686180114746,
"learning_rate": 0.00025401635401635397,
"loss": 4.1445,
"step": 1650
},
{
"epoch": 2.394954954954955,
"grad_norm": 22.73502540588379,
"learning_rate": 0.00025353535353535354,
"loss": 4.7023,
"step": 1660
},
{
"epoch": 2.409369369369369,
"grad_norm": 28.19312286376953,
"learning_rate": 0.000253054353054353,
"loss": 4.1145,
"step": 1670
},
{
"epoch": 2.423783783783784,
"grad_norm": 18.269119262695312,
"learning_rate": 0.00025257335257335253,
"loss": 4.2782,
"step": 1680
},
{
"epoch": 2.438198198198198,
"grad_norm": 23.031797409057617,
"learning_rate": 0.00025209235209235206,
"loss": 4.1351,
"step": 1690
},
{
"epoch": 2.4526126126126124,
"grad_norm": 29.572736740112305,
"learning_rate": 0.0002516113516113516,
"loss": 3.9022,
"step": 1700
},
{
"epoch": 2.467027027027027,
"grad_norm": 27.48060417175293,
"learning_rate": 0.0002511303511303511,
"loss": 4.2383,
"step": 1710
},
{
"epoch": 2.4814414414414414,
"grad_norm": 20.07984733581543,
"learning_rate": 0.0002506493506493506,
"loss": 4.8254,
"step": 1720
},
{
"epoch": 2.4958558558558557,
"grad_norm": 15.536605834960938,
"learning_rate": 0.00025016835016835015,
"loss": 4.4781,
"step": 1730
},
{
"epoch": 2.5102702702702704,
"grad_norm": 24.318782806396484,
"learning_rate": 0.00024968734968734967,
"loss": 3.9879,
"step": 1740
},
{
"epoch": 2.5246846846846847,
"grad_norm": 16.27837562561035,
"learning_rate": 0.0002492063492063492,
"loss": 3.9869,
"step": 1750
},
{
"epoch": 2.539099099099099,
"grad_norm": 17.794788360595703,
"learning_rate": 0.0002487253487253487,
"loss": 3.9309,
"step": 1760
},
{
"epoch": 2.5535135135135136,
"grad_norm": 21.39970588684082,
"learning_rate": 0.00024824434824434824,
"loss": 4.3936,
"step": 1770
},
{
"epoch": 2.567927927927928,
"grad_norm": 22.3472957611084,
"learning_rate": 0.00024776334776334776,
"loss": 4.5431,
"step": 1780
},
{
"epoch": 2.5823423423423426,
"grad_norm": 22.283802032470703,
"learning_rate": 0.00024728234728234723,
"loss": 3.7322,
"step": 1790
},
{
"epoch": 2.596756756756757,
"grad_norm": 20.59347152709961,
"learning_rate": 0.0002468013468013468,
"loss": 4.7168,
"step": 1800
},
{
"epoch": 2.611171171171171,
"grad_norm": 21.301950454711914,
"learning_rate": 0.0002463203463203463,
"loss": 4.2457,
"step": 1810
},
{
"epoch": 2.6255855855855854,
"grad_norm": 24.100994110107422,
"learning_rate": 0.0002458393458393458,
"loss": 4.0849,
"step": 1820
},
{
"epoch": 2.64,
"grad_norm": 20.029577255249023,
"learning_rate": 0.0002453583453583453,
"loss": 3.956,
"step": 1830
},
{
"epoch": 2.6544144144144144,
"grad_norm": 18.682430267333984,
"learning_rate": 0.00024487734487734484,
"loss": 4.0165,
"step": 1840
},
{
"epoch": 2.668828828828829,
"grad_norm": 24.04487419128418,
"learning_rate": 0.00024439634439634437,
"loss": 4.0105,
"step": 1850
},
{
"epoch": 2.6832432432432434,
"grad_norm": 21.22220802307129,
"learning_rate": 0.0002439153439153439,
"loss": 3.997,
"step": 1860
},
{
"epoch": 2.6976576576576576,
"grad_norm": 19.668106079101562,
"learning_rate": 0.0002434343434343434,
"loss": 4.0831,
"step": 1870
},
{
"epoch": 2.712072072072072,
"grad_norm": 30.692045211791992,
"learning_rate": 0.00024295334295334293,
"loss": 4.0591,
"step": 1880
},
{
"epoch": 2.7264864864864866,
"grad_norm": 22.906898498535156,
"learning_rate": 0.00024247234247234246,
"loss": 4.5457,
"step": 1890
},
{
"epoch": 2.740900900900901,
"grad_norm": 22.690523147583008,
"learning_rate": 0.00024199134199134195,
"loss": 3.8756,
"step": 1900
},
{
"epoch": 2.755315315315315,
"grad_norm": 21.029132843017578,
"learning_rate": 0.0002415103415103415,
"loss": 4.011,
"step": 1910
},
{
"epoch": 2.76972972972973,
"grad_norm": 21.587825775146484,
"learning_rate": 0.000241029341029341,
"loss": 3.7924,
"step": 1920
},
{
"epoch": 2.784144144144144,
"grad_norm": 22.353364944458008,
"learning_rate": 0.00024054834054834052,
"loss": 4.3143,
"step": 1930
},
{
"epoch": 2.7985585585585584,
"grad_norm": 21.176376342773438,
"learning_rate": 0.00024006734006734004,
"loss": 4.6675,
"step": 1940
},
{
"epoch": 2.812972972972973,
"grad_norm": 18.859739303588867,
"learning_rate": 0.00023958633958633956,
"loss": 4.0779,
"step": 1950
},
{
"epoch": 2.8273873873873874,
"grad_norm": 18.34664535522461,
"learning_rate": 0.0002391053391053391,
"loss": 4.2849,
"step": 1960
},
{
"epoch": 2.8418018018018016,
"grad_norm": 22.619640350341797,
"learning_rate": 0.0002386243386243386,
"loss": 3.9383,
"step": 1970
},
{
"epoch": 2.8562162162162164,
"grad_norm": 22.183664321899414,
"learning_rate": 0.0002381433381433381,
"loss": 3.7888,
"step": 1980
},
{
"epoch": 2.8706306306306306,
"grad_norm": 26.002941131591797,
"learning_rate": 0.00023766233766233765,
"loss": 3.912,
"step": 1990
},
{
"epoch": 2.885045045045045,
"grad_norm": 27.130271911621094,
"learning_rate": 0.00023718133718133715,
"loss": 3.9044,
"step": 2000
},
{
"epoch": 2.8994594594594596,
"grad_norm": 21.608003616333008,
"learning_rate": 0.00023670033670033667,
"loss": 4.2128,
"step": 2010
},
{
"epoch": 2.913873873873874,
"grad_norm": 19.621829986572266,
"learning_rate": 0.0002362193362193362,
"loss": 3.8509,
"step": 2020
},
{
"epoch": 2.928288288288288,
"grad_norm": 23.38471031188965,
"learning_rate": 0.00023573833573833572,
"loss": 4.067,
"step": 2030
},
{
"epoch": 2.942702702702703,
"grad_norm": 13.28516674041748,
"learning_rate": 0.0002352573352573352,
"loss": 4.186,
"step": 2040
},
{
"epoch": 2.957117117117117,
"grad_norm": 18.91407585144043,
"learning_rate": 0.00023477633477633476,
"loss": 3.7117,
"step": 2050
},
{
"epoch": 2.9715315315315314,
"grad_norm": 18.93157196044922,
"learning_rate": 0.00023429533429533426,
"loss": 3.8855,
"step": 2060
},
{
"epoch": 2.985945945945946,
"grad_norm": 20.980789184570312,
"learning_rate": 0.0002338143338143338,
"loss": 3.7871,
"step": 2070
},
{
"epoch": 2.998918918918919,
"eval_accuracy": 0.5473845108695652,
"eval_loss": 1.6941322088241577,
"eval_runtime": 536.9387,
"eval_samples_per_second": 10.966,
"eval_steps_per_second": 10.966,
"step": 2079
},
{
"epoch": 3.0014414414414414,
"grad_norm": 28.662826538085938,
"learning_rate": 0.0002333333333333333,
"loss": 4.0376,
"step": 2080
},
{
"epoch": 3.0158558558558557,
"grad_norm": 13.298629760742188,
"learning_rate": 0.00023285233285233283,
"loss": 2.4392,
"step": 2090
},
{
"epoch": 3.0302702702702704,
"grad_norm": 20.722625732421875,
"learning_rate": 0.00023237133237133238,
"loss": 2.5711,
"step": 2100
},
{
"epoch": 3.0446846846846847,
"grad_norm": 18.076677322387695,
"learning_rate": 0.00023189033189033187,
"loss": 2.4815,
"step": 2110
},
{
"epoch": 3.059099099099099,
"grad_norm": 23.47679328918457,
"learning_rate": 0.00023140933140933137,
"loss": 2.4175,
"step": 2120
},
{
"epoch": 3.0735135135135137,
"grad_norm": 25.233163833618164,
"learning_rate": 0.00023092833092833092,
"loss": 2.6018,
"step": 2130
},
{
"epoch": 3.087927927927928,
"grad_norm": 23.916234970092773,
"learning_rate": 0.0002304473304473304,
"loss": 2.9529,
"step": 2140
},
{
"epoch": 3.102342342342342,
"grad_norm": 20.37197494506836,
"learning_rate": 0.00022996632996632994,
"loss": 2.2146,
"step": 2150
},
{
"epoch": 3.116756756756757,
"grad_norm": 20.04782485961914,
"learning_rate": 0.00022948532948532948,
"loss": 2.1764,
"step": 2160
},
{
"epoch": 3.131171171171171,
"grad_norm": 24.065858840942383,
"learning_rate": 0.00022900432900432898,
"loss": 2.7395,
"step": 2170
},
{
"epoch": 3.1455855855855854,
"grad_norm": 20.15619468688965,
"learning_rate": 0.00022852332852332853,
"loss": 2.6955,
"step": 2180
},
{
"epoch": 3.16,
"grad_norm": 15.333986282348633,
"learning_rate": 0.00022804232804232803,
"loss": 2.378,
"step": 2190
},
{
"epoch": 3.1744144144144144,
"grad_norm": 17.780742645263672,
"learning_rate": 0.00022756132756132752,
"loss": 2.4017,
"step": 2200
},
{
"epoch": 3.1888288288288287,
"grad_norm": 22.119949340820312,
"learning_rate": 0.00022708032708032707,
"loss": 2.3123,
"step": 2210
},
{
"epoch": 3.2032432432432434,
"grad_norm": 22.979034423828125,
"learning_rate": 0.0002265993265993266,
"loss": 1.877,
"step": 2220
},
{
"epoch": 3.2176576576576577,
"grad_norm": 21.25425910949707,
"learning_rate": 0.0002261183261183261,
"loss": 2.3021,
"step": 2230
},
{
"epoch": 3.232072072072072,
"grad_norm": 20.077585220336914,
"learning_rate": 0.00022563732563732564,
"loss": 2.5026,
"step": 2240
},
{
"epoch": 3.2464864864864866,
"grad_norm": 21.955101013183594,
"learning_rate": 0.00022515632515632513,
"loss": 2.4518,
"step": 2250
},
{
"epoch": 3.260900900900901,
"grad_norm": 23.3514347076416,
"learning_rate": 0.00022467532467532463,
"loss": 2.4694,
"step": 2260
},
{
"epoch": 3.275315315315315,
"grad_norm": 11.233248710632324,
"learning_rate": 0.00022419432419432418,
"loss": 2.2057,
"step": 2270
},
{
"epoch": 3.28972972972973,
"grad_norm": 20.17824363708496,
"learning_rate": 0.0002237133237133237,
"loss": 2.3982,
"step": 2280
},
{
"epoch": 3.304144144144144,
"grad_norm": 20.694353103637695,
"learning_rate": 0.00022323232323232322,
"loss": 3.0053,
"step": 2290
},
{
"epoch": 3.3185585585585584,
"grad_norm": 24.36587142944336,
"learning_rate": 0.00022275132275132275,
"loss": 2.3132,
"step": 2300
},
{
"epoch": 3.332972972972973,
"grad_norm": 18.3751277923584,
"learning_rate": 0.00022227032227032224,
"loss": 2.2867,
"step": 2310
},
{
"epoch": 3.3473873873873874,
"grad_norm": 19.790868759155273,
"learning_rate": 0.0002217893217893218,
"loss": 2.7789,
"step": 2320
},
{
"epoch": 3.3618018018018017,
"grad_norm": 24.86772346496582,
"learning_rate": 0.0002213083213083213,
"loss": 3.0161,
"step": 2330
},
{
"epoch": 3.3762162162162164,
"grad_norm": 21.827804565429688,
"learning_rate": 0.0002208273208273208,
"loss": 2.546,
"step": 2340
},
{
"epoch": 3.3906306306306306,
"grad_norm": 19.654054641723633,
"learning_rate": 0.00022034632034632033,
"loss": 2.6371,
"step": 2350
},
{
"epoch": 3.405045045045045,
"grad_norm": 21.734804153442383,
"learning_rate": 0.00021986531986531986,
"loss": 2.4253,
"step": 2360
},
{
"epoch": 3.4194594594594596,
"grad_norm": 27.88010597229004,
"learning_rate": 0.00021938431938431935,
"loss": 2.2937,
"step": 2370
},
{
"epoch": 3.433873873873874,
"grad_norm": 22.679140090942383,
"learning_rate": 0.0002189033189033189,
"loss": 2.6596,
"step": 2380
},
{
"epoch": 3.448288288288288,
"grad_norm": 21.52387809753418,
"learning_rate": 0.0002184223184223184,
"loss": 2.0818,
"step": 2390
},
{
"epoch": 3.462702702702703,
"grad_norm": 20.006406784057617,
"learning_rate": 0.00021794131794131792,
"loss": 2.8108,
"step": 2400
},
{
"epoch": 3.477117117117117,
"grad_norm": 19.29098892211914,
"learning_rate": 0.00021746031746031744,
"loss": 2.3845,
"step": 2410
},
{
"epoch": 3.4915315315315314,
"grad_norm": 16.946989059448242,
"learning_rate": 0.00021697931697931696,
"loss": 2.5469,
"step": 2420
},
{
"epoch": 3.505945945945946,
"grad_norm": 25.288267135620117,
"learning_rate": 0.0002164983164983165,
"loss": 2.5397,
"step": 2430
},
{
"epoch": 3.5203603603603604,
"grad_norm": 25.8332462310791,
"learning_rate": 0.000216017316017316,
"loss": 2.1714,
"step": 2440
},
{
"epoch": 3.5347747747747746,
"grad_norm": 19.762386322021484,
"learning_rate": 0.0002155363155363155,
"loss": 3.3805,
"step": 2450
},
{
"epoch": 3.5491891891891894,
"grad_norm": 20.7349796295166,
"learning_rate": 0.00021505531505531505,
"loss": 2.7777,
"step": 2460
},
{
"epoch": 3.5636036036036036,
"grad_norm": 22.35674285888672,
"learning_rate": 0.00021457431457431455,
"loss": 2.1907,
"step": 2470
},
{
"epoch": 3.578018018018018,
"grad_norm": 21.76331901550293,
"learning_rate": 0.00021409331409331407,
"loss": 2.7713,
"step": 2480
},
{
"epoch": 3.5924324324324326,
"grad_norm": 20.995986938476562,
"learning_rate": 0.0002136123136123136,
"loss": 2.6262,
"step": 2490
},
{
"epoch": 3.606846846846847,
"grad_norm": 23.074106216430664,
"learning_rate": 0.00021313131313131312,
"loss": 2.0651,
"step": 2500
},
{
"epoch": 3.621261261261261,
"grad_norm": 23.654848098754883,
"learning_rate": 0.00021265031265031261,
"loss": 2.718,
"step": 2510
},
{
"epoch": 3.6356756756756754,
"grad_norm": 25.261152267456055,
"learning_rate": 0.00021216931216931216,
"loss": 2.6679,
"step": 2520
},
{
"epoch": 3.65009009009009,
"grad_norm": 21.01721954345703,
"learning_rate": 0.00021168831168831166,
"loss": 2.8435,
"step": 2530
},
{
"epoch": 3.6645045045045044,
"grad_norm": 22.361772537231445,
"learning_rate": 0.0002112073112073112,
"loss": 2.7907,
"step": 2540
},
{
"epoch": 3.678918918918919,
"grad_norm": 25.23889923095703,
"learning_rate": 0.0002107263107263107,
"loss": 2.8608,
"step": 2550
},
{
"epoch": 3.6933333333333334,
"grad_norm": 21.43499183654785,
"learning_rate": 0.00021024531024531023,
"loss": 2.3714,
"step": 2560
},
{
"epoch": 3.7077477477477476,
"grad_norm": 20.24538230895996,
"learning_rate": 0.00020976430976430975,
"loss": 2.4759,
"step": 2570
},
{
"epoch": 3.722162162162162,
"grad_norm": 22.164335250854492,
"learning_rate": 0.00020928330928330927,
"loss": 2.8105,
"step": 2580
},
{
"epoch": 3.7365765765765766,
"grad_norm": 25.067033767700195,
"learning_rate": 0.00020880230880230877,
"loss": 2.3837,
"step": 2590
},
{
"epoch": 3.750990990990991,
"grad_norm": 27.547651290893555,
"learning_rate": 0.00020832130832130832,
"loss": 2.4441,
"step": 2600
},
{
"epoch": 3.7654054054054056,
"grad_norm": 19.971914291381836,
"learning_rate": 0.0002078403078403078,
"loss": 2.4194,
"step": 2610
},
{
"epoch": 3.77981981981982,
"grad_norm": 17.411178588867188,
"learning_rate": 0.00020735930735930734,
"loss": 2.3971,
"step": 2620
},
{
"epoch": 3.794234234234234,
"grad_norm": 31.035659790039062,
"learning_rate": 0.00020687830687830686,
"loss": 2.6306,
"step": 2630
},
{
"epoch": 3.8086486486486484,
"grad_norm": 26.793031692504883,
"learning_rate": 0.00020639730639730638,
"loss": 3.0321,
"step": 2640
},
{
"epoch": 3.823063063063063,
"grad_norm": 27.277006149291992,
"learning_rate": 0.0002059163059163059,
"loss": 2.1434,
"step": 2650
},
{
"epoch": 3.8374774774774774,
"grad_norm": 29.178829193115234,
"learning_rate": 0.00020543530543530543,
"loss": 2.7848,
"step": 2660
},
{
"epoch": 3.851891891891892,
"grad_norm": 17.34369659423828,
"learning_rate": 0.00020495430495430492,
"loss": 2.5354,
"step": 2670
},
{
"epoch": 3.8663063063063063,
"grad_norm": 24.41458511352539,
"learning_rate": 0.00020447330447330447,
"loss": 2.4852,
"step": 2680
},
{
"epoch": 3.8807207207207206,
"grad_norm": 27.604721069335938,
"learning_rate": 0.00020399230399230397,
"loss": 2.6835,
"step": 2690
},
{
"epoch": 3.895135135135135,
"grad_norm": 19.998043060302734,
"learning_rate": 0.0002035113035113035,
"loss": 2.2523,
"step": 2700
},
{
"epoch": 3.9095495495495496,
"grad_norm": 26.73026466369629,
"learning_rate": 0.000203030303030303,
"loss": 3.4174,
"step": 2710
},
{
"epoch": 3.923963963963964,
"grad_norm": 27.696605682373047,
"learning_rate": 0.00020254930254930253,
"loss": 2.5488,
"step": 2720
},
{
"epoch": 3.9383783783783786,
"grad_norm": 25.43397331237793,
"learning_rate": 0.00020206830206830203,
"loss": 2.1643,
"step": 2730
},
{
"epoch": 3.952792792792793,
"grad_norm": 18.155502319335938,
"learning_rate": 0.00020158730158730158,
"loss": 2.2196,
"step": 2740
},
{
"epoch": 3.967207207207207,
"grad_norm": 27.430566787719727,
"learning_rate": 0.00020110630110630108,
"loss": 2.2681,
"step": 2750
},
{
"epoch": 3.9816216216216214,
"grad_norm": 17.62324333190918,
"learning_rate": 0.00020062530062530062,
"loss": 2.3872,
"step": 2760
},
{
"epoch": 3.996036036036036,
"grad_norm": 22.322702407836914,
"learning_rate": 0.00020014430014430012,
"loss": 2.7966,
"step": 2770
},
{
"epoch": 3.998918918918919,
"eval_accuracy": 0.5579144021739131,
"eval_loss": 1.8579920530319214,
"eval_runtime": 536.9866,
"eval_samples_per_second": 10.965,
"eval_steps_per_second": 10.965,
"step": 2772
},
{
"epoch": 4.011531531531531,
"grad_norm": 17.038963317871094,
"learning_rate": 0.00019971139971139968,
"loss": 1.7853,
"step": 2780
},
{
"epoch": 4.025945945945946,
"grad_norm": 21.912731170654297,
"learning_rate": 0.00019923039923039923,
"loss": 1.4446,
"step": 2790
},
{
"epoch": 4.04036036036036,
"grad_norm": 8.3090238571167,
"learning_rate": 0.00019874939874939873,
"loss": 1.1382,
"step": 2800
},
{
"epoch": 4.054774774774775,
"grad_norm": 10.985939979553223,
"learning_rate": 0.00019826839826839825,
"loss": 1.4296,
"step": 2810
},
{
"epoch": 4.069189189189189,
"grad_norm": 14.48794174194336,
"learning_rate": 0.00019778739778739777,
"loss": 1.3267,
"step": 2820
},
{
"epoch": 4.083603603603604,
"grad_norm": 7.6786789894104,
"learning_rate": 0.0001973063973063973,
"loss": 1.3823,
"step": 2830
},
{
"epoch": 4.098018018018018,
"grad_norm": 21.3938045501709,
"learning_rate": 0.0001968253968253968,
"loss": 1.636,
"step": 2840
},
{
"epoch": 4.112432432432432,
"grad_norm": 16.059181213378906,
"learning_rate": 0.00019634439634439634,
"loss": 1.4253,
"step": 2850
},
{
"epoch": 4.126846846846846,
"grad_norm": 31.663381576538086,
"learning_rate": 0.00019586339586339583,
"loss": 1.6679,
"step": 2860
},
{
"epoch": 4.141261261261262,
"grad_norm": 28.778202056884766,
"learning_rate": 0.00019538239538239536,
"loss": 1.7084,
"step": 2870
},
{
"epoch": 4.155675675675676,
"grad_norm": 24.17688751220703,
"learning_rate": 0.00019490139490139488,
"loss": 1.503,
"step": 2880
},
{
"epoch": 4.17009009009009,
"grad_norm": 18.74388313293457,
"learning_rate": 0.0001944203944203944,
"loss": 1.4459,
"step": 2890
},
{
"epoch": 4.184504504504504,
"grad_norm": 25.333425521850586,
"learning_rate": 0.00019393939393939395,
"loss": 1.5935,
"step": 2900
},
{
"epoch": 4.198918918918919,
"grad_norm": 19.402793884277344,
"learning_rate": 0.00019345839345839345,
"loss": 1.3032,
"step": 2910
},
{
"epoch": 4.213333333333333,
"grad_norm": 11.908445358276367,
"learning_rate": 0.00019297739297739294,
"loss": 1.4052,
"step": 2920
},
{
"epoch": 4.227747747747748,
"grad_norm": 10.511947631835938,
"learning_rate": 0.0001924963924963925,
"loss": 1.3532,
"step": 2930
},
{
"epoch": 4.242162162162162,
"grad_norm": 18.962549209594727,
"learning_rate": 0.000192015392015392,
"loss": 1.4759,
"step": 2940
},
{
"epoch": 4.256576576576577,
"grad_norm": 29.238679885864258,
"learning_rate": 0.0001915343915343915,
"loss": 1.6444,
"step": 2950
},
{
"epoch": 4.270990990990991,
"grad_norm": 13.944114685058594,
"learning_rate": 0.00019105339105339106,
"loss": 1.5509,
"step": 2960
},
{
"epoch": 4.285405405405405,
"grad_norm": 17.7829532623291,
"learning_rate": 0.00019057239057239056,
"loss": 1.4536,
"step": 2970
},
{
"epoch": 4.299819819819819,
"grad_norm": 13.711050033569336,
"learning_rate": 0.00019009139009139005,
"loss": 1.299,
"step": 2980
},
{
"epoch": 4.314234234234235,
"grad_norm": 24.686168670654297,
"learning_rate": 0.0001896103896103896,
"loss": 1.3826,
"step": 2990
},
{
"epoch": 4.328648648648649,
"grad_norm": 21.13921546936035,
"learning_rate": 0.0001891293891293891,
"loss": 1.7036,
"step": 3000
},
{
"epoch": 4.343063063063063,
"grad_norm": 14.596439361572266,
"learning_rate": 0.00018864838864838862,
"loss": 1.5839,
"step": 3010
},
{
"epoch": 4.357477477477477,
"grad_norm": 22.715736389160156,
"learning_rate": 0.00018816738816738817,
"loss": 1.5686,
"step": 3020
},
{
"epoch": 4.371891891891892,
"grad_norm": 17.39431381225586,
"learning_rate": 0.00018768638768638766,
"loss": 1.5422,
"step": 3030
},
{
"epoch": 4.386306306306306,
"grad_norm": 24.868406295776367,
"learning_rate": 0.0001872053872053872,
"loss": 1.7397,
"step": 3040
},
{
"epoch": 4.400720720720721,
"grad_norm": 26.22691535949707,
"learning_rate": 0.0001867243867243867,
"loss": 1.4283,
"step": 3050
},
{
"epoch": 4.415135135135135,
"grad_norm": 15.568745613098145,
"learning_rate": 0.0001862433862433862,
"loss": 1.2897,
"step": 3060
},
{
"epoch": 4.42954954954955,
"grad_norm": 19.749555587768555,
"learning_rate": 0.00018576238576238575,
"loss": 1.4769,
"step": 3070
},
{
"epoch": 4.443963963963964,
"grad_norm": 29.223718643188477,
"learning_rate": 0.00018528138528138528,
"loss": 1.3324,
"step": 3080
},
{
"epoch": 4.458378378378378,
"grad_norm": 19.438663482666016,
"learning_rate": 0.00018480038480038477,
"loss": 1.568,
"step": 3090
},
{
"epoch": 4.472792792792792,
"grad_norm": 10.73144245147705,
"learning_rate": 0.00018431938431938432,
"loss": 1.1532,
"step": 3100
},
{
"epoch": 4.487207207207208,
"grad_norm": 16.664306640625,
"learning_rate": 0.00018383838383838382,
"loss": 1.4775,
"step": 3110
},
{
"epoch": 4.501621621621622,
"grad_norm": 25.43704605102539,
"learning_rate": 0.0001833573833573833,
"loss": 1.3084,
"step": 3120
},
{
"epoch": 4.516036036036036,
"grad_norm": 22.560327529907227,
"learning_rate": 0.00018287638287638286,
"loss": 1.4541,
"step": 3130
},
{
"epoch": 4.53045045045045,
"grad_norm": 22.581119537353516,
"learning_rate": 0.00018239538239538239,
"loss": 1.4581,
"step": 3140
},
{
"epoch": 4.544864864864865,
"grad_norm": 19.075603485107422,
"learning_rate": 0.0001819143819143819,
"loss": 1.3255,
"step": 3150
},
{
"epoch": 4.559279279279279,
"grad_norm": 15.375678062438965,
"learning_rate": 0.00018143338143338143,
"loss": 1.035,
"step": 3160
},
{
"epoch": 4.573693693693694,
"grad_norm": 30.394746780395508,
"learning_rate": 0.00018095238095238093,
"loss": 1.7147,
"step": 3170
},
{
"epoch": 4.588108108108108,
"grad_norm": 29.191686630249023,
"learning_rate": 0.00018047138047138048,
"loss": 1.3125,
"step": 3180
},
{
"epoch": 4.602522522522523,
"grad_norm": 21.012161254882812,
"learning_rate": 0.00017999037999037997,
"loss": 1.5039,
"step": 3190
},
{
"epoch": 4.616936936936937,
"grad_norm": 17.093364715576172,
"learning_rate": 0.0001795093795093795,
"loss": 1.4667,
"step": 3200
},
{
"epoch": 4.631351351351351,
"grad_norm": 14.385228157043457,
"learning_rate": 0.00017902837902837902,
"loss": 1.2575,
"step": 3210
},
{
"epoch": 4.645765765765765,
"grad_norm": 16.330244064331055,
"learning_rate": 0.00017854737854737854,
"loss": 1.2436,
"step": 3220
},
{
"epoch": 4.6601801801801805,
"grad_norm": 17.112266540527344,
"learning_rate": 0.00017806637806637803,
"loss": 1.5148,
"step": 3230
},
{
"epoch": 4.674594594594595,
"grad_norm": 25.027666091918945,
"learning_rate": 0.00017758537758537758,
"loss": 1.6239,
"step": 3240
},
{
"epoch": 4.689009009009009,
"grad_norm": 11.63669490814209,
"learning_rate": 0.00017710437710437708,
"loss": 1.4982,
"step": 3250
},
{
"epoch": 4.703423423423423,
"grad_norm": 18.43046760559082,
"learning_rate": 0.00017662337662337663,
"loss": 1.4225,
"step": 3260
},
{
"epoch": 4.717837837837838,
"grad_norm": 17.656518936157227,
"learning_rate": 0.00017614237614237613,
"loss": 1.4843,
"step": 3270
},
{
"epoch": 4.732252252252252,
"grad_norm": 17.17339324951172,
"learning_rate": 0.00017566137566137565,
"loss": 1.5321,
"step": 3280
},
{
"epoch": 4.746666666666667,
"grad_norm": 18.681303024291992,
"learning_rate": 0.00017518037518037517,
"loss": 1.6286,
"step": 3290
},
{
"epoch": 4.761081081081081,
"grad_norm": 22.697771072387695,
"learning_rate": 0.0001746993746993747,
"loss": 1.4057,
"step": 3300
},
{
"epoch": 4.775495495495496,
"grad_norm": 16.85506248474121,
"learning_rate": 0.0001742183742183742,
"loss": 1.6464,
"step": 3310
},
{
"epoch": 4.78990990990991,
"grad_norm": 23.760793685913086,
"learning_rate": 0.00017373737373737374,
"loss": 1.4451,
"step": 3320
},
{
"epoch": 4.804324324324324,
"grad_norm": 19.93245506286621,
"learning_rate": 0.00017325637325637323,
"loss": 1.821,
"step": 3330
},
{
"epoch": 4.818738738738738,
"grad_norm": 15.235669136047363,
"learning_rate": 0.00017277537277537276,
"loss": 1.3603,
"step": 3340
},
{
"epoch": 4.8331531531531535,
"grad_norm": 18.125097274780273,
"learning_rate": 0.00017229437229437228,
"loss": 1.2805,
"step": 3350
},
{
"epoch": 4.847567567567568,
"grad_norm": 19.607587814331055,
"learning_rate": 0.0001718133718133718,
"loss": 1.7882,
"step": 3360
},
{
"epoch": 4.861981981981982,
"grad_norm": 30.157733917236328,
"learning_rate": 0.00017133237133237132,
"loss": 1.5676,
"step": 3370
},
{
"epoch": 4.876396396396396,
"grad_norm": 14.961874961853027,
"learning_rate": 0.00017085137085137085,
"loss": 1.2282,
"step": 3380
},
{
"epoch": 4.890810810810811,
"grad_norm": 29.467988967895508,
"learning_rate": 0.00017037037037037034,
"loss": 1.6735,
"step": 3390
},
{
"epoch": 4.905225225225225,
"grad_norm": 22.682449340820312,
"learning_rate": 0.0001698893698893699,
"loss": 1.4523,
"step": 3400
},
{
"epoch": 4.91963963963964,
"grad_norm": 17.40091323852539,
"learning_rate": 0.0001694083694083694,
"loss": 1.1466,
"step": 3410
},
{
"epoch": 4.934054054054054,
"grad_norm": 24.69778823852539,
"learning_rate": 0.0001689273689273689,
"loss": 1.2446,
"step": 3420
},
{
"epoch": 4.9484684684684686,
"grad_norm": 14.909017562866211,
"learning_rate": 0.00016844636844636843,
"loss": 1.5575,
"step": 3430
},
{
"epoch": 4.962882882882883,
"grad_norm": 13.104373931884766,
"learning_rate": 0.00016796536796536796,
"loss": 1.5514,
"step": 3440
},
{
"epoch": 4.977297297297297,
"grad_norm": 24.999370574951172,
"learning_rate": 0.00016748436748436745,
"loss": 1.4959,
"step": 3450
},
{
"epoch": 4.991711711711711,
"grad_norm": 29.072294235229492,
"learning_rate": 0.000167003367003367,
"loss": 1.5871,
"step": 3460
},
{
"epoch": 4.998918918918919,
"eval_accuracy": 0.6139605978260869,
"eval_loss": 1.6662975549697876,
"eval_runtime": 540.9629,
"eval_samples_per_second": 10.884,
"eval_steps_per_second": 10.884,
"step": 3465
},
{
"epoch": 5.007207207207207,
"grad_norm": 12.2052640914917,
"learning_rate": 0.0001665223665223665,
"loss": 0.9848,
"step": 3470
},
{
"epoch": 5.021621621621621,
"grad_norm": 11.040346145629883,
"learning_rate": 0.00016604136604136605,
"loss": 0.7229,
"step": 3480
},
{
"epoch": 5.036036036036036,
"grad_norm": 11.913896560668945,
"learning_rate": 0.00016556036556036554,
"loss": 0.5134,
"step": 3490
},
{
"epoch": 5.050450450450451,
"grad_norm": 16.600475311279297,
"learning_rate": 0.00016507936507936506,
"loss": 0.5581,
"step": 3500
},
{
"epoch": 5.064864864864865,
"grad_norm": 9.584583282470703,
"learning_rate": 0.0001645983645983646,
"loss": 0.7335,
"step": 3510
},
{
"epoch": 5.079279279279279,
"grad_norm": 15.97603702545166,
"learning_rate": 0.0001641173641173641,
"loss": 0.9761,
"step": 3520
},
{
"epoch": 5.093693693693694,
"grad_norm": 21.01009178161621,
"learning_rate": 0.0001636363636363636,
"loss": 0.6637,
"step": 3530
},
{
"epoch": 5.108108108108108,
"grad_norm": 18.944791793823242,
"learning_rate": 0.00016315536315536315,
"loss": 0.8514,
"step": 3540
},
{
"epoch": 5.122522522522522,
"grad_norm": 15.107224464416504,
"learning_rate": 0.00016267436267436265,
"loss": 0.7069,
"step": 3550
},
{
"epoch": 5.136936936936937,
"grad_norm": 20.789289474487305,
"learning_rate": 0.00016219336219336217,
"loss": 0.7369,
"step": 3560
},
{
"epoch": 5.151351351351352,
"grad_norm": 25.02975845336914,
"learning_rate": 0.0001617123617123617,
"loss": 0.85,
"step": 3570
},
{
"epoch": 5.165765765765766,
"grad_norm": 14.045705795288086,
"learning_rate": 0.00016123136123136122,
"loss": 1.0056,
"step": 3580
},
{
"epoch": 5.18018018018018,
"grad_norm": 19.27486801147461,
"learning_rate": 0.00016075036075036074,
"loss": 0.8829,
"step": 3590
},
{
"epoch": 5.194594594594594,
"grad_norm": 16.740869522094727,
"learning_rate": 0.00016026936026936026,
"loss": 0.7436,
"step": 3600
},
{
"epoch": 5.209009009009009,
"grad_norm": 22.02817153930664,
"learning_rate": 0.00015978835978835976,
"loss": 0.8404,
"step": 3610
},
{
"epoch": 5.223423423423424,
"grad_norm": 18.062744140625,
"learning_rate": 0.0001593073593073593,
"loss": 0.9403,
"step": 3620
},
{
"epoch": 5.237837837837838,
"grad_norm": 11.673712730407715,
"learning_rate": 0.0001588263588263588,
"loss": 0.8351,
"step": 3630
},
{
"epoch": 5.252252252252252,
"grad_norm": 13.337545394897461,
"learning_rate": 0.00015834535834535833,
"loss": 0.6274,
"step": 3640
},
{
"epoch": 5.266666666666667,
"grad_norm": 19.310646057128906,
"learning_rate": 0.00015786435786435785,
"loss": 0.969,
"step": 3650
},
{
"epoch": 5.281081081081081,
"grad_norm": 19.875566482543945,
"learning_rate": 0.00015738335738335737,
"loss": 0.6036,
"step": 3660
},
{
"epoch": 5.295495495495495,
"grad_norm": 15.952252388000488,
"learning_rate": 0.00015690235690235687,
"loss": 0.6879,
"step": 3670
},
{
"epoch": 5.30990990990991,
"grad_norm": 17.611326217651367,
"learning_rate": 0.00015642135642135642,
"loss": 0.5589,
"step": 3680
},
{
"epoch": 5.324324324324325,
"grad_norm": 19.946884155273438,
"learning_rate": 0.0001559403559403559,
"loss": 0.7953,
"step": 3690
},
{
"epoch": 5.338738738738739,
"grad_norm": 11.897385597229004,
"learning_rate": 0.00015545935545935546,
"loss": 0.5896,
"step": 3700
},
{
"epoch": 5.353153153153153,
"grad_norm": 15.592938423156738,
"learning_rate": 0.00015497835497835496,
"loss": 1.1955,
"step": 3710
},
{
"epoch": 5.367567567567567,
"grad_norm": 15.585307121276855,
"learning_rate": 0.00015449735449735448,
"loss": 1.0289,
"step": 3720
},
{
"epoch": 5.381981981981982,
"grad_norm": 14.25250244140625,
"learning_rate": 0.000154016354016354,
"loss": 0.5986,
"step": 3730
},
{
"epoch": 5.396396396396397,
"grad_norm": 23.96398162841797,
"learning_rate": 0.00015353535353535353,
"loss": 0.7085,
"step": 3740
},
{
"epoch": 5.410810810810811,
"grad_norm": 23.628772735595703,
"learning_rate": 0.00015305435305435302,
"loss": 0.826,
"step": 3750
},
{
"epoch": 5.425225225225225,
"grad_norm": 17.359643936157227,
"learning_rate": 0.00015257335257335257,
"loss": 0.7858,
"step": 3760
},
{
"epoch": 5.43963963963964,
"grad_norm": 22.010915756225586,
"learning_rate": 0.00015209235209235207,
"loss": 0.7688,
"step": 3770
},
{
"epoch": 5.454054054054054,
"grad_norm": 28.990123748779297,
"learning_rate": 0.0001516113516113516,
"loss": 0.7106,
"step": 3780
},
{
"epoch": 5.468468468468468,
"grad_norm": 11.545175552368164,
"learning_rate": 0.0001511303511303511,
"loss": 0.9866,
"step": 3790
},
{
"epoch": 5.482882882882883,
"grad_norm": 25.446990966796875,
"learning_rate": 0.00015064935064935063,
"loss": 0.9894,
"step": 3800
},
{
"epoch": 5.4972972972972975,
"grad_norm": 28.915557861328125,
"learning_rate": 0.00015016835016835018,
"loss": 0.8584,
"step": 3810
},
{
"epoch": 5.511711711711712,
"grad_norm": 19.692970275878906,
"learning_rate": 0.00014968734968734968,
"loss": 0.6045,
"step": 3820
},
{
"epoch": 5.526126126126126,
"grad_norm": 25.059045791625977,
"learning_rate": 0.00014920634920634917,
"loss": 1.1067,
"step": 3830
},
{
"epoch": 5.54054054054054,
"grad_norm": 13.645286560058594,
"learning_rate": 0.0001487253487253487,
"loss": 0.7451,
"step": 3840
},
{
"epoch": 5.554954954954955,
"grad_norm": 22.43482780456543,
"learning_rate": 0.00014824434824434822,
"loss": 0.8842,
"step": 3850
},
{
"epoch": 5.569369369369369,
"grad_norm": 11.246109008789062,
"learning_rate": 0.00014776334776334774,
"loss": 0.629,
"step": 3860
},
{
"epoch": 5.583783783783784,
"grad_norm": 21.903657913208008,
"learning_rate": 0.00014728234728234727,
"loss": 0.9014,
"step": 3870
},
{
"epoch": 5.598198198198198,
"grad_norm": 9.34262752532959,
"learning_rate": 0.0001468013468013468,
"loss": 0.8017,
"step": 3880
},
{
"epoch": 5.612612612612613,
"grad_norm": 28.314603805541992,
"learning_rate": 0.0001463203463203463,
"loss": 0.8316,
"step": 3890
},
{
"epoch": 5.627027027027027,
"grad_norm": 23.812631607055664,
"learning_rate": 0.00014583934583934583,
"loss": 1.1573,
"step": 3900
},
{
"epoch": 5.641441441441441,
"grad_norm": 19.350114822387695,
"learning_rate": 0.00014535834535834533,
"loss": 0.6841,
"step": 3910
},
{
"epoch": 5.655855855855856,
"grad_norm": 36.78022766113281,
"learning_rate": 0.00014487734487734485,
"loss": 0.8235,
"step": 3920
},
{
"epoch": 5.6702702702702705,
"grad_norm": 14.95051097869873,
"learning_rate": 0.0001443963443963444,
"loss": 0.6835,
"step": 3930
},
{
"epoch": 5.684684684684685,
"grad_norm": 11.998274803161621,
"learning_rate": 0.0001439153439153439,
"loss": 0.9942,
"step": 3940
},
{
"epoch": 5.699099099099099,
"grad_norm": 19.465404510498047,
"learning_rate": 0.00014343434343434342,
"loss": 0.9386,
"step": 3950
},
{
"epoch": 5.713513513513513,
"grad_norm": 15.735244750976562,
"learning_rate": 0.00014295334295334294,
"loss": 0.8174,
"step": 3960
},
{
"epoch": 5.727927927927928,
"grad_norm": 24.03779411315918,
"learning_rate": 0.00014247234247234246,
"loss": 1.0849,
"step": 3970
},
{
"epoch": 5.742342342342342,
"grad_norm": 12.98159408569336,
"learning_rate": 0.00014199134199134196,
"loss": 0.6748,
"step": 3980
},
{
"epoch": 5.756756756756757,
"grad_norm": 13.99123477935791,
"learning_rate": 0.0001415103415103415,
"loss": 0.6744,
"step": 3990
},
{
"epoch": 5.771171171171171,
"grad_norm": 24.469266891479492,
"learning_rate": 0.00014102934102934103,
"loss": 0.6449,
"step": 4000
},
{
"epoch": 5.7855855855855856,
"grad_norm": 28.23906898498535,
"learning_rate": 0.00014054834054834055,
"loss": 0.757,
"step": 4010
},
{
"epoch": 5.8,
"grad_norm": 18.971261978149414,
"learning_rate": 0.00014006734006734005,
"loss": 0.7486,
"step": 4020
},
{
"epoch": 5.814414414414414,
"grad_norm": 19.77442169189453,
"learning_rate": 0.00013958633958633957,
"loss": 0.8439,
"step": 4030
},
{
"epoch": 5.828828828828829,
"grad_norm": 19.546371459960938,
"learning_rate": 0.0001391053391053391,
"loss": 0.8859,
"step": 4040
},
{
"epoch": 5.8432432432432435,
"grad_norm": 12.447526931762695,
"learning_rate": 0.0001386243386243386,
"loss": 0.6841,
"step": 4050
},
{
"epoch": 5.857657657657658,
"grad_norm": 18.02086639404297,
"learning_rate": 0.00013814333814333814,
"loss": 0.8155,
"step": 4060
},
{
"epoch": 5.872072072072072,
"grad_norm": 23.19020652770996,
"learning_rate": 0.00013766233766233766,
"loss": 0.8727,
"step": 4070
},
{
"epoch": 5.886486486486486,
"grad_norm": 9.812922477722168,
"learning_rate": 0.00013718133718133719,
"loss": 0.8107,
"step": 4080
},
{
"epoch": 5.900900900900901,
"grad_norm": 18.993051528930664,
"learning_rate": 0.00013670033670033668,
"loss": 0.6686,
"step": 4090
},
{
"epoch": 5.915315315315315,
"grad_norm": 24.841590881347656,
"learning_rate": 0.0001362193362193362,
"loss": 0.8777,
"step": 4100
},
{
"epoch": 5.92972972972973,
"grad_norm": 12.165318489074707,
"learning_rate": 0.00013573833573833573,
"loss": 0.7149,
"step": 4110
},
{
"epoch": 5.944144144144144,
"grad_norm": 25.776872634887695,
"learning_rate": 0.00013525733525733525,
"loss": 0.9527,
"step": 4120
},
{
"epoch": 5.9585585585585585,
"grad_norm": 15.240096092224121,
"learning_rate": 0.00013477633477633477,
"loss": 0.7363,
"step": 4130
},
{
"epoch": 5.972972972972973,
"grad_norm": 18.949817657470703,
"learning_rate": 0.0001342953342953343,
"loss": 0.8795,
"step": 4140
},
{
"epoch": 5.987387387387387,
"grad_norm": 23.45053482055664,
"learning_rate": 0.00013381433381433382,
"loss": 0.7355,
"step": 4150
},
{
"epoch": 5.998918918918919,
"eval_accuracy": 0.6154891304347826,
"eval_loss": 1.9490801095962524,
"eval_runtime": 540.4624,
"eval_samples_per_second": 10.894,
"eval_steps_per_second": 10.894,
"step": 4158
},
{
"epoch": 6.002882882882883,
"grad_norm": 19.96414566040039,
"learning_rate": 0.0001333333333333333,
"loss": 0.7705,
"step": 4160
},
{
"epoch": 6.017297297297297,
"grad_norm": 12.935175895690918,
"learning_rate": 0.00013285233285233284,
"loss": 0.4507,
"step": 4170
},
{
"epoch": 6.031711711711711,
"grad_norm": 18.57610511779785,
"learning_rate": 0.00013237133237133236,
"loss": 0.4772,
"step": 4180
},
{
"epoch": 6.0461261261261265,
"grad_norm": 18.15093231201172,
"learning_rate": 0.00013189033189033188,
"loss": 0.4697,
"step": 4190
},
{
"epoch": 6.060540540540541,
"grad_norm": 9.7061128616333,
"learning_rate": 0.0001314093314093314,
"loss": 0.3953,
"step": 4200
},
{
"epoch": 6.074954954954955,
"grad_norm": 14.228235244750977,
"learning_rate": 0.00013092833092833093,
"loss": 0.4857,
"step": 4210
},
{
"epoch": 6.089369369369369,
"grad_norm": 12.73335075378418,
"learning_rate": 0.00013044733044733045,
"loss": 0.2774,
"step": 4220
},
{
"epoch": 6.103783783783784,
"grad_norm": 26.926279067993164,
"learning_rate": 0.00012996632996632997,
"loss": 0.4033,
"step": 4230
},
{
"epoch": 6.118198198198198,
"grad_norm": 5.05507755279541,
"learning_rate": 0.00012948532948532947,
"loss": 0.379,
"step": 4240
},
{
"epoch": 6.132612612612613,
"grad_norm": 13.0632905960083,
"learning_rate": 0.000129004329004329,
"loss": 0.5064,
"step": 4250
},
{
"epoch": 6.147027027027027,
"grad_norm": 9.610346794128418,
"learning_rate": 0.0001285233285233285,
"loss": 0.5576,
"step": 4260
},
{
"epoch": 6.161441441441442,
"grad_norm": 9.474533081054688,
"learning_rate": 0.00012804232804232803,
"loss": 0.4405,
"step": 4270
},
{
"epoch": 6.175855855855856,
"grad_norm": 6.424566745758057,
"learning_rate": 0.00012756132756132756,
"loss": 0.4283,
"step": 4280
},
{
"epoch": 6.19027027027027,
"grad_norm": 22.856693267822266,
"learning_rate": 0.00012708032708032708,
"loss": 0.5386,
"step": 4290
},
{
"epoch": 6.204684684684684,
"grad_norm": 14.695728302001953,
"learning_rate": 0.0001265993265993266,
"loss": 0.4684,
"step": 4300
},
{
"epoch": 6.2190990990990995,
"grad_norm": 12.434320449829102,
"learning_rate": 0.0001261183261183261,
"loss": 0.3499,
"step": 4310
},
{
"epoch": 6.233513513513514,
"grad_norm": 3.9371864795684814,
"learning_rate": 0.00012563732563732562,
"loss": 0.4161,
"step": 4320
},
{
"epoch": 6.247927927927928,
"grad_norm": 11.733071327209473,
"learning_rate": 0.00012515632515632514,
"loss": 0.4829,
"step": 4330
},
{
"epoch": 6.262342342342342,
"grad_norm": 5.837855815887451,
"learning_rate": 0.00012467532467532467,
"loss": 0.5473,
"step": 4340
},
{
"epoch": 6.276756756756757,
"grad_norm": 10.520476341247559,
"learning_rate": 0.0001241943241943242,
"loss": 0.432,
"step": 4350
},
{
"epoch": 6.291171171171171,
"grad_norm": 14.354527473449707,
"learning_rate": 0.0001237133237133237,
"loss": 0.3837,
"step": 4360
},
{
"epoch": 6.305585585585586,
"grad_norm": 24.440963745117188,
"learning_rate": 0.00012323232323232323,
"loss": 0.6812,
"step": 4370
},
{
"epoch": 6.32,
"grad_norm": 21.688756942749023,
"learning_rate": 0.00012275132275132273,
"loss": 0.6889,
"step": 4380
},
{
"epoch": 6.3344144144144146,
"grad_norm": 4.70493221282959,
"learning_rate": 0.00012227032227032225,
"loss": 0.4692,
"step": 4390
},
{
"epoch": 6.348828828828829,
"grad_norm": 10.504195213317871,
"learning_rate": 0.00012178932178932179,
"loss": 0.3945,
"step": 4400
},
{
"epoch": 6.363243243243243,
"grad_norm": 12.554998397827148,
"learning_rate": 0.00012130832130832131,
"loss": 0.4145,
"step": 4410
},
{
"epoch": 6.377657657657657,
"grad_norm": 5.851123809814453,
"learning_rate": 0.0001208273208273208,
"loss": 0.3595,
"step": 4420
},
{
"epoch": 6.392072072072072,
"grad_norm": 33.16427993774414,
"learning_rate": 0.00012034632034632034,
"loss": 0.5448,
"step": 4430
},
{
"epoch": 6.406486486486487,
"grad_norm": 17.474634170532227,
"learning_rate": 0.00011986531986531986,
"loss": 0.4775,
"step": 4440
},
{
"epoch": 6.420900900900901,
"grad_norm": 21.54201889038086,
"learning_rate": 0.00011938431938431936,
"loss": 0.4061,
"step": 4450
},
{
"epoch": 6.435315315315315,
"grad_norm": 27.28333854675293,
"learning_rate": 0.00011890331890331888,
"loss": 0.41,
"step": 4460
},
{
"epoch": 6.44972972972973,
"grad_norm": 31.519390106201172,
"learning_rate": 0.00011842231842231842,
"loss": 0.4323,
"step": 4470
},
{
"epoch": 6.464144144144144,
"grad_norm": 18.609390258789062,
"learning_rate": 0.00011794131794131794,
"loss": 0.323,
"step": 4480
},
{
"epoch": 6.478558558558559,
"grad_norm": 16.234210968017578,
"learning_rate": 0.00011746031746031744,
"loss": 0.3677,
"step": 4490
},
{
"epoch": 6.492972972972973,
"grad_norm": 18.266056060791016,
"learning_rate": 0.00011697931697931697,
"loss": 0.4261,
"step": 4500
},
{
"epoch": 6.5073873873873875,
"grad_norm": 13.765610694885254,
"learning_rate": 0.0001164983164983165,
"loss": 0.2749,
"step": 4510
},
{
"epoch": 6.521801801801802,
"grad_norm": 19.466411590576172,
"learning_rate": 0.00011601731601731602,
"loss": 0.5191,
"step": 4520
},
{
"epoch": 6.536216216216216,
"grad_norm": 5.606191635131836,
"learning_rate": 0.00011553631553631553,
"loss": 0.2674,
"step": 4530
},
{
"epoch": 6.55063063063063,
"grad_norm": 21.999649047851562,
"learning_rate": 0.00011505531505531505,
"loss": 0.3778,
"step": 4540
},
{
"epoch": 6.565045045045045,
"grad_norm": 5.735301494598389,
"learning_rate": 0.00011457431457431457,
"loss": 0.5567,
"step": 4550
},
{
"epoch": 6.57945945945946,
"grad_norm": 10.661727905273438,
"learning_rate": 0.00011409331409331408,
"loss": 0.319,
"step": 4560
},
{
"epoch": 6.593873873873874,
"grad_norm": 23.01692771911621,
"learning_rate": 0.0001136123136123136,
"loss": 0.4116,
"step": 4570
},
{
"epoch": 6.608288288288288,
"grad_norm": 11.15292739868164,
"learning_rate": 0.00011313131313131313,
"loss": 0.395,
"step": 4580
},
{
"epoch": 6.622702702702703,
"grad_norm": 15.197105407714844,
"learning_rate": 0.00011265031265031265,
"loss": 0.5435,
"step": 4590
},
{
"epoch": 6.637117117117117,
"grad_norm": 23.04345703125,
"learning_rate": 0.00011216931216931216,
"loss": 0.4702,
"step": 4600
},
{
"epoch": 6.651531531531532,
"grad_norm": 8.85188102722168,
"learning_rate": 0.00011168831168831168,
"loss": 0.3533,
"step": 4610
},
{
"epoch": 6.665945945945946,
"grad_norm": 9.123584747314453,
"learning_rate": 0.0001112073112073112,
"loss": 0.4277,
"step": 4620
},
{
"epoch": 6.6803603603603605,
"grad_norm": 8.331842422485352,
"learning_rate": 0.00011072631072631073,
"loss": 0.5292,
"step": 4630
},
{
"epoch": 6.694774774774775,
"grad_norm": 12.688973426818848,
"learning_rate": 0.00011024531024531024,
"loss": 0.3495,
"step": 4640
},
{
"epoch": 6.709189189189189,
"grad_norm": 22.717866897583008,
"learning_rate": 0.00010976430976430976,
"loss": 0.4317,
"step": 4650
},
{
"epoch": 6.723603603603603,
"grad_norm": 22.28693962097168,
"learning_rate": 0.00010928330928330928,
"loss": 0.5334,
"step": 4660
},
{
"epoch": 6.738018018018018,
"grad_norm": 18.496274948120117,
"learning_rate": 0.00010880230880230879,
"loss": 0.4481,
"step": 4670
},
{
"epoch": 6.752432432432433,
"grad_norm": 22.91065216064453,
"learning_rate": 0.00010832130832130831,
"loss": 0.3546,
"step": 4680
},
{
"epoch": 6.766846846846847,
"grad_norm": 24.638437271118164,
"learning_rate": 0.00010784030784030783,
"loss": 0.6028,
"step": 4690
},
{
"epoch": 6.781261261261261,
"grad_norm": 12.158951759338379,
"learning_rate": 0.00010735930735930736,
"loss": 0.3595,
"step": 4700
},
{
"epoch": 6.7956756756756755,
"grad_norm": 3.462782144546509,
"learning_rate": 0.00010687830687830687,
"loss": 0.3434,
"step": 4710
},
{
"epoch": 6.81009009009009,
"grad_norm": 14.709941864013672,
"learning_rate": 0.00010639730639730639,
"loss": 0.3708,
"step": 4720
},
{
"epoch": 6.824504504504505,
"grad_norm": 2.6258020401000977,
"learning_rate": 0.00010591630591630591,
"loss": 0.3561,
"step": 4730
},
{
"epoch": 6.838918918918919,
"grad_norm": 4.584090709686279,
"learning_rate": 0.00010543530543530543,
"loss": 0.4685,
"step": 4740
},
{
"epoch": 6.8533333333333335,
"grad_norm": 27.684444427490234,
"learning_rate": 0.00010495430495430494,
"loss": 0.2848,
"step": 4750
},
{
"epoch": 6.867747747747748,
"grad_norm": 5.796729564666748,
"learning_rate": 0.00010447330447330447,
"loss": 0.3553,
"step": 4760
},
{
"epoch": 6.882162162162162,
"grad_norm": 4.9681396484375,
"learning_rate": 0.00010399230399230399,
"loss": 0.3048,
"step": 4770
},
{
"epoch": 6.896576576576576,
"grad_norm": 22.89188575744629,
"learning_rate": 0.0001035113035113035,
"loss": 0.6352,
"step": 4780
},
{
"epoch": 6.910990990990991,
"grad_norm": 2.380059003829956,
"learning_rate": 0.00010303030303030302,
"loss": 0.4462,
"step": 4790
},
{
"epoch": 6.925405405405406,
"grad_norm": 13.61782455444336,
"learning_rate": 0.00010254930254930254,
"loss": 0.4329,
"step": 4800
},
{
"epoch": 6.93981981981982,
"grad_norm": 6.834221839904785,
"learning_rate": 0.00010206830206830207,
"loss": 0.2754,
"step": 4810
},
{
"epoch": 6.954234234234234,
"grad_norm": 1.0478729009628296,
"learning_rate": 0.00010158730158730157,
"loss": 0.221,
"step": 4820
},
{
"epoch": 6.9686486486486485,
"grad_norm": 8.622994422912598,
"learning_rate": 0.0001011063011063011,
"loss": 0.2593,
"step": 4830
},
{
"epoch": 6.983063063063063,
"grad_norm": 22.14352035522461,
"learning_rate": 0.00010062530062530062,
"loss": 0.3164,
"step": 4840
},
{
"epoch": 6.997477477477478,
"grad_norm": 8.023240089416504,
"learning_rate": 0.00010014430014430014,
"loss": 0.4492,
"step": 4850
},
{
"epoch": 6.998918918918919,
"eval_accuracy": 0.6379076086956522,
"eval_loss": 2.059363842010498,
"eval_runtime": 537.7178,
"eval_samples_per_second": 10.95,
"eval_steps_per_second": 10.95,
"step": 4851
},
{
"epoch": 7.012972972972973,
"grad_norm": 14.681108474731445,
"learning_rate": 9.966329966329965e-05,
"loss": 0.2425,
"step": 4860
},
{
"epoch": 7.027387387387387,
"grad_norm": 25.905927658081055,
"learning_rate": 9.918229918229917e-05,
"loss": 0.2949,
"step": 4870
},
{
"epoch": 7.041801801801801,
"grad_norm": 2.836951971054077,
"learning_rate": 9.87012987012987e-05,
"loss": 0.1989,
"step": 4880
},
{
"epoch": 7.0562162162162165,
"grad_norm": 1.04839026927948,
"learning_rate": 9.82202982202982e-05,
"loss": 0.1024,
"step": 4890
},
{
"epoch": 7.070630630630631,
"grad_norm": 10.27518367767334,
"learning_rate": 9.773929773929773e-05,
"loss": 0.1522,
"step": 4900
},
{
"epoch": 7.085045045045045,
"grad_norm": 15.933104515075684,
"learning_rate": 9.725829725829725e-05,
"loss": 0.145,
"step": 4910
},
{
"epoch": 7.099459459459459,
"grad_norm": 18.11174201965332,
"learning_rate": 9.677729677729677e-05,
"loss": 0.1838,
"step": 4920
},
{
"epoch": 7.113873873873874,
"grad_norm": 1.1443898677825928,
"learning_rate": 9.629629629629628e-05,
"loss": 0.1418,
"step": 4930
},
{
"epoch": 7.128288288288288,
"grad_norm": 15.602287292480469,
"learning_rate": 9.58152958152958e-05,
"loss": 0.3214,
"step": 4940
},
{
"epoch": 7.142702702702703,
"grad_norm": 16.450904846191406,
"learning_rate": 9.533429533429533e-05,
"loss": 0.1656,
"step": 4950
},
{
"epoch": 7.157117117117117,
"grad_norm": 14.295945167541504,
"learning_rate": 9.485329485329484e-05,
"loss": 0.3092,
"step": 4960
},
{
"epoch": 7.1715315315315316,
"grad_norm": 3.2762200832366943,
"learning_rate": 9.437229437229436e-05,
"loss": 0.0993,
"step": 4970
},
{
"epoch": 7.185945945945946,
"grad_norm": 1.229925274848938,
"learning_rate": 9.389129389129388e-05,
"loss": 0.1636,
"step": 4980
},
{
"epoch": 7.20036036036036,
"grad_norm": 8.866992950439453,
"learning_rate": 9.34102934102934e-05,
"loss": 0.1434,
"step": 4990
},
{
"epoch": 7.214774774774774,
"grad_norm": 6.15886116027832,
"learning_rate": 9.292929292929291e-05,
"loss": 0.1759,
"step": 5000
},
{
"epoch": 7.2291891891891895,
"grad_norm": 6.583317279815674,
"learning_rate": 9.244829244829244e-05,
"loss": 0.1752,
"step": 5010
},
{
"epoch": 7.243603603603604,
"grad_norm": 13.805874824523926,
"learning_rate": 9.196729196729196e-05,
"loss": 0.1778,
"step": 5020
},
{
"epoch": 7.258018018018018,
"grad_norm": 4.149932861328125,
"learning_rate": 9.148629148629148e-05,
"loss": 0.3115,
"step": 5030
},
{
"epoch": 7.272432432432432,
"grad_norm": 13.87183666229248,
"learning_rate": 9.100529100529099e-05,
"loss": 0.1509,
"step": 5040
},
{
"epoch": 7.286846846846847,
"grad_norm": 8.47652530670166,
"learning_rate": 9.052429052429051e-05,
"loss": 0.3549,
"step": 5050
},
{
"epoch": 7.301261261261261,
"grad_norm": 9.171941757202148,
"learning_rate": 9.004329004329004e-05,
"loss": 0.1054,
"step": 5060
},
{
"epoch": 7.315675675675676,
"grad_norm": 9.501484870910645,
"learning_rate": 8.956228956228955e-05,
"loss": 0.1728,
"step": 5070
},
{
"epoch": 7.33009009009009,
"grad_norm": 0.5740847587585449,
"learning_rate": 8.908128908128907e-05,
"loss": 0.116,
"step": 5080
},
{
"epoch": 7.3445045045045045,
"grad_norm": 2.0156924724578857,
"learning_rate": 8.860028860028859e-05,
"loss": 0.1889,
"step": 5090
},
{
"epoch": 7.358918918918919,
"grad_norm": 4.784016132354736,
"learning_rate": 8.811928811928811e-05,
"loss": 0.2124,
"step": 5100
},
{
"epoch": 7.373333333333333,
"grad_norm": 2.135333299636841,
"learning_rate": 8.763828763828762e-05,
"loss": 0.1885,
"step": 5110
},
{
"epoch": 7.387747747747747,
"grad_norm": 13.758618354797363,
"learning_rate": 8.715728715728714e-05,
"loss": 0.2869,
"step": 5120
},
{
"epoch": 7.4021621621621625,
"grad_norm": 10.508682250976562,
"learning_rate": 8.667628667628667e-05,
"loss": 0.09,
"step": 5130
},
{
"epoch": 7.416576576576577,
"grad_norm": 8.677715301513672,
"learning_rate": 8.619528619528619e-05,
"loss": 0.1022,
"step": 5140
},
{
"epoch": 7.430990990990991,
"grad_norm": 7.379012584686279,
"learning_rate": 8.57142857142857e-05,
"loss": 0.2095,
"step": 5150
},
{
"epoch": 7.445405405405405,
"grad_norm": 16.449451446533203,
"learning_rate": 8.523328523328522e-05,
"loss": 0.1052,
"step": 5160
},
{
"epoch": 7.45981981981982,
"grad_norm": 14.736000061035156,
"learning_rate": 8.475228475228474e-05,
"loss": 0.2009,
"step": 5170
},
{
"epoch": 7.474234234234234,
"grad_norm": 3.677145004272461,
"learning_rate": 8.427128427128425e-05,
"loss": 0.1472,
"step": 5180
},
{
"epoch": 7.488648648648649,
"grad_norm": 0.6532973051071167,
"learning_rate": 8.379028379028378e-05,
"loss": 0.1467,
"step": 5190
},
{
"epoch": 7.503063063063063,
"grad_norm": 14.072589874267578,
"learning_rate": 8.33092833092833e-05,
"loss": 0.1727,
"step": 5200
},
{
"epoch": 7.5174774774774775,
"grad_norm": 15.414175033569336,
"learning_rate": 8.282828282828282e-05,
"loss": 0.1885,
"step": 5210
},
{
"epoch": 7.531891891891892,
"grad_norm": 2.108407735824585,
"learning_rate": 8.234728234728233e-05,
"loss": 0.1228,
"step": 5220
},
{
"epoch": 7.546306306306306,
"grad_norm": 13.167756080627441,
"learning_rate": 8.186628186628185e-05,
"loss": 0.1511,
"step": 5230
},
{
"epoch": 7.56072072072072,
"grad_norm": 12.300124168395996,
"learning_rate": 8.138528138528138e-05,
"loss": 0.1712,
"step": 5240
},
{
"epoch": 7.5751351351351355,
"grad_norm": 4.797776222229004,
"learning_rate": 8.09042809042809e-05,
"loss": 0.1385,
"step": 5250
},
{
"epoch": 7.58954954954955,
"grad_norm": 9.989211082458496,
"learning_rate": 8.042328042328041e-05,
"loss": 0.2256,
"step": 5260
},
{
"epoch": 7.603963963963964,
"grad_norm": 21.55989646911621,
"learning_rate": 7.994227994227993e-05,
"loss": 0.2175,
"step": 5270
},
{
"epoch": 7.618378378378378,
"grad_norm": 12.825868606567383,
"learning_rate": 7.946127946127945e-05,
"loss": 0.1561,
"step": 5280
},
{
"epoch": 7.6327927927927925,
"grad_norm": 5.119826793670654,
"learning_rate": 7.902837902837901e-05,
"loss": 0.1237,
"step": 5290
},
{
"epoch": 7.647207207207208,
"grad_norm": 8.325628280639648,
"learning_rate": 7.854737854737855e-05,
"loss": 0.3462,
"step": 5300
},
{
"epoch": 7.661621621621622,
"grad_norm": 8.451800346374512,
"learning_rate": 7.806637806637807e-05,
"loss": 0.2437,
"step": 5310
},
{
"epoch": 7.676036036036036,
"grad_norm": 9.6069974899292,
"learning_rate": 7.758537758537757e-05,
"loss": 0.1846,
"step": 5320
},
{
"epoch": 7.6904504504504505,
"grad_norm": 14.663230895996094,
"learning_rate": 7.71043771043771e-05,
"loss": 0.2186,
"step": 5330
},
{
"epoch": 7.704864864864865,
"grad_norm": 16.57319450378418,
"learning_rate": 7.662337662337662e-05,
"loss": 0.1133,
"step": 5340
},
{
"epoch": 7.719279279279279,
"grad_norm": 10.028879165649414,
"learning_rate": 7.614237614237615e-05,
"loss": 0.1361,
"step": 5350
},
{
"epoch": 7.733693693693693,
"grad_norm": 17.944252014160156,
"learning_rate": 7.566137566137566e-05,
"loss": 0.2533,
"step": 5360
},
{
"epoch": 7.7481081081081085,
"grad_norm": 4.871366500854492,
"learning_rate": 7.518037518037518e-05,
"loss": 0.1396,
"step": 5370
},
{
"epoch": 7.762522522522523,
"grad_norm": 5.787502765655518,
"learning_rate": 7.469937469937469e-05,
"loss": 0.3421,
"step": 5380
},
{
"epoch": 7.776936936936937,
"grad_norm": 20.75065040588379,
"learning_rate": 7.421837421837421e-05,
"loss": 0.1679,
"step": 5390
},
{
"epoch": 7.791351351351351,
"grad_norm": 16.226171493530273,
"learning_rate": 7.373737373737373e-05,
"loss": 0.2005,
"step": 5400
},
{
"epoch": 7.8057657657657655,
"grad_norm": 1.3808518648147583,
"learning_rate": 7.325637325637326e-05,
"loss": 0.2236,
"step": 5410
},
{
"epoch": 7.82018018018018,
"grad_norm": 5.49656343460083,
"learning_rate": 7.277537277537277e-05,
"loss": 0.2159,
"step": 5420
},
{
"epoch": 7.834594594594595,
"grad_norm": 4.51519250869751,
"learning_rate": 7.229437229437229e-05,
"loss": 0.1601,
"step": 5430
},
{
"epoch": 7.849009009009009,
"grad_norm": 3.9731264114379883,
"learning_rate": 7.181337181337181e-05,
"loss": 0.2402,
"step": 5440
},
{
"epoch": 7.8634234234234235,
"grad_norm": 1.414002776145935,
"learning_rate": 7.133237133237133e-05,
"loss": 0.1709,
"step": 5450
},
{
"epoch": 7.877837837837838,
"grad_norm": 3.847299575805664,
"learning_rate": 7.085137085137084e-05,
"loss": 0.2866,
"step": 5460
},
{
"epoch": 7.892252252252252,
"grad_norm": 16.216571807861328,
"learning_rate": 7.037037037037036e-05,
"loss": 0.1026,
"step": 5470
},
{
"epoch": 7.906666666666666,
"grad_norm": 1.87873113155365,
"learning_rate": 6.988936988936989e-05,
"loss": 0.1027,
"step": 5480
},
{
"epoch": 7.921081081081081,
"grad_norm": 11.856677055358887,
"learning_rate": 6.94083694083694e-05,
"loss": 0.0807,
"step": 5490
},
{
"epoch": 7.935495495495496,
"grad_norm": 1.2753289937973022,
"learning_rate": 6.892736892736892e-05,
"loss": 0.1885,
"step": 5500
},
{
"epoch": 7.94990990990991,
"grad_norm": 5.382585048675537,
"learning_rate": 6.844636844636844e-05,
"loss": 0.1034,
"step": 5510
},
{
"epoch": 7.964324324324324,
"grad_norm": 4.376471996307373,
"learning_rate": 6.796536796536796e-05,
"loss": 0.1051,
"step": 5520
},
{
"epoch": 7.9787387387387385,
"grad_norm": 6.501208782196045,
"learning_rate": 6.748436748436747e-05,
"loss": 0.1589,
"step": 5530
},
{
"epoch": 7.993153153153153,
"grad_norm": 7.671748161315918,
"learning_rate": 6.7003367003367e-05,
"loss": 0.1528,
"step": 5540
},
{
"epoch": 7.998918918918919,
"eval_accuracy": 0.6402853260869565,
"eval_loss": 2.1739323139190674,
"eval_runtime": 537.1422,
"eval_samples_per_second": 10.962,
"eval_steps_per_second": 10.962,
"step": 5544
},
{
"epoch": 8.008648648648649,
"grad_norm": 0.7333820462226868,
"learning_rate": 6.652236652236652e-05,
"loss": 0.1737,
"step": 5550
},
{
"epoch": 8.023063063063063,
"grad_norm": 1.1993273496627808,
"learning_rate": 6.604136604136604e-05,
"loss": 0.0923,
"step": 5560
},
{
"epoch": 8.037477477477477,
"grad_norm": 18.680021286010742,
"learning_rate": 6.556036556036555e-05,
"loss": 0.1005,
"step": 5570
},
{
"epoch": 8.051891891891891,
"grad_norm": 19.182872772216797,
"learning_rate": 6.507936507936507e-05,
"loss": 0.1297,
"step": 5580
},
{
"epoch": 8.066306306306306,
"grad_norm": 2.575910806655884,
"learning_rate": 6.45983645983646e-05,
"loss": 0.049,
"step": 5590
},
{
"epoch": 8.08072072072072,
"grad_norm": 1.0843993425369263,
"learning_rate": 6.41173641173641e-05,
"loss": 0.0646,
"step": 5600
},
{
"epoch": 8.095135135135136,
"grad_norm": 0.35826346278190613,
"learning_rate": 6.363636363636363e-05,
"loss": 0.0356,
"step": 5610
},
{
"epoch": 8.10954954954955,
"grad_norm": 1.4210469722747803,
"learning_rate": 6.315536315536315e-05,
"loss": 0.0329,
"step": 5620
},
{
"epoch": 8.123963963963964,
"grad_norm": 8.666502952575684,
"learning_rate": 6.267436267436267e-05,
"loss": 0.0496,
"step": 5630
},
{
"epoch": 8.138378378378379,
"grad_norm": 0.4810231328010559,
"learning_rate": 6.219336219336218e-05,
"loss": 0.0276,
"step": 5640
},
{
"epoch": 8.152792792792793,
"grad_norm": 5.4928789138793945,
"learning_rate": 6.17123617123617e-05,
"loss": 0.0692,
"step": 5650
},
{
"epoch": 8.167207207207207,
"grad_norm": 5.067449569702148,
"learning_rate": 6.123136123136123e-05,
"loss": 0.058,
"step": 5660
},
{
"epoch": 8.181621621621622,
"grad_norm": 25.670732498168945,
"learning_rate": 6.075036075036074e-05,
"loss": 0.1061,
"step": 5670
},
{
"epoch": 8.196036036036036,
"grad_norm": 6.106614589691162,
"learning_rate": 6.0269360269360265e-05,
"loss": 0.0554,
"step": 5680
},
{
"epoch": 8.21045045045045,
"grad_norm": 7.492941379547119,
"learning_rate": 5.978835978835978e-05,
"loss": 0.0667,
"step": 5690
},
{
"epoch": 8.224864864864864,
"grad_norm": 1.3118231296539307,
"learning_rate": 5.9307359307359304e-05,
"loss": 0.0388,
"step": 5700
},
{
"epoch": 8.239279279279279,
"grad_norm": 4.273688316345215,
"learning_rate": 5.882635882635882e-05,
"loss": 0.047,
"step": 5710
},
{
"epoch": 8.253693693693693,
"grad_norm": 2.6258041858673096,
"learning_rate": 5.834535834535834e-05,
"loss": 0.0652,
"step": 5720
},
{
"epoch": 8.268108108108109,
"grad_norm": 5.456060886383057,
"learning_rate": 5.786435786435786e-05,
"loss": 0.1954,
"step": 5730
},
{
"epoch": 8.282522522522523,
"grad_norm": 3.158957004547119,
"learning_rate": 5.738335738335738e-05,
"loss": 0.0662,
"step": 5740
},
{
"epoch": 8.296936936936937,
"grad_norm": 3.201091766357422,
"learning_rate": 5.6902356902356896e-05,
"loss": 0.199,
"step": 5750
},
{
"epoch": 8.311351351351352,
"grad_norm": 1.514101505279541,
"learning_rate": 5.642135642135642e-05,
"loss": 0.1082,
"step": 5760
},
{
"epoch": 8.325765765765766,
"grad_norm": 0.24764111638069153,
"learning_rate": 5.5940355940355935e-05,
"loss": 0.0607,
"step": 5770
},
{
"epoch": 8.34018018018018,
"grad_norm": 1.5579568147659302,
"learning_rate": 5.545935545935545e-05,
"loss": 0.0205,
"step": 5780
},
{
"epoch": 8.354594594594595,
"grad_norm": 9.406379699707031,
"learning_rate": 5.497835497835497e-05,
"loss": 0.0614,
"step": 5790
},
{
"epoch": 8.369009009009009,
"grad_norm": 3.4456870555877686,
"learning_rate": 5.449735449735449e-05,
"loss": 0.0169,
"step": 5800
},
{
"epoch": 8.383423423423423,
"grad_norm": 0.3121024966239929,
"learning_rate": 5.401635401635401e-05,
"loss": 0.078,
"step": 5810
},
{
"epoch": 8.397837837837837,
"grad_norm": 7.2323832511901855,
"learning_rate": 5.353535353535353e-05,
"loss": 0.0794,
"step": 5820
},
{
"epoch": 8.412252252252252,
"grad_norm": 0.42312678694725037,
"learning_rate": 5.305435305435305e-05,
"loss": 0.0229,
"step": 5830
},
{
"epoch": 8.426666666666666,
"grad_norm": 1.5303746461868286,
"learning_rate": 5.2573352573352566e-05,
"loss": 0.0555,
"step": 5840
},
{
"epoch": 8.441081081081082,
"grad_norm": 0.5218743681907654,
"learning_rate": 5.209235209235209e-05,
"loss": 0.097,
"step": 5850
},
{
"epoch": 8.455495495495496,
"grad_norm": 3.4224956035614014,
"learning_rate": 5.1611351611351604e-05,
"loss": 0.0415,
"step": 5860
},
{
"epoch": 8.46990990990991,
"grad_norm": 0.56160569190979,
"learning_rate": 5.113035113035113e-05,
"loss": 0.0476,
"step": 5870
},
{
"epoch": 8.484324324324325,
"grad_norm": 2.77597975730896,
"learning_rate": 5.064935064935064e-05,
"loss": 0.0231,
"step": 5880
},
{
"epoch": 8.498738738738739,
"grad_norm": 2.240520477294922,
"learning_rate": 5.016835016835016e-05,
"loss": 0.051,
"step": 5890
},
{
"epoch": 8.513153153153153,
"grad_norm": 1.585841178894043,
"learning_rate": 4.968734968734968e-05,
"loss": 0.0575,
"step": 5900
},
{
"epoch": 8.527567567567568,
"grad_norm": 12.269892692565918,
"learning_rate": 4.92063492063492e-05,
"loss": 0.0419,
"step": 5910
},
{
"epoch": 8.541981981981982,
"grad_norm": 4.764209747314453,
"learning_rate": 4.872534872534872e-05,
"loss": 0.1574,
"step": 5920
},
{
"epoch": 8.556396396396396,
"grad_norm": 6.484140396118164,
"learning_rate": 4.8244348244348236e-05,
"loss": 0.0667,
"step": 5930
},
{
"epoch": 8.57081081081081,
"grad_norm": 8.274352073669434,
"learning_rate": 4.7763347763347765e-05,
"loss": 0.1035,
"step": 5940
},
{
"epoch": 8.585225225225225,
"grad_norm": 18.833515167236328,
"learning_rate": 4.7282347282347274e-05,
"loss": 0.0372,
"step": 5950
},
{
"epoch": 8.599639639639639,
"grad_norm": 4.068152904510498,
"learning_rate": 4.68013468013468e-05,
"loss": 0.0689,
"step": 5960
},
{
"epoch": 8.614054054054055,
"grad_norm": 4.497600078582764,
"learning_rate": 4.632034632034632e-05,
"loss": 0.0501,
"step": 5970
},
{
"epoch": 8.62846846846847,
"grad_norm": 1.556960940361023,
"learning_rate": 4.583934583934583e-05,
"loss": 0.0988,
"step": 5980
},
{
"epoch": 8.642882882882883,
"grad_norm": 14.646133422851562,
"learning_rate": 4.535834535834536e-05,
"loss": 0.055,
"step": 5990
},
{
"epoch": 8.657297297297298,
"grad_norm": 0.7149348258972168,
"learning_rate": 4.4877344877344874e-05,
"loss": 0.0471,
"step": 6000
},
{
"epoch": 8.671711711711712,
"grad_norm": 0.4112788438796997,
"learning_rate": 4.4396344396344396e-05,
"loss": 0.0755,
"step": 6010
},
{
"epoch": 8.686126126126126,
"grad_norm": 0.7935078740119934,
"learning_rate": 4.391534391534391e-05,
"loss": 0.0194,
"step": 6020
},
{
"epoch": 8.70054054054054,
"grad_norm": 2.739198684692383,
"learning_rate": 4.3434343434343435e-05,
"loss": 0.0313,
"step": 6030
},
{
"epoch": 8.714954954954955,
"grad_norm": 1.197202444076538,
"learning_rate": 4.295334295334295e-05,
"loss": 0.0473,
"step": 6040
},
{
"epoch": 8.729369369369369,
"grad_norm": 2.7497189044952393,
"learning_rate": 4.247234247234247e-05,
"loss": 0.0168,
"step": 6050
},
{
"epoch": 8.743783783783783,
"grad_norm": 22.05868911743164,
"learning_rate": 4.199134199134199e-05,
"loss": 0.0741,
"step": 6060
},
{
"epoch": 8.758198198198198,
"grad_norm": 2.2377078533172607,
"learning_rate": 4.151034151034151e-05,
"loss": 0.0413,
"step": 6070
},
{
"epoch": 8.772612612612612,
"grad_norm": 1.0943878889083862,
"learning_rate": 4.102934102934103e-05,
"loss": 0.0475,
"step": 6080
},
{
"epoch": 8.787027027027026,
"grad_norm": 1.7506133317947388,
"learning_rate": 4.054834054834054e-05,
"loss": 0.0188,
"step": 6090
},
{
"epoch": 8.801441441441442,
"grad_norm": 2.1582717895507812,
"learning_rate": 4.0067340067340066e-05,
"loss": 0.0407,
"step": 6100
},
{
"epoch": 8.815855855855856,
"grad_norm": 13.355046272277832,
"learning_rate": 3.958633958633958e-05,
"loss": 0.1049,
"step": 6110
},
{
"epoch": 8.83027027027027,
"grad_norm": 3.4152133464813232,
"learning_rate": 3.9105339105339104e-05,
"loss": 0.0346,
"step": 6120
},
{
"epoch": 8.844684684684685,
"grad_norm": 0.4933088421821594,
"learning_rate": 3.862433862433862e-05,
"loss": 0.1112,
"step": 6130
},
{
"epoch": 8.8590990990991,
"grad_norm": 12.00542163848877,
"learning_rate": 3.814333814333814e-05,
"loss": 0.0318,
"step": 6140
},
{
"epoch": 8.873513513513513,
"grad_norm": 9.061931610107422,
"learning_rate": 3.766233766233766e-05,
"loss": 0.0962,
"step": 6150
},
{
"epoch": 8.887927927927928,
"grad_norm": 0.15183605253696442,
"learning_rate": 3.7181337181337174e-05,
"loss": 0.093,
"step": 6160
},
{
"epoch": 8.902342342342342,
"grad_norm": 5.919425010681152,
"learning_rate": 3.67003367003367e-05,
"loss": 0.0287,
"step": 6170
},
{
"epoch": 8.916756756756756,
"grad_norm": 6.494754791259766,
"learning_rate": 3.621933621933621e-05,
"loss": 0.0287,
"step": 6180
},
{
"epoch": 8.93117117117117,
"grad_norm": 3.5904083251953125,
"learning_rate": 3.5738335738335735e-05,
"loss": 0.0247,
"step": 6190
},
{
"epoch": 8.945585585585585,
"grad_norm": 5.52282190322876,
"learning_rate": 3.525733525733526e-05,
"loss": 0.0644,
"step": 6200
},
{
"epoch": 8.96,
"grad_norm": 3.505472183227539,
"learning_rate": 3.4776334776334774e-05,
"loss": 0.0133,
"step": 6210
},
{
"epoch": 8.974414414414415,
"grad_norm": 0.13238631188869476,
"learning_rate": 3.4295334295334296e-05,
"loss": 0.0294,
"step": 6220
},
{
"epoch": 8.98882882882883,
"grad_norm": 1.1236836910247803,
"learning_rate": 3.381433381433381e-05,
"loss": 0.0468,
"step": 6230
},
{
"epoch": 8.99891891891892,
"eval_accuracy": 0.6504755434782609,
"eval_loss": 2.3125061988830566,
"eval_runtime": 539.1351,
"eval_samples_per_second": 10.921,
"eval_steps_per_second": 10.921,
"step": 6237
},
{
"epoch": 9.004324324324324,
"grad_norm": 1.5750885009765625,
"learning_rate": 3.333333333333333e-05,
"loss": 0.0234,
"step": 6240
},
{
"epoch": 9.018738738738739,
"grad_norm": 0.3882788121700287,
"learning_rate": 3.285233285233285e-05,
"loss": 0.0151,
"step": 6250
},
{
"epoch": 9.033153153153153,
"grad_norm": 0.2824605107307434,
"learning_rate": 3.2371332371332367e-05,
"loss": 0.0045,
"step": 6260
},
{
"epoch": 9.047567567567567,
"grad_norm": 0.8951876759529114,
"learning_rate": 3.189033189033189e-05,
"loss": 0.0058,
"step": 6270
},
{
"epoch": 9.061981981981981,
"grad_norm": 0.6100791096687317,
"learning_rate": 3.1409331409331405e-05,
"loss": 0.0148,
"step": 6280
},
{
"epoch": 9.076396396396396,
"grad_norm": 8.918787002563477,
"learning_rate": 3.092833092833093e-05,
"loss": 0.0175,
"step": 6290
},
{
"epoch": 9.090810810810812,
"grad_norm": 0.46548986434936523,
"learning_rate": 3.0447330447330447e-05,
"loss": 0.006,
"step": 6300
},
{
"epoch": 9.105225225225226,
"grad_norm": 2.6482155323028564,
"learning_rate": 2.9966329966329966e-05,
"loss": 0.0089,
"step": 6310
},
{
"epoch": 9.11963963963964,
"grad_norm": 0.44524553418159485,
"learning_rate": 2.9485329485329485e-05,
"loss": 0.0063,
"step": 6320
},
{
"epoch": 9.134054054054054,
"grad_norm": 1.2146574258804321,
"learning_rate": 2.9004329004329005e-05,
"loss": 0.0065,
"step": 6330
},
{
"epoch": 9.148468468468469,
"grad_norm": 5.5731201171875,
"learning_rate": 2.852332852332852e-05,
"loss": 0.017,
"step": 6340
},
{
"epoch": 9.162882882882883,
"grad_norm": 1.0001026391983032,
"learning_rate": 2.804232804232804e-05,
"loss": 0.0095,
"step": 6350
},
{
"epoch": 9.177297297297297,
"grad_norm": 0.22491152584552765,
"learning_rate": 2.756132756132756e-05,
"loss": 0.0301,
"step": 6360
},
{
"epoch": 9.191711711711712,
"grad_norm": 0.5325976610183716,
"learning_rate": 2.7080327080327078e-05,
"loss": 0.0296,
"step": 6370
},
{
"epoch": 9.206126126126126,
"grad_norm": 0.44546425342559814,
"learning_rate": 2.6599326599326597e-05,
"loss": 0.0056,
"step": 6380
},
{
"epoch": 9.22054054054054,
"grad_norm": 3.602013349533081,
"learning_rate": 2.6118326118326117e-05,
"loss": 0.014,
"step": 6390
},
{
"epoch": 9.234954954954954,
"grad_norm": 0.4638885259628296,
"learning_rate": 2.5637325637325636e-05,
"loss": 0.01,
"step": 6400
},
{
"epoch": 9.249369369369369,
"grad_norm": 0.21774759888648987,
"learning_rate": 2.5156325156325155e-05,
"loss": 0.0543,
"step": 6410
},
{
"epoch": 9.263783783783785,
"grad_norm": 0.2262602001428604,
"learning_rate": 2.4675324675324674e-05,
"loss": 0.0086,
"step": 6420
},
{
"epoch": 9.278198198198199,
"grad_norm": 1.7811743021011353,
"learning_rate": 2.4194324194324193e-05,
"loss": 0.0109,
"step": 6430
},
{
"epoch": 9.292612612612613,
"grad_norm": 1.6832902431488037,
"learning_rate": 2.371332371332371e-05,
"loss": 0.0076,
"step": 6440
},
{
"epoch": 9.307027027027027,
"grad_norm": 0.11599577963352203,
"learning_rate": 2.323232323232323e-05,
"loss": 0.0065,
"step": 6450
},
{
"epoch": 9.321441441441442,
"grad_norm": 0.049297433346509933,
"learning_rate": 2.2751322751322748e-05,
"loss": 0.0094,
"step": 6460
},
{
"epoch": 9.335855855855856,
"grad_norm": 0.6120862364768982,
"learning_rate": 2.2270322270322267e-05,
"loss": 0.0065,
"step": 6470
},
{
"epoch": 9.35027027027027,
"grad_norm": 0.24179236590862274,
"learning_rate": 2.1789321789321786e-05,
"loss": 0.0156,
"step": 6480
},
{
"epoch": 9.364684684684685,
"grad_norm": 1.3065845966339111,
"learning_rate": 2.1308321308321305e-05,
"loss": 0.0114,
"step": 6490
},
{
"epoch": 9.379099099099099,
"grad_norm": 1.4051166772842407,
"learning_rate": 2.0827320827320825e-05,
"loss": 0.005,
"step": 6500
},
{
"epoch": 9.393513513513513,
"grad_norm": 1.3191016912460327,
"learning_rate": 2.0346320346320344e-05,
"loss": 0.0079,
"step": 6510
},
{
"epoch": 9.407927927927927,
"grad_norm": 0.15781471133232117,
"learning_rate": 1.9865319865319863e-05,
"loss": 0.0144,
"step": 6520
},
{
"epoch": 9.422342342342342,
"grad_norm": 0.2565706968307495,
"learning_rate": 1.9384319384319386e-05,
"loss": 0.0338,
"step": 6530
},
{
"epoch": 9.436756756756758,
"grad_norm": 0.3341190814971924,
"learning_rate": 1.8903318903318905e-05,
"loss": 0.0105,
"step": 6540
},
{
"epoch": 9.451171171171172,
"grad_norm": 0.5033118724822998,
"learning_rate": 1.842231842231842e-05,
"loss": 0.0568,
"step": 6550
},
{
"epoch": 9.465585585585586,
"grad_norm": 1.653732419013977,
"learning_rate": 1.794131794131794e-05,
"loss": 0.0084,
"step": 6560
},
{
"epoch": 9.48,
"grad_norm": 11.09926700592041,
"learning_rate": 1.746031746031746e-05,
"loss": 0.0144,
"step": 6570
},
{
"epoch": 9.494414414414415,
"grad_norm": 0.14694152772426605,
"learning_rate": 1.697931697931698e-05,
"loss": 0.0047,
"step": 6580
},
{
"epoch": 9.508828828828829,
"grad_norm": 0.05755695700645447,
"learning_rate": 1.6498316498316498e-05,
"loss": 0.0096,
"step": 6590
},
{
"epoch": 9.523243243243243,
"grad_norm": 0.30771782994270325,
"learning_rate": 1.6017316017316017e-05,
"loss": 0.0143,
"step": 6600
},
{
"epoch": 9.537657657657657,
"grad_norm": 0.2555331885814667,
"learning_rate": 1.5536315536315536e-05,
"loss": 0.0152,
"step": 6610
},
{
"epoch": 9.552072072072072,
"grad_norm": 0.45528095960617065,
"learning_rate": 1.5055315055315054e-05,
"loss": 0.0055,
"step": 6620
},
{
"epoch": 9.566486486486486,
"grad_norm": 1.118922472000122,
"learning_rate": 1.4574314574314573e-05,
"loss": 0.019,
"step": 6630
},
{
"epoch": 9.5809009009009,
"grad_norm": 0.5122382044792175,
"learning_rate": 1.4093314093314092e-05,
"loss": 0.0534,
"step": 6640
},
{
"epoch": 9.595315315315315,
"grad_norm": 0.18795226514339447,
"learning_rate": 1.3612313612313611e-05,
"loss": 0.0247,
"step": 6650
},
{
"epoch": 9.609729729729729,
"grad_norm": 1.0938136577606201,
"learning_rate": 1.313131313131313e-05,
"loss": 0.0062,
"step": 6660
},
{
"epoch": 9.624144144144145,
"grad_norm": 0.13021990656852722,
"learning_rate": 1.265031265031265e-05,
"loss": 0.0052,
"step": 6670
},
{
"epoch": 9.63855855855856,
"grad_norm": 1.0237598419189453,
"learning_rate": 1.2169312169312167e-05,
"loss": 0.0106,
"step": 6680
},
{
"epoch": 9.652972972972973,
"grad_norm": 0.8002647161483765,
"learning_rate": 1.1688311688311687e-05,
"loss": 0.0051,
"step": 6690
},
{
"epoch": 9.667387387387388,
"grad_norm": 0.5976181030273438,
"learning_rate": 1.1207311207311206e-05,
"loss": 0.0026,
"step": 6700
},
{
"epoch": 9.681801801801802,
"grad_norm": 0.4594089388847351,
"learning_rate": 1.0726310726310727e-05,
"loss": 0.0045,
"step": 6710
},
{
"epoch": 9.696216216216216,
"grad_norm": 0.6820192933082581,
"learning_rate": 1.0245310245310246e-05,
"loss": 0.005,
"step": 6720
},
{
"epoch": 9.71063063063063,
"grad_norm": 0.21790215373039246,
"learning_rate": 9.764309764309763e-06,
"loss": 0.0093,
"step": 6730
},
{
"epoch": 9.725045045045045,
"grad_norm": 3.2225234508514404,
"learning_rate": 9.283309283309283e-06,
"loss": 0.008,
"step": 6740
},
{
"epoch": 9.739459459459459,
"grad_norm": 2.9584898948669434,
"learning_rate": 8.802308802308802e-06,
"loss": 0.032,
"step": 6750
},
{
"epoch": 9.753873873873873,
"grad_norm": 0.250264972448349,
"learning_rate": 8.321308321308321e-06,
"loss": 0.0075,
"step": 6760
},
{
"epoch": 9.768288288288288,
"grad_norm": 14.774813652038574,
"learning_rate": 7.840307840307839e-06,
"loss": 0.0137,
"step": 6770
},
{
"epoch": 9.782702702702704,
"grad_norm": 16.798877716064453,
"learning_rate": 7.359307359307359e-06,
"loss": 0.0144,
"step": 6780
},
{
"epoch": 9.797117117117118,
"grad_norm": 0.39727962017059326,
"learning_rate": 6.878306878306877e-06,
"loss": 0.0068,
"step": 6790
},
{
"epoch": 9.811531531531532,
"grad_norm": 0.6047233939170837,
"learning_rate": 6.397306397306397e-06,
"loss": 0.0046,
"step": 6800
},
{
"epoch": 9.825945945945946,
"grad_norm": 0.6603574752807617,
"learning_rate": 5.916305916305916e-06,
"loss": 0.0058,
"step": 6810
},
{
"epoch": 9.84036036036036,
"grad_norm": 0.07351452112197876,
"learning_rate": 5.435305435305435e-06,
"loss": 0.007,
"step": 6820
},
{
"epoch": 9.854774774774775,
"grad_norm": 0.48447152972221375,
"learning_rate": 4.954304954304954e-06,
"loss": 0.0059,
"step": 6830
},
{
"epoch": 9.86918918918919,
"grad_norm": 0.12311412394046783,
"learning_rate": 4.473304473304473e-06,
"loss": 0.0151,
"step": 6840
},
{
"epoch": 9.883603603603603,
"grad_norm": 0.08983255177736282,
"learning_rate": 3.992303992303992e-06,
"loss": 0.0072,
"step": 6850
},
{
"epoch": 9.898018018018018,
"grad_norm": 0.78732830286026,
"learning_rate": 3.511303511303511e-06,
"loss": 0.0613,
"step": 6860
},
{
"epoch": 9.912432432432432,
"grad_norm": 0.09099213033914566,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.0064,
"step": 6870
},
{
"epoch": 9.926846846846846,
"grad_norm": 0.3043908476829529,
"learning_rate": 2.5493025493025493e-06,
"loss": 0.0062,
"step": 6880
},
{
"epoch": 9.94126126126126,
"grad_norm": 0.16236887872219086,
"learning_rate": 2.068302068302068e-06,
"loss": 0.007,
"step": 6890
},
{
"epoch": 9.955675675675675,
"grad_norm": 1.6547272205352783,
"learning_rate": 1.587301587301587e-06,
"loss": 0.0141,
"step": 6900
},
{
"epoch": 9.97009009009009,
"grad_norm": 14.645796775817871,
"learning_rate": 1.1063011063011063e-06,
"loss": 0.0272,
"step": 6910
},
{
"epoch": 9.984504504504505,
"grad_norm": 0.29277849197387695,
"learning_rate": 6.253006253006252e-07,
"loss": 0.0082,
"step": 6920
},
{
"epoch": 9.99891891891892,
"grad_norm": 0.7076464891433716,
"learning_rate": 1.4430014430014428e-07,
"loss": 0.0045,
"step": 6930
},
{
"epoch": 9.99891891891892,
"eval_accuracy": 0.6554008152173914,
"eval_loss": 2.2544686794281006,
"eval_runtime": 539.1275,
"eval_samples_per_second": 10.921,
"eval_steps_per_second": 10.921,
"step": 6930
},
{
"epoch": 9.99891891891892,
"step": 6930,
"total_flos": 3.884969846408101e+18,
"train_loss": 2.9567832476562925,
"train_runtime": 60026.293,
"train_samples_per_second": 3.697,
"train_steps_per_second": 0.115
}
],
"logging_steps": 10,
"max_steps": 6930,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.884969846408101e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}