{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2415,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00041407867494824016,
"grad_norm": 25.202419247147507,
"learning_rate": 4.132231404958678e-08,
"loss": 1.4005,
"step": 1
},
{
"epoch": 0.002070393374741201,
"grad_norm": 25.12179874511873,
"learning_rate": 2.066115702479339e-07,
"loss": 1.4165,
"step": 5
},
{
"epoch": 0.004140786749482402,
"grad_norm": 16.038038583422736,
"learning_rate": 4.132231404958678e-07,
"loss": 1.3797,
"step": 10
},
{
"epoch": 0.006211180124223602,
"grad_norm": 8.736440231985236,
"learning_rate": 6.198347107438018e-07,
"loss": 1.2598,
"step": 15
},
{
"epoch": 0.008281573498964804,
"grad_norm": 10.997222572810609,
"learning_rate": 8.264462809917356e-07,
"loss": 1.1367,
"step": 20
},
{
"epoch": 0.010351966873706004,
"grad_norm": 4.462899279113632,
"learning_rate": 1.0330578512396695e-06,
"loss": 1.0256,
"step": 25
},
{
"epoch": 0.012422360248447204,
"grad_norm": 3.3727716758436004,
"learning_rate": 1.2396694214876035e-06,
"loss": 0.9766,
"step": 30
},
{
"epoch": 0.014492753623188406,
"grad_norm": 3.296990128668156,
"learning_rate": 1.4462809917355372e-06,
"loss": 0.9495,
"step": 35
},
{
"epoch": 0.016563146997929608,
"grad_norm": 3.0384309271469228,
"learning_rate": 1.6528925619834712e-06,
"loss": 0.9208,
"step": 40
},
{
"epoch": 0.018633540372670808,
"grad_norm": 3.2109039176667187,
"learning_rate": 1.859504132231405e-06,
"loss": 0.9266,
"step": 45
},
{
"epoch": 0.020703933747412008,
"grad_norm": 3.0436406944879613,
"learning_rate": 2.066115702479339e-06,
"loss": 0.9027,
"step": 50
},
{
"epoch": 0.022774327122153208,
"grad_norm": 2.943290356971142,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.8907,
"step": 55
},
{
"epoch": 0.024844720496894408,
"grad_norm": 3.0723834984974894,
"learning_rate": 2.479338842975207e-06,
"loss": 0.8847,
"step": 60
},
{
"epoch": 0.026915113871635612,
"grad_norm": 3.1360516831874334,
"learning_rate": 2.6859504132231405e-06,
"loss": 0.8818,
"step": 65
},
{
"epoch": 0.028985507246376812,
"grad_norm": 3.0625435891926287,
"learning_rate": 2.8925619834710743e-06,
"loss": 0.884,
"step": 70
},
{
"epoch": 0.031055900621118012,
"grad_norm": 2.980216538475329,
"learning_rate": 3.0991735537190086e-06,
"loss": 0.8823,
"step": 75
},
{
"epoch": 0.033126293995859216,
"grad_norm": 3.2098789156337695,
"learning_rate": 3.3057851239669424e-06,
"loss": 0.8795,
"step": 80
},
{
"epoch": 0.035196687370600416,
"grad_norm": 2.9960247768409656,
"learning_rate": 3.5123966942148763e-06,
"loss": 0.882,
"step": 85
},
{
"epoch": 0.037267080745341616,
"grad_norm": 3.173106999088243,
"learning_rate": 3.71900826446281e-06,
"loss": 0.864,
"step": 90
},
{
"epoch": 0.039337474120082816,
"grad_norm": 3.5334328072524377,
"learning_rate": 3.925619834710744e-06,
"loss": 0.8625,
"step": 95
},
{
"epoch": 0.041407867494824016,
"grad_norm": 3.0099142606353935,
"learning_rate": 4.132231404958678e-06,
"loss": 0.8549,
"step": 100
},
{
"epoch": 0.043478260869565216,
"grad_norm": 2.9816578609914943,
"learning_rate": 4.338842975206612e-06,
"loss": 0.8581,
"step": 105
},
{
"epoch": 0.045548654244306416,
"grad_norm": 3.1482211842721495,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.8524,
"step": 110
},
{
"epoch": 0.047619047619047616,
"grad_norm": 3.224342894413885,
"learning_rate": 4.75206611570248e-06,
"loss": 0.8459,
"step": 115
},
{
"epoch": 0.049689440993788817,
"grad_norm": 3.118946291711065,
"learning_rate": 4.958677685950414e-06,
"loss": 0.8423,
"step": 120
},
{
"epoch": 0.051759834368530024,
"grad_norm": 3.2272023587099645,
"learning_rate": 5.165289256198347e-06,
"loss": 0.8388,
"step": 125
},
{
"epoch": 0.053830227743271224,
"grad_norm": 3.512425523515751,
"learning_rate": 5.371900826446281e-06,
"loss": 0.8309,
"step": 130
},
{
"epoch": 0.055900621118012424,
"grad_norm": 3.0536764521591517,
"learning_rate": 5.578512396694216e-06,
"loss": 0.8294,
"step": 135
},
{
"epoch": 0.057971014492753624,
"grad_norm": 3.290725564010408,
"learning_rate": 5.785123966942149e-06,
"loss": 0.8363,
"step": 140
},
{
"epoch": 0.060041407867494824,
"grad_norm": 3.0310839876550086,
"learning_rate": 5.991735537190083e-06,
"loss": 0.8411,
"step": 145
},
{
"epoch": 0.062111801242236024,
"grad_norm": 3.213171239307505,
"learning_rate": 6.198347107438017e-06,
"loss": 0.8357,
"step": 150
},
{
"epoch": 0.06418219461697723,
"grad_norm": 3.0179280975114384,
"learning_rate": 6.404958677685951e-06,
"loss": 0.826,
"step": 155
},
{
"epoch": 0.06625258799171843,
"grad_norm": 3.136701687048142,
"learning_rate": 6.611570247933885e-06,
"loss": 0.8189,
"step": 160
},
{
"epoch": 0.06832298136645963,
"grad_norm": 2.92910540464732,
"learning_rate": 6.818181818181818e-06,
"loss": 0.8303,
"step": 165
},
{
"epoch": 0.07039337474120083,
"grad_norm": 3.303919614430464,
"learning_rate": 7.0247933884297525e-06,
"loss": 0.8409,
"step": 170
},
{
"epoch": 0.07246376811594203,
"grad_norm": 3.2064955824393797,
"learning_rate": 7.231404958677687e-06,
"loss": 0.8175,
"step": 175
},
{
"epoch": 0.07453416149068323,
"grad_norm": 3.0083874019303964,
"learning_rate": 7.43801652892562e-06,
"loss": 0.8326,
"step": 180
},
{
"epoch": 0.07660455486542443,
"grad_norm": 3.032770779835326,
"learning_rate": 7.644628099173555e-06,
"loss": 0.8067,
"step": 185
},
{
"epoch": 0.07867494824016563,
"grad_norm": 3.101904967877232,
"learning_rate": 7.851239669421489e-06,
"loss": 0.8199,
"step": 190
},
{
"epoch": 0.08074534161490683,
"grad_norm": 3.0244431795719766,
"learning_rate": 8.057851239669421e-06,
"loss": 0.8241,
"step": 195
},
{
"epoch": 0.08281573498964803,
"grad_norm": 2.8649857073321745,
"learning_rate": 8.264462809917356e-06,
"loss": 0.7982,
"step": 200
},
{
"epoch": 0.08488612836438923,
"grad_norm": 3.318078893355966,
"learning_rate": 8.47107438016529e-06,
"loss": 0.8212,
"step": 205
},
{
"epoch": 0.08695652173913043,
"grad_norm": 2.9743518030337706,
"learning_rate": 8.677685950413224e-06,
"loss": 0.8253,
"step": 210
},
{
"epoch": 0.08902691511387163,
"grad_norm": 3.0014168160775525,
"learning_rate": 8.884297520661158e-06,
"loss": 0.8134,
"step": 215
},
{
"epoch": 0.09109730848861283,
"grad_norm": 3.149744314927577,
"learning_rate": 9.090909090909091e-06,
"loss": 0.8167,
"step": 220
},
{
"epoch": 0.09316770186335403,
"grad_norm": 3.3658629303836967,
"learning_rate": 9.297520661157025e-06,
"loss": 0.7841,
"step": 225
},
{
"epoch": 0.09523809523809523,
"grad_norm": 3.027318661460287,
"learning_rate": 9.50413223140496e-06,
"loss": 0.8107,
"step": 230
},
{
"epoch": 0.09730848861283643,
"grad_norm": 2.9331307507946445,
"learning_rate": 9.710743801652894e-06,
"loss": 0.8033,
"step": 235
},
{
"epoch": 0.09937888198757763,
"grad_norm": 3.3085978983438724,
"learning_rate": 9.917355371900828e-06,
"loss": 0.7929,
"step": 240
},
{
"epoch": 0.10144927536231885,
"grad_norm": 3.086961507534569,
"learning_rate": 9.999952971391373e-06,
"loss": 0.7974,
"step": 245
},
{
"epoch": 0.10351966873706005,
"grad_norm": 3.0432703518684767,
"learning_rate": 9.999665577542406e-06,
"loss": 0.7977,
"step": 250
},
{
"epoch": 0.10559006211180125,
"grad_norm": 2.913310908079154,
"learning_rate": 9.999116931848504e-06,
"loss": 0.7905,
"step": 255
},
{
"epoch": 0.10766045548654245,
"grad_norm": 2.9319459199084315,
"learning_rate": 9.99830706297852e-06,
"loss": 0.7832,
"step": 260
},
{
"epoch": 0.10973084886128365,
"grad_norm": 3.1199555789631535,
"learning_rate": 9.997236013251234e-06,
"loss": 0.7771,
"step": 265
},
{
"epoch": 0.11180124223602485,
"grad_norm": 3.124234923805504,
"learning_rate": 9.995903838633133e-06,
"loss": 0.7815,
"step": 270
},
{
"epoch": 0.11387163561076605,
"grad_norm": 3.0113713050092077,
"learning_rate": 9.994310608735492e-06,
"loss": 0.7981,
"step": 275
},
{
"epoch": 0.11594202898550725,
"grad_norm": 3.0720530525486063,
"learning_rate": 9.99245640681072e-06,
"loss": 0.7914,
"step": 280
},
{
"epoch": 0.11801242236024845,
"grad_norm": 2.9184696769176792,
"learning_rate": 9.99034132974804e-06,
"loss": 0.7669,
"step": 285
},
{
"epoch": 0.12008281573498965,
"grad_norm": 2.916371546489498,
"learning_rate": 9.987965488068398e-06,
"loss": 0.7936,
"step": 290
},
{
"epoch": 0.12215320910973085,
"grad_norm": 2.865854948598264,
"learning_rate": 9.985329005918702e-06,
"loss": 0.7697,
"step": 295
},
{
"epoch": 0.12422360248447205,
"grad_norm": 2.940161302447046,
"learning_rate": 9.982432021065334e-06,
"loss": 0.7724,
"step": 300
},
{
"epoch": 0.12629399585921325,
"grad_norm": 3.365022939953625,
"learning_rate": 9.979274684886943e-06,
"loss": 0.7763,
"step": 305
},
{
"epoch": 0.12836438923395446,
"grad_norm": 3.3299660498173185,
"learning_rate": 9.975857162366547e-06,
"loss": 0.7925,
"step": 310
},
{
"epoch": 0.13043478260869565,
"grad_norm": 2.801287366629653,
"learning_rate": 9.972179632082899e-06,
"loss": 0.7553,
"step": 315
},
{
"epoch": 0.13250517598343686,
"grad_norm": 2.76578270144393,
"learning_rate": 9.968242286201171e-06,
"loss": 0.7569,
"step": 320
},
{
"epoch": 0.13457556935817805,
"grad_norm": 2.688166103897115,
"learning_rate": 9.964045330462896e-06,
"loss": 0.7622,
"step": 325
},
{
"epoch": 0.13664596273291926,
"grad_norm": 2.884662314626577,
"learning_rate": 9.959588984175228e-06,
"loss": 0.7634,
"step": 330
},
{
"epoch": 0.13871635610766045,
"grad_norm": 2.8001939974195995,
"learning_rate": 9.954873480199483e-06,
"loss": 0.7702,
"step": 335
},
{
"epoch": 0.14078674948240166,
"grad_norm": 3.0189021570529007,
"learning_rate": 9.94989906493896e-06,
"loss": 0.7447,
"step": 340
},
{
"epoch": 0.14285714285714285,
"grad_norm": 3.103139217615635,
"learning_rate": 9.94466599832608e-06,
"loss": 0.7438,
"step": 345
},
{
"epoch": 0.14492753623188406,
"grad_norm": 3.0482579677527317,
"learning_rate": 9.939174553808793e-06,
"loss": 0.757,
"step": 350
},
{
"epoch": 0.14699792960662525,
"grad_norm": 2.922961568445881,
"learning_rate": 9.933425018336292e-06,
"loss": 0.7505,
"step": 355
},
{
"epoch": 0.14906832298136646,
"grad_norm": 2.8723334405858574,
"learning_rate": 9.927417692344025e-06,
"loss": 0.7329,
"step": 360
},
{
"epoch": 0.15113871635610765,
"grad_norm": 2.7928512597563544,
"learning_rate": 9.921152889737985e-06,
"loss": 0.7585,
"step": 365
},
{
"epoch": 0.15320910973084886,
"grad_norm": 2.8401744506025475,
"learning_rate": 9.914630937878315e-06,
"loss": 0.757,
"step": 370
},
{
"epoch": 0.15527950310559005,
"grad_norm": 3.076286706590547,
"learning_rate": 9.907852177562201e-06,
"loss": 0.7548,
"step": 375
},
{
"epoch": 0.15734989648033126,
"grad_norm": 2.8175618190354292,
"learning_rate": 9.900816963006063e-06,
"loss": 0.7421,
"step": 380
},
{
"epoch": 0.15942028985507245,
"grad_norm": 3.028785912971397,
"learning_rate": 9.893525661827043e-06,
"loss": 0.7286,
"step": 385
},
{
"epoch": 0.16149068322981366,
"grad_norm": 3.073226207863677,
"learning_rate": 9.885978655023805e-06,
"loss": 0.7527,
"step": 390
},
{
"epoch": 0.16356107660455488,
"grad_norm": 3.05785989623398,
"learning_rate": 9.87817633695661e-06,
"loss": 0.7362,
"step": 395
},
{
"epoch": 0.16563146997929606,
"grad_norm": 2.905076222884638,
"learning_rate": 9.87011911532673e-06,
"loss": 0.7189,
"step": 400
},
{
"epoch": 0.16770186335403728,
"grad_norm": 2.9328899308458394,
"learning_rate": 9.861807411155126e-06,
"loss": 0.739,
"step": 405
},
{
"epoch": 0.16977225672877846,
"grad_norm": 2.7874865697200697,
"learning_rate": 9.853241658760457e-06,
"loss": 0.7255,
"step": 410
},
{
"epoch": 0.17184265010351968,
"grad_norm": 2.770855522592001,
"learning_rate": 9.844422305736383e-06,
"loss": 0.712,
"step": 415
},
{
"epoch": 0.17391304347826086,
"grad_norm": 2.9867705060318173,
"learning_rate": 9.835349812928178e-06,
"loss": 0.7181,
"step": 420
},
{
"epoch": 0.17598343685300208,
"grad_norm": 2.8398584447403965,
"learning_rate": 9.826024654408645e-06,
"loss": 0.725,
"step": 425
},
{
"epoch": 0.17805383022774326,
"grad_norm": 2.6053173100787688,
"learning_rate": 9.816447317453353e-06,
"loss": 0.714,
"step": 430
},
{
"epoch": 0.18012422360248448,
"grad_norm": 2.731991279961853,
"learning_rate": 9.80661830251516e-06,
"loss": 0.74,
"step": 435
},
{
"epoch": 0.18219461697722567,
"grad_norm": 2.695140369889714,
"learning_rate": 9.796538123198077e-06,
"loss": 0.7212,
"step": 440
},
{
"epoch": 0.18426501035196688,
"grad_norm": 2.7926524393798635,
"learning_rate": 9.786207306230422e-06,
"loss": 0.71,
"step": 445
},
{
"epoch": 0.18633540372670807,
"grad_norm": 2.7953164745662167,
"learning_rate": 9.775626391437303e-06,
"loss": 0.695,
"step": 450
},
{
"epoch": 0.18840579710144928,
"grad_norm": 2.895428993193587,
"learning_rate": 9.764795931712396e-06,
"loss": 0.7152,
"step": 455
},
{
"epoch": 0.19047619047619047,
"grad_norm": 3.0004905465706764,
"learning_rate": 9.753716492989076e-06,
"loss": 0.7109,
"step": 460
},
{
"epoch": 0.19254658385093168,
"grad_norm": 2.939388920358479,
"learning_rate": 9.742388654210822e-06,
"loss": 0.6614,
"step": 465
},
{
"epoch": 0.19461697722567287,
"grad_norm": 2.7692614271421574,
"learning_rate": 9.730813007300984e-06,
"loss": 0.6991,
"step": 470
},
{
"epoch": 0.19668737060041408,
"grad_norm": 2.943721446818141,
"learning_rate": 9.718990157131841e-06,
"loss": 0.7037,
"step": 475
},
{
"epoch": 0.19875776397515527,
"grad_norm": 2.604947728674637,
"learning_rate": 9.706920721492995e-06,
"loss": 0.6952,
"step": 480
},
{
"epoch": 0.20082815734989648,
"grad_norm": 2.9279583555320574,
"learning_rate": 9.694605331059094e-06,
"loss": 0.6898,
"step": 485
},
{
"epoch": 0.2028985507246377,
"grad_norm": 2.8386929777247043,
"learning_rate": 9.682044629356874e-06,
"loss": 0.6799,
"step": 490
},
{
"epoch": 0.20496894409937888,
"grad_norm": 3.008756576964665,
"learning_rate": 9.669239272731538e-06,
"loss": 0.6853,
"step": 495
},
{
"epoch": 0.2070393374741201,
"grad_norm": 2.6602235879778577,
"learning_rate": 9.656189930312443e-06,
"loss": 0.6945,
"step": 500
},
{
"epoch": 0.20910973084886128,
"grad_norm": 3.013251956741497,
"learning_rate": 9.642897283978157e-06,
"loss": 0.671,
"step": 505
},
{
"epoch": 0.2111801242236025,
"grad_norm": 2.9053651262314943,
"learning_rate": 9.629362028320808e-06,
"loss": 0.6749,
"step": 510
},
{
"epoch": 0.21325051759834368,
"grad_norm": 2.7199311132614015,
"learning_rate": 9.615584870609809e-06,
"loss": 0.6814,
"step": 515
},
{
"epoch": 0.2153209109730849,
"grad_norm": 2.894465862241774,
"learning_rate": 9.601566530754882e-06,
"loss": 0.686,
"step": 520
},
{
"epoch": 0.21739130434782608,
"grad_norm": 2.83190203546856,
"learning_rate": 9.587307741268452e-06,
"loss": 0.6662,
"step": 525
},
{
"epoch": 0.2194616977225673,
"grad_norm": 2.80830381942455,
"learning_rate": 9.572809247227366e-06,
"loss": 0.6737,
"step": 530
},
{
"epoch": 0.22153209109730848,
"grad_norm": 2.8195667076208233,
"learning_rate": 9.558071806233955e-06,
"loss": 0.6911,
"step": 535
},
{
"epoch": 0.2236024844720497,
"grad_norm": 2.691767822549545,
"learning_rate": 9.54309618837646e-06,
"loss": 0.6611,
"step": 540
},
{
"epoch": 0.22567287784679088,
"grad_norm": 4.427493055249083,
"learning_rate": 9.52788317618878e-06,
"loss": 0.6794,
"step": 545
},
{
"epoch": 0.2277432712215321,
"grad_norm": 2.8623844423730156,
"learning_rate": 9.512433564609578e-06,
"loss": 0.6789,
"step": 550
},
{
"epoch": 0.22981366459627328,
"grad_norm": 2.5347982174860078,
"learning_rate": 9.496748160940762e-06,
"loss": 0.6758,
"step": 555
},
{
"epoch": 0.2318840579710145,
"grad_norm": 2.916405931910147,
"learning_rate": 9.480827784805278e-06,
"loss": 0.6427,
"step": 560
},
{
"epoch": 0.23395445134575568,
"grad_norm": 2.7277770304150564,
"learning_rate": 9.464673268104299e-06,
"loss": 0.6575,
"step": 565
},
{
"epoch": 0.2360248447204969,
"grad_norm": 2.87238035127745,
"learning_rate": 9.448285454973739e-06,
"loss": 0.6733,
"step": 570
},
{
"epoch": 0.23809523809523808,
"grad_norm": 2.841620041452773,
"learning_rate": 9.431665201740154e-06,
"loss": 0.6592,
"step": 575
},
{
"epoch": 0.2401656314699793,
"grad_norm": 2.6676467953789915,
"learning_rate": 9.414813376876003e-06,
"loss": 0.6461,
"step": 580
},
{
"epoch": 0.2422360248447205,
"grad_norm": 2.9708236577721334,
"learning_rate": 9.397730860954242e-06,
"loss": 0.6527,
"step": 585
},
{
"epoch": 0.2443064182194617,
"grad_norm": 5.35841345818986,
"learning_rate": 9.38041854660234e-06,
"loss": 0.6346,
"step": 590
},
{
"epoch": 0.2463768115942029,
"grad_norm": 2.543881001010828,
"learning_rate": 9.362877338455611e-06,
"loss": 0.6424,
"step": 595
},
{
"epoch": 0.2484472049689441,
"grad_norm": 2.8137763695246267,
"learning_rate": 9.345108153109963e-06,
"loss": 0.6445,
"step": 600
},
{
"epoch": 0.2505175983436853,
"grad_norm": 2.6758866112643505,
"learning_rate": 9.327111919073988e-06,
"loss": 0.6442,
"step": 605
},
{
"epoch": 0.2525879917184265,
"grad_norm": 2.877035571873532,
"learning_rate": 9.308889576720453e-06,
"loss": 0.6456,
"step": 610
},
{
"epoch": 0.2546583850931677,
"grad_norm": 2.8677372658265115,
"learning_rate": 9.290442078237154e-06,
"loss": 0.6247,
"step": 615
},
{
"epoch": 0.2567287784679089,
"grad_norm": 2.932727709740202,
"learning_rate": 9.271770387577168e-06,
"loss": 0.6512,
"step": 620
},
{
"epoch": 0.2587991718426501,
"grad_norm": 3.002742188503913,
"learning_rate": 9.252875480408479e-06,
"loss": 0.6335,
"step": 625
},
{
"epoch": 0.2608695652173913,
"grad_norm": 2.6495692612000235,
"learning_rate": 9.233758344062996e-06,
"loss": 0.6326,
"step": 630
},
{
"epoch": 0.2629399585921325,
"grad_norm": 2.715449722424682,
"learning_rate": 9.21441997748496e-06,
"loss": 0.6231,
"step": 635
},
{
"epoch": 0.2650103519668737,
"grad_norm": 2.5988933441618194,
"learning_rate": 9.194861391178749e-06,
"loss": 0.6441,
"step": 640
},
{
"epoch": 0.2670807453416149,
"grad_norm": 2.6160149596137243,
"learning_rate": 9.175083607156067e-06,
"loss": 0.6161,
"step": 645
},
{
"epoch": 0.2691511387163561,
"grad_norm": 2.729241846927493,
"learning_rate": 9.155087658882555e-06,
"loss": 0.6052,
"step": 650
},
{
"epoch": 0.2712215320910973,
"grad_norm": 2.8768347489745776,
"learning_rate": 9.134874591223773e-06,
"loss": 0.62,
"step": 655
},
{
"epoch": 0.2732919254658385,
"grad_norm": 2.9276810383708565,
"learning_rate": 9.114445460390605e-06,
"loss": 0.6221,
"step": 660
},
{
"epoch": 0.2753623188405797,
"grad_norm": 2.7620318299281212,
"learning_rate": 9.093801333884076e-06,
"loss": 0.6188,
"step": 665
},
{
"epoch": 0.2774327122153209,
"grad_norm": 3.1295950856893535,
"learning_rate": 9.072943290439566e-06,
"loss": 0.5977,
"step": 670
},
{
"epoch": 0.2795031055900621,
"grad_norm": 2.6257142794320996,
"learning_rate": 9.051872419970439e-06,
"loss": 0.603,
"step": 675
},
{
"epoch": 0.2815734989648033,
"grad_norm": 2.650013015058136,
"learning_rate": 9.03058982351109e-06,
"loss": 0.6127,
"step": 680
},
{
"epoch": 0.2836438923395445,
"grad_norm": 2.9013861667064007,
"learning_rate": 9.009096613159426e-06,
"loss": 0.6009,
"step": 685
},
{
"epoch": 0.2857142857142857,
"grad_norm": 2.7077769987668447,
"learning_rate": 8.987393912018732e-06,
"loss": 0.6198,
"step": 690
},
{
"epoch": 0.28778467908902694,
"grad_norm": 2.556201414876345,
"learning_rate": 8.965482854139003e-06,
"loss": 0.5889,
"step": 695
},
{
"epoch": 0.2898550724637681,
"grad_norm": 3.088081295064485,
"learning_rate": 8.943364584457675e-06,
"loss": 0.611,
"step": 700
},
{
"epoch": 0.2919254658385093,
"grad_norm": 2.8946957930783763,
"learning_rate": 8.921040258739804e-06,
"loss": 0.6026,
"step": 705
},
{
"epoch": 0.2939958592132505,
"grad_norm": 2.63148993012509,
"learning_rate": 8.898511043517668e-06,
"loss": 0.611,
"step": 710
},
{
"epoch": 0.29606625258799174,
"grad_norm": 2.775426413397115,
"learning_rate": 8.875778116029816e-06,
"loss": 0.5919,
"step": 715
},
{
"epoch": 0.2981366459627329,
"grad_norm": 2.699577283846763,
"learning_rate": 8.85284266415955e-06,
"loss": 0.6289,
"step": 720
},
{
"epoch": 0.3002070393374741,
"grad_norm": 2.8072229555121693,
"learning_rate": 8.829705886372845e-06,
"loss": 0.585,
"step": 725
},
{
"epoch": 0.3022774327122153,
"grad_norm": 2.696427988483572,
"learning_rate": 8.806368991655747e-06,
"loss": 0.6073,
"step": 730
},
{
"epoch": 0.30434782608695654,
"grad_norm": 2.686305019130463,
"learning_rate": 8.782833199451177e-06,
"loss": 0.5904,
"step": 735
},
{
"epoch": 0.3064182194616977,
"grad_norm": 2.7306138100936566,
"learning_rate": 8.759099739595215e-06,
"loss": 0.5997,
"step": 740
},
{
"epoch": 0.3084886128364389,
"grad_norm": 2.8046514575794985,
"learning_rate": 8.735169852252848e-06,
"loss": 0.5786,
"step": 745
},
{
"epoch": 0.3105590062111801,
"grad_norm": 2.6951846511705404,
"learning_rate": 8.71104478785315e-06,
"loss": 0.5727,
"step": 750
},
{
"epoch": 0.31262939958592134,
"grad_norm": 2.5419653836229874,
"learning_rate": 8.686725807023955e-06,
"loss": 0.5916,
"step": 755
},
{
"epoch": 0.3146997929606625,
"grad_norm": 2.765272923744354,
"learning_rate": 8.662214180525982e-06,
"loss": 0.5706,
"step": 760
},
{
"epoch": 0.3167701863354037,
"grad_norm": 2.5917394303727552,
"learning_rate": 8.637511189186425e-06,
"loss": 0.5845,
"step": 765
},
{
"epoch": 0.3188405797101449,
"grad_norm": 2.69021979941933,
"learning_rate": 8.612618123832033e-06,
"loss": 0.567,
"step": 770
},
{
"epoch": 0.32091097308488614,
"grad_norm": 2.536366805742445,
"learning_rate": 8.587536285221656e-06,
"loss": 0.5718,
"step": 775
},
{
"epoch": 0.32298136645962733,
"grad_norm": 2.6346511819266345,
"learning_rate": 8.562266983978278e-06,
"loss": 0.5827,
"step": 780
},
{
"epoch": 0.3250517598343685,
"grad_norm": 2.77729021265396,
"learning_rate": 8.536811540520529e-06,
"loss": 0.5833,
"step": 785
},
{
"epoch": 0.32712215320910976,
"grad_norm": 3.1173269180583056,
"learning_rate": 8.511171284993686e-06,
"loss": 0.5743,
"step": 790
},
{
"epoch": 0.32919254658385094,
"grad_norm": 2.6450450449366674,
"learning_rate": 8.485347557200177e-06,
"loss": 0.5817,
"step": 795
},
{
"epoch": 0.33126293995859213,
"grad_norm": 2.6826724529299786,
"learning_rate": 8.459341706529557e-06,
"loss": 0.5521,
"step": 800
},
{
"epoch": 0.3333333333333333,
"grad_norm": 2.688335715125397,
"learning_rate": 8.43315509188801e-06,
"loss": 0.574,
"step": 805
},
{
"epoch": 0.33540372670807456,
"grad_norm": 2.5367306505402767,
"learning_rate": 8.406789081627337e-06,
"loss": 0.5665,
"step": 810
},
{
"epoch": 0.33747412008281574,
"grad_norm": 2.6181238987040834,
"learning_rate": 8.380245053473452e-06,
"loss": 0.5786,
"step": 815
},
{
"epoch": 0.33954451345755693,
"grad_norm": 2.6695428259438483,
"learning_rate": 8.353524394454388e-06,
"loss": 0.575,
"step": 820
},
{
"epoch": 0.3416149068322981,
"grad_norm": 2.7917653195606955,
"learning_rate": 8.326628500827826e-06,
"loss": 0.5532,
"step": 825
},
{
"epoch": 0.34368530020703936,
"grad_norm": 2.6595925023210416,
"learning_rate": 8.299558778008137e-06,
"loss": 0.5685,
"step": 830
},
{
"epoch": 0.34575569358178054,
"grad_norm": 2.747451547037362,
"learning_rate": 8.272316640492932e-06,
"loss": 0.5355,
"step": 835
},
{
"epoch": 0.34782608695652173,
"grad_norm": 2.8512479215829347,
"learning_rate": 8.244903511789158e-06,
"loss": 0.5688,
"step": 840
},
{
"epoch": 0.3498964803312629,
"grad_norm": 2.7724639065987366,
"learning_rate": 8.217320824338713e-06,
"loss": 0.5618,
"step": 845
},
{
"epoch": 0.35196687370600416,
"grad_norm": 2.6274981045290047,
"learning_rate": 8.189570019443597e-06,
"loss": 0.5405,
"step": 850
},
{
"epoch": 0.35403726708074534,
"grad_norm": 2.7044220043668448,
"learning_rate": 8.161652547190593e-06,
"loss": 0.5591,
"step": 855
},
{
"epoch": 0.35610766045548653,
"grad_norm": 2.812969363607544,
"learning_rate": 8.133569866375497e-06,
"loss": 0.5551,
"step": 860
},
{
"epoch": 0.3581780538302277,
"grad_norm": 2.625493007061286,
"learning_rate": 8.105323444426891e-06,
"loss": 0.5557,
"step": 865
},
{
"epoch": 0.36024844720496896,
"grad_norm": 2.6956325421660163,
"learning_rate": 8.076914757329467e-06,
"loss": 0.5526,
"step": 870
},
{
"epoch": 0.36231884057971014,
"grad_norm": 2.546614178866351,
"learning_rate": 8.048345289546895e-06,
"loss": 0.5623,
"step": 875
},
{
"epoch": 0.36438923395445133,
"grad_norm": 2.581190054234493,
"learning_rate": 8.01961653394426e-06,
"loss": 0.5463,
"step": 880
},
{
"epoch": 0.36645962732919257,
"grad_norm": 2.6322600937825067,
"learning_rate": 7.990729991710046e-06,
"loss": 0.5543,
"step": 885
},
{
"epoch": 0.36853002070393376,
"grad_norm": 2.6627585369304048,
"learning_rate": 7.9616871722777e-06,
"loss": 0.5498,
"step": 890
},
{
"epoch": 0.37060041407867494,
"grad_norm": 2.6670580379384257,
"learning_rate": 7.932489593246764e-06,
"loss": 0.5339,
"step": 895
},
{
"epoch": 0.37267080745341613,
"grad_norm": 2.6963953532921052,
"learning_rate": 7.903138780303556e-06,
"loss": 0.5551,
"step": 900
},
{
"epoch": 0.3747412008281574,
"grad_norm": 2.5836765938746167,
"learning_rate": 7.873636267141463e-06,
"loss": 0.5242,
"step": 905
},
{
"epoch": 0.37681159420289856,
"grad_norm": 2.619510478717604,
"learning_rate": 7.843983595380793e-06,
"loss": 0.5402,
"step": 910
},
{
"epoch": 0.37888198757763975,
"grad_norm": 2.662476954345239,
"learning_rate": 7.814182314488225e-06,
"loss": 0.5448,
"step": 915
},
{
"epoch": 0.38095238095238093,
"grad_norm": 2.5257784050195555,
"learning_rate": 7.784233981695835e-06,
"loss": 0.5387,
"step": 920
},
{
"epoch": 0.3830227743271222,
"grad_norm": 2.690951049285571,
"learning_rate": 7.754140161919732e-06,
"loss": 0.5332,
"step": 925
},
{
"epoch": 0.38509316770186336,
"grad_norm": 2.605551601699826,
"learning_rate": 7.72390242767828e-06,
"loss": 0.5271,
"step": 930
},
{
"epoch": 0.38716356107660455,
"grad_norm": 2.5955021717573894,
"learning_rate": 7.693522359009931e-06,
"loss": 0.5346,
"step": 935
},
{
"epoch": 0.38923395445134573,
"grad_norm": 2.539066072604622,
"learning_rate": 7.663001543390657e-06,
"loss": 0.5293,
"step": 940
},
{
"epoch": 0.391304347826087,
"grad_norm": 2.605016200331387,
"learning_rate": 7.63234157565101e-06,
"loss": 0.5204,
"step": 945
},
{
"epoch": 0.39337474120082816,
"grad_norm": 2.5777260649391436,
"learning_rate": 7.601544057892769e-06,
"loss": 0.5396,
"step": 950
},
{
"epoch": 0.39544513457556935,
"grad_norm": 2.5855245152567594,
"learning_rate": 7.570610599405242e-06,
"loss": 0.5343,
"step": 955
},
{
"epoch": 0.39751552795031053,
"grad_norm": 2.713454954699433,
"learning_rate": 7.539542816581157e-06,
"loss": 0.5174,
"step": 960
},
{
"epoch": 0.3995859213250518,
"grad_norm": 2.812108009958658,
"learning_rate": 7.508342332832213e-06,
"loss": 0.5248,
"step": 965
},
{
"epoch": 0.40165631469979296,
"grad_norm": 2.646648464458073,
"learning_rate": 7.477010778504241e-06,
"loss": 0.5165,
"step": 970
},
{
"epoch": 0.40372670807453415,
"grad_norm": 2.42290825721576,
"learning_rate": 7.445549790792021e-06,
"loss": 0.5089,
"step": 975
},
{
"epoch": 0.4057971014492754,
"grad_norm": 2.5318991486724847,
"learning_rate": 7.413961013653725e-06,
"loss": 0.5065,
"step": 980
},
{
"epoch": 0.4078674948240166,
"grad_norm": 2.5686476576899575,
"learning_rate": 7.3822460977250145e-06,
"loss": 0.5192,
"step": 985
},
{
"epoch": 0.40993788819875776,
"grad_norm": 2.58280297716493,
"learning_rate": 7.350406700232794e-06,
"loss": 0.5189,
"step": 990
},
{
"epoch": 0.41200828157349895,
"grad_norm": 2.6917719839428713,
"learning_rate": 7.318444484908606e-06,
"loss": 0.5167,
"step": 995
},
{
"epoch": 0.4140786749482402,
"grad_norm": 2.4671867391993416,
"learning_rate": 7.286361121901706e-06,
"loss": 0.5159,
"step": 1000
},
{
"epoch": 0.4161490683229814,
"grad_norm": 2.541240364909601,
"learning_rate": 7.254158287691775e-06,
"loss": 0.5148,
"step": 1005
},
{
"epoch": 0.41821946169772256,
"grad_norm": 2.6249938123623515,
"learning_rate": 7.221837665001335e-06,
"loss": 0.523,
"step": 1010
},
{
"epoch": 0.42028985507246375,
"grad_norm": 2.5469842430294247,
"learning_rate": 7.189400942707804e-06,
"loss": 0.5269,
"step": 1015
},
{
"epoch": 0.422360248447205,
"grad_norm": 2.7022313837528213,
"learning_rate": 7.1568498157552576e-06,
"loss": 0.506,
"step": 1020
},
{
"epoch": 0.4244306418219462,
"grad_norm": 2.4876791467917965,
"learning_rate": 7.124185985065856e-06,
"loss": 0.5086,
"step": 1025
},
{
"epoch": 0.42650103519668736,
"grad_norm": 2.583617363607446,
"learning_rate": 7.091411157450965e-06,
"loss": 0.5103,
"step": 1030
},
{
"epoch": 0.42857142857142855,
"grad_norm": 2.5643698913481843,
"learning_rate": 7.0585270455219654e-06,
"loss": 0.5028,
"step": 1035
},
{
"epoch": 0.4306418219461698,
"grad_norm": 2.5895555730148647,
"learning_rate": 7.025535367600771e-06,
"loss": 0.5027,
"step": 1040
},
{
"epoch": 0.432712215320911,
"grad_norm": 2.6510836238182813,
"learning_rate": 6.992437847630031e-06,
"loss": 0.4949,
"step": 1045
},
{
"epoch": 0.43478260869565216,
"grad_norm": 2.7964019262025577,
"learning_rate": 6.95923621508305e-06,
"loss": 0.494,
"step": 1050
},
{
"epoch": 0.43685300207039335,
"grad_norm": 2.4236744218940838,
"learning_rate": 6.92593220487342e-06,
"loss": 0.4952,
"step": 1055
},
{
"epoch": 0.4389233954451346,
"grad_norm": 2.6086812577026115,
"learning_rate": 6.892527557264358e-06,
"loss": 0.4998,
"step": 1060
},
{
"epoch": 0.4409937888198758,
"grad_norm": 2.5312894768606355,
"learning_rate": 6.859024017777779e-06,
"loss": 0.5021,
"step": 1065
},
{
"epoch": 0.44306418219461696,
"grad_norm": 2.463884912344729,
"learning_rate": 6.825423337103074e-06,
"loss": 0.4866,
"step": 1070
},
{
"epoch": 0.4451345755693582,
"grad_norm": 2.4286686806906457,
"learning_rate": 6.791727271005642e-06,
"loss": 0.4917,
"step": 1075
},
{
"epoch": 0.4472049689440994,
"grad_norm": 2.494038982115163,
"learning_rate": 6.757937580235138e-06,
"loss": 0.4883,
"step": 1080
},
{
"epoch": 0.4492753623188406,
"grad_norm": 2.6966329171496204,
"learning_rate": 6.724056030433464e-06,
"loss": 0.4966,
"step": 1085
},
{
"epoch": 0.45134575569358176,
"grad_norm": 2.514570592807026,
"learning_rate": 6.690084392042514e-06,
"loss": 0.4955,
"step": 1090
},
{
"epoch": 0.453416149068323,
"grad_norm": 2.488358684976745,
"learning_rate": 6.656024440211662e-06,
"loss": 0.4829,
"step": 1095
},
{
"epoch": 0.4554865424430642,
"grad_norm": 2.646524798253244,
"learning_rate": 6.621877954704996e-06,
"loss": 0.4822,
"step": 1100
},
{
"epoch": 0.4575569358178054,
"grad_norm": 2.512182032141478,
"learning_rate": 6.5876467198083235e-06,
"loss": 0.4969,
"step": 1105
},
{
"epoch": 0.45962732919254656,
"grad_norm": 2.648983953220209,
"learning_rate": 6.553332524235937e-06,
"loss": 0.495,
"step": 1110
},
{
"epoch": 0.4616977225672878,
"grad_norm": 2.660589577634855,
"learning_rate": 6.518937161037144e-06,
"loss": 0.5015,
"step": 1115
},
{
"epoch": 0.463768115942029,
"grad_norm": 2.6722045497907794,
"learning_rate": 6.484462427502572e-06,
"loss": 0.4785,
"step": 1120
},
{
"epoch": 0.4658385093167702,
"grad_norm": 2.5809604638619446,
"learning_rate": 6.44991012507026e-06,
"loss": 0.4648,
"step": 1125
},
{
"epoch": 0.46790890269151136,
"grad_norm": 2.734171452675181,
"learning_rate": 6.415282059231518e-06,
"loss": 0.4774,
"step": 1130
},
{
"epoch": 0.4699792960662526,
"grad_norm": 2.5870397012349557,
"learning_rate": 6.380580039436586e-06,
"loss": 0.4838,
"step": 1135
},
{
"epoch": 0.4720496894409938,
"grad_norm": 2.7822429863534515,
"learning_rate": 6.345805879000087e-06,
"loss": 0.4632,
"step": 1140
},
{
"epoch": 0.474120082815735,
"grad_norm": 2.6076973524190157,
"learning_rate": 6.31096139500627e-06,
"loss": 0.4725,
"step": 1145
},
{
"epoch": 0.47619047619047616,
"grad_norm": 2.455526654081033,
"learning_rate": 6.2760484082140604e-06,
"loss": 0.4682,
"step": 1150
},
{
"epoch": 0.4782608695652174,
"grad_norm": 2.6946452852114193,
"learning_rate": 6.24106874296192e-06,
"loss": 0.4659,
"step": 1155
},
{
"epoch": 0.4803312629399586,
"grad_norm": 2.6209213733396317,
"learning_rate": 6.20602422707252e-06,
"loss": 0.477,
"step": 1160
},
{
"epoch": 0.4824016563146998,
"grad_norm": 2.398262365513629,
"learning_rate": 6.1709166917572264e-06,
"loss": 0.4682,
"step": 1165
},
{
"epoch": 0.484472049689441,
"grad_norm": 2.431819800187479,
"learning_rate": 6.135747971520412e-06,
"loss": 0.4683,
"step": 1170
},
{
"epoch": 0.4865424430641822,
"grad_norm": 2.8158037293443035,
"learning_rate": 6.100519904063597e-06,
"loss": 0.4746,
"step": 1175
},
{
"epoch": 0.4886128364389234,
"grad_norm": 2.837390624828845,
"learning_rate": 6.0652343301894345e-06,
"loss": 0.459,
"step": 1180
},
{
"epoch": 0.4906832298136646,
"grad_norm": 2.418135340222932,
"learning_rate": 6.029893093705492e-06,
"loss": 0.4635,
"step": 1185
},
{
"epoch": 0.4927536231884058,
"grad_norm": 2.6362494330529898,
"learning_rate": 5.99449804132794e-06,
"loss": 0.4646,
"step": 1190
},
{
"epoch": 0.494824016563147,
"grad_norm": 2.4849657771126545,
"learning_rate": 5.959051022585025e-06,
"loss": 0.4789,
"step": 1195
},
{
"epoch": 0.4968944099378882,
"grad_norm": 2.4244857566364044,
"learning_rate": 5.923553889720447e-06,
"loss": 0.4575,
"step": 1200
},
{
"epoch": 0.4989648033126294,
"grad_norm": 2.4201955665833808,
"learning_rate": 5.888008497596553e-06,
"loss": 0.463,
"step": 1205
},
{
"epoch": 0.5010351966873706,
"grad_norm": 2.3890836649573903,
"learning_rate": 5.852416703597431e-06,
"loss": 0.4514,
"step": 1210
},
{
"epoch": 0.5031055900621118,
"grad_norm": 2.6412054672478895,
"learning_rate": 5.816780367531841e-06,
"loss": 0.4605,
"step": 1215
},
{
"epoch": 0.505175983436853,
"grad_norm": 2.467673773766997,
"learning_rate": 5.781101351536041e-06,
"loss": 0.4516,
"step": 1220
},
{
"epoch": 0.5072463768115942,
"grad_norm": 2.513551157030368,
"learning_rate": 5.745381519976477e-06,
"loss": 0.4616,
"step": 1225
},
{
"epoch": 0.5093167701863354,
"grad_norm": 2.410367834138193,
"learning_rate": 5.7096227393523716e-06,
"loss": 0.4471,
"step": 1230
},
{
"epoch": 0.5113871635610766,
"grad_norm": 2.5435391349074563,
"learning_rate": 5.673826878198181e-06,
"loss": 0.4666,
"step": 1235
},
{
"epoch": 0.5134575569358178,
"grad_norm": 2.5608201315555044,
"learning_rate": 5.637995806985961e-06,
"loss": 0.4497,
"step": 1240
},
{
"epoch": 0.515527950310559,
"grad_norm": 2.4327777954667433,
"learning_rate": 5.602131398027637e-06,
"loss": 0.4491,
"step": 1245
},
{
"epoch": 0.5175983436853002,
"grad_norm": 2.578121255659851,
"learning_rate": 5.566235525377155e-06,
"loss": 0.4478,
"step": 1250
},
{
"epoch": 0.5196687370600414,
"grad_norm": 2.537852814252821,
"learning_rate": 5.530310064732559e-06,
"loss": 0.4561,
"step": 1255
},
{
"epoch": 0.5217391304347826,
"grad_norm": 2.498709949242266,
"learning_rate": 5.494356893337985e-06,
"loss": 0.4481,
"step": 1260
},
{
"epoch": 0.5238095238095238,
"grad_norm": 2.46805017260375,
"learning_rate": 5.4583778898855576e-06,
"loss": 0.4431,
"step": 1265
},
{
"epoch": 0.525879917184265,
"grad_norm": 2.5578853563846744,
"learning_rate": 5.422374934417228e-06,
"loss": 0.4528,
"step": 1270
},
{
"epoch": 0.5279503105590062,
"grad_norm": 2.4830168594688433,
"learning_rate": 5.386349908226538e-06,
"loss": 0.4429,
"step": 1275
},
{
"epoch": 0.5300207039337475,
"grad_norm": 2.4032070219407307,
"learning_rate": 5.350304693760301e-06,
"loss": 0.4432,
"step": 1280
},
{
"epoch": 0.5320910973084886,
"grad_norm": 2.4153621091186754,
"learning_rate": 5.314241174520251e-06,
"loss": 0.4461,
"step": 1285
},
{
"epoch": 0.5341614906832298,
"grad_norm": 2.6189310940711388,
"learning_rate": 5.2781612349646175e-06,
"loss": 0.442,
"step": 1290
},
{
"epoch": 0.5362318840579711,
"grad_norm": 2.495168142912517,
"learning_rate": 5.242066760409653e-06,
"loss": 0.4284,
"step": 1295
},
{
"epoch": 0.5383022774327122,
"grad_norm": 2.6470359872983504,
"learning_rate": 5.205959636931121e-06,
"loss": 0.4471,
"step": 1300
},
{
"epoch": 0.5403726708074534,
"grad_norm": 2.4505839626707417,
"learning_rate": 5.169841751265738e-06,
"loss": 0.4456,
"step": 1305
},
{
"epoch": 0.5424430641821946,
"grad_norm": 2.385356568459076,
"learning_rate": 5.13371499071259e-06,
"loss": 0.437,
"step": 1310
},
{
"epoch": 0.5445134575569358,
"grad_norm": 2.3732155252137397,
"learning_rate": 5.09758124303451e-06,
"loss": 0.4285,
"step": 1315
},
{
"epoch": 0.546583850931677,
"grad_norm": 2.5134121517653365,
"learning_rate": 5.0614423963594295e-06,
"loss": 0.4383,
"step": 1320
},
{
"epoch": 0.5486542443064182,
"grad_norm": 2.3111288720602516,
"learning_rate": 5.0253003390817264e-06,
"loss": 0.4419,
"step": 1325
},
{
"epoch": 0.5507246376811594,
"grad_norm": 2.575404095324474,
"learning_rate": 4.989156959763551e-06,
"loss": 0.4415,
"step": 1330
},
{
"epoch": 0.5527950310559007,
"grad_norm": 2.710324800040419,
"learning_rate": 4.953014147036121e-06,
"loss": 0.4355,
"step": 1335
},
{
"epoch": 0.5548654244306418,
"grad_norm": 2.554113409667505,
"learning_rate": 4.9168737895010615e-06,
"loss": 0.4439,
"step": 1340
},
{
"epoch": 0.556935817805383,
"grad_norm": 2.679574467996552,
"learning_rate": 4.880737775631698e-06,
"loss": 0.4309,
"step": 1345
},
{
"epoch": 0.5590062111801242,
"grad_norm": 2.584481421393527,
"learning_rate": 4.844607993674382e-06,
"loss": 0.4393,
"step": 1350
},
{
"epoch": 0.5610766045548654,
"grad_norm": 2.545870537459471,
"learning_rate": 4.808486331549824e-06,
"loss": 0.4328,
"step": 1355
},
{
"epoch": 0.5631469979296067,
"grad_norm": 2.469665751682372,
"learning_rate": 4.772374676754444e-06,
"loss": 0.4442,
"step": 1360
},
{
"epoch": 0.5652173913043478,
"grad_norm": 2.38998127762607,
"learning_rate": 4.736274916261741e-06,
"loss": 0.4254,
"step": 1365
},
{
"epoch": 0.567287784679089,
"grad_norm": 2.462443618159964,
"learning_rate": 4.700188936423683e-06,
"loss": 0.4366,
"step": 1370
},
{
"epoch": 0.5693581780538303,
"grad_norm": 2.474511596183826,
"learning_rate": 4.664118622872157e-06,
"loss": 0.4353,
"step": 1375
},
{
"epoch": 0.5714285714285714,
"grad_norm": 2.5413527583854303,
"learning_rate": 4.628065860420417e-06,
"loss": 0.4244,
"step": 1380
},
{
"epoch": 0.5734989648033126,
"grad_norm": 2.372400143060098,
"learning_rate": 4.592032532964611e-06,
"loss": 0.425,
"step": 1385
},
{
"epoch": 0.5755693581780539,
"grad_norm": 2.4090437338047757,
"learning_rate": 4.556020523385326e-06,
"loss": 0.4192,
"step": 1390
},
{
"epoch": 0.577639751552795,
"grad_norm": 2.5082813386736493,
"learning_rate": 4.520031713449215e-06,
"loss": 0.4156,
"step": 1395
},
{
"epoch": 0.5797101449275363,
"grad_norm": 2.234908678405377,
"learning_rate": 4.484067983710653e-06,
"loss": 0.4084,
"step": 1400
},
{
"epoch": 0.5817805383022774,
"grad_norm": 2.3822985177979783,
"learning_rate": 4.448131213413485e-06,
"loss": 0.4282,
"step": 1405
},
{
"epoch": 0.5838509316770186,
"grad_norm": 2.4766022736972664,
"learning_rate": 4.4122232803928145e-06,
"loss": 0.4211,
"step": 1410
},
{
"epoch": 0.5859213250517599,
"grad_norm": 2.3747826324444525,
"learning_rate": 4.376346060976888e-06,
"loss": 0.4404,
"step": 1415
},
{
"epoch": 0.587991718426501,
"grad_norm": 2.6757534541634906,
"learning_rate": 4.340501429889053e-06,
"loss": 0.4242,
"step": 1420
},
{
"epoch": 0.5900621118012422,
"grad_norm": 2.450942230518248,
"learning_rate": 4.30469126014978e-06,
"loss": 0.4139,
"step": 1425
},
{
"epoch": 0.5921325051759835,
"grad_norm": 2.4227115297891655,
"learning_rate": 4.268917422978811e-06,
"loss": 0.4256,
"step": 1430
},
{
"epoch": 0.5942028985507246,
"grad_norm": 2.315434684855971,
"learning_rate": 4.233181787697364e-06,
"loss": 0.4029,
"step": 1435
},
{
"epoch": 0.5962732919254659,
"grad_norm": 2.55017237684355,
"learning_rate": 4.197486221630467e-06,
"loss": 0.4125,
"step": 1440
},
{
"epoch": 0.598343685300207,
"grad_norm": 2.4045013173308862,
"learning_rate": 4.161832590009371e-06,
"loss": 0.4025,
"step": 1445
},
{
"epoch": 0.6004140786749482,
"grad_norm": 2.373661860105871,
"learning_rate": 4.1262227558740915e-06,
"loss": 0.4197,
"step": 1450
},
{
"epoch": 0.6024844720496895,
"grad_norm": 2.4304173483175173,
"learning_rate": 4.090658579976058e-06,
"loss": 0.41,
"step": 1455
},
{
"epoch": 0.6045548654244306,
"grad_norm": 2.3746165203972636,
"learning_rate": 4.055141920680883e-06,
"loss": 0.3976,
"step": 1460
},
{
"epoch": 0.6066252587991718,
"grad_norm": 2.416068109637582,
"learning_rate": 4.019674633871246e-06,
"loss": 0.3966,
"step": 1465
},
{
"epoch": 0.6086956521739131,
"grad_norm": 2.350164465130904,
"learning_rate": 3.984258572849926e-06,
"loss": 0.4078,
"step": 1470
},
{
"epoch": 0.6107660455486542,
"grad_norm": 2.347182227826508,
"learning_rate": 3.9488955882429605e-06,
"loss": 0.3985,
"step": 1475
},
{
"epoch": 0.6128364389233955,
"grad_norm": 2.4697044165198663,
"learning_rate": 3.9135875279029366e-06,
"loss": 0.4032,
"step": 1480
},
{
"epoch": 0.6149068322981367,
"grad_norm": 2.284984923730731,
"learning_rate": 3.878336236812432e-06,
"loss": 0.3964,
"step": 1485
},
{
"epoch": 0.6169772256728778,
"grad_norm": 2.6955137290314,
"learning_rate": 3.84314355698762e-06,
"loss": 0.3857,
"step": 1490
},
{
"epoch": 0.6190476190476191,
"grad_norm": 2.3690541308671142,
"learning_rate": 3.8080113273820024e-06,
"loss": 0.3953,
"step": 1495
},
{
"epoch": 0.6211180124223602,
"grad_norm": 2.3979378425381883,
"learning_rate": 3.7729413837903285e-06,
"loss": 0.3957,
"step": 1500
},
{
"epoch": 0.6231884057971014,
"grad_norm": 2.453015287246099,
"learning_rate": 3.7379355587526543e-06,
"loss": 0.3969,
"step": 1505
},
{
"epoch": 0.6252587991718427,
"grad_norm": 2.3067964899715356,
"learning_rate": 3.702995681458605e-06,
"loss": 0.4022,
"step": 1510
},
{
"epoch": 0.6273291925465838,
"grad_norm": 2.607715536541752,
"learning_rate": 3.668123577651773e-06,
"loss": 0.4025,
"step": 1515
},
{
"epoch": 0.629399585921325,
"grad_norm": 2.333378973090727,
"learning_rate": 3.633321069534325e-06,
"loss": 0.4005,
"step": 1520
},
{
"epoch": 0.6314699792960663,
"grad_norm": 2.356875690550255,
"learning_rate": 3.5985899756717872e-06,
"loss": 0.4047,
"step": 1525
},
{
"epoch": 0.6335403726708074,
"grad_norm": 2.3890409138847772,
"learning_rate": 3.563932110898015e-06,
"loss": 0.3932,
"step": 1530
},
{
"epoch": 0.6356107660455487,
"grad_norm": 2.31968662321495,
"learning_rate": 3.5293492862203595e-06,
"loss": 0.3895,
"step": 1535
},
{
"epoch": 0.6376811594202898,
"grad_norm": 2.5428163637284302,
"learning_rate": 3.494843308725032e-06,
"loss": 0.4065,
"step": 1540
},
{
"epoch": 0.639751552795031,
"grad_norm": 2.5185237710778074,
"learning_rate": 3.460415981482691e-06,
"loss": 0.3839,
"step": 1545
},
{
"epoch": 0.6418219461697723,
"grad_norm": 2.5748344153183926,
"learning_rate": 3.426069103454209e-06,
"loss": 0.3868,
"step": 1550
},
{
"epoch": 0.6438923395445134,
"grad_norm": 2.398514316563059,
"learning_rate": 3.3918044693966802e-06,
"loss": 0.4057,
"step": 1555
},
{
"epoch": 0.6459627329192547,
"grad_norm": 2.3796472538203353,
"learning_rate": 3.357623869769628e-06,
"loss": 0.3908,
"step": 1560
},
{
"epoch": 0.6480331262939959,
"grad_norm": 2.5314996966962813,
"learning_rate": 3.3235290906414575e-06,
"loss": 0.4031,
"step": 1565
},
{
"epoch": 0.650103519668737,
"grad_norm": 2.390195549848524,
"learning_rate": 3.289521913596117e-06,
"loss": 0.395,
"step": 1570
},
{
"epoch": 0.6521739130434783,
"grad_norm": 2.3776782356589634,
"learning_rate": 3.255604115640014e-06,
"loss": 0.3888,
"step": 1575
},
{
"epoch": 0.6542443064182195,
"grad_norm": 2.5080296077127335,
"learning_rate": 3.2217774691091437e-06,
"loss": 0.3888,
"step": 1580
},
{
"epoch": 0.6563146997929606,
"grad_norm": 2.3986665888378504,
"learning_rate": 3.1880437415764936e-06,
"loss": 0.4005,
"step": 1585
},
{
"epoch": 0.6583850931677019,
"grad_norm": 2.375653238705482,
"learning_rate": 3.1544046957596746e-06,
"loss": 0.3899,
"step": 1590
},
{
"epoch": 0.660455486542443,
"grad_norm": 2.4008362573502557,
"learning_rate": 3.1208620894288105e-06,
"loss": 0.3806,
"step": 1595
},
{
"epoch": 0.6625258799171843,
"grad_norm": 2.5109783189098183,
"learning_rate": 3.087417675314687e-06,
"loss": 0.3771,
"step": 1600
},
{
"epoch": 0.6645962732919255,
"grad_norm": 2.2873656945611245,
"learning_rate": 3.0540732010171704e-06,
"loss": 0.3874,
"step": 1605
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2.3617937696185383,
"learning_rate": 3.0208304089138873e-06,
"loss": 0.3871,
"step": 1610
},
{
"epoch": 0.6687370600414079,
"grad_norm": 2.406944381034829,
"learning_rate": 2.9876910360691707e-06,
"loss": 0.3755,
"step": 1615
},
{
"epoch": 0.6708074534161491,
"grad_norm": 2.5027584513607466,
"learning_rate": 2.9546568141433007e-06,
"loss": 0.3823,
"step": 1620
},
{
"epoch": 0.6728778467908902,
"grad_norm": 2.293575964220515,
"learning_rate": 2.921729469302018e-06,
"loss": 0.3768,
"step": 1625
},
{
"epoch": 0.6749482401656315,
"grad_norm": 2.3522455618797347,
"learning_rate": 2.8889107221263203e-06,
"loss": 0.3853,
"step": 1630
},
{
"epoch": 0.6770186335403726,
"grad_norm": 2.4032465700336707,
"learning_rate": 2.856202287522556e-06,
"loss": 0.3738,
"step": 1635
},
{
"epoch": 0.6790890269151139,
"grad_norm": 2.4447099663458,
"learning_rate": 2.8236058746328203e-06,
"loss": 0.3663,
"step": 1640
},
{
"epoch": 0.6811594202898551,
"grad_norm": 2.4001036453392124,
"learning_rate": 2.7911231867456345e-06,
"loss": 0.3811,
"step": 1645
},
{
"epoch": 0.6832298136645962,
"grad_norm": 2.457794387595243,
"learning_rate": 2.7587559212069554e-06,
"loss": 0.3877,
"step": 1650
},
{
"epoch": 0.6853002070393375,
"grad_norm": 2.553103222028848,
"learning_rate": 2.72650576933147e-06,
"loss": 0.3647,
"step": 1655
},
{
"epoch": 0.6873706004140787,
"grad_norm": 2.389432256098554,
"learning_rate": 2.6943744163142292e-06,
"loss": 0.3785,
"step": 1660
},
{
"epoch": 0.6894409937888198,
"grad_norm": 2.4084063403303806,
"learning_rate": 2.662363541142585e-06,
"loss": 0.3775,
"step": 1665
},
{
"epoch": 0.6915113871635611,
"grad_norm": 2.519672396122606,
"learning_rate": 2.6304748165084503e-06,
"loss": 0.3682,
"step": 1670
},
{
"epoch": 0.6935817805383023,
"grad_norm": 2.508292367077065,
"learning_rate": 2.5987099087209035e-06,
"loss": 0.383,
"step": 1675
},
{
"epoch": 0.6956521739130435,
"grad_norm": 2.231700226019991,
"learning_rate": 2.5670704776191158e-06,
"loss": 0.3666,
"step": 1680
},
{
"epoch": 0.6977225672877847,
"grad_norm": 2.341236802443701,
"learning_rate": 2.5355581764856186e-06,
"loss": 0.3752,
"step": 1685
},
{
"epoch": 0.6997929606625258,
"grad_norm": 2.3316516163325844,
"learning_rate": 2.5041746519599036e-06,
"loss": 0.3667,
"step": 1690
},
{
"epoch": 0.7018633540372671,
"grad_norm": 2.404699267564602,
"learning_rate": 2.4729215439523945e-06,
"loss": 0.3554,
"step": 1695
},
{
"epoch": 0.7039337474120083,
"grad_norm": 2.3452352217274246,
"learning_rate": 2.441800485558748e-06,
"loss": 0.3602,
"step": 1700
},
{
"epoch": 0.7060041407867494,
"grad_norm": 2.2806454563645215,
"learning_rate": 2.410813102974512e-06,
"loss": 0.3565,
"step": 1705
},
{
"epoch": 0.7080745341614907,
"grad_norm": 2.307465580360019,
"learning_rate": 2.3799610154101603e-06,
"loss": 0.3606,
"step": 1710
},
{
"epoch": 0.7101449275362319,
"grad_norm": 2.452751128805671,
"learning_rate": 2.3492458350064805e-06,
"loss": 0.3556,
"step": 1715
},
{
"epoch": 0.7122153209109731,
"grad_norm": 2.2976764218550416,
"learning_rate": 2.3186691667503315e-06,
"loss": 0.3628,
"step": 1720
},
{
"epoch": 0.7142857142857143,
"grad_norm": 2.3304091828568834,
"learning_rate": 2.288232608390778e-06,
"loss": 0.3704,
"step": 1725
},
{
"epoch": 0.7163561076604554,
"grad_norm": 2.2865573025374255,
"learning_rate": 2.2579377503555967e-06,
"loss": 0.3596,
"step": 1730
},
{
"epoch": 0.7184265010351967,
"grad_norm": 2.352192149158963,
"learning_rate": 2.227786175668181e-06,
"loss": 0.3749,
"step": 1735
},
{
"epoch": 0.7204968944099379,
"grad_norm": 2.4289410342699123,
"learning_rate": 2.1977794598648106e-06,
"loss": 0.3618,
"step": 1740
},
{
"epoch": 0.722567287784679,
"grad_norm": 2.262023452323366,
"learning_rate": 2.167919170912333e-06,
"loss": 0.3651,
"step": 1745
},
{
"epoch": 0.7246376811594203,
"grad_norm": 2.326495097892394,
"learning_rate": 2.138206869126222e-06,
"loss": 0.3641,
"step": 1750
},
{
"epoch": 0.7267080745341615,
"grad_norm": 2.2660773938450736,
"learning_rate": 2.108644107089055e-06,
"loss": 0.3615,
"step": 1755
},
{
"epoch": 0.7287784679089027,
"grad_norm": 2.4931465842695126,
"learning_rate": 2.0792324295693782e-06,
"loss": 0.3459,
"step": 1760
},
{
"epoch": 0.7308488612836439,
"grad_norm": 2.4474318426483594,
"learning_rate": 2.0499733734409884e-06,
"loss": 0.3508,
"step": 1765
},
{
"epoch": 0.7329192546583851,
"grad_norm": 2.288988725791088,
"learning_rate": 2.020868467602623e-06,
"loss": 0.3491,
"step": 1770
},
{
"epoch": 0.7349896480331263,
"grad_norm": 2.3740385701739863,
"learning_rate": 1.9919192328980703e-06,
"loss": 0.3551,
"step": 1775
},
{
"epoch": 0.7370600414078675,
"grad_norm": 2.296340141809961,
"learning_rate": 1.9631271820367058e-06,
"loss": 0.3648,
"step": 1780
},
{
"epoch": 0.7391304347826086,
"grad_norm": 2.3859405743705238,
"learning_rate": 1.9344938195144353e-06,
"loss": 0.3458,
"step": 1785
},
{
"epoch": 0.7412008281573499,
"grad_norm": 2.3967009035292013,
"learning_rate": 1.9060206415350896e-06,
"loss": 0.3644,
"step": 1790
},
{
"epoch": 0.7432712215320911,
"grad_norm": 2.3650319320512256,
"learning_rate": 1.8777091359322386e-06,
"loss": 0.3586,
"step": 1795
},
{
"epoch": 0.7453416149068323,
"grad_norm": 2.3509250287781063,
"learning_rate": 1.8495607820914451e-06,
"loss": 0.3394,
"step": 1800
},
{
"epoch": 0.7474120082815735,
"grad_norm": 2.4462460721521566,
"learning_rate": 1.8215770508729602e-06,
"loss": 0.3533,
"step": 1805
},
{
"epoch": 0.7494824016563147,
"grad_norm": 2.4120928416913348,
"learning_rate": 1.7937594045348666e-06,
"loss": 0.3635,
"step": 1810
},
{
"epoch": 0.7515527950310559,
"grad_norm": 2.447171032014947,
"learning_rate": 1.7661092966566728e-06,
"loss": 0.3611,
"step": 1815
},
{
"epoch": 0.7536231884057971,
"grad_norm": 2.3997267555811703,
"learning_rate": 1.7386281720633569e-06,
"loss": 0.3573,
"step": 1820
},
{
"epoch": 0.7556935817805382,
"grad_norm": 2.526067948670027,
"learning_rate": 1.7113174667498612e-06,
"loss": 0.3465,
"step": 1825
},
{
"epoch": 0.7577639751552795,
"grad_norm": 2.4134155391984216,
"learning_rate": 1.6841786078060674e-06,
"loss": 0.3489,
"step": 1830
},
{
"epoch": 0.7598343685300207,
"grad_norm": 2.3620900320816407,
"learning_rate": 1.6572130133422204e-06,
"loss": 0.3606,
"step": 1835
},
{
"epoch": 0.7619047619047619,
"grad_norm": 2.3258830434552333,
"learning_rate": 1.630422092414823e-06,
"loss": 0.3462,
"step": 1840
},
{
"epoch": 0.7639751552795031,
"grad_norm": 2.507634486482911,
"learning_rate": 1.6038072449530118e-06,
"loss": 0.344,
"step": 1845
},
{
"epoch": 0.7660455486542443,
"grad_norm": 2.252574889611392,
"learning_rate": 1.5773698616854065e-06,
"loss": 0.3377,
"step": 1850
},
{
"epoch": 0.7681159420289855,
"grad_norm": 2.282939081832581,
"learning_rate": 1.5511113240674364e-06,
"loss": 0.3518,
"step": 1855
},
{
"epoch": 0.7701863354037267,
"grad_norm": 2.5201633790347926,
"learning_rate": 1.5250330042091543e-06,
"loss": 0.3419,
"step": 1860
},
{
"epoch": 0.772256728778468,
"grad_norm": 2.3378159307068187,
"learning_rate": 1.4991362648035368e-06,
"loss": 0.3519,
"step": 1865
},
{
"epoch": 0.7743271221532091,
"grad_norm": 2.2870256461374736,
"learning_rate": 1.4734224590552844e-06,
"loss": 0.3505,
"step": 1870
},
{
"epoch": 0.7763975155279503,
"grad_norm": 2.22216200101462,
"learning_rate": 1.4478929306101025e-06,
"loss": 0.3472,
"step": 1875
},
{
"epoch": 0.7784679089026915,
"grad_norm": 2.516344452921512,
"learning_rate": 1.4225490134844967e-06,
"loss": 0.3406,
"step": 1880
},
{
"epoch": 0.7805383022774327,
"grad_norm": 2.4407291133026496,
"learning_rate": 1.3973920319960654e-06,
"loss": 0.3507,
"step": 1885
},
{
"epoch": 0.782608695652174,
"grad_norm": 2.4350064276309467,
"learning_rate": 1.3724233006942972e-06,
"loss": 0.3603,
"step": 1890
},
{
"epoch": 0.7846790890269151,
"grad_norm": 2.4448018949580126,
"learning_rate": 1.347644124291883e-06,
"loss": 0.3469,
"step": 1895
},
{
"epoch": 0.7867494824016563,
"grad_norm": 2.442534113167265,
"learning_rate": 1.323055797596532e-06,
"loss": 0.3489,
"step": 1900
},
{
"epoch": 0.7888198757763976,
"grad_norm": 2.5804040167566464,
"learning_rate": 1.2986596054433255e-06,
"loss": 0.3525,
"step": 1905
},
{
"epoch": 0.7908902691511387,
"grad_norm": 2.2700326982249757,
"learning_rate": 1.2744568226275662e-06,
"loss": 0.3466,
"step": 1910
},
{
"epoch": 0.7929606625258799,
"grad_norm": 2.3799632085457563,
"learning_rate": 1.2504487138381782e-06,
"loss": 0.3308,
"step": 1915
},
{
"epoch": 0.7950310559006211,
"grad_norm": 2.440779468234445,
"learning_rate": 1.2266365335916086e-06,
"loss": 0.3388,
"step": 1920
},
{
"epoch": 0.7971014492753623,
"grad_norm": 2.6076552537353663,
"learning_rate": 1.203021526166287e-06,
"loss": 0.343,
"step": 1925
},
{
"epoch": 0.7991718426501035,
"grad_norm": 2.3498035887338213,
"learning_rate": 1.179604925537598e-06,
"loss": 0.3469,
"step": 1930
},
{
"epoch": 0.8012422360248447,
"grad_norm": 2.4386105466174897,
"learning_rate": 1.1563879553134072e-06,
"loss": 0.3438,
"step": 1935
},
{
"epoch": 0.8033126293995859,
"grad_norm": 2.222506157666813,
"learning_rate": 1.1333718286701162e-06,
"loss": 0.329,
"step": 1940
},
{
"epoch": 0.8053830227743272,
"grad_norm": 2.343211840912974,
"learning_rate": 1.110557748289275e-06,
"loss": 0.3419,
"step": 1945
},
{
"epoch": 0.8074534161490683,
"grad_norm": 2.449345558253982,
"learning_rate": 1.0879469062947369e-06,
"loss": 0.344,
"step": 1950
},
{
"epoch": 0.8095238095238095,
"grad_norm": 2.252437893667125,
"learning_rate": 1.0655404841903648e-06,
"loss": 0.3307,
"step": 1955
},
{
"epoch": 0.8115942028985508,
"grad_norm": 2.269087250324664,
"learning_rate": 1.0433396527982893e-06,
"loss": 0.33,
"step": 1960
},
{
"epoch": 0.8136645962732919,
"grad_norm": 2.3179756982928597,
"learning_rate": 1.0213455721977356e-06,
"loss": 0.3503,
"step": 1965
},
{
"epoch": 0.8157349896480331,
"grad_norm": 2.303567008366539,
"learning_rate": 9.995593916643981e-07,
"loss": 0.3314,
"step": 1970
},
{
"epoch": 0.8178053830227743,
"grad_norm": 2.4979873981211,
"learning_rate": 9.779822496103902e-07,
"loss": 0.3152,
"step": 1975
},
{
"epoch": 0.8198757763975155,
"grad_norm": 2.1769158789158918,
"learning_rate": 9.566152735247576e-07,
"loss": 0.3344,
"step": 1980
},
{
"epoch": 0.8219461697722568,
"grad_norm": 2.282167400217365,
"learning_rate": 9.354595799145627e-07,
"loss": 0.3329,
"step": 1985
},
{
"epoch": 0.8240165631469979,
"grad_norm": 2.308650436192575,
"learning_rate": 9.145162742465424e-07,
"loss": 0.3285,
"step": 1990
},
{
"epoch": 0.8260869565217391,
"grad_norm": 2.344011952358636,
"learning_rate": 8.937864508893396e-07,
"loss": 0.3315,
"step": 1995
},
{
"epoch": 0.8281573498964804,
"grad_norm": 2.428466280879843,
"learning_rate": 8.732711930563259e-07,
"loss": 0.3332,
"step": 2000
},
{
"epoch": 0.8302277432712215,
"grad_norm": 2.5043479750930784,
"learning_rate": 8.529715727489912e-07,
"loss": 0.3302,
"step": 2005
},
{
"epoch": 0.8322981366459627,
"grad_norm": 2.472524424303242,
"learning_rate": 8.32888650700936e-07,
"loss": 0.3339,
"step": 2010
},
{
"epoch": 0.8343685300207039,
"grad_norm": 2.215326262827526,
"learning_rate": 8.130234763224342e-07,
"loss": 0.3437,
"step": 2015
},
{
"epoch": 0.8364389233954451,
"grad_norm": 2.4522037152626943,
"learning_rate": 7.93377087645607e-07,
"loss": 0.334,
"step": 2020
},
{
"epoch": 0.8385093167701864,
"grad_norm": 2.2718308094595163,
"learning_rate": 7.739505112701751e-07,
"loss": 0.3388,
"step": 2025
},
{
"epoch": 0.8405797101449275,
"grad_norm": 2.2109463262055566,
"learning_rate": 7.547447623098191e-07,
"loss": 0.3293,
"step": 2030
},
{
"epoch": 0.8426501035196687,
"grad_norm": 2.422012212632736,
"learning_rate": 7.357608443391312e-07,
"loss": 0.3283,
"step": 2035
},
{
"epoch": 0.84472049689441,
"grad_norm": 2.3366029526880454,
"learning_rate": 7.169997493411774e-07,
"loss": 0.3394,
"step": 2040
},
{
"epoch": 0.8467908902691511,
"grad_norm": 2.280587622120492,
"learning_rate": 6.984624576556647e-07,
"loss": 0.3316,
"step": 2045
},
{
"epoch": 0.8488612836438924,
"grad_norm": 2.500805134048855,
"learning_rate": 6.801499379277115e-07,
"loss": 0.324,
"step": 2050
},
{
"epoch": 0.8509316770186336,
"grad_norm": 2.5427240212059488,
"learning_rate": 6.620631470572314e-07,
"loss": 0.3413,
"step": 2055
},
{
"epoch": 0.8530020703933747,
"grad_norm": 2.43268409428744,
"learning_rate": 6.442030301489338e-07,
"loss": 0.3284,
"step": 2060
},
{
"epoch": 0.855072463768116,
"grad_norm": 2.258241264471088,
"learning_rate": 6.265705204629402e-07,
"loss": 0.3184,
"step": 2065
},
{
"epoch": 0.8571428571428571,
"grad_norm": 2.242398674684009,
"learning_rate": 6.091665393660112e-07,
"loss": 0.3319,
"step": 2070
},
{
"epoch": 0.8592132505175983,
"grad_norm": 2.3482008204333398,
"learning_rate": 5.919919962834069e-07,
"loss": 0.3231,
"step": 2075
},
{
"epoch": 0.8612836438923396,
"grad_norm": 2.366760158355553,
"learning_rate": 5.750477886513656e-07,
"loss": 0.3312,
"step": 2080
},
{
"epoch": 0.8633540372670807,
"grad_norm": 2.3513742279943326,
"learning_rate": 5.58334801870209e-07,
"loss": 0.3367,
"step": 2085
},
{
"epoch": 0.865424430641822,
"grad_norm": 2.3613130008829075,
"learning_rate": 5.418539092580727e-07,
"loss": 0.3294,
"step": 2090
},
{
"epoch": 0.8674948240165632,
"grad_norm": 2.4347997686246248,
"learning_rate": 5.256059720052787e-07,
"loss": 0.3234,
"step": 2095
},
{
"epoch": 0.8695652173913043,
"grad_norm": 2.2092415116265345,
"learning_rate": 5.095918391293303e-07,
"loss": 0.3222,
"step": 2100
},
{
"epoch": 0.8716356107660456,
"grad_norm": 2.3458774987933326,
"learning_rate": 4.938123474305473e-07,
"loss": 0.3348,
"step": 2105
},
{
"epoch": 0.8737060041407867,
"grad_norm": 2.4897692670638545,
"learning_rate": 4.782683214483413e-07,
"loss": 0.3351,
"step": 2110
},
{
"epoch": 0.8757763975155279,
"grad_norm": 2.396327013145959,
"learning_rate": 4.629605734181331e-07,
"loss": 0.3201,
"step": 2115
},
{
"epoch": 0.8778467908902692,
"grad_norm": 2.360822136914451,
"learning_rate": 4.478899032289047e-07,
"loss": 0.314,
"step": 2120
},
{
"epoch": 0.8799171842650103,
"grad_norm": 2.3310983426231737,
"learning_rate": 4.3305709838140755e-07,
"loss": 0.327,
"step": 2125
},
{
"epoch": 0.8819875776397516,
"grad_norm": 2.3688781519915287,
"learning_rate": 4.1846293394700645e-07,
"loss": 0.333,
"step": 2130
},
{
"epoch": 0.8840579710144928,
"grad_norm": 2.376181781889082,
"learning_rate": 4.041081725271856e-07,
"loss": 0.3256,
"step": 2135
},
{
"epoch": 0.8861283643892339,
"grad_norm": 2.4221964696193012,
"learning_rate": 3.8999356421369426e-07,
"loss": 0.3159,
"step": 2140
},
{
"epoch": 0.8881987577639752,
"grad_norm": 2.451153892372484,
"learning_rate": 3.7611984654935707e-07,
"loss": 0.3326,
"step": 2145
},
{
"epoch": 0.8902691511387164,
"grad_norm": 2.4645445037651994,
"learning_rate": 3.6248774448952695e-07,
"loss": 0.3147,
"step": 2150
},
{
"epoch": 0.8923395445134575,
"grad_norm": 2.36063324025931,
"learning_rate": 3.490979703642117e-07,
"loss": 0.3214,
"step": 2155
},
{
"epoch": 0.8944099378881988,
"grad_norm": 2.367387659347362,
"learning_rate": 3.35951223840848e-07,
"loss": 0.3195,
"step": 2160
},
{
"epoch": 0.8964803312629399,
"grad_norm": 2.4776969660049786,
"learning_rate": 3.2304819188773815e-07,
"loss": 0.3157,
"step": 2165
},
{
"epoch": 0.8985507246376812,
"grad_norm": 2.2501313497169546,
"learning_rate": 3.1038954873816027e-07,
"loss": 0.3239,
"step": 2170
},
{
"epoch": 0.9006211180124224,
"grad_norm": 2.487032588980476,
"learning_rate": 2.9797595585512986e-07,
"loss": 0.3137,
"step": 2175
},
{
"epoch": 0.9026915113871635,
"grad_norm": 2.3994160126772686,
"learning_rate": 2.858080618968423e-07,
"loss": 0.3318,
"step": 2180
},
{
"epoch": 0.9047619047619048,
"grad_norm": 2.431716795327717,
"learning_rate": 2.738865026827714e-07,
"loss": 0.3189,
"step": 2185
},
{
"epoch": 0.906832298136646,
"grad_norm": 2.328576506089089,
"learning_rate": 2.6221190116045126e-07,
"loss": 0.326,
"step": 2190
},
{
"epoch": 0.9089026915113871,
"grad_norm": 2.664682914301927,
"learning_rate": 2.507848673729224e-07,
"loss": 0.3242,
"step": 2195
},
{
"epoch": 0.9109730848861284,
"grad_norm": 2.479613300400657,
"learning_rate": 2.3960599842685394e-07,
"loss": 0.3336,
"step": 2200
},
{
"epoch": 0.9130434782608695,
"grad_norm": 2.2791055625567185,
"learning_rate": 2.2867587846134188e-07,
"loss": 0.3241,
"step": 2205
},
{
"epoch": 0.9151138716356108,
"grad_norm": 2.412625004476594,
"learning_rate": 2.179950786173879e-07,
"loss": 0.3261,
"step": 2210
},
{
"epoch": 0.917184265010352,
"grad_norm": 2.34544574477024,
"learning_rate": 2.07564157008055e-07,
"loss": 0.3227,
"step": 2215
},
{
"epoch": 0.9192546583850931,
"grad_norm": 2.5441930579171372,
"learning_rate": 1.9738365868930188e-07,
"loss": 0.3204,
"step": 2220
},
{
"epoch": 0.9213250517598344,
"grad_norm": 2.437597190151229,
"learning_rate": 1.8745411563150128e-07,
"loss": 0.317,
"step": 2225
},
{
"epoch": 0.9233954451345756,
"grad_norm": 2.6092549862849777,
"learning_rate": 1.7777604669164727e-07,
"loss": 0.3179,
"step": 2230
},
{
"epoch": 0.9254658385093167,
"grad_norm": 2.347264025890746,
"learning_rate": 1.6834995758623685e-07,
"loss": 0.3169,
"step": 2235
},
{
"epoch": 0.927536231884058,
"grad_norm": 2.5310233377945597,
"learning_rate": 1.5917634086484778e-07,
"loss": 0.3246,
"step": 2240
},
{
"epoch": 0.9296066252587992,
"grad_norm": 2.3035152856926326,
"learning_rate": 1.5025567588439925e-07,
"loss": 0.3135,
"step": 2245
},
{
"epoch": 0.9316770186335404,
"grad_norm": 2.433671908497608,
"learning_rate": 1.415884287841074e-07,
"loss": 0.3284,
"step": 2250
},
{
"epoch": 0.9337474120082816,
"grad_norm": 2.4164031261091545,
"learning_rate": 1.331750524611225e-07,
"loss": 0.3171,
"step": 2255
},
{
"epoch": 0.9358178053830227,
"grad_norm": 2.272442688415743,
"learning_rate": 1.2501598654686532e-07,
"loss": 0.315,
"step": 2260
},
{
"epoch": 0.937888198757764,
"grad_norm": 2.529184131301037,
"learning_rate": 1.1711165738405749e-07,
"loss": 0.3152,
"step": 2265
},
{
"epoch": 0.9399585921325052,
"grad_norm": 2.331043749291153,
"learning_rate": 1.0946247800443899e-07,
"loss": 0.3151,
"step": 2270
},
{
"epoch": 0.9420289855072463,
"grad_norm": 2.5426329327421104,
"learning_rate": 1.0206884810718964e-07,
"loss": 0.3239,
"step": 2275
},
{
"epoch": 0.9440993788819876,
"grad_norm": 2.312668707858943,
"learning_rate": 9.493115403803876e-08,
"loss": 0.3151,
"step": 2280
},
{
"epoch": 0.9461697722567288,
"grad_norm": 2.4583990144456647,
"learning_rate": 8.804976876908178e-08,
"loss": 0.3189,
"step": 2285
},
{
"epoch": 0.94824016563147,
"grad_norm": 2.4715554221576554,
"learning_rate": 8.142505187928861e-08,
"loss": 0.3266,
"step": 2290
},
{
"epoch": 0.9503105590062112,
"grad_norm": 2.279313208381015,
"learning_rate": 7.505734953571486e-08,
"loss": 0.321,
"step": 2295
},
{
"epoch": 0.9523809523809523,
"grad_norm": 2.424936812987473,
"learning_rate": 6.894699447541231e-08,
"loss": 0.313,
"step": 2300
},
{
"epoch": 0.9544513457556936,
"grad_norm": 2.317154235770137,
"learning_rate": 6.309430598804234e-08,
"loss": 0.3078,
"step": 2305
},
{
"epoch": 0.9565217391304348,
"grad_norm": 2.317862973691703,
"learning_rate": 5.7499589899193134e-08,
"loss": 0.3233,
"step": 2310
},
{
"epoch": 0.9585921325051759,
"grad_norm": 2.4806788487037736,
"learning_rate": 5.216313855439858e-08,
"loss": 0.3203,
"step": 2315
},
{
"epoch": 0.9606625258799172,
"grad_norm": 2.444202329022983,
"learning_rate": 4.7085230803860513e-08,
"loss": 0.3231,
"step": 2320
},
{
"epoch": 0.9627329192546584,
"grad_norm": 2.5081514969218928,
"learning_rate": 4.2266131987880876e-08,
"loss": 0.3208,
"step": 2325
},
{
"epoch": 0.9648033126293996,
"grad_norm": 2.319139177260043,
"learning_rate": 3.770609392299285e-08,
"loss": 0.3171,
"step": 2330
},
{
"epoch": 0.9668737060041408,
"grad_norm": 2.5720410881498315,
"learning_rate": 3.340535488880525e-08,
"loss": 0.314,
"step": 2335
},
{
"epoch": 0.968944099378882,
"grad_norm": 2.356098570899947,
"learning_rate": 2.9364139615550846e-08,
"loss": 0.3251,
"step": 2340
},
{
"epoch": 0.9710144927536232,
"grad_norm": 2.2272834829974744,
"learning_rate": 2.5582659272341827e-08,
"loss": 0.3289,
"step": 2345
},
{
"epoch": 0.9730848861283644,
"grad_norm": 2.3546167612977658,
"learning_rate": 2.206111145613865e-08,
"loss": 0.3207,
"step": 2350
},
{
"epoch": 0.9751552795031055,
"grad_norm": 2.329623013425101,
"learning_rate": 1.8799680181421087e-08,
"loss": 0.3166,
"step": 2355
},
{
"epoch": 0.9772256728778468,
"grad_norm": 2.5617811367692394,
"learning_rate": 1.5798535870575893e-08,
"loss": 0.332,
"step": 2360
},
{
"epoch": 0.979296066252588,
"grad_norm": 2.353072654396703,
"learning_rate": 1.30578353449895e-08,
"loss": 0.3072,
"step": 2365
},
{
"epoch": 0.9813664596273292,
"grad_norm": 2.3494212436548008,
"learning_rate": 1.0577721816854015e-08,
"loss": 0.3221,
"step": 2370
},
{
"epoch": 0.9834368530020704,
"grad_norm": 2.4667761817578215,
"learning_rate": 8.35832488168431e-09,
"loss": 0.311,
"step": 2375
},
{
"epoch": 0.9855072463768116,
"grad_norm": 2.481882143231274,
"learning_rate": 6.399760511546227e-09,
"loss": 0.3177,
"step": 2380
},
{
"epoch": 0.9875776397515528,
"grad_norm": 2.4656898908254767,
"learning_rate": 4.7021310489953065e-09,
"loss": 0.3177,
"step": 2385
},
{
"epoch": 0.989648033126294,
"grad_norm": 2.5258649806925844,
"learning_rate": 3.265525201731623e-09,
"loss": 0.318,
"step": 2390
},
{
"epoch": 0.9917184265010351,
"grad_norm": 2.3678389174709706,
"learning_rate": 2.090018037960717e-09,
"loss": 0.3084,
"step": 2395
},
{
"epoch": 0.9937888198757764,
"grad_norm": 2.3049156249564833,
"learning_rate": 1.1756709824750633e-09,
"loss": 0.3158,
"step": 2400
},
{
"epoch": 0.9958592132505176,
"grad_norm": 2.4720909894954617,
"learning_rate": 5.225318134399749e-10,
"loss": 0.3234,
"step": 2405
},
{
"epoch": 0.9979296066252588,
"grad_norm": 2.4525006690815316,
"learning_rate": 1.306346599011521e-10,
"loss": 0.3127,
"step": 2410
},
{
"epoch": 1.0,
"grad_norm": 2.362376855252616,
"learning_rate": 0.0,
"loss": 0.3158,
"step": 2415
},
{
"epoch": 1.0,
"eval_runtime": 2.7767,
"eval_samples_per_second": 3.601,
"eval_steps_per_second": 1.08,
"step": 2415
},
{
"epoch": 1.0,
"step": 2415,
"total_flos": 252825934233600.0,
"train_loss": 0.5152654577239453,
"train_runtime": 25868.7646,
"train_samples_per_second": 1.493,
"train_steps_per_second": 0.093
}
],
"logging_steps": 5,
"max_steps": 2415,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 252825934233600.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}