{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.997726239199636,
"eval_steps": 500,
"global_step": 3708,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008084482845737962,
"grad_norm": 10.217898134986712,
"learning_rate": 1.7857142857142858e-07,
"loss": 1.7929,
"step": 1
},
{
"epoch": 0.008084482845737961,
"grad_norm": 8.447331849071613,
"learning_rate": 1.7857142857142859e-06,
"loss": 1.8062,
"step": 10
},
{
"epoch": 0.016168965691475922,
"grad_norm": 3.3131831113560053,
"learning_rate": 3.5714285714285718e-06,
"loss": 1.6026,
"step": 20
},
{
"epoch": 0.024253448537213885,
"grad_norm": 1.9123288015392683,
"learning_rate": 5.357142857142857e-06,
"loss": 1.5044,
"step": 30
},
{
"epoch": 0.032337931382951844,
"grad_norm": 1.5061470360359468,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.46,
"step": 40
},
{
"epoch": 0.04042241422868981,
"grad_norm": 1.3730571252951111,
"learning_rate": 8.92857142857143e-06,
"loss": 1.4242,
"step": 50
},
{
"epoch": 0.04850689707442777,
"grad_norm": 1.3696793978053872,
"learning_rate": 1.0714285714285714e-05,
"loss": 1.3962,
"step": 60
},
{
"epoch": 0.05659137992016573,
"grad_norm": 1.3254100387294059,
"learning_rate": 1.25e-05,
"loss": 1.4019,
"step": 70
},
{
"epoch": 0.06467586276590369,
"grad_norm": 1.300617283886077,
"learning_rate": 1.4285714285714287e-05,
"loss": 1.38,
"step": 80
},
{
"epoch": 0.07276034561164166,
"grad_norm": 1.2890020177179862,
"learning_rate": 1.6071428571428572e-05,
"loss": 1.3692,
"step": 90
},
{
"epoch": 0.08084482845737961,
"grad_norm": 1.3617464980600442,
"learning_rate": 1.785714285714286e-05,
"loss": 1.3717,
"step": 100
},
{
"epoch": 0.08892931130311758,
"grad_norm": 1.3364658238171299,
"learning_rate": 1.9642857142857145e-05,
"loss": 1.3445,
"step": 110
},
{
"epoch": 0.09701379414885554,
"grad_norm": 1.406654933905743,
"learning_rate": 1.999975576461237e-05,
"loss": 1.3724,
"step": 120
},
{
"epoch": 0.1050982769945935,
"grad_norm": 1.4611284363694028,
"learning_rate": 1.999876357879684e-05,
"loss": 1.3446,
"step": 130
},
{
"epoch": 0.11318275984033146,
"grad_norm": 1.3711888479566774,
"learning_rate": 1.9997008253510416e-05,
"loss": 1.3515,
"step": 140
},
{
"epoch": 0.12126724268606942,
"grad_norm": 1.409811295994504,
"learning_rate": 1.9994489922725454e-05,
"loss": 1.342,
"step": 150
},
{
"epoch": 0.12935172553180738,
"grad_norm": 1.3918093353079757,
"learning_rate": 1.9991208778649485e-05,
"loss": 1.3493,
"step": 160
},
{
"epoch": 0.13743620837754536,
"grad_norm": 1.3597096387911873,
"learning_rate": 1.998716507171053e-05,
"loss": 1.3186,
"step": 170
},
{
"epoch": 0.14552069122328332,
"grad_norm": 1.3532852407233782,
"learning_rate": 1.998235911053798e-05,
"loss": 1.3426,
"step": 180
},
{
"epoch": 0.15360517406902127,
"grad_norm": 1.395929355438092,
"learning_rate": 1.9976791261939064e-05,
"loss": 1.338,
"step": 190
},
{
"epoch": 0.16168965691475923,
"grad_norm": 1.3346897807354206,
"learning_rate": 1.997046195087082e-05,
"loss": 1.3209,
"step": 200
},
{
"epoch": 0.16977413976049718,
"grad_norm": 1.3501622187451188,
"learning_rate": 1.996337166040769e-05,
"loss": 1.3279,
"step": 210
},
{
"epoch": 0.17785862260623517,
"grad_norm": 1.262079121683393,
"learning_rate": 1.995552093170463e-05,
"loss": 1.3135,
"step": 220
},
{
"epoch": 0.18594310545197312,
"grad_norm": 1.324883144834464,
"learning_rate": 1.994691036395583e-05,
"loss": 1.306,
"step": 230
},
{
"epoch": 0.19402758829771108,
"grad_norm": 1.373867783740766,
"learning_rate": 1.9937540614348944e-05,
"loss": 1.3018,
"step": 240
},
{
"epoch": 0.20211207114344903,
"grad_norm": 1.4020861161362783,
"learning_rate": 1.992741239801498e-05,
"loss": 1.3203,
"step": 250
},
{
"epoch": 0.210196553989187,
"grad_norm": 1.3484650757297245,
"learning_rate": 1.9916526487973678e-05,
"loss": 1.2939,
"step": 260
},
{
"epoch": 0.21828103683492497,
"grad_norm": 1.3330965331306333,
"learning_rate": 1.9904883715074525e-05,
"loss": 1.2795,
"step": 270
},
{
"epoch": 0.22636551968066293,
"grad_norm": 1.3917589397233552,
"learning_rate": 1.989248496793335e-05,
"loss": 1.269,
"step": 280
},
{
"epoch": 0.23445000252640089,
"grad_norm": 1.3905412148367542,
"learning_rate": 1.9879331192864492e-05,
"loss": 1.286,
"step": 290
},
{
"epoch": 0.24253448537213884,
"grad_norm": 1.4569325197708967,
"learning_rate": 1.9865423393808573e-05,
"loss": 1.2944,
"step": 300
},
{
"epoch": 0.2506189682178768,
"grad_norm": 1.3399495909594208,
"learning_rate": 1.985076263225588e-05,
"loss": 1.3106,
"step": 310
},
{
"epoch": 0.25870345106361475,
"grad_norm": 1.478802579336813,
"learning_rate": 1.9835350027165342e-05,
"loss": 1.2994,
"step": 320
},
{
"epoch": 0.26678793390935274,
"grad_norm": 1.3105244819439577,
"learning_rate": 1.9819186754879137e-05,
"loss": 1.2871,
"step": 330
},
{
"epoch": 0.2748724167550907,
"grad_norm": 1.3667119896120177,
"learning_rate": 1.9802274049032898e-05,
"loss": 1.2893,
"step": 340
},
{
"epoch": 0.28295689960082865,
"grad_norm": 1.5054526064910085,
"learning_rate": 1.9784613200461568e-05,
"loss": 1.2912,
"step": 350
},
{
"epoch": 0.29104138244656663,
"grad_norm": 1.3163243486049039,
"learning_rate": 1.976620555710087e-05,
"loss": 1.2761,
"step": 360
},
{
"epoch": 0.29912586529230456,
"grad_norm": 1.322920539242633,
"learning_rate": 1.9747052523884435e-05,
"loss": 1.2572,
"step": 370
},
{
"epoch": 0.30721034813804254,
"grad_norm": 1.3954468357326724,
"learning_rate": 1.972715556263657e-05,
"loss": 1.2745,
"step": 380
},
{
"epoch": 0.3152948309837805,
"grad_norm": 1.3451929159695755,
"learning_rate": 1.9706516191960687e-05,
"loss": 1.2472,
"step": 390
},
{
"epoch": 0.32337931382951846,
"grad_norm": 1.2765565775996142,
"learning_rate": 1.9685135987123396e-05,
"loss": 1.255,
"step": 400
},
{
"epoch": 0.33146379667525644,
"grad_norm": 1.4632541877317655,
"learning_rate": 1.966301657993428e-05,
"loss": 1.2565,
"step": 410
},
{
"epoch": 0.33954827952099437,
"grad_norm": 1.3554436136314076,
"learning_rate": 1.9640159658621344e-05,
"loss": 1.2593,
"step": 420
},
{
"epoch": 0.34763276236673235,
"grad_norm": 1.3154961346767526,
"learning_rate": 1.9616566967702164e-05,
"loss": 1.2604,
"step": 430
},
{
"epoch": 0.35571724521247033,
"grad_norm": 1.3833700211512812,
"learning_rate": 1.9592240307850748e-05,
"loss": 1.2625,
"step": 440
},
{
"epoch": 0.36380172805820826,
"grad_norm": 1.2812641775550833,
"learning_rate": 1.95671815357601e-05,
"loss": 1.2661,
"step": 450
},
{
"epoch": 0.37188621090394625,
"grad_norm": 1.3509908047727408,
"learning_rate": 1.954139256400049e-05,
"loss": 1.2448,
"step": 460
},
{
"epoch": 0.3799706937496842,
"grad_norm": 1.356891388574271,
"learning_rate": 1.951487536087352e-05,
"loss": 1.2551,
"step": 470
},
{
"epoch": 0.38805517659542216,
"grad_norm": 1.2921423460134738,
"learning_rate": 1.948763195026186e-05,
"loss": 1.2503,
"step": 480
},
{
"epoch": 0.39613965944116014,
"grad_norm": 1.3494188641362486,
"learning_rate": 1.9459664411474793e-05,
"loss": 1.2509,
"step": 490
},
{
"epoch": 0.40422414228689807,
"grad_norm": 1.336605272931222,
"learning_rate": 1.9430974879089522e-05,
"loss": 1.251,
"step": 500
},
{
"epoch": 0.41230862513263605,
"grad_norm": 1.3167568815144604,
"learning_rate": 1.9401565542788238e-05,
"loss": 1.2341,
"step": 510
},
{
"epoch": 0.420393107978374,
"grad_norm": 1.3704112316871029,
"learning_rate": 1.9371438647191007e-05,
"loss": 1.2483,
"step": 520
},
{
"epoch": 0.42847759082411196,
"grad_norm": 1.2971253486447214,
"learning_rate": 1.9340596491684443e-05,
"loss": 1.2483,
"step": 530
},
{
"epoch": 0.43656207366984995,
"grad_norm": 1.278671915851734,
"learning_rate": 1.9309041430246228e-05,
"loss": 1.247,
"step": 540
},
{
"epoch": 0.4446465565155879,
"grad_norm": 1.7143062654632688,
"learning_rate": 1.927677587126542e-05,
"loss": 1.2582,
"step": 550
},
{
"epoch": 0.45273103936132586,
"grad_norm": 3.071400396021207,
"learning_rate": 1.924380227735867e-05,
"loss": 1.2369,
"step": 560
},
{
"epoch": 0.46081552220706384,
"grad_norm": 1.3043426791795303,
"learning_rate": 1.921012316518224e-05,
"loss": 1.2564,
"step": 570
},
{
"epoch": 0.46890000505280177,
"grad_norm": 1.4606599266501914,
"learning_rate": 1.917574110523994e-05,
"loss": 1.2455,
"step": 580
},
{
"epoch": 0.47698448789853976,
"grad_norm": 1.4084512281248918,
"learning_rate": 1.914065872168692e-05,
"loss": 1.237,
"step": 590
},
{
"epoch": 0.4850689707442777,
"grad_norm": 1.7442614101619536,
"learning_rate": 1.910487869212942e-05,
"loss": 1.2428,
"step": 600
},
{
"epoch": 0.49315345359001567,
"grad_norm": 3.217405569169141,
"learning_rate": 1.9068403747420365e-05,
"loss": 1.2406,
"step": 610
},
{
"epoch": 0.5012379364357537,
"grad_norm": 2.028319411000034,
"learning_rate": 1.9031236671450963e-05,
"loss": 1.2295,
"step": 620
},
{
"epoch": 0.5093224192814916,
"grad_norm": 1.3534016099705948,
"learning_rate": 1.899338030093822e-05,
"loss": 1.2287,
"step": 630
},
{
"epoch": 0.5174069021272295,
"grad_norm": 16.560363141349324,
"learning_rate": 1.8954837525208432e-05,
"loss": 1.2239,
"step": 640
},
{
"epoch": 0.5254913849729675,
"grad_norm": 1.6507428977016882,
"learning_rate": 1.8915611285976672e-05,
"loss": 1.2122,
"step": 650
},
{
"epoch": 0.5335758678187055,
"grad_norm": 1.4599892786481696,
"learning_rate": 1.887570457712225e-05,
"loss": 1.2448,
"step": 660
},
{
"epoch": 0.5416603506644434,
"grad_norm": 1.3576328654723246,
"learning_rate": 1.883512044446023e-05,
"loss": 1.235,
"step": 670
},
{
"epoch": 0.5497448335101814,
"grad_norm": 2.7275818350144383,
"learning_rate": 1.879386198550895e-05,
"loss": 1.2302,
"step": 680
},
{
"epoch": 0.5578293163559194,
"grad_norm": 1.4783688191374078,
"learning_rate": 1.8751932349253595e-05,
"loss": 1.2183,
"step": 690
},
{
"epoch": 0.5659137992016573,
"grad_norm": 1.3696848099680126,
"learning_rate": 1.8709334735905908e-05,
"loss": 1.2202,
"step": 700
},
{
"epoch": 0.5739982820473952,
"grad_norm": 1.3843064222445587,
"learning_rate": 1.866607239665988e-05,
"loss": 1.2292,
"step": 710
},
{
"epoch": 0.5820827648931333,
"grad_norm": 1.3013446345815274,
"learning_rate": 1.8622148633443626e-05,
"loss": 1.2404,
"step": 720
},
{
"epoch": 0.5901672477388712,
"grad_norm": 1.3389494076775972,
"learning_rate": 1.8577566798667397e-05,
"loss": 1.2,
"step": 730
},
{
"epoch": 0.5982517305846091,
"grad_norm": 1.2803553653933784,
"learning_rate": 1.8532330294967678e-05,
"loss": 1.2019,
"step": 740
},
{
"epoch": 0.6063362134303472,
"grad_norm": 1.3940783442430897,
"learning_rate": 1.848644257494751e-05,
"loss": 1.2111,
"step": 750
},
{
"epoch": 0.6144206962760851,
"grad_norm": 1.2967372912925752,
"learning_rate": 1.8439907140912962e-05,
"loss": 1.2044,
"step": 760
},
{
"epoch": 0.622505179121823,
"grad_norm": 1.307050777866234,
"learning_rate": 1.839272754460583e-05,
"loss": 1.211,
"step": 770
},
{
"epoch": 0.630589661967561,
"grad_norm": 1.7851865803650349,
"learning_rate": 1.8344907386932552e-05,
"loss": 1.2038,
"step": 780
},
{
"epoch": 0.638674144813299,
"grad_norm": 1.8614266164299924,
"learning_rate": 1.8296450317689377e-05,
"loss": 1.2054,
"step": 790
},
{
"epoch": 0.6467586276590369,
"grad_norm": 1.3262638540650757,
"learning_rate": 1.824736003528381e-05,
"loss": 1.209,
"step": 800
},
{
"epoch": 0.654843110504775,
"grad_norm": 1.290793353111858,
"learning_rate": 1.8197640286452312e-05,
"loss": 1.213,
"step": 810
},
{
"epoch": 0.6629275933505129,
"grad_norm": 1.2558226934999566,
"learning_rate": 1.814729486597436e-05,
"loss": 1.2266,
"step": 820
},
{
"epoch": 0.6710120761962508,
"grad_norm": 1.277465841944589,
"learning_rate": 1.8096327616382815e-05,
"loss": 1.2167,
"step": 830
},
{
"epoch": 0.6790965590419887,
"grad_norm": 1.298887855615747,
"learning_rate": 1.8044742427670627e-05,
"loss": 1.2226,
"step": 840
},
{
"epoch": 0.6871810418877268,
"grad_norm": 5.857168222574854,
"learning_rate": 1.7992543236993952e-05,
"loss": 1.2027,
"step": 850
},
{
"epoch": 0.6952655247334647,
"grad_norm": 1.3361306728189393,
"learning_rate": 1.7939734028371663e-05,
"loss": 1.207,
"step": 860
},
{
"epoch": 0.7033500075792026,
"grad_norm": 1.3969769044659528,
"learning_rate": 1.7886318832381264e-05,
"loss": 1.1799,
"step": 870
},
{
"epoch": 0.7114344904249407,
"grad_norm": 1.4266930108547686,
"learning_rate": 1.783230172585126e-05,
"loss": 1.2111,
"step": 880
},
{
"epoch": 0.7195189732706786,
"grad_norm": 1.3440902999919684,
"learning_rate": 1.7777686831550008e-05,
"loss": 1.1854,
"step": 890
},
{
"epoch": 0.7276034561164165,
"grad_norm": 1.251718689797153,
"learning_rate": 1.7722478317871053e-05,
"loss": 1.1803,
"step": 900
},
{
"epoch": 0.7356879389621546,
"grad_norm": 1.2756808323680056,
"learning_rate": 1.7666680398514978e-05,
"loss": 1.2148,
"step": 910
},
{
"epoch": 0.7437724218078925,
"grad_norm": 1.3774590120848857,
"learning_rate": 1.76102973321678e-05,
"loss": 1.189,
"step": 920
},
{
"epoch": 0.7518569046536304,
"grad_norm": 1.5207360711907143,
"learning_rate": 1.7553333422175933e-05,
"loss": 1.1819,
"step": 930
},
{
"epoch": 0.7599413874993683,
"grad_norm": 1.302009300658742,
"learning_rate": 1.7495793016217754e-05,
"loss": 1.191,
"step": 940
},
{
"epoch": 0.7680258703451064,
"grad_norm": 1.3859954985668783,
"learning_rate": 1.743768050597175e-05,
"loss": 1.1835,
"step": 950
},
{
"epoch": 0.7761103531908443,
"grad_norm": 1.3435502591474426,
"learning_rate": 1.7379000326781348e-05,
"loss": 1.2035,
"step": 960
},
{
"epoch": 0.7841948360365822,
"grad_norm": 1.38981939520544,
"learning_rate": 1.7319756957316392e-05,
"loss": 1.1887,
"step": 970
},
{
"epoch": 0.7922793188823203,
"grad_norm": 1.4015519572670776,
"learning_rate": 1.725995491923131e-05,
"loss": 1.1843,
"step": 980
},
{
"epoch": 0.8003638017280582,
"grad_norm": 1.4763071143801054,
"learning_rate": 1.7199598776820013e-05,
"loss": 1.1753,
"step": 990
},
{
"epoch": 0.8084482845737961,
"grad_norm": 1.3577477544239007,
"learning_rate": 1.713869313666753e-05,
"loss": 1.1966,
"step": 1000
},
{
"epoch": 0.8165327674195342,
"grad_norm": 1.3963231420568032,
"learning_rate": 1.7077242647298405e-05,
"loss": 1.1985,
"step": 1010
},
{
"epoch": 0.8246172502652721,
"grad_norm": 1.5498623314696613,
"learning_rate": 1.7015251998821938e-05,
"loss": 1.1785,
"step": 1020
},
{
"epoch": 0.83270173311101,
"grad_norm": 1.3586468512222978,
"learning_rate": 1.6952725922574188e-05,
"loss": 1.1648,
"step": 1030
},
{
"epoch": 0.840786215956748,
"grad_norm": 1.4300342736321576,
"learning_rate": 1.688966919075687e-05,
"loss": 1.1666,
"step": 1040
},
{
"epoch": 0.848870698802486,
"grad_norm": 1.5788283624417567,
"learning_rate": 1.682608661607313e-05,
"loss": 1.1821,
"step": 1050
},
{
"epoch": 0.8569551816482239,
"grad_norm": 1.359570582214726,
"learning_rate": 1.6761983051360232e-05,
"loss": 1.1958,
"step": 1060
},
{
"epoch": 0.8650396644939619,
"grad_norm": 1.3046392847858388,
"learning_rate": 1.6697363389219147e-05,
"loss": 1.1557,
"step": 1070
},
{
"epoch": 0.8731241473396999,
"grad_norm": 1.4677129965264875,
"learning_rate": 1.6632232561641158e-05,
"loss": 1.1593,
"step": 1080
},
{
"epoch": 0.8812086301854378,
"grad_norm": 1.4859252531152671,
"learning_rate": 1.6566595539631417e-05,
"loss": 1.1753,
"step": 1090
},
{
"epoch": 0.8892931130311758,
"grad_norm": 1.3209365154297203,
"learning_rate": 1.6500457332829553e-05,
"loss": 1.161,
"step": 1100
},
{
"epoch": 0.8973775958769138,
"grad_norm": 1.3862159117294945,
"learning_rate": 1.6433822989127314e-05,
"loss": 1.1592,
"step": 1110
},
{
"epoch": 0.9054620787226517,
"grad_norm": 1.4456179949854164,
"learning_rate": 1.636669759428329e-05,
"loss": 1.1484,
"step": 1120
},
{
"epoch": 0.9135465615683896,
"grad_norm": 1.288756152636894,
"learning_rate": 1.6299086271534764e-05,
"loss": 1.181,
"step": 1130
},
{
"epoch": 0.9216310444141277,
"grad_norm": 1.2599229391965052,
"learning_rate": 1.6230994181206674e-05,
"loss": 1.1718,
"step": 1140
},
{
"epoch": 0.9297155272598656,
"grad_norm": 1.4973902946133841,
"learning_rate": 1.6162426520317765e-05,
"loss": 1.1773,
"step": 1150
},
{
"epoch": 0.9378000101056035,
"grad_norm": 1.3698767908727083,
"learning_rate": 1.6093388522183948e-05,
"loss": 1.1666,
"step": 1160
},
{
"epoch": 0.9458844929513415,
"grad_norm": 1.386433062647111,
"learning_rate": 1.6023885456018852e-05,
"loss": 1.1859,
"step": 1170
},
{
"epoch": 0.9539689757970795,
"grad_norm": 1.284904254015402,
"learning_rate": 1.595392262653168e-05,
"loss": 1.1906,
"step": 1180
},
{
"epoch": 0.9620534586428174,
"grad_norm": 1.4402131637475677,
"learning_rate": 1.5883505373522317e-05,
"loss": 1.1593,
"step": 1190
},
{
"epoch": 0.9701379414885554,
"grad_norm": 1.6049356540049453,
"learning_rate": 1.5812639071473804e-05,
"loss": 1.1636,
"step": 1200
},
{
"epoch": 0.9782224243342934,
"grad_norm": 1.505036374645861,
"learning_rate": 1.574132912914211e-05,
"loss": 1.14,
"step": 1210
},
{
"epoch": 0.9863069071800313,
"grad_norm": 1.6280895974825729,
"learning_rate": 1.566958098914334e-05,
"loss": 1.1358,
"step": 1220
},
{
"epoch": 0.9943913900257693,
"grad_norm": 1.2574161457807662,
"learning_rate": 1.5597400127538324e-05,
"loss": 1.1754,
"step": 1230
},
{
"epoch": 0.9992420797332121,
"eval_loss": 1.0555766820907593,
"eval_runtime": 476.758,
"eval_samples_per_second": 25.514,
"eval_steps_per_second": 12.757,
"step": 1236
},
{
"epoch": 1.0024758728715073,
"grad_norm": 2.9356360899500897,
"learning_rate": 1.5524792053414676e-05,
"loss": 1.1182,
"step": 1240
},
{
"epoch": 1.0105603557172451,
"grad_norm": 1.4115997260524025,
"learning_rate": 1.5451762308466302e-05,
"loss": 1.0448,
"step": 1250
},
{
"epoch": 1.0186448385629832,
"grad_norm": 1.4408354404654395,
"learning_rate": 1.5378316466570466e-05,
"loss": 1.027,
"step": 1260
},
{
"epoch": 1.0267293214087212,
"grad_norm": 1.40209737150782,
"learning_rate": 1.530446013336235e-05,
"loss": 1.0253,
"step": 1270
},
{
"epoch": 1.034813804254459,
"grad_norm": 1.4050923085204698,
"learning_rate": 1.5230198945807226e-05,
"loss": 1.0596,
"step": 1280
},
{
"epoch": 1.042898287100197,
"grad_norm": 1.3850604464116953,
"learning_rate": 1.515553857177022e-05,
"loss": 1.0354,
"step": 1290
},
{
"epoch": 1.050982769945935,
"grad_norm": 1.6192982769908866,
"learning_rate": 1.5080484709583715e-05,
"loss": 1.0338,
"step": 1300
},
{
"epoch": 1.059067252791673,
"grad_norm": 1.5443333411983042,
"learning_rate": 1.5005043087612452e-05,
"loss": 1.0224,
"step": 1310
},
{
"epoch": 1.067151735637411,
"grad_norm": 1.4795375887873081,
"learning_rate": 1.4929219463816302e-05,
"loss": 1.0273,
"step": 1320
},
{
"epoch": 1.075236218483149,
"grad_norm": 1.3952469643942318,
"learning_rate": 1.4853019625310813e-05,
"loss": 1.0165,
"step": 1330
},
{
"epoch": 1.0833207013288868,
"grad_norm": 1.4102438583126526,
"learning_rate": 1.4776449387925507e-05,
"loss": 1.0323,
"step": 1340
},
{
"epoch": 1.0914051841746248,
"grad_norm": 1.4166513317270177,
"learning_rate": 1.4699514595760006e-05,
"loss": 1.0343,
"step": 1350
},
{
"epoch": 1.0994896670203629,
"grad_norm": 1.4572773218335806,
"learning_rate": 1.4622221120737985e-05,
"loss": 1.0449,
"step": 1360
},
{
"epoch": 1.1075741498661007,
"grad_norm": 1.4277575864922984,
"learning_rate": 1.4544574862159013e-05,
"loss": 1.0157,
"step": 1370
},
{
"epoch": 1.1156586327118387,
"grad_norm": 1.8246683293221693,
"learning_rate": 1.446658174624829e-05,
"loss": 1.037,
"step": 1380
},
{
"epoch": 1.1237431155575768,
"grad_norm": 1.4515508954548648,
"learning_rate": 1.4388247725704338e-05,
"loss": 1.0163,
"step": 1390
},
{
"epoch": 1.1318275984033146,
"grad_norm": 1.4472625641065484,
"learning_rate": 1.4309578779244678e-05,
"loss": 1.0339,
"step": 1400
},
{
"epoch": 1.1399120812490526,
"grad_norm": 1.441284439472294,
"learning_rate": 1.423058091114951e-05,
"loss": 1.0153,
"step": 1410
},
{
"epoch": 1.1479965640947905,
"grad_norm": 1.4505444065925723,
"learning_rate": 1.4151260150803445e-05,
"loss": 1.0413,
"step": 1420
},
{
"epoch": 1.1560810469405285,
"grad_norm": 1.5566575848024742,
"learning_rate": 1.4071622552235327e-05,
"loss": 1.014,
"step": 1430
},
{
"epoch": 1.1641655297862665,
"grad_norm": 1.476527456836737,
"learning_rate": 1.399167419365616e-05,
"loss": 1.0374,
"step": 1440
},
{
"epoch": 1.1722500126320043,
"grad_norm": 1.7587555981022083,
"learning_rate": 1.3911421176995206e-05,
"loss": 1.0145,
"step": 1450
},
{
"epoch": 1.1803344954777424,
"grad_norm": 1.5447530212974045,
"learning_rate": 1.3830869627434267e-05,
"loss": 1.0104,
"step": 1460
},
{
"epoch": 1.1884189783234804,
"grad_norm": 1.368002967716879,
"learning_rate": 1.3750025692940174e-05,
"loss": 1.0102,
"step": 1470
},
{
"epoch": 1.1965034611692182,
"grad_norm": 1.5132346329088506,
"learning_rate": 1.3668895543795581e-05,
"loss": 1.0241,
"step": 1480
},
{
"epoch": 1.2045879440149563,
"grad_norm": 1.4535090384504317,
"learning_rate": 1.3587485372128e-05,
"loss": 1.01,
"step": 1490
},
{
"epoch": 1.2126724268606943,
"grad_norm": 1.6349536867702466,
"learning_rate": 1.3505801391437215e-05,
"loss": 1.0538,
"step": 1500
},
{
"epoch": 1.2207569097064321,
"grad_norm": 1.608679365926187,
"learning_rate": 1.3423849836121043e-05,
"loss": 1.0256,
"step": 1510
},
{
"epoch": 1.2288413925521702,
"grad_norm": 1.4875509565909706,
"learning_rate": 1.33416369609995e-05,
"loss": 1.0365,
"step": 1520
},
{
"epoch": 1.2369258753979082,
"grad_norm": 1.4161399144655036,
"learning_rate": 1.325916904083741e-05,
"loss": 1.0285,
"step": 1530
},
{
"epoch": 1.245010358243646,
"grad_norm": 1.516547180031239,
"learning_rate": 1.3176452369865504e-05,
"loss": 0.9972,
"step": 1540
},
{
"epoch": 1.253094841089384,
"grad_norm": 1.4500310981963098,
"learning_rate": 1.3093493261300012e-05,
"loss": 1.0122,
"step": 1550
},
{
"epoch": 1.261179323935122,
"grad_norm": 1.3787551364346502,
"learning_rate": 1.3010298046860821e-05,
"loss": 1.0221,
"step": 1560
},
{
"epoch": 1.26926380678086,
"grad_norm": 1.3579456863416077,
"learning_rate": 1.2926873076288222e-05,
"loss": 1.0213,
"step": 1570
},
{
"epoch": 1.277348289626598,
"grad_norm": 1.4774509503134268,
"learning_rate": 1.2843224716858271e-05,
"loss": 1.012,
"step": 1580
},
{
"epoch": 1.285432772472336,
"grad_norm": 1.4805342986177266,
"learning_rate": 1.2759359352896809e-05,
"loss": 1.0193,
"step": 1590
},
{
"epoch": 1.2935172553180738,
"grad_norm": 1.4527468028008124,
"learning_rate": 1.2675283385292212e-05,
"loss": 1.0431,
"step": 1600
},
{
"epoch": 1.3016017381638119,
"grad_norm": 1.5688075844044822,
"learning_rate": 1.259100323100682e-05,
"loss": 1.0226,
"step": 1610
},
{
"epoch": 1.30968622100955,
"grad_norm": 1.493324687221304,
"learning_rate": 1.2506525322587207e-05,
"loss": 0.9966,
"step": 1620
},
{
"epoch": 1.3177707038552877,
"grad_norm": 1.563824009098089,
"learning_rate": 1.2421856107673205e-05,
"loss": 1.0317,
"step": 1630
},
{
"epoch": 1.3258551867010258,
"grad_norm": 1.4698666764020467,
"learning_rate": 1.233700204850581e-05,
"loss": 1.0013,
"step": 1640
},
{
"epoch": 1.3339396695467638,
"grad_norm": 1.625463847709757,
"learning_rate": 1.2251969621433947e-05,
"loss": 1.0233,
"step": 1650
},
{
"epoch": 1.3420241523925016,
"grad_norm": 1.560576858468798,
"learning_rate": 1.2166765316420195e-05,
"loss": 1.0137,
"step": 1660
},
{
"epoch": 1.3501086352382397,
"grad_norm": 1.6305115869655395,
"learning_rate": 1.2081395636545432e-05,
"loss": 1.0074,
"step": 1670
},
{
"epoch": 1.3581931180839777,
"grad_norm": 1.683367869903662,
"learning_rate": 1.1995867097512504e-05,
"loss": 1.0202,
"step": 1680
},
{
"epoch": 1.3662776009297155,
"grad_norm": 1.342629975477622,
"learning_rate": 1.191018622714893e-05,
"loss": 1.0039,
"step": 1690
},
{
"epoch": 1.3743620837754535,
"grad_norm": 1.4162506108365653,
"learning_rate": 1.1824359564908667e-05,
"loss": 1.0303,
"step": 1700
},
{
"epoch": 1.3824465666211916,
"grad_norm": 1.4322509952288762,
"learning_rate": 1.1738393661373004e-05,
"loss": 1.0223,
"step": 1710
},
{
"epoch": 1.3905310494669294,
"grad_norm": 1.4429525488762647,
"learning_rate": 1.1652295077750599e-05,
"loss": 1.0079,
"step": 1720
},
{
"epoch": 1.3986155323126674,
"grad_norm": 1.5044521870868257,
"learning_rate": 1.1566070385376705e-05,
"loss": 0.9903,
"step": 1730
},
{
"epoch": 1.4067000151584053,
"grad_norm": 1.4591518605463256,
"learning_rate": 1.1479726165211609e-05,
"loss": 1.0133,
"step": 1740
},
{
"epoch": 1.4147844980041433,
"grad_norm": 1.38699009818023,
"learning_rate": 1.1393269007338375e-05,
"loss": 1.0191,
"step": 1750
},
{
"epoch": 1.4228689808498813,
"grad_norm": 1.4248174199771946,
"learning_rate": 1.1306705510459852e-05,
"loss": 1.0048,
"step": 1760
},
{
"epoch": 1.4309534636956192,
"grad_norm": 1.5368128288739022,
"learning_rate": 1.1220042281395042e-05,
"loss": 1.0169,
"step": 1770
},
{
"epoch": 1.4390379465413572,
"grad_norm": 1.620365193180215,
"learning_rate": 1.1133285934574849e-05,
"loss": 0.9982,
"step": 1780
},
{
"epoch": 1.447122429387095,
"grad_norm": 1.4821421519804139,
"learning_rate": 1.1046443091537232e-05,
"loss": 1.0241,
"step": 1790
},
{
"epoch": 1.455206912232833,
"grad_norm": 1.5012997646705204,
"learning_rate": 1.0959520380421831e-05,
"loss": 1.0116,
"step": 1800
},
{
"epoch": 1.463291395078571,
"grad_norm": 1.4878335919543981,
"learning_rate": 1.0872524435464104e-05,
"loss": 0.9993,
"step": 1810
},
{
"epoch": 1.471375877924309,
"grad_norm": 1.3918759318142178,
"learning_rate": 1.0785461896488947e-05,
"loss": 1.0103,
"step": 1820
},
{
"epoch": 1.479460360770047,
"grad_norm": 1.7724767013914755,
"learning_rate": 1.0698339408403944e-05,
"loss": 0.9862,
"step": 1830
},
{
"epoch": 1.487544843615785,
"grad_norm": 2.0093844914876717,
"learning_rate": 1.06111636206922e-05,
"loss": 1.0039,
"step": 1840
},
{
"epoch": 1.4956293264615228,
"grad_norm": 1.4440349729006745,
"learning_rate": 1.0523941186904823e-05,
"loss": 1.0091,
"step": 1850
},
{
"epoch": 1.5037138093072608,
"grad_norm": 1.5530469064140777,
"learning_rate": 1.043667876415311e-05,
"loss": 0.9959,
"step": 1860
},
{
"epoch": 1.5117982921529989,
"grad_norm": 1.9710010624543786,
"learning_rate": 1.0349383012600448e-05,
"loss": 0.9902,
"step": 1870
},
{
"epoch": 1.5198827749987367,
"grad_norm": 1.4874119470603941,
"learning_rate": 1.0262060594954e-05,
"loss": 0.9889,
"step": 1880
},
{
"epoch": 1.5279672578444747,
"grad_norm": 1.5760932908781828,
"learning_rate": 1.0174718175956164e-05,
"loss": 0.997,
"step": 1890
},
{
"epoch": 1.5360517406902128,
"grad_norm": 1.5140336706570001,
"learning_rate": 1.0087362421875912e-05,
"loss": 1.0162,
"step": 1900
},
{
"epoch": 1.5441362235359506,
"grad_norm": 1.4275012742483075,
"learning_rate": 1e-05,
"loss": 1.0056,
"step": 1910
},
{
"epoch": 1.5522207063816886,
"grad_norm": 1.4479646715349155,
"learning_rate": 9.912637578124092e-06,
"loss": 0.9831,
"step": 1920
},
{
"epoch": 1.5603051892274267,
"grad_norm": 1.6529106306573094,
"learning_rate": 9.825281824043838e-06,
"loss": 1.0009,
"step": 1930
},
{
"epoch": 1.5683896720731645,
"grad_norm": 1.4537655155385498,
"learning_rate": 9.737939405046002e-06,
"loss": 1.0058,
"step": 1940
},
{
"epoch": 1.5764741549189025,
"grad_norm": 1.3881828231981752,
"learning_rate": 9.650616987399553e-06,
"loss": 0.9752,
"step": 1950
},
{
"epoch": 1.5845586377646406,
"grad_norm": 1.4410127433172688,
"learning_rate": 9.563321235846894e-06,
"loss": 1.0026,
"step": 1960
},
{
"epoch": 1.5926431206103784,
"grad_norm": 1.6585729752037028,
"learning_rate": 9.476058813095182e-06,
"loss": 0.9942,
"step": 1970
},
{
"epoch": 1.6007276034561164,
"grad_norm": 1.6572316797520206,
"learning_rate": 9.388836379307802e-06,
"loss": 0.9968,
"step": 1980
},
{
"epoch": 1.6088120863018545,
"grad_norm": 1.451151024162774,
"learning_rate": 9.301660591596059e-06,
"loss": 0.9921,
"step": 1990
},
{
"epoch": 1.6168965691475923,
"grad_norm": 1.5042478185497792,
"learning_rate": 9.214538103511053e-06,
"loss": 0.9959,
"step": 2000
},
{
"epoch": 1.6249810519933303,
"grad_norm": 1.4096442655309245,
"learning_rate": 9.127475564535898e-06,
"loss": 0.9944,
"step": 2010
},
{
"epoch": 1.6330655348390684,
"grad_norm": 1.3701103693221475,
"learning_rate": 9.04047961957817e-06,
"loss": 0.9806,
"step": 2020
},
{
"epoch": 1.6411500176848062,
"grad_norm": 1.6771886101217564,
"learning_rate": 8.953556908462773e-06,
"loss": 0.9986,
"step": 2030
},
{
"epoch": 1.6492345005305442,
"grad_norm": 1.4606744478213272,
"learning_rate": 8.866714065425154e-06,
"loss": 0.9894,
"step": 2040
},
{
"epoch": 1.6573189833762823,
"grad_norm": 1.5696191298486186,
"learning_rate": 8.779957718604956e-06,
"loss": 1.0055,
"step": 2050
},
{
"epoch": 1.66540346622202,
"grad_norm": 1.4621439613400917,
"learning_rate": 8.693294489540151e-06,
"loss": 1.0055,
"step": 2060
},
{
"epoch": 1.673487949067758,
"grad_norm": 1.4224764910826249,
"learning_rate": 8.60673099266163e-06,
"loss": 0.9687,
"step": 2070
},
{
"epoch": 1.6815724319134961,
"grad_norm": 1.6938323822086323,
"learning_rate": 8.520273834788395e-06,
"loss": 0.978,
"step": 2080
},
{
"epoch": 1.689656914759234,
"grad_norm": 1.5856717495753165,
"learning_rate": 8.4339296146233e-06,
"loss": 0.992,
"step": 2090
},
{
"epoch": 1.697741397604972,
"grad_norm": 1.4737528022353619,
"learning_rate": 8.3477049222494e-06,
"loss": 0.9882,
"step": 2100
},
{
"epoch": 1.70582588045071,
"grad_norm": 1.4413576604331515,
"learning_rate": 8.261606338626998e-06,
"loss": 0.9717,
"step": 2110
},
{
"epoch": 1.7139103632964479,
"grad_norm": 1.4533604100239785,
"learning_rate": 8.17564043509134e-06,
"loss": 0.9878,
"step": 2120
},
{
"epoch": 1.7219948461421857,
"grad_norm": 1.4996211527080612,
"learning_rate": 8.089813772851073e-06,
"loss": 0.9932,
"step": 2130
},
{
"epoch": 1.730079328987924,
"grad_norm": 1.4183735479797297,
"learning_rate": 8.004132902487499e-06,
"loss": 1.0021,
"step": 2140
},
{
"epoch": 1.7381638118336618,
"grad_norm": 1.4020103234354604,
"learning_rate": 7.91860436345457e-06,
"loss": 0.9717,
"step": 2150
},
{
"epoch": 1.7462482946793996,
"grad_norm": 1.4529101522297827,
"learning_rate": 7.833234683579806e-06,
"loss": 0.9844,
"step": 2160
},
{
"epoch": 1.7543327775251378,
"grad_norm": 1.4502465958251158,
"learning_rate": 7.748030378566056e-06,
"loss": 0.9782,
"step": 2170
},
{
"epoch": 1.7624172603708756,
"grad_norm": 1.4461707858445054,
"learning_rate": 7.662997951494193e-06,
"loss": 0.9836,
"step": 2180
},
{
"epoch": 1.7705017432166135,
"grad_norm": 1.3966480403360386,
"learning_rate": 7.578143892326797e-06,
"loss": 1.0089,
"step": 2190
},
{
"epoch": 1.7785862260623517,
"grad_norm": 1.5838575969719086,
"learning_rate": 7.493474677412795e-06,
"loss": 1.0017,
"step": 2200
},
{
"epoch": 1.7866707089080895,
"grad_norm": 1.6412461821364432,
"learning_rate": 7.408996768993184e-06,
"loss": 0.9889,
"step": 2210
},
{
"epoch": 1.7947551917538274,
"grad_norm": 1.8686882471940454,
"learning_rate": 7.324716614707794e-06,
"loss": 0.9814,
"step": 2220
},
{
"epoch": 1.8028396745995656,
"grad_norm": 1.4444454657231485,
"learning_rate": 7.240640647103192e-06,
"loss": 0.9934,
"step": 2230
},
{
"epoch": 1.8109241574453034,
"grad_norm": 1.5880994051473134,
"learning_rate": 7.156775283141733e-06,
"loss": 0.9972,
"step": 2240
},
{
"epoch": 1.8190086402910413,
"grad_norm": 1.6179768250952558,
"learning_rate": 7.0731269237117775e-06,
"loss": 0.9805,
"step": 2250
},
{
"epoch": 1.8270931231367793,
"grad_norm": 1.4161571668846493,
"learning_rate": 6.989701953139181e-06,
"loss": 0.9695,
"step": 2260
},
{
"epoch": 1.8351776059825173,
"grad_norm": 1.8752619329260358,
"learning_rate": 6.906506738699994e-06,
"loss": 0.9899,
"step": 2270
},
{
"epoch": 1.8432620888282552,
"grad_norm": 1.8476640791436918,
"learning_rate": 6.823547630134497e-06,
"loss": 0.9799,
"step": 2280
},
{
"epoch": 1.8513465716739932,
"grad_norm": 1.5003229948984453,
"learning_rate": 6.740830959162592e-06,
"loss": 0.9948,
"step": 2290
},
{
"epoch": 1.8594310545197312,
"grad_norm": 1.4363919724793655,
"learning_rate": 6.658363039000501e-06,
"loss": 0.9625,
"step": 2300
},
{
"epoch": 1.867515537365469,
"grad_norm": 1.45857815520064,
"learning_rate": 6.57615016387896e-06,
"loss": 0.976,
"step": 2310
},
{
"epoch": 1.875600020211207,
"grad_norm": 1.3637017381911254,
"learning_rate": 6.4941986085627895e-06,
"loss": 0.9608,
"step": 2320
},
{
"epoch": 1.8836845030569451,
"grad_norm": 1.586134857640991,
"learning_rate": 6.412514627872003e-06,
"loss": 0.9702,
"step": 2330
},
{
"epoch": 1.891768985902683,
"grad_norm": 1.6293874205755696,
"learning_rate": 6.331104456204423e-06,
"loss": 0.9672,
"step": 2340
},
{
"epoch": 1.899853468748421,
"grad_norm": 1.6185456719315228,
"learning_rate": 6.249974307059826e-06,
"loss": 0.9683,
"step": 2350
},
{
"epoch": 1.907937951594159,
"grad_norm": 1.5897776438113254,
"learning_rate": 6.169130372565737e-06,
"loss": 0.9942,
"step": 2360
},
{
"epoch": 1.9160224344398968,
"grad_norm": 1.4621464766459995,
"learning_rate": 6.088578823004796e-06,
"loss": 0.9552,
"step": 2370
},
{
"epoch": 1.9241069172856349,
"grad_norm": 1.57419066036152,
"learning_rate": 6.008325806343842e-06,
"loss": 0.9635,
"step": 2380
},
{
"epoch": 1.932191400131373,
"grad_norm": 1.4154240767952921,
"learning_rate": 5.9283774477646775e-06,
"loss": 0.9661,
"step": 2390
},
{
"epoch": 1.9402758829771107,
"grad_norm": 1.4089774352311322,
"learning_rate": 5.848739849196556e-06,
"loss": 0.9623,
"step": 2400
},
{
"epoch": 1.9483603658228488,
"grad_norm": 1.4330997113061938,
"learning_rate": 5.7694190888504964e-06,
"loss": 0.982,
"step": 2410
},
{
"epoch": 1.9564448486685868,
"grad_norm": 1.762833270995275,
"learning_rate": 5.690421220755329e-06,
"loss": 0.968,
"step": 2420
},
{
"epoch": 1.9645293315143246,
"grad_norm": 1.57370551896378,
"learning_rate": 5.611752274295665e-06,
"loss": 0.9639,
"step": 2430
},
{
"epoch": 1.9726138143600627,
"grad_norm": 1.4682932578058885,
"learning_rate": 5.533418253751714e-06,
"loss": 0.9786,
"step": 2440
},
{
"epoch": 1.9806982972058007,
"grad_norm": 1.7633821953728437,
"learning_rate": 5.455425137840987e-06,
"loss": 0.9618,
"step": 2450
},
{
"epoch": 1.9887827800515385,
"grad_norm": 1.5018261369656176,
"learning_rate": 5.377778879262017e-06,
"loss": 0.9454,
"step": 2460
},
{
"epoch": 1.9968672628972766,
"grad_norm": 1.5404280086355402,
"learning_rate": 5.300485404239999e-06,
"loss": 0.9628,
"step": 2470
},
{
"epoch": 1.999292607750998,
"eval_loss": 0.8751075863838196,
"eval_runtime": 481.67,
"eval_samples_per_second": 25.254,
"eval_steps_per_second": 12.627,
"step": 2473
},
{
"epoch": 2.0049517457430146,
"grad_norm": 1.8577507088673693,
"learning_rate": 5.223550612074497e-06,
"loss": 0.8752,
"step": 2480
},
{
"epoch": 2.0130362285887524,
"grad_norm": 1.5570324756102374,
"learning_rate": 5.146980374689192e-06,
"loss": 0.8398,
"step": 2490
},
{
"epoch": 2.0211207114344902,
"grad_norm": 1.645225536576169,
"learning_rate": 5.070780536183698e-06,
"loss": 0.856,
"step": 2500
},
{
"epoch": 2.0292051942802285,
"grad_norm": 1.6698633554870226,
"learning_rate": 4.99495691238755e-06,
"loss": 0.8365,
"step": 2510
},
{
"epoch": 2.0372896771259663,
"grad_norm": 2.010967933907663,
"learning_rate": 4.9195152904162865e-06,
"loss": 0.8308,
"step": 2520
},
{
"epoch": 2.045374159971704,
"grad_norm": 1.4592026658551123,
"learning_rate": 4.844461428229782e-06,
"loss": 0.8387,
"step": 2530
},
{
"epoch": 2.0534586428174424,
"grad_norm": 1.9716723547932462,
"learning_rate": 4.769801054192776e-06,
"loss": 0.8374,
"step": 2540
},
{
"epoch": 2.06154312566318,
"grad_norm": 1.6334367414667887,
"learning_rate": 4.695539866637653e-06,
"loss": 0.8587,
"step": 2550
},
{
"epoch": 2.069627608508918,
"grad_norm": 1.713926689166813,
"learning_rate": 4.6216835334295385e-06,
"loss": 0.8376,
"step": 2560
},
{
"epoch": 2.0777120913546563,
"grad_norm": 1.5714175555320091,
"learning_rate": 4.548237691533699e-06,
"loss": 0.8346,
"step": 2570
},
{
"epoch": 2.085796574200394,
"grad_norm": 1.4811489223457255,
"learning_rate": 4.475207946585328e-06,
"loss": 0.8473,
"step": 2580
},
{
"epoch": 2.093881057046132,
"grad_norm": 1.4400201402098334,
"learning_rate": 4.402599872461678e-06,
"loss": 0.8309,
"step": 2590
},
{
"epoch": 2.10196553989187,
"grad_norm": 1.5527150219002093,
"learning_rate": 4.330419010856661e-06,
"loss": 0.8312,
"step": 2600
},
{
"epoch": 2.110050022737608,
"grad_norm": 1.4540137626455856,
"learning_rate": 4.258670870857894e-06,
"loss": 0.8461,
"step": 2610
},
{
"epoch": 2.118134505583346,
"grad_norm": 1.5200526871374724,
"learning_rate": 4.187360928526198e-06,
"loss": 0.8353,
"step": 2620
},
{
"epoch": 2.126218988429084,
"grad_norm": 1.487656190760893,
"learning_rate": 4.116494626477684e-06,
"loss": 0.842,
"step": 2630
},
{
"epoch": 2.134303471274822,
"grad_norm": 1.4541876796717628,
"learning_rate": 4.046077373468325e-06,
"loss": 0.8285,
"step": 2640
},
{
"epoch": 2.1423879541205597,
"grad_norm": 1.515080712913025,
"learning_rate": 3.976114543981148e-06,
"loss": 0.8278,
"step": 2650
},
{
"epoch": 2.150472436966298,
"grad_norm": 1.5925627792233104,
"learning_rate": 3.906611477816054e-06,
"loss": 0.8382,
"step": 2660
},
{
"epoch": 2.158556919812036,
"grad_norm": 1.4749306746231339,
"learning_rate": 3.837573479682236e-06,
"loss": 0.8453,
"step": 2670
},
{
"epoch": 2.1666414026577736,
"grad_norm": 1.888042329530717,
"learning_rate": 3.769005818793329e-06,
"loss": 0.854,
"step": 2680
},
{
"epoch": 2.174725885503512,
"grad_norm": 1.598037794600047,
"learning_rate": 3.7009137284652386e-06,
"loss": 0.8519,
"step": 2690
},
{
"epoch": 2.1828103683492497,
"grad_norm": 1.5540837615094885,
"learning_rate": 3.633302405716712e-06,
"loss": 0.8397,
"step": 2700
},
{
"epoch": 2.1908948511949875,
"grad_norm": 1.430485289060877,
"learning_rate": 3.5661770108726914e-06,
"loss": 0.8271,
"step": 2710
},
{
"epoch": 2.1989793340407258,
"grad_norm": 2.401835949374892,
"learning_rate": 3.4995426671704493e-06,
"loss": 0.8335,
"step": 2720
},
{
"epoch": 2.2070638168864636,
"grad_norm": 1.506353292247366,
"learning_rate": 3.433404460368587e-06,
"loss": 0.828,
"step": 2730
},
{
"epoch": 2.2151482997322014,
"grad_norm": 1.4406717845115946,
"learning_rate": 3.3677674383588476e-06,
"loss": 0.8315,
"step": 2740
},
{
"epoch": 2.2232327825779397,
"grad_norm": 1.5393945850323205,
"learning_rate": 3.302636610780855e-06,
"loss": 0.8504,
"step": 2750
},
{
"epoch": 2.2313172654236775,
"grad_norm": 1.7257558230682333,
"learning_rate": 3.238016948639772e-06,
"loss": 0.8232,
"step": 2760
},
{
"epoch": 2.2394017482694153,
"grad_norm": 1.8326756661400847,
"learning_rate": 3.1739133839268698e-06,
"loss": 0.8154,
"step": 2770
},
{
"epoch": 2.2474862311151536,
"grad_norm": 1.5269518503128512,
"learning_rate": 3.110330809243134e-06,
"loss": 0.8317,
"step": 2780
},
{
"epoch": 2.2555707139608914,
"grad_norm": 1.504166909878008,
"learning_rate": 3.0472740774258157e-06,
"loss": 0.8368,
"step": 2790
},
{
"epoch": 2.263655196806629,
"grad_norm": 1.480047137104623,
"learning_rate": 2.9847480011780607e-06,
"loss": 0.8409,
"step": 2800
},
{
"epoch": 2.2717396796523674,
"grad_norm": 1.492023552078346,
"learning_rate": 2.922757352701595e-06,
"loss": 0.8243,
"step": 2810
},
{
"epoch": 2.2798241624981053,
"grad_norm": 1.467055149697424,
"learning_rate": 2.861306863332475e-06,
"loss": 0.8289,
"step": 2820
},
{
"epoch": 2.287908645343843,
"grad_norm": 1.504514345406056,
"learning_rate": 2.8004012231799905e-06,
"loss": 0.8375,
"step": 2830
},
{
"epoch": 2.295993128189581,
"grad_norm": 1.5091792435489357,
"learning_rate": 2.740045080768694e-06,
"loss": 0.8233,
"step": 2840
},
{
"epoch": 2.304077611035319,
"grad_norm": 1.4619080284602382,
"learning_rate": 2.6802430426836113e-06,
"loss": 0.8356,
"step": 2850
},
{
"epoch": 2.312162093881057,
"grad_norm": 1.4085751552174153,
"learning_rate": 2.620999673218656e-06,
"loss": 0.8156,
"step": 2860
},
{
"epoch": 2.3202465767267952,
"grad_norm": 1.4755258769825808,
"learning_rate": 2.5623194940282526e-06,
"loss": 0.8353,
"step": 2870
},
{
"epoch": 2.328331059572533,
"grad_norm": 1.5852343601430656,
"learning_rate": 2.504206983782248e-06,
"loss": 0.8133,
"step": 2880
},
{
"epoch": 2.336415542418271,
"grad_norm": 1.4903107631764194,
"learning_rate": 2.446666577824068e-06,
"loss": 0.8459,
"step": 2890
},
{
"epoch": 2.3445000252640087,
"grad_norm": 1.523719484539125,
"learning_rate": 2.389702667832202e-06,
"loss": 0.8285,
"step": 2900
},
{
"epoch": 2.352584508109747,
"grad_norm": 1.457321496284554,
"learning_rate": 2.3333196014850246e-06,
"loss": 0.8304,
"step": 2910
},
{
"epoch": 2.3606689909554848,
"grad_norm": 1.537434676857527,
"learning_rate": 2.277521682128947e-06,
"loss": 0.829,
"step": 2920
},
{
"epoch": 2.3687534738012226,
"grad_norm": 1.4707817420987006,
"learning_rate": 2.2223131684499932e-06,
"loss": 0.8372,
"step": 2930
},
{
"epoch": 2.376837956646961,
"grad_norm": 1.46749047915079,
"learning_rate": 2.1676982741487427e-06,
"loss": 0.8222,
"step": 2940
},
{
"epoch": 2.3849224394926987,
"grad_norm": 1.518122852634397,
"learning_rate": 2.113681167618736e-06,
"loss": 0.8401,
"step": 2950
},
{
"epoch": 2.3930069223384365,
"grad_norm": 1.8575848589445734,
"learning_rate": 2.060265971628338e-06,
"loss": 0.8339,
"step": 2960
},
{
"epoch": 2.4010914051841747,
"grad_norm": 1.5601145654381285,
"learning_rate": 2.0074567630060514e-06,
"loss": 0.8154,
"step": 2970
},
{
"epoch": 2.4091758880299126,
"grad_norm": 1.530898387002521,
"learning_rate": 1.955257572329379e-06,
"loss": 0.823,
"step": 2980
},
{
"epoch": 2.4172603708756504,
"grad_norm": 1.6224545445427798,
"learning_rate": 1.9036723836171899e-06,
"loss": 0.8145,
"step": 2990
},
{
"epoch": 2.4253448537213886,
"grad_norm": 1.4013679708594033,
"learning_rate": 1.8527051340256397e-06,
"loss": 0.8215,
"step": 3000
},
{
"epoch": 2.4334293365671265,
"grad_norm": 1.5692785609667004,
"learning_rate": 1.8023597135476923e-06,
"loss": 0.8241,
"step": 3010
},
{
"epoch": 2.4415138194128643,
"grad_norm": 1.5126974695662643,
"learning_rate": 1.752639964716193e-06,
"loss": 0.8421,
"step": 3020
},
{
"epoch": 2.4495983022586025,
"grad_norm": 1.6242742569822604,
"learning_rate": 1.7035496823106247e-06,
"loss": 0.8141,
"step": 3030
},
{
"epoch": 2.4576827851043404,
"grad_norm": 1.4628790110692993,
"learning_rate": 1.6550926130674527e-06,
"loss": 0.8184,
"step": 3040
},
{
"epoch": 2.465767267950078,
"grad_norm": 1.4807837431822446,
"learning_rate": 1.607272455394172e-06,
"loss": 0.8202,
"step": 3050
},
{
"epoch": 2.4738517507958164,
"grad_norm": 1.5539937903441552,
"learning_rate": 1.5600928590870402e-06,
"loss": 0.8391,
"step": 3060
},
{
"epoch": 2.4819362336415542,
"grad_norm": 1.6677495360703212,
"learning_rate": 1.5135574250524898e-06,
"loss": 0.8436,
"step": 3070
},
{
"epoch": 2.490020716487292,
"grad_norm": 1.53769857798961,
"learning_rate": 1.467669705032323e-06,
"loss": 0.8263,
"step": 3080
},
{
"epoch": 2.4981051993330303,
"grad_norm": 1.4732928239069325,
"learning_rate": 1.422433201332607e-06,
"loss": 0.8284,
"step": 3090
},
{
"epoch": 2.506189682178768,
"grad_norm": 1.5928757648188723,
"learning_rate": 1.3778513665563786e-06,
"loss": 0.8319,
"step": 3100
},
{
"epoch": 2.514274165024506,
"grad_norm": 1.4230928346180836,
"learning_rate": 1.3339276033401283e-06,
"loss": 0.8052,
"step": 3110
},
{
"epoch": 2.522358647870244,
"grad_norm": 1.4772661299744003,
"learning_rate": 1.290665264094093e-06,
"loss": 0.8241,
"step": 3120
},
{
"epoch": 2.530443130715982,
"grad_norm": 1.522091825661006,
"learning_rate": 1.2480676507464051e-06,
"loss": 0.8106,
"step": 3130
},
{
"epoch": 2.53852761356172,
"grad_norm": 1.525599170654266,
"learning_rate": 1.2061380144910572e-06,
"loss": 0.8166,
"step": 3140
},
{
"epoch": 2.5466120964074577,
"grad_norm": 1.4929327017491605,
"learning_rate": 1.1648795555397719e-06,
"loss": 0.8251,
"step": 3150
},
{
"epoch": 2.554696579253196,
"grad_norm": 1.5920001415947864,
"learning_rate": 1.1242954228777513e-06,
"loss": 0.8268,
"step": 3160
},
{
"epoch": 2.5627810620989337,
"grad_norm": 1.5252651359986042,
"learning_rate": 1.08438871402333e-06,
"loss": 0.831,
"step": 3170
},
{
"epoch": 2.570865544944672,
"grad_norm": 1.6461347768103347,
"learning_rate": 1.04516247479157e-06,
"loss": 0.8239,
"step": 3180
},
{
"epoch": 2.57895002779041,
"grad_norm": 1.490863354097273,
"learning_rate": 1.006619699061785e-06,
"loss": 0.823,
"step": 3190
},
{
"epoch": 2.5870345106361476,
"grad_norm": 1.5158841203253022,
"learning_rate": 9.687633285490395e-07,
"loss": 0.8333,
"step": 3200
},
{
"epoch": 2.5951189934818855,
"grad_norm": 1.4861408651974157,
"learning_rate": 9.315962525796374e-07,
"loss": 0.8178,
"step": 3210
},
{
"epoch": 2.6032034763276237,
"grad_norm": 1.4847726389856295,
"learning_rate": 8.951213078705811e-07,
"loss": 0.8244,
"step": 3220
},
{
"epoch": 2.6112879591733615,
"grad_norm": 1.4579228976188288,
"learning_rate": 8.593412783130805e-07,
"loss": 0.8116,
"step": 3230
},
{
"epoch": 2.6193724420191,
"grad_norm": 1.4309284818257009,
"learning_rate": 8.24258894760066e-07,
"loss": 0.8233,
"step": 3240
},
{
"epoch": 2.6274569248648376,
"grad_norm": 1.481662266621092,
"learning_rate": 7.898768348177643e-07,
"loss": 0.8393,
"step": 3250
},
{
"epoch": 2.6355414077105754,
"grad_norm": 1.42582017885812,
"learning_rate": 7.561977226413341e-07,
"loss": 0.8344,
"step": 3260
},
{
"epoch": 2.6436258905563133,
"grad_norm": 1.4203791210214531,
"learning_rate": 7.23224128734582e-07,
"loss": 0.821,
"step": 3270
},
{
"epoch": 2.6517103734020515,
"grad_norm": 1.4780417621137758,
"learning_rate": 6.909585697537758e-07,
"loss": 0.8353,
"step": 3280
},
{
"epoch": 2.6597948562477893,
"grad_norm": 1.4466612391449976,
"learning_rate": 6.594035083155581e-07,
"loss": 0.8268,
"step": 3290
},
{
"epoch": 2.6678793390935276,
"grad_norm": 1.4584592752103582,
"learning_rate": 6.285613528089962e-07,
"loss": 0.8164,
"step": 3300
},
{
"epoch": 2.6759638219392654,
"grad_norm": 1.487514724946772,
"learning_rate": 5.98434457211765e-07,
"loss": 0.8027,
"step": 3310
},
{
"epoch": 2.6840483047850032,
"grad_norm": 1.4294666752405771,
"learning_rate": 5.690251209104802e-07,
"loss": 0.8105,
"step": 3320
},
{
"epoch": 2.692132787630741,
"grad_norm": 1.4638925402226952,
"learning_rate": 5.403355885252104e-07,
"loss": 0.8135,
"step": 3330
},
{
"epoch": 2.7002172704764793,
"grad_norm": 1.4458763488108235,
"learning_rate": 5.123680497381444e-07,
"loss": 0.8102,
"step": 3340
},
{
"epoch": 2.708301753322217,
"grad_norm": 1.4903596037049076,
"learning_rate": 4.851246391264819e-07,
"loss": 0.8152,
"step": 3350
},
{
"epoch": 2.7163862361679554,
"grad_norm": 1.4429528216246368,
"learning_rate": 4.5860743599951186e-07,
"loss": 0.8121,
"step": 3360
},
{
"epoch": 2.724470719013693,
"grad_norm": 1.452035259914063,
"learning_rate": 4.328184642399036e-07,
"loss": 0.821,
"step": 3370
},
{
"epoch": 2.732555201859431,
"grad_norm": 1.5303877229228735,
"learning_rate": 4.077596921492533e-07,
"loss": 0.8145,
"step": 3380
},
{
"epoch": 2.740639684705169,
"grad_norm": 1.4449405328561624,
"learning_rate": 3.834330322978397e-07,
"loss": 0.8214,
"step": 3390
},
{
"epoch": 2.748724167550907,
"grad_norm": 1.4371584227135465,
"learning_rate": 3.598403413786611e-07,
"loss": 0.8131,
"step": 3400
},
{
"epoch": 2.756808650396645,
"grad_norm": 1.4632980675092546,
"learning_rate": 3.3698342006572294e-07,
"loss": 0.8244,
"step": 3410
},
{
"epoch": 2.764893133242383,
"grad_norm": 1.4500755832832954,
"learning_rate": 3.148640128766056e-07,
"loss": 0.823,
"step": 3420
},
{
"epoch": 2.772977616088121,
"grad_norm": 1.4751477866660623,
"learning_rate": 2.934838080393154e-07,
"loss": 0.8211,
"step": 3430
},
{
"epoch": 2.781062098933859,
"grad_norm": 1.4653755137740456,
"learning_rate": 2.7284443736343203e-07,
"loss": 0.8024,
"step": 3440
},
{
"epoch": 2.7891465817795966,
"grad_norm": 1.4089563044736344,
"learning_rate": 2.52947476115567e-07,
"loss": 0.8228,
"step": 3450
},
{
"epoch": 2.797231064625335,
"grad_norm": 1.460696621649454,
"learning_rate": 2.3379444289913344e-07,
"loss": 0.8184,
"step": 3460
},
{
"epoch": 2.8053155474710727,
"grad_norm": 1.4693334824298931,
"learning_rate": 2.153867995384351e-07,
"loss": 0.8224,
"step": 3470
},
{
"epoch": 2.8134000303168105,
"grad_norm": 1.4469954005038157,
"learning_rate": 1.9772595096710477e-07,
"loss": 0.8373,
"step": 3480
},
{
"epoch": 2.821484513162549,
"grad_norm": 1.4331150676229163,
"learning_rate": 1.8081324512086663e-07,
"loss": 0.8185,
"step": 3490
},
{
"epoch": 2.8295689960082866,
"grad_norm": 1.5335384382024873,
"learning_rate": 1.6464997283466067e-07,
"loss": 0.8124,
"step": 3500
},
{
"epoch": 2.8376534788540244,
"grad_norm": 1.4445147972537609,
"learning_rate": 1.492373677441228e-07,
"loss": 0.8145,
"step": 3510
},
{
"epoch": 2.8457379616997627,
"grad_norm": 1.4976188260457166,
"learning_rate": 1.3457660619142887e-07,
"loss": 0.8163,
"step": 3520
},
{
"epoch": 2.8538224445455005,
"grad_norm": 1.439452743377751,
"learning_rate": 1.2066880713550888e-07,
"loss": 0.829,
"step": 3530
},
{
"epoch": 2.8619069273912383,
"grad_norm": 1.524984754735583,
"learning_rate": 1.0751503206665071e-07,
"loss": 0.8236,
"step": 3540
},
{
"epoch": 2.8699914102369766,
"grad_norm": 1.448229914768272,
"learning_rate": 9.511628492547609e-08,
"loss": 0.8223,
"step": 3550
},
{
"epoch": 2.8780758930827144,
"grad_norm": 1.4915344957228824,
"learning_rate": 8.347351202632525e-08,
"loss": 0.843,
"step": 3560
},
{
"epoch": 2.886160375928452,
"grad_norm": 1.4891660841319714,
"learning_rate": 7.258760198502246e-08,
"loss": 0.8173,
"step": 3570
},
{
"epoch": 2.89424485877419,
"grad_norm": 1.4485487573496472,
"learning_rate": 6.245938565105803e-08,
"loss": 0.8299,
"step": 3580
},
{
"epoch": 2.9023293416199283,
"grad_norm": 1.452602418516034,
"learning_rate": 5.308963604417572e-08,
"loss": 0.8216,
"step": 3590
},
{
"epoch": 2.910413824465666,
"grad_norm": 1.4554407329371093,
"learning_rate": 4.447906829537219e-08,
"loss": 0.8284,
"step": 3600
},
{
"epoch": 2.9184983073114044,
"grad_norm": 1.4918607001029844,
"learning_rate": 3.6628339592313935e-08,
"loss": 0.8012,
"step": 3610
},
{
"epoch": 2.926582790157142,
"grad_norm": 1.4229324193215207,
"learning_rate": 2.95380491291819e-08,
"loss": 0.8401,
"step": 3620
},
{
"epoch": 2.93466727300288,
"grad_norm": 1.4288366788035922,
"learning_rate": 2.320873806093804e-08,
"loss": 0.8228,
"step": 3630
},
{
"epoch": 2.942751755848618,
"grad_norm": 1.4724134547959333,
"learning_rate": 1.764088946201947e-08,
"loss": 0.8064,
"step": 3640
},
{
"epoch": 2.950836238694356,
"grad_norm": 1.4984479737935563,
"learning_rate": 1.2834928289472415e-08,
"loss": 0.81,
"step": 3650
},
{
"epoch": 2.958920721540094,
"grad_norm": 1.4666816312445612,
"learning_rate": 8.79122135051591e-09,
"loss": 0.822,
"step": 3660
},
{
"epoch": 2.967005204385832,
"grad_norm": 1.445201429621803,
"learning_rate": 5.510077274547554e-09,
"loss": 0.8271,
"step": 3670
},
{
"epoch": 2.97508968723157,
"grad_norm": 1.4460059967392547,
"learning_rate": 2.9917464895856673e-09,
"loss": 0.8389,
"step": 3680
},
{
"epoch": 2.983174170077308,
"grad_norm": 1.435390156627942,
"learning_rate": 1.2364212031579226e-09,
"loss": 0.8294,
"step": 3690
},
{
"epoch": 2.9912586529230456,
"grad_norm": 1.5066669703721747,
"learning_rate": 2.442353876297432e-10,
"loss": 0.801,
"step": 3700
},
{
"epoch": 2.997726239199636,
"eval_loss": 0.8224219083786011,
"eval_runtime": 474.463,
"eval_samples_per_second": 25.637,
"eval_steps_per_second": 12.819,
"step": 3708
},
{
"epoch": 2.997726239199636,
"step": 3708,
"total_flos": 0.0,
"train_loss": 1.0273753281164324,
"train_runtime": 58675.1239,
"train_samples_per_second": 8.095,
"train_steps_per_second": 0.063
}
],
"logging_steps": 10,
"max_steps": 3708,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}