juyongjiang's picture
upload model checkpoint
80a6a02 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.966777408637874,
"eval_steps": 500,
"global_step": 1500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006644518272425249,
"grad_norm": 324.0,
"learning_rate": 1.3333333333333334e-06,
"loss": 34.1539,
"step": 1
},
{
"epoch": 0.03322259136212625,
"grad_norm": 328.0,
"learning_rate": 6.666666666666667e-06,
"loss": 34.4732,
"step": 5
},
{
"epoch": 0.0664451827242525,
"grad_norm": 132.0,
"learning_rate": 1.3333333333333333e-05,
"loss": 30.9731,
"step": 10
},
{
"epoch": 0.09966777408637874,
"grad_norm": 57.75,
"learning_rate": 2e-05,
"loss": 24.1357,
"step": 15
},
{
"epoch": 0.132890365448505,
"grad_norm": 19.125,
"learning_rate": 2.6666666666666667e-05,
"loss": 19.6743,
"step": 20
},
{
"epoch": 0.16611295681063123,
"grad_norm": 14.1875,
"learning_rate": 3.3333333333333335e-05,
"loss": 17.9465,
"step": 25
},
{
"epoch": 0.19933554817275748,
"grad_norm": 7.25,
"learning_rate": 4e-05,
"loss": 15.9561,
"step": 30
},
{
"epoch": 0.23255813953488372,
"grad_norm": 4.125,
"learning_rate": 4.666666666666667e-05,
"loss": 14.7788,
"step": 35
},
{
"epoch": 0.26578073089701,
"grad_norm": 3.484375,
"learning_rate": 5.333333333333333e-05,
"loss": 14.139,
"step": 40
},
{
"epoch": 0.29900332225913623,
"grad_norm": 4.75,
"learning_rate": 6e-05,
"loss": 13.5886,
"step": 45
},
{
"epoch": 0.33222591362126247,
"grad_norm": 6.125,
"learning_rate": 6.666666666666667e-05,
"loss": 13.0275,
"step": 50
},
{
"epoch": 0.3654485049833887,
"grad_norm": 11.5625,
"learning_rate": 7.333333333333333e-05,
"loss": 11.9071,
"step": 55
},
{
"epoch": 0.39867109634551495,
"grad_norm": 18.375,
"learning_rate": 8e-05,
"loss": 9.4575,
"step": 60
},
{
"epoch": 0.4318936877076412,
"grad_norm": 21.25,
"learning_rate": 8.666666666666667e-05,
"loss": 5.8479,
"step": 65
},
{
"epoch": 0.46511627906976744,
"grad_norm": 5.09375,
"learning_rate": 9.333333333333334e-05,
"loss": 2.6937,
"step": 70
},
{
"epoch": 0.4983388704318937,
"grad_norm": 6.46875,
"learning_rate": 0.0001,
"loss": 2.0051,
"step": 75
},
{
"epoch": 0.53156146179402,
"grad_norm": 2.78125,
"learning_rate": 0.00010666666666666667,
"loss": 1.7309,
"step": 80
},
{
"epoch": 0.5647840531561462,
"grad_norm": 0.73828125,
"learning_rate": 0.00011333333333333334,
"loss": 1.5823,
"step": 85
},
{
"epoch": 0.5980066445182725,
"grad_norm": 1.359375,
"learning_rate": 0.00012,
"loss": 1.4702,
"step": 90
},
{
"epoch": 0.6312292358803987,
"grad_norm": 0.94140625,
"learning_rate": 0.00012666666666666666,
"loss": 1.3996,
"step": 95
},
{
"epoch": 0.6644518272425249,
"grad_norm": 0.859375,
"learning_rate": 0.00013333333333333334,
"loss": 1.3389,
"step": 100
},
{
"epoch": 0.6976744186046512,
"grad_norm": 1.0390625,
"learning_rate": 0.00014,
"loss": 1.293,
"step": 105
},
{
"epoch": 0.7308970099667774,
"grad_norm": 1.2265625,
"learning_rate": 0.00014666666666666666,
"loss": 1.2656,
"step": 110
},
{
"epoch": 0.7641196013289037,
"grad_norm": 0.5703125,
"learning_rate": 0.00015333333333333334,
"loss": 1.2254,
"step": 115
},
{
"epoch": 0.7973421926910299,
"grad_norm": 1.0546875,
"learning_rate": 0.00016,
"loss": 1.2072,
"step": 120
},
{
"epoch": 0.8305647840531561,
"grad_norm": 1.28125,
"learning_rate": 0.0001666666666666667,
"loss": 1.1856,
"step": 125
},
{
"epoch": 0.8637873754152824,
"grad_norm": 1.3046875,
"learning_rate": 0.00017333333333333334,
"loss": 1.169,
"step": 130
},
{
"epoch": 0.8970099667774086,
"grad_norm": 1.3125,
"learning_rate": 0.00018,
"loss": 1.1497,
"step": 135
},
{
"epoch": 0.9302325581395349,
"grad_norm": 1.9765625,
"learning_rate": 0.0001866666666666667,
"loss": 1.131,
"step": 140
},
{
"epoch": 0.9634551495016611,
"grad_norm": 1.609375,
"learning_rate": 0.00019333333333333333,
"loss": 1.1275,
"step": 145
},
{
"epoch": 0.9966777408637874,
"grad_norm": 6.65625,
"learning_rate": 0.0002,
"loss": 1.1216,
"step": 150
},
{
"epoch": 0.9966777408637874,
"eval_loss": 2.580465793609619,
"eval_runtime": 0.2799,
"eval_samples_per_second": 35.728,
"eval_steps_per_second": 3.573,
"step": 150
},
{
"epoch": 1.0299003322259137,
"grad_norm": 1.1171875,
"learning_rate": 0.00019999323080037624,
"loss": 1.1202,
"step": 155
},
{
"epoch": 1.06312292358804,
"grad_norm": 7.21875,
"learning_rate": 0.00019997292411794618,
"loss": 1.0982,
"step": 160
},
{
"epoch": 1.0963455149501662,
"grad_norm": 0.6875,
"learning_rate": 0.0001999390827019096,
"loss": 1.1059,
"step": 165
},
{
"epoch": 1.1295681063122924,
"grad_norm": 0.74609375,
"learning_rate": 0.0001998917111338525,
"loss": 1.079,
"step": 170
},
{
"epoch": 1.1627906976744187,
"grad_norm": 1.203125,
"learning_rate": 0.00019983081582712685,
"loss": 1.0626,
"step": 175
},
{
"epoch": 1.196013289036545,
"grad_norm": 3.75,
"learning_rate": 0.00019975640502598244,
"loss": 1.0644,
"step": 180
},
{
"epoch": 1.2292358803986712,
"grad_norm": 0.85546875,
"learning_rate": 0.00019966848880445062,
"loss": 1.064,
"step": 185
},
{
"epoch": 1.2624584717607974,
"grad_norm": 1.1796875,
"learning_rate": 0.00019956707906498044,
"loss": 1.0638,
"step": 190
},
{
"epoch": 1.2956810631229236,
"grad_norm": 1.75,
"learning_rate": 0.00019945218953682734,
"loss": 1.0598,
"step": 195
},
{
"epoch": 1.3289036544850499,
"grad_norm": 1.2734375,
"learning_rate": 0.00019932383577419432,
"loss": 1.0433,
"step": 200
},
{
"epoch": 1.3621262458471761,
"grad_norm": 1.1171875,
"learning_rate": 0.00019918203515412617,
"loss": 1.0375,
"step": 205
},
{
"epoch": 1.3953488372093024,
"grad_norm": 1.1171875,
"learning_rate": 0.00019902680687415705,
"loss": 1.0293,
"step": 210
},
{
"epoch": 1.4285714285714286,
"grad_norm": 1.1640625,
"learning_rate": 0.00019885817194971117,
"loss": 1.0196,
"step": 215
},
{
"epoch": 1.4617940199335548,
"grad_norm": 1.3828125,
"learning_rate": 0.00019867615321125795,
"loss": 1.0227,
"step": 220
},
{
"epoch": 1.495016611295681,
"grad_norm": 2.703125,
"learning_rate": 0.00019848077530122083,
"loss": 1.0192,
"step": 225
},
{
"epoch": 1.5282392026578073,
"grad_norm": 2.90625,
"learning_rate": 0.00019827206467064133,
"loss": 1.0254,
"step": 230
},
{
"epoch": 1.5614617940199336,
"grad_norm": 1.90625,
"learning_rate": 0.00019805004957559793,
"loss": 1.0076,
"step": 235
},
{
"epoch": 1.5946843853820598,
"grad_norm": 1.2578125,
"learning_rate": 0.00019781476007338058,
"loss": 0.9979,
"step": 240
},
{
"epoch": 1.627906976744186,
"grad_norm": 4.1875,
"learning_rate": 0.00019756622801842143,
"loss": 0.9963,
"step": 245
},
{
"epoch": 1.6611295681063123,
"grad_norm": 2.625,
"learning_rate": 0.00019730448705798239,
"loss": 1.0017,
"step": 250
},
{
"epoch": 1.6943521594684385,
"grad_norm": 2.9375,
"learning_rate": 0.00019702957262759965,
"loss": 1.0055,
"step": 255
},
{
"epoch": 1.7275747508305648,
"grad_norm": 2.40625,
"learning_rate": 0.00019674152194628638,
"loss": 0.993,
"step": 260
},
{
"epoch": 1.760797342192691,
"grad_norm": 1.3046875,
"learning_rate": 0.0001964403740114939,
"loss": 0.9875,
"step": 265
},
{
"epoch": 1.7940199335548173,
"grad_norm": 1.2734375,
"learning_rate": 0.0001961261695938319,
"loss": 1.0015,
"step": 270
},
{
"epoch": 1.8272425249169435,
"grad_norm": 1.0,
"learning_rate": 0.0001957989512315489,
"loss": 0.9879,
"step": 275
},
{
"epoch": 1.8604651162790697,
"grad_norm": 1.8828125,
"learning_rate": 0.0001954587632247732,
"loss": 0.9846,
"step": 280
},
{
"epoch": 1.893687707641196,
"grad_norm": 1.09375,
"learning_rate": 0.00019510565162951537,
"loss": 0.9816,
"step": 285
},
{
"epoch": 1.9269102990033222,
"grad_norm": 1.15625,
"learning_rate": 0.00019473966425143292,
"loss": 0.9832,
"step": 290
},
{
"epoch": 1.9601328903654485,
"grad_norm": 1.3359375,
"learning_rate": 0.00019436085063935835,
"loss": 0.9838,
"step": 295
},
{
"epoch": 1.9933554817275747,
"grad_norm": 0.76171875,
"learning_rate": 0.00019396926207859084,
"loss": 0.9828,
"step": 300
},
{
"epoch": 2.0,
"eval_loss": 2.516935110092163,
"eval_runtime": 0.2355,
"eval_samples_per_second": 42.456,
"eval_steps_per_second": 4.246,
"step": 301
},
{
"epoch": 2.026578073089701,
"grad_norm": 1.765625,
"learning_rate": 0.00019356495158395315,
"loss": 0.9602,
"step": 305
},
{
"epoch": 2.0598006644518274,
"grad_norm": 3.375,
"learning_rate": 0.00019314797389261424,
"loss": 0.9484,
"step": 310
},
{
"epoch": 2.0930232558139537,
"grad_norm": 0.54296875,
"learning_rate": 0.00019271838545667876,
"loss": 0.9496,
"step": 315
},
{
"epoch": 2.12624584717608,
"grad_norm": 0.80859375,
"learning_rate": 0.00019227624443554425,
"loss": 0.9405,
"step": 320
},
{
"epoch": 2.159468438538206,
"grad_norm": 1.4765625,
"learning_rate": 0.00019182161068802741,
"loss": 0.9509,
"step": 325
},
{
"epoch": 2.1926910299003324,
"grad_norm": 1.3515625,
"learning_rate": 0.0001913545457642601,
"loss": 0.9532,
"step": 330
},
{
"epoch": 2.2259136212624586,
"grad_norm": 1.0234375,
"learning_rate": 0.00019087511289735644,
"loss": 0.9421,
"step": 335
},
{
"epoch": 2.259136212624585,
"grad_norm": 3.453125,
"learning_rate": 0.00019038337699485208,
"loss": 0.9347,
"step": 340
},
{
"epoch": 2.292358803986711,
"grad_norm": 1.265625,
"learning_rate": 0.0001898794046299167,
"loss": 0.9451,
"step": 345
},
{
"epoch": 2.3255813953488373,
"grad_norm": 5.25,
"learning_rate": 0.00018936326403234125,
"loss": 0.9503,
"step": 350
},
{
"epoch": 2.3588039867109636,
"grad_norm": 1.2421875,
"learning_rate": 0.00018883502507930042,
"loss": 0.9515,
"step": 355
},
{
"epoch": 2.39202657807309,
"grad_norm": 1.4375,
"learning_rate": 0.00018829475928589271,
"loss": 0.9382,
"step": 360
},
{
"epoch": 2.425249169435216,
"grad_norm": 0.82421875,
"learning_rate": 0.0001877425397954582,
"loss": 0.9309,
"step": 365
},
{
"epoch": 2.4584717607973423,
"grad_norm": 1.578125,
"learning_rate": 0.00018717844136967624,
"loss": 0.9487,
"step": 370
},
{
"epoch": 2.4916943521594686,
"grad_norm": 1.3359375,
"learning_rate": 0.00018660254037844388,
"loss": 0.9414,
"step": 375
},
{
"epoch": 2.524916943521595,
"grad_norm": 1.3125,
"learning_rate": 0.00018601491478953657,
"loss": 0.9575,
"step": 380
},
{
"epoch": 2.558139534883721,
"grad_norm": 1.90625,
"learning_rate": 0.00018541564415805258,
"loss": 0.9469,
"step": 385
},
{
"epoch": 2.5913621262458473,
"grad_norm": 8.25,
"learning_rate": 0.0001848048096156426,
"loss": 0.9246,
"step": 390
},
{
"epoch": 2.6245847176079735,
"grad_norm": 0.921875,
"learning_rate": 0.00018418249385952575,
"loss": 0.9357,
"step": 395
},
{
"epoch": 2.6578073089700998,
"grad_norm": 1.59375,
"learning_rate": 0.00018354878114129367,
"loss": 0.9264,
"step": 400
},
{
"epoch": 2.691029900332226,
"grad_norm": 2.125,
"learning_rate": 0.00018290375725550417,
"loss": 0.934,
"step": 405
},
{
"epoch": 2.7242524916943522,
"grad_norm": 5.15625,
"learning_rate": 0.00018224750952806624,
"loss": 0.9378,
"step": 410
},
{
"epoch": 2.7574750830564785,
"grad_norm": 0.66796875,
"learning_rate": 0.00018158012680441723,
"loss": 0.9325,
"step": 415
},
{
"epoch": 2.7906976744186047,
"grad_norm": 1.109375,
"learning_rate": 0.00018090169943749476,
"loss": 0.9343,
"step": 420
},
{
"epoch": 2.823920265780731,
"grad_norm": 0.68359375,
"learning_rate": 0.0001802123192755044,
"loss": 0.9322,
"step": 425
},
{
"epoch": 2.857142857142857,
"grad_norm": 1.25,
"learning_rate": 0.0001795120796494848,
"loss": 0.9203,
"step": 430
},
{
"epoch": 2.8903654485049834,
"grad_norm": 0.67578125,
"learning_rate": 0.00017880107536067218,
"loss": 0.9181,
"step": 435
},
{
"epoch": 2.9235880398671097,
"grad_norm": 0.66796875,
"learning_rate": 0.00017807940266766593,
"loss": 0.9152,
"step": 440
},
{
"epoch": 2.956810631229236,
"grad_norm": 0.57421875,
"learning_rate": 0.0001773471592733964,
"loss": 0.9193,
"step": 445
},
{
"epoch": 2.990033222591362,
"grad_norm": 0.69140625,
"learning_rate": 0.0001766044443118978,
"loss": 0.9157,
"step": 450
},
{
"epoch": 2.9966777408637872,
"eval_loss": 2.4835643768310547,
"eval_runtime": 0.2608,
"eval_samples_per_second": 38.338,
"eval_steps_per_second": 3.834,
"step": 451
},
{
"epoch": 3.0232558139534884,
"grad_norm": 1.390625,
"learning_rate": 0.00017585135833488692,
"loss": 0.9023,
"step": 455
},
{
"epoch": 3.0564784053156147,
"grad_norm": 1.5078125,
"learning_rate": 0.00017508800329814995,
"loss": 0.8957,
"step": 460
},
{
"epoch": 3.089700996677741,
"grad_norm": 1.75,
"learning_rate": 0.00017431448254773944,
"loss": 0.8963,
"step": 465
},
{
"epoch": 3.122923588039867,
"grad_norm": 1.4921875,
"learning_rate": 0.0001735309008059829,
"loss": 0.8938,
"step": 470
},
{
"epoch": 3.1561461794019934,
"grad_norm": 1.1484375,
"learning_rate": 0.00017273736415730488,
"loss": 0.8832,
"step": 475
},
{
"epoch": 3.1893687707641196,
"grad_norm": 0.734375,
"learning_rate": 0.0001719339800338651,
"loss": 0.8824,
"step": 480
},
{
"epoch": 3.222591362126246,
"grad_norm": 0.92578125,
"learning_rate": 0.00017112085720101373,
"loss": 0.8985,
"step": 485
},
{
"epoch": 3.255813953488372,
"grad_norm": 0.77734375,
"learning_rate": 0.0001702981057425662,
"loss": 0.8915,
"step": 490
},
{
"epoch": 3.2890365448504983,
"grad_norm": 1.0703125,
"learning_rate": 0.00016946583704589973,
"loss": 0.8959,
"step": 495
},
{
"epoch": 3.3222591362126246,
"grad_norm": 0.640625,
"learning_rate": 0.0001686241637868734,
"loss": 0.8932,
"step": 500
},
{
"epoch": 3.355481727574751,
"grad_norm": 0.875,
"learning_rate": 0.00016777319991457325,
"loss": 0.9034,
"step": 505
},
{
"epoch": 3.388704318936877,
"grad_norm": 1.03125,
"learning_rate": 0.00016691306063588583,
"loss": 0.8914,
"step": 510
},
{
"epoch": 3.4219269102990033,
"grad_norm": 1.0078125,
"learning_rate": 0.00016604386239990078,
"loss": 0.8968,
"step": 515
},
{
"epoch": 3.4551495016611296,
"grad_norm": 0.7109375,
"learning_rate": 0.00016516572288214552,
"loss": 0.8899,
"step": 520
},
{
"epoch": 3.488372093023256,
"grad_norm": 0.55078125,
"learning_rate": 0.00016427876096865394,
"loss": 0.888,
"step": 525
},
{
"epoch": 3.521594684385382,
"grad_norm": 1.5703125,
"learning_rate": 0.00016338309673987101,
"loss": 0.8966,
"step": 530
},
{
"epoch": 3.5548172757475083,
"grad_norm": 0.7890625,
"learning_rate": 0.000162478851454396,
"loss": 0.8802,
"step": 535
},
{
"epoch": 3.5880398671096345,
"grad_norm": 0.63671875,
"learning_rate": 0.0001615661475325658,
"loss": 0.8864,
"step": 540
},
{
"epoch": 3.6212624584717608,
"grad_norm": 1.3359375,
"learning_rate": 0.00016064510853988138,
"loss": 0.8816,
"step": 545
},
{
"epoch": 3.654485049833887,
"grad_norm": 1.484375,
"learning_rate": 0.00015971585917027862,
"loss": 0.8906,
"step": 550
},
{
"epoch": 3.6877076411960132,
"grad_norm": 1.09375,
"learning_rate": 0.00015877852522924732,
"loss": 0.8896,
"step": 555
},
{
"epoch": 3.7209302325581395,
"grad_norm": 0.73046875,
"learning_rate": 0.00015783323361679864,
"loss": 0.8806,
"step": 560
},
{
"epoch": 3.7541528239202657,
"grad_norm": 1.25,
"learning_rate": 0.00015688011231028518,
"loss": 0.8758,
"step": 565
},
{
"epoch": 3.787375415282392,
"grad_norm": 1.2109375,
"learning_rate": 0.0001559192903470747,
"loss": 0.871,
"step": 570
},
{
"epoch": 3.820598006644518,
"grad_norm": 0.7578125,
"learning_rate": 0.0001549508978070806,
"loss": 0.8882,
"step": 575
},
{
"epoch": 3.8538205980066444,
"grad_norm": 0.66015625,
"learning_rate": 0.0001539750657951513,
"loss": 0.8719,
"step": 580
},
{
"epoch": 3.8870431893687707,
"grad_norm": 0.58203125,
"learning_rate": 0.0001529919264233205,
"loss": 0.8794,
"step": 585
},
{
"epoch": 3.920265780730897,
"grad_norm": 0.82421875,
"learning_rate": 0.00015200161279292155,
"loss": 0.8787,
"step": 590
},
{
"epoch": 3.953488372093023,
"grad_norm": 0.8125,
"learning_rate": 0.00015100425897656753,
"loss": 0.873,
"step": 595
},
{
"epoch": 3.9867109634551494,
"grad_norm": 0.578125,
"learning_rate": 0.00015000000000000001,
"loss": 0.8753,
"step": 600
},
{
"epoch": 4.0,
"eval_loss": 2.5010673999786377,
"eval_runtime": 0.239,
"eval_samples_per_second": 41.842,
"eval_steps_per_second": 4.184,
"step": 602
},
{
"epoch": 4.019933554817276,
"grad_norm": 1.359375,
"learning_rate": 0.0001489889718238087,
"loss": 0.8697,
"step": 605
},
{
"epoch": 4.053156146179402,
"grad_norm": 0.9921875,
"learning_rate": 0.00014797131132502465,
"loss": 0.8496,
"step": 610
},
{
"epoch": 4.086378737541528,
"grad_norm": 1.765625,
"learning_rate": 0.00014694715627858908,
"loss": 0.8601,
"step": 615
},
{
"epoch": 4.119601328903655,
"grad_norm": 1.140625,
"learning_rate": 0.00014591664533870118,
"loss": 0.8647,
"step": 620
},
{
"epoch": 4.152823920265781,
"grad_norm": 1.140625,
"learning_rate": 0.00014487991802004623,
"loss": 0.8541,
"step": 625
},
{
"epoch": 4.186046511627907,
"grad_norm": 0.6015625,
"learning_rate": 0.00014383711467890774,
"loss": 0.8481,
"step": 630
},
{
"epoch": 4.219269102990033,
"grad_norm": 0.96875,
"learning_rate": 0.00014278837649416544,
"loss": 0.8514,
"step": 635
},
{
"epoch": 4.25249169435216,
"grad_norm": 0.79296875,
"learning_rate": 0.0001417338454481818,
"loss": 0.8498,
"step": 640
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.734375,
"learning_rate": 0.00014067366430758004,
"loss": 0.8368,
"step": 645
},
{
"epoch": 4.318936877076412,
"grad_norm": 0.875,
"learning_rate": 0.0001396079766039157,
"loss": 0.8439,
"step": 650
},
{
"epoch": 4.352159468438538,
"grad_norm": 0.67578125,
"learning_rate": 0.00013853692661424484,
"loss": 0.8565,
"step": 655
},
{
"epoch": 4.385382059800665,
"grad_norm": 0.79296875,
"learning_rate": 0.00013746065934159123,
"loss": 0.8426,
"step": 660
},
{
"epoch": 4.4186046511627906,
"grad_norm": 0.6953125,
"learning_rate": 0.00013637932049531516,
"loss": 0.8471,
"step": 665
},
{
"epoch": 4.451827242524917,
"grad_norm": 1.28125,
"learning_rate": 0.00013529305647138687,
"loss": 0.8417,
"step": 670
},
{
"epoch": 4.485049833887043,
"grad_norm": 0.94140625,
"learning_rate": 0.00013420201433256689,
"loss": 0.8493,
"step": 675
},
{
"epoch": 4.51827242524917,
"grad_norm": 1.59375,
"learning_rate": 0.0001331063417884958,
"loss": 0.8506,
"step": 680
},
{
"epoch": 4.5514950166112955,
"grad_norm": 0.97265625,
"learning_rate": 0.00013200618717569714,
"loss": 0.841,
"step": 685
},
{
"epoch": 4.584717607973422,
"grad_norm": 0.75,
"learning_rate": 0.00013090169943749476,
"loss": 0.8415,
"step": 690
},
{
"epoch": 4.617940199335548,
"grad_norm": 0.703125,
"learning_rate": 0.0001297930281038482,
"loss": 0.8506,
"step": 695
},
{
"epoch": 4.651162790697675,
"grad_norm": 0.8359375,
"learning_rate": 0.00012868032327110904,
"loss": 0.8425,
"step": 700
},
{
"epoch": 4.6843853820598005,
"grad_norm": 1.1953125,
"learning_rate": 0.0001275637355816999,
"loss": 0.8466,
"step": 705
},
{
"epoch": 4.717607973421927,
"grad_norm": 0.51171875,
"learning_rate": 0.00012644341620372023,
"loss": 0.841,
"step": 710
},
{
"epoch": 4.750830564784053,
"grad_norm": 1.7578125,
"learning_rate": 0.0001253195168104802,
"loss": 0.8396,
"step": 715
},
{
"epoch": 4.78405315614618,
"grad_norm": 1.609375,
"learning_rate": 0.00012419218955996676,
"loss": 0.8423,
"step": 720
},
{
"epoch": 4.8172757475083055,
"grad_norm": 0.86328125,
"learning_rate": 0.00012306158707424403,
"loss": 0.839,
"step": 725
},
{
"epoch": 4.850498338870432,
"grad_norm": 0.75,
"learning_rate": 0.00012192786241879033,
"loss": 0.8342,
"step": 730
},
{
"epoch": 4.883720930232558,
"grad_norm": 1.1328125,
"learning_rate": 0.00012079116908177593,
"loss": 0.8358,
"step": 735
},
{
"epoch": 4.916943521594685,
"grad_norm": 1.078125,
"learning_rate": 0.00011965166095328301,
"loss": 0.8432,
"step": 740
},
{
"epoch": 4.95016611295681,
"grad_norm": 0.68359375,
"learning_rate": 0.00011850949230447145,
"loss": 0.8368,
"step": 745
},
{
"epoch": 4.983388704318937,
"grad_norm": 1.0078125,
"learning_rate": 0.00011736481776669306,
"loss": 0.8334,
"step": 750
},
{
"epoch": 4.996677740863787,
"eval_loss": 2.4944658279418945,
"eval_runtime": 0.2592,
"eval_samples_per_second": 38.58,
"eval_steps_per_second": 3.858,
"step": 752
},
{
"epoch": 5.016611295681063,
"grad_norm": 0.7265625,
"learning_rate": 0.00011621779231055676,
"loss": 0.8264,
"step": 755
},
{
"epoch": 5.04983388704319,
"grad_norm": 1.6484375,
"learning_rate": 0.00011506857122494831,
"loss": 0.8175,
"step": 760
},
{
"epoch": 5.083056478405315,
"grad_norm": 0.8828125,
"learning_rate": 0.00011391731009600654,
"loss": 0.8207,
"step": 765
},
{
"epoch": 5.116279069767442,
"grad_norm": 0.8359375,
"learning_rate": 0.00011276416478605949,
"loss": 0.8134,
"step": 770
},
{
"epoch": 5.149501661129568,
"grad_norm": 0.8828125,
"learning_rate": 0.00011160929141252303,
"loss": 0.8146,
"step": 775
},
{
"epoch": 5.1827242524916945,
"grad_norm": 1.140625,
"learning_rate": 0.00011045284632676536,
"loss": 0.8118,
"step": 780
},
{
"epoch": 5.21594684385382,
"grad_norm": 3.4375,
"learning_rate": 0.00010929498609293924,
"loss": 0.8142,
"step": 785
},
{
"epoch": 5.249169435215947,
"grad_norm": 1.015625,
"learning_rate": 0.00010813586746678583,
"loss": 0.8156,
"step": 790
},
{
"epoch": 5.282392026578073,
"grad_norm": 2.828125,
"learning_rate": 0.00010697564737441252,
"loss": 0.8097,
"step": 795
},
{
"epoch": 5.3156146179401995,
"grad_norm": 0.8828125,
"learning_rate": 0.00010581448289104758,
"loss": 0.8213,
"step": 800
},
{
"epoch": 5.348837209302325,
"grad_norm": 1.3984375,
"learning_rate": 0.0001046525312197747,
"loss": 0.8087,
"step": 805
},
{
"epoch": 5.382059800664452,
"grad_norm": 1.0625,
"learning_rate": 0.00010348994967025012,
"loss": 0.8046,
"step": 810
},
{
"epoch": 5.415282392026578,
"grad_norm": 2.734375,
"learning_rate": 0.00010232689563740563,
"loss": 0.8086,
"step": 815
},
{
"epoch": 5.4485049833887045,
"grad_norm": 0.9921875,
"learning_rate": 0.00010116352658013973,
"loss": 0.809,
"step": 820
},
{
"epoch": 5.48172757475083,
"grad_norm": 1.0,
"learning_rate": 0.0001,
"loss": 0.8155,
"step": 825
},
{
"epoch": 5.514950166112957,
"grad_norm": 0.73046875,
"learning_rate": 9.883647341986032e-05,
"loss": 0.8016,
"step": 830
},
{
"epoch": 5.548172757475083,
"grad_norm": 0.6796875,
"learning_rate": 9.767310436259438e-05,
"loss": 0.8013,
"step": 835
},
{
"epoch": 5.5813953488372094,
"grad_norm": 0.81640625,
"learning_rate": 9.651005032974994e-05,
"loss": 0.8123,
"step": 840
},
{
"epoch": 5.614617940199335,
"grad_norm": 2.1875,
"learning_rate": 9.534746878022534e-05,
"loss": 0.8163,
"step": 845
},
{
"epoch": 5.647840531561462,
"grad_norm": 0.72265625,
"learning_rate": 9.418551710895243e-05,
"loss": 0.8164,
"step": 850
},
{
"epoch": 5.681063122923588,
"grad_norm": 1.6953125,
"learning_rate": 9.302435262558747e-05,
"loss": 0.7974,
"step": 855
},
{
"epoch": 5.714285714285714,
"grad_norm": 0.76953125,
"learning_rate": 9.186413253321418e-05,
"loss": 0.8142,
"step": 860
},
{
"epoch": 5.74750830564784,
"grad_norm": 1.109375,
"learning_rate": 9.070501390706079e-05,
"loss": 0.8026,
"step": 865
},
{
"epoch": 5.780730897009967,
"grad_norm": 0.640625,
"learning_rate": 8.954715367323468e-05,
"loss": 0.8005,
"step": 870
},
{
"epoch": 5.813953488372093,
"grad_norm": 0.85546875,
"learning_rate": 8.839070858747697e-05,
"loss": 0.8015,
"step": 875
},
{
"epoch": 5.847176079734219,
"grad_norm": 0.52734375,
"learning_rate": 8.723583521394054e-05,
"loss": 0.7924,
"step": 880
},
{
"epoch": 5.880398671096345,
"grad_norm": 0.59765625,
"learning_rate": 8.608268990399349e-05,
"loss": 0.812,
"step": 885
},
{
"epoch": 5.913621262458472,
"grad_norm": 0.70703125,
"learning_rate": 8.49314287750517e-05,
"loss": 0.7969,
"step": 890
},
{
"epoch": 5.946843853820598,
"grad_norm": 0.74609375,
"learning_rate": 8.378220768944327e-05,
"loss": 0.7965,
"step": 895
},
{
"epoch": 5.980066445182724,
"grad_norm": 2.015625,
"learning_rate": 8.263518223330697e-05,
"loss": 0.796,
"step": 900
},
{
"epoch": 6.0,
"eval_loss": 2.531708240509033,
"eval_runtime": 0.239,
"eval_samples_per_second": 41.85,
"eval_steps_per_second": 4.185,
"step": 903
},
{
"epoch": 6.01328903654485,
"grad_norm": 0.482421875,
"learning_rate": 8.149050769552856e-05,
"loss": 0.7892,
"step": 905
},
{
"epoch": 6.046511627906977,
"grad_norm": 0.65234375,
"learning_rate": 8.034833904671698e-05,
"loss": 0.7792,
"step": 910
},
{
"epoch": 6.079734219269103,
"grad_norm": 0.7578125,
"learning_rate": 7.920883091822408e-05,
"loss": 0.7814,
"step": 915
},
{
"epoch": 6.112956810631229,
"grad_norm": 0.484375,
"learning_rate": 7.807213758120966e-05,
"loss": 0.7822,
"step": 920
},
{
"epoch": 6.146179401993355,
"grad_norm": 0.80859375,
"learning_rate": 7.693841292575598e-05,
"loss": 0.7749,
"step": 925
},
{
"epoch": 6.179401993355482,
"grad_norm": 0.81640625,
"learning_rate": 7.580781044003324e-05,
"loss": 0.7821,
"step": 930
},
{
"epoch": 6.212624584717608,
"grad_norm": 5.34375,
"learning_rate": 7.468048318951983e-05,
"loss": 0.7872,
"step": 935
},
{
"epoch": 6.245847176079734,
"grad_norm": 2.21875,
"learning_rate": 7.35565837962798e-05,
"loss": 0.7855,
"step": 940
},
{
"epoch": 6.27906976744186,
"grad_norm": 3.28125,
"learning_rate": 7.243626441830009e-05,
"loss": 0.7763,
"step": 945
},
{
"epoch": 6.312292358803987,
"grad_norm": 0.62890625,
"learning_rate": 7.131967672889101e-05,
"loss": 0.7901,
"step": 950
},
{
"epoch": 6.3455149501661126,
"grad_norm": 0.9765625,
"learning_rate": 7.02069718961518e-05,
"loss": 0.7814,
"step": 955
},
{
"epoch": 6.378737541528239,
"grad_norm": 0.8203125,
"learning_rate": 6.909830056250527e-05,
"loss": 0.7752,
"step": 960
},
{
"epoch": 6.411960132890365,
"grad_norm": 0.92578125,
"learning_rate": 6.799381282430284e-05,
"loss": 0.7782,
"step": 965
},
{
"epoch": 6.445182724252492,
"grad_norm": 0.91796875,
"learning_rate": 6.68936582115042e-05,
"loss": 0.7748,
"step": 970
},
{
"epoch": 6.4784053156146175,
"grad_norm": 1.1328125,
"learning_rate": 6.579798566743314e-05,
"loss": 0.7815,
"step": 975
},
{
"epoch": 6.511627906976744,
"grad_norm": 3.734375,
"learning_rate": 6.470694352861312e-05,
"loss": 0.7747,
"step": 980
},
{
"epoch": 6.544850498338871,
"grad_norm": 0.6015625,
"learning_rate": 6.362067950468489e-05,
"loss": 0.785,
"step": 985
},
{
"epoch": 6.578073089700997,
"grad_norm": 0.73828125,
"learning_rate": 6.25393406584088e-05,
"loss": 0.7716,
"step": 990
},
{
"epoch": 6.6112956810631225,
"grad_norm": 0.79296875,
"learning_rate": 6.146307338575519e-05,
"loss": 0.7723,
"step": 995
},
{
"epoch": 6.644518272425249,
"grad_norm": 0.69921875,
"learning_rate": 6.039202339608432e-05,
"loss": 0.7745,
"step": 1000
},
{
"epoch": 6.677740863787376,
"grad_norm": 1.96875,
"learning_rate": 5.9326335692419995e-05,
"loss": 0.7848,
"step": 1005
},
{
"epoch": 6.710963455149502,
"grad_norm": 0.734375,
"learning_rate": 5.8266154551818216e-05,
"loss": 0.7797,
"step": 1010
},
{
"epoch": 6.7441860465116275,
"grad_norm": 0.474609375,
"learning_rate": 5.72116235058346e-05,
"loss": 0.7714,
"step": 1015
},
{
"epoch": 6.777408637873754,
"grad_norm": 0.478515625,
"learning_rate": 5.616288532109225e-05,
"loss": 0.7716,
"step": 1020
},
{
"epoch": 6.810631229235881,
"grad_norm": 0.494140625,
"learning_rate": 5.5120081979953785e-05,
"loss": 0.7807,
"step": 1025
},
{
"epoch": 6.843853820598007,
"grad_norm": 0.65234375,
"learning_rate": 5.4083354661298814e-05,
"loss": 0.7647,
"step": 1030
},
{
"epoch": 6.877076411960132,
"grad_norm": 0.6328125,
"learning_rate": 5.305284372141095e-05,
"loss": 0.7755,
"step": 1035
},
{
"epoch": 6.910299003322259,
"grad_norm": 0.4765625,
"learning_rate": 5.2028688674975415e-05,
"loss": 0.7738,
"step": 1040
},
{
"epoch": 6.943521594684386,
"grad_norm": 0.5625,
"learning_rate": 5.101102817619131e-05,
"loss": 0.7765,
"step": 1045
},
{
"epoch": 6.976744186046512,
"grad_norm": 0.70703125,
"learning_rate": 5.000000000000002e-05,
"loss": 0.7745,
"step": 1050
},
{
"epoch": 6.996677740863787,
"eval_loss": 2.5435612201690674,
"eval_runtime": 0.2585,
"eval_samples_per_second": 38.679,
"eval_steps_per_second": 3.868,
"step": 1053
},
{
"epoch": 7.009966777408638,
"grad_norm": 0.53125,
"learning_rate": 4.899574102343247e-05,
"loss": 0.771,
"step": 1055
},
{
"epoch": 7.043189368770764,
"grad_norm": 0.640625,
"learning_rate": 4.799838720707846e-05,
"loss": 0.7653,
"step": 1060
},
{
"epoch": 7.076411960132891,
"grad_norm": 0.52734375,
"learning_rate": 4.700807357667952e-05,
"loss": 0.7644,
"step": 1065
},
{
"epoch": 7.1096345514950166,
"grad_norm": 0.490234375,
"learning_rate": 4.6024934204848745e-05,
"loss": 0.7632,
"step": 1070
},
{
"epoch": 7.142857142857143,
"grad_norm": 0.55859375,
"learning_rate": 4.50491021929194e-05,
"loss": 0.7686,
"step": 1075
},
{
"epoch": 7.176079734219269,
"grad_norm": 0.46484375,
"learning_rate": 4.4080709652925336e-05,
"loss": 0.7549,
"step": 1080
},
{
"epoch": 7.209302325581396,
"grad_norm": 0.58203125,
"learning_rate": 4.3119887689714844e-05,
"loss": 0.7626,
"step": 1085
},
{
"epoch": 7.2425249169435215,
"grad_norm": 0.5546875,
"learning_rate": 4.216676638320135e-05,
"loss": 0.7588,
"step": 1090
},
{
"epoch": 7.275747508305648,
"grad_norm": 0.5,
"learning_rate": 4.12214747707527e-05,
"loss": 0.7583,
"step": 1095
},
{
"epoch": 7.308970099667774,
"grad_norm": 0.6015625,
"learning_rate": 4.028414082972141e-05,
"loss": 0.7529,
"step": 1100
},
{
"epoch": 7.342192691029901,
"grad_norm": 0.72265625,
"learning_rate": 3.935489146011869e-05,
"loss": 0.766,
"step": 1105
},
{
"epoch": 7.3754152823920265,
"grad_norm": 0.46875,
"learning_rate": 3.843385246743417e-05,
"loss": 0.7592,
"step": 1110
},
{
"epoch": 7.408637873754153,
"grad_norm": 0.431640625,
"learning_rate": 3.7521148545604e-05,
"loss": 0.7645,
"step": 1115
},
{
"epoch": 7.441860465116279,
"grad_norm": 0.455078125,
"learning_rate": 3.661690326012897e-05,
"loss": 0.7629,
"step": 1120
},
{
"epoch": 7.475083056478406,
"grad_norm": 0.4765625,
"learning_rate": 3.5721239031346066e-05,
"loss": 0.7591,
"step": 1125
},
{
"epoch": 7.5083056478405314,
"grad_norm": 0.71484375,
"learning_rate": 3.483427711785449e-05,
"loss": 0.7558,
"step": 1130
},
{
"epoch": 7.541528239202658,
"grad_norm": 0.53515625,
"learning_rate": 3.395613760009925e-05,
"loss": 0.7611,
"step": 1135
},
{
"epoch": 7.574750830564784,
"grad_norm": 0.56640625,
"learning_rate": 3.308693936411421e-05,
"loss": 0.7619,
"step": 1140
},
{
"epoch": 7.607973421926911,
"grad_norm": 0.44921875,
"learning_rate": 3.222680008542678e-05,
"loss": 0.7585,
"step": 1145
},
{
"epoch": 7.641196013289036,
"grad_norm": 0.490234375,
"learning_rate": 3.137583621312665e-05,
"loss": 0.7551,
"step": 1150
},
{
"epoch": 7.674418604651163,
"grad_norm": 0.490234375,
"learning_rate": 3.053416295410026e-05,
"loss": 0.7626,
"step": 1155
},
{
"epoch": 7.707641196013289,
"grad_norm": 0.5,
"learning_rate": 2.9701894257433826e-05,
"loss": 0.764,
"step": 1160
},
{
"epoch": 7.740863787375416,
"grad_norm": 0.46875,
"learning_rate": 2.8879142798986292e-05,
"loss": 0.755,
"step": 1165
},
{
"epoch": 7.774086378737541,
"grad_norm": 0.46875,
"learning_rate": 2.8066019966134904e-05,
"loss": 0.7563,
"step": 1170
},
{
"epoch": 7.807308970099668,
"grad_norm": 0.451171875,
"learning_rate": 2.7262635842695127e-05,
"loss": 0.7688,
"step": 1175
},
{
"epoch": 7.840531561461794,
"grad_norm": 0.546875,
"learning_rate": 2.6469099194017143e-05,
"loss": 0.7665,
"step": 1180
},
{
"epoch": 7.8737541528239205,
"grad_norm": 0.4453125,
"learning_rate": 2.5685517452260567e-05,
"loss": 0.7664,
"step": 1185
},
{
"epoch": 7.906976744186046,
"grad_norm": 0.443359375,
"learning_rate": 2.491199670185008e-05,
"loss": 0.753,
"step": 1190
},
{
"epoch": 7.940199335548173,
"grad_norm": 0.44921875,
"learning_rate": 2.4148641665113113e-05,
"loss": 0.7614,
"step": 1195
},
{
"epoch": 7.973421926910299,
"grad_norm": 0.484375,
"learning_rate": 2.339555568810221e-05,
"loss": 0.7582,
"step": 1200
},
{
"epoch": 8.0,
"eval_loss": 2.5521774291992188,
"eval_runtime": 0.24,
"eval_samples_per_second": 41.669,
"eval_steps_per_second": 4.167,
"step": 1204
},
{
"epoch": 8.006644518272426,
"grad_norm": 0.423828125,
"learning_rate": 2.265284072660362e-05,
"loss": 0.7646,
"step": 1205
},
{
"epoch": 8.039867109634551,
"grad_norm": 0.44140625,
"learning_rate": 2.192059733233408e-05,
"loss": 0.758,
"step": 1210
},
{
"epoch": 8.073089700996677,
"grad_norm": 0.439453125,
"learning_rate": 2.119892463932781e-05,
"loss": 0.7566,
"step": 1215
},
{
"epoch": 8.106312292358805,
"grad_norm": 0.451171875,
"learning_rate": 2.0487920350515212e-05,
"loss": 0.7551,
"step": 1220
},
{
"epoch": 8.13953488372093,
"grad_norm": 0.4375,
"learning_rate": 1.9787680724495617e-05,
"loss": 0.7421,
"step": 1225
},
{
"epoch": 8.172757475083056,
"grad_norm": 0.44921875,
"learning_rate": 1.9098300562505266e-05,
"loss": 0.7513,
"step": 1230
},
{
"epoch": 8.205980066445182,
"grad_norm": 0.44921875,
"learning_rate": 1.8419873195582814e-05,
"loss": 0.7578,
"step": 1235
},
{
"epoch": 8.23920265780731,
"grad_norm": 0.421875,
"learning_rate": 1.775249047193377e-05,
"loss": 0.7518,
"step": 1240
},
{
"epoch": 8.272425249169435,
"grad_norm": 0.498046875,
"learning_rate": 1.7096242744495837e-05,
"loss": 0.7519,
"step": 1245
},
{
"epoch": 8.305647840531561,
"grad_norm": 0.5390625,
"learning_rate": 1.6451218858706374e-05,
"loss": 0.7514,
"step": 1250
},
{
"epoch": 8.338870431893687,
"grad_norm": 0.43359375,
"learning_rate": 1.5817506140474247e-05,
"loss": 0.7553,
"step": 1255
},
{
"epoch": 8.372093023255815,
"grad_norm": 0.466796875,
"learning_rate": 1.5195190384357404e-05,
"loss": 0.7487,
"step": 1260
},
{
"epoch": 8.40531561461794,
"grad_norm": 0.43359375,
"learning_rate": 1.458435584194745e-05,
"loss": 0.7518,
"step": 1265
},
{
"epoch": 8.438538205980066,
"grad_norm": 0.4296875,
"learning_rate": 1.3985085210463477e-05,
"loss": 0.7487,
"step": 1270
},
{
"epoch": 8.471760797342192,
"grad_norm": 0.423828125,
"learning_rate": 1.339745962155613e-05,
"loss": 0.7467,
"step": 1275
},
{
"epoch": 8.50498338870432,
"grad_norm": 0.4296875,
"learning_rate": 1.2821558630323772e-05,
"loss": 0.7478,
"step": 1280
},
{
"epoch": 8.538205980066445,
"grad_norm": 0.46875,
"learning_rate": 1.2257460204541794e-05,
"loss": 0.7558,
"step": 1285
},
{
"epoch": 8.571428571428571,
"grad_norm": 0.44921875,
"learning_rate": 1.1705240714107302e-05,
"loss": 0.7426,
"step": 1290
},
{
"epoch": 8.604651162790697,
"grad_norm": 0.46875,
"learning_rate": 1.116497492069961e-05,
"loss": 0.7411,
"step": 1295
},
{
"epoch": 8.637873754152825,
"grad_norm": 0.44140625,
"learning_rate": 1.0636735967658784e-05,
"loss": 0.7524,
"step": 1300
},
{
"epoch": 8.67109634551495,
"grad_norm": 0.453125,
"learning_rate": 1.0120595370083318e-05,
"loss": 0.7499,
"step": 1305
},
{
"epoch": 8.704318936877076,
"grad_norm": 0.435546875,
"learning_rate": 9.616623005147951e-06,
"loss": 0.7603,
"step": 1310
},
{
"epoch": 8.737541528239202,
"grad_norm": 0.44140625,
"learning_rate": 9.124887102643575e-06,
"loss": 0.7563,
"step": 1315
},
{
"epoch": 8.77076411960133,
"grad_norm": 0.4296875,
"learning_rate": 8.645454235739903e-06,
"loss": 0.7594,
"step": 1320
},
{
"epoch": 8.803986710963455,
"grad_norm": 0.431640625,
"learning_rate": 8.178389311972612e-06,
"loss": 0.7648,
"step": 1325
},
{
"epoch": 8.837209302325581,
"grad_norm": 0.443359375,
"learning_rate": 7.72375556445577e-06,
"loss": 0.7555,
"step": 1330
},
{
"epoch": 8.870431893687707,
"grad_norm": 0.44140625,
"learning_rate": 7.281614543321269e-06,
"loss": 0.7461,
"step": 1335
},
{
"epoch": 8.903654485049834,
"grad_norm": 0.470703125,
"learning_rate": 6.852026107385756e-06,
"loss": 0.7606,
"step": 1340
},
{
"epoch": 8.93687707641196,
"grad_norm": 0.435546875,
"learning_rate": 6.435048416046863e-06,
"loss": 0.7598,
"step": 1345
},
{
"epoch": 8.970099667774086,
"grad_norm": 0.439453125,
"learning_rate": 6.030737921409169e-06,
"loss": 0.754,
"step": 1350
},
{
"epoch": 8.996677740863788,
"eval_loss": 2.5503978729248047,
"eval_runtime": 0.2601,
"eval_samples_per_second": 38.445,
"eval_steps_per_second": 3.845,
"step": 1354
},
{
"epoch": 9.003322259136212,
"grad_norm": 0.4375,
"learning_rate": 5.639149360641649e-06,
"loss": 0.7546,
"step": 1355
},
{
"epoch": 9.03654485049834,
"grad_norm": 0.439453125,
"learning_rate": 5.26033574856708e-06,
"loss": 0.7562,
"step": 1360
},
{
"epoch": 9.069767441860465,
"grad_norm": 0.419921875,
"learning_rate": 4.8943483704846475e-06,
"loss": 0.7522,
"step": 1365
},
{
"epoch": 9.102990033222591,
"grad_norm": 0.427734375,
"learning_rate": 4.541236775226809e-06,
"loss": 0.7522,
"step": 1370
},
{
"epoch": 9.136212624584717,
"grad_norm": 0.427734375,
"learning_rate": 4.20104876845111e-06,
"loss": 0.7509,
"step": 1375
},
{
"epoch": 9.169435215946844,
"grad_norm": 0.41796875,
"learning_rate": 3.873830406168111e-06,
"loss": 0.7444,
"step": 1380
},
{
"epoch": 9.20265780730897,
"grad_norm": 0.453125,
"learning_rate": 3.5596259885061102e-06,
"loss": 0.7561,
"step": 1385
},
{
"epoch": 9.235880398671096,
"grad_norm": 0.443359375,
"learning_rate": 3.2584780537136207e-06,
"loss": 0.7502,
"step": 1390
},
{
"epoch": 9.269102990033222,
"grad_norm": 0.4921875,
"learning_rate": 2.970427372400353e-06,
"loss": 0.7546,
"step": 1395
},
{
"epoch": 9.30232558139535,
"grad_norm": 0.45703125,
"learning_rate": 2.6955129420176196e-06,
"loss": 0.7506,
"step": 1400
},
{
"epoch": 9.335548172757475,
"grad_norm": 0.4296875,
"learning_rate": 2.433771981578581e-06,
"loss": 0.7531,
"step": 1405
},
{
"epoch": 9.368770764119601,
"grad_norm": 0.427734375,
"learning_rate": 2.1852399266194314e-06,
"loss": 0.75,
"step": 1410
},
{
"epoch": 9.401993355481727,
"grad_norm": 0.4765625,
"learning_rate": 1.9499504244020693e-06,
"loss": 0.7449,
"step": 1415
},
{
"epoch": 9.435215946843854,
"grad_norm": 0.416015625,
"learning_rate": 1.7279353293586765e-06,
"loss": 0.765,
"step": 1420
},
{
"epoch": 9.46843853820598,
"grad_norm": 0.44921875,
"learning_rate": 1.5192246987791981e-06,
"loss": 0.7472,
"step": 1425
},
{
"epoch": 9.501661129568106,
"grad_norm": 0.431640625,
"learning_rate": 1.323846788742078e-06,
"loss": 0.7461,
"step": 1430
},
{
"epoch": 9.534883720930232,
"grad_norm": 0.443359375,
"learning_rate": 1.14182805028884e-06,
"loss": 0.7501,
"step": 1435
},
{
"epoch": 9.56810631229236,
"grad_norm": 0.43359375,
"learning_rate": 9.731931258429638e-07,
"loss": 0.7501,
"step": 1440
},
{
"epoch": 9.601328903654485,
"grad_norm": 0.41796875,
"learning_rate": 8.17964845873831e-07,
"loss": 0.7511,
"step": 1445
},
{
"epoch": 9.634551495016611,
"grad_norm": 0.427734375,
"learning_rate": 6.761642258056978e-07,
"loss": 0.7556,
"step": 1450
},
{
"epoch": 9.667774086378738,
"grad_norm": 0.42578125,
"learning_rate": 5.478104631726711e-07,
"loss": 0.751,
"step": 1455
},
{
"epoch": 9.700996677740864,
"grad_norm": 0.421875,
"learning_rate": 4.329209350195651e-07,
"loss": 0.7598,
"step": 1460
},
{
"epoch": 9.73421926910299,
"grad_norm": 0.4375,
"learning_rate": 3.315111955493944e-07,
"loss": 0.7572,
"step": 1465
},
{
"epoch": 9.767441860465116,
"grad_norm": 0.46484375,
"learning_rate": 2.4359497401758024e-07,
"loss": 0.7478,
"step": 1470
},
{
"epoch": 9.800664451827242,
"grad_norm": 0.419921875,
"learning_rate": 1.6918417287318245e-07,
"loss": 0.749,
"step": 1475
},
{
"epoch": 9.83388704318937,
"grad_norm": 0.44921875,
"learning_rate": 1.0828886614754341e-07,
"loss": 0.7488,
"step": 1480
},
{
"epoch": 9.867109634551495,
"grad_norm": 0.4609375,
"learning_rate": 6.09172980904238e-08,
"loss": 0.7407,
"step": 1485
},
{
"epoch": 9.90033222591362,
"grad_norm": 0.43359375,
"learning_rate": 2.7075882053828605e-08,
"loss": 0.7491,
"step": 1490
},
{
"epoch": 9.933554817275748,
"grad_norm": 0.447265625,
"learning_rate": 6.769199623779532e-09,
"loss": 0.7417,
"step": 1495
},
{
"epoch": 9.966777408637874,
"grad_norm": 0.435546875,
"learning_rate": 0.0,
"loss": 0.7572,
"step": 1500
},
{
"epoch": 9.966777408637874,
"eval_loss": 2.5546562671661377,
"eval_runtime": 0.2333,
"eval_samples_per_second": 42.867,
"eval_steps_per_second": 4.287,
"step": 1500
},
{
"epoch": 9.966777408637874,
"step": 1500,
"total_flos": 4.5794490708666614e+18,
"train_loss": 1.5882705609003702,
"train_runtime": 3659.0045,
"train_samples_per_second": 26.291,
"train_steps_per_second": 0.41
}
],
"logging_steps": 5,
"max_steps": 1500,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"total_flos": 4.5794490708666614e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}