{
"best_metric": 0.6319106221199036,
"best_model_checkpoint": "saves/CADICA_qwenvl_stenosis_classily_scale4_frozenVision/lora/sft/checkpoint-1600",
"epoch": 1.750965748132887,
"eval_steps": 50,
"global_step": 3400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025753283543651817,
"grad_norm": 21.336819681898895,
"learning_rate": 2.9411764705882355e-06,
"loss": 3.0444,
"num_input_tokens_seen": 58496,
"step": 5
},
{
"epoch": 0.0051506567087303634,
"grad_norm": 20.576623155848594,
"learning_rate": 5.882352941176471e-06,
"loss": 2.9824,
"num_input_tokens_seen": 116960,
"step": 10
},
{
"epoch": 0.007725985063095545,
"grad_norm": 22.989873871108518,
"learning_rate": 8.823529411764707e-06,
"loss": 2.8371,
"num_input_tokens_seen": 175448,
"step": 15
},
{
"epoch": 0.010301313417460727,
"grad_norm": 19.533434089690918,
"learning_rate": 1.1764705882352942e-05,
"loss": 2.5198,
"num_input_tokens_seen": 233944,
"step": 20
},
{
"epoch": 0.012876641771825908,
"grad_norm": 12.509494197145006,
"learning_rate": 1.4705882352941177e-05,
"loss": 1.772,
"num_input_tokens_seen": 292416,
"step": 25
},
{
"epoch": 0.01545197012619109,
"grad_norm": 3.6901887027066667,
"learning_rate": 1.7647058823529414e-05,
"loss": 1.2263,
"num_input_tokens_seen": 350904,
"step": 30
},
{
"epoch": 0.018027298480556272,
"grad_norm": 2.3996076770849744,
"learning_rate": 2.058823529411765e-05,
"loss": 1.0102,
"num_input_tokens_seen": 409384,
"step": 35
},
{
"epoch": 0.020602626834921454,
"grad_norm": 0.9253415848864577,
"learning_rate": 2.3529411764705884e-05,
"loss": 0.9378,
"num_input_tokens_seen": 467864,
"step": 40
},
{
"epoch": 0.023177955189286635,
"grad_norm": 1.1966244115097795,
"learning_rate": 2.647058823529412e-05,
"loss": 0.9265,
"num_input_tokens_seen": 526384,
"step": 45
},
{
"epoch": 0.025753283543651816,
"grad_norm": 1.853648349752417,
"learning_rate": 2.9411764705882354e-05,
"loss": 0.9157,
"num_input_tokens_seen": 584856,
"step": 50
},
{
"epoch": 0.025753283543651816,
"eval_loss": 0.9191630482673645,
"eval_runtime": 36.6123,
"eval_samples_per_second": 1.639,
"eval_steps_per_second": 0.41,
"num_input_tokens_seen": 584856,
"step": 50
},
{
"epoch": 0.028328611898016998,
"grad_norm": 0.8294990584587586,
"learning_rate": 3.235294117647059e-05,
"loss": 0.9009,
"num_input_tokens_seen": 643344,
"step": 55
},
{
"epoch": 0.03090394025238218,
"grad_norm": 0.8278765532866457,
"learning_rate": 3.529411764705883e-05,
"loss": 0.9063,
"num_input_tokens_seen": 701808,
"step": 60
},
{
"epoch": 0.03347926860674736,
"grad_norm": 0.7285901101792476,
"learning_rate": 3.8235294117647055e-05,
"loss": 0.9031,
"num_input_tokens_seen": 760304,
"step": 65
},
{
"epoch": 0.036054596961112545,
"grad_norm": 0.5341783688819233,
"learning_rate": 4.11764705882353e-05,
"loss": 0.8991,
"num_input_tokens_seen": 818760,
"step": 70
},
{
"epoch": 0.03862992531547772,
"grad_norm": 0.46059313680988906,
"learning_rate": 4.411764705882353e-05,
"loss": 0.9055,
"num_input_tokens_seen": 877256,
"step": 75
},
{
"epoch": 0.04120525366984291,
"grad_norm": 0.8194379237293679,
"learning_rate": 4.705882352941177e-05,
"loss": 0.9092,
"num_input_tokens_seen": 935752,
"step": 80
},
{
"epoch": 0.043780582024208085,
"grad_norm": 0.6745093544830881,
"learning_rate": 5e-05,
"loss": 0.9069,
"num_input_tokens_seen": 994216,
"step": 85
},
{
"epoch": 0.04635591037857327,
"grad_norm": 0.2894672897884604,
"learning_rate": 5.294117647058824e-05,
"loss": 0.8924,
"num_input_tokens_seen": 1052704,
"step": 90
},
{
"epoch": 0.04893123873293845,
"grad_norm": 0.5108489024576455,
"learning_rate": 5.588235294117647e-05,
"loss": 0.9059,
"num_input_tokens_seen": 1111176,
"step": 95
},
{
"epoch": 0.05150656708730363,
"grad_norm": 0.40317180386305224,
"learning_rate": 5.882352941176471e-05,
"loss": 0.901,
"num_input_tokens_seen": 1169664,
"step": 100
},
{
"epoch": 0.05150656708730363,
"eval_loss": 0.9077914953231812,
"eval_runtime": 16.8879,
"eval_samples_per_second": 3.553,
"eval_steps_per_second": 0.888,
"num_input_tokens_seen": 1169664,
"step": 100
},
{
"epoch": 0.05408189544166881,
"grad_norm": 0.412918917979438,
"learning_rate": 6.176470588235295e-05,
"loss": 0.9159,
"num_input_tokens_seen": 1228112,
"step": 105
},
{
"epoch": 0.056657223796033995,
"grad_norm": 0.34797408069968117,
"learning_rate": 6.470588235294118e-05,
"loss": 0.91,
"num_input_tokens_seen": 1286608,
"step": 110
},
{
"epoch": 0.05923255215039917,
"grad_norm": 0.27558494796967653,
"learning_rate": 6.764705882352942e-05,
"loss": 0.9047,
"num_input_tokens_seen": 1345072,
"step": 115
},
{
"epoch": 0.06180788050476436,
"grad_norm": 0.5422134023513459,
"learning_rate": 7.058823529411765e-05,
"loss": 0.9022,
"num_input_tokens_seen": 1403544,
"step": 120
},
{
"epoch": 0.06438320885912954,
"grad_norm": 0.4452796218739235,
"learning_rate": 7.352941176470589e-05,
"loss": 0.9081,
"num_input_tokens_seen": 1462024,
"step": 125
},
{
"epoch": 0.06695853721349472,
"grad_norm": 0.5632558160730559,
"learning_rate": 7.647058823529411e-05,
"loss": 0.8939,
"num_input_tokens_seen": 1520528,
"step": 130
},
{
"epoch": 0.0695338655678599,
"grad_norm": 0.3383115884436812,
"learning_rate": 7.941176470588235e-05,
"loss": 0.9029,
"num_input_tokens_seen": 1579024,
"step": 135
},
{
"epoch": 0.07210919392222509,
"grad_norm": 0.3506611095466577,
"learning_rate": 8.23529411764706e-05,
"loss": 0.9014,
"num_input_tokens_seen": 1637504,
"step": 140
},
{
"epoch": 0.07468452227659027,
"grad_norm": 0.6328034405712752,
"learning_rate": 8.529411764705883e-05,
"loss": 0.9053,
"num_input_tokens_seen": 1696024,
"step": 145
},
{
"epoch": 0.07725985063095545,
"grad_norm": 0.3511657661506363,
"learning_rate": 8.823529411764706e-05,
"loss": 0.9032,
"num_input_tokens_seen": 1754512,
"step": 150
},
{
"epoch": 0.07725985063095545,
"eval_loss": 0.8962129950523376,
"eval_runtime": 17.0673,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 0.879,
"num_input_tokens_seen": 1754512,
"step": 150
},
{
"epoch": 0.07983517898532062,
"grad_norm": 0.4047681172482029,
"learning_rate": 9.11764705882353e-05,
"loss": 0.8985,
"num_input_tokens_seen": 1812976,
"step": 155
},
{
"epoch": 0.08241050733968582,
"grad_norm": 0.37729033726569733,
"learning_rate": 9.411764705882353e-05,
"loss": 0.8949,
"num_input_tokens_seen": 1871464,
"step": 160
},
{
"epoch": 0.08498583569405099,
"grad_norm": 0.4655744785034158,
"learning_rate": 9.705882352941177e-05,
"loss": 0.9069,
"num_input_tokens_seen": 1929928,
"step": 165
},
{
"epoch": 0.08756116404841617,
"grad_norm": 0.30643056878817176,
"learning_rate": 0.0001,
"loss": 0.9049,
"num_input_tokens_seen": 1988432,
"step": 170
},
{
"epoch": 0.09013649240278135,
"grad_norm": 0.39944696269496754,
"learning_rate": 9.999940874631277e-05,
"loss": 0.9026,
"num_input_tokens_seen": 2046920,
"step": 175
},
{
"epoch": 0.09271182075714654,
"grad_norm": 0.31301259106593154,
"learning_rate": 9.999763499923432e-05,
"loss": 0.8984,
"num_input_tokens_seen": 2105392,
"step": 180
},
{
"epoch": 0.09528714911151172,
"grad_norm": 0.4309753054454554,
"learning_rate": 9.999467880071402e-05,
"loss": 0.9057,
"num_input_tokens_seen": 2163872,
"step": 185
},
{
"epoch": 0.0978624774658769,
"grad_norm": 0.262930252305763,
"learning_rate": 9.999054022066641e-05,
"loss": 0.9078,
"num_input_tokens_seen": 2222352,
"step": 190
},
{
"epoch": 0.10043780582024209,
"grad_norm": 0.22073598270887426,
"learning_rate": 9.998521935696953e-05,
"loss": 0.9028,
"num_input_tokens_seen": 2280800,
"step": 195
},
{
"epoch": 0.10301313417460727,
"grad_norm": 0.23764668792524696,
"learning_rate": 9.997871633546257e-05,
"loss": 0.9053,
"num_input_tokens_seen": 2339304,
"step": 200
},
{
"epoch": 0.10301313417460727,
"eval_loss": 0.8982028961181641,
"eval_runtime": 16.9118,
"eval_samples_per_second": 3.548,
"eval_steps_per_second": 0.887,
"num_input_tokens_seen": 2339304,
"step": 200
},
{
"epoch": 0.10558846252897244,
"grad_norm": 0.6222576114383499,
"learning_rate": 9.997103130994296e-05,
"loss": 0.9003,
"num_input_tokens_seen": 2397808,
"step": 205
},
{
"epoch": 0.10816379088333762,
"grad_norm": 0.2983149992592585,
"learning_rate": 9.996216446216267e-05,
"loss": 0.8969,
"num_input_tokens_seen": 2456288,
"step": 210
},
{
"epoch": 0.11073911923770281,
"grad_norm": 0.3505370510576513,
"learning_rate": 9.995211600182397e-05,
"loss": 0.9114,
"num_input_tokens_seen": 2514784,
"step": 215
},
{
"epoch": 0.11331444759206799,
"grad_norm": 0.3683806652106065,
"learning_rate": 9.994088616657444e-05,
"loss": 0.899,
"num_input_tokens_seen": 2573240,
"step": 220
},
{
"epoch": 0.11588977594643317,
"grad_norm": 0.21111769827155855,
"learning_rate": 9.992847522200133e-05,
"loss": 0.898,
"num_input_tokens_seen": 2631672,
"step": 225
},
{
"epoch": 0.11846510430079835,
"grad_norm": 0.3426987181783304,
"learning_rate": 9.99148834616253e-05,
"loss": 0.9006,
"num_input_tokens_seen": 2690112,
"step": 230
},
{
"epoch": 0.12104043265516354,
"grad_norm": 0.236983209071443,
"learning_rate": 9.990011120689351e-05,
"loss": 0.8973,
"num_input_tokens_seen": 2748608,
"step": 235
},
{
"epoch": 0.12361576100952872,
"grad_norm": 0.4575208248826409,
"learning_rate": 9.988415880717194e-05,
"loss": 0.8885,
"num_input_tokens_seen": 2807080,
"step": 240
},
{
"epoch": 0.1261910893638939,
"grad_norm": 0.5470317919414993,
"learning_rate": 9.986702663973722e-05,
"loss": 0.9066,
"num_input_tokens_seen": 2865520,
"step": 245
},
{
"epoch": 0.12876641771825909,
"grad_norm": 0.4992479706331095,
"learning_rate": 9.98487151097676e-05,
"loss": 0.9098,
"num_input_tokens_seen": 2924016,
"step": 250
},
{
"epoch": 0.12876641771825909,
"eval_loss": 0.8956434726715088,
"eval_runtime": 17.4804,
"eval_samples_per_second": 3.432,
"eval_steps_per_second": 0.858,
"num_input_tokens_seen": 2924016,
"step": 250
},
{
"epoch": 0.13134174607262425,
"grad_norm": 0.3762164361984238,
"learning_rate": 9.98292246503335e-05,
"loss": 0.8987,
"num_input_tokens_seen": 2982520,
"step": 255
},
{
"epoch": 0.13391707442698944,
"grad_norm": 0.6447043002410199,
"learning_rate": 9.980855572238714e-05,
"loss": 0.9036,
"num_input_tokens_seen": 3041008,
"step": 260
},
{
"epoch": 0.13649240278135463,
"grad_norm": 0.5308092769971742,
"learning_rate": 9.978670881475172e-05,
"loss": 0.8961,
"num_input_tokens_seen": 3099464,
"step": 265
},
{
"epoch": 0.1390677311357198,
"grad_norm": 0.508333330469703,
"learning_rate": 9.976368444410985e-05,
"loss": 0.9012,
"num_input_tokens_seen": 3157944,
"step": 270
},
{
"epoch": 0.141643059490085,
"grad_norm": 0.6801788563719119,
"learning_rate": 9.973948315499126e-05,
"loss": 0.8985,
"num_input_tokens_seen": 3216448,
"step": 275
},
{
"epoch": 0.14421838784445018,
"grad_norm": 0.6933074703933572,
"learning_rate": 9.971410551976002e-05,
"loss": 0.9114,
"num_input_tokens_seen": 3274928,
"step": 280
},
{
"epoch": 0.14679371619881534,
"grad_norm": 0.21208820897494882,
"learning_rate": 9.968755213860094e-05,
"loss": 0.8886,
"num_input_tokens_seen": 3333408,
"step": 285
},
{
"epoch": 0.14936904455318054,
"grad_norm": 0.5791422669000065,
"learning_rate": 9.96598236395054e-05,
"loss": 0.8929,
"num_input_tokens_seen": 3391896,
"step": 290
},
{
"epoch": 0.1519443729075457,
"grad_norm": 0.3460368893191152,
"learning_rate": 9.96309206782565e-05,
"loss": 0.9091,
"num_input_tokens_seen": 3450392,
"step": 295
},
{
"epoch": 0.1545197012619109,
"grad_norm": 0.22425222135997747,
"learning_rate": 9.960084393841355e-05,
"loss": 0.8893,
"num_input_tokens_seen": 3508888,
"step": 300
},
{
"epoch": 0.1545197012619109,
"eval_loss": 0.8908902406692505,
"eval_runtime": 16.9521,
"eval_samples_per_second": 3.539,
"eval_steps_per_second": 0.885,
"num_input_tokens_seen": 3508888,
"step": 300
},
{
"epoch": 0.15709502961627608,
"grad_norm": 0.23111596622064604,
"learning_rate": 9.956959413129585e-05,
"loss": 0.9056,
"num_input_tokens_seen": 3567368,
"step": 305
},
{
"epoch": 0.15967035797064125,
"grad_norm": 0.3918406894807393,
"learning_rate": 9.953717199596598e-05,
"loss": 0.8982,
"num_input_tokens_seen": 3625848,
"step": 310
},
{
"epoch": 0.16224568632500644,
"grad_norm": 0.22081666860189372,
"learning_rate": 9.95035782992122e-05,
"loss": 0.8968,
"num_input_tokens_seen": 3684336,
"step": 315
},
{
"epoch": 0.16482101467937163,
"grad_norm": 0.18024383676398176,
"learning_rate": 9.94688138355304e-05,
"loss": 0.8975,
"num_input_tokens_seen": 3742800,
"step": 320
},
{
"epoch": 0.1673963430337368,
"grad_norm": 0.3866897344302321,
"learning_rate": 9.943287942710527e-05,
"loss": 0.9061,
"num_input_tokens_seen": 3801280,
"step": 325
},
{
"epoch": 0.16997167138810199,
"grad_norm": 0.4804151381712559,
"learning_rate": 9.939577592379088e-05,
"loss": 0.8948,
"num_input_tokens_seen": 3859792,
"step": 330
},
{
"epoch": 0.17254699974246718,
"grad_norm": 0.35878231707669056,
"learning_rate": 9.935750420309055e-05,
"loss": 0.9063,
"num_input_tokens_seen": 3918272,
"step": 335
},
{
"epoch": 0.17512232809683234,
"grad_norm": 0.8713957774909928,
"learning_rate": 9.931806517013612e-05,
"loss": 0.8952,
"num_input_tokens_seen": 3976760,
"step": 340
},
{
"epoch": 0.17769765645119753,
"grad_norm": 0.6671526212854116,
"learning_rate": 9.927745975766654e-05,
"loss": 0.9136,
"num_input_tokens_seen": 4035240,
"step": 345
},
{
"epoch": 0.1802729848055627,
"grad_norm": 0.28702679234521244,
"learning_rate": 9.923568892600578e-05,
"loss": 0.9075,
"num_input_tokens_seen": 4093688,
"step": 350
},
{
"epoch": 0.1802729848055627,
"eval_loss": 0.89204341173172,
"eval_runtime": 16.5819,
"eval_samples_per_second": 3.618,
"eval_steps_per_second": 0.905,
"num_input_tokens_seen": 4093688,
"step": 350
},
{
"epoch": 0.1828483131599279,
"grad_norm": 0.32233149132200706,
"learning_rate": 9.91927536630402e-05,
"loss": 0.8812,
"num_input_tokens_seen": 4152160,
"step": 355
},
{
"epoch": 0.18542364151429308,
"grad_norm": 0.5071871697326992,
"learning_rate": 9.91486549841951e-05,
"loss": 0.9109,
"num_input_tokens_seen": 4210648,
"step": 360
},
{
"epoch": 0.18799896986865824,
"grad_norm": 0.4532792519849944,
"learning_rate": 9.91033939324107e-05,
"loss": 0.9176,
"num_input_tokens_seen": 4269136,
"step": 365
},
{
"epoch": 0.19057429822302344,
"grad_norm": 0.5409761562534501,
"learning_rate": 9.905697157811761e-05,
"loss": 0.9077,
"num_input_tokens_seen": 4327664,
"step": 370
},
{
"epoch": 0.19314962657738863,
"grad_norm": 0.3432361562809093,
"learning_rate": 9.900938901921131e-05,
"loss": 0.893,
"num_input_tokens_seen": 4386120,
"step": 375
},
{
"epoch": 0.1957249549317538,
"grad_norm": 0.4756530294720616,
"learning_rate": 9.896064738102635e-05,
"loss": 0.9094,
"num_input_tokens_seen": 4444560,
"step": 380
},
{
"epoch": 0.19830028328611898,
"grad_norm": 0.424836974193983,
"learning_rate": 9.891074781630966e-05,
"loss": 0.9091,
"num_input_tokens_seen": 4503016,
"step": 385
},
{
"epoch": 0.20087561164048418,
"grad_norm": 0.31316926977469683,
"learning_rate": 9.885969150519331e-05,
"loss": 0.9033,
"num_input_tokens_seen": 4561496,
"step": 390
},
{
"epoch": 0.20345093999484934,
"grad_norm": 0.6108378682480797,
"learning_rate": 9.88074796551666e-05,
"loss": 0.8851,
"num_input_tokens_seen": 4619944,
"step": 395
},
{
"epoch": 0.20602626834921453,
"grad_norm": 0.38294566619219206,
"learning_rate": 9.875411350104744e-05,
"loss": 0.9004,
"num_input_tokens_seen": 4678384,
"step": 400
},
{
"epoch": 0.20602626834921453,
"eval_loss": 0.9086406826972961,
"eval_runtime": 16.7827,
"eval_samples_per_second": 3.575,
"eval_steps_per_second": 0.894,
"num_input_tokens_seen": 4678384,
"step": 400
},
{
"epoch": 0.2086015967035797,
"grad_norm": 0.4283475401297436,
"learning_rate": 9.86995943049533e-05,
"loss": 0.8976,
"num_input_tokens_seen": 4736904,
"step": 405
},
{
"epoch": 0.2111769250579449,
"grad_norm": 0.40329738287583206,
"learning_rate": 9.864392335627117e-05,
"loss": 0.9134,
"num_input_tokens_seen": 4795376,
"step": 410
},
{
"epoch": 0.21375225341231008,
"grad_norm": 0.37890634863656475,
"learning_rate": 9.858710197162721e-05,
"loss": 0.8955,
"num_input_tokens_seen": 4853880,
"step": 415
},
{
"epoch": 0.21632758176667524,
"grad_norm": 0.32402245835420784,
"learning_rate": 9.852913149485556e-05,
"loss": 0.9014,
"num_input_tokens_seen": 4912360,
"step": 420
},
{
"epoch": 0.21890291012104043,
"grad_norm": 0.49572499508345125,
"learning_rate": 9.847001329696653e-05,
"loss": 0.9065,
"num_input_tokens_seen": 4970872,
"step": 425
},
{
"epoch": 0.22147823847540563,
"grad_norm": 0.11883567118448765,
"learning_rate": 9.840974877611422e-05,
"loss": 0.8952,
"num_input_tokens_seen": 5029304,
"step": 430
},
{
"epoch": 0.2240535668297708,
"grad_norm": 0.7105724703149633,
"learning_rate": 9.834833935756344e-05,
"loss": 0.9106,
"num_input_tokens_seen": 5087800,
"step": 435
},
{
"epoch": 0.22662889518413598,
"grad_norm": 0.708953365388227,
"learning_rate": 9.828578649365601e-05,
"loss": 0.8996,
"num_input_tokens_seen": 5146312,
"step": 440
},
{
"epoch": 0.22920422353850115,
"grad_norm": 0.4503080730364326,
"learning_rate": 9.822209166377635e-05,
"loss": 0.8999,
"num_input_tokens_seen": 5204800,
"step": 445
},
{
"epoch": 0.23177955189286634,
"grad_norm": 0.20754132336834788,
"learning_rate": 9.815725637431662e-05,
"loss": 0.9076,
"num_input_tokens_seen": 5263304,
"step": 450
},
{
"epoch": 0.23177955189286634,
"eval_loss": 0.8962157368659973,
"eval_runtime": 17.2029,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 0.872,
"num_input_tokens_seen": 5263304,
"step": 450
},
{
"epoch": 0.23435488024723153,
"grad_norm": 0.5906403377099594,
"learning_rate": 9.809128215864097e-05,
"loss": 0.8942,
"num_input_tokens_seen": 5321760,
"step": 455
},
{
"epoch": 0.2369302086015967,
"grad_norm": 0.5706805631290568,
"learning_rate": 9.802417057704931e-05,
"loss": 0.9099,
"num_input_tokens_seen": 5380224,
"step": 460
},
{
"epoch": 0.23950553695596188,
"grad_norm": 0.164631948732384,
"learning_rate": 9.795592321674045e-05,
"loss": 0.8981,
"num_input_tokens_seen": 5438704,
"step": 465
},
{
"epoch": 0.24208086531032708,
"grad_norm": 0.32986780285522194,
"learning_rate": 9.788654169177453e-05,
"loss": 0.8952,
"num_input_tokens_seen": 5497208,
"step": 470
},
{
"epoch": 0.24465619366469224,
"grad_norm": 0.40551569446674784,
"learning_rate": 9.781602764303487e-05,
"loss": 0.8959,
"num_input_tokens_seen": 5555704,
"step": 475
},
{
"epoch": 0.24723152201905743,
"grad_norm": 0.20928586231326682,
"learning_rate": 9.774438273818911e-05,
"loss": 0.901,
"num_input_tokens_seen": 5614160,
"step": 480
},
{
"epoch": 0.24980685037342262,
"grad_norm": 0.34365307116824517,
"learning_rate": 9.767160867164979e-05,
"loss": 0.9008,
"num_input_tokens_seen": 5672640,
"step": 485
},
{
"epoch": 0.2523821787277878,
"grad_norm": 0.4212274243028996,
"learning_rate": 9.759770716453436e-05,
"loss": 0.9016,
"num_input_tokens_seen": 5731072,
"step": 490
},
{
"epoch": 0.254957507082153,
"grad_norm": 0.39823625576558597,
"learning_rate": 9.752267996462434e-05,
"loss": 0.9132,
"num_input_tokens_seen": 5789544,
"step": 495
},
{
"epoch": 0.25753283543651817,
"grad_norm": 0.24856324117583653,
"learning_rate": 9.744652884632406e-05,
"loss": 0.8962,
"num_input_tokens_seen": 5848048,
"step": 500
},
{
"epoch": 0.25753283543651817,
"eval_loss": 0.8987945914268494,
"eval_runtime": 17.1622,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 0.874,
"num_input_tokens_seen": 5848048,
"step": 500
},
{
"epoch": 0.26010816379088336,
"grad_norm": 0.25461397268106634,
"learning_rate": 9.736925561061871e-05,
"loss": 0.8954,
"num_input_tokens_seen": 5906512,
"step": 505
},
{
"epoch": 0.2626834921452485,
"grad_norm": 0.38602603275675745,
"learning_rate": 9.729086208503174e-05,
"loss": 0.8927,
"num_input_tokens_seen": 5965024,
"step": 510
},
{
"epoch": 0.2652588204996137,
"grad_norm": 0.150082825225123,
"learning_rate": 9.721135012358156e-05,
"loss": 0.898,
"num_input_tokens_seen": 6023496,
"step": 515
},
{
"epoch": 0.2678341488539789,
"grad_norm": 0.26881662025899655,
"learning_rate": 9.713072160673777e-05,
"loss": 0.9016,
"num_input_tokens_seen": 6082000,
"step": 520
},
{
"epoch": 0.2704094772083441,
"grad_norm": 0.5039123575147229,
"learning_rate": 9.704897844137673e-05,
"loss": 0.8842,
"num_input_tokens_seen": 6140480,
"step": 525
},
{
"epoch": 0.27298480556270927,
"grad_norm": 0.27836945453098666,
"learning_rate": 9.696612256073633e-05,
"loss": 0.8921,
"num_input_tokens_seen": 6198968,
"step": 530
},
{
"epoch": 0.2755601339170744,
"grad_norm": 0.22936338891946384,
"learning_rate": 9.688215592437039e-05,
"loss": 0.8979,
"num_input_tokens_seen": 6257464,
"step": 535
},
{
"epoch": 0.2781354622714396,
"grad_norm": 0.396486857609105,
"learning_rate": 9.679708051810221e-05,
"loss": 0.8951,
"num_input_tokens_seen": 6315944,
"step": 540
},
{
"epoch": 0.2807107906258048,
"grad_norm": 0.4751226662261396,
"learning_rate": 9.67108983539777e-05,
"loss": 0.9149,
"num_input_tokens_seen": 6374408,
"step": 545
},
{
"epoch": 0.28328611898017,
"grad_norm": 0.26829103885131056,
"learning_rate": 9.662361147021779e-05,
"loss": 0.9013,
"num_input_tokens_seen": 6432936,
"step": 550
},
{
"epoch": 0.28328611898017,
"eval_loss": 0.9001271724700928,
"eval_runtime": 16.9878,
"eval_samples_per_second": 3.532,
"eval_steps_per_second": 0.883,
"num_input_tokens_seen": 6432936,
"step": 550
},
{
"epoch": 0.28586144733453517,
"grad_norm": 0.5334970266367584,
"learning_rate": 9.653522193117013e-05,
"loss": 0.8981,
"num_input_tokens_seen": 6491400,
"step": 555
},
{
"epoch": 0.28843677568890036,
"grad_norm": 0.33261202813259866,
"learning_rate": 9.644573182726035e-05,
"loss": 0.9041,
"num_input_tokens_seen": 6549872,
"step": 560
},
{
"epoch": 0.2910121040432655,
"grad_norm": 0.19122862132727417,
"learning_rate": 9.63551432749426e-05,
"loss": 0.9024,
"num_input_tokens_seen": 6608296,
"step": 565
},
{
"epoch": 0.2935874323976307,
"grad_norm": 0.27778009425329764,
"learning_rate": 9.626345841664953e-05,
"loss": 0.9002,
"num_input_tokens_seen": 6666768,
"step": 570
},
{
"epoch": 0.2961627607519959,
"grad_norm": 0.3065314332046026,
"learning_rate": 9.617067942074153e-05,
"loss": 0.9035,
"num_input_tokens_seen": 6725248,
"step": 575
},
{
"epoch": 0.29873808910636107,
"grad_norm": 0.24431496415058412,
"learning_rate": 9.607680848145558e-05,
"loss": 0.9019,
"num_input_tokens_seen": 6783680,
"step": 580
},
{
"epoch": 0.30131341746072626,
"grad_norm": 0.27088193021301504,
"learning_rate": 9.598184781885318e-05,
"loss": 0.9001,
"num_input_tokens_seen": 6842144,
"step": 585
},
{
"epoch": 0.3038887458150914,
"grad_norm": 0.33893098113605125,
"learning_rate": 9.588579967876806e-05,
"loss": 0.8961,
"num_input_tokens_seen": 6900656,
"step": 590
},
{
"epoch": 0.3064640741694566,
"grad_norm": 0.3038921833221806,
"learning_rate": 9.578866633275288e-05,
"loss": 0.9,
"num_input_tokens_seen": 6959128,
"step": 595
},
{
"epoch": 0.3090394025238218,
"grad_norm": 0.48929637235055645,
"learning_rate": 9.569045007802559e-05,
"loss": 0.9046,
"num_input_tokens_seen": 7017576,
"step": 600
},
{
"epoch": 0.3090394025238218,
"eval_loss": 0.9053278565406799,
"eval_runtime": 17.1218,
"eval_samples_per_second": 3.504,
"eval_steps_per_second": 0.876,
"num_input_tokens_seen": 7017576,
"step": 600
},
{
"epoch": 0.311614730878187,
"grad_norm": 0.3545950949033049,
"learning_rate": 9.55911532374151e-05,
"loss": 0.9019,
"num_input_tokens_seen": 7076032,
"step": 605
},
{
"epoch": 0.31419005923255217,
"grad_norm": 0.2355627006333952,
"learning_rate": 9.549077815930636e-05,
"loss": 0.8956,
"num_input_tokens_seen": 7134536,
"step": 610
},
{
"epoch": 0.31676538758691736,
"grad_norm": 0.17552483625655946,
"learning_rate": 9.538932721758474e-05,
"loss": 0.898,
"num_input_tokens_seen": 7193032,
"step": 615
},
{
"epoch": 0.3193407159412825,
"grad_norm": 0.1749010635522076,
"learning_rate": 9.528680281157999e-05,
"loss": 0.8991,
"num_input_tokens_seen": 7251568,
"step": 620
},
{
"epoch": 0.3219160442956477,
"grad_norm": 0.19885182954224315,
"learning_rate": 9.518320736600943e-05,
"loss": 0.8961,
"num_input_tokens_seen": 7310072,
"step": 625
},
{
"epoch": 0.3244913726500129,
"grad_norm": 0.4778756508206831,
"learning_rate": 9.507854333092063e-05,
"loss": 0.8994,
"num_input_tokens_seen": 7368560,
"step": 630
},
{
"epoch": 0.32706670100437807,
"grad_norm": 0.4123272743887767,
"learning_rate": 9.497281318163346e-05,
"loss": 0.8925,
"num_input_tokens_seen": 7427040,
"step": 635
},
{
"epoch": 0.32964202935874326,
"grad_norm": 0.34409942667705734,
"learning_rate": 9.486601941868154e-05,
"loss": 0.9087,
"num_input_tokens_seen": 7485552,
"step": 640
},
{
"epoch": 0.3322173577131084,
"grad_norm": 0.43327107411223276,
"learning_rate": 9.475816456775313e-05,
"loss": 0.8924,
"num_input_tokens_seen": 7544040,
"step": 645
},
{
"epoch": 0.3347926860674736,
"grad_norm": 0.6643023904352003,
"learning_rate": 9.464925117963133e-05,
"loss": 0.904,
"num_input_tokens_seen": 7602512,
"step": 650
},
{
"epoch": 0.3347926860674736,
"eval_loss": 0.90328449010849,
"eval_runtime": 16.1444,
"eval_samples_per_second": 3.716,
"eval_steps_per_second": 0.929,
"num_input_tokens_seen": 7602512,
"step": 650
},
{
"epoch": 0.3373680144218388,
"grad_norm": 0.620349194493935,
"learning_rate": 9.453928183013385e-05,
"loss": 0.8929,
"num_input_tokens_seen": 7660968,
"step": 655
},
{
"epoch": 0.33994334277620397,
"grad_norm": 0.18611846349930314,
"learning_rate": 9.442825912005202e-05,
"loss": 0.9078,
"num_input_tokens_seen": 7719448,
"step": 660
},
{
"epoch": 0.34251867113056916,
"grad_norm": 0.4448289413172567,
"learning_rate": 9.431618567508933e-05,
"loss": 0.8963,
"num_input_tokens_seen": 7777928,
"step": 665
},
{
"epoch": 0.34509399948493436,
"grad_norm": 0.6187189362250411,
"learning_rate": 9.420306414579925e-05,
"loss": 0.9134,
"num_input_tokens_seen": 7836424,
"step": 670
},
{
"epoch": 0.3476693278392995,
"grad_norm": 0.35247743418537675,
"learning_rate": 9.408889720752266e-05,
"loss": 0.8984,
"num_input_tokens_seen": 7894904,
"step": 675
},
{
"epoch": 0.3502446561936647,
"grad_norm": 0.20652916455346712,
"learning_rate": 9.397368756032445e-05,
"loss": 0.8997,
"num_input_tokens_seen": 7953432,
"step": 680
},
{
"epoch": 0.3528199845480299,
"grad_norm": 0.4289996063998063,
"learning_rate": 9.385743792892982e-05,
"loss": 0.8926,
"num_input_tokens_seen": 8011888,
"step": 685
},
{
"epoch": 0.35539531290239507,
"grad_norm": 0.13764054506536547,
"learning_rate": 9.374015106265968e-05,
"loss": 0.9008,
"num_input_tokens_seen": 8070344,
"step": 690
},
{
"epoch": 0.35797064125676026,
"grad_norm": 0.22142459689499855,
"learning_rate": 9.362182973536569e-05,
"loss": 0.8986,
"num_input_tokens_seen": 8128816,
"step": 695
},
{
"epoch": 0.3605459696111254,
"grad_norm": 0.3234539650829873,
"learning_rate": 9.35024767453647e-05,
"loss": 0.8972,
"num_input_tokens_seen": 8187320,
"step": 700
},
{
"epoch": 0.3605459696111254,
"eval_loss": 0.9028835892677307,
"eval_runtime": 16.1635,
"eval_samples_per_second": 3.712,
"eval_steps_per_second": 0.928,
"num_input_tokens_seen": 8187320,
"step": 700
},
{
"epoch": 0.3631212979654906,
"grad_norm": 0.3215674690491891,
"learning_rate": 9.338209491537257e-05,
"loss": 0.8998,
"num_input_tokens_seen": 8245776,
"step": 705
},
{
"epoch": 0.3656966263198558,
"grad_norm": 0.36428692362396536,
"learning_rate": 9.326068709243727e-05,
"loss": 0.8999,
"num_input_tokens_seen": 8304280,
"step": 710
},
{
"epoch": 0.36827195467422097,
"grad_norm": 0.280459809393624,
"learning_rate": 9.313825614787177e-05,
"loss": 0.8983,
"num_input_tokens_seen": 8362728,
"step": 715
},
{
"epoch": 0.37084728302858616,
"grad_norm": 0.1819339731162554,
"learning_rate": 9.301480497718593e-05,
"loss": 0.892,
"num_input_tokens_seen": 8421224,
"step": 720
},
{
"epoch": 0.37342261138295135,
"grad_norm": 0.23784840563699303,
"learning_rate": 9.289033650001817e-05,
"loss": 0.9034,
"num_input_tokens_seen": 8479720,
"step": 725
},
{
"epoch": 0.3759979397373165,
"grad_norm": 0.24070744588741375,
"learning_rate": 9.276485366006634e-05,
"loss": 0.895,
"num_input_tokens_seen": 8538192,
"step": 730
},
{
"epoch": 0.3785732680916817,
"grad_norm": 0.24846723619231478,
"learning_rate": 9.263835942501807e-05,
"loss": 0.8973,
"num_input_tokens_seen": 8596664,
"step": 735
},
{
"epoch": 0.3811485964460469,
"grad_norm": 0.2601614440419362,
"learning_rate": 9.251085678648072e-05,
"loss": 0.8972,
"num_input_tokens_seen": 8655128,
"step": 740
},
{
"epoch": 0.38372392480041206,
"grad_norm": 0.30194733839751087,
"learning_rate": 9.238234875991046e-05,
"loss": 0.8987,
"num_input_tokens_seen": 8713624,
"step": 745
},
{
"epoch": 0.38629925315477726,
"grad_norm": 0.3015609177439829,
"learning_rate": 9.225283838454111e-05,
"loss": 0.9005,
"num_input_tokens_seen": 8772104,
"step": 750
},
{
"epoch": 0.38629925315477726,
"eval_loss": 0.8981761336326599,
"eval_runtime": 16.0177,
"eval_samples_per_second": 3.746,
"eval_steps_per_second": 0.936,
"num_input_tokens_seen": 8772104,
"step": 750
},
{
"epoch": 0.3888745815091424,
"grad_norm": 0.44991480631292463,
"learning_rate": 9.21223287233121e-05,
"loss": 0.8973,
"num_input_tokens_seen": 8830568,
"step": 755
},
{
"epoch": 0.3914499098635076,
"grad_norm": 0.22570310903133853,
"learning_rate": 9.199082286279622e-05,
"loss": 0.8974,
"num_input_tokens_seen": 8889072,
"step": 760
},
{
"epoch": 0.3940252382178728,
"grad_norm": 0.22090133233732026,
"learning_rate": 9.185832391312644e-05,
"loss": 0.8985,
"num_input_tokens_seen": 8947568,
"step": 765
},
{
"epoch": 0.39660056657223797,
"grad_norm": 0.23738058530347297,
"learning_rate": 9.172483500792244e-05,
"loss": 0.8935,
"num_input_tokens_seen": 9006056,
"step": 770
},
{
"epoch": 0.39917589492660316,
"grad_norm": 0.41232659301572594,
"learning_rate": 9.159035930421658e-05,
"loss": 0.8985,
"num_input_tokens_seen": 9064592,
"step": 775
},
{
"epoch": 0.40175122328096835,
"grad_norm": 0.2004855543001356,
"learning_rate": 9.145489998237902e-05,
"loss": 0.9105,
"num_input_tokens_seen": 9123096,
"step": 780
},
{
"epoch": 0.4043265516353335,
"grad_norm": 0.16209487510237375,
"learning_rate": 9.131846024604274e-05,
"loss": 0.8925,
"num_input_tokens_seen": 9181576,
"step": 785
},
{
"epoch": 0.4069018799896987,
"grad_norm": 0.24319930530142153,
"learning_rate": 9.11810433220276e-05,
"loss": 0.8955,
"num_input_tokens_seen": 9240048,
"step": 790
},
{
"epoch": 0.40947720834406387,
"grad_norm": 0.24311562892750557,
"learning_rate": 9.104265246026415e-05,
"loss": 0.8986,
"num_input_tokens_seen": 9298528,
"step": 795
},
{
"epoch": 0.41205253669842906,
"grad_norm": 0.2891177185942039,
"learning_rate": 9.090329093371666e-05,
"loss": 0.8881,
"num_input_tokens_seen": 9357016,
"step": 800
},
{
"epoch": 0.41205253669842906,
"eval_loss": 0.8973079919815063,
"eval_runtime": 16.1396,
"eval_samples_per_second": 3.718,
"eval_steps_per_second": 0.929,
"num_input_tokens_seen": 9357016,
"step": 800
},
{
"epoch": 0.41462786505279425,
"grad_norm": 0.4728970278357675,
"learning_rate": 9.076296203830579e-05,
"loss": 0.8798,
"num_input_tokens_seen": 9415480,
"step": 805
},
{
"epoch": 0.4172031934071594,
"grad_norm": 0.2420351489416807,
"learning_rate": 9.062166909283062e-05,
"loss": 0.9104,
"num_input_tokens_seen": 9473928,
"step": 810
},
{
"epoch": 0.4197785217615246,
"grad_norm": 0.2262623911682871,
"learning_rate": 9.047941543889014e-05,
"loss": 0.9007,
"num_input_tokens_seen": 9532408,
"step": 815
},
{
"epoch": 0.4223538501158898,
"grad_norm": 0.18258980329217392,
"learning_rate": 9.033620444080428e-05,
"loss": 0.8974,
"num_input_tokens_seen": 9590920,
"step": 820
},
{
"epoch": 0.42492917847025496,
"grad_norm": 0.2898762949979446,
"learning_rate": 9.019203948553422e-05,
"loss": 0.8992,
"num_input_tokens_seen": 9649400,
"step": 825
},
{
"epoch": 0.42750450682462016,
"grad_norm": 0.3884592601874919,
"learning_rate": 9.004692398260244e-05,
"loss": 0.8991,
"num_input_tokens_seen": 9707888,
"step": 830
},
{
"epoch": 0.43007983517898535,
"grad_norm": 0.24055719869667014,
"learning_rate": 8.9900861364012e-05,
"loss": 0.8964,
"num_input_tokens_seen": 9766384,
"step": 835
},
{
"epoch": 0.4326551635333505,
"grad_norm": 0.4482774361285702,
"learning_rate": 8.975385508416532e-05,
"loss": 0.8723,
"num_input_tokens_seen": 9824896,
"step": 840
},
{
"epoch": 0.4352304918877157,
"grad_norm": 0.4612030185875055,
"learning_rate": 8.960590861978265e-05,
"loss": 0.874,
"num_input_tokens_seen": 9883408,
"step": 845
},
{
"epoch": 0.43780582024208087,
"grad_norm": 0.44197834194509644,
"learning_rate": 8.945702546981969e-05,
"loss": 0.9035,
"num_input_tokens_seen": 9941896,
"step": 850
},
{
"epoch": 0.43780582024208087,
"eval_loss": 0.8779178261756897,
"eval_runtime": 16.159,
"eval_samples_per_second": 3.713,
"eval_steps_per_second": 0.928,
"num_input_tokens_seen": 9941896,
"step": 850
},
{
"epoch": 0.44038114859644606,
"grad_norm": 0.8207188524660312,
"learning_rate": 8.930720915538487e-05,
"loss": 0.8516,
"num_input_tokens_seen": 10000336,
"step": 855
},
{
"epoch": 0.44295647695081125,
"grad_norm": 1.5881804699369033,
"learning_rate": 8.915646321965614e-05,
"loss": 0.9206,
"num_input_tokens_seen": 10058816,
"step": 860
},
{
"epoch": 0.4455318053051764,
"grad_norm": 0.3364043503653687,
"learning_rate": 8.900479122779712e-05,
"loss": 0.9028,
"num_input_tokens_seen": 10117320,
"step": 865
},
{
"epoch": 0.4481071336595416,
"grad_norm": 0.2888069815557639,
"learning_rate": 8.885219676687277e-05,
"loss": 0.8991,
"num_input_tokens_seen": 10175824,
"step": 870
},
{
"epoch": 0.45068246201390677,
"grad_norm": 0.26081919755231314,
"learning_rate": 8.869868344576459e-05,
"loss": 0.8934,
"num_input_tokens_seen": 10234288,
"step": 875
},
{
"epoch": 0.45325779036827196,
"grad_norm": 0.1672074260476841,
"learning_rate": 8.854425489508532e-05,
"loss": 0.8908,
"num_input_tokens_seen": 10292736,
"step": 880
},
{
"epoch": 0.45583311872263715,
"grad_norm": 0.3141498425127344,
"learning_rate": 8.838891476709288e-05,
"loss": 0.8988,
"num_input_tokens_seen": 10351224,
"step": 885
},
{
"epoch": 0.4584084470770023,
"grad_norm": 0.28442383194638554,
"learning_rate": 8.823266673560426e-05,
"loss": 0.8965,
"num_input_tokens_seen": 10409736,
"step": 890
},
{
"epoch": 0.4609837754313675,
"grad_norm": 0.24793143025843287,
"learning_rate": 8.807551449590846e-05,
"loss": 0.8989,
"num_input_tokens_seen": 10468240,
"step": 895
},
{
"epoch": 0.4635591037857327,
"grad_norm": 0.18173090045802157,
"learning_rate": 8.791746176467907e-05,
"loss": 0.8961,
"num_input_tokens_seen": 10526712,
"step": 900
},
{
"epoch": 0.4635591037857327,
"eval_loss": 0.891426146030426,
"eval_runtime": 16.0357,
"eval_samples_per_second": 3.742,
"eval_steps_per_second": 0.935,
"num_input_tokens_seen": 10526712,
"step": 900
},
{
"epoch": 0.46613443214009787,
"grad_norm": 0.18755280770432675,
"learning_rate": 8.775851227988656e-05,
"loss": 0.8955,
"num_input_tokens_seen": 10585232,
"step": 905
},
{
"epoch": 0.46870976049446306,
"grad_norm": 0.16684040416821233,
"learning_rate": 8.759866980070963e-05,
"loss": 0.8951,
"num_input_tokens_seen": 10643728,
"step": 910
},
{
"epoch": 0.47128508884882825,
"grad_norm": 0.33346521793095785,
"learning_rate": 8.743793810744654e-05,
"loss": 0.8951,
"num_input_tokens_seen": 10702240,
"step": 915
},
{
"epoch": 0.4738604172031934,
"grad_norm": 0.23650054707790025,
"learning_rate": 8.727632100142551e-05,
"loss": 0.9066,
"num_input_tokens_seen": 10760656,
"step": 920
},
{
"epoch": 0.4764357455575586,
"grad_norm": 0.20217442955339224,
"learning_rate": 8.711382230491493e-05,
"loss": 0.8953,
"num_input_tokens_seen": 10819128,
"step": 925
},
{
"epoch": 0.47901107391192377,
"grad_norm": 0.1648307621403396,
"learning_rate": 8.695044586103296e-05,
"loss": 0.8961,
"num_input_tokens_seen": 10877600,
"step": 930
},
{
"epoch": 0.48158640226628896,
"grad_norm": 0.25983065938238986,
"learning_rate": 8.678619553365659e-05,
"loss": 0.8965,
"num_input_tokens_seen": 10936088,
"step": 935
},
{
"epoch": 0.48416173062065415,
"grad_norm": 0.17882463002474594,
"learning_rate": 8.662107520733027e-05,
"loss": 0.9018,
"num_input_tokens_seen": 10994560,
"step": 940
},
{
"epoch": 0.4867370589750193,
"grad_norm": 0.14644012846994445,
"learning_rate": 8.64550887871741e-05,
"loss": 0.8944,
"num_input_tokens_seen": 11053016,
"step": 945
},
{
"epoch": 0.4893123873293845,
"grad_norm": 0.23751630760966444,
"learning_rate": 8.628824019879137e-05,
"loss": 0.8852,
"num_input_tokens_seen": 11111520,
"step": 950
},
{
"epoch": 0.4893123873293845,
"eval_loss": 0.8915690183639526,
"eval_runtime": 16.2589,
"eval_samples_per_second": 3.69,
"eval_steps_per_second": 0.923,
"num_input_tokens_seen": 11111520,
"step": 950
},
{
"epoch": 0.49188771568374967,
"grad_norm": 0.3904846319143667,
"learning_rate": 8.612053338817581e-05,
"loss": 0.9087,
"num_input_tokens_seen": 11170016,
"step": 955
},
{
"epoch": 0.49446304403811486,
"grad_norm": 0.44920450892911645,
"learning_rate": 8.595197232161824e-05,
"loss": 0.8915,
"num_input_tokens_seen": 11228496,
"step": 960
},
{
"epoch": 0.49703837239248005,
"grad_norm": 0.6093857047738649,
"learning_rate": 8.578256098561275e-05,
"loss": 0.8836,
"num_input_tokens_seen": 11286928,
"step": 965
},
{
"epoch": 0.49961370074684525,
"grad_norm": 0.6282945106836194,
"learning_rate": 8.561230338676239e-05,
"loss": 0.9116,
"num_input_tokens_seen": 11345400,
"step": 970
},
{
"epoch": 0.5021890291012104,
"grad_norm": 0.3187294296147391,
"learning_rate": 8.544120355168451e-05,
"loss": 0.8809,
"num_input_tokens_seen": 11403912,
"step": 975
},
{
"epoch": 0.5047643574555756,
"grad_norm": 0.4019889420836467,
"learning_rate": 8.526926552691544e-05,
"loss": 0.8895,
"num_input_tokens_seen": 11462344,
"step": 980
},
{
"epoch": 0.5073396858099408,
"grad_norm": 0.4762279449607594,
"learning_rate": 8.509649337881483e-05,
"loss": 0.8674,
"num_input_tokens_seen": 11520808,
"step": 985
},
{
"epoch": 0.509915014164306,
"grad_norm": 1.7062273050040726,
"learning_rate": 8.492289119346943e-05,
"loss": 0.8832,
"num_input_tokens_seen": 11579248,
"step": 990
},
{
"epoch": 0.5124903425186711,
"grad_norm": 0.7896696939552226,
"learning_rate": 8.474846307659658e-05,
"loss": 0.8581,
"num_input_tokens_seen": 11637712,
"step": 995
},
{
"epoch": 0.5150656708730363,
"grad_norm": 0.9287129351980297,
"learning_rate": 8.457321315344694e-05,
"loss": 0.8635,
"num_input_tokens_seen": 11696200,
"step": 1000
},
{
"epoch": 0.5150656708730363,
"eval_loss": 0.860200047492981,
"eval_runtime": 16.1196,
"eval_samples_per_second": 3.722,
"eval_steps_per_second": 0.931,
"num_input_tokens_seen": 11696200,
"step": 1000
},
{
"epoch": 0.5176409992274015,
"grad_norm": 0.9492829276877938,
"learning_rate": 8.439714556870704e-05,
"loss": 0.8499,
"num_input_tokens_seen": 11754720,
"step": 1005
},
{
"epoch": 0.5202163275817667,
"grad_norm": 1.57473364910246,
"learning_rate": 8.422026448640124e-05,
"loss": 0.8556,
"num_input_tokens_seen": 11813216,
"step": 1010
},
{
"epoch": 0.5227916559361319,
"grad_norm": 0.6562994819534732,
"learning_rate": 8.40425740897932e-05,
"loss": 0.8533,
"num_input_tokens_seen": 11871712,
"step": 1015
},
{
"epoch": 0.525366984290497,
"grad_norm": 0.5420643724864006,
"learning_rate": 8.386407858128706e-05,
"loss": 0.8921,
"num_input_tokens_seen": 11930200,
"step": 1020
},
{
"epoch": 0.5279423126448622,
"grad_norm": 0.4900953324933905,
"learning_rate": 8.368478218232787e-05,
"loss": 0.8815,
"num_input_tokens_seen": 11988704,
"step": 1025
},
{
"epoch": 0.5305176409992274,
"grad_norm": 0.46534021808416004,
"learning_rate": 8.350468913330192e-05,
"loss": 0.854,
"num_input_tokens_seen": 12047176,
"step": 1030
},
{
"epoch": 0.5330929693535926,
"grad_norm": 0.6739669998528043,
"learning_rate": 8.33238036934364e-05,
"loss": 0.8642,
"num_input_tokens_seen": 12105680,
"step": 1035
},
{
"epoch": 0.5356682977079578,
"grad_norm": 1.100337259258234,
"learning_rate": 8.31421301406986e-05,
"loss": 0.8072,
"num_input_tokens_seen": 12164208,
"step": 1040
},
{
"epoch": 0.5382436260623229,
"grad_norm": 1.2731858488127639,
"learning_rate": 8.29596727716949e-05,
"loss": 0.8532,
"num_input_tokens_seen": 12222672,
"step": 1045
},
{
"epoch": 0.5408189544166881,
"grad_norm": 0.8686963016555517,
"learning_rate": 8.277643590156894e-05,
"loss": 0.8844,
"num_input_tokens_seen": 12281072,
"step": 1050
},
{
"epoch": 0.5408189544166881,
"eval_loss": 0.8446129560470581,
"eval_runtime": 16.0508,
"eval_samples_per_second": 3.738,
"eval_steps_per_second": 0.935,
"num_input_tokens_seen": 12281072,
"step": 1050
},
{
"epoch": 0.5433942827710533,
"grad_norm": 0.5518554447099218,
"learning_rate": 8.259242386389973e-05,
"loss": 0.8602,
"num_input_tokens_seen": 12339544,
"step": 1055
},
{
"epoch": 0.5459696111254185,
"grad_norm": 0.7300911438509382,
"learning_rate": 8.240764101059912e-05,
"loss": 0.8615,
"num_input_tokens_seen": 12397992,
"step": 1060
},
{
"epoch": 0.5485449394797837,
"grad_norm": 0.7364983085887583,
"learning_rate": 8.222209171180883e-05,
"loss": 0.8732,
"num_input_tokens_seen": 12456480,
"step": 1065
},
{
"epoch": 0.5511202678341488,
"grad_norm": 0.4840408774949972,
"learning_rate": 8.203578035579715e-05,
"loss": 0.8691,
"num_input_tokens_seen": 12515000,
"step": 1070
},
{
"epoch": 0.553695596188514,
"grad_norm": 0.516278691776577,
"learning_rate": 8.184871134885513e-05,
"loss": 0.8544,
"num_input_tokens_seen": 12573504,
"step": 1075
},
{
"epoch": 0.5562709245428792,
"grad_norm": 0.8626943002609527,
"learning_rate": 8.166088911519235e-05,
"loss": 0.8501,
"num_input_tokens_seen": 12632008,
"step": 1080
},
{
"epoch": 0.5588462528972444,
"grad_norm": 0.7409465187036862,
"learning_rate": 8.147231809683236e-05,
"loss": 0.8646,
"num_input_tokens_seen": 12690520,
"step": 1085
},
{
"epoch": 0.5614215812516096,
"grad_norm": 0.5736639247313171,
"learning_rate": 8.128300275350756e-05,
"loss": 0.8327,
"num_input_tokens_seen": 12749032,
"step": 1090
},
{
"epoch": 0.5639969096059748,
"grad_norm": 0.7720514157947642,
"learning_rate": 8.109294756255375e-05,
"loss": 0.8218,
"num_input_tokens_seen": 12807504,
"step": 1095
},
{
"epoch": 0.56657223796034,
"grad_norm": 0.9129011996506371,
"learning_rate": 8.090215701880419e-05,
"loss": 0.8427,
"num_input_tokens_seen": 12865992,
"step": 1100
},
{
"epoch": 0.56657223796034,
"eval_loss": 0.7743102312088013,
"eval_runtime": 16.1034,
"eval_samples_per_second": 3.726,
"eval_steps_per_second": 0.931,
"num_input_tokens_seen": 12865992,
"step": 1100
},
{
"epoch": 0.5691475663147051,
"grad_norm": 1.6435842633079423,
"learning_rate": 8.07106356344834e-05,
"loss": 0.8335,
"num_input_tokens_seen": 12924448,
"step": 1105
},
{
"epoch": 0.5717228946690703,
"grad_norm": 1.2281943545237959,
"learning_rate": 8.051838793910038e-05,
"loss": 0.8267,
"num_input_tokens_seen": 12982912,
"step": 1110
},
{
"epoch": 0.5742982230234355,
"grad_norm": 1.4138823100284208,
"learning_rate": 8.032541847934146e-05,
"loss": 0.8866,
"num_input_tokens_seen": 13041424,
"step": 1115
},
{
"epoch": 0.5768735513778007,
"grad_norm": 0.6515311059204204,
"learning_rate": 8.013173181896283e-05,
"loss": 0.8446,
"num_input_tokens_seen": 13099888,
"step": 1120
},
{
"epoch": 0.5794488797321659,
"grad_norm": 0.7537544303655812,
"learning_rate": 7.993733253868256e-05,
"loss": 0.8176,
"num_input_tokens_seen": 13158344,
"step": 1125
},
{
"epoch": 0.582024208086531,
"grad_norm": 1.3613777296967222,
"learning_rate": 7.974222523607236e-05,
"loss": 0.8138,
"num_input_tokens_seen": 13216840,
"step": 1130
},
{
"epoch": 0.5845995364408962,
"grad_norm": 0.6640843445520798,
"learning_rate": 7.954641452544865e-05,
"loss": 0.8204,
"num_input_tokens_seen": 13275328,
"step": 1135
},
{
"epoch": 0.5871748647952614,
"grad_norm": 0.6917895597906035,
"learning_rate": 7.934990503776363e-05,
"loss": 0.8485,
"num_input_tokens_seen": 13333784,
"step": 1140
},
{
"epoch": 0.5897501931496266,
"grad_norm": 0.45542718993625547,
"learning_rate": 7.915270142049566e-05,
"loss": 0.8191,
"num_input_tokens_seen": 13392280,
"step": 1145
},
{
"epoch": 0.5923255215039918,
"grad_norm": 0.618954778582039,
"learning_rate": 7.89548083375394e-05,
"loss": 0.8185,
"num_input_tokens_seen": 13450720,
"step": 1150
},
{
"epoch": 0.5923255215039918,
"eval_loss": 0.7827339768409729,
"eval_runtime": 16.0127,
"eval_samples_per_second": 3.747,
"eval_steps_per_second": 0.937,
"num_input_tokens_seen": 13450720,
"step": 1150
},
{
"epoch": 0.5949008498583569,
"grad_norm": 1.5827740829243289,
"learning_rate": 7.875623046909544e-05,
"loss": 0.8168,
"num_input_tokens_seen": 13509200,
"step": 1155
},
{
"epoch": 0.5974761782127221,
"grad_norm": 2.344942216339615,
"learning_rate": 7.855697251155967e-05,
"loss": 0.7749,
"num_input_tokens_seen": 13567656,
"step": 1160
},
{
"epoch": 0.6000515065670873,
"grad_norm": 2.7313469239045305,
"learning_rate": 7.835703917741212e-05,
"loss": 0.9132,
"num_input_tokens_seen": 13626136,
"step": 1165
},
{
"epoch": 0.6026268349214525,
"grad_norm": 0.7410043911446527,
"learning_rate": 7.81564351951057e-05,
"loss": 0.8308,
"num_input_tokens_seen": 13684608,
"step": 1170
},
{
"epoch": 0.6052021632758177,
"grad_norm": 0.5628590604115411,
"learning_rate": 7.795516530895414e-05,
"loss": 0.8011,
"num_input_tokens_seen": 13743080,
"step": 1175
},
{
"epoch": 0.6077774916301828,
"grad_norm": 1.2008934424824649,
"learning_rate": 7.775323427901993e-05,
"loss": 0.8309,
"num_input_tokens_seen": 13801552,
"step": 1180
},
{
"epoch": 0.610352819984548,
"grad_norm": 1.2914156288367256,
"learning_rate": 7.755064688100171e-05,
"loss": 0.8089,
"num_input_tokens_seen": 13860064,
"step": 1185
},
{
"epoch": 0.6129281483389132,
"grad_norm": 1.420806774436513,
"learning_rate": 7.734740790612136e-05,
"loss": 0.8089,
"num_input_tokens_seen": 13918552,
"step": 1190
},
{
"epoch": 0.6155034766932784,
"grad_norm": 0.8352922832465102,
"learning_rate": 7.714352216101055e-05,
"loss": 0.8511,
"num_input_tokens_seen": 13977056,
"step": 1195
},
{
"epoch": 0.6180788050476436,
"grad_norm": 0.6321587989106885,
"learning_rate": 7.693899446759727e-05,
"loss": 0.8061,
"num_input_tokens_seen": 14035544,
"step": 1200
},
{
"epoch": 0.6180788050476436,
"eval_loss": 0.7593821287155151,
"eval_runtime": 16.1368,
"eval_samples_per_second": 3.718,
"eval_steps_per_second": 0.93,
"num_input_tokens_seen": 14035544,
"step": 1200
},
{
"epoch": 0.6206541334020087,
"grad_norm": 1.0526811295206564,
"learning_rate": 7.673382966299163e-05,
"loss": 0.7871,
"num_input_tokens_seen": 14094024,
"step": 1205
},
{
"epoch": 0.623229461756374,
"grad_norm": 1.832697637344859,
"learning_rate": 7.65280325993715e-05,
"loss": 0.7594,
"num_input_tokens_seen": 14152504,
"step": 1210
},
{
"epoch": 0.6258047901107391,
"grad_norm": 1.6875031192331054,
"learning_rate": 7.63216081438678e-05,
"loss": 0.7833,
"num_input_tokens_seen": 14210992,
"step": 1215
},
{
"epoch": 0.6283801184651043,
"grad_norm": 1.867117238207419,
"learning_rate": 7.611456117844934e-05,
"loss": 0.8445,
"num_input_tokens_seen": 14269488,
"step": 1220
},
{
"epoch": 0.6309554468194695,
"grad_norm": 0.9089614634143406,
"learning_rate": 7.59068965998074e-05,
"loss": 0.7857,
"num_input_tokens_seen": 14327968,
"step": 1225
},
{
"epoch": 0.6335307751738347,
"grad_norm": 2.3911537408111214,
"learning_rate": 7.569861931923989e-05,
"loss": 0.8064,
"num_input_tokens_seen": 14386448,
"step": 1230
},
{
"epoch": 0.6361061035281999,
"grad_norm": 1.6500224851295993,
"learning_rate": 7.548973426253521e-05,
"loss": 0.7117,
"num_input_tokens_seen": 14444912,
"step": 1235
},
{
"epoch": 0.638681431882565,
"grad_norm": 1.508924461189316,
"learning_rate": 7.528024636985575e-05,
"loss": 0.7449,
"num_input_tokens_seen": 14503392,
"step": 1240
},
{
"epoch": 0.6412567602369302,
"grad_norm": 1.3801142620835953,
"learning_rate": 7.507016059562107e-05,
"loss": 0.7507,
"num_input_tokens_seen": 14561872,
"step": 1245
},
{
"epoch": 0.6438320885912954,
"grad_norm": 1.2994701535106117,
"learning_rate": 7.485948190839077e-05,
"loss": 0.7917,
"num_input_tokens_seen": 14620336,
"step": 1250
},
{
"epoch": 0.6438320885912954,
"eval_loss": 0.7407085299491882,
"eval_runtime": 16.1168,
"eval_samples_per_second": 3.723,
"eval_steps_per_second": 0.931,
"num_input_tokens_seen": 14620336,
"step": 1250
},
{
"epoch": 0.6464074169456606,
"grad_norm": 0.9491399909407985,
"learning_rate": 7.464821529074679e-05,
"loss": 0.7763,
"num_input_tokens_seen": 14678792,
"step": 1255
},
{
"epoch": 0.6489827453000258,
"grad_norm": 1.1671149163333951,
"learning_rate": 7.443636573917585e-05,
"loss": 0.7979,
"num_input_tokens_seen": 14737272,
"step": 1260
},
{
"epoch": 0.6515580736543909,
"grad_norm": 1.4992002601057717,
"learning_rate": 7.422393826395108e-05,
"loss": 0.7883,
"num_input_tokens_seen": 14795784,
"step": 1265
},
{
"epoch": 0.6541334020087561,
"grad_norm": 1.2009664113851044,
"learning_rate": 7.40109378890136e-05,
"loss": 0.7183,
"num_input_tokens_seen": 14854272,
"step": 1270
},
{
"epoch": 0.6567087303631213,
"grad_norm": 1.5312778776593978,
"learning_rate": 7.379736965185368e-05,
"loss": 0.762,
"num_input_tokens_seen": 14912720,
"step": 1275
},
{
"epoch": 0.6592840587174865,
"grad_norm": 1.443384734396678,
"learning_rate": 7.358323860339165e-05,
"loss": 0.7951,
"num_input_tokens_seen": 14971192,
"step": 1280
},
{
"epoch": 0.6618593870718517,
"grad_norm": 1.3546652337943146,
"learning_rate": 7.336854980785839e-05,
"loss": 0.7528,
"num_input_tokens_seen": 15029656,
"step": 1285
},
{
"epoch": 0.6644347154262168,
"grad_norm": 1.4256460615881865,
"learning_rate": 7.315330834267553e-05,
"loss": 0.7633,
"num_input_tokens_seen": 15088144,
"step": 1290
},
{
"epoch": 0.667010043780582,
"grad_norm": 1.325772407306303,
"learning_rate": 7.293751929833553e-05,
"loss": 0.7443,
"num_input_tokens_seen": 15146600,
"step": 1295
},
{
"epoch": 0.6695853721349472,
"grad_norm": 2.727997344637842,
"learning_rate": 7.272118777828108e-05,
"loss": 0.7724,
"num_input_tokens_seen": 15205064,
"step": 1300
},
{
"epoch": 0.6695853721349472,
"eval_loss": 0.7189856171607971,
"eval_runtime": 16.0307,
"eval_samples_per_second": 3.743,
"eval_steps_per_second": 0.936,
"num_input_tokens_seen": 15205064,
"step": 1300
},
{
"epoch": 0.6721607004893124,
"grad_norm": 2.6154468701895066,
"learning_rate": 7.250431889878455e-05,
"loss": 0.7524,
"num_input_tokens_seen": 15263560,
"step": 1305
},
{
"epoch": 0.6747360288436776,
"grad_norm": 1.9549500311782502,
"learning_rate": 7.228691778882693e-05,
"loss": 0.6748,
"num_input_tokens_seen": 15322016,
"step": 1310
},
{
"epoch": 0.6773113571980427,
"grad_norm": 2.991178206089954,
"learning_rate": 7.20689895899765e-05,
"loss": 0.7571,
"num_input_tokens_seen": 15380504,
"step": 1315
},
{
"epoch": 0.6798866855524079,
"grad_norm": 1.7022848080804835,
"learning_rate": 7.185053945626733e-05,
"loss": 0.6615,
"num_input_tokens_seen": 15438944,
"step": 1320
},
{
"epoch": 0.6824620139067731,
"grad_norm": 1.739259284519112,
"learning_rate": 7.163157255407732e-05,
"loss": 0.7421,
"num_input_tokens_seen": 15497384,
"step": 1325
},
{
"epoch": 0.6850373422611383,
"grad_norm": 1.9142982939434143,
"learning_rate": 7.141209406200599e-05,
"loss": 0.7886,
"num_input_tokens_seen": 15555856,
"step": 1330
},
{
"epoch": 0.6876126706155035,
"grad_norm": 1.7562659805497576,
"learning_rate": 7.1192109170752e-05,
"loss": 0.7484,
"num_input_tokens_seen": 15614368,
"step": 1335
},
{
"epoch": 0.6901879989698687,
"grad_norm": 1.7590122465257017,
"learning_rate": 7.097162308299054e-05,
"loss": 0.7086,
"num_input_tokens_seen": 15672864,
"step": 1340
},
{
"epoch": 0.6927633273242338,
"grad_norm": 2.1211445265818845,
"learning_rate": 7.07506410132501e-05,
"loss": 0.7494,
"num_input_tokens_seen": 15731376,
"step": 1345
},
{
"epoch": 0.695338655678599,
"grad_norm": 2.683073565523052,
"learning_rate": 7.052916818778918e-05,
"loss": 0.7278,
"num_input_tokens_seen": 15789848,
"step": 1350
},
{
"epoch": 0.695338655678599,
"eval_loss": 0.712917685508728,
"eval_runtime": 16.0726,
"eval_samples_per_second": 3.733,
"eval_steps_per_second": 0.933,
"num_input_tokens_seen": 15789848,
"step": 1350
},
{
"epoch": 0.6979139840329642,
"grad_norm": 2.128495144345323,
"learning_rate": 7.030720984447279e-05,
"loss": 0.7005,
"num_input_tokens_seen": 15848328,
"step": 1355
},
{
"epoch": 0.7004893123873294,
"grad_norm": 1.9954206386005497,
"learning_rate": 7.008477123264848e-05,
"loss": 0.7406,
"num_input_tokens_seen": 15906824,
"step": 1360
},
{
"epoch": 0.7030646407416946,
"grad_norm": 2.2104679425901397,
"learning_rate": 6.986185761302224e-05,
"loss": 0.73,
"num_input_tokens_seen": 15965312,
"step": 1365
},
{
"epoch": 0.7056399690960597,
"grad_norm": 1.4881688553415275,
"learning_rate": 6.963847425753403e-05,
"loss": 0.7069,
"num_input_tokens_seen": 16023824,
"step": 1370
},
{
"epoch": 0.7082152974504249,
"grad_norm": 1.7307886623214839,
"learning_rate": 6.941462644923318e-05,
"loss": 0.6859,
"num_input_tokens_seen": 16082280,
"step": 1375
},
{
"epoch": 0.7107906258047901,
"grad_norm": 1.996363722225207,
"learning_rate": 6.919031948215335e-05,
"loss": 0.7254,
"num_input_tokens_seen": 16140800,
"step": 1380
},
{
"epoch": 0.7133659541591553,
"grad_norm": 1.9723274395570518,
"learning_rate": 6.896555866118741e-05,
"loss": 0.717,
"num_input_tokens_seen": 16199320,
"step": 1385
},
{
"epoch": 0.7159412825135205,
"grad_norm": 1.741253496639104,
"learning_rate": 6.87403493019619e-05,
"loss": 0.7094,
"num_input_tokens_seen": 16257768,
"step": 1390
},
{
"epoch": 0.7185166108678857,
"grad_norm": 1.6218002074106608,
"learning_rate": 6.851469673071143e-05,
"loss": 0.7862,
"num_input_tokens_seen": 16316264,
"step": 1395
},
{
"epoch": 0.7210919392222508,
"grad_norm": 1.7586707307941614,
"learning_rate": 6.828860628415253e-05,
"loss": 0.7359,
"num_input_tokens_seen": 16374784,
"step": 1400
},
{
"epoch": 0.7210919392222508,
"eval_loss": 0.6643603444099426,
"eval_runtime": 16.1894,
"eval_samples_per_second": 3.706,
"eval_steps_per_second": 0.927,
"num_input_tokens_seen": 16374784,
"step": 1400
},
{
"epoch": 0.723667267576616,
"grad_norm": 2.665622720042704,
"learning_rate": 6.806208330935766e-05,
"loss": 0.706,
"num_input_tokens_seen": 16433288,
"step": 1405
},
{
"epoch": 0.7262425959309812,
"grad_norm": 2.123869663010538,
"learning_rate": 6.783513316362855e-05,
"loss": 0.6714,
"num_input_tokens_seen": 16491784,
"step": 1410
},
{
"epoch": 0.7288179242853464,
"grad_norm": 1.584213945279146,
"learning_rate": 6.760776121436962e-05,
"loss": 0.693,
"num_input_tokens_seen": 16550272,
"step": 1415
},
{
"epoch": 0.7313932526397116,
"grad_norm": 2.2481839233017764,
"learning_rate": 6.737997283896103e-05,
"loss": 0.7005,
"num_input_tokens_seen": 16608704,
"step": 1420
},
{
"epoch": 0.7339685809940767,
"grad_norm": 2.4818230151927643,
"learning_rate": 6.715177342463145e-05,
"loss": 0.6573,
"num_input_tokens_seen": 16667200,
"step": 1425
},
{
"epoch": 0.7365439093484419,
"grad_norm": 2.5398594354263486,
"learning_rate": 6.692316836833065e-05,
"loss": 0.6751,
"num_input_tokens_seen": 16725704,
"step": 1430
},
{
"epoch": 0.7391192377028071,
"grad_norm": 2.7486055345229343,
"learning_rate": 6.6694163076602e-05,
"loss": 0.6173,
"num_input_tokens_seen": 16784192,
"step": 1435
},
{
"epoch": 0.7416945660571723,
"grad_norm": 5.356237563459472,
"learning_rate": 6.646476296545434e-05,
"loss": 0.728,
"num_input_tokens_seen": 16842704,
"step": 1440
},
{
"epoch": 0.7442698944115375,
"grad_norm": 2.088505948846248,
"learning_rate": 6.623497346023418e-05,
"loss": 0.743,
"num_input_tokens_seen": 16901176,
"step": 1445
},
{
"epoch": 0.7468452227659027,
"grad_norm": 2.2198436340262,
"learning_rate": 6.60047999954972e-05,
"loss": 0.6291,
"num_input_tokens_seen": 16959632,
"step": 1450
},
{
"epoch": 0.7468452227659027,
"eval_loss": 0.753077507019043,
"eval_runtime": 16.0383,
"eval_samples_per_second": 3.741,
"eval_steps_per_second": 0.935,
"num_input_tokens_seen": 16959632,
"step": 1450
},
{
"epoch": 0.7494205511202678,
"grad_norm": 1.9571252974715032,
"learning_rate": 6.57742480148798e-05,
"loss": 0.6533,
"num_input_tokens_seen": 17018072,
"step": 1455
},
{
"epoch": 0.751995879474633,
"grad_norm": 3.2075825448529542,
"learning_rate": 6.554332297097031e-05,
"loss": 0.7114,
"num_input_tokens_seen": 17076560,
"step": 1460
},
{
"epoch": 0.7545712078289982,
"grad_norm": 2.0030816579741266,
"learning_rate": 6.53120303251801e-05,
"loss": 0.6568,
"num_input_tokens_seen": 17135016,
"step": 1465
},
{
"epoch": 0.7571465361833634,
"grad_norm": 2.65056436638165,
"learning_rate": 6.508037554761432e-05,
"loss": 0.7016,
"num_input_tokens_seen": 17193496,
"step": 1470
},
{
"epoch": 0.7597218645377286,
"grad_norm": 1.9541651871708403,
"learning_rate": 6.484836411694267e-05,
"loss": 0.6612,
"num_input_tokens_seen": 17251944,
"step": 1475
},
{
"epoch": 0.7622971928920937,
"grad_norm": 3.0540242692558577,
"learning_rate": 6.461600152026965e-05,
"loss": 0.6115,
"num_input_tokens_seen": 17310456,
"step": 1480
},
{
"epoch": 0.7648725212464589,
"grad_norm": 2.796196437541352,
"learning_rate": 6.438329325300499e-05,
"loss": 0.6458,
"num_input_tokens_seen": 17368968,
"step": 1485
},
{
"epoch": 0.7674478496008241,
"grad_norm": 3.1979427976381207,
"learning_rate": 6.415024481873352e-05,
"loss": 0.6434,
"num_input_tokens_seen": 17427424,
"step": 1490
},
{
"epoch": 0.7700231779551893,
"grad_norm": 3.8375601078700203,
"learning_rate": 6.391686172908506e-05,
"loss": 0.5973,
"num_input_tokens_seen": 17485936,
"step": 1495
},
{
"epoch": 0.7725985063095545,
"grad_norm": 2.405705749864128,
"learning_rate": 6.368314950360415e-05,
"loss": 0.6021,
"num_input_tokens_seen": 17544440,
"step": 1500
},
{
"epoch": 0.7725985063095545,
"eval_loss": 0.632923424243927,
"eval_runtime": 16.1038,
"eval_samples_per_second": 3.726,
"eval_steps_per_second": 0.931,
"num_input_tokens_seen": 17544440,
"step": 1500
},
{
"epoch": 0.7751738346639196,
"grad_norm": 2.8519087211521734,
"learning_rate": 6.344911366961934e-05,
"loss": 0.5779,
"num_input_tokens_seen": 17602952,
"step": 1505
},
{
"epoch": 0.7777491630182848,
"grad_norm": 2.861290579940173,
"learning_rate": 6.321475976211266e-05,
"loss": 0.6707,
"num_input_tokens_seen": 17661440,
"step": 1510
},
{
"epoch": 0.78032449137265,
"grad_norm": 3.541365161144121,
"learning_rate": 6.298009332358856e-05,
"loss": 0.6326,
"num_input_tokens_seen": 17719928,
"step": 1515
},
{
"epoch": 0.7828998197270152,
"grad_norm": 2.969962641272996,
"learning_rate": 6.274511990394294e-05,
"loss": 0.6472,
"num_input_tokens_seen": 17778424,
"step": 1520
},
{
"epoch": 0.7854751480813804,
"grad_norm": 2.762063548864621,
"learning_rate": 6.250984506033183e-05,
"loss": 0.6215,
"num_input_tokens_seen": 17836936,
"step": 1525
},
{
"epoch": 0.7880504764357456,
"grad_norm": 3.2198855545004097,
"learning_rate": 6.227427435703997e-05,
"loss": 0.6102,
"num_input_tokens_seen": 17895392,
"step": 1530
},
{
"epoch": 0.7906258047901107,
"grad_norm": 3.846544371420393,
"learning_rate": 6.203841336534924e-05,
"loss": 0.6161,
"num_input_tokens_seen": 17953872,
"step": 1535
},
{
"epoch": 0.7932011331444759,
"grad_norm": 3.811248686105134,
"learning_rate": 6.180226766340688e-05,
"loss": 0.6103,
"num_input_tokens_seen": 18012320,
"step": 1540
},
{
"epoch": 0.7957764614988411,
"grad_norm": 2.9539705466919703,
"learning_rate": 6.156584283609359e-05,
"loss": 0.5791,
"num_input_tokens_seen": 18070792,
"step": 1545
},
{
"epoch": 0.7983517898532063,
"grad_norm": 3.0546686267383283,
"learning_rate": 6.132914447489137e-05,
"loss": 0.667,
"num_input_tokens_seen": 18129304,
"step": 1550
},
{
"epoch": 0.7983517898532063,
"eval_loss": 0.6617516279220581,
"eval_runtime": 16.0333,
"eval_samples_per_second": 3.742,
"eval_steps_per_second": 0.936,
"num_input_tokens_seen": 18129304,
"step": 1550
},
{
"epoch": 0.8009271182075715,
"grad_norm": 2.9735507158511987,
"learning_rate": 6.109217817775139e-05,
"loss": 0.5681,
"num_input_tokens_seen": 18187728,
"step": 1555
},
{
"epoch": 0.8035024465619367,
"grad_norm": 3.6620315644598778,
"learning_rate": 6.085494954896156e-05,
"loss": 0.6292,
"num_input_tokens_seen": 18246192,
"step": 1560
},
{
"epoch": 0.8060777749163018,
"grad_norm": 4.03631122919402,
"learning_rate": 6.061746419901388e-05,
"loss": 0.6512,
"num_input_tokens_seen": 18304632,
"step": 1565
},
{
"epoch": 0.808653103270667,
"grad_norm": 4.0040288177360805,
"learning_rate": 6.0379727744471936e-05,
"loss": 0.5476,
"num_input_tokens_seen": 18363136,
"step": 1570
},
{
"epoch": 0.8112284316250322,
"grad_norm": 3.9448861517599996,
"learning_rate": 6.014174580783794e-05,
"loss": 0.5632,
"num_input_tokens_seen": 18421592,
"step": 1575
},
{
"epoch": 0.8138037599793974,
"grad_norm": 3.8400680048739435,
"learning_rate": 5.990352401741981e-05,
"loss": 0.6225,
"num_input_tokens_seen": 18480104,
"step": 1580
},
{
"epoch": 0.8163790883337626,
"grad_norm": 2.7981339113543284,
"learning_rate": 5.9665068007197976e-05,
"loss": 0.5801,
"num_input_tokens_seen": 18538600,
"step": 1585
},
{
"epoch": 0.8189544166881277,
"grad_norm": 4.290843515697908,
"learning_rate": 5.94263834166923e-05,
"loss": 0.6364,
"num_input_tokens_seen": 18597104,
"step": 1590
},
{
"epoch": 0.8215297450424929,
"grad_norm": 3.9001572117535566,
"learning_rate": 5.918747589082853e-05,
"loss": 0.6088,
"num_input_tokens_seen": 18655584,
"step": 1595
},
{
"epoch": 0.8241050733968581,
"grad_norm": 3.5623412341260363,
"learning_rate": 5.8948351079804875e-05,
"loss": 0.6564,
"num_input_tokens_seen": 18714072,
"step": 1600
},
{
"epoch": 0.8241050733968581,
"eval_loss": 0.6319106221199036,
"eval_runtime": 16.0199,
"eval_samples_per_second": 3.745,
"eval_steps_per_second": 0.936,
"num_input_tokens_seen": 18714072,
"step": 1600
},
{
"epoch": 0.8266804017512233,
"grad_norm": 3.4115030121534953,
"learning_rate": 5.8709014638958404e-05,
"loss": 0.6095,
"num_input_tokens_seen": 18772552,
"step": 1605
},
{
"epoch": 0.8292557301055885,
"grad_norm": 2.8584050529867895,
"learning_rate": 5.846947222863123e-05,
"loss": 0.5896,
"num_input_tokens_seen": 18830992,
"step": 1610
},
{
"epoch": 0.8318310584599536,
"grad_norm": 3.083134826868609,
"learning_rate": 5.8229729514036705e-05,
"loss": 0.545,
"num_input_tokens_seen": 18889480,
"step": 1615
},
{
"epoch": 0.8344063868143188,
"grad_norm": 3.5650772646006703,
"learning_rate": 5.7989792165125356e-05,
"loss": 0.6021,
"num_input_tokens_seen": 18947936,
"step": 1620
},
{
"epoch": 0.836981715168684,
"grad_norm": 3.1787537764025737,
"learning_rate": 5.774966585645092e-05,
"loss": 0.5741,
"num_input_tokens_seen": 19006432,
"step": 1625
},
{
"epoch": 0.8395570435230492,
"grad_norm": 4.505205596087594,
"learning_rate": 5.7509356267035975e-05,
"loss": 0.5796,
"num_input_tokens_seen": 19064920,
"step": 1630
},
{
"epoch": 0.8421323718774144,
"grad_norm": 3.854433226263906,
"learning_rate": 5.726886908023776e-05,
"loss": 0.5088,
"num_input_tokens_seen": 19123376,
"step": 1635
},
{
"epoch": 0.8447077002317795,
"grad_norm": 3.5910960304247643,
"learning_rate": 5.702820998361373e-05,
"loss": 0.5431,
"num_input_tokens_seen": 19181864,
"step": 1640
},
{
"epoch": 0.8472830285861447,
"grad_norm": 4.55639282269759,
"learning_rate": 5.6787384668786994e-05,
"loss": 0.5849,
"num_input_tokens_seen": 19240352,
"step": 1645
},
{
"epoch": 0.8498583569405099,
"grad_norm": 4.031478721616991,
"learning_rate": 5.654639883131178e-05,
"loss": 0.5668,
"num_input_tokens_seen": 19298848,
"step": 1650
},
{
"epoch": 0.8498583569405099,
"eval_loss": 0.6634677648544312,
"eval_runtime": 16.0267,
"eval_samples_per_second": 3.744,
"eval_steps_per_second": 0.936,
"num_input_tokens_seen": 19298848,
"step": 1650
},
{
"epoch": 0.8524336852948751,
"grad_norm": 3.2430676664218496,
"learning_rate": 5.6305258170538676e-05,
"loss": 0.584,
"num_input_tokens_seen": 19357304,
"step": 1655
},
{
"epoch": 0.8550090136492403,
"grad_norm": 3.140559424454581,
"learning_rate": 5.606396838947988e-05,
"loss": 0.5544,
"num_input_tokens_seen": 19415800,
"step": 1660
},
{
"epoch": 0.8575843420036054,
"grad_norm": 3.993528386539066,
"learning_rate": 5.582253519467432e-05,
"loss": 0.6269,
"num_input_tokens_seen": 19474256,
"step": 1665
},
{
"epoch": 0.8601596703579707,
"grad_norm": 2.202747116085024,
"learning_rate": 5.558096429605263e-05,
"loss": 0.5073,
"num_input_tokens_seen": 19532736,
"step": 1670
},
{
"epoch": 0.8627349987123358,
"grad_norm": 4.4094334133851625,
"learning_rate": 5.533926140680221e-05,
"loss": 0.5319,
"num_input_tokens_seen": 19591184,
"step": 1675
},
{
"epoch": 0.865310327066701,
"grad_norm": 4.01821546567579,
"learning_rate": 5.509743224323203e-05,
"loss": 0.4525,
"num_input_tokens_seen": 19649656,
"step": 1680
},
{
"epoch": 0.8678856554210662,
"grad_norm": 5.3033277992950385,
"learning_rate": 5.485548252463749e-05,
"loss": 0.5276,
"num_input_tokens_seen": 19708144,
"step": 1685
},
{
"epoch": 0.8704609837754314,
"grad_norm": 5.124737819396939,
"learning_rate": 5.4613417973165106e-05,
"loss": 0.5482,
"num_input_tokens_seen": 19766592,
"step": 1690
},
{
"epoch": 0.8730363121297966,
"grad_norm": 3.47304956996904,
"learning_rate": 5.4371244313677225e-05,
"loss": 0.4656,
"num_input_tokens_seen": 19825064,
"step": 1695
},
{
"epoch": 0.8756116404841617,
"grad_norm": 6.394279811127835,
"learning_rate": 5.4128967273616625e-05,
"loss": 0.5701,
"num_input_tokens_seen": 19883504,
"step": 1700
},
{
"epoch": 0.8756116404841617,
"eval_loss": 0.7144017815589905,
"eval_runtime": 16.1358,
"eval_samples_per_second": 3.718,
"eval_steps_per_second": 0.93,
"num_input_tokens_seen": 19883504,
"step": 1700
},
{
"epoch": 0.8781869688385269,
"grad_norm": 4.527262723362309,
"learning_rate": 5.388659258287102e-05,
"loss": 0.5823,
"num_input_tokens_seen": 19942000,
"step": 1705
},
{
"epoch": 0.8807622971928921,
"grad_norm": 4.628112845411063,
"learning_rate": 5.364412597363759e-05,
"loss": 0.5446,
"num_input_tokens_seen": 20000440,
"step": 1710
},
{
"epoch": 0.8833376255472573,
"grad_norm": 6.077375809046342,
"learning_rate": 5.3401573180287426e-05,
"loss": 0.5769,
"num_input_tokens_seen": 20058920,
"step": 1715
},
{
"epoch": 0.8859129539016225,
"grad_norm": 6.492863688878202,
"learning_rate": 5.315893993922986e-05,
"loss": 0.5614,
"num_input_tokens_seen": 20117416,
"step": 1720
},
{
"epoch": 0.8884882822559876,
"grad_norm": 5.332057542240503,
"learning_rate": 5.29162319887768e-05,
"loss": 0.5215,
"num_input_tokens_seen": 20175936,
"step": 1725
},
{
"epoch": 0.8910636106103528,
"grad_norm": 3.8772752615113077,
"learning_rate": 5.26734550690071e-05,
"loss": 0.4968,
"num_input_tokens_seen": 20234368,
"step": 1730
},
{
"epoch": 0.893638938964718,
"grad_norm": 4.886426418731965,
"learning_rate": 5.243061492163073e-05,
"loss": 0.5029,
"num_input_tokens_seen": 20292856,
"step": 1735
},
{
"epoch": 0.8962142673190832,
"grad_norm": 4.031774194047053,
"learning_rate": 5.2187717289852955e-05,
"loss": 0.5249,
"num_input_tokens_seen": 20351272,
"step": 1740
},
{
"epoch": 0.8987895956734484,
"grad_norm": 5.344580011428224,
"learning_rate": 5.1944767918238624e-05,
"loss": 0.5801,
"num_input_tokens_seen": 20409744,
"step": 1745
},
{
"epoch": 0.9013649240278135,
"grad_norm": 3.923379435953565,
"learning_rate": 5.170177255257618e-05,
"loss": 0.546,
"num_input_tokens_seen": 20468200,
"step": 1750
},
{
"epoch": 0.9013649240278135,
"eval_loss": 0.672294020652771,
"eval_runtime": 16.0203,
"eval_samples_per_second": 3.745,
"eval_steps_per_second": 0.936,
"num_input_tokens_seen": 20468200,
"step": 1750
},
{
"epoch": 0.9039402523821787,
"grad_norm": 4.616122198129487,
"learning_rate": 5.145873693974188e-05,
"loss": 0.5248,
"num_input_tokens_seen": 20526696,
"step": 1755
},
{
"epoch": 0.9065155807365439,
"grad_norm": 5.322590172525407,
"learning_rate": 5.12156668275638e-05,
"loss": 0.4756,
"num_input_tokens_seen": 20585160,
"step": 1760
},
{
"epoch": 0.9090909090909091,
"grad_norm": 4.002252878507737,
"learning_rate": 5.097256796468598e-05,
"loss": 0.4405,
"num_input_tokens_seen": 20643672,
"step": 1765
},
{
"epoch": 0.9116662374452743,
"grad_norm": 5.58017966349683,
"learning_rate": 5.072944610043232e-05,
"loss": 0.5201,
"num_input_tokens_seen": 20702152,
"step": 1770
},
{
"epoch": 0.9142415657996394,
"grad_norm": 4.688576373892097,
"learning_rate": 5.048630698467081e-05,
"loss": 0.4662,
"num_input_tokens_seen": 20760664,
"step": 1775
},
{
"epoch": 0.9168168941540046,
"grad_norm": 4.984086874604376,
"learning_rate": 5.024315636767738e-05,
"loss": 0.5376,
"num_input_tokens_seen": 20819144,
"step": 1780
},
{
"epoch": 0.9193922225083698,
"grad_norm": 4.470690620190923,
"learning_rate": 5e-05,
"loss": 0.5174,
"num_input_tokens_seen": 20877624,
"step": 1785
},
{
"epoch": 0.921967550862735,
"grad_norm": 4.1127649145734795,
"learning_rate": 4.9756843632322626e-05,
"loss": 0.4273,
"num_input_tokens_seen": 20936112,
"step": 1790
},
{
"epoch": 0.9245428792171002,
"grad_norm": 5.1892527739805185,
"learning_rate": 4.9513693015329197e-05,
"loss": 0.4646,
"num_input_tokens_seen": 20994608,
"step": 1795
},
{
"epoch": 0.9271182075714653,
"grad_norm": 6.8574703914708985,
"learning_rate": 4.9270553899567686e-05,
"loss": 0.412,
"num_input_tokens_seen": 21053080,
"step": 1800
},
{
"epoch": 0.9271182075714653,
"eval_loss": 0.6768696904182434,
"eval_runtime": 15.9758,
"eval_samples_per_second": 3.756,
"eval_steps_per_second": 0.939,
"num_input_tokens_seen": 21053080,
"step": 1800
},
{
"epoch": 0.9296935359258306,
"grad_norm": 6.328873193178562,
"learning_rate": 4.902743203531405e-05,
"loss": 0.4845,
"num_input_tokens_seen": 21111592,
"step": 1805
},
{
"epoch": 0.9322688642801957,
"grad_norm": 4.7019594666508215,
"learning_rate": 4.8784333172436206e-05,
"loss": 0.441,
"num_input_tokens_seen": 21170024,
"step": 1810
},
{
"epoch": 0.9348441926345609,
"grad_norm": 4.545287749618146,
"learning_rate": 4.854126306025812e-05,
"loss": 0.545,
"num_input_tokens_seen": 21228480,
"step": 1815
},
{
"epoch": 0.9374195209889261,
"grad_norm": 7.047942469299444,
"learning_rate": 4.829822744742383e-05,
"loss": 0.4697,
"num_input_tokens_seen": 21286944,
"step": 1820
},
{
"epoch": 0.9399948493432912,
"grad_norm": 3.917758669787159,
"learning_rate": 4.8055232081761395e-05,
"loss": 0.423,
"num_input_tokens_seen": 21345456,
"step": 1825
},
{
"epoch": 0.9425701776976565,
"grad_norm": 3.442911876713947,
"learning_rate": 4.781228271014704e-05,
"loss": 0.4715,
"num_input_tokens_seen": 21403896,
"step": 1830
},
{
"epoch": 0.9451455060520216,
"grad_norm": 4.755237925353789,
"learning_rate": 4.756938507836929e-05,
"loss": 0.5149,
"num_input_tokens_seen": 21462360,
"step": 1835
},
{
"epoch": 0.9477208344063868,
"grad_norm": 5.3552741805060275,
"learning_rate": 4.732654493099291e-05,
"loss": 0.5403,
"num_input_tokens_seen": 21520864,
"step": 1840
},
{
"epoch": 0.950296162760752,
"grad_norm": 3.417134377266731,
"learning_rate": 4.708376801122321e-05,
"loss": 0.4757,
"num_input_tokens_seen": 21579376,
"step": 1845
},
{
"epoch": 0.9528714911151172,
"grad_norm": 4.6802756294331855,
"learning_rate": 4.6841060060770154e-05,
"loss": 0.4347,
"num_input_tokens_seen": 21637848,
"step": 1850
},
{
"epoch": 0.9528714911151172,
"eval_loss": 0.6808218359947205,
"eval_runtime": 16.1166,
"eval_samples_per_second": 3.723,
"eval_steps_per_second": 0.931,
"num_input_tokens_seen": 21637848,
"step": 1850
},
{
"epoch": 0.9554468194694824,
"grad_norm": 5.573192417675986,
"learning_rate": 4.659842681971258e-05,
"loss": 0.5132,
"num_input_tokens_seen": 21696328,
"step": 1855
},
{
"epoch": 0.9580221478238475,
"grad_norm": 7.109977536510439,
"learning_rate": 4.635587402636241e-05,
"loss": 0.4347,
"num_input_tokens_seen": 21754816,
"step": 1860
},
{
"epoch": 0.9605974761782127,
"grad_norm": 7.143552890986281,
"learning_rate": 4.611340741712901e-05,
"loss": 0.4015,
"num_input_tokens_seen": 21813296,
"step": 1865
},
{
"epoch": 0.9631728045325779,
"grad_norm": 6.289734219426663,
"learning_rate": 4.5871032726383386e-05,
"loss": 0.5023,
"num_input_tokens_seen": 21871800,
"step": 1870
},
{
"epoch": 0.9657481328869431,
"grad_norm": 5.981747103855226,
"learning_rate": 4.562875568632278e-05,
"loss": 0.5334,
"num_input_tokens_seen": 21930272,
"step": 1875
},
{
"epoch": 0.9683234612413083,
"grad_norm": 5.6559760588122545,
"learning_rate": 4.5386582026834906e-05,
"loss": 0.4386,
"num_input_tokens_seen": 21988736,
"step": 1880
},
{
"epoch": 0.9708987895956734,
"grad_norm": 5.861060155419055,
"learning_rate": 4.5144517475362514e-05,
"loss": 0.3807,
"num_input_tokens_seen": 22047200,
"step": 1885
},
{
"epoch": 0.9734741179500386,
"grad_norm": 7.801226281593827,
"learning_rate": 4.490256775676797e-05,
"loss": 0.4177,
"num_input_tokens_seen": 22105664,
"step": 1890
},
{
"epoch": 0.9760494463044038,
"grad_norm": 6.1755894964345135,
"learning_rate": 4.466073859319781e-05,
"loss": 0.5239,
"num_input_tokens_seen": 22164184,
"step": 1895
},
{
"epoch": 0.978624774658769,
"grad_norm": 5.397307732194541,
"learning_rate": 4.441903570394739e-05,
"loss": 0.3737,
"num_input_tokens_seen": 22222632,
"step": 1900
},
{
"epoch": 0.978624774658769,
"eval_loss": 0.773033082485199,
"eval_runtime": 15.9975,
"eval_samples_per_second": 3.751,
"eval_steps_per_second": 0.938,
"num_input_tokens_seen": 22222632,
"step": 1900
},
{
"epoch": 0.9812001030131342,
"grad_norm": 6.997624273550619,
"learning_rate": 4.41774648053257e-05,
"loss": 0.4437,
"num_input_tokens_seen": 22281080,
"step": 1905
},
{
"epoch": 0.9837754313674993,
"grad_norm": 5.030616381143982,
"learning_rate": 4.3936031610520124e-05,
"loss": 0.465,
"num_input_tokens_seen": 22339552,
"step": 1910
},
{
"epoch": 0.9863507597218646,
"grad_norm": 5.025845260709186,
"learning_rate": 4.3694741829461336e-05,
"loss": 0.4975,
"num_input_tokens_seen": 22398056,
"step": 1915
},
{
"epoch": 0.9889260880762297,
"grad_norm": 6.43843242330618,
"learning_rate": 4.345360116868823e-05,
"loss": 0.4504,
"num_input_tokens_seen": 22456520,
"step": 1920
},
{
"epoch": 0.9915014164305949,
"grad_norm": 5.281203851622467,
"learning_rate": 4.321261533121303e-05,
"loss": 0.4528,
"num_input_tokens_seen": 22515024,
"step": 1925
},
{
"epoch": 0.9940767447849601,
"grad_norm": 6.158304256456398,
"learning_rate": 4.2971790016386286e-05,
"loss": 0.441,
"num_input_tokens_seen": 22573480,
"step": 1930
},
{
"epoch": 0.9966520731393252,
"grad_norm": 3.898263595049965,
"learning_rate": 4.273113091976225e-05,
"loss": 0.4678,
"num_input_tokens_seen": 22631960,
"step": 1935
},
{
"epoch": 0.9992274014936905,
"grad_norm": 6.266433889699235,
"learning_rate": 4.249064373296403e-05,
"loss": 0.4352,
"num_input_tokens_seen": 22690432,
"step": 1940
},
{
"epoch": 1.001545197012619,
"grad_norm": 2.4601530377865695,
"learning_rate": 4.225033414354908e-05,
"loss": 0.3792,
"num_input_tokens_seen": 22743048,
"step": 1945
},
{
"epoch": 1.0041205253669843,
"grad_norm": 4.761740260797231,
"learning_rate": 4.201020783487464e-05,
"loss": 0.3783,
"num_input_tokens_seen": 22801512,
"step": 1950
},
{
"epoch": 1.0041205253669843,
"eval_loss": 0.6983156204223633,
"eval_runtime": 16.3172,
"eval_samples_per_second": 3.677,
"eval_steps_per_second": 0.919,
"num_input_tokens_seen": 22801512,
"step": 1950
},
{
"epoch": 1.0066958537213495,
"grad_norm": 6.506183969602581,
"learning_rate": 4.17702704859633e-05,
"loss": 0.3784,
"num_input_tokens_seen": 22859952,
"step": 1955
},
{
"epoch": 1.0092711820757148,
"grad_norm": 7.31299798110374,
"learning_rate": 4.153052777136879e-05,
"loss": 0.5587,
"num_input_tokens_seen": 22918440,
"step": 1960
},
{
"epoch": 1.0118465104300798,
"grad_norm": 4.338872323547646,
"learning_rate": 4.1290985361041614e-05,
"loss": 0.3803,
"num_input_tokens_seen": 22976944,
"step": 1965
},
{
"epoch": 1.014421838784445,
"grad_norm": 6.798827966152428,
"learning_rate": 4.105164892019514e-05,
"loss": 0.4038,
"num_input_tokens_seen": 23035408,
"step": 1970
},
{
"epoch": 1.0169971671388103,
"grad_norm": 5.018683403937771,
"learning_rate": 4.0812524109171476e-05,
"loss": 0.3226,
"num_input_tokens_seen": 23093912,
"step": 1975
},
{
"epoch": 1.0195724954931753,
"grad_norm": 4.594775856201265,
"learning_rate": 4.0573616583307705e-05,
"loss": 0.4026,
"num_input_tokens_seen": 23152344,
"step": 1980
},
{
"epoch": 1.0221478238475405,
"grad_norm": 7.5346230342964695,
"learning_rate": 4.033493199280202e-05,
"loss": 0.4225,
"num_input_tokens_seen": 23210800,
"step": 1985
},
{
"epoch": 1.0247231522019058,
"grad_norm": 8.213657673441388,
"learning_rate": 4.009647598258022e-05,
"loss": 0.3058,
"num_input_tokens_seen": 23269304,
"step": 1990
},
{
"epoch": 1.0272984805562708,
"grad_norm": 6.881744374075897,
"learning_rate": 3.985825419216207e-05,
"loss": 0.3821,
"num_input_tokens_seen": 23327800,
"step": 1995
},
{
"epoch": 1.029873808910636,
"grad_norm": 3.916989546123924,
"learning_rate": 3.962027225552807e-05,
"loss": 0.3328,
"num_input_tokens_seen": 23386232,
"step": 2000
},
{
"epoch": 1.029873808910636,
"eval_loss": 0.7484827041625977,
"eval_runtime": 16.091,
"eval_samples_per_second": 3.729,
"eval_steps_per_second": 0.932,
"num_input_tokens_seen": 23386232,
"step": 2000
},
{
"epoch": 1.0324491372650013,
"grad_norm": 5.8532055715340245,
"learning_rate": 3.938253580098613e-05,
"loss": 0.362,
"num_input_tokens_seen": 23444712,
"step": 2005
},
{
"epoch": 1.0350244656193666,
"grad_norm": 7.087739461357715,
"learning_rate": 3.914505045103845e-05,
"loss": 0.3903,
"num_input_tokens_seen": 23503192,
"step": 2010
},
{
"epoch": 1.0375997939737316,
"grad_norm": 6.061997147134047,
"learning_rate": 3.8907821822248605e-05,
"loss": 0.3341,
"num_input_tokens_seen": 23561688,
"step": 2015
},
{
"epoch": 1.0401751223280968,
"grad_norm": 6.783069419644998,
"learning_rate": 3.867085552510864e-05,
"loss": 0.4794,
"num_input_tokens_seen": 23620160,
"step": 2020
},
{
"epoch": 1.042750450682462,
"grad_norm": 4.11088291372727,
"learning_rate": 3.843415716390644e-05,
"loss": 0.4104,
"num_input_tokens_seen": 23678624,
"step": 2025
},
{
"epoch": 1.045325779036827,
"grad_norm": 5.727855298190317,
"learning_rate": 3.819773233659314e-05,
"loss": 0.3639,
"num_input_tokens_seen": 23737064,
"step": 2030
},
{
"epoch": 1.0479011073911924,
"grad_norm": 6.936114108935384,
"learning_rate": 3.7961586634650767e-05,
"loss": 0.4294,
"num_input_tokens_seen": 23795568,
"step": 2035
},
{
"epoch": 1.0504764357455576,
"grad_norm": 5.577801320854008,
"learning_rate": 3.772572564296005e-05,
"loss": 0.4713,
"num_input_tokens_seen": 23854040,
"step": 2040
},
{
"epoch": 1.0530517640999228,
"grad_norm": 7.466883391944433,
"learning_rate": 3.749015493966817e-05,
"loss": 0.3864,
"num_input_tokens_seen": 23912520,
"step": 2045
},
{
"epoch": 1.0556270924542879,
"grad_norm": 4.120909561971508,
"learning_rate": 3.7254880096057073e-05,
"loss": 0.3602,
"num_input_tokens_seen": 23971048,
"step": 2050
},
{
"epoch": 1.0556270924542879,
"eval_loss": 0.7190810441970825,
"eval_runtime": 16.0858,
"eval_samples_per_second": 3.73,
"eval_steps_per_second": 0.932,
"num_input_tokens_seen": 23971048,
"step": 2050
},
{
"epoch": 1.0582024208086531,
"grad_norm": 3.701758619566102,
"learning_rate": 3.7019906676411446e-05,
"loss": 0.3203,
"num_input_tokens_seen": 24029544,
"step": 2055
},
{
"epoch": 1.0607777491630184,
"grad_norm": 7.855789285552562,
"learning_rate": 3.678524023788735e-05,
"loss": 0.3906,
"num_input_tokens_seen": 24088008,
"step": 2060
},
{
"epoch": 1.0633530775173834,
"grad_norm": 6.682460948737117,
"learning_rate": 3.6550886330380665e-05,
"loss": 0.3604,
"num_input_tokens_seen": 24146480,
"step": 2065
},
{
"epoch": 1.0659284058717486,
"grad_norm": 3.587156705730744,
"learning_rate": 3.631685049639586e-05,
"loss": 0.3271,
"num_input_tokens_seen": 24204984,
"step": 2070
},
{
"epoch": 1.0685037342261139,
"grad_norm": 4.621273077841867,
"learning_rate": 3.608313827091493e-05,
"loss": 0.2996,
"num_input_tokens_seen": 24263456,
"step": 2075
},
{
"epoch": 1.071079062580479,
"grad_norm": 6.565390196167412,
"learning_rate": 3.5849755181266474e-05,
"loss": 0.3767,
"num_input_tokens_seen": 24321960,
"step": 2080
},
{
"epoch": 1.0736543909348442,
"grad_norm": 6.589833421708817,
"learning_rate": 3.5616706746995026e-05,
"loss": 0.4208,
"num_input_tokens_seen": 24380464,
"step": 2085
},
{
"epoch": 1.0762297192892094,
"grad_norm": 4.95070197991303,
"learning_rate": 3.538399847973036e-05,
"loss": 0.3479,
"num_input_tokens_seen": 24438976,
"step": 2090
},
{
"epoch": 1.0788050476435747,
"grad_norm": 5.124820683013397,
"learning_rate": 3.515163588305735e-05,
"loss": 0.3654,
"num_input_tokens_seen": 24497448,
"step": 2095
},
{
"epoch": 1.0813803759979397,
"grad_norm": 6.444785878585679,
"learning_rate": 3.491962445238569e-05,
"loss": 0.3351,
"num_input_tokens_seen": 24555904,
"step": 2100
},
{
"epoch": 1.0813803759979397,
"eval_loss": 0.8075026869773865,
"eval_runtime": 16.096,
"eval_samples_per_second": 3.728,
"eval_steps_per_second": 0.932,
"num_input_tokens_seen": 24555904,
"step": 2100
},
{
"epoch": 1.083955704352305,
"grad_norm": 5.259882631403194,
"learning_rate": 3.4687969674819906e-05,
"loss": 0.3827,
"num_input_tokens_seen": 24614392,
"step": 2105
},
{
"epoch": 1.0865310327066702,
"grad_norm": 4.276410371848581,
"learning_rate": 3.445667702902969e-05,
"loss": 0.3676,
"num_input_tokens_seen": 24672848,
"step": 2110
},
{
"epoch": 1.0891063610610352,
"grad_norm": 10.209040215860048,
"learning_rate": 3.4225751985120215e-05,
"loss": 0.3253,
"num_input_tokens_seen": 24731344,
"step": 2115
},
{
"epoch": 1.0916816894154004,
"grad_norm": 6.169752493978822,
"learning_rate": 3.3995200004502816e-05,
"loss": 0.4297,
"num_input_tokens_seen": 24789832,
"step": 2120
},
{
"epoch": 1.0942570177697657,
"grad_norm": 4.238650399680663,
"learning_rate": 3.3765026539765834e-05,
"loss": 0.3536,
"num_input_tokens_seen": 24848264,
"step": 2125
},
{
"epoch": 1.0968323461241307,
"grad_norm": 5.445173229006411,
"learning_rate": 3.3535237034545675e-05,
"loss": 0.3588,
"num_input_tokens_seen": 24906744,
"step": 2130
},
{
"epoch": 1.099407674478496,
"grad_norm": 4.508587102151408,
"learning_rate": 3.330583692339802e-05,
"loss": 0.3666,
"num_input_tokens_seen": 24965256,
"step": 2135
},
{
"epoch": 1.1019830028328612,
"grad_norm": 5.836654544282574,
"learning_rate": 3.307683163166934e-05,
"loss": 0.3334,
"num_input_tokens_seen": 25023768,
"step": 2140
},
{
"epoch": 1.1045583311872265,
"grad_norm": 6.855334175793522,
"learning_rate": 3.284822657536856e-05,
"loss": 0.3848,
"num_input_tokens_seen": 25082248,
"step": 2145
},
{
"epoch": 1.1071336595415915,
"grad_norm": 5.3006438448712565,
"learning_rate": 3.262002716103897e-05,
"loss": 0.3699,
"num_input_tokens_seen": 25140752,
"step": 2150
},
{
"epoch": 1.1071336595415915,
"eval_loss": 0.8523861169815063,
"eval_runtime": 16.0023,
"eval_samples_per_second": 3.749,
"eval_steps_per_second": 0.937,
"num_input_tokens_seen": 25140752,
"step": 2150
},
{
"epoch": 1.1097089878959567,
"grad_norm": 3.943124296473041,
"learning_rate": 3.2392238785630386e-05,
"loss": 0.3154,
"num_input_tokens_seen": 25199208,
"step": 2155
},
{
"epoch": 1.112284316250322,
"grad_norm": 8.398532132538953,
"learning_rate": 3.216486683637146e-05,
"loss": 0.3915,
"num_input_tokens_seen": 25257680,
"step": 2160
},
{
"epoch": 1.114859644604687,
"grad_norm": 4.081633194377614,
"learning_rate": 3.1937916690642356e-05,
"loss": 0.3675,
"num_input_tokens_seen": 25316200,
"step": 2165
},
{
"epoch": 1.1174349729590523,
"grad_norm": 6.920842495491902,
"learning_rate": 3.1711393715847476e-05,
"loss": 0.4047,
"num_input_tokens_seen": 25374656,
"step": 2170
},
{
"epoch": 1.1200103013134175,
"grad_norm": 8.460113153700512,
"learning_rate": 3.14853032692886e-05,
"loss": 0.4155,
"num_input_tokens_seen": 25433168,
"step": 2175
},
{
"epoch": 1.1225856296677827,
"grad_norm": 9.825074199159944,
"learning_rate": 3.125965069803811e-05,
"loss": 0.3966,
"num_input_tokens_seen": 25491664,
"step": 2180
},
{
"epoch": 1.1251609580221478,
"grad_norm": 5.732206927543506,
"learning_rate": 3.103444133881261e-05,
"loss": 0.3068,
"num_input_tokens_seen": 25550128,
"step": 2185
},
{
"epoch": 1.127736286376513,
"grad_norm": 6.135036052058211,
"learning_rate": 3.080968051784666e-05,
"loss": 0.386,
"num_input_tokens_seen": 25608624,
"step": 2190
},
{
"epoch": 1.1303116147308783,
"grad_norm": 3.31420885852192,
"learning_rate": 3.058537355076683e-05,
"loss": 0.3898,
"num_input_tokens_seen": 25667128,
"step": 2195
},
{
"epoch": 1.1328869430852433,
"grad_norm": 8.182546413863832,
"learning_rate": 3.0361525742465973e-05,
"loss": 0.4016,
"num_input_tokens_seen": 25725560,
"step": 2200
},
{
"epoch": 1.1328869430852433,
"eval_loss": 0.7534744143486023,
"eval_runtime": 15.969,
"eval_samples_per_second": 3.757,
"eval_steps_per_second": 0.939,
"num_input_tokens_seen": 25725560,
"step": 2200
},
{
"epoch": 1.1354622714396085,
"grad_norm": 4.616007617470174,
"learning_rate": 3.0138142386977787e-05,
"loss": 0.3465,
"num_input_tokens_seen": 25784048,
"step": 2205
},
{
"epoch": 1.1380375997939738,
"grad_norm": 4.752551024155875,
"learning_rate": 2.991522876735154e-05,
"loss": 0.3077,
"num_input_tokens_seen": 25842512,
"step": 2210
},
{
"epoch": 1.140612928148339,
"grad_norm": 6.021213921198953,
"learning_rate": 2.9692790155527227e-05,
"loss": 0.4497,
"num_input_tokens_seen": 25900992,
"step": 2215
},
{
"epoch": 1.143188256502704,
"grad_norm": 8.098592782255322,
"learning_rate": 2.9470831812210837e-05,
"loss": 0.3811,
"num_input_tokens_seen": 25959448,
"step": 2220
},
{
"epoch": 1.1457635848570693,
"grad_norm": 6.108837560432838,
"learning_rate": 2.924935898674992e-05,
"loss": 0.4053,
"num_input_tokens_seen": 26017936,
"step": 2225
},
{
"epoch": 1.1483389132114346,
"grad_norm": 7.709937017464705,
"learning_rate": 2.902837691700945e-05,
"loss": 0.3421,
"num_input_tokens_seen": 26076440,
"step": 2230
},
{
"epoch": 1.1509142415657996,
"grad_norm": 3.840146275079161,
"learning_rate": 2.880789082924798e-05,
"loss": 0.3228,
"num_input_tokens_seen": 26134896,
"step": 2235
},
{
"epoch": 1.1534895699201648,
"grad_norm": 6.088757703790803,
"learning_rate": 2.858790593799405e-05,
"loss": 0.3695,
"num_input_tokens_seen": 26193368,
"step": 2240
},
{
"epoch": 1.15606489827453,
"grad_norm": 3.8647543120940844,
"learning_rate": 2.8368427445922696e-05,
"loss": 0.3463,
"num_input_tokens_seen": 26251848,
"step": 2245
},
{
"epoch": 1.158640226628895,
"grad_norm": 4.425454601086007,
"learning_rate": 2.8149460543732664e-05,
"loss": 0.3442,
"num_input_tokens_seen": 26310336,
"step": 2250
},
{
"epoch": 1.158640226628895,
"eval_loss": 0.7066138386726379,
"eval_runtime": 15.9558,
"eval_samples_per_second": 3.76,
"eval_steps_per_second": 0.94,
"num_input_tokens_seen": 26310336,
"step": 2250
},
{
"epoch": 1.1612155549832603,
"grad_norm": 6.312367706992343,
"learning_rate": 2.7931010410023518e-05,
"loss": 0.3547,
"num_input_tokens_seen": 26368840,
"step": 2255
},
{
"epoch": 1.1637908833376256,
"grad_norm": 6.429493717694784,
"learning_rate": 2.771308221117309e-05,
"loss": 0.3125,
"num_input_tokens_seen": 26427280,
"step": 2260
},
{
"epoch": 1.1663662116919906,
"grad_norm": 6.993677707266103,
"learning_rate": 2.749568110121545e-05,
"loss": 0.3521,
"num_input_tokens_seen": 26485760,
"step": 2265
},
{
"epoch": 1.1689415400463559,
"grad_norm": 5.03743116566882,
"learning_rate": 2.7278812221718924e-05,
"loss": 0.281,
"num_input_tokens_seen": 26544224,
"step": 2270
},
{
"epoch": 1.1715168684007211,
"grad_norm": 5.828198718501714,
"learning_rate": 2.7062480701664488e-05,
"loss": 0.3653,
"num_input_tokens_seen": 26602712,
"step": 2275
},
{
"epoch": 1.1740921967550864,
"grad_norm": 6.1247491578050655,
"learning_rate": 2.6846691657324473e-05,
"loss": 0.3964,
"num_input_tokens_seen": 26661160,
"step": 2280
},
{
"epoch": 1.1766675251094514,
"grad_norm": 6.231155247277189,
"learning_rate": 2.663145019214163e-05,
"loss": 0.3119,
"num_input_tokens_seen": 26719648,
"step": 2285
},
{
"epoch": 1.1792428534638166,
"grad_norm": 6.501604840456734,
"learning_rate": 2.6416761396608362e-05,
"loss": 0.3832,
"num_input_tokens_seen": 26778112,
"step": 2290
},
{
"epoch": 1.1818181818181819,
"grad_norm": 5.377003761278013,
"learning_rate": 2.6202630348146324e-05,
"loss": 0.3277,
"num_input_tokens_seen": 26836592,
"step": 2295
},
{
"epoch": 1.184393510172547,
"grad_norm": 4.826044073542379,
"learning_rate": 2.598906211098643e-05,
"loss": 0.3877,
"num_input_tokens_seen": 26895096,
"step": 2300
},
{
"epoch": 1.184393510172547,
"eval_loss": 0.727741539478302,
"eval_runtime": 15.9289,
"eval_samples_per_second": 3.767,
"eval_steps_per_second": 0.942,
"num_input_tokens_seen": 26895096,
"step": 2300
},
{
"epoch": 1.1869688385269122,
"grad_norm": 6.370847827905799,
"learning_rate": 2.577606173604894e-05,
"loss": 0.3033,
"num_input_tokens_seen": 26953560,
"step": 2305
},
{
"epoch": 1.1895441668812774,
"grad_norm": 11.746077197029585,
"learning_rate": 2.5563634260824175e-05,
"loss": 0.4104,
"num_input_tokens_seen": 27012024,
"step": 2310
},
{
"epoch": 1.1921194952356426,
"grad_norm": 3.9544988689102762,
"learning_rate": 2.535178470925323e-05,
"loss": 0.3447,
"num_input_tokens_seen": 27070520,
"step": 2315
},
{
"epoch": 1.1946948235900077,
"grad_norm": 4.72491689052158,
"learning_rate": 2.5140518091609256e-05,
"loss": 0.2882,
"num_input_tokens_seen": 27128984,
"step": 2320
},
{
"epoch": 1.197270151944373,
"grad_norm": 2.1806068747411245,
"learning_rate": 2.4929839404378936e-05,
"loss": 0.2817,
"num_input_tokens_seen": 27187432,
"step": 2325
},
{
"epoch": 1.1998454802987382,
"grad_norm": 3.2798105115490745,
"learning_rate": 2.471975363014428e-05,
"loss": 0.3693,
"num_input_tokens_seen": 27245920,
"step": 2330
},
{
"epoch": 1.2024208086531032,
"grad_norm": 7.472396523773262,
"learning_rate": 2.451026573746482e-05,
"loss": 0.3587,
"num_input_tokens_seen": 27304384,
"step": 2335
},
{
"epoch": 1.2049961370074684,
"grad_norm": 6.7073623181550275,
"learning_rate": 2.430138068076013e-05,
"loss": 0.354,
"num_input_tokens_seen": 27362864,
"step": 2340
},
{
"epoch": 1.2075714653618337,
"grad_norm": 6.2693798293878515,
"learning_rate": 2.4093103400192625e-05,
"loss": 0.3209,
"num_input_tokens_seen": 27421360,
"step": 2345
},
{
"epoch": 1.210146793716199,
"grad_norm": 6.606866726236357,
"learning_rate": 2.388543882155067e-05,
"loss": 0.3871,
"num_input_tokens_seen": 27479840,
"step": 2350
},
{
"epoch": 1.210146793716199,
"eval_loss": 0.7659633755683899,
"eval_runtime": 16.0101,
"eval_samples_per_second": 3.748,
"eval_steps_per_second": 0.937,
"num_input_tokens_seen": 27479840,
"step": 2350
},
{
"epoch": 1.212722122070564,
"grad_norm": 8.004400275953609,
"learning_rate": 2.3678391856132204e-05,
"loss": 0.352,
"num_input_tokens_seen": 27538344,
"step": 2355
},
{
"epoch": 1.2152974504249292,
"grad_norm": 8.385547193425513,
"learning_rate": 2.3471967400628513e-05,
"loss": 0.347,
"num_input_tokens_seen": 27596808,
"step": 2360
},
{
"epoch": 1.2178727787792945,
"grad_norm": 3.9234442237475435,
"learning_rate": 2.3266170337008398e-05,
"loss": 0.3667,
"num_input_tokens_seen": 27655272,
"step": 2365
},
{
"epoch": 1.2204481071336595,
"grad_norm": 6.584480429736488,
"learning_rate": 2.306100553240274e-05,
"loss": 0.3311,
"num_input_tokens_seen": 27713784,
"step": 2370
},
{
"epoch": 1.2230234354880247,
"grad_norm": 5.791637874835276,
"learning_rate": 2.2856477838989456e-05,
"loss": 0.2964,
"num_input_tokens_seen": 27772248,
"step": 2375
},
{
"epoch": 1.22559876384239,
"grad_norm": 5.663503226529594,
"learning_rate": 2.2652592093878666e-05,
"loss": 0.3683,
"num_input_tokens_seen": 27830704,
"step": 2380
},
{
"epoch": 1.228174092196755,
"grad_norm": 9.657080260273457,
"learning_rate": 2.244935311899829e-05,
"loss": 0.3819,
"num_input_tokens_seen": 27889160,
"step": 2385
},
{
"epoch": 1.2307494205511202,
"grad_norm": 4.757552901440964,
"learning_rate": 2.224676572098007e-05,
"loss": 0.3084,
"num_input_tokens_seen": 27947608,
"step": 2390
},
{
"epoch": 1.2333247489054855,
"grad_norm": 5.188072586185411,
"learning_rate": 2.2044834691045873e-05,
"loss": 0.4267,
"num_input_tokens_seen": 28006112,
"step": 2395
},
{
"epoch": 1.2359000772598505,
"grad_norm": 7.221389028269126,
"learning_rate": 2.184356480489432e-05,
"loss": 0.3486,
"num_input_tokens_seen": 28064552,
"step": 2400
},
{
"epoch": 1.2359000772598505,
"eval_loss": 0.7410638928413391,
"eval_runtime": 15.945,
"eval_samples_per_second": 3.763,
"eval_steps_per_second": 0.941,
"num_input_tokens_seen": 28064552,
"step": 2400
},
{
"epoch": 1.2384754056142158,
"grad_norm": 4.430659190759614,
"learning_rate": 2.1642960822587878e-05,
"loss": 0.2416,
"num_input_tokens_seen": 28123016,
"step": 2405
},
{
"epoch": 1.241050733968581,
"grad_norm": 4.985077238748084,
"learning_rate": 2.1443027488440338e-05,
"loss": 0.3007,
"num_input_tokens_seen": 28181464,
"step": 2410
},
{
"epoch": 1.2436260623229463,
"grad_norm": 11.21074775906945,
"learning_rate": 2.124376953090456e-05,
"loss": 0.2655,
"num_input_tokens_seen": 28239920,
"step": 2415
},
{
"epoch": 1.2462013906773113,
"grad_norm": 6.8116545197169724,
"learning_rate": 2.104519166246059e-05,
"loss": 0.3075,
"num_input_tokens_seen": 28298432,
"step": 2420
},
{
"epoch": 1.2487767190316765,
"grad_norm": 10.87615610006345,
"learning_rate": 2.0847298579504344e-05,
"loss": 0.3537,
"num_input_tokens_seen": 28356904,
"step": 2425
},
{
"epoch": 1.2513520473860418,
"grad_norm": 3.9413743825159133,
"learning_rate": 2.065009496223638e-05,
"loss": 0.2993,
"num_input_tokens_seen": 28415384,
"step": 2430
},
{
"epoch": 1.2539273757404068,
"grad_norm": 3.3043013555966407,
"learning_rate": 2.045358547455138e-05,
"loss": 0.2752,
"num_input_tokens_seen": 28473848,
"step": 2435
},
{
"epoch": 1.256502704094772,
"grad_norm": 3.6641007142438338,
"learning_rate": 2.0257774763927655e-05,
"loss": 0.2975,
"num_input_tokens_seen": 28532312,
"step": 2440
},
{
"epoch": 1.2590780324491373,
"grad_norm": 6.306122720573227,
"learning_rate": 2.0062667461317426e-05,
"loss": 0.4051,
"num_input_tokens_seen": 28590784,
"step": 2445
},
{
"epoch": 1.2616533608035025,
"grad_norm": 4.823015256168698,
"learning_rate": 1.9868268181037185e-05,
"loss": 0.2966,
"num_input_tokens_seen": 28649256,
"step": 2450
},
{
"epoch": 1.2616533608035025,
"eval_loss": 0.7485548853874207,
"eval_runtime": 16.0437,
"eval_samples_per_second": 3.74,
"eval_steps_per_second": 0.935,
"num_input_tokens_seen": 28649256,
"step": 2450
},
{
"epoch": 1.2642286891578676,
"grad_norm": 10.005201788297592,
"learning_rate": 1.967458152065857e-05,
"loss": 0.2664,
"num_input_tokens_seen": 28707736,
"step": 2455
},
{
"epoch": 1.2668040175122328,
"grad_norm": 4.744134155404128,
"learning_rate": 1.9481612060899646e-05,
"loss": 0.3692,
"num_input_tokens_seen": 28766232,
"step": 2460
},
{
"epoch": 1.269379345866598,
"grad_norm": 8.49200897563331,
"learning_rate": 1.928936436551661e-05,
"loss": 0.315,
"num_input_tokens_seen": 28824688,
"step": 2465
},
{
"epoch": 1.271954674220963,
"grad_norm": 5.112500789477909,
"learning_rate": 1.9097842981195834e-05,
"loss": 0.3536,
"num_input_tokens_seen": 28883176,
"step": 2470
},
{
"epoch": 1.2745300025753283,
"grad_norm": 4.93472430343828,
"learning_rate": 1.8907052437446272e-05,
"loss": 0.3143,
"num_input_tokens_seen": 28941592,
"step": 2475
},
{
"epoch": 1.2771053309296936,
"grad_norm": 4.6754631245280365,
"learning_rate": 1.871699724649244e-05,
"loss": 0.3114,
"num_input_tokens_seen": 29000064,
"step": 2480
},
{
"epoch": 1.2796806592840588,
"grad_norm": 7.198381813960669,
"learning_rate": 1.8527681903167644e-05,
"loss": 0.3327,
"num_input_tokens_seen": 29058496,
"step": 2485
},
{
"epoch": 1.2822559876384239,
"grad_norm": 9.221713217692685,
"learning_rate": 1.833911088480767e-05,
"loss": 0.2543,
"num_input_tokens_seen": 29116992,
"step": 2490
},
{
"epoch": 1.284831315992789,
"grad_norm": 8.499870267936974,
"learning_rate": 1.8151288651144893e-05,
"loss": 0.2854,
"num_input_tokens_seen": 29175496,
"step": 2495
},
{
"epoch": 1.2874066443471541,
"grad_norm": 4.289294450742717,
"learning_rate": 1.796421964420285e-05,
"loss": 0.3221,
"num_input_tokens_seen": 29233968,
"step": 2500
},
{
"epoch": 1.2874066443471541,
"eval_loss": 0.7222262620925903,
"eval_runtime": 16.106,
"eval_samples_per_second": 3.725,
"eval_steps_per_second": 0.931,
"num_input_tokens_seen": 29233968,
"step": 2500
},
{
"epoch": 1.2899819727015194,
"grad_norm": 3.3788238852269035,
"learning_rate": 1.7777908288191176e-05,
"loss": 0.2344,
"num_input_tokens_seen": 29292464,
"step": 2505
},
{
"epoch": 1.2925573010558846,
"grad_norm": 9.201457612553746,
"learning_rate": 1.7592358989400883e-05,
"loss": 0.2727,
"num_input_tokens_seen": 29350952,
"step": 2510
},
{
"epoch": 1.2951326294102499,
"grad_norm": 4.626370050462018,
"learning_rate": 1.740757613610028e-05,
"loss": 0.2687,
"num_input_tokens_seen": 29409432,
"step": 2515
},
{
"epoch": 1.2977079577646151,
"grad_norm": 5.784936514951468,
"learning_rate": 1.7223564098431067e-05,
"loss": 0.2632,
"num_input_tokens_seen": 29467880,
"step": 2520
},
{
"epoch": 1.3002832861189801,
"grad_norm": 4.405244480948001,
"learning_rate": 1.704032722830512e-05,
"loss": 0.3057,
"num_input_tokens_seen": 29526384,
"step": 2525
},
{
"epoch": 1.3028586144733454,
"grad_norm": 7.8069578913798825,
"learning_rate": 1.68578698593014e-05,
"loss": 0.3054,
"num_input_tokens_seen": 29584880,
"step": 2530
},
{
"epoch": 1.3054339428277104,
"grad_norm": 6.957468356582848,
"learning_rate": 1.6676196306563613e-05,
"loss": 0.28,
"num_input_tokens_seen": 29643344,
"step": 2535
},
{
"epoch": 1.3080092711820757,
"grad_norm": 9.353535349996537,
"learning_rate": 1.6495310866698093e-05,
"loss": 0.3169,
"num_input_tokens_seen": 29701864,
"step": 2540
},
{
"epoch": 1.310584599536441,
"grad_norm": 5.246799138683368,
"learning_rate": 1.631521781767214e-05,
"loss": 0.2985,
"num_input_tokens_seen": 29760376,
"step": 2545
},
{
"epoch": 1.3131599278908062,
"grad_norm": 10.51357763616516,
"learning_rate": 1.6135921418712956e-05,
"loss": 0.3231,
"num_input_tokens_seen": 29818856,
"step": 2550
},
{
"epoch": 1.3131599278908062,
"eval_loss": 0.7146337628364563,
"eval_runtime": 16.171,
"eval_samples_per_second": 3.71,
"eval_steps_per_second": 0.928,
"num_input_tokens_seen": 29818856,
"step": 2550
},
{
"epoch": 1.3157352562451712,
"grad_norm": 5.568529968511631,
"learning_rate": 1.5957425910206785e-05,
"loss": 0.2689,
"num_input_tokens_seen": 29877288,
"step": 2555
},
{
"epoch": 1.3183105845995364,
"grad_norm": 4.860244764698273,
"learning_rate": 1.577973551359877e-05,
"loss": 0.3889,
"num_input_tokens_seen": 29935776,
"step": 2560
},
{
"epoch": 1.3208859129539017,
"grad_norm": 4.938342083847672,
"learning_rate": 1.560285443129296e-05,
"loss": 0.2489,
"num_input_tokens_seen": 29994232,
"step": 2565
},
{
"epoch": 1.3234612413082667,
"grad_norm": 7.223451539163128,
"learning_rate": 1.542678684655306e-05,
"loss": 0.3016,
"num_input_tokens_seen": 30052760,
"step": 2570
},
{
"epoch": 1.326036569662632,
"grad_norm": 8.03849051806361,
"learning_rate": 1.5251536923403426e-05,
"loss": 0.3063,
"num_input_tokens_seen": 30111200,
"step": 2575
},
{
"epoch": 1.3286118980169972,
"grad_norm": 6.631117517846943,
"learning_rate": 1.5077108806530581e-05,
"loss": 0.3159,
"num_input_tokens_seen": 30169680,
"step": 2580
},
{
"epoch": 1.3311872263713624,
"grad_norm": 4.171513219192368,
"learning_rate": 1.4903506621185192e-05,
"loss": 0.3752,
"num_input_tokens_seen": 30228176,
"step": 2585
},
{
"epoch": 1.3337625547257275,
"grad_norm": 4.3829742543964985,
"learning_rate": 1.4730734473084568e-05,
"loss": 0.3207,
"num_input_tokens_seen": 30286656,
"step": 2590
},
{
"epoch": 1.3363378830800927,
"grad_norm": 7.160007281376411,
"learning_rate": 1.4558796448315504e-05,
"loss": 0.2928,
"num_input_tokens_seen": 30345160,
"step": 2595
},
{
"epoch": 1.338913211434458,
"grad_norm": 7.316812087176357,
"learning_rate": 1.4387696613237612e-05,
"loss": 0.2779,
"num_input_tokens_seen": 30403640,
"step": 2600
},
{
"epoch": 1.338913211434458,
"eval_loss": 0.695651650428772,
"eval_runtime": 16.2569,
"eval_samples_per_second": 3.691,
"eval_steps_per_second": 0.923,
"num_input_tokens_seen": 30403640,
"step": 2600
},
{
"epoch": 1.341488539788823,
"grad_norm": 6.900087606750275,
"learning_rate": 1.4217439014387251e-05,
"loss": 0.3037,
"num_input_tokens_seen": 30462128,
"step": 2605
},
{
"epoch": 1.3440638681431882,
"grad_norm": 9.361737062462586,
"learning_rate": 1.404802767838176e-05,
"loss": 0.2905,
"num_input_tokens_seen": 30520616,
"step": 2610
},
{
"epoch": 1.3466391964975535,
"grad_norm": 11.101564672040755,
"learning_rate": 1.3879466611824199e-05,
"loss": 0.317,
"num_input_tokens_seen": 30579024,
"step": 2615
},
{
"epoch": 1.3492145248519187,
"grad_norm": 5.213355428878847,
"learning_rate": 1.371175980120864e-05,
"loss": 0.2794,
"num_input_tokens_seen": 30637464,
"step": 2620
},
{
"epoch": 1.3517898532062838,
"grad_norm": 4.8688198861459915,
"learning_rate": 1.3544911212825906e-05,
"loss": 0.3056,
"num_input_tokens_seen": 30695936,
"step": 2625
},
{
"epoch": 1.354365181560649,
"grad_norm": 9.002025840794365,
"learning_rate": 1.337892479266974e-05,
"loss": 0.2712,
"num_input_tokens_seen": 30754408,
"step": 2630
},
{
"epoch": 1.356940509915014,
"grad_norm": 4.793656741683869,
"learning_rate": 1.3213804466343421e-05,
"loss": 0.2615,
"num_input_tokens_seen": 30812848,
"step": 2635
},
{
"epoch": 1.3595158382693793,
"grad_norm": 5.128300113893045,
"learning_rate": 1.3049554138967051e-05,
"loss": 0.2661,
"num_input_tokens_seen": 30871344,
"step": 2640
},
{
"epoch": 1.3620911666237445,
"grad_norm": 6.038434247454305,
"learning_rate": 1.2886177695085078e-05,
"loss": 0.3272,
"num_input_tokens_seen": 30929824,
"step": 2645
},
{
"epoch": 1.3646664949781098,
"grad_norm": 5.501317116522042,
"learning_rate": 1.2723678998574512e-05,
"loss": 0.2962,
"num_input_tokens_seen": 30988344,
"step": 2650
},
{
"epoch": 1.3646664949781098,
"eval_loss": 0.7657458186149597,
"eval_runtime": 16.0821,
"eval_samples_per_second": 3.731,
"eval_steps_per_second": 0.933,
"num_input_tokens_seen": 30988344,
"step": 2650
},
{
"epoch": 1.367241823332475,
"grad_norm": 5.445887797084714,
"learning_rate": 1.2562061892553473e-05,
"loss": 0.3207,
"num_input_tokens_seen": 31046848,
"step": 2655
},
{
"epoch": 1.36981715168684,
"grad_norm": 8.28343197617098,
"learning_rate": 1.2401330199290367e-05,
"loss": 0.3001,
"num_input_tokens_seen": 31105352,
"step": 2660
},
{
"epoch": 1.3723924800412053,
"grad_norm": 6.0349779847885054,
"learning_rate": 1.224148772011346e-05,
"loss": 0.2858,
"num_input_tokens_seen": 31163848,
"step": 2665
},
{
"epoch": 1.3749678083955703,
"grad_norm": 6.430225669948217,
"learning_rate": 1.2082538235320929e-05,
"loss": 0.2338,
"num_input_tokens_seen": 31222360,
"step": 2670
},
{
"epoch": 1.3775431367499356,
"grad_norm": 7.550675916086161,
"learning_rate": 1.1924485504091565e-05,
"loss": 0.2212,
"num_input_tokens_seen": 31280840,
"step": 2675
},
{
"epoch": 1.3801184651043008,
"grad_norm": 9.927835245980713,
"learning_rate": 1.1767333264395736e-05,
"loss": 0.3131,
"num_input_tokens_seen": 31339264,
"step": 2680
},
{
"epoch": 1.382693793458666,
"grad_norm": 6.940248775417007,
"learning_rate": 1.1611085232907132e-05,
"loss": 0.3616,
"num_input_tokens_seen": 31397744,
"step": 2685
},
{
"epoch": 1.385269121813031,
"grad_norm": 13.50108715364713,
"learning_rate": 1.14557451049147e-05,
"loss": 0.3153,
"num_input_tokens_seen": 31456240,
"step": 2690
},
{
"epoch": 1.3878444501673963,
"grad_norm": 5.379761157260886,
"learning_rate": 1.1301316554235397e-05,
"loss": 0.3044,
"num_input_tokens_seen": 31514744,
"step": 2695
},
{
"epoch": 1.3904197785217616,
"grad_norm": 6.480605347127299,
"learning_rate": 1.114780323312724e-05,
"loss": 0.3163,
"num_input_tokens_seen": 31573240,
"step": 2700
},
{
"epoch": 1.3904197785217616,
"eval_loss": 0.7473158240318298,
"eval_runtime": 16.166,
"eval_samples_per_second": 3.711,
"eval_steps_per_second": 0.928,
"num_input_tokens_seen": 31573240,
"step": 2700
},
{
"epoch": 1.3929951068761266,
"grad_norm": 4.579483859059419,
"learning_rate": 1.0995208772202897e-05,
"loss": 0.2798,
"num_input_tokens_seen": 31631688,
"step": 2705
},
{
"epoch": 1.3955704352304918,
"grad_norm": 6.098482033036635,
"learning_rate": 1.0843536780343865e-05,
"loss": 0.289,
"num_input_tokens_seen": 31690200,
"step": 2710
},
{
"epoch": 1.398145763584857,
"grad_norm": 9.834029857293697,
"learning_rate": 1.069279084461513e-05,
"loss": 0.2844,
"num_input_tokens_seen": 31748664,
"step": 2715
},
{
"epoch": 1.4007210919392223,
"grad_norm": 9.387518267357049,
"learning_rate": 1.0542974530180327e-05,
"loss": 0.3254,
"num_input_tokens_seen": 31807176,
"step": 2720
},
{
"epoch": 1.4032964202935874,
"grad_norm": 5.648695214602192,
"learning_rate": 1.0394091380217352e-05,
"loss": 0.3683,
"num_input_tokens_seen": 31865696,
"step": 2725
},
{
"epoch": 1.4058717486479526,
"grad_norm": 5.202858729177478,
"learning_rate": 1.0246144915834683e-05,
"loss": 0.2968,
"num_input_tokens_seen": 31924200,
"step": 2730
},
{
"epoch": 1.4084470770023179,
"grad_norm": 4.808429946385537,
"learning_rate": 1.0099138635988026e-05,
"loss": 0.2943,
"num_input_tokens_seen": 31982712,
"step": 2735
},
{
"epoch": 1.4110224053566829,
"grad_norm": 5.094039780174813,
"learning_rate": 9.953076017397578e-06,
"loss": 0.3037,
"num_input_tokens_seen": 32041176,
"step": 2740
},
{
"epoch": 1.4135977337110481,
"grad_norm": 5.807237736394797,
"learning_rate": 9.807960514465792e-06,
"loss": 0.3019,
"num_input_tokens_seen": 32099656,
"step": 2745
},
{
"epoch": 1.4161730620654134,
"grad_norm": 6.27488451409393,
"learning_rate": 9.663795559195733e-06,
"loss": 0.164,
"num_input_tokens_seen": 32158144,
"step": 2750
},
{
"epoch": 1.4161730620654134,
"eval_loss": 0.7807286381721497,
"eval_runtime": 16.139,
"eval_samples_per_second": 3.718,
"eval_steps_per_second": 0.929,
"num_input_tokens_seen": 32158144,
"step": 2750
},
{
"epoch": 1.4187483904197786,
"grad_norm": 6.584628814510667,
"learning_rate": 9.520584561109864e-06,
"loss": 0.3333,
"num_input_tokens_seen": 32216656,
"step": 2755
},
{
"epoch": 1.4213237187741437,
"grad_norm": 7.509676086247465,
"learning_rate": 9.378330907169386e-06,
"loss": 0.2993,
"num_input_tokens_seen": 32275168,
"step": 2760
},
{
"epoch": 1.423899047128509,
"grad_norm": 5.1775193353141535,
"learning_rate": 9.237037961694223e-06,
"loss": 0.2683,
"num_input_tokens_seen": 32333664,
"step": 2765
},
{
"epoch": 1.4264743754828741,
"grad_norm": 7.856433365965151,
"learning_rate": 9.096709066283354e-06,
"loss": 0.3145,
"num_input_tokens_seen": 32392088,
"step": 2770
},
{
"epoch": 1.4290497038372392,
"grad_norm": 8.252870521534577,
"learning_rate": 8.957347539735872e-06,
"loss": 0.3092,
"num_input_tokens_seen": 32450584,
"step": 2775
},
{
"epoch": 1.4316250321916044,
"grad_norm": 9.74883489294415,
"learning_rate": 8.818956677972406e-06,
"loss": 0.2993,
"num_input_tokens_seen": 32509096,
"step": 2780
},
{
"epoch": 1.4342003605459697,
"grad_norm": 4.008158818829899,
"learning_rate": 8.681539753957269e-06,
"loss": 0.326,
"num_input_tokens_seen": 32567560,
"step": 2785
},
{
"epoch": 1.436775688900335,
"grad_norm": 3.4229494980881174,
"learning_rate": 8.545100017620988e-06,
"loss": 0.2494,
"num_input_tokens_seen": 32626056,
"step": 2790
},
{
"epoch": 1.4393510172547,
"grad_norm": 4.425295787830864,
"learning_rate": 8.409640695783443e-06,
"loss": 0.2691,
"num_input_tokens_seen": 32684520,
"step": 2795
},
{
"epoch": 1.4419263456090652,
"grad_norm": 5.132559476583136,
"learning_rate": 8.275164992077556e-06,
"loss": 0.2939,
"num_input_tokens_seen": 32743032,
"step": 2800
},
{
"epoch": 1.4419263456090652,
"eval_loss": 0.791334331035614,
"eval_runtime": 16.1142,
"eval_samples_per_second": 3.723,
"eval_steps_per_second": 0.931,
"num_input_tokens_seen": 32743032,
"step": 2800
},
{
"epoch": 1.4445016739634302,
"grad_norm": 4.932628514942533,
"learning_rate": 8.141676086873572e-06,
"loss": 0.2974,
"num_input_tokens_seen": 32801504,
"step": 2805
},
{
"epoch": 1.4470770023177955,
"grad_norm": 8.764444587690557,
"learning_rate": 8.009177137203794e-06,
"loss": 0.2849,
"num_input_tokens_seen": 32860032,
"step": 2810
},
{
"epoch": 1.4496523306721607,
"grad_norm": 5.502098759051231,
"learning_rate": 7.877671276687898e-06,
"loss": 0.3024,
"num_input_tokens_seen": 32918472,
"step": 2815
},
{
"epoch": 1.452227659026526,
"grad_norm": 3.2634043608450183,
"learning_rate": 7.747161615458902e-06,
"loss": 0.2565,
"num_input_tokens_seen": 32976944,
"step": 2820
},
{
"epoch": 1.4548029873808912,
"grad_norm": 4.852977750360098,
"learning_rate": 7.617651240089546e-06,
"loss": 0.2473,
"num_input_tokens_seen": 33035424,
"step": 2825
},
{
"epoch": 1.4573783157352562,
"grad_norm": 8.667293936674204,
"learning_rate": 7.489143213519301e-06,
"loss": 0.3118,
"num_input_tokens_seen": 33093880,
"step": 2830
},
{
"epoch": 1.4599536440896215,
"grad_norm": 9.253351843058615,
"learning_rate": 7.361640574981937e-06,
"loss": 0.2593,
"num_input_tokens_seen": 33152328,
"step": 2835
},
{
"epoch": 1.4625289724439865,
"grad_norm": 6.811131820051524,
"learning_rate": 7.2351463399336735e-06,
"loss": 0.284,
"num_input_tokens_seen": 33210816,
"step": 2840
},
{
"epoch": 1.4651043007983517,
"grad_norm": 4.086720732934785,
"learning_rate": 7.109663499981834e-06,
"loss": 0.2671,
"num_input_tokens_seen": 33269320,
"step": 2845
},
{
"epoch": 1.467679629152717,
"grad_norm": 9.463519299706055,
"learning_rate": 6.985195022814067e-06,
"loss": 0.2848,
"num_input_tokens_seen": 33327720,
"step": 2850
},
{
"epoch": 1.467679629152717,
"eval_loss": 0.8045337796211243,
"eval_runtime": 15.9996,
"eval_samples_per_second": 3.75,
"eval_steps_per_second": 0.938,
"num_input_tokens_seen": 33327720,
"step": 2850
},
{
"epoch": 1.4702549575070822,
"grad_norm": 6.856320486947826,
"learning_rate": 6.861743852128233e-06,
"loss": 0.2811,
"num_input_tokens_seen": 33386160,
"step": 2855
},
{
"epoch": 1.4728302858614473,
"grad_norm": 8.133776634702407,
"learning_rate": 6.7393129075627335e-06,
"loss": 0.2394,
"num_input_tokens_seen": 33444648,
"step": 2860
},
{
"epoch": 1.4754056142158125,
"grad_norm": 5.884612144672532,
"learning_rate": 6.6179050846274515e-06,
"loss": 0.243,
"num_input_tokens_seen": 33503144,
"step": 2865
},
{
"epoch": 1.4779809425701778,
"grad_norm": 7.133095118516192,
"learning_rate": 6.497523254635296e-06,
"loss": 0.242,
"num_input_tokens_seen": 33561600,
"step": 2870
},
{
"epoch": 1.4805562709245428,
"grad_norm": 3.725193081900286,
"learning_rate": 6.37817026463432e-06,
"loss": 0.1864,
"num_input_tokens_seen": 33620056,
"step": 2875
},
{
"epoch": 1.483131599278908,
"grad_norm": 5.26408055314188,
"learning_rate": 6.25984893734034e-06,
"loss": 0.2406,
"num_input_tokens_seen": 33678512,
"step": 2880
},
{
"epoch": 1.4857069276332733,
"grad_norm": 5.139938399894378,
"learning_rate": 6.142562071070179e-06,
"loss": 0.2287,
"num_input_tokens_seen": 33736960,
"step": 2885
},
{
"epoch": 1.4882822559876385,
"grad_norm": 5.551633292498772,
"learning_rate": 6.026312439675552e-06,
"loss": 0.2643,
"num_input_tokens_seen": 33795416,
"step": 2890
},
{
"epoch": 1.4908575843420036,
"grad_norm": 5.974549504189433,
"learning_rate": 5.911102792477357e-06,
"loss": 0.2956,
"num_input_tokens_seen": 33853936,
"step": 2895
},
{
"epoch": 1.4934329126963688,
"grad_norm": 5.786971041370645,
"learning_rate": 5.796935854200763e-06,
"loss": 0.29,
"num_input_tokens_seen": 33912440,
"step": 2900
},
{
"epoch": 1.4934329126963688,
"eval_loss": 0.8113046884536743,
"eval_runtime": 16.0025,
"eval_samples_per_second": 3.749,
"eval_steps_per_second": 0.937,
"num_input_tokens_seen": 33912440,
"step": 2900
},
{
"epoch": 1.496008241050734,
"grad_norm": 5.559213288581127,
"learning_rate": 5.683814324910685e-06,
"loss": 0.2815,
"num_input_tokens_seen": 33970888,
"step": 2905
},
{
"epoch": 1.498583569405099,
"grad_norm": 4.093818675769417,
"learning_rate": 5.571740879947979e-06,
"loss": 0.2737,
"num_input_tokens_seen": 34029376,
"step": 2910
},
{
"epoch": 1.5011588977594643,
"grad_norm": 3.092699650877493,
"learning_rate": 5.4607181698661634e-06,
"loss": 0.2445,
"num_input_tokens_seen": 34087864,
"step": 2915
},
{
"epoch": 1.5037342261138296,
"grad_norm": 11.010380823046683,
"learning_rate": 5.35074882036869e-06,
"loss": 0.2802,
"num_input_tokens_seen": 34146296,
"step": 2920
},
{
"epoch": 1.5063095544681948,
"grad_norm": 6.09904123406433,
"learning_rate": 5.241835432246889e-06,
"loss": 0.2379,
"num_input_tokens_seen": 34204800,
"step": 2925
},
{
"epoch": 1.5088848828225598,
"grad_norm": 6.205588168386299,
"learning_rate": 5.133980581318459e-06,
"loss": 0.2783,
"num_input_tokens_seen": 34263296,
"step": 2930
},
{
"epoch": 1.511460211176925,
"grad_norm": 5.945749064464075,
"learning_rate": 5.027186818366542e-06,
"loss": 0.2609,
"num_input_tokens_seen": 34321792,
"step": 2935
},
{
"epoch": 1.51403553953129,
"grad_norm": 6.50829738633896,
"learning_rate": 4.921456669079366e-06,
"loss": 0.2367,
"num_input_tokens_seen": 34380264,
"step": 2940
},
{
"epoch": 1.5166108678856554,
"grad_norm": 8.02525724539128,
"learning_rate": 4.816792633990569e-06,
"loss": 0.3644,
"num_input_tokens_seen": 34438752,
"step": 2945
},
{
"epoch": 1.5191861962400206,
"grad_norm": 8.28398511184134,
"learning_rate": 4.713197188420026e-06,
"loss": 0.2494,
"num_input_tokens_seen": 34497216,
"step": 2950
},
{
"epoch": 1.5191861962400206,
"eval_loss": 0.8177086710929871,
"eval_runtime": 16.0851,
"eval_samples_per_second": 3.73,
"eval_steps_per_second": 0.933,
"num_input_tokens_seen": 34497216,
"step": 2950
},
{
"epoch": 1.5217615245943859,
"grad_norm": 7.1653439027229,
"learning_rate": 4.610672782415276e-06,
"loss": 0.2892,
"num_input_tokens_seen": 34555704,
"step": 2955
},
{
"epoch": 1.524336852948751,
"grad_norm": 5.9872264088640295,
"learning_rate": 4.509221840693656e-06,
"loss": 0.3006,
"num_input_tokens_seen": 34614168,
"step": 2960
},
{
"epoch": 1.5269121813031161,
"grad_norm": 3.47728801697101,
"learning_rate": 4.408846762584901e-06,
"loss": 0.2931,
"num_input_tokens_seen": 34672624,
"step": 2965
},
{
"epoch": 1.5294875096574814,
"grad_norm": 5.342563435045045,
"learning_rate": 4.309549921974421e-06,
"loss": 0.2255,
"num_input_tokens_seen": 34731056,
"step": 2970
},
{
"epoch": 1.5320628380118464,
"grad_norm": 8.130368656554953,
"learning_rate": 4.2113336672471245e-06,
"loss": 0.2725,
"num_input_tokens_seen": 34789552,
"step": 2975
},
{
"epoch": 1.5346381663662116,
"grad_norm": 6.656792231449799,
"learning_rate": 4.114200321231937e-06,
"loss": 0.3158,
"num_input_tokens_seen": 34848064,
"step": 2980
},
{
"epoch": 1.537213494720577,
"grad_norm": 16.361277885783338,
"learning_rate": 4.018152181146823e-06,
"loss": 0.2562,
"num_input_tokens_seen": 34906592,
"step": 2985
},
{
"epoch": 1.5397888230749421,
"grad_norm": 5.885778380254227,
"learning_rate": 3.923191518544434e-06,
"loss": 0.2814,
"num_input_tokens_seen": 34965064,
"step": 2990
},
{
"epoch": 1.5423641514293074,
"grad_norm": 7.567800102342742,
"learning_rate": 3.829320579258466e-06,
"loss": 0.2555,
"num_input_tokens_seen": 35023552,
"step": 2995
},
{
"epoch": 1.5449394797836724,
"grad_norm": 6.846236051634878,
"learning_rate": 3.7365415833504725e-06,
"loss": 0.2259,
"num_input_tokens_seen": 35082056,
"step": 3000
},
{
"epoch": 1.5449394797836724,
"eval_loss": 0.8405727744102478,
"eval_runtime": 16.2083,
"eval_samples_per_second": 3.702,
"eval_steps_per_second": 0.925,
"num_input_tokens_seen": 35082056,
"step": 3000
},
{
"epoch": 1.5475148081380374,
"grad_norm": 5.643348291984009,
"learning_rate": 3.644856725057405e-06,
"loss": 0.2157,
"num_input_tokens_seen": 35140568,
"step": 3005
},
{
"epoch": 1.5500901364924027,
"grad_norm": 6.225693907549098,
"learning_rate": 3.554268172739661e-06,
"loss": 0.2233,
"num_input_tokens_seen": 35199064,
"step": 3010
},
{
"epoch": 1.552665464846768,
"grad_norm": 5.080945994557626,
"learning_rate": 3.4647780688298826e-06,
"loss": 0.2951,
"num_input_tokens_seen": 35257576,
"step": 3015
},
{
"epoch": 1.5552407932011332,
"grad_norm": 5.263879934995459,
"learning_rate": 3.376388529782215e-06,
"loss": 0.2274,
"num_input_tokens_seen": 35316064,
"step": 3020
},
{
"epoch": 1.5578161215554984,
"grad_norm": 5.655349471422181,
"learning_rate": 3.2891016460222967e-06,
"loss": 0.2479,
"num_input_tokens_seen": 35374504,
"step": 3025
},
{
"epoch": 1.5603914499098637,
"grad_norm": 7.871895425892081,
"learning_rate": 3.2029194818977983e-06,
"loss": 0.292,
"num_input_tokens_seen": 35432984,
"step": 3030
},
{
"epoch": 1.5629667782642287,
"grad_norm": 6.441418084723481,
"learning_rate": 3.117844075629617e-06,
"loss": 0.241,
"num_input_tokens_seen": 35491488,
"step": 3035
},
{
"epoch": 1.5655421066185937,
"grad_norm": 5.268339109046189,
"learning_rate": 3.033877439263666e-06,
"loss": 0.228,
"num_input_tokens_seen": 35549984,
"step": 3040
},
{
"epoch": 1.568117434972959,
"grad_norm": 7.110464304213341,
"learning_rate": 2.951021558623274e-06,
"loss": 0.2485,
"num_input_tokens_seen": 35608488,
"step": 3045
},
{
"epoch": 1.5706927633273242,
"grad_norm": 12.567694093056492,
"learning_rate": 2.869278393262226e-06,
"loss": 0.2851,
"num_input_tokens_seen": 35666976,
"step": 3050
},
{
"epoch": 1.5706927633273242,
"eval_loss": 0.8473746180534363,
"eval_runtime": 16.0314,
"eval_samples_per_second": 3.743,
"eval_steps_per_second": 0.936,
"num_input_tokens_seen": 35666976,
"step": 3050
},
{
"epoch": 1.5732680916816895,
"grad_norm": 5.787936921221981,
"learning_rate": 2.7886498764184588e-06,
"loss": 0.2514,
"num_input_tokens_seen": 35725456,
"step": 3055
},
{
"epoch": 1.5758434200360547,
"grad_norm": 7.052716790363759,
"learning_rate": 2.7091379149682685e-06,
"loss": 0.3091,
"num_input_tokens_seen": 35783912,
"step": 3060
},
{
"epoch": 1.5784187483904197,
"grad_norm": 4.6737853290480915,
"learning_rate": 2.6307443893812843e-06,
"loss": 0.2629,
"num_input_tokens_seen": 35842376,
"step": 3065
},
{
"epoch": 1.580994076744785,
"grad_norm": 8.400296818269052,
"learning_rate": 2.5534711536759404e-06,
"loss": 0.3065,
"num_input_tokens_seen": 35900824,
"step": 3070
},
{
"epoch": 1.58356940509915,
"grad_norm": 3.909241159865706,
"learning_rate": 2.4773200353756798e-06,
"loss": 0.2577,
"num_input_tokens_seen": 35959264,
"step": 3075
},
{
"epoch": 1.5861447334535153,
"grad_norm": 5.227660314173737,
"learning_rate": 2.4022928354656473e-06,
"loss": 0.2359,
"num_input_tokens_seen": 36017760,
"step": 3080
},
{
"epoch": 1.5887200618078805,
"grad_norm": 5.407491053931616,
"learning_rate": 2.3283913283502044e-06,
"loss": 0.1897,
"num_input_tokens_seen": 36076280,
"step": 3085
},
{
"epoch": 1.5912953901622457,
"grad_norm": 5.771594174948701,
"learning_rate": 2.2556172618108997e-06,
"loss": 0.286,
"num_input_tokens_seen": 36134784,
"step": 3090
},
{
"epoch": 1.593870718516611,
"grad_norm": 5.508770087080472,
"learning_rate": 2.183972356965125e-06,
"loss": 0.2733,
"num_input_tokens_seen": 36193288,
"step": 3095
},
{
"epoch": 1.596446046870976,
"grad_norm": 6.343942326218544,
"learning_rate": 2.113458308225458e-06,
"loss": 0.2351,
"num_input_tokens_seen": 36251744,
"step": 3100
},
{
"epoch": 1.596446046870976,
"eval_loss": 0.8650907874107361,
"eval_runtime": 16.0989,
"eval_samples_per_second": 3.727,
"eval_steps_per_second": 0.932,
"num_input_tokens_seen": 36251744,
"step": 3100
},
{
"epoch": 1.5990213752253413,
"grad_norm": 4.00048030481465,
"learning_rate": 2.0440767832595574e-06,
"loss": 0.2454,
"num_input_tokens_seen": 36310200,
"step": 3105
},
{
"epoch": 1.6015967035797063,
"grad_norm": 5.230064679031373,
"learning_rate": 1.975829422950709e-06,
"loss": 0.2629,
"num_input_tokens_seen": 36368688,
"step": 3110
},
{
"epoch": 1.6041720319340715,
"grad_norm": 12.271894553598498,
"learning_rate": 1.908717841359048e-06,
"loss": 0.2848,
"num_input_tokens_seen": 36427192,
"step": 3115
},
{
"epoch": 1.6067473602884368,
"grad_norm": 8.178213306290619,
"learning_rate": 1.8427436256833852e-06,
"loss": 0.228,
"num_input_tokens_seen": 36485656,
"step": 3120
},
{
"epoch": 1.609322688642802,
"grad_norm": 4.853366085377887,
"learning_rate": 1.7779083362236547e-06,
"loss": 0.2239,
"num_input_tokens_seen": 36544128,
"step": 3125
},
{
"epoch": 1.6118980169971673,
"grad_norm": 10.968162741068843,
"learning_rate": 1.7142135063440035e-06,
"loss": 0.2585,
"num_input_tokens_seen": 36602568,
"step": 3130
},
{
"epoch": 1.6144733453515323,
"grad_norm": 5.564416348243761,
"learning_rate": 1.6516606424365643e-06,
"loss": 0.2887,
"num_input_tokens_seen": 36661064,
"step": 3135
},
{
"epoch": 1.6170486737058976,
"grad_norm": 8.095832161946442,
"learning_rate": 1.5902512238857858e-06,
"loss": 0.2446,
"num_input_tokens_seen": 36719544,
"step": 3140
},
{
"epoch": 1.6196240020602626,
"grad_norm": 8.906257390618395,
"learning_rate": 1.5299867030334814e-06,
"loss": 0.2673,
"num_input_tokens_seen": 36778064,
"step": 3145
},
{
"epoch": 1.6221993304146278,
"grad_norm": 6.864070166407251,
"learning_rate": 1.4708685051444515e-06,
"loss": 0.2638,
"num_input_tokens_seen": 36836560,
"step": 3150
},
{
"epoch": 1.6221993304146278,
"eval_loss": 0.8633677363395691,
"eval_runtime": 16.2031,
"eval_samples_per_second": 3.703,
"eval_steps_per_second": 0.926,
"num_input_tokens_seen": 36836560,
"step": 3150
},
{
"epoch": 1.624774658768993,
"grad_norm": 8.026607293073416,
"learning_rate": 1.4128980283727943e-06,
"loss": 0.2793,
"num_input_tokens_seen": 36895016,
"step": 3155
},
{
"epoch": 1.6273499871233583,
"grad_norm": 11.669862098293653,
"learning_rate": 1.356076643728843e-06,
"loss": 0.2887,
"num_input_tokens_seen": 36953528,
"step": 3160
},
{
"epoch": 1.6299253154777236,
"grad_norm": 5.580791837684188,
"learning_rate": 1.3004056950467135e-06,
"loss": 0.317,
"num_input_tokens_seen": 37012056,
"step": 3165
},
{
"epoch": 1.6325006438320886,
"grad_norm": 4.650356589287389,
"learning_rate": 1.2458864989525698e-06,
"loss": 0.2095,
"num_input_tokens_seen": 37070528,
"step": 3170
},
{
"epoch": 1.6350759721864536,
"grad_norm": 6.089813437162075,
"learning_rate": 1.19252034483342e-06,
"loss": 0.237,
"num_input_tokens_seen": 37129008,
"step": 3175
},
{
"epoch": 1.6376513005408189,
"grad_norm": 5.287668578489162,
"learning_rate": 1.1403084948067021e-06,
"loss": 0.2448,
"num_input_tokens_seen": 37187472,
"step": 3180
},
{
"epoch": 1.6402266288951841,
"grad_norm": 6.982602482070445,
"learning_rate": 1.089252183690348e-06,
"loss": 0.2563,
"num_input_tokens_seen": 37245936,
"step": 3185
},
{
"epoch": 1.6428019572495494,
"grad_norm": 2.9242653665827647,
"learning_rate": 1.0393526189736602e-06,
"loss": 0.2538,
"num_input_tokens_seen": 37304424,
"step": 3190
},
{
"epoch": 1.6453772856039146,
"grad_norm": 6.894723044936381,
"learning_rate": 9.906109807887032e-07,
"loss": 0.1768,
"num_input_tokens_seen": 37362888,
"step": 3195
},
{
"epoch": 1.6479526139582796,
"grad_norm": 6.796664957587956,
"learning_rate": 9.430284218824026e-07,
"loss": 0.312,
"num_input_tokens_seen": 37421416,
"step": 3200
},
{
"epoch": 1.6479526139582796,
"eval_loss": 0.8679988980293274,
"eval_runtime": 16.1678,
"eval_samples_per_second": 3.711,
"eval_steps_per_second": 0.928,
"num_input_tokens_seen": 37421416,
"step": 3200
},
{
"epoch": 1.6505279423126449,
"grad_norm": 12.027460444161642,
"learning_rate": 8.966060675892951e-07,
"loss": 0.2865,
"num_input_tokens_seen": 37479848,
"step": 3205
},
{
"epoch": 1.65310327066701,
"grad_norm": 6.851221931248735,
"learning_rate": 8.513450158049108e-07,
"loss": 0.3299,
"num_input_tokens_seen": 37538312,
"step": 3210
},
{
"epoch": 1.6556785990213752,
"grad_norm": 6.971651790450948,
"learning_rate": 8.072463369597993e-07,
"loss": 0.3218,
"num_input_tokens_seen": 37596800,
"step": 3215
},
{
"epoch": 1.6582539273757404,
"grad_norm": 10.994527310957624,
"learning_rate": 7.643110739942172e-07,
"loss": 0.2593,
"num_input_tokens_seen": 37655312,
"step": 3220
},
{
"epoch": 1.6608292557301056,
"grad_norm": 13.542379224085927,
"learning_rate": 7.225402423334693e-07,
"loss": 0.3072,
"num_input_tokens_seen": 37713800,
"step": 3225
},
{
"epoch": 1.663404584084471,
"grad_norm": 5.442561929450427,
"learning_rate": 6.819348298638839e-07,
"loss": 0.2276,
"num_input_tokens_seen": 37772280,
"step": 3230
},
{
"epoch": 1.665979912438836,
"grad_norm": 8.128386248398428,
"learning_rate": 6.424957969094536e-07,
"loss": 0.2489,
"num_input_tokens_seen": 37830800,
"step": 3235
},
{
"epoch": 1.6685552407932012,
"grad_norm": 3.9766881915113266,
"learning_rate": 6.0422407620912e-07,
"loss": 0.2552,
"num_input_tokens_seen": 37889280,
"step": 3240
},
{
"epoch": 1.6711305691475662,
"grad_norm": 5.555365927504982,
"learning_rate": 5.671205728947305e-07,
"loss": 0.226,
"num_input_tokens_seen": 37947728,
"step": 3245
},
{
"epoch": 1.6737058975019314,
"grad_norm": 5.733028191926084,
"learning_rate": 5.311861644696048e-07,
"loss": 0.2785,
"num_input_tokens_seen": 38006200,
"step": 3250
},
{
"epoch": 1.6737058975019314,
"eval_loss": 0.8640011548995972,
"eval_runtime": 16.0965,
"eval_samples_per_second": 3.728,
"eval_steps_per_second": 0.932,
"num_input_tokens_seen": 38006200,
"step": 3250
},
{
"epoch": 1.6762812258562967,
"grad_norm": 4.778342712582032,
"learning_rate": 4.964217007878081e-07,
"loss": 0.2291,
"num_input_tokens_seen": 38064672,
"step": 3255
},
{
"epoch": 1.678856554210662,
"grad_norm": 4.4902131141962,
"learning_rate": 4.6282800403402715e-07,
"loss": 0.3101,
"num_input_tokens_seen": 38123192,
"step": 3260
},
{
"epoch": 1.6814318825650272,
"grad_norm": 7.687294001046122,
"learning_rate": 4.3040586870415346e-07,
"loss": 0.3196,
"num_input_tokens_seen": 38181696,
"step": 3265
},
{
"epoch": 1.6840072109193922,
"grad_norm": 7.392271519909896,
"learning_rate": 3.991560615864587e-07,
"loss": 0.2587,
"num_input_tokens_seen": 38240216,
"step": 3270
},
{
"epoch": 1.6865825392737575,
"grad_norm": 6.335589264461425,
"learning_rate": 3.6907932174349846e-07,
"loss": 0.2093,
"num_input_tokens_seen": 38298688,
"step": 3275
},
{
"epoch": 1.6891578676281225,
"grad_norm": 7.268228162683875,
"learning_rate": 3.40176360494604e-07,
"loss": 0.2282,
"num_input_tokens_seen": 38357128,
"step": 3280
},
{
"epoch": 1.6917331959824877,
"grad_norm": 4.776419874246786,
"learning_rate": 3.124478613990733e-07,
"loss": 0.2092,
"num_input_tokens_seen": 38415600,
"step": 3285
},
{
"epoch": 1.694308524336853,
"grad_norm": 8.522894464657169,
"learning_rate": 2.8589448023998987e-07,
"loss": 0.2861,
"num_input_tokens_seen": 38474112,
"step": 3290
},
{
"epoch": 1.6968838526912182,
"grad_norm": 5.304805044526707,
"learning_rate": 2.605168450087514e-07,
"loss": 0.2494,
"num_input_tokens_seen": 38532624,
"step": 3295
},
{
"epoch": 1.6994591810455835,
"grad_norm": 7.112591931914542,
"learning_rate": 2.363155558901542e-07,
"loss": 0.2752,
"num_input_tokens_seen": 38591128,
"step": 3300
},
{
"epoch": 1.6994591810455835,
"eval_loss": 0.8644178509712219,
"eval_runtime": 16.1497,
"eval_samples_per_second": 3.715,
"eval_steps_per_second": 0.929,
"num_input_tokens_seen": 38591128,
"step": 3300
},
{
"epoch": 1.7020345093999485,
"grad_norm": 4.935833215525081,
"learning_rate": 2.1329118524827662e-07,
"loss": 0.2337,
"num_input_tokens_seen": 38649640,
"step": 3305
},
{
"epoch": 1.7046098377543135,
"grad_norm": 5.746920185244728,
"learning_rate": 1.9144427761286222e-07,
"loss": 0.215,
"num_input_tokens_seen": 38708112,
"step": 3310
},
{
"epoch": 1.7071851661086788,
"grad_norm": 6.501004359690972,
"learning_rate": 1.7077534966650766e-07,
"loss": 0.2871,
"num_input_tokens_seen": 38766624,
"step": 3315
},
{
"epoch": 1.709760494463044,
"grad_norm": 6.996403813160393,
"learning_rate": 1.51284890232406e-07,
"loss": 0.3478,
"num_input_tokens_seen": 38825104,
"step": 3320
},
{
"epoch": 1.7123358228174093,
"grad_norm": 5.178545190033401,
"learning_rate": 1.3297336026280027e-07,
"loss": 0.2055,
"num_input_tokens_seen": 38883560,
"step": 3325
},
{
"epoch": 1.7149111511717745,
"grad_norm": 6.686144266429449,
"learning_rate": 1.158411928280645e-07,
"loss": 0.2992,
"num_input_tokens_seen": 38942040,
"step": 3330
},
{
"epoch": 1.7174864795261395,
"grad_norm": 4.337439288142164,
"learning_rate": 9.988879310649513e-08,
"loss": 0.2302,
"num_input_tokens_seen": 39000488,
"step": 3335
},
{
"epoch": 1.7200618078805048,
"grad_norm": 6.5240260149211755,
"learning_rate": 8.511653837470212e-08,
"loss": 0.265,
"num_input_tokens_seen": 39058960,
"step": 3340
},
{
"epoch": 1.7226371362348698,
"grad_norm": 7.592689596688837,
"learning_rate": 7.152477799867719e-08,
"loss": 0.3147,
"num_input_tokens_seen": 39117416,
"step": 3345
},
{
"epoch": 1.725212464589235,
"grad_norm": 6.429413076205037,
"learning_rate": 5.911383342556143e-08,
"loss": 0.2674,
"num_input_tokens_seen": 39175888,
"step": 3350
},
{
"epoch": 1.725212464589235,
"eval_loss": 0.8666485548019409,
"eval_runtime": 16.1238,
"eval_samples_per_second": 3.721,
"eval_steps_per_second": 0.93,
"num_input_tokens_seen": 39175888,
"step": 3350
},
{
"epoch": 1.7277877929436003,
"grad_norm": 10.968051828666288,
"learning_rate": 4.788399817602929e-08,
"loss": 0.2565,
"num_input_tokens_seen": 39234336,
"step": 3355
},
{
"epoch": 1.7303631212979655,
"grad_norm": 5.1159559645491335,
"learning_rate": 3.7835537837338506e-08,
"loss": 0.2762,
"num_input_tokens_seen": 39292800,
"step": 3360
},
{
"epoch": 1.7329384496523308,
"grad_norm": 6.735859744015271,
"learning_rate": 2.8968690057051828e-08,
"loss": 0.2196,
"num_input_tokens_seen": 39351272,
"step": 3365
},
{
"epoch": 1.7355137780066958,
"grad_norm": 3.989003741597172,
"learning_rate": 2.128366453743591e-08,
"loss": 0.2482,
"num_input_tokens_seen": 39409736,
"step": 3370
},
{
"epoch": 1.738089106361061,
"grad_norm": 5.083412307953648,
"learning_rate": 1.4780643030476438e-08,
"loss": 0.2778,
"num_input_tokens_seen": 39468176,
"step": 3375
},
{
"epoch": 1.740664434715426,
"grad_norm": 7.4306605849577565,
"learning_rate": 9.459779333587104e-09,
"loss": 0.2048,
"num_input_tokens_seen": 39526688,
"step": 3380
},
{
"epoch": 1.7432397630697913,
"grad_norm": 4.202839419581782,
"learning_rate": 5.3211992859791835e-09,
"loss": 0.2296,
"num_input_tokens_seen": 39585152,
"step": 3385
},
{
"epoch": 1.7458150914241566,
"grad_norm": 7.909317855624412,
"learning_rate": 2.3650007656805806e-09,
"loss": 0.2713,
"num_input_tokens_seen": 39643640,
"step": 3390
},
{
"epoch": 1.7483904197785218,
"grad_norm": 7.880795429819755,
"learning_rate": 5.912536872321184e-10,
"loss": 0.2964,
"num_input_tokens_seen": 39702144,
"step": 3395
},
{
"epoch": 1.750965748132887,
"grad_norm": 4.00234080349809,
"learning_rate": 0.0,
"loss": 0.1797,
"num_input_tokens_seen": 39760664,
"step": 3400
},
{
"epoch": 1.750965748132887,
"eval_loss": 0.8603056073188782,
"eval_runtime": 16.2474,
"eval_samples_per_second": 3.693,
"eval_steps_per_second": 0.923,
"num_input_tokens_seen": 39760664,
"step": 3400
},
{
"epoch": 1.750965748132887,
"num_input_tokens_seen": 39760664,
"step": 3400,
"total_flos": 2232757993603072.0,
"train_loss": 0.5904174627801951,
"train_runtime": 45337.3565,
"train_samples_per_second": 1.8,
"train_steps_per_second": 0.075
}
],
"logging_steps": 5,
"max_steps": 3400,
"num_input_tokens_seen": 39760664,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2232757993603072.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}