{ "best_metric": 0.2966395914554596, "best_model_checkpoint": "saves/CADICA_qwenvl_direction_then_DetectAndClassify_scale6/lora/sft/checkpoint-1200", "epoch": 0.413589364844904, "eval_steps": 50, "global_step": 1400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014771048744460858, "grad_norm": 10.597820832195618, "learning_rate": 1.4749262536873157e-06, "loss": 1.8844, "num_input_tokens_seen": 52288, "step": 5 }, { "epoch": 0.0029542097488921715, "grad_norm": 10.631406320216014, "learning_rate": 2.9498525073746313e-06, "loss": 1.9494, "num_input_tokens_seen": 103976, "step": 10 }, { "epoch": 0.004431314623338257, "grad_norm": 9.455384953384051, "learning_rate": 4.424778761061947e-06, "loss": 1.995, "num_input_tokens_seen": 155560, "step": 15 }, { "epoch": 0.005908419497784343, "grad_norm": 21.163264642835454, "learning_rate": 5.899705014749263e-06, "loss": 2.0327, "num_input_tokens_seen": 206520, "step": 20 }, { "epoch": 0.007385524372230428, "grad_norm": 10.023350257859445, "learning_rate": 7.374631268436579e-06, "loss": 1.9153, "num_input_tokens_seen": 258464, "step": 25 }, { "epoch": 0.008862629246676515, "grad_norm": 10.512071286717141, "learning_rate": 8.849557522123894e-06, "loss": 1.9723, "num_input_tokens_seen": 309800, "step": 30 }, { "epoch": 0.0103397341211226, "grad_norm": 11.56445556664387, "learning_rate": 1.032448377581121e-05, "loss": 1.6646, "num_input_tokens_seen": 361216, "step": 35 }, { "epoch": 0.011816838995568686, "grad_norm": 12.04717354860619, "learning_rate": 1.1799410029498525e-05, "loss": 1.7057, "num_input_tokens_seen": 412680, "step": 40 }, { "epoch": 0.013293943870014771, "grad_norm": 4.748411551761613, "learning_rate": 1.3274336283185843e-05, "loss": 1.4552, "num_input_tokens_seen": 464640, "step": 45 }, { "epoch": 0.014771048744460856, "grad_norm": 7.526687970196985, "learning_rate": 1.4749262536873157e-05, "loss": 1.3918, "num_input_tokens_seen": 516240, "step": 50 }, { "epoch": 0.014771048744460856, "eval_loss": 1.042170763015747, "eval_runtime": 48.777, "eval_samples_per_second": 1.23, "eval_steps_per_second": 0.308, "num_input_tokens_seen": 516240, "step": 50 }, { "epoch": 0.01624815361890694, "grad_norm": 5.812969222976318, "learning_rate": 1.6224188790560475e-05, "loss": 1.2308, "num_input_tokens_seen": 567536, "step": 55 }, { "epoch": 0.01772525849335303, "grad_norm": 2.497864489287105, "learning_rate": 1.7699115044247787e-05, "loss": 1.0922, "num_input_tokens_seen": 619392, "step": 60 }, { "epoch": 0.019202363367799114, "grad_norm": 1.4292999616096396, "learning_rate": 1.9174041297935107e-05, "loss": 0.9517, "num_input_tokens_seen": 671168, "step": 65 }, { "epoch": 0.0206794682422452, "grad_norm": 1.5696921990486115, "learning_rate": 2.064896755162242e-05, "loss": 0.9277, "num_input_tokens_seen": 722464, "step": 70 }, { "epoch": 0.022156573116691284, "grad_norm": 2.288073027990093, "learning_rate": 2.2123893805309738e-05, "loss": 0.8741, "num_input_tokens_seen": 774120, "step": 75 }, { "epoch": 0.023633677991137372, "grad_norm": 2.658756604981594, "learning_rate": 2.359882005899705e-05, "loss": 0.8837, "num_input_tokens_seen": 825944, "step": 80 }, { "epoch": 0.025110782865583457, "grad_norm": 2.142324288581783, "learning_rate": 2.5073746312684367e-05, "loss": 0.8658, "num_input_tokens_seen": 877632, "step": 85 }, { "epoch": 0.026587887740029542, "grad_norm": 1.4822847258427907, "learning_rate": 2.6548672566371686e-05, "loss": 0.8626, "num_input_tokens_seen": 928664, "step": 90 }, { "epoch": 0.028064992614475627, "grad_norm": 1.2798949826045687, "learning_rate": 2.8023598820059e-05, "loss": 0.828, "num_input_tokens_seen": 980120, "step": 95 }, { "epoch": 0.029542097488921712, "grad_norm": 1.0072961699690943, "learning_rate": 2.9498525073746314e-05, "loss": 0.8208, "num_input_tokens_seen": 1030696, "step": 100 }, { "epoch": 0.029542097488921712, "eval_loss": 0.8917127847671509, "eval_runtime": 19.2166, "eval_samples_per_second": 3.122, "eval_steps_per_second": 0.781, "num_input_tokens_seen": 1030696, "step": 100 }, { "epoch": 0.0310192023633678, "grad_norm": 1.8545776603803357, "learning_rate": 3.097345132743363e-05, "loss": 0.8858, "num_input_tokens_seen": 1083184, "step": 105 }, { "epoch": 0.03249630723781388, "grad_norm": 1.5328827906971274, "learning_rate": 3.244837758112095e-05, "loss": 0.8395, "num_input_tokens_seen": 1135216, "step": 110 }, { "epoch": 0.033973412112259974, "grad_norm": 1.2347057774721741, "learning_rate": 3.3923303834808265e-05, "loss": 0.8729, "num_input_tokens_seen": 1187592, "step": 115 }, { "epoch": 0.03545051698670606, "grad_norm": 1.478939426983387, "learning_rate": 3.5398230088495574e-05, "loss": 0.8534, "num_input_tokens_seen": 1239544, "step": 120 }, { "epoch": 0.03692762186115214, "grad_norm": 1.7015290012303235, "learning_rate": 3.687315634218289e-05, "loss": 0.8621, "num_input_tokens_seen": 1291496, "step": 125 }, { "epoch": 0.03840472673559823, "grad_norm": 1.1767502699022077, "learning_rate": 3.834808259587021e-05, "loss": 0.8548, "num_input_tokens_seen": 1344704, "step": 130 }, { "epoch": 0.03988183161004431, "grad_norm": 0.8775651985355217, "learning_rate": 3.982300884955752e-05, "loss": 0.8555, "num_input_tokens_seen": 1396432, "step": 135 }, { "epoch": 0.0413589364844904, "grad_norm": 1.0196921729702504, "learning_rate": 4.129793510324484e-05, "loss": 0.8503, "num_input_tokens_seen": 1448304, "step": 140 }, { "epoch": 0.04283604135893648, "grad_norm": 1.0316734242866998, "learning_rate": 4.2772861356932154e-05, "loss": 0.7974, "num_input_tokens_seen": 1500480, "step": 145 }, { "epoch": 0.04431314623338257, "grad_norm": 1.3815092616140325, "learning_rate": 4.4247787610619477e-05, "loss": 0.8125, "num_input_tokens_seen": 1550792, "step": 150 }, { "epoch": 0.04431314623338257, "eval_loss": 0.9009397625923157, "eval_runtime": 19.1097, "eval_samples_per_second": 3.14, "eval_steps_per_second": 0.785, "num_input_tokens_seen": 1550792, "step": 150 }, { "epoch": 0.04579025110782865, "grad_norm": 0.9311064643035706, "learning_rate": 4.5722713864306786e-05, "loss": 0.8444, "num_input_tokens_seen": 1602680, "step": 155 }, { "epoch": 0.047267355982274745, "grad_norm": 0.8826375779686524, "learning_rate": 4.71976401179941e-05, "loss": 0.8832, "num_input_tokens_seen": 1655184, "step": 160 }, { "epoch": 0.04874446085672083, "grad_norm": 1.7104989113767923, "learning_rate": 4.867256637168142e-05, "loss": 0.8428, "num_input_tokens_seen": 1707544, "step": 165 }, { "epoch": 0.050221565731166914, "grad_norm": 1.0884565315781036, "learning_rate": 5.014749262536873e-05, "loss": 0.8235, "num_input_tokens_seen": 1759296, "step": 170 }, { "epoch": 0.051698670605613, "grad_norm": 1.2821380368908926, "learning_rate": 5.162241887905604e-05, "loss": 0.8293, "num_input_tokens_seen": 1812488, "step": 175 }, { "epoch": 0.053175775480059084, "grad_norm": 1.8069978017028316, "learning_rate": 5.309734513274337e-05, "loss": 0.8284, "num_input_tokens_seen": 1864408, "step": 180 }, { "epoch": 0.05465288035450517, "grad_norm": 1.3165540148767247, "learning_rate": 5.457227138643069e-05, "loss": 0.8268, "num_input_tokens_seen": 1916744, "step": 185 }, { "epoch": 0.056129985228951254, "grad_norm": 1.5174910847154595, "learning_rate": 5.6047197640118e-05, "loss": 0.8153, "num_input_tokens_seen": 1968128, "step": 190 }, { "epoch": 0.05760709010339734, "grad_norm": 1.4410901045723357, "learning_rate": 5.752212389380531e-05, "loss": 0.8123, "num_input_tokens_seen": 2019312, "step": 195 }, { "epoch": 0.059084194977843424, "grad_norm": 1.5417669370284124, "learning_rate": 5.899705014749263e-05, "loss": 0.7675, "num_input_tokens_seen": 2071176, "step": 200 }, { "epoch": 0.059084194977843424, "eval_loss": 0.9007444977760315, "eval_runtime": 19.0725, "eval_samples_per_second": 3.146, "eval_steps_per_second": 0.786, "num_input_tokens_seen": 2071176, "step": 200 }, { "epoch": 0.060561299852289516, "grad_norm": 3.485842845243708, "learning_rate": 6.0471976401179945e-05, "loss": 0.8075, "num_input_tokens_seen": 2122328, "step": 205 }, { "epoch": 0.0620384047267356, "grad_norm": 1.9945831681716613, "learning_rate": 6.194690265486725e-05, "loss": 0.8207, "num_input_tokens_seen": 2174744, "step": 210 }, { "epoch": 0.06351550960118169, "grad_norm": 2.5224102035468907, "learning_rate": 6.342182890855458e-05, "loss": 0.7867, "num_input_tokens_seen": 2227136, "step": 215 }, { "epoch": 0.06499261447562776, "grad_norm": 2.7703394460607833, "learning_rate": 6.48967551622419e-05, "loss": 0.8256, "num_input_tokens_seen": 2278568, "step": 220 }, { "epoch": 0.06646971935007386, "grad_norm": 4.676590157125056, "learning_rate": 6.637168141592921e-05, "loss": 0.7897, "num_input_tokens_seen": 2330224, "step": 225 }, { "epoch": 0.06794682422451995, "grad_norm": 1.6101062596215647, "learning_rate": 6.784660766961653e-05, "loss": 0.792, "num_input_tokens_seen": 2381344, "step": 230 }, { "epoch": 0.06942392909896603, "grad_norm": 3.202676293331468, "learning_rate": 6.932153392330384e-05, "loss": 0.8309, "num_input_tokens_seen": 2432136, "step": 235 }, { "epoch": 0.07090103397341212, "grad_norm": 1.3573723032246008, "learning_rate": 7.079646017699115e-05, "loss": 0.7974, "num_input_tokens_seen": 2483568, "step": 240 }, { "epoch": 0.0723781388478582, "grad_norm": 1.317885929036595, "learning_rate": 7.227138643067847e-05, "loss": 0.7739, "num_input_tokens_seen": 2535040, "step": 245 }, { "epoch": 0.07385524372230429, "grad_norm": 2.1810508869311067, "learning_rate": 7.374631268436578e-05, "loss": 0.7558, "num_input_tokens_seen": 2587272, "step": 250 }, { "epoch": 0.07385524372230429, "eval_loss": 0.810763955116272, "eval_runtime": 19.1642, "eval_samples_per_second": 3.131, "eval_steps_per_second": 0.783, "num_input_tokens_seen": 2587272, "step": 250 }, { "epoch": 0.07533234859675036, "grad_norm": 2.353270620583961, "learning_rate": 7.522123893805309e-05, "loss": 0.7851, "num_input_tokens_seen": 2638632, "step": 255 }, { "epoch": 0.07680945347119646, "grad_norm": 3.0664271713541873, "learning_rate": 7.669616519174043e-05, "loss": 0.7211, "num_input_tokens_seen": 2691016, "step": 260 }, { "epoch": 0.07828655834564253, "grad_norm": 5.498034008223314, "learning_rate": 7.817109144542774e-05, "loss": 0.8082, "num_input_tokens_seen": 2742912, "step": 265 }, { "epoch": 0.07976366322008863, "grad_norm": 14.573947499657416, "learning_rate": 7.964601769911504e-05, "loss": 0.7485, "num_input_tokens_seen": 2795264, "step": 270 }, { "epoch": 0.08124076809453472, "grad_norm": 3.007807281619928, "learning_rate": 8.112094395280237e-05, "loss": 0.7454, "num_input_tokens_seen": 2846344, "step": 275 }, { "epoch": 0.0827178729689808, "grad_norm": 6.015750773450144, "learning_rate": 8.259587020648968e-05, "loss": 0.7258, "num_input_tokens_seen": 2898304, "step": 280 }, { "epoch": 0.08419497784342689, "grad_norm": 2.390238834834483, "learning_rate": 8.4070796460177e-05, "loss": 0.7863, "num_input_tokens_seen": 2951368, "step": 285 }, { "epoch": 0.08567208271787297, "grad_norm": 3.0216023427899357, "learning_rate": 8.554572271386431e-05, "loss": 0.7105, "num_input_tokens_seen": 3003288, "step": 290 }, { "epoch": 0.08714918759231906, "grad_norm": 3.255437171887138, "learning_rate": 8.702064896755162e-05, "loss": 0.6885, "num_input_tokens_seen": 3054808, "step": 295 }, { "epoch": 0.08862629246676514, "grad_norm": 3.506440325367033, "learning_rate": 8.849557522123895e-05, "loss": 0.78, "num_input_tokens_seen": 3107200, "step": 300 }, { "epoch": 0.08862629246676514, "eval_loss": 0.8194220662117004, "eval_runtime": 19.1748, "eval_samples_per_second": 3.129, "eval_steps_per_second": 0.782, "num_input_tokens_seen": 3107200, "step": 300 }, { "epoch": 0.09010339734121123, "grad_norm": 2.2491377996087385, "learning_rate": 8.997050147492626e-05, "loss": 0.7394, "num_input_tokens_seen": 3158648, "step": 305 }, { "epoch": 0.0915805022156573, "grad_norm": 8.740989037358858, "learning_rate": 9.144542772861357e-05, "loss": 0.7371, "num_input_tokens_seen": 3210560, "step": 310 }, { "epoch": 0.0930576070901034, "grad_norm": 4.552322042735297, "learning_rate": 9.29203539823009e-05, "loss": 0.7622, "num_input_tokens_seen": 3263664, "step": 315 }, { "epoch": 0.09453471196454949, "grad_norm": 5.062297996808734, "learning_rate": 9.43952802359882e-05, "loss": 0.7214, "num_input_tokens_seen": 3315520, "step": 320 }, { "epoch": 0.09601181683899557, "grad_norm": 4.295724990139621, "learning_rate": 9.587020648967551e-05, "loss": 0.7078, "num_input_tokens_seen": 3368088, "step": 325 }, { "epoch": 0.09748892171344166, "grad_norm": 7.980776602247676, "learning_rate": 9.734513274336283e-05, "loss": 0.6852, "num_input_tokens_seen": 3420176, "step": 330 }, { "epoch": 0.09896602658788774, "grad_norm": 33.51326353666061, "learning_rate": 9.882005899705014e-05, "loss": 0.7557, "num_input_tokens_seen": 3471184, "step": 335 }, { "epoch": 0.10044313146233383, "grad_norm": 4.612370523858782, "learning_rate": 9.99999940340072e-05, "loss": 0.6709, "num_input_tokens_seen": 3523008, "step": 340 }, { "epoch": 0.1019202363367799, "grad_norm": 3.706129373980499, "learning_rate": 9.999978522440803e-05, "loss": 0.7252, "num_input_tokens_seen": 3573880, "step": 345 }, { "epoch": 0.103397341211226, "grad_norm": 4.907030070826967, "learning_rate": 9.999927811659165e-05, "loss": 0.6602, "num_input_tokens_seen": 3625752, "step": 350 }, { "epoch": 0.103397341211226, "eval_loss": 0.7663387656211853, "eval_runtime": 19.2114, "eval_samples_per_second": 3.123, "eval_steps_per_second": 0.781, "num_input_tokens_seen": 3625752, "step": 350 }, { "epoch": 0.10487444608567208, "grad_norm": 21.440776172892136, "learning_rate": 9.999847271358347e-05, "loss": 0.7222, "num_input_tokens_seen": 3676984, "step": 355 }, { "epoch": 0.10635155096011817, "grad_norm": 3.9046938348237252, "learning_rate": 9.99973690201885e-05, "loss": 0.6639, "num_input_tokens_seen": 3729168, "step": 360 }, { "epoch": 0.10782865583456426, "grad_norm": 5.783088469074539, "learning_rate": 9.999596704299139e-05, "loss": 0.6501, "num_input_tokens_seen": 3780672, "step": 365 }, { "epoch": 0.10930576070901034, "grad_norm": 4.421078404189889, "learning_rate": 9.999426679035628e-05, "loss": 0.6871, "num_input_tokens_seen": 3832328, "step": 370 }, { "epoch": 0.11078286558345643, "grad_norm": 6.348668054760613, "learning_rate": 9.99922682724269e-05, "loss": 0.6621, "num_input_tokens_seen": 3883112, "step": 375 }, { "epoch": 0.11225997045790251, "grad_norm": 3.7958066942788573, "learning_rate": 9.998997150112635e-05, "loss": 0.7156, "num_input_tokens_seen": 3934976, "step": 380 }, { "epoch": 0.1137370753323486, "grad_norm": 2.694693287446712, "learning_rate": 9.998737649015718e-05, "loss": 0.6662, "num_input_tokens_seen": 3986192, "step": 385 }, { "epoch": 0.11521418020679468, "grad_norm": 6.428397441401454, "learning_rate": 9.998448325500118e-05, "loss": 0.682, "num_input_tokens_seen": 4037760, "step": 390 }, { "epoch": 0.11669128508124077, "grad_norm": 6.165388347571309, "learning_rate": 9.998129181291936e-05, "loss": 0.6137, "num_input_tokens_seen": 4090872, "step": 395 }, { "epoch": 0.11816838995568685, "grad_norm": 4.354814876263017, "learning_rate": 9.997780218295185e-05, "loss": 0.6739, "num_input_tokens_seen": 4142592, "step": 400 }, { "epoch": 0.11816838995568685, "eval_loss": 0.7038857936859131, "eval_runtime": 19.0624, "eval_samples_per_second": 3.148, "eval_steps_per_second": 0.787, "num_input_tokens_seen": 4142592, "step": 400 }, { "epoch": 0.11964549483013294, "grad_norm": 4.400494365327609, "learning_rate": 9.997401438591772e-05, "loss": 0.6209, "num_input_tokens_seen": 4194920, "step": 405 }, { "epoch": 0.12112259970457903, "grad_norm": 4.518582133930376, "learning_rate": 9.996992844441495e-05, "loss": 0.6576, "num_input_tokens_seen": 4247048, "step": 410 }, { "epoch": 0.12259970457902511, "grad_norm": 2.6773114089558043, "learning_rate": 9.996554438282022e-05, "loss": 0.6851, "num_input_tokens_seen": 4299728, "step": 415 }, { "epoch": 0.1240768094534712, "grad_norm": 10.583849604294256, "learning_rate": 9.996086222728879e-05, "loss": 0.6288, "num_input_tokens_seen": 4351088, "step": 420 }, { "epoch": 0.1255539143279173, "grad_norm": 5.18430085456359, "learning_rate": 9.995588200575439e-05, "loss": 0.667, "num_input_tokens_seen": 4403016, "step": 425 }, { "epoch": 0.12703101920236337, "grad_norm": 3.624202284960618, "learning_rate": 9.995060374792892e-05, "loss": 0.6747, "num_input_tokens_seen": 4453880, "step": 430 }, { "epoch": 0.12850812407680945, "grad_norm": 8.193068077958594, "learning_rate": 9.994502748530244e-05, "loss": 0.6594, "num_input_tokens_seen": 4505616, "step": 435 }, { "epoch": 0.12998522895125553, "grad_norm": 6.0681139183306145, "learning_rate": 9.993915325114288e-05, "loss": 0.6727, "num_input_tokens_seen": 4558384, "step": 440 }, { "epoch": 0.13146233382570163, "grad_norm": 3.998790148445953, "learning_rate": 9.993298108049582e-05, "loss": 0.6526, "num_input_tokens_seen": 4611184, "step": 445 }, { "epoch": 0.1329394387001477, "grad_norm": 3.1838689643423392, "learning_rate": 9.992651101018445e-05, "loss": 0.5661, "num_input_tokens_seen": 4663320, "step": 450 }, { "epoch": 0.1329394387001477, "eval_loss": 0.7132604718208313, "eval_runtime": 18.996, "eval_samples_per_second": 3.159, "eval_steps_per_second": 0.79, "num_input_tokens_seen": 4663320, "step": 450 }, { "epoch": 0.1344165435745938, "grad_norm": 8.239876852269616, "learning_rate": 9.991974307880907e-05, "loss": 0.5954, "num_input_tokens_seen": 4714448, "step": 455 }, { "epoch": 0.1358936484490399, "grad_norm": 21.47914479659389, "learning_rate": 9.991267732674711e-05, "loss": 0.721, "num_input_tokens_seen": 4767136, "step": 460 }, { "epoch": 0.13737075332348597, "grad_norm": 7.30460731660639, "learning_rate": 9.99053137961528e-05, "loss": 0.6578, "num_input_tokens_seen": 4819408, "step": 465 }, { "epoch": 0.13884785819793205, "grad_norm": 8.944279395571234, "learning_rate": 9.989765253095686e-05, "loss": 0.6642, "num_input_tokens_seen": 4872120, "step": 470 }, { "epoch": 0.14032496307237813, "grad_norm": 20.451510949854647, "learning_rate": 9.988969357686636e-05, "loss": 0.6462, "num_input_tokens_seen": 4924400, "step": 475 }, { "epoch": 0.14180206794682423, "grad_norm": 6.498846626456819, "learning_rate": 9.988143698136429e-05, "loss": 0.6055, "num_input_tokens_seen": 4976504, "step": 480 }, { "epoch": 0.1432791728212703, "grad_norm": 9.137509561710141, "learning_rate": 9.987288279370945e-05, "loss": 0.5928, "num_input_tokens_seen": 5028648, "step": 485 }, { "epoch": 0.1447562776957164, "grad_norm": 6.8502382033465885, "learning_rate": 9.986403106493604e-05, "loss": 0.5835, "num_input_tokens_seen": 5080488, "step": 490 }, { "epoch": 0.14623338257016247, "grad_norm": 6.731902879463201, "learning_rate": 9.985488184785336e-05, "loss": 0.6641, "num_input_tokens_seen": 5131744, "step": 495 }, { "epoch": 0.14771048744460857, "grad_norm": 5.718986997919185, "learning_rate": 9.984543519704557e-05, "loss": 0.6283, "num_input_tokens_seen": 5183664, "step": 500 }, { "epoch": 0.14771048744460857, "eval_loss": 0.6505001187324524, "eval_runtime": 18.9372, "eval_samples_per_second": 3.168, "eval_steps_per_second": 0.792, "num_input_tokens_seen": 5183664, "step": 500 }, { "epoch": 0.14918759231905465, "grad_norm": 11.975231950682153, "learning_rate": 9.983569116887128e-05, "loss": 0.573, "num_input_tokens_seen": 5234920, "step": 505 }, { "epoch": 0.15066469719350073, "grad_norm": 13.701425113518031, "learning_rate": 9.982564982146327e-05, "loss": 0.6261, "num_input_tokens_seen": 5287312, "step": 510 }, { "epoch": 0.15214180206794684, "grad_norm": 6.956866723254775, "learning_rate": 9.981531121472811e-05, "loss": 0.6072, "num_input_tokens_seen": 5340240, "step": 515 }, { "epoch": 0.1536189069423929, "grad_norm": 5.293847645949678, "learning_rate": 9.980467541034584e-05, "loss": 0.565, "num_input_tokens_seen": 5392600, "step": 520 }, { "epoch": 0.155096011816839, "grad_norm": 3.5333148010719357, "learning_rate": 9.979374247176956e-05, "loss": 0.6188, "num_input_tokens_seen": 5445168, "step": 525 }, { "epoch": 0.15657311669128507, "grad_norm": 2.715838950258193, "learning_rate": 9.978251246422505e-05, "loss": 0.6069, "num_input_tokens_seen": 5496384, "step": 530 }, { "epoch": 0.15805022156573117, "grad_norm": 7.400638197441027, "learning_rate": 9.977098545471046e-05, "loss": 0.5805, "num_input_tokens_seen": 5548264, "step": 535 }, { "epoch": 0.15952732644017725, "grad_norm": 8.936418653401088, "learning_rate": 9.975916151199579e-05, "loss": 0.6383, "num_input_tokens_seen": 5599216, "step": 540 }, { "epoch": 0.16100443131462333, "grad_norm": 7.142901090509074, "learning_rate": 9.974704070662254e-05, "loss": 0.5845, "num_input_tokens_seen": 5650816, "step": 545 }, { "epoch": 0.16248153618906944, "grad_norm": 18.523556086651276, "learning_rate": 9.973462311090336e-05, "loss": 0.5957, "num_input_tokens_seen": 5703016, "step": 550 }, { "epoch": 0.16248153618906944, "eval_loss": 0.6883422136306763, "eval_runtime": 19.183, "eval_samples_per_second": 3.128, "eval_steps_per_second": 0.782, "num_input_tokens_seen": 5703016, "step": 550 }, { "epoch": 0.16395864106351551, "grad_norm": 10.880382658420737, "learning_rate": 9.972190879892147e-05, "loss": 0.6076, "num_input_tokens_seen": 5754192, "step": 555 }, { "epoch": 0.1654357459379616, "grad_norm": 5.9115707757479345, "learning_rate": 9.970889784653033e-05, "loss": 0.6136, "num_input_tokens_seen": 5806272, "step": 560 }, { "epoch": 0.16691285081240767, "grad_norm": 8.300559629359741, "learning_rate": 9.969559033135318e-05, "loss": 0.5554, "num_input_tokens_seen": 5858632, "step": 565 }, { "epoch": 0.16838995568685378, "grad_norm": 19.24269810236072, "learning_rate": 9.96819863327825e-05, "loss": 0.5847, "num_input_tokens_seen": 5909936, "step": 570 }, { "epoch": 0.16986706056129985, "grad_norm": 2.997295434716295, "learning_rate": 9.966808593197959e-05, "loss": 0.6217, "num_input_tokens_seen": 5961464, "step": 575 }, { "epoch": 0.17134416543574593, "grad_norm": 8.454212007467431, "learning_rate": 9.965388921187413e-05, "loss": 0.5569, "num_input_tokens_seen": 6013696, "step": 580 }, { "epoch": 0.172821270310192, "grad_norm": 11.728020547911296, "learning_rate": 9.963939625716361e-05, "loss": 0.5894, "num_input_tokens_seen": 6065736, "step": 585 }, { "epoch": 0.17429837518463812, "grad_norm": 20.470288976160585, "learning_rate": 9.962460715431284e-05, "loss": 0.5783, "num_input_tokens_seen": 6118400, "step": 590 }, { "epoch": 0.1757754800590842, "grad_norm": 4.675971808784723, "learning_rate": 9.960952199155347e-05, "loss": 0.5657, "num_input_tokens_seen": 6171120, "step": 595 }, { "epoch": 0.17725258493353027, "grad_norm": 9.775804001092958, "learning_rate": 9.959414085888342e-05, "loss": 0.6331, "num_input_tokens_seen": 6222736, "step": 600 }, { "epoch": 0.17725258493353027, "eval_loss": 0.5883122682571411, "eval_runtime": 19.002, "eval_samples_per_second": 3.158, "eval_steps_per_second": 0.789, "num_input_tokens_seen": 6222736, "step": 600 }, { "epoch": 0.17872968980797638, "grad_norm": 8.081060384434974, "learning_rate": 9.957846384806636e-05, "loss": 0.5678, "num_input_tokens_seen": 6274328, "step": 605 }, { "epoch": 0.18020679468242246, "grad_norm": 10.520198943062466, "learning_rate": 9.956249105263121e-05, "loss": 0.5609, "num_input_tokens_seen": 6327088, "step": 610 }, { "epoch": 0.18168389955686853, "grad_norm": 5.336067400981417, "learning_rate": 9.95462225678715e-05, "loss": 0.5177, "num_input_tokens_seen": 6378824, "step": 615 }, { "epoch": 0.1831610044313146, "grad_norm": 5.263245734989025, "learning_rate": 9.952965849084483e-05, "loss": 0.5839, "num_input_tokens_seen": 6431024, "step": 620 }, { "epoch": 0.18463810930576072, "grad_norm": 5.175847441048381, "learning_rate": 9.951279892037233e-05, "loss": 0.5069, "num_input_tokens_seen": 6483072, "step": 625 }, { "epoch": 0.1861152141802068, "grad_norm": 12.247546396996816, "learning_rate": 9.949564395703803e-05, "loss": 0.495, "num_input_tokens_seen": 6534768, "step": 630 }, { "epoch": 0.18759231905465287, "grad_norm": 8.126956720775665, "learning_rate": 9.947819370318825e-05, "loss": 0.6435, "num_input_tokens_seen": 6586416, "step": 635 }, { "epoch": 0.18906942392909898, "grad_norm": 9.112136009018696, "learning_rate": 9.946044826293106e-05, "loss": 0.5014, "num_input_tokens_seen": 6638592, "step": 640 }, { "epoch": 0.19054652880354506, "grad_norm": 7.086235271485555, "learning_rate": 9.944240774213556e-05, "loss": 0.529, "num_input_tokens_seen": 6689920, "step": 645 }, { "epoch": 0.19202363367799113, "grad_norm": 21.538813510868643, "learning_rate": 9.942407224843132e-05, "loss": 0.5483, "num_input_tokens_seen": 6743120, "step": 650 }, { "epoch": 0.19202363367799113, "eval_loss": 0.6100574135780334, "eval_runtime": 18.9585, "eval_samples_per_second": 3.165, "eval_steps_per_second": 0.791, "num_input_tokens_seen": 6743120, "step": 650 }, { "epoch": 0.1935007385524372, "grad_norm": 5.437189286202135, "learning_rate": 9.940544189120771e-05, "loss": 0.5499, "num_input_tokens_seen": 6794096, "step": 655 }, { "epoch": 0.19497784342688332, "grad_norm": 11.687077385856876, "learning_rate": 9.938651678161326e-05, "loss": 0.5866, "num_input_tokens_seen": 6846200, "step": 660 }, { "epoch": 0.1964549483013294, "grad_norm": 7.149806146104705, "learning_rate": 9.936729703255498e-05, "loss": 0.4958, "num_input_tokens_seen": 6899280, "step": 665 }, { "epoch": 0.19793205317577547, "grad_norm": 7.7389407646353225, "learning_rate": 9.93477827586977e-05, "loss": 0.4232, "num_input_tokens_seen": 6950608, "step": 670 }, { "epoch": 0.19940915805022155, "grad_norm": 10.670856796845847, "learning_rate": 9.932797407646338e-05, "loss": 0.5407, "num_input_tokens_seen": 7002696, "step": 675 }, { "epoch": 0.20088626292466766, "grad_norm": 4.979252179082294, "learning_rate": 9.93078711040304e-05, "loss": 0.4553, "num_input_tokens_seen": 7055160, "step": 680 }, { "epoch": 0.20236336779911374, "grad_norm": 16.633134937375967, "learning_rate": 9.928747396133294e-05, "loss": 0.5565, "num_input_tokens_seen": 7107224, "step": 685 }, { "epoch": 0.2038404726735598, "grad_norm": 11.730384893098227, "learning_rate": 9.926678277006011e-05, "loss": 0.5951, "num_input_tokens_seen": 7158376, "step": 690 }, { "epoch": 0.20531757754800592, "grad_norm": 7.120874965147562, "learning_rate": 9.924579765365536e-05, "loss": 0.4764, "num_input_tokens_seen": 7210552, "step": 695 }, { "epoch": 0.206794682422452, "grad_norm": 7.463812624673142, "learning_rate": 9.922451873731569e-05, "loss": 0.477, "num_input_tokens_seen": 7262832, "step": 700 }, { "epoch": 0.206794682422452, "eval_loss": 0.5883837938308716, "eval_runtime": 19.0983, "eval_samples_per_second": 3.142, "eval_steps_per_second": 0.785, "num_input_tokens_seen": 7262832, "step": 700 }, { "epoch": 0.20827178729689808, "grad_norm": 5.1484150923671, "learning_rate": 9.92029461479909e-05, "loss": 0.5151, "num_input_tokens_seen": 7314520, "step": 705 }, { "epoch": 0.20974889217134415, "grad_norm": 10.902688361325707, "learning_rate": 9.918108001438283e-05, "loss": 0.6158, "num_input_tokens_seen": 7365368, "step": 710 }, { "epoch": 0.21122599704579026, "grad_norm": 6.960249994011121, "learning_rate": 9.915892046694464e-05, "loss": 0.5164, "num_input_tokens_seen": 7417296, "step": 715 }, { "epoch": 0.21270310192023634, "grad_norm": 6.138105593354917, "learning_rate": 9.913646763787992e-05, "loss": 0.5823, "num_input_tokens_seen": 7469640, "step": 720 }, { "epoch": 0.21418020679468242, "grad_norm": 5.865897561310743, "learning_rate": 9.911372166114208e-05, "loss": 0.5145, "num_input_tokens_seen": 7521520, "step": 725 }, { "epoch": 0.21565731166912852, "grad_norm": 14.330290158050401, "learning_rate": 9.909068267243336e-05, "loss": 0.571, "num_input_tokens_seen": 7573880, "step": 730 }, { "epoch": 0.2171344165435746, "grad_norm": 6.90669118248274, "learning_rate": 9.906735080920413e-05, "loss": 0.4638, "num_input_tokens_seen": 7625896, "step": 735 }, { "epoch": 0.21861152141802068, "grad_norm": 6.310395883326308, "learning_rate": 9.904372621065206e-05, "loss": 0.5449, "num_input_tokens_seen": 7676528, "step": 740 }, { "epoch": 0.22008862629246675, "grad_norm": 27.446565035007364, "learning_rate": 9.901980901772126e-05, "loss": 0.5505, "num_input_tokens_seen": 7728240, "step": 745 }, { "epoch": 0.22156573116691286, "grad_norm": 5.6934624405279655, "learning_rate": 9.899559937310148e-05, "loss": 0.514, "num_input_tokens_seen": 7779872, "step": 750 }, { "epoch": 0.22156573116691286, "eval_loss": 0.4665524661540985, "eval_runtime": 19.1629, "eval_samples_per_second": 3.131, "eval_steps_per_second": 0.783, "num_input_tokens_seen": 7779872, "step": 750 }, { "epoch": 0.22304283604135894, "grad_norm": 7.392056712218606, "learning_rate": 9.897109742122721e-05, "loss": 0.5248, "num_input_tokens_seen": 7832168, "step": 755 }, { "epoch": 0.22451994091580502, "grad_norm": 9.230824229530686, "learning_rate": 9.894630330827686e-05, "loss": 0.5017, "num_input_tokens_seen": 7884040, "step": 760 }, { "epoch": 0.2259970457902511, "grad_norm": 11.203609848309013, "learning_rate": 9.892121718217182e-05, "loss": 0.4896, "num_input_tokens_seen": 7935528, "step": 765 }, { "epoch": 0.2274741506646972, "grad_norm": 30.185572869944284, "learning_rate": 9.88958391925757e-05, "loss": 0.5125, "num_input_tokens_seen": 7987760, "step": 770 }, { "epoch": 0.22895125553914328, "grad_norm": 18.649424971543322, "learning_rate": 9.887016949089333e-05, "loss": 0.5615, "num_input_tokens_seen": 8039400, "step": 775 }, { "epoch": 0.23042836041358936, "grad_norm": 5.360845077873566, "learning_rate": 9.884420823026989e-05, "loss": 0.494, "num_input_tokens_seen": 8092440, "step": 780 }, { "epoch": 0.23190546528803546, "grad_norm": 10.101391912363345, "learning_rate": 9.881795556558999e-05, "loss": 0.5122, "num_input_tokens_seen": 8145040, "step": 785 }, { "epoch": 0.23338257016248154, "grad_norm": 5.90491429019666, "learning_rate": 9.879141165347678e-05, "loss": 0.4925, "num_input_tokens_seen": 8196904, "step": 790 }, { "epoch": 0.23485967503692762, "grad_norm": 6.228283676778458, "learning_rate": 9.876457665229097e-05, "loss": 0.4752, "num_input_tokens_seen": 8249232, "step": 795 }, { "epoch": 0.2363367799113737, "grad_norm": 8.496099871334396, "learning_rate": 9.87374507221299e-05, "loss": 0.4239, "num_input_tokens_seen": 8301976, "step": 800 }, { "epoch": 0.2363367799113737, "eval_loss": 0.48219749331474304, "eval_runtime": 19.0825, "eval_samples_per_second": 3.144, "eval_steps_per_second": 0.786, "num_input_tokens_seen": 8301976, "step": 800 }, { "epoch": 0.2378138847858198, "grad_norm": 5.415068909643764, "learning_rate": 9.87100340248266e-05, "loss": 0.4482, "num_input_tokens_seen": 8353736, "step": 805 }, { "epoch": 0.23929098966026588, "grad_norm": 24.48801392473646, "learning_rate": 9.868232672394881e-05, "loss": 0.4764, "num_input_tokens_seen": 8406216, "step": 810 }, { "epoch": 0.24076809453471196, "grad_norm": 12.500688224717539, "learning_rate": 9.8654328984798e-05, "loss": 0.4476, "num_input_tokens_seen": 8457752, "step": 815 }, { "epoch": 0.24224519940915806, "grad_norm": 6.4171543173466405, "learning_rate": 9.862604097440844e-05, "loss": 0.4295, "num_input_tokens_seen": 8510440, "step": 820 }, { "epoch": 0.24372230428360414, "grad_norm": 8.42476760299212, "learning_rate": 9.859746286154607e-05, "loss": 0.5384, "num_input_tokens_seen": 8562016, "step": 825 }, { "epoch": 0.24519940915805022, "grad_norm": 5.79752775822047, "learning_rate": 9.856859481670764e-05, "loss": 0.5357, "num_input_tokens_seen": 8614184, "step": 830 }, { "epoch": 0.2466765140324963, "grad_norm": 6.468218270458443, "learning_rate": 9.853943701211963e-05, "loss": 0.5309, "num_input_tokens_seen": 8666528, "step": 835 }, { "epoch": 0.2481536189069424, "grad_norm": 6.446467495370782, "learning_rate": 9.850998962173719e-05, "loss": 0.4949, "num_input_tokens_seen": 8718048, "step": 840 }, { "epoch": 0.24963072378138848, "grad_norm": 7.926476306574312, "learning_rate": 9.848025282124317e-05, "loss": 0.4681, "num_input_tokens_seen": 8769968, "step": 845 }, { "epoch": 0.2511078286558346, "grad_norm": 19.45042923754815, "learning_rate": 9.845022678804701e-05, "loss": 0.4949, "num_input_tokens_seen": 8822832, "step": 850 }, { "epoch": 0.2511078286558346, "eval_loss": 0.6121839880943298, "eval_runtime": 19.1203, "eval_samples_per_second": 3.138, "eval_steps_per_second": 0.785, "num_input_tokens_seen": 8822832, "step": 850 }, { "epoch": 0.25258493353028066, "grad_norm": 8.111385407981246, "learning_rate": 9.841991170128374e-05, "loss": 0.4636, "num_input_tokens_seen": 8875608, "step": 855 }, { "epoch": 0.25406203840472674, "grad_norm": 4.789169716538139, "learning_rate": 9.838930774181285e-05, "loss": 0.4322, "num_input_tokens_seen": 8927600, "step": 860 }, { "epoch": 0.2555391432791728, "grad_norm": 3.6029916519925167, "learning_rate": 9.835841509221725e-05, "loss": 0.4302, "num_input_tokens_seen": 8980224, "step": 865 }, { "epoch": 0.2570162481536189, "grad_norm": 17.138905616592684, "learning_rate": 9.83272339368022e-05, "loss": 0.5231, "num_input_tokens_seen": 9032112, "step": 870 }, { "epoch": 0.258493353028065, "grad_norm": 6.810210745159563, "learning_rate": 9.829576446159416e-05, "loss": 0.4414, "num_input_tokens_seen": 9084480, "step": 875 }, { "epoch": 0.25997045790251105, "grad_norm": 6.785950897404188, "learning_rate": 9.826400685433968e-05, "loss": 0.4469, "num_input_tokens_seen": 9136816, "step": 880 }, { "epoch": 0.2614475627769572, "grad_norm": 14.335926789263953, "learning_rate": 9.823196130450434e-05, "loss": 0.3859, "num_input_tokens_seen": 9189808, "step": 885 }, { "epoch": 0.26292466765140327, "grad_norm": 24.791700587075013, "learning_rate": 9.819962800327156e-05, "loss": 0.4794, "num_input_tokens_seen": 9241712, "step": 890 }, { "epoch": 0.26440177252584934, "grad_norm": 16.38282434047279, "learning_rate": 9.81670071435415e-05, "loss": 0.4476, "num_input_tokens_seen": 9293328, "step": 895 }, { "epoch": 0.2658788774002954, "grad_norm": 3.8069696135300846, "learning_rate": 9.813409891992988e-05, "loss": 0.4852, "num_input_tokens_seen": 9345160, "step": 900 }, { "epoch": 0.2658788774002954, "eval_loss": 0.5605542063713074, "eval_runtime": 19.0274, "eval_samples_per_second": 3.153, "eval_steps_per_second": 0.788, "num_input_tokens_seen": 9345160, "step": 900 }, { "epoch": 0.2673559822747415, "grad_norm": 8.596452892791511, "learning_rate": 9.810090352876685e-05, "loss": 0.4973, "num_input_tokens_seen": 9396608, "step": 905 }, { "epoch": 0.2688330871491876, "grad_norm": 22.570326690897016, "learning_rate": 9.806742116809575e-05, "loss": 0.4845, "num_input_tokens_seen": 9448264, "step": 910 }, { "epoch": 0.27031019202363366, "grad_norm": 10.987740344554055, "learning_rate": 9.803365203767201e-05, "loss": 0.4405, "num_input_tokens_seen": 9501288, "step": 915 }, { "epoch": 0.2717872968980798, "grad_norm": 5.629472815681504, "learning_rate": 9.799959633896194e-05, "loss": 0.5228, "num_input_tokens_seen": 9552680, "step": 920 }, { "epoch": 0.27326440177252587, "grad_norm": 4.8888946075363355, "learning_rate": 9.79652542751415e-05, "loss": 0.4189, "num_input_tokens_seen": 9604432, "step": 925 }, { "epoch": 0.27474150664697194, "grad_norm": 11.753985857725072, "learning_rate": 9.793062605109509e-05, "loss": 0.4449, "num_input_tokens_seen": 9656992, "step": 930 }, { "epoch": 0.276218611521418, "grad_norm": 10.956276728284152, "learning_rate": 9.789571187341433e-05, "loss": 0.4678, "num_input_tokens_seen": 9709016, "step": 935 }, { "epoch": 0.2776957163958641, "grad_norm": 5.760995229664893, "learning_rate": 9.786051195039689e-05, "loss": 0.5359, "num_input_tokens_seen": 9759936, "step": 940 }, { "epoch": 0.2791728212703102, "grad_norm": 5.4002668741362365, "learning_rate": 9.782502649204512e-05, "loss": 0.5507, "num_input_tokens_seen": 9811880, "step": 945 }, { "epoch": 0.28064992614475626, "grad_norm": 3.1909077693586876, "learning_rate": 9.778925571006495e-05, "loss": 0.4737, "num_input_tokens_seen": 9863168, "step": 950 }, { "epoch": 0.28064992614475626, "eval_loss": 0.479105681180954, "eval_runtime": 19.2085, "eval_samples_per_second": 3.124, "eval_steps_per_second": 0.781, "num_input_tokens_seen": 9863168, "step": 950 }, { "epoch": 0.2821270310192024, "grad_norm": 3.2511615728403744, "learning_rate": 9.775319981786445e-05, "loss": 0.4393, "num_input_tokens_seen": 9914672, "step": 955 }, { "epoch": 0.28360413589364847, "grad_norm": 16.435101279621147, "learning_rate": 9.771685903055277e-05, "loss": 0.4355, "num_input_tokens_seen": 9966736, "step": 960 }, { "epoch": 0.28508124076809455, "grad_norm": 15.842537939054491, "learning_rate": 9.768023356493864e-05, "loss": 0.4459, "num_input_tokens_seen": 10017984, "step": 965 }, { "epoch": 0.2865583456425406, "grad_norm": 4.234230919149069, "learning_rate": 9.764332363952927e-05, "loss": 0.4774, "num_input_tokens_seen": 10069520, "step": 970 }, { "epoch": 0.2880354505169867, "grad_norm": 4.408868276054397, "learning_rate": 9.760612947452884e-05, "loss": 0.413, "num_input_tokens_seen": 10122208, "step": 975 }, { "epoch": 0.2895125553914328, "grad_norm": 18.46536438022927, "learning_rate": 9.756865129183741e-05, "loss": 0.5433, "num_input_tokens_seen": 10173760, "step": 980 }, { "epoch": 0.29098966026587886, "grad_norm": 10.416515634178488, "learning_rate": 9.753088931504944e-05, "loss": 0.4096, "num_input_tokens_seen": 10224976, "step": 985 }, { "epoch": 0.29246676514032494, "grad_norm": 8.959580527519506, "learning_rate": 9.749284376945248e-05, "loss": 0.3916, "num_input_tokens_seen": 10276928, "step": 990 }, { "epoch": 0.29394387001477107, "grad_norm": 4.106784187834887, "learning_rate": 9.74545148820259e-05, "loss": 0.3899, "num_input_tokens_seen": 10328048, "step": 995 }, { "epoch": 0.29542097488921715, "grad_norm": 7.661197997005464, "learning_rate": 9.741590288143944e-05, "loss": 0.4005, "num_input_tokens_seen": 10379136, "step": 1000 }, { "epoch": 0.29542097488921715, "eval_loss": 0.5501028299331665, "eval_runtime": 19.0051, "eval_samples_per_second": 3.157, "eval_steps_per_second": 0.789, "num_input_tokens_seen": 10379136, "step": 1000 }, { "epoch": 0.2968980797636632, "grad_norm": 28.402265641893973, "learning_rate": 9.737700799805191e-05, "loss": 0.4585, "num_input_tokens_seen": 10430680, "step": 1005 }, { "epoch": 0.2983751846381093, "grad_norm": 7.624783658458961, "learning_rate": 9.73378304639098e-05, "loss": 0.4257, "num_input_tokens_seen": 10482472, "step": 1010 }, { "epoch": 0.2998522895125554, "grad_norm": 9.68942996120796, "learning_rate": 9.729837051274591e-05, "loss": 0.4359, "num_input_tokens_seen": 10534392, "step": 1015 }, { "epoch": 0.30132939438700146, "grad_norm": 4.997152707865521, "learning_rate": 9.725862837997786e-05, "loss": 0.4158, "num_input_tokens_seen": 10586104, "step": 1020 }, { "epoch": 0.30280649926144754, "grad_norm": 12.772476641379384, "learning_rate": 9.721860430270685e-05, "loss": 0.4067, "num_input_tokens_seen": 10637560, "step": 1025 }, { "epoch": 0.30428360413589367, "grad_norm": 11.194625798156807, "learning_rate": 9.717829851971612e-05, "loss": 0.4811, "num_input_tokens_seen": 10689552, "step": 1030 }, { "epoch": 0.30576070901033975, "grad_norm": 8.371187346484113, "learning_rate": 9.713771127146955e-05, "loss": 0.4732, "num_input_tokens_seen": 10742208, "step": 1035 }, { "epoch": 0.3072378138847858, "grad_norm": 20.552637977751065, "learning_rate": 9.70968428001103e-05, "loss": 0.4735, "num_input_tokens_seen": 10794008, "step": 1040 }, { "epoch": 0.3087149187592319, "grad_norm": 6.868858377235537, "learning_rate": 9.705569334945921e-05, "loss": 0.4381, "num_input_tokens_seen": 10845736, "step": 1045 }, { "epoch": 0.310192023633678, "grad_norm": 12.499280962869927, "learning_rate": 9.701426316501352e-05, "loss": 0.3991, "num_input_tokens_seen": 10897528, "step": 1050 }, { "epoch": 0.310192023633678, "eval_loss": 0.4378110468387604, "eval_runtime": 47.2127, "eval_samples_per_second": 1.271, "eval_steps_per_second": 0.318, "num_input_tokens_seen": 10897528, "step": 1050 }, { "epoch": 0.31166912850812406, "grad_norm": 12.581621849544964, "learning_rate": 9.697255249394527e-05, "loss": 0.3724, "num_input_tokens_seen": 10949888, "step": 1055 }, { "epoch": 0.31314623338257014, "grad_norm": 6.318957148146118, "learning_rate": 9.693056158509992e-05, "loss": 0.4483, "num_input_tokens_seen": 11001208, "step": 1060 }, { "epoch": 0.31462333825701627, "grad_norm": 5.822614134671903, "learning_rate": 9.688829068899483e-05, "loss": 0.4133, "num_input_tokens_seen": 11052368, "step": 1065 }, { "epoch": 0.31610044313146235, "grad_norm": 6.115498616882066, "learning_rate": 9.684574005781772e-05, "loss": 0.5406, "num_input_tokens_seen": 11104008, "step": 1070 }, { "epoch": 0.3175775480059084, "grad_norm": 12.381439695843321, "learning_rate": 9.680290994542523e-05, "loss": 0.4148, "num_input_tokens_seen": 11155888, "step": 1075 }, { "epoch": 0.3190546528803545, "grad_norm": 5.292808434292701, "learning_rate": 9.675980060734138e-05, "loss": 0.4169, "num_input_tokens_seen": 11207352, "step": 1080 }, { "epoch": 0.3205317577548006, "grad_norm": 5.275144555938926, "learning_rate": 9.671641230075604e-05, "loss": 0.4706, "num_input_tokens_seen": 11257672, "step": 1085 }, { "epoch": 0.32200886262924666, "grad_norm": 7.458715041450571, "learning_rate": 9.667274528452344e-05, "loss": 0.3736, "num_input_tokens_seen": 11309944, "step": 1090 }, { "epoch": 0.32348596750369274, "grad_norm": 8.390618541362887, "learning_rate": 9.662879981916054e-05, "loss": 0.4413, "num_input_tokens_seen": 11361032, "step": 1095 }, { "epoch": 0.3249630723781389, "grad_norm": 14.65510134410483, "learning_rate": 9.658457616684555e-05, "loss": 0.4624, "num_input_tokens_seen": 11413120, "step": 1100 }, { "epoch": 0.3249630723781389, "eval_loss": 0.5300672650337219, "eval_runtime": 19.0076, "eval_samples_per_second": 3.157, "eval_steps_per_second": 0.789, "num_input_tokens_seen": 11413120, "step": 1100 }, { "epoch": 0.32644017725258495, "grad_norm": 5.808097944570942, "learning_rate": 9.654007459141634e-05, "loss": 0.4121, "num_input_tokens_seen": 11465064, "step": 1105 }, { "epoch": 0.32791728212703103, "grad_norm": 4.015479597894709, "learning_rate": 9.649529535836887e-05, "loss": 0.4569, "num_input_tokens_seen": 11516304, "step": 1110 }, { "epoch": 0.3293943870014771, "grad_norm": 8.768813687811088, "learning_rate": 9.645023873485557e-05, "loss": 0.4121, "num_input_tokens_seen": 11568568, "step": 1115 }, { "epoch": 0.3308714918759232, "grad_norm": 10.930663245586304, "learning_rate": 9.640490498968383e-05, "loss": 0.4112, "num_input_tokens_seen": 11620672, "step": 1120 }, { "epoch": 0.33234859675036926, "grad_norm": 6.691827883878219, "learning_rate": 9.63592943933143e-05, "loss": 0.3564, "num_input_tokens_seen": 11672864, "step": 1125 }, { "epoch": 0.33382570162481534, "grad_norm": 8.33100451031768, "learning_rate": 9.631340721785934e-05, "loss": 0.3909, "num_input_tokens_seen": 11724128, "step": 1130 }, { "epoch": 0.3353028064992615, "grad_norm": 7.005971082198048, "learning_rate": 9.62672437370814e-05, "loss": 0.4636, "num_input_tokens_seen": 11776416, "step": 1135 }, { "epoch": 0.33677991137370755, "grad_norm": 14.154463913713748, "learning_rate": 9.622080422639133e-05, "loss": 0.4617, "num_input_tokens_seen": 11828256, "step": 1140 }, { "epoch": 0.33825701624815363, "grad_norm": 14.751813027169304, "learning_rate": 9.617408896284678e-05, "loss": 0.3443, "num_input_tokens_seen": 11882048, "step": 1145 }, { "epoch": 0.3397341211225997, "grad_norm": 3.2576085972339706, "learning_rate": 9.612709822515054e-05, "loss": 0.4432, "num_input_tokens_seen": 11933632, "step": 1150 }, { "epoch": 0.3397341211225997, "eval_loss": 0.42494550347328186, "eval_runtime": 19.6038, "eval_samples_per_second": 3.061, "eval_steps_per_second": 0.765, "num_input_tokens_seen": 11933632, "step": 1150 }, { "epoch": 0.3412112259970458, "grad_norm": 4.778973114307738, "learning_rate": 9.60798322936489e-05, "loss": 0.3716, "num_input_tokens_seen": 11986496, "step": 1155 }, { "epoch": 0.34268833087149186, "grad_norm": 8.20796735033587, "learning_rate": 9.603229145032993e-05, "loss": 0.4234, "num_input_tokens_seen": 12039112, "step": 1160 }, { "epoch": 0.34416543574593794, "grad_norm": 7.158508103350641, "learning_rate": 9.598447597882181e-05, "loss": 0.3973, "num_input_tokens_seen": 12091728, "step": 1165 }, { "epoch": 0.345642540620384, "grad_norm": 9.320131732384727, "learning_rate": 9.593638616439118e-05, "loss": 0.3494, "num_input_tokens_seen": 12143896, "step": 1170 }, { "epoch": 0.34711964549483015, "grad_norm": 10.150141046652656, "learning_rate": 9.588802229394137e-05, "loss": 0.4182, "num_input_tokens_seen": 12195336, "step": 1175 }, { "epoch": 0.34859675036927623, "grad_norm": 9.270011962927722, "learning_rate": 9.583938465601075e-05, "loss": 0.462, "num_input_tokens_seen": 12247696, "step": 1180 }, { "epoch": 0.3500738552437223, "grad_norm": 8.96068778293971, "learning_rate": 9.5790473540771e-05, "loss": 0.4451, "num_input_tokens_seen": 12300040, "step": 1185 }, { "epoch": 0.3515509601181684, "grad_norm": 24.761476817148992, "learning_rate": 9.574128924002533e-05, "loss": 0.4789, "num_input_tokens_seen": 12351904, "step": 1190 }, { "epoch": 0.35302806499261447, "grad_norm": 1.8519516556186366, "learning_rate": 9.569183204720677e-05, "loss": 0.3898, "num_input_tokens_seen": 12403280, "step": 1195 }, { "epoch": 0.35450516986706054, "grad_norm": 5.005586803143539, "learning_rate": 9.564210225737647e-05, "loss": 0.3296, "num_input_tokens_seen": 12456040, "step": 1200 }, { "epoch": 0.35450516986706054, "eval_loss": 0.2966395914554596, "eval_runtime": 19.5244, "eval_samples_per_second": 3.073, "eval_steps_per_second": 0.768, "num_input_tokens_seen": 12456040, "step": 1200 }, { "epoch": 0.3559822747415066, "grad_norm": 5.5028656713393245, "learning_rate": 9.559210016722184e-05, "loss": 0.3717, "num_input_tokens_seen": 12507640, "step": 1205 }, { "epoch": 0.35745937961595275, "grad_norm": 13.214008089689216, "learning_rate": 9.554182607505484e-05, "loss": 0.541, "num_input_tokens_seen": 12559400, "step": 1210 }, { "epoch": 0.35893648449039883, "grad_norm": 6.269664608708862, "learning_rate": 9.54912802808102e-05, "loss": 0.3965, "num_input_tokens_seen": 12610992, "step": 1215 }, { "epoch": 0.3604135893648449, "grad_norm": 21.247512275128738, "learning_rate": 9.544046308604364e-05, "loss": 0.4834, "num_input_tokens_seen": 12662688, "step": 1220 }, { "epoch": 0.361890694239291, "grad_norm": 4.14272563629135, "learning_rate": 9.538937479393001e-05, "loss": 0.4538, "num_input_tokens_seen": 12713600, "step": 1225 }, { "epoch": 0.36336779911373707, "grad_norm": 12.86150407455535, "learning_rate": 9.533801570926157e-05, "loss": 0.4226, "num_input_tokens_seen": 12766360, "step": 1230 }, { "epoch": 0.36484490398818314, "grad_norm": 4.36264988758363, "learning_rate": 9.52863861384461e-05, "loss": 0.4315, "num_input_tokens_seen": 12817248, "step": 1235 }, { "epoch": 0.3663220088626292, "grad_norm": 1.6622545996067835, "learning_rate": 9.523448638950508e-05, "loss": 0.3567, "num_input_tokens_seen": 12868496, "step": 1240 }, { "epoch": 0.36779911373707536, "grad_norm": 5.5241376090939065, "learning_rate": 9.518231677207192e-05, "loss": 0.3431, "num_input_tokens_seen": 12920168, "step": 1245 }, { "epoch": 0.36927621861152143, "grad_norm": 5.717434283790562, "learning_rate": 9.512987759739003e-05, "loss": 0.335, "num_input_tokens_seen": 12972696, "step": 1250 }, { "epoch": 0.36927621861152143, "eval_loss": 0.31846168637275696, "eval_runtime": 18.9319, "eval_samples_per_second": 3.169, "eval_steps_per_second": 0.792, "num_input_tokens_seen": 12972696, "step": 1250 }, { "epoch": 0.3707533234859675, "grad_norm": 2.1850732370217045, "learning_rate": 9.507716917831099e-05, "loss": 0.3242, "num_input_tokens_seen": 13025280, "step": 1255 }, { "epoch": 0.3722304283604136, "grad_norm": 7.470465465497159, "learning_rate": 9.50241918292927e-05, "loss": 0.4083, "num_input_tokens_seen": 13075992, "step": 1260 }, { "epoch": 0.37370753323485967, "grad_norm": 10.134768151698713, "learning_rate": 9.49709458663975e-05, "loss": 0.4043, "num_input_tokens_seen": 13128592, "step": 1265 }, { "epoch": 0.37518463810930575, "grad_norm": 7.635543650225297, "learning_rate": 9.491743160729026e-05, "loss": 0.3481, "num_input_tokens_seen": 13181824, "step": 1270 }, { "epoch": 0.3766617429837518, "grad_norm": 17.089924601510244, "learning_rate": 9.486364937123651e-05, "loss": 0.4121, "num_input_tokens_seen": 13233624, "step": 1275 }, { "epoch": 0.37813884785819796, "grad_norm": 8.52905916993994, "learning_rate": 9.480959947910055e-05, "loss": 0.487, "num_input_tokens_seen": 13285808, "step": 1280 }, { "epoch": 0.37961595273264404, "grad_norm": 11.841989523288227, "learning_rate": 9.47552822533435e-05, "loss": 0.3798, "num_input_tokens_seen": 13337864, "step": 1285 }, { "epoch": 0.3810930576070901, "grad_norm": 2.1853711175575734, "learning_rate": 9.470069801802135e-05, "loss": 0.348, "num_input_tokens_seen": 13390544, "step": 1290 }, { "epoch": 0.3825701624815362, "grad_norm": 2.9516647949035826, "learning_rate": 9.464584709878313e-05, "loss": 0.41, "num_input_tokens_seen": 13441664, "step": 1295 }, { "epoch": 0.38404726735598227, "grad_norm": 3.7764410954952514, "learning_rate": 9.459072982286886e-05, "loss": 0.3594, "num_input_tokens_seen": 13493264, "step": 1300 }, { "epoch": 0.38404726735598227, "eval_loss": 0.4715976417064667, "eval_runtime": 19.0919, "eval_samples_per_second": 3.143, "eval_steps_per_second": 0.786, "num_input_tokens_seen": 13493264, "step": 1300 }, { "epoch": 0.38552437223042835, "grad_norm": 15.50071615139337, "learning_rate": 9.453534651910765e-05, "loss": 0.402, "num_input_tokens_seen": 13545256, "step": 1305 }, { "epoch": 0.3870014771048744, "grad_norm": 23.183495844663526, "learning_rate": 9.447969751791577e-05, "loss": 0.3075, "num_input_tokens_seen": 13597792, "step": 1310 }, { "epoch": 0.38847858197932056, "grad_norm": 9.67544956653079, "learning_rate": 9.442378315129455e-05, "loss": 0.3702, "num_input_tokens_seen": 13649848, "step": 1315 }, { "epoch": 0.38995568685376664, "grad_norm": 2.9059361985914416, "learning_rate": 9.436760375282859e-05, "loss": 0.3603, "num_input_tokens_seen": 13701592, "step": 1320 }, { "epoch": 0.3914327917282127, "grad_norm": 10.431238621222658, "learning_rate": 9.431115965768358e-05, "loss": 0.4072, "num_input_tokens_seen": 13753064, "step": 1325 }, { "epoch": 0.3929098966026588, "grad_norm": 11.216612661805582, "learning_rate": 9.425445120260445e-05, "loss": 0.3279, "num_input_tokens_seen": 13805528, "step": 1330 }, { "epoch": 0.39438700147710487, "grad_norm": 32.22838128750362, "learning_rate": 9.419747872591325e-05, "loss": 0.3754, "num_input_tokens_seen": 13858192, "step": 1335 }, { "epoch": 0.39586410635155095, "grad_norm": 1.8703742105152936, "learning_rate": 9.414024256750723e-05, "loss": 0.3754, "num_input_tokens_seen": 13910128, "step": 1340 }, { "epoch": 0.397341211225997, "grad_norm": 5.011302513950015, "learning_rate": 9.408274306885674e-05, "loss": 0.3235, "num_input_tokens_seen": 13962536, "step": 1345 }, { "epoch": 0.3988183161004431, "grad_norm": 15.197987760428996, "learning_rate": 9.402498057300317e-05, "loss": 0.3731, "num_input_tokens_seen": 14014736, "step": 1350 }, { "epoch": 0.3988183161004431, "eval_loss": 0.5565826892852783, "eval_runtime": 19.3029, "eval_samples_per_second": 3.108, "eval_steps_per_second": 0.777, "num_input_tokens_seen": 14014736, "step": 1350 }, { "epoch": 0.40029542097488924, "grad_norm": 2.292705535408954, "learning_rate": 9.396695542455704e-05, "loss": 0.4115, "num_input_tokens_seen": 14066880, "step": 1355 }, { "epoch": 0.4017725258493353, "grad_norm": 10.68072230240614, "learning_rate": 9.390866796969577e-05, "loss": 0.365, "num_input_tokens_seen": 14118320, "step": 1360 }, { "epoch": 0.4032496307237814, "grad_norm": 28.45565288311722, "learning_rate": 9.385011855616177e-05, "loss": 0.3904, "num_input_tokens_seen": 14169208, "step": 1365 }, { "epoch": 0.40472673559822747, "grad_norm": 9.32794663574214, "learning_rate": 9.379130753326021e-05, "loss": 0.5425, "num_input_tokens_seen": 14220632, "step": 1370 }, { "epoch": 0.40620384047267355, "grad_norm": 4.737143544435888, "learning_rate": 9.373223525185709e-05, "loss": 0.3985, "num_input_tokens_seen": 14272640, "step": 1375 }, { "epoch": 0.4076809453471196, "grad_norm": 17.480173613134482, "learning_rate": 9.367290206437702e-05, "loss": 0.3528, "num_input_tokens_seen": 14324960, "step": 1380 }, { "epoch": 0.4091580502215657, "grad_norm": 4.40598964753602, "learning_rate": 9.361330832480124e-05, "loss": 0.3687, "num_input_tokens_seen": 14376792, "step": 1385 }, { "epoch": 0.41063515509601184, "grad_norm": 15.961709998187562, "learning_rate": 9.355345438866538e-05, "loss": 0.3552, "num_input_tokens_seen": 14428192, "step": 1390 }, { "epoch": 0.4121122599704579, "grad_norm": 4.033485652398453, "learning_rate": 9.349334061305743e-05, "loss": 0.3194, "num_input_tokens_seen": 14480568, "step": 1395 }, { "epoch": 0.413589364844904, "grad_norm": 9.187315388235644, "learning_rate": 9.343296735661557e-05, "loss": 0.388, "num_input_tokens_seen": 14532288, "step": 1400 }, { "epoch": 0.413589364844904, "eval_loss": 0.38656601309776306, "eval_runtime": 19.1495, "eval_samples_per_second": 3.133, "eval_steps_per_second": 0.783, "num_input_tokens_seen": 14532288, "step": 1400 } ], "logging_steps": 5, "max_steps": 6770, "num_input_tokens_seen": 14532288, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 958658687795200.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }