{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9985022466300549, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00798801797304044, "grad_norm": 0.02341790311038494, "learning_rate": 4.99921047320825e-05, "loss": 2.5021, "num_input_tokens_seen": 162400, "step": 2 }, { "epoch": 0.01597603594608088, "grad_norm": 0.030116591602563858, "learning_rate": 4.996842391515044e-05, "loss": 2.5086, "num_input_tokens_seen": 316832, "step": 4 }, { "epoch": 0.023964053919121316, "grad_norm": 0.03529619425535202, "learning_rate": 4.992897250651535e-05, "loss": 2.4884, "num_input_tokens_seen": 456560, "step": 6 }, { "epoch": 0.03195207189216176, "grad_norm": 0.028305748477578163, "learning_rate": 4.987377542453251e-05, "loss": 2.5476, "num_input_tokens_seen": 612256, "step": 8 }, { "epoch": 0.0399400898652022, "grad_norm": 0.02712586335837841, "learning_rate": 4.980286753286195e-05, "loss": 2.5127, "num_input_tokens_seen": 774544, "step": 10 }, { "epoch": 0.04792810783824263, "grad_norm": 0.029004713520407677, "learning_rate": 4.971629361844785e-05, "loss": 2.5413, "num_input_tokens_seen": 939664, "step": 12 }, { "epoch": 0.05591612581128307, "grad_norm": 0.02819664776325226, "learning_rate": 4.9614108363230135e-05, "loss": 2.4724, "num_input_tokens_seen": 1091408, "step": 14 }, { "epoch": 0.06390414378432352, "grad_norm": 0.025089334696531296, "learning_rate": 4.949637630960617e-05, "loss": 2.4269, "num_input_tokens_seen": 1251968, "step": 16 }, { "epoch": 0.07189216175736396, "grad_norm": 0.027914991602301598, "learning_rate": 4.9363171819664434e-05, "loss": 2.4497, "num_input_tokens_seen": 1413632, "step": 18 }, { "epoch": 0.0798801797304044, "grad_norm": 0.029010389000177383, "learning_rate": 4.9214579028215776e-05, "loss": 2.4304, "num_input_tokens_seen": 1561616, "step": 20 }, { "epoch": 0.08786819770344484, "grad_norm": 0.032729391008615494, "learning_rate": 4.905069178965215e-05, "loss": 2.4962, "num_input_tokens_seen": 1702048, "step": 22 }, { "epoch": 0.09585621567648527, "grad_norm": 0.034885670989751816, "learning_rate": 4.887161361866608e-05, "loss": 2.4427, "num_input_tokens_seen": 1847168, "step": 24 }, { "epoch": 0.1038442336495257, "grad_norm": 0.028687166050076485, "learning_rate": 4.867745762486861e-05, "loss": 2.4515, "num_input_tokens_seen": 2002624, "step": 26 }, { "epoch": 0.11183225162256615, "grad_norm": 0.02790650725364685, "learning_rate": 4.846834644134686e-05, "loss": 2.4309, "num_input_tokens_seen": 2162992, "step": 28 }, { "epoch": 0.11982026959560658, "grad_norm": 0.0356418751180172, "learning_rate": 4.8244412147206284e-05, "loss": 2.4502, "num_input_tokens_seen": 2304544, "step": 30 }, { "epoch": 0.12780828756864704, "grad_norm": 0.03653424605727196, "learning_rate": 4.800579618414676e-05, "loss": 2.4509, "num_input_tokens_seen": 2444336, "step": 32 }, { "epoch": 0.13579630554168748, "grad_norm": 0.03637837618589401, "learning_rate": 4.775264926712489e-05, "loss": 2.4778, "num_input_tokens_seen": 2598848, "step": 34 }, { "epoch": 0.14378432351472792, "grad_norm": 0.03624645993113518, "learning_rate": 4.7485131289159276e-05, "loss": 2.3898, "num_input_tokens_seen": 2745520, "step": 36 }, { "epoch": 0.15177234148776836, "grad_norm": 0.039211999624967575, "learning_rate": 4.720341122033862e-05, "loss": 2.5515, "num_input_tokens_seen": 2913392, "step": 38 }, { "epoch": 0.1597603594608088, "grad_norm": 0.04233478382229805, "learning_rate": 4.690766700109659e-05, "loss": 2.4457, "num_input_tokens_seen": 3053600, "step": 40 }, { "epoch": 0.16774837743384924, "grad_norm": 0.03884820267558098, "learning_rate": 4.659808542982088e-05, "loss": 2.4336, "num_input_tokens_seen": 3199248, "step": 42 }, { "epoch": 0.17573639540688968, "grad_norm": 0.04048454761505127, "learning_rate": 4.6274862044867304e-05, "loss": 2.4655, "num_input_tokens_seen": 3324464, "step": 44 }, { "epoch": 0.18372441337993012, "grad_norm": 0.034897685050964355, "learning_rate": 4.593820100105355e-05, "loss": 2.4566, "num_input_tokens_seen": 3485088, "step": 46 }, { "epoch": 0.19171243135297053, "grad_norm": 0.04081996530294418, "learning_rate": 4.558831494071069e-05, "loss": 2.4912, "num_input_tokens_seen": 3635152, "step": 48 }, { "epoch": 0.19970044932601097, "grad_norm": 0.03764244541525841, "learning_rate": 4.522542485937369e-05, "loss": 2.4185, "num_input_tokens_seen": 3787152, "step": 50 }, { "epoch": 0.2076884672990514, "grad_norm": 0.04559960216283798, "learning_rate": 4.484975996619589e-05, "loss": 2.4221, "num_input_tokens_seen": 3931280, "step": 52 }, { "epoch": 0.21567648527209185, "grad_norm": 0.03996207192540169, "learning_rate": 4.4461557539175594e-05, "loss": 2.4091, "num_input_tokens_seen": 4083536, "step": 54 }, { "epoch": 0.2236645032451323, "grad_norm": 0.042057909071445465, "learning_rate": 4.40610627752862e-05, "loss": 2.4697, "num_input_tokens_seen": 4239440, "step": 56 }, { "epoch": 0.23165252121817273, "grad_norm": 0.04598196595907211, "learning_rate": 4.3648528635604556e-05, "loss": 2.3675, "num_input_tokens_seen": 4369008, "step": 58 }, { "epoch": 0.23964053919121317, "grad_norm": 0.03921639174222946, "learning_rate": 4.3224215685535294e-05, "loss": 2.4421, "num_input_tokens_seen": 4525680, "step": 60 }, { "epoch": 0.2476285571642536, "grad_norm": 0.03894852474331856, "learning_rate": 4.278839193023214e-05, "loss": 2.3973, "num_input_tokens_seen": 4683360, "step": 62 }, { "epoch": 0.2556165751372941, "grad_norm": 0.048196952790021896, "learning_rate": 4.234133264532012e-05, "loss": 2.5022, "num_input_tokens_seen": 4833520, "step": 64 }, { "epoch": 0.2636045931103345, "grad_norm": 0.03995994105935097, "learning_rate": 4.188332020302561e-05, "loss": 2.3089, "num_input_tokens_seen": 4974480, "step": 66 }, { "epoch": 0.27159261108337496, "grad_norm": 0.047357227653265, "learning_rate": 4.1414643893823914e-05, "loss": 2.3378, "num_input_tokens_seen": 5099616, "step": 68 }, { "epoch": 0.2795806290564154, "grad_norm": 0.046735942363739014, "learning_rate": 4.093559974371725e-05, "loss": 2.4277, "num_input_tokens_seen": 5244416, "step": 70 }, { "epoch": 0.28756864702945584, "grad_norm": 0.04764910414814949, "learning_rate": 4.044649032725836e-05, "loss": 2.3468, "num_input_tokens_seen": 5392128, "step": 72 }, { "epoch": 0.2955566650024963, "grad_norm": 0.04130158573389053, "learning_rate": 3.9947624576437975e-05, "loss": 2.3907, "num_input_tokens_seen": 5545216, "step": 74 }, { "epoch": 0.3035446829755367, "grad_norm": 0.048350926488637924, "learning_rate": 3.943931758555669e-05, "loss": 2.3878, "num_input_tokens_seen": 5712080, "step": 76 }, { "epoch": 0.31153270094857716, "grad_norm": 0.052576858550310135, "learning_rate": 3.8921890412204705e-05, "loss": 2.3853, "num_input_tokens_seen": 5858800, "step": 78 }, { "epoch": 0.3195207189216176, "grad_norm": 0.04666517302393913, "learning_rate": 3.8395669874474915e-05, "loss": 2.3765, "num_input_tokens_seen": 6008784, "step": 80 }, { "epoch": 0.32750873689465804, "grad_norm": 0.045086730271577835, "learning_rate": 3.786098834453766e-05, "loss": 2.4277, "num_input_tokens_seen": 6155536, "step": 82 }, { "epoch": 0.3354967548676985, "grad_norm": 0.0495474711060524, "learning_rate": 3.731818353870729e-05, "loss": 2.3935, "num_input_tokens_seen": 6306576, "step": 84 }, { "epoch": 0.3434847728407389, "grad_norm": 0.0486772395670414, "learning_rate": 3.6767598304133324e-05, "loss": 2.4679, "num_input_tokens_seen": 6470448, "step": 86 }, { "epoch": 0.35147279081377936, "grad_norm": 0.05734413489699364, "learning_rate": 3.6209580402250815e-05, "loss": 2.3868, "num_input_tokens_seen": 6616864, "step": 88 }, { "epoch": 0.3594608087868198, "grad_norm": 0.04023748263716698, "learning_rate": 3.564448228912682e-05, "loss": 2.3859, "num_input_tokens_seen": 6786448, "step": 90 }, { "epoch": 0.36744882675986024, "grad_norm": 0.04801137000322342, "learning_rate": 3.507266089284157e-05, "loss": 2.4216, "num_input_tokens_seen": 6954720, "step": 92 }, { "epoch": 0.3754368447329007, "grad_norm": 0.05846545100212097, "learning_rate": 3.4494477388045035e-05, "loss": 2.4611, "num_input_tokens_seen": 7094208, "step": 94 }, { "epoch": 0.38342486270594106, "grad_norm": 0.04973718896508217, "learning_rate": 3.3910296967831266e-05, "loss": 2.3384, "num_input_tokens_seen": 7227440, "step": 96 }, { "epoch": 0.3914128806789815, "grad_norm": 0.0684569925069809, "learning_rate": 3.332048861307467e-05, "loss": 2.4074, "num_input_tokens_seen": 7362608, "step": 98 }, { "epoch": 0.39940089865202194, "grad_norm": 0.05771000683307648, "learning_rate": 3.272542485937369e-05, "loss": 2.3572, "num_input_tokens_seen": 7515024, "step": 100 }, { "epoch": 0.4073889166250624, "grad_norm": 0.043234046548604965, "learning_rate": 3.21254815617494e-05, "loss": 2.3579, "num_input_tokens_seen": 7670624, "step": 102 }, { "epoch": 0.4153769345981028, "grad_norm": 0.06148277968168259, "learning_rate": 3.152103765724743e-05, "loss": 2.3728, "num_input_tokens_seen": 7817648, "step": 104 }, { "epoch": 0.42336495257114326, "grad_norm": 0.04587692394852638, "learning_rate": 3.091247492559312e-05, "loss": 2.2712, "num_input_tokens_seen": 7968432, "step": 106 }, { "epoch": 0.4313529705441837, "grad_norm": 0.05067060887813568, "learning_rate": 3.0300177748051373e-05, "loss": 2.4054, "num_input_tokens_seen": 8127312, "step": 108 }, { "epoch": 0.43934098851722414, "grad_norm": 0.04505151882767677, "learning_rate": 2.9684532864643122e-05, "loss": 2.3966, "num_input_tokens_seen": 8275008, "step": 110 }, { "epoch": 0.4473290064902646, "grad_norm": 0.058195654302835464, "learning_rate": 2.9065929129872094e-05, "loss": 2.4478, "num_input_tokens_seen": 8425536, "step": 112 }, { "epoch": 0.455317024463305, "grad_norm": 0.05548926442861557, "learning_rate": 2.844475726711595e-05, "loss": 2.4386, "num_input_tokens_seen": 8573776, "step": 114 }, { "epoch": 0.46330504243634546, "grad_norm": 0.05548759549856186, "learning_rate": 2.782140962183704e-05, "loss": 2.4176, "num_input_tokens_seen": 8729200, "step": 116 }, { "epoch": 0.4712930604093859, "grad_norm": 0.05361337587237358, "learning_rate": 2.7196279913768584e-05, "loss": 2.3505, "num_input_tokens_seen": 8884720, "step": 118 }, { "epoch": 0.47928107838242634, "grad_norm": 0.05254572257399559, "learning_rate": 2.656976298823284e-05, "loss": 2.3867, "num_input_tokens_seen": 9035088, "step": 120 }, { "epoch": 0.4872690963554668, "grad_norm": 0.051810409873723984, "learning_rate": 2.594225456674837e-05, "loss": 2.4321, "num_input_tokens_seen": 9188528, "step": 122 }, { "epoch": 0.4952571143285072, "grad_norm": 0.05497866868972778, "learning_rate": 2.531415099708382e-05, "loss": 2.4392, "num_input_tokens_seen": 9352752, "step": 124 }, { "epoch": 0.5032451323015477, "grad_norm": 0.06646806746721268, "learning_rate": 2.4685849002916183e-05, "loss": 2.4005, "num_input_tokens_seen": 9497840, "step": 126 }, { "epoch": 0.5112331502745882, "grad_norm": 0.05319731682538986, "learning_rate": 2.4057745433251635e-05, "loss": 2.309, "num_input_tokens_seen": 9654816, "step": 128 }, { "epoch": 0.5192211682476285, "grad_norm": 0.07142467051744461, "learning_rate": 2.3430237011767167e-05, "loss": 2.522, "num_input_tokens_seen": 9814944, "step": 130 }, { "epoch": 0.527209186220669, "grad_norm": 0.06771399825811386, "learning_rate": 2.280372008623142e-05, "loss": 2.4151, "num_input_tokens_seen": 9951952, "step": 132 }, { "epoch": 0.5351972041937094, "grad_norm": 0.062045566737651825, "learning_rate": 2.217859037816296e-05, "loss": 2.3723, "num_input_tokens_seen": 10112272, "step": 134 }, { "epoch": 0.5431852221667499, "grad_norm": 0.05891815572977066, "learning_rate": 2.155524273288405e-05, "loss": 2.416, "num_input_tokens_seen": 10281344, "step": 136 }, { "epoch": 0.5511732401397903, "grad_norm": 0.06526540219783783, "learning_rate": 2.0934070870127912e-05, "loss": 2.5335, "num_input_tokens_seen": 10409584, "step": 138 }, { "epoch": 0.5591612581128308, "grad_norm": 0.0685892105102539, "learning_rate": 2.031546713535688e-05, "loss": 2.4946, "num_input_tokens_seen": 10573840, "step": 140 }, { "epoch": 0.5671492760858712, "grad_norm": 0.05832570418715477, "learning_rate": 1.969982225194864e-05, "loss": 2.3169, "num_input_tokens_seen": 10729760, "step": 142 }, { "epoch": 0.5751372940589117, "grad_norm": 0.05707252770662308, "learning_rate": 1.908752507440689e-05, "loss": 2.3797, "num_input_tokens_seen": 10881664, "step": 144 }, { "epoch": 0.5831253120319521, "grad_norm": 0.05318214371800423, "learning_rate": 1.8478962342752583e-05, "loss": 2.4498, "num_input_tokens_seen": 11034352, "step": 146 }, { "epoch": 0.5911133300049926, "grad_norm": 0.06147105619311333, "learning_rate": 1.7874518438250597e-05, "loss": 2.3818, "num_input_tokens_seen": 11175952, "step": 148 }, { "epoch": 0.5991013479780329, "grad_norm": 0.05764273181557655, "learning_rate": 1.7274575140626318e-05, "loss": 2.3818, "num_input_tokens_seen": 11314112, "step": 150 }, { "epoch": 0.6070893659510734, "grad_norm": 0.05358808860182762, "learning_rate": 1.6679511386925337e-05, "loss": 2.4073, "num_input_tokens_seen": 11481792, "step": 152 }, { "epoch": 0.6150773839241138, "grad_norm": 0.05264300853013992, "learning_rate": 1.6089703032168733e-05, "loss": 2.3467, "num_input_tokens_seen": 11649040, "step": 154 }, { "epoch": 0.6230654018971543, "grad_norm": 0.060223497450351715, "learning_rate": 1.5505522611954975e-05, "loss": 2.4047, "num_input_tokens_seen": 11791488, "step": 156 }, { "epoch": 0.6310534198701947, "grad_norm": 0.0639820247888565, "learning_rate": 1.4927339107158437e-05, "loss": 2.2998, "num_input_tokens_seen": 11940144, "step": 158 }, { "epoch": 0.6390414378432352, "grad_norm": 0.053195178508758545, "learning_rate": 1.4355517710873184e-05, "loss": 2.383, "num_input_tokens_seen": 12097648, "step": 160 }, { "epoch": 0.6470294558162756, "grad_norm": 0.059055812656879425, "learning_rate": 1.3790419597749199e-05, "loss": 2.3525, "num_input_tokens_seen": 12253152, "step": 162 }, { "epoch": 0.6550174737893161, "grad_norm": 0.07648273557424545, "learning_rate": 1.3232401695866687e-05, "loss": 2.4133, "num_input_tokens_seen": 12410080, "step": 164 }, { "epoch": 0.6630054917623565, "grad_norm": 0.06658945977687836, "learning_rate": 1.2681816461292715e-05, "loss": 2.3677, "num_input_tokens_seen": 12544848, "step": 166 }, { "epoch": 0.670993509735397, "grad_norm": 0.06263954192399979, "learning_rate": 1.2139011655462337e-05, "loss": 2.4215, "num_input_tokens_seen": 12701808, "step": 168 }, { "epoch": 0.6789815277084373, "grad_norm": 0.0712100937962532, "learning_rate": 1.1604330125525079e-05, "loss": 2.3162, "num_input_tokens_seen": 12845840, "step": 170 }, { "epoch": 0.6869695456814778, "grad_norm": 0.06635987758636475, "learning_rate": 1.107810958779531e-05, "loss": 2.4129, "num_input_tokens_seen": 12988144, "step": 172 }, { "epoch": 0.6949575636545182, "grad_norm": 0.07223087549209595, "learning_rate": 1.0560682414443315e-05, "loss": 2.4046, "num_input_tokens_seen": 13129808, "step": 174 }, { "epoch": 0.7029455816275587, "grad_norm": 0.08978503942489624, "learning_rate": 1.0052375423562038e-05, "loss": 2.3596, "num_input_tokens_seen": 13269264, "step": 176 }, { "epoch": 0.7109335996005991, "grad_norm": 0.06082882732152939, "learning_rate": 9.553509672741645e-06, "loss": 2.3047, "num_input_tokens_seen": 13417184, "step": 178 }, { "epoch": 0.7189216175736396, "grad_norm": 0.07053129374980927, "learning_rate": 9.064400256282757e-06, "loss": 2.4177, "num_input_tokens_seen": 13561008, "step": 180 }, { "epoch": 0.72690963554668, "grad_norm": 0.0724414512515068, "learning_rate": 8.585356106176094e-06, "loss": 2.4451, "num_input_tokens_seen": 13706976, "step": 182 }, { "epoch": 0.7348976535197205, "grad_norm": 0.07262030243873596, "learning_rate": 8.116679796974388e-06, "loss": 2.3997, "num_input_tokens_seen": 13835936, "step": 184 }, { "epoch": 0.7428856714927609, "grad_norm": 0.07069452106952667, "learning_rate": 7.65866735467988e-06, "loss": 2.412, "num_input_tokens_seen": 13962144, "step": 186 }, { "epoch": 0.7508736894658014, "grad_norm": 0.053225237876176834, "learning_rate": 7.211608069767867e-06, "loss": 2.4054, "num_input_tokens_seen": 14128800, "step": 188 }, { "epoch": 0.7588617074388417, "grad_norm": 0.05883209779858589, "learning_rate": 6.775784314464717e-06, "loss": 2.4417, "num_input_tokens_seen": 14278288, "step": 190 }, { "epoch": 0.7668497254118821, "grad_norm": 0.0630330890417099, "learning_rate": 6.3514713643954475e-06, "loss": 2.3358, "num_input_tokens_seen": 14429936, "step": 192 }, { "epoch": 0.7748377433849226, "grad_norm": 0.06319437175989151, "learning_rate": 5.9389372247138e-06, "loss": 2.5089, "num_input_tokens_seen": 14567552, "step": 194 }, { "epoch": 0.782825761357963, "grad_norm": 0.0682671070098877, "learning_rate": 5.538442460824417e-06, "loss": 2.4246, "num_input_tokens_seen": 14704528, "step": 196 }, { "epoch": 0.7908137793310035, "grad_norm": 0.06514116376638412, "learning_rate": 5.150240033804116e-06, "loss": 2.369, "num_input_tokens_seen": 14846192, "step": 198 }, { "epoch": 0.7988017973040439, "grad_norm": 0.05886775627732277, "learning_rate": 4.7745751406263165e-06, "loss": 2.4204, "num_input_tokens_seen": 14993936, "step": 200 }, { "epoch": 0.8067898152770844, "grad_norm": 0.06780359148979187, "learning_rate": 4.411685059289314e-06, "loss": 2.3141, "num_input_tokens_seen": 15136240, "step": 202 }, { "epoch": 0.8147778332501248, "grad_norm": 0.06595364212989807, "learning_rate": 4.061798998946459e-06, "loss": 2.3841, "num_input_tokens_seen": 15274672, "step": 204 }, { "epoch": 0.8227658512231653, "grad_norm": 0.06310597062110901, "learning_rate": 3.725137955132707e-06, "loss": 2.3653, "num_input_tokens_seen": 15417920, "step": 206 }, { "epoch": 0.8307538691962056, "grad_norm": 0.06177780404686928, "learning_rate": 3.4019145701791184e-06, "loss": 2.4382, "num_input_tokens_seen": 15557920, "step": 208 }, { "epoch": 0.8387418871692461, "grad_norm": 0.0720074325799942, "learning_rate": 3.092332998903416e-06, "loss": 2.4455, "num_input_tokens_seen": 15687968, "step": 210 }, { "epoch": 0.8467299051422865, "grad_norm": 0.05820966139435768, "learning_rate": 2.7965887796613884e-06, "loss": 2.4322, "num_input_tokens_seen": 15843072, "step": 212 }, { "epoch": 0.854717923115327, "grad_norm": 0.07062980532646179, "learning_rate": 2.514868710840723e-06, "loss": 2.4433, "num_input_tokens_seen": 16002112, "step": 214 }, { "epoch": 0.8627059410883674, "grad_norm": 0.07145073264837265, "learning_rate": 2.2473507328751086e-06, "loss": 2.4399, "num_input_tokens_seen": 16135328, "step": 216 }, { "epoch": 0.8706939590614079, "grad_norm": 0.06476866453886032, "learning_rate": 1.9942038158532407e-06, "loss": 2.378, "num_input_tokens_seen": 16293008, "step": 218 }, { "epoch": 0.8786819770344483, "grad_norm": 0.05755852535367012, "learning_rate": 1.7555878527937164e-06, "loss": 2.3874, "num_input_tokens_seen": 16441024, "step": 220 }, { "epoch": 0.8866699950074888, "grad_norm": 0.059895843267440796, "learning_rate": 1.5316535586531483e-06, "loss": 2.4028, "num_input_tokens_seen": 16587616, "step": 222 }, { "epoch": 0.8946580129805292, "grad_norm": 0.055422358214855194, "learning_rate": 1.3225423751313942e-06, "loss": 2.3372, "num_input_tokens_seen": 16730800, "step": 224 }, { "epoch": 0.9026460309535697, "grad_norm": 0.06502599269151688, "learning_rate": 1.1283863813339263e-06, "loss": 2.4288, "num_input_tokens_seen": 16886144, "step": 226 }, { "epoch": 0.91063404892661, "grad_norm": 0.06254018843173981, "learning_rate": 9.493082103478517e-07, "loss": 2.3483, "num_input_tokens_seen": 17038816, "step": 228 }, { "epoch": 0.9186220668996505, "grad_norm": 0.08103015273809433, "learning_rate": 7.854209717842231e-07, "loss": 2.4648, "num_input_tokens_seen": 17171792, "step": 230 }, { "epoch": 0.9266100848726909, "grad_norm": 0.06423594057559967, "learning_rate": 6.368281803355691e-07, "loss": 2.4327, "num_input_tokens_seen": 17324752, "step": 232 }, { "epoch": 0.9345981028457314, "grad_norm": 0.06621188670396805, "learning_rate": 5.036236903938285e-07, "loss": 2.3037, "num_input_tokens_seen": 17485168, "step": 234 }, { "epoch": 0.9425861208187718, "grad_norm": 0.06229124590754509, "learning_rate": 3.8589163676986674e-07, "loss": 2.4094, "num_input_tokens_seen": 17632832, "step": 236 }, { "epoch": 0.9505741387918123, "grad_norm": 0.07828541845083237, "learning_rate": 2.8370638155215123e-07, "loss": 2.3348, "num_input_tokens_seen": 17781744, "step": 238 }, { "epoch": 0.9585621567648527, "grad_norm": 0.049717940390110016, "learning_rate": 1.9713246713805588e-07, "loss": 2.3979, "num_input_tokens_seen": 17959392, "step": 240 }, { "epoch": 0.9665501747378932, "grad_norm": 0.06252018362283707, "learning_rate": 1.2622457546749567e-07, "loss": 2.346, "num_input_tokens_seen": 18121744, "step": 242 }, { "epoch": 0.9745381927109336, "grad_norm": 0.06641443818807602, "learning_rate": 7.102749348465165e-08, "loss": 2.4486, "num_input_tokens_seen": 18273856, "step": 244 }, { "epoch": 0.982526210683974, "grad_norm": 0.052787262946367264, "learning_rate": 3.157608484956332e-08, "loss": 2.3959, "num_input_tokens_seen": 18407456, "step": 246 }, { "epoch": 0.9905142286570144, "grad_norm": 0.058858100324869156, "learning_rate": 7.895267917501504e-09, "loss": 2.4287, "num_input_tokens_seen": 18555776, "step": 248 }, { "epoch": 0.9985022466300549, "grad_norm": 0.07202674448490143, "learning_rate": 0.0, "loss": 2.4911, "num_input_tokens_seen": 18683616, "step": 250 }, { "epoch": 0.9985022466300549, "num_input_tokens_seen": 18683616, "step": 250, "total_flos": 1.599323192678744e+18, "train_loss": 2.4129163398742675, "train_runtime": 18708.0864, "train_samples_per_second": 0.214, "train_steps_per_second": 0.013 } ], "logging_steps": 2, "max_steps": 250, "num_input_tokens_seen": 18683616, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.599323192678744e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }