|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 955, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2461.7163655359695, |
|
"learning_rate": 2.0833333333333333e-07, |
|
"loss": 13.4422, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1188.3611863310348, |
|
"learning_rate": 1.0416666666666667e-06, |
|
"loss": 12.564, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 383.2777721914162, |
|
"learning_rate": 2.0833333333333334e-06, |
|
"loss": 7.5043, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 85.32523705996411, |
|
"learning_rate": 3.125e-06, |
|
"loss": 4.3848, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 44.902597609978045, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 3.608, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 33.8945502589581, |
|
"learning_rate": 5.208333333333334e-06, |
|
"loss": 3.3411, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 22.789039733408682, |
|
"learning_rate": 6.25e-06, |
|
"loss": 3.1891, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 15.294909208517709, |
|
"learning_rate": 7.291666666666667e-06, |
|
"loss": 3.0935, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 16.346336894842942, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 2.9414, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 23.553918714950836, |
|
"learning_rate": 9.375000000000001e-06, |
|
"loss": 2.8648, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 164.07588077953187, |
|
"learning_rate": 1.0416666666666668e-05, |
|
"loss": 2.5263, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 48.559212060939444, |
|
"learning_rate": 1.1458333333333333e-05, |
|
"loss": 2.037, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 23.491477625412887, |
|
"learning_rate": 1.25e-05, |
|
"loss": 1.5635, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 18.534543889923896, |
|
"learning_rate": 1.3541666666666668e-05, |
|
"loss": 1.4948, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 12.384878393353059, |
|
"learning_rate": 1.4583333333333333e-05, |
|
"loss": 1.4257, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 9.477210018385332, |
|
"learning_rate": 1.5625e-05, |
|
"loss": 1.3939, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.144109465129273, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.3457, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.552451526784706, |
|
"learning_rate": 1.7708333333333335e-05, |
|
"loss": 1.3328, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.9924625599541206, |
|
"learning_rate": 1.8750000000000002e-05, |
|
"loss": 1.3063, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.7886605684614403, |
|
"learning_rate": 1.979166666666667e-05, |
|
"loss": 1.3082, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.590180516177269, |
|
"learning_rate": 1.999892997072575e-05, |
|
"loss": 1.2868, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.258640200275494, |
|
"learning_rate": 1.99945833692589e-05, |
|
"loss": 1.3019, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.200726638847038, |
|
"learning_rate": 1.9986894771071707e-05, |
|
"loss": 1.2737, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.226850310714147, |
|
"learning_rate": 1.9975866747083734e-05, |
|
"loss": 1.2763, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.9708049477307656, |
|
"learning_rate": 1.9961502984854394e-05, |
|
"loss": 1.2526, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.498450336726286, |
|
"learning_rate": 1.9943808287349902e-05, |
|
"loss": 1.2729, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.3080885736994894, |
|
"learning_rate": 1.992278857133726e-05, |
|
"loss": 1.243, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.053212283450933, |
|
"learning_rate": 1.9898450865405786e-05, |
|
"loss": 1.2731, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.17535563832229, |
|
"learning_rate": 1.9870803307616916e-05, |
|
"loss": 1.2692, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.822523160059189, |
|
"learning_rate": 1.983985514278296e-05, |
|
"loss": 1.2534, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.4132366650716515, |
|
"learning_rate": 1.9805616719375852e-05, |
|
"loss": 1.2642, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.773367552594801, |
|
"learning_rate": 1.9768099486066776e-05, |
|
"loss": 1.2669, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 6.505204667225315, |
|
"learning_rate": 1.9727315987897993e-05, |
|
"loss": 1.2221, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.378575933551222, |
|
"learning_rate": 1.9683279862087986e-05, |
|
"loss": 1.2432, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.9521011616333936, |
|
"learning_rate": 1.963600583347147e-05, |
|
"loss": 1.2243, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.126328532378981, |
|
"learning_rate": 1.9585509709575646e-05, |
|
"loss": 1.242, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.314912808408647, |
|
"learning_rate": 1.9531808375334512e-05, |
|
"loss": 1.2545, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.0121454881889225, |
|
"learning_rate": 1.9474919787442835e-05, |
|
"loss": 1.2378, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.369424629625938, |
|
"learning_rate": 1.9414862968351788e-05, |
|
"loss": 1.2304, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.135366351640658, |
|
"learning_rate": 1.935165799990821e-05, |
|
"loss": 1.2384, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.468708898225178, |
|
"learning_rate": 1.9285326016639624e-05, |
|
"loss": 1.2372, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.141211198337755, |
|
"learning_rate": 1.9215889198687245e-05, |
|
"loss": 1.2342, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.564399747906429, |
|
"learning_rate": 1.9143370764389374e-05, |
|
"loss": 1.245, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.423614603504273, |
|
"learning_rate": 1.906779496251763e-05, |
|
"loss": 1.2127, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.77469168667574, |
|
"learning_rate": 1.8989187064168643e-05, |
|
"loss": 1.2114, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.7507426379732873, |
|
"learning_rate": 1.8907573354313853e-05, |
|
"loss": 1.2057, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.2625231428295396, |
|
"learning_rate": 1.8822981123010343e-05, |
|
"loss": 1.2005, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.559565219376083, |
|
"learning_rate": 1.873543865627556e-05, |
|
"loss": 1.2121, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.674000836928357, |
|
"learning_rate": 1.8644975226629025e-05, |
|
"loss": 1.2064, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 3.054211528310419, |
|
"learning_rate": 1.8551621083304147e-05, |
|
"loss": 1.206, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 3.8005222642398317, |
|
"learning_rate": 1.8455407442133467e-05, |
|
"loss": 1.1824, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.270369602281788, |
|
"learning_rate": 1.8356366475110697e-05, |
|
"loss": 1.2048, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 3.904842228016003, |
|
"learning_rate": 1.8254531299633007e-05, |
|
"loss": 1.2052, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.286875472560863, |
|
"learning_rate": 1.81499359674272e-05, |
|
"loss": 1.2018, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.1899104367297864, |
|
"learning_rate": 1.8042615453163484e-05, |
|
"loss": 1.2018, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.273989115747855, |
|
"learning_rate": 1.7932605642760607e-05, |
|
"loss": 1.1888, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.268115493890591, |
|
"learning_rate": 1.7819943321386295e-05, |
|
"loss": 1.1906, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.421685752864093, |
|
"learning_rate": 1.7704666161156994e-05, |
|
"loss": 1.2086, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.715983707085352, |
|
"learning_rate": 1.7586812708541046e-05, |
|
"loss": 1.1922, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.727172850716651, |
|
"learning_rate": 1.746642237146948e-05, |
|
"loss": 1.2142, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.115772105064366, |
|
"learning_rate": 1.7343535406158773e-05, |
|
"loss": 1.1973, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.907139227773489, |
|
"learning_rate": 1.7218192903649926e-05, |
|
"loss": 1.1804, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.857633937230002, |
|
"learning_rate": 1.7090436776068422e-05, |
|
"loss": 1.2183, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 3.6278739870653705, |
|
"learning_rate": 1.6960309742609603e-05, |
|
"loss": 1.1918, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.037905874676671, |
|
"learning_rate": 1.682785531525422e-05, |
|
"loss": 1.1793, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.042020400450931, |
|
"learning_rate": 1.6693117784218818e-05, |
|
"loss": 1.1942, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 3.6059688835627632, |
|
"learning_rate": 1.655614220314598e-05, |
|
"loss": 1.1901, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 3.027640956254004, |
|
"learning_rate": 1.6416974374039227e-05, |
|
"loss": 1.1815, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.469869352005406, |
|
"learning_rate": 1.6275660831947725e-05, |
|
"loss": 1.1882, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.913192938827672, |
|
"learning_rate": 1.6132248829405845e-05, |
|
"loss": 1.1799, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 4.193881408589, |
|
"learning_rate": 1.5986786320632842e-05, |
|
"loss": 1.1993, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 3.863252905360088, |
|
"learning_rate": 1.5839321945497847e-05, |
|
"loss": 1.1824, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 3.775458735232249, |
|
"learning_rate": 1.5689905013255683e-05, |
|
"loss": 1.1721, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 3.4362566052272228, |
|
"learning_rate": 1.5538585486058747e-05, |
|
"loss": 1.1846, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.812829346931678, |
|
"learning_rate": 1.5385413962250657e-05, |
|
"loss": 1.1828, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 5.6565563697093495, |
|
"learning_rate": 1.5230441659447128e-05, |
|
"loss": 1.1707, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.7129216895293524, |
|
"learning_rate": 1.507372039740978e-05, |
|
"loss": 1.1778, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.934774129904253, |
|
"learning_rate": 1.4915302580718614e-05, |
|
"loss": 1.1913, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.941744536837656, |
|
"learning_rate": 1.4755241181248923e-05, |
|
"loss": 1.1825, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 5.725933867619445, |
|
"learning_rate": 1.4593589720458507e-05, |
|
"loss": 1.1804, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 4.884385616617823, |
|
"learning_rate": 1.443040225149114e-05, |
|
"loss": 1.1766, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 6.448418045396495, |
|
"learning_rate": 1.4265733341102235e-05, |
|
"loss": 1.1677, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 4.543829508747149, |
|
"learning_rate": 1.4099638051412745e-05, |
|
"loss": 1.1802, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.2565990410366017, |
|
"learning_rate": 1.3932171921497483e-05, |
|
"loss": 1.1866, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 3.5069328829068236, |
|
"learning_rate": 1.3763390948813897e-05, |
|
"loss": 1.1622, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 4.3809314070147165, |
|
"learning_rate": 1.3593351570477608e-05, |
|
"loss": 1.1941, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 4.910200688628565, |
|
"learning_rate": 1.3422110644390911e-05, |
|
"loss": 1.1709, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.302365264776221, |
|
"learning_rate": 1.3249725430230595e-05, |
|
"loss": 1.1739, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.246681366222887, |
|
"learning_rate": 1.3076253570301409e-05, |
|
"loss": 1.1603, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 3.280421565698981, |
|
"learning_rate": 1.2901753070261565e-05, |
|
"loss": 1.186, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 3.81317992852912, |
|
"learning_rate": 1.2726282279726788e-05, |
|
"loss": 1.1658, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 3.8610620659207617, |
|
"learning_rate": 1.2549899872759288e-05, |
|
"loss": 1.1825, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 6.4010239256735115, |
|
"learning_rate": 1.237266482824832e-05, |
|
"loss": 1.1496, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 8.49710325570422, |
|
"learning_rate": 1.2194636410188748e-05, |
|
"loss": 1.173, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 3.6548860275475397, |
|
"learning_rate": 1.2015874147864314e-05, |
|
"loss": 1.1591, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.5679037646309197, |
|
"learning_rate": 1.183643781594219e-05, |
|
"loss": 1.1691, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.749061818607719, |
|
"learning_rate": 1.165638741448548e-05, |
|
"loss": 1.1716, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 3.8240982671284898, |
|
"learning_rate": 1.147578314889033e-05, |
|
"loss": 1.1539, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 2.9542808971197947, |
|
"learning_rate": 1.1294685409754434e-05, |
|
"loss": 1.159, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.1350411827108906, |
|
"learning_rate": 1.1113154752683548e-05, |
|
"loss": 1.2067, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 2.8905318360977628, |
|
"learning_rate": 1.0931251878042882e-05, |
|
"loss": 1.1769, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.0053111002447745, |
|
"learning_rate": 1.0749037610660041e-05, |
|
"loss": 1.1723, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.3485613915497026, |
|
"learning_rate": 1.0566572879486388e-05, |
|
"loss": 1.1653, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 4.082048046997961, |
|
"learning_rate": 1.0383918697223564e-05, |
|
"loss": 1.1785, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.8712520171978, |
|
"learning_rate": 1.020113613992203e-05, |
|
"loss": 1.1746, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.26766401286956, |
|
"learning_rate": 1.001828632655837e-05, |
|
"loss": 1.1372, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.0082362180315343, |
|
"learning_rate": 9.835430398598319e-06, |
|
"loss": 1.1699, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.6001344186192736, |
|
"learning_rate": 9.652629499552216e-06, |
|
"loss": 1.187, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 4.455098713036843, |
|
"learning_rate": 9.469944754529784e-06, |
|
"loss": 1.1526, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 4.506173198506875, |
|
"learning_rate": 9.28743724980107e-06, |
|
"loss": 1.1593, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.492647004795416, |
|
"learning_rate": 9.105168012370372e-06, |
|
"loss": 1.1407, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.070429402307121, |
|
"learning_rate": 8.923197989569981e-06, |
|
"loss": 1.1662, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 3.7831387429713894, |
|
"learning_rate": 8.741588028680566e-06, |
|
"loss": 1.1552, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 3.5233461063094142, |
|
"learning_rate": 8.560398856585002e-06, |
|
"loss": 1.165, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 3.079554638580303, |
|
"learning_rate": 8.379691059462478e-06, |
|
"loss": 1.1741, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 3.367550043194744, |
|
"learning_rate": 8.199525062529626e-06, |
|
"loss": 1.1572, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.442018148642513, |
|
"learning_rate": 8.01996110983552e-06, |
|
"loss": 1.1591, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.0524742405717142, |
|
"learning_rate": 7.841059244117189e-06, |
|
"loss": 1.1678, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.747340823312245, |
|
"learning_rate": 7.662879286722496e-06, |
|
"loss": 1.1598, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.9472846603989944, |
|
"learning_rate": 7.485480817607031e-06, |
|
"loss": 1.1753, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 3.9279530098870943, |
|
"learning_rate": 7.30892315541171e-06, |
|
"loss": 1.1462, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 3.056461777781048, |
|
"learning_rate": 7.133265337627757e-06, |
|
"loss": 1.1319, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.5655491221643256, |
|
"learning_rate": 6.958566100855716e-06, |
|
"loss": 1.1469, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 4.033014172979536, |
|
"learning_rate": 6.78488386116505e-06, |
|
"loss": 1.1522, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.2636560884902512, |
|
"learning_rate": 6.612276694560927e-06, |
|
"loss": 1.1653, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.099825834412963, |
|
"learning_rate": 6.44080231756473e-06, |
|
"loss": 1.1695, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.5276354474143465, |
|
"learning_rate": 6.2705180679147455e-06, |
|
"loss": 1.1586, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.8120221788177417, |
|
"learning_rate": 6.101480885393537e-06, |
|
"loss": 1.1735, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 2.7317060856823705, |
|
"learning_rate": 5.933747292788369e-06, |
|
"loss": 1.1601, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 2.8630486147702414, |
|
"learning_rate": 5.767373376991082e-06, |
|
"loss": 1.1548, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.044565653563942, |
|
"learning_rate": 5.602414770243698e-06, |
|
"loss": 1.1431, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.784721573945372, |
|
"learning_rate": 5.438926631536087e-06, |
|
"loss": 1.1562, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.9182503829263386, |
|
"learning_rate": 5.276963628161833e-06, |
|
"loss": 1.1501, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 3.0148200179539995, |
|
"learning_rate": 5.116579917438564e-06, |
|
"loss": 1.1599, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.779138701850297, |
|
"learning_rate": 4.957829128598781e-06, |
|
"loss": 1.1407, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.8364467956103447, |
|
"learning_rate": 4.80076434485727e-06, |
|
"loss": 1.1632, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.9955667796891445, |
|
"learning_rate": 4.645438085661085e-06, |
|
"loss": 1.1653, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.2640655907001874, |
|
"learning_rate": 4.4919022891280725e-06, |
|
"loss": 1.1526, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.2789098333817797, |
|
"learning_rate": 4.340208294679745e-06, |
|
"loss": 1.1529, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.755334882067621, |
|
"learning_rate": 4.190406825874377e-06, |
|
"loss": 1.1461, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.3859515382051506, |
|
"learning_rate": 4.042547973446017e-06, |
|
"loss": 1.136, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.7768674529680197, |
|
"learning_rate": 3.896681178555099e-06, |
|
"loss": 1.1494, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.7160887414907453, |
|
"learning_rate": 3.7528552162562858e-06, |
|
"loss": 1.1435, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.445482448946737, |
|
"learning_rate": 3.6111181791890184e-06, |
|
"loss": 1.1518, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.0678976022117186, |
|
"learning_rate": 3.471517461496253e-06, |
|
"loss": 1.1191, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.9920996388866588, |
|
"learning_rate": 3.3340997429767786e-06, |
|
"loss": 1.1509, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.5189782505268377, |
|
"learning_rate": 3.1989109734763936e-06, |
|
"loss": 1.1447, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 4.465010089218058, |
|
"learning_rate": 3.0659963575231544e-06, |
|
"loss": 1.1384, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.8116282143136586, |
|
"learning_rate": 2.935400339211841e-06, |
|
"loss": 1.1448, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.821536237846906, |
|
"learning_rate": 2.8071665873427244e-06, |
|
"loss": 1.1529, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.616142918937676, |
|
"learning_rate": 2.681337980819536e-06, |
|
"loss": 1.1276, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.648221961084194, |
|
"learning_rate": 2.5579565943116092e-06, |
|
"loss": 1.1511, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 4.305338632641424, |
|
"learning_rate": 2.437063684184893e-06, |
|
"loss": 1.139, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.820239172457965, |
|
"learning_rate": 2.318699674706639e-06, |
|
"loss": 1.1305, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.813379017123292, |
|
"learning_rate": 2.202904144528295e-06, |
|
"loss": 1.1465, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.5727532652839655, |
|
"learning_rate": 2.08971581345115e-06, |
|
"loss": 1.1392, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.2395630905470822, |
|
"learning_rate": 1.979172529479193e-06, |
|
"loss": 1.1545, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.6177906399433017, |
|
"learning_rate": 1.8713112561634671e-06, |
|
"loss": 1.1372, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.523775559650496, |
|
"learning_rate": 1.7661680602421594e-06, |
|
"loss": 1.1374, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.8009704991006927, |
|
"learning_rate": 1.663778099580583e-06, |
|
"loss": 1.1272, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.4906631285299548, |
|
"learning_rate": 1.5641756114150552e-06, |
|
"loss": 1.1294, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 3.1402684266522565, |
|
"learning_rate": 1.4673939009046268e-06, |
|
"loss": 1.1361, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.8346706660144414, |
|
"learning_rate": 1.3734653299944834e-06, |
|
"loss": 1.1416, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.656713151507134, |
|
"learning_rate": 1.2824213065947232e-06, |
|
"loss": 1.1123, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.6781138539963876, |
|
"learning_rate": 1.194292274078156e-06, |
|
"loss": 1.1428, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.683665650227597, |
|
"learning_rate": 1.1091077011006302e-06, |
|
"loss": 1.1546, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.6042088842327704, |
|
"learning_rate": 1.0268960717472742e-06, |
|
"loss": 1.1501, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.5487361412205423, |
|
"learning_rate": 9.476848760079671e-07, |
|
"loss": 1.1409, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.5405778981950227, |
|
"learning_rate": 8.715006005852144e-07, |
|
"loss": 1.1482, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.384660533885737, |
|
"learning_rate": 7.983687200375046e-07, |
|
"loss": 1.1196, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 3.7475252922006828, |
|
"learning_rate": 7.283136882611063e-07, |
|
"loss": 1.1417, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.832637666387188, |
|
"learning_rate": 6.613589303131506e-07, |
|
"loss": 1.1508, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.7620043818781497, |
|
"learning_rate": 5.975268345787455e-07, |
|
"loss": 1.1787, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.548067253837801, |
|
"learning_rate": 5.368387452847312e-07, |
|
"loss": 1.1385, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.6676850416420477, |
|
"learning_rate": 4.793149553625786e-07, |
|
"loss": 1.1464, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.813724959820899, |
|
"learning_rate": 4.2497469966282125e-07, |
|
"loss": 1.1306, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.48368594730713, |
|
"learning_rate": 3.738361485232922e-07, |
|
"loss": 1.1174, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.4430892492916625, |
|
"learning_rate": 3.2591640169331697e-07, |
|
"loss": 1.1436, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.857473623835687, |
|
"learning_rate": 2.8123148261587465e-07, |
|
"loss": 1.1365, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.5325081829773244, |
|
"learning_rate": 2.397963330696751e-07, |
|
"loss": 1.1367, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.5616316720827585, |
|
"learning_rate": 2.0162480817291442e-07, |
|
"loss": 1.1283, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.427762885310401, |
|
"learning_rate": 1.6672967175038634e-07, |
|
"loss": 1.1387, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.621680915045004, |
|
"learning_rate": 1.3512259206550748e-07, |
|
"loss": 1.1372, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.485476210774143, |
|
"learning_rate": 1.0681413791867157e-07, |
|
"loss": 1.1432, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.4391825872607704, |
|
"learning_rate": 8.181377511324306e-08, |
|
"loss": 1.1405, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.7849881804956573, |
|
"learning_rate": 6.012986329038462e-08, |
|
"loss": 1.1186, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.401471198249117, |
|
"learning_rate": 4.1769653133743036e-08, |
|
"loss": 1.1486, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.416110237940305, |
|
"learning_rate": 2.673928394496206e-08, |
|
"loss": 1.1432, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.4944820214028582, |
|
"learning_rate": 1.5043781590823313e-08, |
|
"loss": 1.1578, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.41912854884359, |
|
"learning_rate": 6.687056822688442e-09, |
|
"loss": 1.1183, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.738704813702846, |
|
"learning_rate": 1.6719039688162242e-09, |
|
"loss": 1.1294, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.4680956929047855, |
|
"learning_rate": 0.0, |
|
"loss": 1.1598, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 955, |
|
"total_flos": 262883405463552.0, |
|
"train_loss": 1.3692507653960382, |
|
"train_runtime": 2783.1362, |
|
"train_samples_per_second": 43.907, |
|
"train_steps_per_second": 0.343 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 955, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 262883405463552.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|