{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 4656, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006443298969072165, "grad_norm": 37.304439544677734, "learning_rate": 1.0000000000000002e-06, "loss": 3.2892, "step": 1 }, { "epoch": 0.01610824742268041, "grad_norm": 21.749683380126953, "learning_rate": 2.5e-05, "loss": 3.1951, "step": 25 }, { "epoch": 0.03221649484536082, "grad_norm": 17.803585052490234, "learning_rate": 5e-05, "loss": 3.3824, "step": 50 }, { "epoch": 0.04832474226804124, "grad_norm": 13.760115623474121, "learning_rate": 4.97286148501954e-05, "loss": 3.4591, "step": 75 }, { "epoch": 0.06443298969072164, "grad_norm": 17.308778762817383, "learning_rate": 4.945722970039079e-05, "loss": 3.5677, "step": 100 }, { "epoch": 0.08054123711340207, "grad_norm": 11.235258102416992, "learning_rate": 4.9185844550586194e-05, "loss": 3.5688, "step": 125 }, { "epoch": 0.09664948453608248, "grad_norm": 11.886427879333496, "learning_rate": 4.891445940078159e-05, "loss": 3.5572, "step": 150 }, { "epoch": 0.11275773195876289, "grad_norm": 10.693597793579102, "learning_rate": 4.864307425097699e-05, "loss": 3.5816, "step": 175 }, { "epoch": 0.12886597938144329, "grad_norm": 10.654956817626953, "learning_rate": 4.8371689101172386e-05, "loss": 3.4257, "step": 200 }, { "epoch": 0.14497422680412372, "grad_norm": 78.01908111572266, "learning_rate": 4.810030395136778e-05, "loss": 3.4916, "step": 225 }, { "epoch": 0.16108247422680413, "grad_norm": 14.090380668640137, "learning_rate": 4.782891880156318e-05, "loss": 3.4205, "step": 250 }, { "epoch": 0.17719072164948454, "grad_norm": 9.427169799804688, "learning_rate": 4.755753365175858e-05, "loss": 3.4519, "step": 275 }, { "epoch": 0.19329896907216496, "grad_norm": 57.52346420288086, "learning_rate": 4.728614850195397e-05, "loss": 3.5561, "step": 300 }, { "epoch": 0.20940721649484537, "grad_norm": 16.087291717529297, "learning_rate": 4.701476335214937e-05, "loss": 3.4669, "step": 325 }, { "epoch": 0.22551546391752578, "grad_norm": 11.201750755310059, "learning_rate": 4.674337820234477e-05, "loss": 3.4119, "step": 350 }, { "epoch": 0.2416237113402062, "grad_norm": 12.096583366394043, "learning_rate": 4.647199305254017e-05, "loss": 3.3902, "step": 375 }, { "epoch": 0.25773195876288657, "grad_norm": 12.698060989379883, "learning_rate": 4.620060790273557e-05, "loss": 3.4245, "step": 400 }, { "epoch": 0.27384020618556704, "grad_norm": 6.767716884613037, "learning_rate": 4.592922275293096e-05, "loss": 3.3878, "step": 425 }, { "epoch": 0.28994845360824745, "grad_norm": 8.80782413482666, "learning_rate": 4.565783760312636e-05, "loss": 3.36, "step": 450 }, { "epoch": 0.30605670103092786, "grad_norm": 8.567368507385254, "learning_rate": 4.538645245332175e-05, "loss": 3.46, "step": 475 }, { "epoch": 0.32216494845360827, "grad_norm": 7.334051132202148, "learning_rate": 4.5115067303517154e-05, "loss": 3.4295, "step": 500 }, { "epoch": 0.3382731958762887, "grad_norm": 35.50625991821289, "learning_rate": 4.484368215371255e-05, "loss": 3.458, "step": 525 }, { "epoch": 0.3543814432989691, "grad_norm": 8.980048179626465, "learning_rate": 4.457229700390795e-05, "loss": 3.3348, "step": 550 }, { "epoch": 0.3704896907216495, "grad_norm": 11.022858619689941, "learning_rate": 4.4300911854103346e-05, "loss": 3.4155, "step": 575 }, { "epoch": 0.3865979381443299, "grad_norm": 7.455577373504639, "learning_rate": 4.402952670429874e-05, "loss": 3.3107, "step": 600 }, { "epoch": 0.4027061855670103, "grad_norm": 6.974651336669922, "learning_rate": 4.375814155449414e-05, "loss": 3.371, "step": 625 }, { "epoch": 0.41881443298969073, "grad_norm": 6.6951680183410645, "learning_rate": 4.348675640468954e-05, "loss": 3.3197, "step": 650 }, { "epoch": 0.43492268041237114, "grad_norm": 7.696976661682129, "learning_rate": 4.321537125488493e-05, "loss": 3.3954, "step": 675 }, { "epoch": 0.45103092783505155, "grad_norm": 7.18176794052124, "learning_rate": 4.294398610508033e-05, "loss": 3.3443, "step": 700 }, { "epoch": 0.46713917525773196, "grad_norm": 6.54254150390625, "learning_rate": 4.267260095527572e-05, "loss": 3.235, "step": 725 }, { "epoch": 0.4832474226804124, "grad_norm": 6.953215599060059, "learning_rate": 4.2401215805471125e-05, "loss": 3.3103, "step": 750 }, { "epoch": 0.4993556701030928, "grad_norm": 5.925103187561035, "learning_rate": 4.212983065566653e-05, "loss": 3.3243, "step": 775 }, { "epoch": 0.5154639175257731, "grad_norm": 7.676642417907715, "learning_rate": 4.185844550586192e-05, "loss": 3.2962, "step": 800 }, { "epoch": 0.5315721649484536, "grad_norm": 5.8870038986206055, "learning_rate": 4.158706035605732e-05, "loss": 3.2808, "step": 825 }, { "epoch": 0.5476804123711341, "grad_norm": 6.311049938201904, "learning_rate": 4.131567520625272e-05, "loss": 3.2797, "step": 850 }, { "epoch": 0.5637886597938144, "grad_norm": 12.73991584777832, "learning_rate": 4.1044290056448114e-05, "loss": 3.2348, "step": 875 }, { "epoch": 0.5798969072164949, "grad_norm": 8.892550468444824, "learning_rate": 4.077290490664351e-05, "loss": 3.2308, "step": 900 }, { "epoch": 0.5960051546391752, "grad_norm": 5.8089704513549805, "learning_rate": 4.0501519756838904e-05, "loss": 3.2951, "step": 925 }, { "epoch": 0.6121134020618557, "grad_norm": 6.098554611206055, "learning_rate": 4.0230134607034306e-05, "loss": 3.1594, "step": 950 }, { "epoch": 0.6282216494845361, "grad_norm": 26.497699737548828, "learning_rate": 3.995874945722971e-05, "loss": 3.2403, "step": 975 }, { "epoch": 0.6443298969072165, "grad_norm": 7.092996597290039, "learning_rate": 3.96873643074251e-05, "loss": 3.1879, "step": 1000 }, { "epoch": 0.6604381443298969, "grad_norm": 5.452559947967529, "learning_rate": 3.94159791576205e-05, "loss": 3.2312, "step": 1025 }, { "epoch": 0.6765463917525774, "grad_norm": 11.570932388305664, "learning_rate": 3.914459400781589e-05, "loss": 3.1979, "step": 1050 }, { "epoch": 0.6926546391752577, "grad_norm": 10.654516220092773, "learning_rate": 3.887320885801129e-05, "loss": 3.1124, "step": 1075 }, { "epoch": 0.7087628865979382, "grad_norm": 5.750201225280762, "learning_rate": 3.860182370820669e-05, "loss": 3.1902, "step": 1100 }, { "epoch": 0.7248711340206185, "grad_norm": 6.332087993621826, "learning_rate": 3.8330438558402085e-05, "loss": 3.1835, "step": 1125 }, { "epoch": 0.740979381443299, "grad_norm": 6.187074661254883, "learning_rate": 3.805905340859749e-05, "loss": 3.2402, "step": 1150 }, { "epoch": 0.7570876288659794, "grad_norm": 5.326789379119873, "learning_rate": 3.778766825879288e-05, "loss": 3.1311, "step": 1175 }, { "epoch": 0.7731958762886598, "grad_norm": 5.872878551483154, "learning_rate": 3.751628310898828e-05, "loss": 3.168, "step": 1200 }, { "epoch": 0.7893041237113402, "grad_norm": 5.528806209564209, "learning_rate": 3.724489795918368e-05, "loss": 3.2025, "step": 1225 }, { "epoch": 0.8054123711340206, "grad_norm": 4.855608940124512, "learning_rate": 3.6973512809379074e-05, "loss": 3.1711, "step": 1250 }, { "epoch": 0.821520618556701, "grad_norm": 5.053402423858643, "learning_rate": 3.670212765957447e-05, "loss": 3.0436, "step": 1275 }, { "epoch": 0.8376288659793815, "grad_norm": 6.834145545959473, "learning_rate": 3.6430742509769864e-05, "loss": 3.0668, "step": 1300 }, { "epoch": 0.8537371134020618, "grad_norm": 5.844705104827881, "learning_rate": 3.615935735996526e-05, "loss": 3.129, "step": 1325 }, { "epoch": 0.8698453608247423, "grad_norm": 5.622738361358643, "learning_rate": 3.588797221016066e-05, "loss": 3.123, "step": 1350 }, { "epoch": 0.8859536082474226, "grad_norm": 5.435595512390137, "learning_rate": 3.561658706035606e-05, "loss": 3.1695, "step": 1375 }, { "epoch": 0.9020618556701031, "grad_norm": 5.923786640167236, "learning_rate": 3.534520191055146e-05, "loss": 3.1486, "step": 1400 }, { "epoch": 0.9181701030927835, "grad_norm": 5.717883586883545, "learning_rate": 3.507381676074685e-05, "loss": 3.119, "step": 1425 }, { "epoch": 0.9342783505154639, "grad_norm": 5.194445610046387, "learning_rate": 3.480243161094225e-05, "loss": 3.0391, "step": 1450 }, { "epoch": 0.9503865979381443, "grad_norm": 4.672726154327393, "learning_rate": 3.453104646113765e-05, "loss": 3.0887, "step": 1475 }, { "epoch": 0.9664948453608248, "grad_norm": 5.593866348266602, "learning_rate": 3.4259661311333045e-05, "loss": 3.1433, "step": 1500 }, { "epoch": 0.9826030927835051, "grad_norm": 6.05122709274292, "learning_rate": 3.398827616152844e-05, "loss": 3.094, "step": 1525 }, { "epoch": 0.9987113402061856, "grad_norm": 5.456536769866943, "learning_rate": 3.371689101172384e-05, "loss": 3.0894, "step": 1550 }, { "epoch": 1.014819587628866, "grad_norm": 6.8430867195129395, "learning_rate": 3.344550586191924e-05, "loss": 2.5031, "step": 1575 }, { "epoch": 1.0309278350515463, "grad_norm": 6.864569664001465, "learning_rate": 3.317412071211464e-05, "loss": 2.3879, "step": 1600 }, { "epoch": 1.0470360824742269, "grad_norm": 7.638180732727051, "learning_rate": 3.2902735562310034e-05, "loss": 2.3948, "step": 1625 }, { "epoch": 1.0631443298969072, "grad_norm": 5.917698860168457, "learning_rate": 3.263135041250543e-05, "loss": 2.3638, "step": 1650 }, { "epoch": 1.0792525773195876, "grad_norm": 6.708238124847412, "learning_rate": 3.2359965262700824e-05, "loss": 2.3536, "step": 1675 }, { "epoch": 1.0953608247422681, "grad_norm": 9.36337947845459, "learning_rate": 3.208858011289622e-05, "loss": 2.4048, "step": 1700 }, { "epoch": 1.1114690721649485, "grad_norm": 7.072855472564697, "learning_rate": 3.181719496309162e-05, "loss": 2.3709, "step": 1725 }, { "epoch": 1.1275773195876289, "grad_norm": 6.986050128936768, "learning_rate": 3.154580981328702e-05, "loss": 2.46, "step": 1750 }, { "epoch": 1.1436855670103092, "grad_norm": 6.583354949951172, "learning_rate": 3.127442466348242e-05, "loss": 2.3884, "step": 1775 }, { "epoch": 1.1597938144329896, "grad_norm": 6.607515811920166, "learning_rate": 3.100303951367781e-05, "loss": 2.3733, "step": 1800 }, { "epoch": 1.1759020618556701, "grad_norm": 7.239434719085693, "learning_rate": 3.073165436387321e-05, "loss": 2.4139, "step": 1825 }, { "epoch": 1.1920103092783505, "grad_norm": 7.7802042961120605, "learning_rate": 3.046026921406861e-05, "loss": 2.3074, "step": 1850 }, { "epoch": 1.2081185567010309, "grad_norm": 5.834593772888184, "learning_rate": 3.0188884064264005e-05, "loss": 2.3383, "step": 1875 }, { "epoch": 1.2242268041237114, "grad_norm": 6.189608097076416, "learning_rate": 2.9917498914459403e-05, "loss": 2.2833, "step": 1900 }, { "epoch": 1.2403350515463918, "grad_norm": 6.848288536071777, "learning_rate": 2.9646113764654798e-05, "loss": 2.3957, "step": 1925 }, { "epoch": 1.2564432989690721, "grad_norm": 6.78605842590332, "learning_rate": 2.9374728614850193e-05, "loss": 2.4055, "step": 1950 }, { "epoch": 1.2725515463917525, "grad_norm": 7.676894664764404, "learning_rate": 2.9103343465045595e-05, "loss": 2.3365, "step": 1975 }, { "epoch": 1.2886597938144329, "grad_norm": 6.011926651000977, "learning_rate": 2.8831958315240993e-05, "loss": 2.3317, "step": 2000 }, { "epoch": 1.3047680412371134, "grad_norm": 6.217193126678467, "learning_rate": 2.856057316543639e-05, "loss": 2.3736, "step": 2025 }, { "epoch": 1.3208762886597938, "grad_norm": 7.027468681335449, "learning_rate": 2.8289188015631784e-05, "loss": 2.3509, "step": 2050 }, { "epoch": 1.3369845360824741, "grad_norm": 7.210168838500977, "learning_rate": 2.8017802865827182e-05, "loss": 2.449, "step": 2075 }, { "epoch": 1.3530927835051547, "grad_norm": 7.149182319641113, "learning_rate": 2.7746417716022584e-05, "loss": 2.3631, "step": 2100 }, { "epoch": 1.369201030927835, "grad_norm": 6.41991662979126, "learning_rate": 2.747503256621798e-05, "loss": 2.4368, "step": 2125 }, { "epoch": 1.3853092783505154, "grad_norm": 6.897440433502197, "learning_rate": 2.7203647416413374e-05, "loss": 2.3772, "step": 2150 }, { "epoch": 1.401417525773196, "grad_norm": 6.562511444091797, "learning_rate": 2.6932262266608772e-05, "loss": 2.346, "step": 2175 }, { "epoch": 1.4175257731958764, "grad_norm": 6.86238431930542, "learning_rate": 2.6660877116804168e-05, "loss": 2.3861, "step": 2200 }, { "epoch": 1.4336340206185567, "grad_norm": 7.627070426940918, "learning_rate": 2.638949196699957e-05, "loss": 2.3415, "step": 2225 }, { "epoch": 1.449742268041237, "grad_norm": 6.463057518005371, "learning_rate": 2.6118106817194964e-05, "loss": 2.3558, "step": 2250 }, { "epoch": 1.4658505154639174, "grad_norm": 6.722979545593262, "learning_rate": 2.584672166739036e-05, "loss": 2.3812, "step": 2275 }, { "epoch": 1.481958762886598, "grad_norm": 7.5143585205078125, "learning_rate": 2.5575336517585758e-05, "loss": 2.3577, "step": 2300 }, { "epoch": 1.4980670103092784, "grad_norm": 6.2719197273254395, "learning_rate": 2.5303951367781153e-05, "loss": 2.3249, "step": 2325 }, { "epoch": 1.5141752577319587, "grad_norm": 6.567588806152344, "learning_rate": 2.5032566217976555e-05, "loss": 2.3663, "step": 2350 }, { "epoch": 1.5302835051546393, "grad_norm": 6.04072380065918, "learning_rate": 2.476118106817195e-05, "loss": 2.3098, "step": 2375 }, { "epoch": 1.5463917525773194, "grad_norm": 6.608715057373047, "learning_rate": 2.448979591836735e-05, "loss": 2.3335, "step": 2400 }, { "epoch": 1.5625, "grad_norm": 6.724149227142334, "learning_rate": 2.4218410768562747e-05, "loss": 2.3538, "step": 2425 }, { "epoch": 1.5786082474226806, "grad_norm": 7.360804080963135, "learning_rate": 2.3947025618758142e-05, "loss": 2.3797, "step": 2450 }, { "epoch": 1.5947164948453607, "grad_norm": 7.265044689178467, "learning_rate": 2.367564046895354e-05, "loss": 2.3377, "step": 2475 }, { "epoch": 1.6108247422680413, "grad_norm": 7.212481498718262, "learning_rate": 2.340425531914894e-05, "loss": 2.3111, "step": 2500 }, { "epoch": 1.6269329896907216, "grad_norm": 6.6800456047058105, "learning_rate": 2.3132870169344334e-05, "loss": 2.3555, "step": 2525 }, { "epoch": 1.643041237113402, "grad_norm": 6.473804950714111, "learning_rate": 2.2861485019539732e-05, "loss": 2.2877, "step": 2550 }, { "epoch": 1.6591494845360826, "grad_norm": 13.455022811889648, "learning_rate": 2.2590099869735127e-05, "loss": 2.2963, "step": 2575 }, { "epoch": 1.675257731958763, "grad_norm": 6.606278419494629, "learning_rate": 2.2318714719930526e-05, "loss": 2.3671, "step": 2600 }, { "epoch": 1.6913659793814433, "grad_norm": 6.745218276977539, "learning_rate": 2.2047329570125924e-05, "loss": 2.3202, "step": 2625 }, { "epoch": 1.7074742268041239, "grad_norm": 7.282406330108643, "learning_rate": 2.177594442032132e-05, "loss": 2.3242, "step": 2650 }, { "epoch": 1.723582474226804, "grad_norm": 7.313311576843262, "learning_rate": 2.1504559270516718e-05, "loss": 2.3228, "step": 2675 }, { "epoch": 1.7396907216494846, "grad_norm": 7.339620590209961, "learning_rate": 2.1233174120712116e-05, "loss": 2.3336, "step": 2700 }, { "epoch": 1.755798969072165, "grad_norm": 6.999018669128418, "learning_rate": 2.096178897090751e-05, "loss": 2.2578, "step": 2725 }, { "epoch": 1.7719072164948453, "grad_norm": 6.459262371063232, "learning_rate": 2.069040382110291e-05, "loss": 2.2741, "step": 2750 }, { "epoch": 1.7880154639175259, "grad_norm": 7.308042049407959, "learning_rate": 2.0419018671298308e-05, "loss": 2.2925, "step": 2775 }, { "epoch": 1.8041237113402062, "grad_norm": 6.555530071258545, "learning_rate": 2.0147633521493707e-05, "loss": 2.3082, "step": 2800 }, { "epoch": 1.8202319587628866, "grad_norm": 6.764036655426025, "learning_rate": 1.9876248371689102e-05, "loss": 2.2095, "step": 2825 }, { "epoch": 1.8363402061855671, "grad_norm": 7.759133815765381, "learning_rate": 1.96048632218845e-05, "loss": 2.3166, "step": 2850 }, { "epoch": 1.8524484536082473, "grad_norm": 6.442126274108887, "learning_rate": 1.9333478072079895e-05, "loss": 2.3075, "step": 2875 }, { "epoch": 1.8685567010309279, "grad_norm": 6.804947376251221, "learning_rate": 1.9062092922275294e-05, "loss": 2.2889, "step": 2900 }, { "epoch": 1.8846649484536082, "grad_norm": 6.473705291748047, "learning_rate": 1.8790707772470692e-05, "loss": 2.2423, "step": 2925 }, { "epoch": 1.9007731958762886, "grad_norm": 6.420748710632324, "learning_rate": 1.8519322622666087e-05, "loss": 2.2308, "step": 2950 }, { "epoch": 1.9168814432989691, "grad_norm": 7.469099044799805, "learning_rate": 1.8247937472861486e-05, "loss": 2.3467, "step": 2975 }, { "epoch": 1.9329896907216495, "grad_norm": 7.019501686096191, "learning_rate": 1.7976552323056884e-05, "loss": 2.3117, "step": 3000 }, { "epoch": 1.9490979381443299, "grad_norm": 7.53558874130249, "learning_rate": 1.770516717325228e-05, "loss": 2.2763, "step": 3025 }, { "epoch": 1.9652061855670104, "grad_norm": 6.622589588165283, "learning_rate": 1.7433782023447678e-05, "loss": 2.2725, "step": 3050 }, { "epoch": 1.9813144329896906, "grad_norm": 6.681495189666748, "learning_rate": 1.7162396873643076e-05, "loss": 2.1871, "step": 3075 }, { "epoch": 1.9974226804123711, "grad_norm": 5.900623321533203, "learning_rate": 1.6891011723838475e-05, "loss": 2.2892, "step": 3100 }, { "epoch": 2.0135309278350517, "grad_norm": 9.32268238067627, "learning_rate": 1.661962657403387e-05, "loss": 1.3008, "step": 3125 }, { "epoch": 2.029639175257732, "grad_norm": 7.467723369598389, "learning_rate": 1.6348241424229265e-05, "loss": 1.0731, "step": 3150 }, { "epoch": 2.0457474226804124, "grad_norm": 8.434012413024902, "learning_rate": 1.6076856274424663e-05, "loss": 1.0061, "step": 3175 }, { "epoch": 2.0618556701030926, "grad_norm": 9.433366775512695, "learning_rate": 1.580547112462006e-05, "loss": 1.0132, "step": 3200 }, { "epoch": 2.077963917525773, "grad_norm": 7.6198039054870605, "learning_rate": 1.553408597481546e-05, "loss": 0.9944, "step": 3225 }, { "epoch": 2.0940721649484537, "grad_norm": 8.139434814453125, "learning_rate": 1.5262700825010855e-05, "loss": 0.9757, "step": 3250 }, { "epoch": 2.110180412371134, "grad_norm": 8.175223350524902, "learning_rate": 1.4991315675206252e-05, "loss": 0.978, "step": 3275 }, { "epoch": 2.1262886597938144, "grad_norm": 8.026739120483398, "learning_rate": 1.4719930525401652e-05, "loss": 0.9758, "step": 3300 }, { "epoch": 2.142396907216495, "grad_norm": 8.502424240112305, "learning_rate": 1.4448545375597047e-05, "loss": 0.9047, "step": 3325 }, { "epoch": 2.158505154639175, "grad_norm": 9.062753677368164, "learning_rate": 1.4177160225792445e-05, "loss": 0.9462, "step": 3350 }, { "epoch": 2.1746134020618557, "grad_norm": 9.223316192626953, "learning_rate": 1.3905775075987842e-05, "loss": 0.9564, "step": 3375 }, { "epoch": 2.1907216494845363, "grad_norm": 8.59533977508545, "learning_rate": 1.3634389926183239e-05, "loss": 0.9476, "step": 3400 }, { "epoch": 2.2068298969072164, "grad_norm": 8.367724418640137, "learning_rate": 1.3363004776378637e-05, "loss": 0.9684, "step": 3425 }, { "epoch": 2.222938144329897, "grad_norm": 9.15878963470459, "learning_rate": 1.3091619626574034e-05, "loss": 0.9847, "step": 3450 }, { "epoch": 2.239046391752577, "grad_norm": 10.106039047241211, "learning_rate": 1.2820234476769433e-05, "loss": 0.9148, "step": 3475 }, { "epoch": 2.2551546391752577, "grad_norm": 8.91595458984375, "learning_rate": 1.254884932696483e-05, "loss": 0.9293, "step": 3500 }, { "epoch": 2.2712628865979383, "grad_norm": 9.854774475097656, "learning_rate": 1.2277464177160226e-05, "loss": 0.9469, "step": 3525 }, { "epoch": 2.2873711340206184, "grad_norm": 8.479780197143555, "learning_rate": 1.2006079027355625e-05, "loss": 0.925, "step": 3550 }, { "epoch": 2.303479381443299, "grad_norm": 8.944768905639648, "learning_rate": 1.1734693877551021e-05, "loss": 0.9593, "step": 3575 }, { "epoch": 2.319587628865979, "grad_norm": 8.820865631103516, "learning_rate": 1.1463308727746418e-05, "loss": 0.9583, "step": 3600 }, { "epoch": 2.3356958762886597, "grad_norm": 9.563779830932617, "learning_rate": 1.1191923577941815e-05, "loss": 0.9298, "step": 3625 }, { "epoch": 2.3518041237113403, "grad_norm": 8.982272148132324, "learning_rate": 1.0920538428137213e-05, "loss": 0.9376, "step": 3650 }, { "epoch": 2.367912371134021, "grad_norm": 9.715324401855469, "learning_rate": 1.064915327833261e-05, "loss": 0.9428, "step": 3675 }, { "epoch": 2.384020618556701, "grad_norm": 8.481626510620117, "learning_rate": 1.0377768128528009e-05, "loss": 0.9254, "step": 3700 }, { "epoch": 2.4001288659793816, "grad_norm": 9.785958290100098, "learning_rate": 1.0106382978723404e-05, "loss": 0.9266, "step": 3725 }, { "epoch": 2.4162371134020617, "grad_norm": 10.050392150878906, "learning_rate": 9.834997828918802e-06, "loss": 0.9335, "step": 3750 }, { "epoch": 2.4323453608247423, "grad_norm": 8.707124710083008, "learning_rate": 9.563612679114199e-06, "loss": 0.872, "step": 3775 }, { "epoch": 2.448453608247423, "grad_norm": 9.095705032348633, "learning_rate": 9.292227529309597e-06, "loss": 0.8928, "step": 3800 }, { "epoch": 2.464561855670103, "grad_norm": 9.691436767578125, "learning_rate": 9.020842379504994e-06, "loss": 0.8993, "step": 3825 }, { "epoch": 2.4806701030927836, "grad_norm": 17.811647415161133, "learning_rate": 8.749457229700392e-06, "loss": 0.8943, "step": 3850 }, { "epoch": 2.4967783505154637, "grad_norm": 9.972207069396973, "learning_rate": 8.478072079895788e-06, "loss": 0.9248, "step": 3875 }, { "epoch": 2.5128865979381443, "grad_norm": 9.202563285827637, "learning_rate": 8.206686930091186e-06, "loss": 0.9077, "step": 3900 }, { "epoch": 2.528994845360825, "grad_norm": 9.509817123413086, "learning_rate": 7.935301780286583e-06, "loss": 0.9037, "step": 3925 }, { "epoch": 2.545103092783505, "grad_norm": 8.833476066589355, "learning_rate": 7.663916630481981e-06, "loss": 0.8766, "step": 3950 }, { "epoch": 2.5612113402061856, "grad_norm": 10.363802909851074, "learning_rate": 7.392531480677378e-06, "loss": 0.895, "step": 3975 }, { "epoch": 2.5773195876288657, "grad_norm": 9.111068725585938, "learning_rate": 7.121146330872775e-06, "loss": 0.9224, "step": 4000 }, { "epoch": 2.5934278350515463, "grad_norm": 10.667325019836426, "learning_rate": 6.849761181068172e-06, "loss": 0.8776, "step": 4025 }, { "epoch": 2.609536082474227, "grad_norm": 11.279472351074219, "learning_rate": 6.578376031263569e-06, "loss": 0.8723, "step": 4050 }, { "epoch": 2.6256443298969074, "grad_norm": 15.722869873046875, "learning_rate": 6.306990881458967e-06, "loss": 0.8858, "step": 4075 }, { "epoch": 2.6417525773195876, "grad_norm": 10.252237319946289, "learning_rate": 6.035605731654364e-06, "loss": 0.8639, "step": 4100 }, { "epoch": 2.657860824742268, "grad_norm": 9.055089950561523, "learning_rate": 5.764220581849761e-06, "loss": 0.8794, "step": 4125 }, { "epoch": 2.6739690721649483, "grad_norm": 9.109421730041504, "learning_rate": 5.492835432045159e-06, "loss": 0.8667, "step": 4150 }, { "epoch": 2.690077319587629, "grad_norm": 9.12623119354248, "learning_rate": 5.221450282240556e-06, "loss": 0.8626, "step": 4175 }, { "epoch": 2.7061855670103094, "grad_norm": 9.60417366027832, "learning_rate": 4.950065132435953e-06, "loss": 0.9106, "step": 4200 }, { "epoch": 2.7222938144329896, "grad_norm": 9.32435417175293, "learning_rate": 4.678679982631351e-06, "loss": 0.8714, "step": 4225 }, { "epoch": 2.73840206185567, "grad_norm": 9.819196701049805, "learning_rate": 4.407294832826748e-06, "loss": 0.8621, "step": 4250 }, { "epoch": 2.7545103092783503, "grad_norm": 8.934945106506348, "learning_rate": 4.135909683022145e-06, "loss": 0.8644, "step": 4275 }, { "epoch": 2.770618556701031, "grad_norm": 10.425902366638184, "learning_rate": 3.864524533217543e-06, "loss": 0.8937, "step": 4300 }, { "epoch": 2.7867268041237114, "grad_norm": 9.629773139953613, "learning_rate": 3.5931393834129398e-06, "loss": 0.8774, "step": 4325 }, { "epoch": 2.802835051546392, "grad_norm": 9.796236038208008, "learning_rate": 3.3217542336083374e-06, "loss": 0.8589, "step": 4350 }, { "epoch": 2.818943298969072, "grad_norm": 9.853483200073242, "learning_rate": 3.050369083803734e-06, "loss": 0.8394, "step": 4375 }, { "epoch": 2.8350515463917527, "grad_norm": 9.696287155151367, "learning_rate": 2.7789839339991317e-06, "loss": 0.8523, "step": 4400 }, { "epoch": 2.851159793814433, "grad_norm": 9.468950271606445, "learning_rate": 2.507598784194529e-06, "loss": 0.8444, "step": 4425 }, { "epoch": 2.8672680412371134, "grad_norm": 9.996761322021484, "learning_rate": 2.236213634389926e-06, "loss": 0.8517, "step": 4450 }, { "epoch": 2.883376288659794, "grad_norm": 12.354564666748047, "learning_rate": 1.9648284845853233e-06, "loss": 0.8523, "step": 4475 }, { "epoch": 2.899484536082474, "grad_norm": 11.12836742401123, "learning_rate": 1.6934433347807209e-06, "loss": 0.8458, "step": 4500 }, { "epoch": 2.9155927835051547, "grad_norm": 9.318047523498535, "learning_rate": 1.4220581849761183e-06, "loss": 0.8548, "step": 4525 }, { "epoch": 2.931701030927835, "grad_norm": 9.987869262695312, "learning_rate": 1.1506730351715155e-06, "loss": 0.8567, "step": 4550 }, { "epoch": 2.9478092783505154, "grad_norm": 10.364538192749023, "learning_rate": 8.792878853669127e-07, "loss": 0.8547, "step": 4575 }, { "epoch": 2.963917525773196, "grad_norm": 10.010146141052246, "learning_rate": 6.0790273556231e-07, "loss": 0.8464, "step": 4600 }, { "epoch": 2.980025773195876, "grad_norm": 8.987505912780762, "learning_rate": 3.3651758575770737e-07, "loss": 0.8361, "step": 4625 }, { "epoch": 2.9961340206185567, "grad_norm": 9.563461303710938, "learning_rate": 6.513243595310464e-08, "loss": 0.8486, "step": 4650 } ], "logging_steps": 25, "max_steps": 4656, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.3657646372356096e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }