{ "best_metric": null, "best_model_checkpoint": null, "epoch": 31.91571553994732, "eval_steps": 2000, "global_step": 4544, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07023705004389816, "grad_norm": 6.03125, "learning_rate": 2.197802197802198e-07, "loss": 2.4857, "step": 10 }, { "epoch": 0.14047410008779632, "grad_norm": 5.03125, "learning_rate": 4.395604395604396e-07, "loss": 2.5113, "step": 20 }, { "epoch": 0.21071115013169447, "grad_norm": 4.71875, "learning_rate": 6.593406593406594e-07, "loss": 2.4922, "step": 30 }, { "epoch": 0.28094820017559263, "grad_norm": 5.09375, "learning_rate": 8.791208791208792e-07, "loss": 2.4394, "step": 40 }, { "epoch": 0.35118525021949076, "grad_norm": 16.75, "learning_rate": 1.098901098901099e-06, "loss": 2.4773, "step": 50 }, { "epoch": 0.42142230026338895, "grad_norm": 14.5625, "learning_rate": 1.3186813186813187e-06, "loss": 2.3524, "step": 60 }, { "epoch": 0.4916593503072871, "grad_norm": 15.125, "learning_rate": 1.5384615384615387e-06, "loss": 2.42, "step": 70 }, { "epoch": 0.5618964003511853, "grad_norm": 14.0625, "learning_rate": 1.7582417582417585e-06, "loss": 2.3288, "step": 80 }, { "epoch": 0.6321334503950834, "grad_norm": 12.6875, "learning_rate": 1.9780219780219782e-06, "loss": 2.2519, "step": 90 }, { "epoch": 0.7023705004389815, "grad_norm": 2.53125, "learning_rate": 2.197802197802198e-06, "loss": 2.2021, "step": 100 }, { "epoch": 0.7726075504828798, "grad_norm": 2.296875, "learning_rate": 2.4175824175824177e-06, "loss": 2.2151, "step": 110 }, { "epoch": 0.8428446005267779, "grad_norm": 2.015625, "learning_rate": 2.6373626373626375e-06, "loss": 2.1102, "step": 120 }, { "epoch": 0.913081650570676, "grad_norm": 2.109375, "learning_rate": 2.8571428571428573e-06, "loss": 2.1034, "step": 130 }, { "epoch": 0.9833187006145742, "grad_norm": 2.078125, "learning_rate": 3.0769230769230774e-06, "loss": 2.0737, "step": 140 }, { "epoch": 1.0535557506584723, "grad_norm": 1.7578125, "learning_rate": 3.2967032967032968e-06, "loss": 2.0345, "step": 150 }, { "epoch": 1.1237928007023705, "grad_norm": 2.109375, "learning_rate": 3.516483516483517e-06, "loss": 1.9568, "step": 160 }, { "epoch": 1.1940298507462686, "grad_norm": 2.1875, "learning_rate": 3.7362637362637367e-06, "loss": 1.9494, "step": 170 }, { "epoch": 1.2642669007901668, "grad_norm": 1.875, "learning_rate": 3.9560439560439565e-06, "loss": 1.8932, "step": 180 }, { "epoch": 1.334503950834065, "grad_norm": 1.6875, "learning_rate": 4.175824175824177e-06, "loss": 1.9219, "step": 190 }, { "epoch": 1.404741000877963, "grad_norm": 2.1875, "learning_rate": 4.395604395604396e-06, "loss": 1.9751, "step": 200 }, { "epoch": 1.4749780509218613, "grad_norm": 1.7265625, "learning_rate": 4.615384615384616e-06, "loss": 1.8729, "step": 210 }, { "epoch": 1.5452151009657595, "grad_norm": 1.765625, "learning_rate": 4.8351648351648355e-06, "loss": 1.885, "step": 220 }, { "epoch": 1.6154521510096576, "grad_norm": 1.546875, "learning_rate": 5.054945054945055e-06, "loss": 1.8549, "step": 230 }, { "epoch": 1.6856892010535558, "grad_norm": 2.046875, "learning_rate": 5.274725274725275e-06, "loss": 1.8736, "step": 240 }, { "epoch": 1.755926251097454, "grad_norm": 1.6875, "learning_rate": 5.494505494505495e-06, "loss": 1.8388, "step": 250 }, { "epoch": 1.826163301141352, "grad_norm": 2.09375, "learning_rate": 5.7142857142857145e-06, "loss": 1.8676, "step": 260 }, { "epoch": 1.89640035118525, "grad_norm": 2.0, "learning_rate": 5.934065934065935e-06, "loss": 1.8181, "step": 270 }, { "epoch": 1.9666374012291485, "grad_norm": 1.890625, "learning_rate": 6.153846153846155e-06, "loss": 1.8007, "step": 280 }, { "epoch": 2.0368744512730466, "grad_norm": 1.8046875, "learning_rate": 6.373626373626373e-06, "loss": 1.7727, "step": 290 }, { "epoch": 2.1071115013169446, "grad_norm": 1.8203125, "learning_rate": 6.5934065934065935e-06, "loss": 1.7038, "step": 300 }, { "epoch": 2.177348551360843, "grad_norm": 1.7578125, "learning_rate": 6.813186813186814e-06, "loss": 1.7398, "step": 310 }, { "epoch": 2.247585601404741, "grad_norm": 2.15625, "learning_rate": 7.032967032967034e-06, "loss": 1.7468, "step": 320 }, { "epoch": 2.317822651448639, "grad_norm": 1.9296875, "learning_rate": 7.252747252747253e-06, "loss": 1.7326, "step": 330 }, { "epoch": 2.388059701492537, "grad_norm": 1.8671875, "learning_rate": 7.472527472527473e-06, "loss": 1.7779, "step": 340 }, { "epoch": 2.4582967515364356, "grad_norm": 1.96875, "learning_rate": 7.692307692307694e-06, "loss": 1.6381, "step": 350 }, { "epoch": 2.5285338015803336, "grad_norm": 1.71875, "learning_rate": 7.912087912087913e-06, "loss": 1.6605, "step": 360 }, { "epoch": 2.5987708516242316, "grad_norm": 1.703125, "learning_rate": 8.131868131868132e-06, "loss": 1.6683, "step": 370 }, { "epoch": 2.66900790166813, "grad_norm": 1.6640625, "learning_rate": 8.351648351648353e-06, "loss": 1.6819, "step": 380 }, { "epoch": 2.739244951712028, "grad_norm": 1.984375, "learning_rate": 8.571428571428571e-06, "loss": 1.7072, "step": 390 }, { "epoch": 2.809482001755926, "grad_norm": 1.8671875, "learning_rate": 8.791208791208792e-06, "loss": 1.6747, "step": 400 }, { "epoch": 2.8797190517998246, "grad_norm": 1.8671875, "learning_rate": 9.010989010989011e-06, "loss": 1.6596, "step": 410 }, { "epoch": 2.9499561018437226, "grad_norm": 1.7265625, "learning_rate": 9.230769230769232e-06, "loss": 1.65, "step": 420 }, { "epoch": 3.0201931518876206, "grad_norm": 1.53125, "learning_rate": 9.450549450549452e-06, "loss": 1.6037, "step": 430 }, { "epoch": 3.090430201931519, "grad_norm": 1.78125, "learning_rate": 9.670329670329671e-06, "loss": 1.6015, "step": 440 }, { "epoch": 3.160667251975417, "grad_norm": 1.859375, "learning_rate": 9.890109890109892e-06, "loss": 1.5708, "step": 450 }, { "epoch": 3.230904302019315, "grad_norm": 1.671875, "learning_rate": 9.99996310691202e-06, "loss": 1.5778, "step": 460 }, { "epoch": 3.3011413520632136, "grad_norm": 1.609375, "learning_rate": 9.999667965474806e-06, "loss": 1.5528, "step": 470 }, { "epoch": 3.3713784021071116, "grad_norm": 1.984375, "learning_rate": 9.9990777000222e-06, "loss": 1.5539, "step": 480 }, { "epoch": 3.4416154521510096, "grad_norm": 2.5, "learning_rate": 9.998192345396817e-06, "loss": 1.5289, "step": 490 }, { "epoch": 3.511852502194908, "grad_norm": 1.484375, "learning_rate": 9.997011953860014e-06, "loss": 1.5335, "step": 500 }, { "epoch": 3.582089552238806, "grad_norm": 1.4765625, "learning_rate": 9.99553659508879e-06, "loss": 1.5565, "step": 510 }, { "epoch": 3.652326602282704, "grad_norm": 1.46875, "learning_rate": 9.993766356171694e-06, "loss": 1.5414, "step": 520 }, { "epoch": 3.722563652326602, "grad_norm": 1.40625, "learning_rate": 9.991701341603667e-06, "loss": 1.5271, "step": 530 }, { "epoch": 3.7928007023705006, "grad_norm": 1.4921875, "learning_rate": 9.989341673279881e-06, "loss": 1.4782, "step": 540 }, { "epoch": 3.8630377524143986, "grad_norm": 1.6015625, "learning_rate": 9.986687490488545e-06, "loss": 1.5165, "step": 550 }, { "epoch": 3.9332748024582966, "grad_norm": 1.6953125, "learning_rate": 9.98373894990268e-06, "loss": 1.507, "step": 560 }, { "epoch": 4.003511852502195, "grad_norm": 1.65625, "learning_rate": 9.980496225570869e-06, "loss": 1.5207, "step": 570 }, { "epoch": 4.073748902546093, "grad_norm": 1.59375, "learning_rate": 9.976959508906992e-06, "loss": 1.4324, "step": 580 }, { "epoch": 4.143985952589992, "grad_norm": 1.5859375, "learning_rate": 9.973129008678915e-06, "loss": 1.4171, "step": 590 }, { "epoch": 4.214223002633889, "grad_norm": 1.4375, "learning_rate": 9.969004950996175e-06, "loss": 1.4378, "step": 600 }, { "epoch": 4.284460052677788, "grad_norm": 1.3828125, "learning_rate": 9.964587579296631e-06, "loss": 1.3886, "step": 610 }, { "epoch": 4.354697102721686, "grad_norm": 1.6796875, "learning_rate": 9.959877154332095e-06, "loss": 1.4154, "step": 620 }, { "epoch": 4.424934152765584, "grad_norm": 1.625, "learning_rate": 9.954873954152933e-06, "loss": 1.4231, "step": 630 }, { "epoch": 4.495171202809482, "grad_norm": 1.4609375, "learning_rate": 9.949578274091666e-06, "loss": 1.3745, "step": 640 }, { "epoch": 4.565408252853381, "grad_norm": 1.6796875, "learning_rate": 9.943990426745525e-06, "loss": 1.3663, "step": 650 }, { "epoch": 4.635645302897278, "grad_norm": 1.5, "learning_rate": 9.938110741958003e-06, "loss": 1.4177, "step": 660 }, { "epoch": 4.705882352941177, "grad_norm": 1.6171875, "learning_rate": 9.931939566799385e-06, "loss": 1.412, "step": 670 }, { "epoch": 4.776119402985074, "grad_norm": 1.6953125, "learning_rate": 9.925477265546258e-06, "loss": 1.4155, "step": 680 }, { "epoch": 4.846356453028973, "grad_norm": 1.5859375, "learning_rate": 9.918724219660013e-06, "loss": 1.4241, "step": 690 }, { "epoch": 4.916593503072871, "grad_norm": 1.5234375, "learning_rate": 9.911680827764329e-06, "loss": 1.3562, "step": 700 }, { "epoch": 4.98683055311677, "grad_norm": 1.4609375, "learning_rate": 9.90434750562163e-06, "loss": 1.3542, "step": 710 }, { "epoch": 5.057067603160667, "grad_norm": 1.484375, "learning_rate": 9.896724686108561e-06, "loss": 1.305, "step": 720 }, { "epoch": 5.127304653204566, "grad_norm": 1.4609375, "learning_rate": 9.888812819190419e-06, "loss": 1.314, "step": 730 }, { "epoch": 5.197541703248463, "grad_norm": 1.5390625, "learning_rate": 9.88061237189461e-06, "loss": 1.2962, "step": 740 }, { "epoch": 5.267778753292362, "grad_norm": 1.4375, "learning_rate": 9.872123828283063e-06, "loss": 1.2993, "step": 750 }, { "epoch": 5.33801580333626, "grad_norm": 1.4140625, "learning_rate": 9.863347689423666e-06, "loss": 1.2649, "step": 760 }, { "epoch": 5.408252853380158, "grad_norm": 1.6796875, "learning_rate": 9.854284473360694e-06, "loss": 1.2943, "step": 770 }, { "epoch": 5.478489903424056, "grad_norm": 1.3203125, "learning_rate": 9.84493471508421e-06, "loss": 1.2512, "step": 780 }, { "epoch": 5.548726953467955, "grad_norm": 1.375, "learning_rate": 9.835298966498511e-06, "loss": 1.3046, "step": 790 }, { "epoch": 5.618964003511852, "grad_norm": 1.390625, "learning_rate": 9.82537779638953e-06, "loss": 1.2338, "step": 800 }, { "epoch": 5.689201053555751, "grad_norm": 1.3359375, "learning_rate": 9.815171790391269e-06, "loss": 1.2921, "step": 810 }, { "epoch": 5.759438103599649, "grad_norm": 1.3046875, "learning_rate": 9.804681550951228e-06, "loss": 1.2306, "step": 820 }, { "epoch": 5.829675153643547, "grad_norm": 1.3515625, "learning_rate": 9.793907697294844e-06, "loss": 1.2304, "step": 830 }, { "epoch": 5.899912203687445, "grad_norm": 1.1796875, "learning_rate": 9.782850865388941e-06, "loss": 1.2693, "step": 840 }, { "epoch": 5.970149253731344, "grad_norm": 1.2578125, "learning_rate": 9.77151170790419e-06, "loss": 1.2642, "step": 850 }, { "epoch": 6.040386303775241, "grad_norm": 1.3203125, "learning_rate": 9.759890894176574e-06, "loss": 1.2702, "step": 860 }, { "epoch": 6.11062335381914, "grad_norm": 1.328125, "learning_rate": 9.747989110167887e-06, "loss": 1.2085, "step": 870 }, { "epoch": 6.180860403863038, "grad_norm": 1.265625, "learning_rate": 9.735807058425241e-06, "loss": 1.1754, "step": 880 }, { "epoch": 6.251097453906936, "grad_norm": 1.1484375, "learning_rate": 9.723345458039595e-06, "loss": 1.1378, "step": 890 }, { "epoch": 6.321334503950834, "grad_norm": 1.1640625, "learning_rate": 9.710605044603305e-06, "loss": 1.1588, "step": 900 }, { "epoch": 6.391571553994732, "grad_norm": 1.21875, "learning_rate": 9.697586570166707e-06, "loss": 1.1934, "step": 910 }, { "epoch": 6.46180860403863, "grad_norm": 1.0859375, "learning_rate": 9.684290803193721e-06, "loss": 1.1504, "step": 920 }, { "epoch": 6.532045654082529, "grad_norm": 1.0859375, "learning_rate": 9.670718528516495e-06, "loss": 1.1152, "step": 930 }, { "epoch": 6.602282704126427, "grad_norm": 1.171875, "learning_rate": 9.65687054728907e-06, "loss": 1.1628, "step": 940 }, { "epoch": 6.672519754170325, "grad_norm": 1.109375, "learning_rate": 9.642747676940094e-06, "loss": 1.1299, "step": 950 }, { "epoch": 6.742756804214223, "grad_norm": 1.125, "learning_rate": 9.62835075112457e-06, "loss": 1.1635, "step": 960 }, { "epoch": 6.812993854258121, "grad_norm": 0.953125, "learning_rate": 9.61368061967464e-06, "loss": 1.1673, "step": 970 }, { "epoch": 6.883230904302019, "grad_norm": 0.953125, "learning_rate": 9.598738148549434e-06, "loss": 1.1281, "step": 980 }, { "epoch": 6.953467954345918, "grad_norm": 1.078125, "learning_rate": 9.583524219783938e-06, "loss": 1.1617, "step": 990 }, { "epoch": 7.023705004389815, "grad_norm": 1.0, "learning_rate": 9.56803973143694e-06, "loss": 1.1436, "step": 1000 }, { "epoch": 7.093942054433714, "grad_norm": 1.0078125, "learning_rate": 9.552285597538014e-06, "loss": 1.0962, "step": 1010 }, { "epoch": 7.164179104477612, "grad_norm": 0.8828125, "learning_rate": 9.536262748033564e-06, "loss": 1.0893, "step": 1020 }, { "epoch": 7.23441615452151, "grad_norm": 1.0234375, "learning_rate": 9.519972128731937e-06, "loss": 1.0618, "step": 1030 }, { "epoch": 7.304653204565408, "grad_norm": 1.0078125, "learning_rate": 9.503414701247587e-06, "loss": 1.0986, "step": 1040 }, { "epoch": 7.374890254609307, "grad_norm": 1.0546875, "learning_rate": 9.486591442944313e-06, "loss": 1.1748, "step": 1050 }, { "epoch": 7.445127304653204, "grad_norm": 1.1484375, "learning_rate": 9.469503346877569e-06, "loss": 1.1092, "step": 1060 }, { "epoch": 7.515364354697103, "grad_norm": 0.9296875, "learning_rate": 9.452151421735846e-06, "loss": 1.1099, "step": 1070 }, { "epoch": 7.585601404741001, "grad_norm": 0.99609375, "learning_rate": 9.434536691781125e-06, "loss": 1.0662, "step": 1080 }, { "epoch": 7.655838454784899, "grad_norm": 1.046875, "learning_rate": 9.416660196788423e-06, "loss": 1.0551, "step": 1090 }, { "epoch": 7.726075504828797, "grad_norm": 0.94921875, "learning_rate": 9.39852299198441e-06, "loss": 1.0643, "step": 1100 }, { "epoch": 7.796312554872696, "grad_norm": 0.98828125, "learning_rate": 9.380126147985122e-06, "loss": 1.0576, "step": 1110 }, { "epoch": 7.866549604916593, "grad_norm": 0.95703125, "learning_rate": 9.36147075073277e-06, "loss": 1.0703, "step": 1120 }, { "epoch": 7.936786654960492, "grad_norm": 0.95703125, "learning_rate": 9.34255790143163e-06, "loss": 1.0488, "step": 1130 }, { "epoch": 8.00702370500439, "grad_norm": 0.8125, "learning_rate": 9.323388716483046e-06, "loss": 1.0843, "step": 1140 }, { "epoch": 8.077260755048288, "grad_norm": 0.81640625, "learning_rate": 9.303964327419524e-06, "loss": 1.0359, "step": 1150 }, { "epoch": 8.147497805092186, "grad_norm": 0.84375, "learning_rate": 9.284285880837947e-06, "loss": 1.0227, "step": 1160 }, { "epoch": 8.217734855136085, "grad_norm": 0.87109375, "learning_rate": 9.264354538331886e-06, "loss": 0.997, "step": 1170 }, { "epoch": 8.287971905179983, "grad_norm": 0.875, "learning_rate": 9.244171476423037e-06, "loss": 1.0194, "step": 1180 }, { "epoch": 8.35820895522388, "grad_norm": 0.86328125, "learning_rate": 9.223737886491771e-06, "loss": 1.0264, "step": 1190 }, { "epoch": 8.428446005267778, "grad_norm": 0.84375, "learning_rate": 9.203054974706807e-06, "loss": 1.037, "step": 1200 }, { "epoch": 8.498683055311677, "grad_norm": 0.76171875, "learning_rate": 9.182123961954016e-06, "loss": 1.0202, "step": 1210 }, { "epoch": 8.568920105355575, "grad_norm": 0.7265625, "learning_rate": 9.160946083764353e-06, "loss": 1.0577, "step": 1220 }, { "epoch": 8.639157155399474, "grad_norm": 0.75, "learning_rate": 9.13952259024092e-06, "loss": 1.009, "step": 1230 }, { "epoch": 8.709394205443372, "grad_norm": 0.74609375, "learning_rate": 9.117854745985183e-06, "loss": 0.9763, "step": 1240 }, { "epoch": 8.779631255487269, "grad_norm": 0.80859375, "learning_rate": 9.095943830022323e-06, "loss": 1.0406, "step": 1250 }, { "epoch": 8.849868305531167, "grad_norm": 0.8359375, "learning_rate": 9.073791135725722e-06, "loss": 1.0093, "step": 1260 }, { "epoch": 8.920105355575066, "grad_norm": 0.7890625, "learning_rate": 9.051397970740638e-06, "loss": 1.0146, "step": 1270 }, { "epoch": 8.990342405618964, "grad_norm": 0.75390625, "learning_rate": 9.028765656907005e-06, "loss": 1.0404, "step": 1280 }, { "epoch": 9.060579455662863, "grad_norm": 0.75, "learning_rate": 9.005895530181406e-06, "loss": 0.9933, "step": 1290 }, { "epoch": 9.130816505706761, "grad_norm": 0.78515625, "learning_rate": 8.982788940558216e-06, "loss": 0.9701, "step": 1300 }, { "epoch": 9.201053555750658, "grad_norm": 0.81640625, "learning_rate": 8.959447251989914e-06, "loss": 0.9554, "step": 1310 }, { "epoch": 9.271290605794556, "grad_norm": 0.81640625, "learning_rate": 8.935871842306569e-06, "loss": 1.0002, "step": 1320 }, { "epoch": 9.341527655838455, "grad_norm": 0.83203125, "learning_rate": 8.912064103134505e-06, "loss": 0.999, "step": 1330 }, { "epoch": 9.411764705882353, "grad_norm": 0.81640625, "learning_rate": 8.888025439814169e-06, "loss": 0.9726, "step": 1340 }, { "epoch": 9.482001755926252, "grad_norm": 0.80078125, "learning_rate": 8.863757271317154e-06, "loss": 1.0067, "step": 1350 }, { "epoch": 9.552238805970148, "grad_norm": 0.81640625, "learning_rate": 8.839261030162459e-06, "loss": 1.0181, "step": 1360 }, { "epoch": 9.622475856014047, "grad_norm": 0.875, "learning_rate": 8.814538162331913e-06, "loss": 0.9688, "step": 1370 }, { "epoch": 9.692712906057945, "grad_norm": 0.76171875, "learning_rate": 8.789590127184837e-06, "loss": 0.9309, "step": 1380 }, { "epoch": 9.762949956101844, "grad_norm": 0.7734375, "learning_rate": 8.764418397371888e-06, "loss": 0.987, "step": 1390 }, { "epoch": 9.833187006145742, "grad_norm": 0.86328125, "learning_rate": 8.739024458748128e-06, "loss": 0.9292, "step": 1400 }, { "epoch": 9.90342405618964, "grad_norm": 0.87890625, "learning_rate": 8.713409810285327e-06, "loss": 0.9909, "step": 1410 }, { "epoch": 9.973661106233537, "grad_norm": 0.9296875, "learning_rate": 8.687575963983477e-06, "loss": 0.9914, "step": 1420 }, { "epoch": 10.043898156277436, "grad_norm": 0.8359375, "learning_rate": 8.661524444781531e-06, "loss": 0.9369, "step": 1430 }, { "epoch": 10.114135206321334, "grad_norm": 0.87109375, "learning_rate": 8.635256790467402e-06, "loss": 0.9573, "step": 1440 }, { "epoch": 10.184372256365233, "grad_norm": 0.92578125, "learning_rate": 8.60877455158718e-06, "loss": 0.9491, "step": 1450 }, { "epoch": 10.254609306409131, "grad_norm": 0.96484375, "learning_rate": 8.582079291353607e-06, "loss": 0.8985, "step": 1460 }, { "epoch": 10.32484635645303, "grad_norm": 0.95703125, "learning_rate": 8.555172585553804e-06, "loss": 0.913, "step": 1470 }, { "epoch": 10.395083406496926, "grad_norm": 0.94921875, "learning_rate": 8.528056022456256e-06, "loss": 0.9727, "step": 1480 }, { "epoch": 10.465320456540825, "grad_norm": 0.9453125, "learning_rate": 8.500731202717056e-06, "loss": 0.9723, "step": 1490 }, { "epoch": 10.535557506584723, "grad_norm": 1.0859375, "learning_rate": 8.473199739285416e-06, "loss": 0.9626, "step": 1500 }, { "epoch": 10.605794556628622, "grad_norm": 1.2109375, "learning_rate": 8.445463257308463e-06, "loss": 0.9487, "step": 1510 }, { "epoch": 10.67603160667252, "grad_norm": 1.1875, "learning_rate": 8.417523394035316e-06, "loss": 0.9832, "step": 1520 }, { "epoch": 10.746268656716419, "grad_norm": 1.203125, "learning_rate": 8.389381798720417e-06, "loss": 0.9621, "step": 1530 }, { "epoch": 10.816505706760315, "grad_norm": 1.1875, "learning_rate": 8.361040132526204e-06, "loss": 0.9222, "step": 1540 }, { "epoch": 10.886742756804214, "grad_norm": 2.109375, "learning_rate": 8.332500068425038e-06, "loss": 0.9183, "step": 1550 }, { "epoch": 10.956979806848112, "grad_norm": 2.34375, "learning_rate": 8.303763291100459e-06, "loss": 0.9173, "step": 1560 }, { "epoch": 11.02721685689201, "grad_norm": 2.09375, "learning_rate": 8.274831496847735e-06, "loss": 0.9582, "step": 1570 }, { "epoch": 11.09745390693591, "grad_norm": 1.828125, "learning_rate": 8.245706393473734e-06, "loss": 0.907, "step": 1580 }, { "epoch": 11.167690956979808, "grad_norm": 1.8828125, "learning_rate": 8.216389700196116e-06, "loss": 0.9339, "step": 1590 }, { "epoch": 11.237928007023704, "grad_norm": 4.375, "learning_rate": 8.186883147541846e-06, "loss": 0.9417, "step": 1600 }, { "epoch": 11.308165057067603, "grad_norm": 6.53125, "learning_rate": 8.157188477245048e-06, "loss": 0.8978, "step": 1610 }, { "epoch": 11.378402107111501, "grad_norm": 4.625, "learning_rate": 8.12730744214419e-06, "loss": 0.9335, "step": 1620 }, { "epoch": 11.4486391571554, "grad_norm": 5.6875, "learning_rate": 8.097241806078616e-06, "loss": 0.9623, "step": 1630 }, { "epoch": 11.518876207199298, "grad_norm": 4.71875, "learning_rate": 8.066993343784427e-06, "loss": 0.9691, "step": 1640 }, { "epoch": 11.589113257243195, "grad_norm": 8.625, "learning_rate": 8.036563840789726e-06, "loss": 0.9208, "step": 1650 }, { "epoch": 11.659350307287093, "grad_norm": 9.5625, "learning_rate": 8.005955093309217e-06, "loss": 0.9352, "step": 1660 }, { "epoch": 11.729587357330992, "grad_norm": 8.625, "learning_rate": 7.975168908138174e-06, "loss": 0.911, "step": 1670 }, { "epoch": 11.79982440737489, "grad_norm": 10.3125, "learning_rate": 7.944207102545795e-06, "loss": 0.9109, "step": 1680 }, { "epoch": 11.870061457418789, "grad_norm": 8.375, "learning_rate": 7.913071504167925e-06, "loss": 0.946, "step": 1690 }, { "epoch": 11.940298507462687, "grad_norm": 2.78125, "learning_rate": 7.881763950899175e-06, "loss": 0.8947, "step": 1700 }, { "epoch": 12.010535557506584, "grad_norm": 2.28125, "learning_rate": 7.850286290784437e-06, "loss": 0.8951, "step": 1710 }, { "epoch": 12.080772607550482, "grad_norm": 2.34375, "learning_rate": 7.81864038190979e-06, "loss": 0.893, "step": 1720 }, { "epoch": 12.151009657594381, "grad_norm": 2.84375, "learning_rate": 7.786828092292821e-06, "loss": 0.8729, "step": 1730 }, { "epoch": 12.22124670763828, "grad_norm": 2.890625, "learning_rate": 7.754851299772362e-06, "loss": 0.8704, "step": 1740 }, { "epoch": 12.291483757682178, "grad_norm": 2.171875, "learning_rate": 7.722711891897641e-06, "loss": 0.8639, "step": 1750 }, { "epoch": 12.361720807726076, "grad_norm": 2.921875, "learning_rate": 7.690411765816864e-06, "loss": 0.8624, "step": 1760 }, { "epoch": 12.431957857769973, "grad_norm": 2.96875, "learning_rate": 7.657952828165225e-06, "loss": 0.878, "step": 1770 }, { "epoch": 12.502194907813871, "grad_norm": 2.546875, "learning_rate": 7.625336994952364e-06, "loss": 0.8555, "step": 1780 }, { "epoch": 12.57243195785777, "grad_norm": 2.265625, "learning_rate": 7.592566191449262e-06, "loss": 0.8889, "step": 1790 }, { "epoch": 12.642669007901668, "grad_norm": 2.453125, "learning_rate": 7.559642352074606e-06, "loss": 0.8598, "step": 1800 }, { "epoch": 12.712906057945567, "grad_norm": 2.234375, "learning_rate": 7.526567420280585e-06, "loss": 0.8856, "step": 1810 }, { "epoch": 12.783143107989464, "grad_norm": 2.25, "learning_rate": 7.4933433484381905e-06, "loss": 0.8677, "step": 1820 }, { "epoch": 12.853380158033362, "grad_norm": 2.84375, "learning_rate": 7.459972097721954e-06, "loss": 0.9036, "step": 1830 }, { "epoch": 12.92361720807726, "grad_norm": 2.171875, "learning_rate": 7.4264556379941895e-06, "loss": 0.8607, "step": 1840 }, { "epoch": 12.993854258121159, "grad_norm": 2.28125, "learning_rate": 7.392795947688715e-06, "loss": 0.8699, "step": 1850 }, { "epoch": 13.064091308165057, "grad_norm": 2.296875, "learning_rate": 7.3589950136940645e-06, "loss": 0.8343, "step": 1860 }, { "epoch": 13.134328358208956, "grad_norm": 2.34375, "learning_rate": 7.325054831236211e-06, "loss": 0.8305, "step": 1870 }, { "epoch": 13.204565408252853, "grad_norm": 2.3125, "learning_rate": 7.2909774037607775e-06, "loss": 0.8039, "step": 1880 }, { "epoch": 13.274802458296751, "grad_norm": 3.0, "learning_rate": 7.256764742814796e-06, "loss": 0.8274, "step": 1890 }, { "epoch": 13.34503950834065, "grad_norm": 2.171875, "learning_rate": 7.222418867927948e-06, "loss": 0.8126, "step": 1900 }, { "epoch": 13.415276558384548, "grad_norm": 2.4375, "learning_rate": 7.187941806493372e-06, "loss": 0.8155, "step": 1910 }, { "epoch": 13.485513608428446, "grad_norm": 2.125, "learning_rate": 7.153335593647974e-06, "loss": 0.8346, "step": 1920 }, { "epoch": 13.555750658472345, "grad_norm": 2.234375, "learning_rate": 7.118602272152308e-06, "loss": 0.8275, "step": 1930 }, { "epoch": 13.625987708516242, "grad_norm": 2.25, "learning_rate": 7.083743892269987e-06, "loss": 0.8215, "step": 1940 }, { "epoch": 13.69622475856014, "grad_norm": 1.890625, "learning_rate": 7.04876251164666e-06, "loss": 0.8181, "step": 1950 }, { "epoch": 13.766461808604038, "grad_norm": 2.171875, "learning_rate": 7.013660195188553e-06, "loss": 0.8099, "step": 1960 }, { "epoch": 13.836698858647937, "grad_norm": 2.125, "learning_rate": 6.978439014940584e-06, "loss": 0.8278, "step": 1970 }, { "epoch": 13.906935908691835, "grad_norm": 2.21875, "learning_rate": 6.943101049964042e-06, "loss": 0.8299, "step": 1980 }, { "epoch": 13.977172958735734, "grad_norm": 2.328125, "learning_rate": 6.907648386213875e-06, "loss": 0.8003, "step": 1990 }, { "epoch": 14.04741000877963, "grad_norm": 2.0625, "learning_rate": 6.872083116415547e-06, "loss": 0.7669, "step": 2000 }, { "epoch": 14.04741000877963, "eval_loss": 1.4163318872451782, "eval_runtime": 28.9493, "eval_samples_per_second": 8.774, "eval_steps_per_second": 8.774, "step": 2000 }, { "epoch": 14.117647058823529, "grad_norm": 1.8984375, "learning_rate": 6.836407339941522e-06, "loss": 0.7701, "step": 2010 }, { "epoch": 14.187884108867427, "grad_norm": 1.921875, "learning_rate": 6.800623162687325e-06, "loss": 0.7634, "step": 2020 }, { "epoch": 14.258121158911326, "grad_norm": 2.34375, "learning_rate": 6.764732696947243e-06, "loss": 0.7579, "step": 2030 }, { "epoch": 14.328358208955224, "grad_norm": 2.5, "learning_rate": 6.728738061289634e-06, "loss": 0.7713, "step": 2040 }, { "epoch": 14.398595258999123, "grad_norm": 1.9453125, "learning_rate": 6.692641380431879e-06, "loss": 0.7512, "step": 2050 }, { "epoch": 14.46883230904302, "grad_norm": 2.125, "learning_rate": 6.6564447851149505e-06, "loss": 0.7673, "step": 2060 }, { "epoch": 14.539069359086918, "grad_norm": 2.0, "learning_rate": 6.620150411977648e-06, "loss": 0.747, "step": 2070 }, { "epoch": 14.609306409130816, "grad_norm": 1.96875, "learning_rate": 6.5837604034304715e-06, "loss": 0.7471, "step": 2080 }, { "epoch": 14.679543459174715, "grad_norm": 1.9453125, "learning_rate": 6.547276907529152e-06, "loss": 0.7733, "step": 2090 }, { "epoch": 14.749780509218613, "grad_norm": 1.59375, "learning_rate": 6.510702077847864e-06, "loss": 0.7356, "step": 2100 }, { "epoch": 14.82001755926251, "grad_norm": 1.9921875, "learning_rate": 6.474038073352098e-06, "loss": 0.7702, "step": 2110 }, { "epoch": 14.890254609306409, "grad_norm": 1.96875, "learning_rate": 6.4372870582712196e-06, "loss": 0.76, "step": 2120 }, { "epoch": 14.960491659350307, "grad_norm": 1.8359375, "learning_rate": 6.4004512019707144e-06, "loss": 0.7664, "step": 2130 }, { "epoch": 15.030728709394205, "grad_norm": 1.5859375, "learning_rate": 6.363532678824145e-06, "loss": 0.7274, "step": 2140 }, { "epoch": 15.100965759438104, "grad_norm": 1.8828125, "learning_rate": 6.326533668084783e-06, "loss": 0.703, "step": 2150 }, { "epoch": 15.171202809482002, "grad_norm": 1.9921875, "learning_rate": 6.289456353756988e-06, "loss": 0.7176, "step": 2160 }, { "epoch": 15.241439859525899, "grad_norm": 1.9453125, "learning_rate": 6.252302924467276e-06, "loss": 0.6589, "step": 2170 }, { "epoch": 15.311676909569798, "grad_norm": 1.7578125, "learning_rate": 6.2150755733351305e-06, "loss": 0.6822, "step": 2180 }, { "epoch": 15.381913959613696, "grad_norm": 1.734375, "learning_rate": 6.177776497843552e-06, "loss": 0.6947, "step": 2190 }, { "epoch": 15.452151009657594, "grad_norm": 1.7578125, "learning_rate": 6.140407899709333e-06, "loss": 0.7027, "step": 2200 }, { "epoch": 15.522388059701493, "grad_norm": 1.765625, "learning_rate": 6.102971984753104e-06, "loss": 0.6625, "step": 2210 }, { "epoch": 15.592625109745391, "grad_norm": 1.8359375, "learning_rate": 6.065470962769119e-06, "loss": 0.7181, "step": 2220 }, { "epoch": 15.662862159789288, "grad_norm": 1.7421875, "learning_rate": 6.027907047394812e-06, "loss": 0.6745, "step": 2230 }, { "epoch": 15.733099209833187, "grad_norm": 1.6953125, "learning_rate": 5.990282455980145e-06, "loss": 0.6744, "step": 2240 }, { "epoch": 15.803336259877085, "grad_norm": 1.9609375, "learning_rate": 5.952599409456697e-06, "loss": 0.6877, "step": 2250 }, { "epoch": 15.873573309920983, "grad_norm": 1.765625, "learning_rate": 5.914860132206584e-06, "loss": 0.7303, "step": 2260 }, { "epoch": 15.943810359964882, "grad_norm": 1.71875, "learning_rate": 5.877066851931151e-06, "loss": 0.712, "step": 2270 }, { "epoch": 16.01404741000878, "grad_norm": 1.859375, "learning_rate": 5.83922179951947e-06, "loss": 0.6674, "step": 2280 }, { "epoch": 16.08428446005268, "grad_norm": 2.03125, "learning_rate": 5.8013272089166526e-06, "loss": 0.6726, "step": 2290 }, { "epoch": 16.154521510096576, "grad_norm": 1.7109375, "learning_rate": 5.763385316991995e-06, "loss": 0.6198, "step": 2300 }, { "epoch": 16.224758560140476, "grad_norm": 1.8046875, "learning_rate": 5.725398363406922e-06, "loss": 0.6223, "step": 2310 }, { "epoch": 16.294995610184372, "grad_norm": 1.8671875, "learning_rate": 5.687368590482797e-06, "loss": 0.6315, "step": 2320 }, { "epoch": 16.36523266022827, "grad_norm": 1.78125, "learning_rate": 5.64929824306855e-06, "loss": 0.6418, "step": 2330 }, { "epoch": 16.43546971027217, "grad_norm": 1.6875, "learning_rate": 5.611189568408173e-06, "loss": 0.6086, "step": 2340 }, { "epoch": 16.505706760316066, "grad_norm": 1.8515625, "learning_rate": 5.573044816008066e-06, "loss": 0.6375, "step": 2350 }, { "epoch": 16.575943810359966, "grad_norm": 1.5390625, "learning_rate": 5.534866237504252e-06, "loss": 0.6085, "step": 2360 }, { "epoch": 16.646180860403863, "grad_norm": 1.7109375, "learning_rate": 5.496656086529467e-06, "loss": 0.6355, "step": 2370 }, { "epoch": 16.71641791044776, "grad_norm": 1.7109375, "learning_rate": 5.458416618580126e-06, "loss": 0.6206, "step": 2380 }, { "epoch": 16.78665496049166, "grad_norm": 1.4921875, "learning_rate": 5.420150090883191e-06, "loss": 0.6445, "step": 2390 }, { "epoch": 16.856892010535557, "grad_norm": 1.546875, "learning_rate": 5.381858762262927e-06, "loss": 0.6508, "step": 2400 }, { "epoch": 16.927129060579457, "grad_norm": 1.578125, "learning_rate": 5.343544893007563e-06, "loss": 0.6198, "step": 2410 }, { "epoch": 16.997366110623354, "grad_norm": 1.4921875, "learning_rate": 5.305210744735874e-06, "loss": 0.6156, "step": 2420 }, { "epoch": 17.06760316066725, "grad_norm": 1.515625, "learning_rate": 5.266858580263678e-06, "loss": 0.5986, "step": 2430 }, { "epoch": 17.13784021071115, "grad_norm": 1.578125, "learning_rate": 5.228490663470271e-06, "loss": 0.5637, "step": 2440 }, { "epoch": 17.208077260755047, "grad_norm": 1.390625, "learning_rate": 5.190109259164782e-06, "loss": 0.5738, "step": 2450 }, { "epoch": 17.278314310798947, "grad_norm": 1.5, "learning_rate": 5.151716632952495e-06, "loss": 0.5912, "step": 2460 }, { "epoch": 17.348551360842844, "grad_norm": 1.28125, "learning_rate": 5.113315051101111e-06, "loss": 0.5782, "step": 2470 }, { "epoch": 17.418788410886744, "grad_norm": 1.5546875, "learning_rate": 5.074906780406962e-06, "loss": 0.5548, "step": 2480 }, { "epoch": 17.48902546093064, "grad_norm": 1.3359375, "learning_rate": 5.036494088061222e-06, "loss": 0.5678, "step": 2490 }, { "epoch": 17.559262510974538, "grad_norm": 1.1953125, "learning_rate": 4.998079241516068e-06, "loss": 0.5768, "step": 2500 }, { "epoch": 17.629499561018438, "grad_norm": 1.21875, "learning_rate": 4.959664508350834e-06, "loss": 0.5794, "step": 2510 }, { "epoch": 17.699736611062335, "grad_norm": 1.234375, "learning_rate": 4.921252156138163e-06, "loss": 0.5754, "step": 2520 }, { "epoch": 17.769973661106235, "grad_norm": 1.28125, "learning_rate": 4.882844452310155e-06, "loss": 0.5781, "step": 2530 }, { "epoch": 17.84021071115013, "grad_norm": 1.265625, "learning_rate": 4.844443664024517e-06, "loss": 0.5834, "step": 2540 }, { "epoch": 17.91044776119403, "grad_norm": 1.15625, "learning_rate": 4.8060520580307456e-06, "loss": 0.565, "step": 2550 }, { "epoch": 17.98068481123793, "grad_norm": 1.15625, "learning_rate": 4.767671900536315e-06, "loss": 0.6071, "step": 2560 }, { "epoch": 18.050921861281825, "grad_norm": 1.078125, "learning_rate": 4.729305457072913e-06, "loss": 0.5734, "step": 2570 }, { "epoch": 18.121158911325725, "grad_norm": 1.1796875, "learning_rate": 4.690954992362699e-06, "loss": 0.5322, "step": 2580 }, { "epoch": 18.191395961369622, "grad_norm": 1.171875, "learning_rate": 4.652622770184637e-06, "loss": 0.5304, "step": 2590 }, { "epoch": 18.261633011413522, "grad_norm": 1.0859375, "learning_rate": 4.6143110532408455e-06, "loss": 0.5368, "step": 2600 }, { "epoch": 18.33187006145742, "grad_norm": 1.1484375, "learning_rate": 4.576022103023053e-06, "loss": 0.5456, "step": 2610 }, { "epoch": 18.402107111501316, "grad_norm": 1.0859375, "learning_rate": 4.537758179679098e-06, "loss": 0.5699, "step": 2620 }, { "epoch": 18.472344161545216, "grad_norm": 1.0859375, "learning_rate": 4.499521541879508e-06, "loss": 0.5587, "step": 2630 }, { "epoch": 18.542581211589113, "grad_norm": 1.1328125, "learning_rate": 4.461314446684189e-06, "loss": 0.526, "step": 2640 }, { "epoch": 18.612818261633013, "grad_norm": 1.0078125, "learning_rate": 4.423139149409176e-06, "loss": 0.5593, "step": 2650 }, { "epoch": 18.68305531167691, "grad_norm": 1.0703125, "learning_rate": 4.384997903493519e-06, "loss": 0.5379, "step": 2660 }, { "epoch": 18.753292361720806, "grad_norm": 1.078125, "learning_rate": 4.346892960366255e-06, "loss": 0.5503, "step": 2670 }, { "epoch": 18.823529411764707, "grad_norm": 1.03125, "learning_rate": 4.30882656931352e-06, "loss": 0.5275, "step": 2680 }, { "epoch": 18.893766461808603, "grad_norm": 1.1484375, "learning_rate": 4.270800977345767e-06, "loss": 0.5515, "step": 2690 }, { "epoch": 18.964003511852503, "grad_norm": 1.0234375, "learning_rate": 4.232818429065128e-06, "loss": 0.5484, "step": 2700 }, { "epoch": 19.0342405618964, "grad_norm": 0.9140625, "learning_rate": 4.194881166532923e-06, "loss": 0.5451, "step": 2710 }, { "epoch": 19.104477611940297, "grad_norm": 1.015625, "learning_rate": 4.156991429137317e-06, "loss": 0.5222, "step": 2720 }, { "epoch": 19.174714661984197, "grad_norm": 0.9921875, "learning_rate": 4.119151453461121e-06, "loss": 0.5167, "step": 2730 }, { "epoch": 19.244951712028094, "grad_norm": 1.0078125, "learning_rate": 4.081363473149778e-06, "loss": 0.5009, "step": 2740 }, { "epoch": 19.315188762071994, "grad_norm": 0.94140625, "learning_rate": 4.0436297187795085e-06, "loss": 0.5206, "step": 2750 }, { "epoch": 19.38542581211589, "grad_norm": 0.96484375, "learning_rate": 4.005952417725649e-06, "loss": 0.5241, "step": 2760 }, { "epoch": 19.45566286215979, "grad_norm": 0.91796875, "learning_rate": 3.968333794031165e-06, "loss": 0.5247, "step": 2770 }, { "epoch": 19.525899912203688, "grad_norm": 0.8671875, "learning_rate": 3.930776068275375e-06, "loss": 0.5138, "step": 2780 }, { "epoch": 19.596136962247584, "grad_norm": 0.9453125, "learning_rate": 3.89328145744287e-06, "loss": 0.5295, "step": 2790 }, { "epoch": 19.666374012291485, "grad_norm": 0.88671875, "learning_rate": 3.8558521747926434e-06, "loss": 0.5235, "step": 2800 }, { "epoch": 19.73661106233538, "grad_norm": 0.87890625, "learning_rate": 3.818490429727455e-06, "loss": 0.5198, "step": 2810 }, { "epoch": 19.80684811237928, "grad_norm": 0.90234375, "learning_rate": 3.7811984276634024e-06, "loss": 0.5037, "step": 2820 }, { "epoch": 19.877085162423178, "grad_norm": 0.82421875, "learning_rate": 3.743978369899748e-06, "loss": 0.5069, "step": 2830 }, { "epoch": 19.947322212467075, "grad_norm": 0.90625, "learning_rate": 3.70683245348897e-06, "loss": 0.5333, "step": 2840 }, { "epoch": 20.017559262510975, "grad_norm": 0.7890625, "learning_rate": 3.6697628711070786e-06, "loss": 0.4939, "step": 2850 }, { "epoch": 20.08779631255487, "grad_norm": 0.78125, "learning_rate": 3.632771810924184e-06, "loss": 0.4879, "step": 2860 }, { "epoch": 20.158033362598772, "grad_norm": 0.82421875, "learning_rate": 3.5958614564753313e-06, "loss": 0.518, "step": 2870 }, { "epoch": 20.22827041264267, "grad_norm": 0.79296875, "learning_rate": 3.559033986531608e-06, "loss": 0.4944, "step": 2880 }, { "epoch": 20.298507462686565, "grad_norm": 0.8125, "learning_rate": 3.522291574971538e-06, "loss": 0.5026, "step": 2890 }, { "epoch": 20.368744512730466, "grad_norm": 0.8359375, "learning_rate": 3.4856363906527513e-06, "loss": 0.5134, "step": 2900 }, { "epoch": 20.438981562774362, "grad_norm": 0.69140625, "learning_rate": 3.449070597283972e-06, "loss": 0.5056, "step": 2910 }, { "epoch": 20.509218612818263, "grad_norm": 0.8828125, "learning_rate": 3.4125963532972878e-06, "loss": 0.5035, "step": 2920 }, { "epoch": 20.57945566286216, "grad_norm": 0.82421875, "learning_rate": 3.376215811720744e-06, "loss": 0.5134, "step": 2930 }, { "epoch": 20.64969271290606, "grad_norm": 0.81640625, "learning_rate": 3.3399311200512495e-06, "loss": 0.4666, "step": 2940 }, { "epoch": 20.719929762949956, "grad_norm": 0.78125, "learning_rate": 3.3037444201278202e-06, "loss": 0.4965, "step": 2950 }, { "epoch": 20.790166812993853, "grad_norm": 0.80078125, "learning_rate": 3.267657848005139e-06, "loss": 0.4953, "step": 2960 }, { "epoch": 20.860403863037753, "grad_norm": 0.8125, "learning_rate": 3.2316735338274795e-06, "loss": 0.4914, "step": 2970 }, { "epoch": 20.93064091308165, "grad_norm": 0.83203125, "learning_rate": 3.1957936017029513e-06, "loss": 0.5002, "step": 2980 }, { "epoch": 21.00087796312555, "grad_norm": 0.79296875, "learning_rate": 3.1600201695781335e-06, "loss": 0.5163, "step": 2990 }, { "epoch": 21.071115013169447, "grad_norm": 0.87109375, "learning_rate": 3.124355349113037e-06, "loss": 0.4863, "step": 3000 }, { "epoch": 21.141352063213343, "grad_norm": 0.84375, "learning_rate": 3.0888012455564707e-06, "loss": 0.4616, "step": 3010 }, { "epoch": 21.211589113257244, "grad_norm": 0.91796875, "learning_rate": 3.0533599576217664e-06, "loss": 0.4897, "step": 3020 }, { "epoch": 21.28182616330114, "grad_norm": 0.87109375, "learning_rate": 3.0180335773628912e-06, "loss": 0.4801, "step": 3030 }, { "epoch": 21.35206321334504, "grad_norm": 0.8984375, "learning_rate": 2.982824190050958e-06, "loss": 0.5161, "step": 3040 }, { "epoch": 21.422300263388937, "grad_norm": 0.84375, "learning_rate": 2.94773387405114e-06, "loss": 0.4854, "step": 3050 }, { "epoch": 21.492537313432837, "grad_norm": 0.953125, "learning_rate": 2.912764700699978e-06, "loss": 0.5062, "step": 3060 }, { "epoch": 21.562774363476734, "grad_norm": 1.0234375, "learning_rate": 2.8779187341831205e-06, "loss": 0.4976, "step": 3070 }, { "epoch": 21.63301141352063, "grad_norm": 1.046875, "learning_rate": 2.843198031413473e-06, "loss": 0.4782, "step": 3080 }, { "epoch": 21.70324846356453, "grad_norm": 1.09375, "learning_rate": 2.808604641909781e-06, "loss": 0.4694, "step": 3090 }, { "epoch": 21.773485513608428, "grad_norm": 1.0078125, "learning_rate": 2.7741406076756484e-06, "loss": 0.4864, "step": 3100 }, { "epoch": 21.843722563652328, "grad_norm": 1.03125, "learning_rate": 2.7398079630790064e-06, "loss": 0.4775, "step": 3110 }, { "epoch": 21.913959613696225, "grad_norm": 1.0703125, "learning_rate": 2.7056087347320238e-06, "loss": 0.4923, "step": 3120 }, { "epoch": 21.98419666374012, "grad_norm": 1.015625, "learning_rate": 2.6715449413714778e-06, "loss": 0.4862, "step": 3130 }, { "epoch": 22.05443371378402, "grad_norm": 0.9921875, "learning_rate": 2.637618593739588e-06, "loss": 0.4765, "step": 3140 }, { "epoch": 22.12467076382792, "grad_norm": 1.453125, "learning_rate": 2.603831694465333e-06, "loss": 0.4581, "step": 3150 }, { "epoch": 22.19490781387182, "grad_norm": 1.5390625, "learning_rate": 2.57018623794623e-06, "loss": 0.479, "step": 3160 }, { "epoch": 22.265144863915715, "grad_norm": 1.4375, "learning_rate": 2.5366842102306144e-06, "loss": 0.4777, "step": 3170 }, { "epoch": 22.335381913959615, "grad_norm": 1.4921875, "learning_rate": 2.503327588900396e-06, "loss": 0.4865, "step": 3180 }, { "epoch": 22.405618964003512, "grad_norm": 1.28125, "learning_rate": 2.4701183429543386e-06, "loss": 0.4691, "step": 3190 }, { "epoch": 22.47585601404741, "grad_norm": 4.15625, "learning_rate": 2.437058432691819e-06, "loss": 0.4755, "step": 3200 }, { "epoch": 22.54609306409131, "grad_norm": 4.53125, "learning_rate": 2.4041498095971253e-06, "loss": 0.46, "step": 3210 }, { "epoch": 22.616330114135206, "grad_norm": 4.09375, "learning_rate": 2.3713944162242506e-06, "loss": 0.4886, "step": 3220 }, { "epoch": 22.686567164179106, "grad_norm": 4.65625, "learning_rate": 2.3387941860822395e-06, "loss": 0.488, "step": 3230 }, { "epoch": 22.756804214223003, "grad_norm": 4.4375, "learning_rate": 2.3063510435210456e-06, "loss": 0.4788, "step": 3240 }, { "epoch": 22.8270412642669, "grad_norm": 8.6875, "learning_rate": 2.2740669036179464e-06, "loss": 0.4856, "step": 3250 }, { "epoch": 22.8972783143108, "grad_norm": 8.5, "learning_rate": 2.24194367206449e-06, "loss": 0.4672, "step": 3260 }, { "epoch": 22.967515364354696, "grad_norm": 9.875, "learning_rate": 2.209983245054014e-06, "loss": 0.4663, "step": 3270 }, { "epoch": 23.037752414398597, "grad_norm": 8.6875, "learning_rate": 2.178187509169713e-06, "loss": 0.465, "step": 3280 }, { "epoch": 23.107989464442493, "grad_norm": 10.375, "learning_rate": 2.146558341273273e-06, "loss": 0.4592, "step": 3290 }, { "epoch": 23.17822651448639, "grad_norm": 2.65625, "learning_rate": 2.115097608394084e-06, "loss": 0.4521, "step": 3300 }, { "epoch": 23.24846356453029, "grad_norm": 3.0, "learning_rate": 2.083807167619029e-06, "loss": 0.4396, "step": 3310 }, { "epoch": 23.318700614574187, "grad_norm": 2.859375, "learning_rate": 2.0526888659828716e-06, "loss": 0.4557, "step": 3320 }, { "epoch": 23.388937664618087, "grad_norm": 2.828125, "learning_rate": 2.0217445403592185e-06, "loss": 0.4686, "step": 3330 }, { "epoch": 23.459174714661984, "grad_norm": 2.4375, "learning_rate": 1.990976017352097e-06, "loss": 0.4696, "step": 3340 }, { "epoch": 23.529411764705884, "grad_norm": 2.625, "learning_rate": 1.9603851131881256e-06, "loss": 0.4566, "step": 3350 }, { "epoch": 23.59964881474978, "grad_norm": 2.9375, "learning_rate": 1.9299736336093137e-06, "loss": 0.4642, "step": 3360 }, { "epoch": 23.669885864793677, "grad_norm": 2.8125, "learning_rate": 1.8997433737664673e-06, "loss": 0.4371, "step": 3370 }, { "epoch": 23.740122914837578, "grad_norm": 2.703125, "learning_rate": 1.869696118113216e-06, "loss": 0.4616, "step": 3380 }, { "epoch": 23.810359964881474, "grad_norm": 2.859375, "learning_rate": 1.8398336403006956e-06, "loss": 0.4551, "step": 3390 }, { "epoch": 23.880597014925375, "grad_norm": 2.390625, "learning_rate": 1.8101577030728324e-06, "loss": 0.4456, "step": 3400 }, { "epoch": 23.95083406496927, "grad_norm": 2.625, "learning_rate": 1.7806700581623059e-06, "loss": 0.4756, "step": 3410 }, { "epoch": 24.021071115013168, "grad_norm": 2.453125, "learning_rate": 1.7513724461871423e-06, "loss": 0.4519, "step": 3420 }, { "epoch": 24.091308165057068, "grad_norm": 2.484375, "learning_rate": 1.7222665965479585e-06, "loss": 0.4489, "step": 3430 }, { "epoch": 24.161545215100965, "grad_norm": 2.171875, "learning_rate": 1.6933542273258924e-06, "loss": 0.4369, "step": 3440 }, { "epoch": 24.231782265144865, "grad_norm": 2.359375, "learning_rate": 1.6646370451811784e-06, "loss": 0.4246, "step": 3450 }, { "epoch": 24.302019315188762, "grad_norm": 2.34375, "learning_rate": 1.6361167452524073e-06, "loss": 0.4181, "step": 3460 }, { "epoch": 24.37225636523266, "grad_norm": 2.328125, "learning_rate": 1.6077950110564606e-06, "loss": 0.4349, "step": 3470 }, { "epoch": 24.44249341527656, "grad_norm": 2.515625, "learning_rate": 1.5796735143891423e-06, "loss": 0.4358, "step": 3480 }, { "epoch": 24.512730465320455, "grad_norm": 2.71875, "learning_rate": 1.551753915226491e-06, "loss": 0.4428, "step": 3490 }, { "epoch": 24.582967515364356, "grad_norm": 2.5, "learning_rate": 1.5240378616267887e-06, "loss": 0.4538, "step": 3500 }, { "epoch": 24.653204565408252, "grad_norm": 2.234375, "learning_rate": 1.4965269896332884e-06, "loss": 0.4311, "step": 3510 }, { "epoch": 24.723441615452153, "grad_norm": 2.140625, "learning_rate": 1.46922292317763e-06, "loss": 0.4664, "step": 3520 }, { "epoch": 24.79367866549605, "grad_norm": 2.28125, "learning_rate": 1.4421272739839898e-06, "loss": 0.4315, "step": 3530 }, { "epoch": 24.863915715539946, "grad_norm": 2.234375, "learning_rate": 1.4152416414739401e-06, "loss": 0.4403, "step": 3540 }, { "epoch": 24.934152765583846, "grad_norm": 2.34375, "learning_rate": 1.3885676126720315e-06, "loss": 0.4652, "step": 3550 }, { "epoch": 25.004389815627743, "grad_norm": 2.140625, "learning_rate": 1.362106762112123e-06, "loss": 0.4439, "step": 3560 }, { "epoch": 25.074626865671643, "grad_norm": 2.3125, "learning_rate": 1.3358606517444328e-06, "loss": 0.443, "step": 3570 }, { "epoch": 25.14486391571554, "grad_norm": 2.0, "learning_rate": 1.3098308308433411e-06, "loss": 0.416, "step": 3580 }, { "epoch": 25.215100965759436, "grad_norm": 2.171875, "learning_rate": 1.2840188359159329e-06, "loss": 0.4035, "step": 3590 }, { "epoch": 25.285338015803337, "grad_norm": 1.9765625, "learning_rate": 1.258426190611306e-06, "loss": 0.4279, "step": 3600 }, { "epoch": 25.355575065847233, "grad_norm": 2.171875, "learning_rate": 1.2330544056306315e-06, "loss": 0.4358, "step": 3610 }, { "epoch": 25.425812115891134, "grad_norm": 1.9765625, "learning_rate": 1.2079049786379782e-06, "loss": 0.4368, "step": 3620 }, { "epoch": 25.49604916593503, "grad_norm": 2.109375, "learning_rate": 1.1829793941719053e-06, "loss": 0.4271, "step": 3630 }, { "epoch": 25.56628621597893, "grad_norm": 2.078125, "learning_rate": 1.1582791235578321e-06, "loss": 0.4482, "step": 3640 }, { "epoch": 25.636523266022827, "grad_norm": 2.140625, "learning_rate": 1.1338056248211916e-06, "loss": 0.4131, "step": 3650 }, { "epoch": 25.706760316066724, "grad_norm": 2.125, "learning_rate": 1.1095603426013613e-06, "loss": 0.4513, "step": 3660 }, { "epoch": 25.776997366110624, "grad_norm": 2.15625, "learning_rate": 1.0855447080663907e-06, "loss": 0.4118, "step": 3670 }, { "epoch": 25.84723441615452, "grad_norm": 2.1875, "learning_rate": 1.0617601388285149e-06, "loss": 0.4082, "step": 3680 }, { "epoch": 25.91747146619842, "grad_norm": 2.421875, "learning_rate": 1.0382080388604866e-06, "loss": 0.428, "step": 3690 }, { "epoch": 25.987708516242318, "grad_norm": 2.28125, "learning_rate": 1.0148897984126876e-06, "loss": 0.452, "step": 3700 }, { "epoch": 26.057945566286215, "grad_norm": 2.0625, "learning_rate": 9.918067939310766e-07, "loss": 0.4177, "step": 3710 }, { "epoch": 26.128182616330115, "grad_norm": 2.046875, "learning_rate": 9.689603879759284e-07, "loss": 0.4091, "step": 3720 }, { "epoch": 26.19841966637401, "grad_norm": 1.90625, "learning_rate": 9.463519291414131e-07, "loss": 0.4035, "step": 3730 }, { "epoch": 26.26865671641791, "grad_norm": 1.7578125, "learning_rate": 9.239827519759842e-07, "loss": 0.41, "step": 3740 }, { "epoch": 26.33889376646181, "grad_norm": 2.25, "learning_rate": 9.018541769036054e-07, "loss": 0.4306, "step": 3750 }, { "epoch": 26.409130816505705, "grad_norm": 2.015625, "learning_rate": 8.799675101458033e-07, "loss": 0.4122, "step": 3760 }, { "epoch": 26.479367866549605, "grad_norm": 1.8984375, "learning_rate": 8.583240436445666e-07, "loss": 0.3986, "step": 3770 }, { "epoch": 26.549604916593502, "grad_norm": 2.046875, "learning_rate": 8.369250549860869e-07, "loss": 0.437, "step": 3780 }, { "epoch": 26.619841966637402, "grad_norm": 2.328125, "learning_rate": 8.157718073253351e-07, "loss": 0.4344, "step": 3790 }, { "epoch": 26.6900790166813, "grad_norm": 1.8828125, "learning_rate": 7.948655493115098e-07, "loss": 0.4269, "step": 3800 }, { "epoch": 26.7603160667252, "grad_norm": 1.9765625, "learning_rate": 7.742075150143225e-07, "loss": 0.3962, "step": 3810 }, { "epoch": 26.830553116769096, "grad_norm": 1.9765625, "learning_rate": 7.537989238511578e-07, "loss": 0.4376, "step": 3820 }, { "epoch": 26.900790166812993, "grad_norm": 1.734375, "learning_rate": 7.336409805150901e-07, "loss": 0.4504, "step": 3830 }, { "epoch": 26.971027216856893, "grad_norm": 1.984375, "learning_rate": 7.137348749037748e-07, "loss": 0.4166, "step": 3840 }, { "epoch": 27.04126426690079, "grad_norm": 1.9453125, "learning_rate": 6.940817820492024e-07, "loss": 0.4244, "step": 3850 }, { "epoch": 27.11150131694469, "grad_norm": 1.9609375, "learning_rate": 6.746828620483487e-07, "loss": 0.4096, "step": 3860 }, { "epoch": 27.181738366988586, "grad_norm": 1.78125, "learning_rate": 6.555392599946903e-07, "loss": 0.3896, "step": 3870 }, { "epoch": 27.251975417032483, "grad_norm": 1.9375, "learning_rate": 6.366521059106078e-07, "loss": 0.4363, "step": 3880 }, { "epoch": 27.322212467076383, "grad_norm": 1.9296875, "learning_rate": 6.180225146806878e-07, "loss": 0.4183, "step": 3890 }, { "epoch": 27.39244951712028, "grad_norm": 1.8125, "learning_rate": 5.996515859859109e-07, "loss": 0.3911, "step": 3900 }, { "epoch": 27.46268656716418, "grad_norm": 1.8671875, "learning_rate": 5.815404042387379e-07, "loss": 0.4237, "step": 3910 }, { "epoch": 27.532923617208077, "grad_norm": 1.78125, "learning_rate": 5.636900385191014e-07, "loss": 0.4077, "step": 3920 }, { "epoch": 27.603160667251977, "grad_norm": 1.859375, "learning_rate": 5.461015425112915e-07, "loss": 0.407, "step": 3930 }, { "epoch": 27.673397717295874, "grad_norm": 2.015625, "learning_rate": 5.287759544417687e-07, "loss": 0.436, "step": 3940 }, { "epoch": 27.74363476733977, "grad_norm": 1.6953125, "learning_rate": 5.117142970178712e-07, "loss": 0.4014, "step": 3950 }, { "epoch": 27.81387181738367, "grad_norm": 1.6640625, "learning_rate": 4.949175773674502e-07, "loss": 0.4272, "step": 3960 }, { "epoch": 27.884108867427567, "grad_norm": 1.640625, "learning_rate": 4.783867869794157e-07, "loss": 0.4179, "step": 3970 }, { "epoch": 27.954345917471468, "grad_norm": 1.75, "learning_rate": 4.6212290164521554e-07, "loss": 0.4351, "step": 3980 }, { "epoch": 28.024582967515364, "grad_norm": 1.609375, "learning_rate": 4.461268814012304e-07, "loss": 0.3982, "step": 3990 }, { "epoch": 28.09482001755926, "grad_norm": 1.4921875, "learning_rate": 4.3039967047210865e-07, "loss": 0.4249, "step": 4000 }, { "epoch": 28.09482001755926, "eval_loss": 1.6928666830062866, "eval_runtime": 10.5811, "eval_samples_per_second": 24.005, "eval_steps_per_second": 24.005, "step": 4000 }, { "epoch": 28.16505706760316, "grad_norm": 1.515625, "learning_rate": 4.1494219721502917e-07, "loss": 0.406, "step": 4010 }, { "epoch": 28.235294117647058, "grad_norm": 1.5703125, "learning_rate": 3.997553740648974e-07, "loss": 0.4208, "step": 4020 }, { "epoch": 28.30553116769096, "grad_norm": 1.625, "learning_rate": 3.8484009748049053e-07, "loss": 0.4113, "step": 4030 }, { "epoch": 28.375768217734855, "grad_norm": 1.546875, "learning_rate": 3.7019724789154e-07, "loss": 0.4131, "step": 4040 }, { "epoch": 28.44600526777875, "grad_norm": 1.4453125, "learning_rate": 3.558276896467555e-07, "loss": 0.4192, "step": 4050 }, { "epoch": 28.516242317822652, "grad_norm": 1.46875, "learning_rate": 3.4173227096281124e-07, "loss": 0.3869, "step": 4060 }, { "epoch": 28.58647936786655, "grad_norm": 1.4609375, "learning_rate": 3.279118238742729e-07, "loss": 0.4353, "step": 4070 }, { "epoch": 28.65671641791045, "grad_norm": 1.4453125, "learning_rate": 3.143671641844831e-07, "loss": 0.3981, "step": 4080 }, { "epoch": 28.726953467954345, "grad_norm": 1.4765625, "learning_rate": 3.0109909141740614e-07, "loss": 0.3976, "step": 4090 }, { "epoch": 28.797190517998246, "grad_norm": 1.28125, "learning_rate": 2.881083887704339e-07, "loss": 0.4075, "step": 4100 }, { "epoch": 28.867427568042142, "grad_norm": 1.2421875, "learning_rate": 2.753958230681547e-07, "loss": 0.4224, "step": 4110 }, { "epoch": 28.93766461808604, "grad_norm": 1.3125, "learning_rate": 2.6296214471708826e-07, "loss": 0.4291, "step": 4120 }, { "epoch": 29.00790166812994, "grad_norm": 1.375, "learning_rate": 2.5080808766138996e-07, "loss": 0.4209, "step": 4130 }, { "epoch": 29.078138718173836, "grad_norm": 1.2578125, "learning_rate": 2.3893436933952575e-07, "loss": 0.426, "step": 4140 }, { "epoch": 29.148375768217736, "grad_norm": 1.109375, "learning_rate": 2.2734169064192623e-07, "loss": 0.4265, "step": 4150 }, { "epoch": 29.218612818261633, "grad_norm": 1.234375, "learning_rate": 2.1603073586961067e-07, "loss": 0.3948, "step": 4160 }, { "epoch": 29.28884986830553, "grad_norm": 1.2734375, "learning_rate": 2.0500217269379618e-07, "loss": 0.4023, "step": 4170 }, { "epoch": 29.35908691834943, "grad_norm": 1.171875, "learning_rate": 1.9425665211648238e-07, "loss": 0.4033, "step": 4180 }, { "epoch": 29.429323968393327, "grad_norm": 1.15625, "learning_rate": 1.837948084320268e-07, "loss": 0.4199, "step": 4190 }, { "epoch": 29.499561018437227, "grad_norm": 1.1953125, "learning_rate": 1.736172591897023e-07, "loss": 0.4347, "step": 4200 }, { "epoch": 29.569798068481123, "grad_norm": 1.1484375, "learning_rate": 1.6372460515724498e-07, "loss": 0.3949, "step": 4210 }, { "epoch": 29.64003511852502, "grad_norm": 1.140625, "learning_rate": 1.541174302853876e-07, "loss": 0.4254, "step": 4220 }, { "epoch": 29.71027216856892, "grad_norm": 1.1171875, "learning_rate": 1.4479630167339554e-07, "loss": 0.3997, "step": 4230 }, { "epoch": 29.780509218612817, "grad_norm": 1.1328125, "learning_rate": 1.3576176953558783e-07, "loss": 0.3875, "step": 4240 }, { "epoch": 29.850746268656717, "grad_norm": 1.234375, "learning_rate": 1.2701436716885897e-07, "loss": 0.4052, "step": 4250 }, { "epoch": 29.920983318700614, "grad_norm": 1.0859375, "learning_rate": 1.1855461092119991e-07, "loss": 0.4227, "step": 4260 }, { "epoch": 29.991220368744514, "grad_norm": 1.1875, "learning_rate": 1.1038300016121883e-07, "loss": 0.4192, "step": 4270 }, { "epoch": 30.06145741878841, "grad_norm": 1.171875, "learning_rate": 1.025000172486651e-07, "loss": 0.4067, "step": 4280 }, { "epoch": 30.131694468832308, "grad_norm": 1.1015625, "learning_rate": 9.490612750595096e-08, "loss": 0.4177, "step": 4290 }, { "epoch": 30.201931518876208, "grad_norm": 0.96875, "learning_rate": 8.760177919069302e-08, "loss": 0.4126, "step": 4300 }, { "epoch": 30.272168568920105, "grad_norm": 0.984375, "learning_rate": 8.058740346924221e-08, "loss": 0.4098, "step": 4310 }, { "epoch": 30.342405618964005, "grad_norm": 1.03125, "learning_rate": 7.386341439124145e-08, "loss": 0.4183, "step": 4320 }, { "epoch": 30.4126426690079, "grad_norm": 0.98828125, "learning_rate": 6.74302088651796e-08, "loss": 0.4249, "step": 4330 }, { "epoch": 30.482879719051798, "grad_norm": 0.96484375, "learning_rate": 6.128816663496296e-08, "loss": 0.4003, "step": 4340 }, { "epoch": 30.5531167690957, "grad_norm": 0.984375, "learning_rate": 5.543765025750103e-08, "loss": 0.4085, "step": 4350 }, { "epoch": 30.623353819139595, "grad_norm": 0.95703125, "learning_rate": 4.987900508130417e-08, "loss": 0.437, "step": 4360 }, { "epoch": 30.693590869183495, "grad_norm": 0.89453125, "learning_rate": 4.461255922609986e-08, "loss": 0.4117, "step": 4370 }, { "epoch": 30.763827919227392, "grad_norm": 0.87109375, "learning_rate": 3.963862356346049e-08, "loss": 0.4054, "step": 4380 }, { "epoch": 30.834064969271292, "grad_norm": 0.93359375, "learning_rate": 3.49574916984563e-08, "loss": 0.3948, "step": 4390 }, { "epoch": 30.90430201931519, "grad_norm": 0.9453125, "learning_rate": 3.056943995232431e-08, "loss": 0.3932, "step": 4400 }, { "epoch": 30.974539069359086, "grad_norm": 0.8671875, "learning_rate": 2.6474727346155194e-08, "loss": 0.426, "step": 4410 }, { "epoch": 31.044776119402986, "grad_norm": 0.89453125, "learning_rate": 2.2673595585605557e-08, "loss": 0.4219, "step": 4420 }, { "epoch": 31.115013169446883, "grad_norm": 0.85546875, "learning_rate": 1.9166269046628215e-08, "loss": 0.426, "step": 4430 }, { "epoch": 31.185250219490783, "grad_norm": 0.87890625, "learning_rate": 1.5952954762230575e-08, "loss": 0.4222, "step": 4440 }, { "epoch": 31.25548726953468, "grad_norm": 0.8359375, "learning_rate": 1.3033842410251074e-08, "loss": 0.4178, "step": 4450 }, { "epoch": 31.325724319578576, "grad_norm": 0.84375, "learning_rate": 1.0409104302164241e-08, "loss": 0.4068, "step": 4460 }, { "epoch": 31.395961369622476, "grad_norm": 0.82421875, "learning_rate": 8.078895372908846e-09, "loss": 0.4126, "step": 4470 }, { "epoch": 31.466198419666373, "grad_norm": 0.83984375, "learning_rate": 6.0433531717424275e-09, "loss": 0.4136, "step": 4480 }, { "epoch": 31.536435469710273, "grad_norm": 0.83984375, "learning_rate": 4.302597854121127e-09, "loss": 0.4096, "step": 4490 }, { "epoch": 31.60667251975417, "grad_norm": 0.85546875, "learning_rate": 2.8567321746092446e-09, "loss": 0.4092, "step": 4500 }, { "epoch": 31.67690956979807, "grad_norm": 0.8515625, "learning_rate": 1.705841480810766e-09, "loss": 0.4072, "step": 4510 }, { "epoch": 31.747146619841967, "grad_norm": 0.8671875, "learning_rate": 8.499937083339404e-10, "loss": 0.4261, "step": 4520 }, { "epoch": 31.817383669885864, "grad_norm": 0.84765625, "learning_rate": 2.892393767800483e-10, "loss": 0.4138, "step": 4530 }, { "epoch": 31.887620719929764, "grad_norm": 0.87109375, "learning_rate": 2.3611586760785566e-11, "loss": 0.3897, "step": 4540 }, { "epoch": 31.91571553994732, "step": 4544, "total_flos": 9.547395325030564e+17, "train_loss": 0.8426483250682203, "train_runtime": 5512.317, "train_samples_per_second": 13.218, "train_steps_per_second": 0.824 } ], "logging_steps": 10, "max_steps": 4544, "num_input_tokens_seen": 0, "num_train_epochs": 32, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.547395325030564e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }