diff --git "a/checkpoint-1802/trainer_state.json" "b/checkpoint-1802/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1802/trainer_state.json" @@ -0,0 +1,12899 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6817364986285822, + "eval_steps": 133, + "global_step": 1802, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003783221413033198, + "grad_norm": 56.01426433664828, + "learning_rate": 1e-08, + "loss": 8.5655, + "step": 1 + }, + { + "epoch": 0.0003783221413033198, + "eval_loss": 8.416223526000977, + "eval_runtime": 26.8642, + "eval_samples_per_second": 32.944, + "eval_steps_per_second": 1.042, + "step": 1 + }, + { + "epoch": 0.0003783221413033198, + "eval_bench_accuracy_arc_challenge": 0.12857142857142856, + "eval_bench_accuracy_hellaswag": 0.025, + "eval_bench_accuracy_mmlu": 0.21739130434782608, + "eval_bench_average_accuracy": 0.1236542443064182, + "eval_bench_loss": 10.19635223924068, + "eval_bench_total_accuracy": 0.1054945054945055, + "step": 1 + }, + { + "epoch": 0.0007566442826066396, + "grad_norm": 52.75063804651517, + "learning_rate": 2e-08, + "loss": 8.4236, + "step": 2 + }, + { + "epoch": 0.0011349664239099593, + "grad_norm": 54.29511008856074, + "learning_rate": 3e-08, + "loss": 8.5128, + "step": 3 + }, + { + "epoch": 0.0015132885652132792, + "grad_norm": 50.84717091006242, + "learning_rate": 4e-08, + "loss": 8.368, + "step": 4 + }, + { + "epoch": 0.0018916107065165989, + "grad_norm": 58.682276590467374, + "learning_rate": 5e-08, + "loss": 8.5171, + "step": 5 + }, + { + "epoch": 0.0022699328478199185, + "grad_norm": 54.19973526319146, + "learning_rate": 6e-08, + "loss": 8.4329, + "step": 6 + }, + { + "epoch": 0.0026482549891232382, + "grad_norm": 52.00177926668044, + "learning_rate": 7e-08, + "loss": 8.4562, + "step": 7 + }, + { + "epoch": 0.0030265771304265584, + "grad_norm": 55.9652762703784, + "learning_rate": 8e-08, + "loss": 8.5017, + "step": 8 + }, + { + "epoch": 0.003404899271729878, + "grad_norm": 54.88105368356734, + "learning_rate": 9e-08, + "loss": 8.471, + "step": 9 + }, + { + "epoch": 0.0037832214130331977, + "grad_norm": 50.22661382824928, + "learning_rate": 1e-07, + "loss": 8.4042, + "step": 10 + }, + { + "epoch": 0.004161543554336518, + "grad_norm": 51.712774406266966, + "learning_rate": 1.0999999999999999e-07, + "loss": 8.4819, + "step": 11 + }, + { + "epoch": 0.004539865695639837, + "grad_norm": 44.20700801792938, + "learning_rate": 1.2e-07, + "loss": 8.2981, + "step": 12 + }, + { + "epoch": 0.004918187836943157, + "grad_norm": 46.914384802444836, + "learning_rate": 1.3e-07, + "loss": 8.4152, + "step": 13 + }, + { + "epoch": 0.0052965099782464765, + "grad_norm": 46.66045652280597, + "learning_rate": 1.4e-07, + "loss": 8.4776, + "step": 14 + }, + { + "epoch": 0.005674832119549797, + "grad_norm": 45.99567071730722, + "learning_rate": 1.5e-07, + "loss": 8.4602, + "step": 15 + }, + { + "epoch": 0.006053154260853117, + "grad_norm": 31.7220420827569, + "learning_rate": 1.6e-07, + "loss": 8.342, + "step": 16 + }, + { + "epoch": 0.006431476402156436, + "grad_norm": 31.79821930177939, + "learning_rate": 1.7000000000000001e-07, + "loss": 8.4073, + "step": 17 + }, + { + "epoch": 0.006809798543459756, + "grad_norm": 34.99852513062481, + "learning_rate": 1.8e-07, + "loss": 8.4475, + "step": 18 + }, + { + "epoch": 0.007188120684763075, + "grad_norm": 32.34312521349501, + "learning_rate": 1.8999999999999998e-07, + "loss": 8.3691, + "step": 19 + }, + { + "epoch": 0.0075664428260663955, + "grad_norm": 28.491575199383966, + "learning_rate": 2e-07, + "loss": 8.2467, + "step": 20 + }, + { + "epoch": 0.007944764967369716, + "grad_norm": 27.788350456113577, + "learning_rate": 2.0999999999999997e-07, + "loss": 8.2619, + "step": 21 + }, + { + "epoch": 0.008323087108673036, + "grad_norm": 23.054768686734494, + "learning_rate": 2.1999999999999998e-07, + "loss": 8.2719, + "step": 22 + }, + { + "epoch": 0.008701409249976354, + "grad_norm": 20.862948070445295, + "learning_rate": 2.3e-07, + "loss": 8.1701, + "step": 23 + }, + { + "epoch": 0.009079731391279674, + "grad_norm": 23.840305973367958, + "learning_rate": 2.4e-07, + "loss": 8.2447, + "step": 24 + }, + { + "epoch": 0.009458053532582994, + "grad_norm": 22.407061285607927, + "learning_rate": 2.5e-07, + "loss": 8.2056, + "step": 25 + }, + { + "epoch": 0.009836375673886314, + "grad_norm": 21.55132867797403, + "learning_rate": 2.6e-07, + "loss": 8.1552, + "step": 26 + }, + { + "epoch": 0.010214697815189635, + "grad_norm": 20.992840710071967, + "learning_rate": 2.7e-07, + "loss": 8.188, + "step": 27 + }, + { + "epoch": 0.010593019956492953, + "grad_norm": 22.39828627182125, + "learning_rate": 2.8e-07, + "loss": 8.1256, + "step": 28 + }, + { + "epoch": 0.010971342097796273, + "grad_norm": 18.46346034557574, + "learning_rate": 2.9e-07, + "loss": 8.0045, + "step": 29 + }, + { + "epoch": 0.011349664239099593, + "grad_norm": 12.704677816631309, + "learning_rate": 3e-07, + "loss": 8.0417, + "step": 30 + }, + { + "epoch": 0.011727986380402913, + "grad_norm": 15.722346563574124, + "learning_rate": 3.1e-07, + "loss": 7.9647, + "step": 31 + }, + { + "epoch": 0.012106308521706233, + "grad_norm": 14.31712037195988, + "learning_rate": 3.2e-07, + "loss": 8.0119, + "step": 32 + }, + { + "epoch": 0.012484630663009552, + "grad_norm": 13.002942588027526, + "learning_rate": 3.3e-07, + "loss": 8.029, + "step": 33 + }, + { + "epoch": 0.012862952804312872, + "grad_norm": 15.303670533896709, + "learning_rate": 3.4000000000000003e-07, + "loss": 7.9847, + "step": 34 + }, + { + "epoch": 0.013241274945616192, + "grad_norm": 12.964425414274471, + "learning_rate": 3.5e-07, + "loss": 8.0026, + "step": 35 + }, + { + "epoch": 0.013619597086919512, + "grad_norm": 19.040688578500415, + "learning_rate": 3.6e-07, + "loss": 8.0397, + "step": 36 + }, + { + "epoch": 0.013997919228222832, + "grad_norm": 14.264527574014561, + "learning_rate": 3.7e-07, + "loss": 7.8472, + "step": 37 + }, + { + "epoch": 0.01437624136952615, + "grad_norm": 14.259878980724565, + "learning_rate": 3.7999999999999996e-07, + "loss": 7.9499, + "step": 38 + }, + { + "epoch": 0.01475456351082947, + "grad_norm": 21.02927607859569, + "learning_rate": 3.8999999999999997e-07, + "loss": 7.8521, + "step": 39 + }, + { + "epoch": 0.015132885652132791, + "grad_norm": 16.308228829260607, + "learning_rate": 4e-07, + "loss": 7.8008, + "step": 40 + }, + { + "epoch": 0.015511207793436111, + "grad_norm": 21.835730681754328, + "learning_rate": 4.0999999999999994e-07, + "loss": 7.7515, + "step": 41 + }, + { + "epoch": 0.01588952993473943, + "grad_norm": 22.548471887636545, + "learning_rate": 4.1999999999999995e-07, + "loss": 7.7859, + "step": 42 + }, + { + "epoch": 0.01626785207604275, + "grad_norm": 23.40758724577002, + "learning_rate": 4.2999999999999996e-07, + "loss": 7.7679, + "step": 43 + }, + { + "epoch": 0.01664617421734607, + "grad_norm": 22.806229545212982, + "learning_rate": 4.3999999999999997e-07, + "loss": 7.7211, + "step": 44 + }, + { + "epoch": 0.01702449635864939, + "grad_norm": 19.930882370057223, + "learning_rate": 4.5e-07, + "loss": 7.7017, + "step": 45 + }, + { + "epoch": 0.017402818499952708, + "grad_norm": 17.292062567746196, + "learning_rate": 4.6e-07, + "loss": 7.7146, + "step": 46 + }, + { + "epoch": 0.01778114064125603, + "grad_norm": 18.070618266890932, + "learning_rate": 4.6999999999999995e-07, + "loss": 7.7119, + "step": 47 + }, + { + "epoch": 0.01815946278255935, + "grad_norm": 16.65539275683302, + "learning_rate": 4.8e-07, + "loss": 7.6178, + "step": 48 + }, + { + "epoch": 0.01853778492386267, + "grad_norm": 19.36073786979339, + "learning_rate": 4.9e-07, + "loss": 7.6387, + "step": 49 + }, + { + "epoch": 0.01891610706516599, + "grad_norm": 22.520853767642276, + "learning_rate": 5e-07, + "loss": 7.6346, + "step": 50 + }, + { + "epoch": 0.01929442920646931, + "grad_norm": 21.674704957397896, + "learning_rate": 5.1e-07, + "loss": 7.5339, + "step": 51 + }, + { + "epoch": 0.01967275134777263, + "grad_norm": 26.85039717209422, + "learning_rate": 5.2e-07, + "loss": 7.3655, + "step": 52 + }, + { + "epoch": 0.02005107348907595, + "grad_norm": 29.784500661137994, + "learning_rate": 5.3e-07, + "loss": 7.3935, + "step": 53 + }, + { + "epoch": 0.02042939563037927, + "grad_norm": 36.73803214173563, + "learning_rate": 5.4e-07, + "loss": 7.3942, + "step": 54 + }, + { + "epoch": 0.02080771777168259, + "grad_norm": 55.998259201380826, + "learning_rate": 5.5e-07, + "loss": 7.3246, + "step": 55 + }, + { + "epoch": 0.021186039912985906, + "grad_norm": 54.6219968094922, + "learning_rate": 5.6e-07, + "loss": 7.2241, + "step": 56 + }, + { + "epoch": 0.021564362054289226, + "grad_norm": 115.48000957700997, + "learning_rate": 5.699999999999999e-07, + "loss": 7.3169, + "step": 57 + }, + { + "epoch": 0.021942684195592546, + "grad_norm": 240.40441808566737, + "learning_rate": 5.8e-07, + "loss": 7.1243, + "step": 58 + }, + { + "epoch": 0.022321006336895866, + "grad_norm": 102.2272021984647, + "learning_rate": 5.9e-07, + "loss": 7.0371, + "step": 59 + }, + { + "epoch": 0.022699328478199186, + "grad_norm": 256.9288700751086, + "learning_rate": 6e-07, + "loss": 6.8907, + "step": 60 + }, + { + "epoch": 0.023077650619502506, + "grad_norm": 131.56800170402965, + "learning_rate": 6.1e-07, + "loss": 6.854, + "step": 61 + }, + { + "epoch": 0.023455972760805827, + "grad_norm": 358.2045690657579, + "learning_rate": 6.2e-07, + "loss": 6.7673, + "step": 62 + }, + { + "epoch": 0.023834294902109147, + "grad_norm": 259.0360488341225, + "learning_rate": 6.3e-07, + "loss": 6.6898, + "step": 63 + }, + { + "epoch": 0.024212617043412467, + "grad_norm": 324.46556421575104, + "learning_rate": 6.4e-07, + "loss": 6.6792, + "step": 64 + }, + { + "epoch": 0.024590939184715787, + "grad_norm": 218.90309813691587, + "learning_rate": 6.5e-07, + "loss": 6.5833, + "step": 65 + }, + { + "epoch": 0.024969261326019104, + "grad_norm": 345.9947605906595, + "learning_rate": 6.6e-07, + "loss": 6.5841, + "step": 66 + }, + { + "epoch": 0.025347583467322424, + "grad_norm": 327.5192852015763, + "learning_rate": 6.7e-07, + "loss": 6.5379, + "step": 67 + }, + { + "epoch": 0.025725905608625744, + "grad_norm": 272.0304082708135, + "learning_rate": 6.800000000000001e-07, + "loss": 6.4003, + "step": 68 + }, + { + "epoch": 0.026104227749929064, + "grad_norm": 224.03062395364572, + "learning_rate": 6.9e-07, + "loss": 6.3064, + "step": 69 + }, + { + "epoch": 0.026482549891232384, + "grad_norm": 326.13516923115037, + "learning_rate": 7e-07, + "loss": 6.2681, + "step": 70 + }, + { + "epoch": 0.026860872032535704, + "grad_norm": 236.06386821993763, + "learning_rate": 7.1e-07, + "loss": 6.1658, + "step": 71 + }, + { + "epoch": 0.027239194173839024, + "grad_norm": 117.09820504079929, + "learning_rate": 7.2e-07, + "loss": 6.1013, + "step": 72 + }, + { + "epoch": 0.027617516315142344, + "grad_norm": 130.77996709008073, + "learning_rate": 7.3e-07, + "loss": 6.0313, + "step": 73 + }, + { + "epoch": 0.027995838456445665, + "grad_norm": 184.1694406122909, + "learning_rate": 7.4e-07, + "loss": 5.9761, + "step": 74 + }, + { + "epoch": 0.028374160597748985, + "grad_norm": 107.41668355609693, + "learning_rate": 7.5e-07, + "loss": 5.8533, + "step": 75 + }, + { + "epoch": 0.0287524827390523, + "grad_norm": 167.17458055865583, + "learning_rate": 7.599999999999999e-07, + "loss": 5.842, + "step": 76 + }, + { + "epoch": 0.02913080488035562, + "grad_norm": 83.1018765552699, + "learning_rate": 7.699999999999999e-07, + "loss": 5.8106, + "step": 77 + }, + { + "epoch": 0.02950912702165894, + "grad_norm": 930.4199949174266, + "learning_rate": 7.799999999999999e-07, + "loss": 5.9417, + "step": 78 + }, + { + "epoch": 0.02988744916296226, + "grad_norm": 344.9243101513464, + "learning_rate": 7.9e-07, + "loss": 5.9401, + "step": 79 + }, + { + "epoch": 0.030265771304265582, + "grad_norm": 203.82832876269842, + "learning_rate": 8e-07, + "loss": 5.8335, + "step": 80 + }, + { + "epoch": 0.030644093445568902, + "grad_norm": 303.4319382071192, + "learning_rate": 8.1e-07, + "loss": 5.6823, + "step": 81 + }, + { + "epoch": 0.031022415586872222, + "grad_norm": 248.28331376619403, + "learning_rate": 8.199999999999999e-07, + "loss": 5.7745, + "step": 82 + }, + { + "epoch": 0.03140073772817554, + "grad_norm": 462.20565983043144, + "learning_rate": 8.299999999999999e-07, + "loss": 5.6386, + "step": 83 + }, + { + "epoch": 0.03177905986947886, + "grad_norm": 194.41981862598635, + "learning_rate": 8.399999999999999e-07, + "loss": 5.5997, + "step": 84 + }, + { + "epoch": 0.03215738201078218, + "grad_norm": 293.3275031516269, + "learning_rate": 8.499999999999999e-07, + "loss": 5.5106, + "step": 85 + }, + { + "epoch": 0.0325357041520855, + "grad_norm": 140.97321101678344, + "learning_rate": 8.599999999999999e-07, + "loss": 5.4563, + "step": 86 + }, + { + "epoch": 0.03291402629338882, + "grad_norm": 180.15140475284437, + "learning_rate": 8.699999999999999e-07, + "loss": 5.4357, + "step": 87 + }, + { + "epoch": 0.03329234843469214, + "grad_norm": 333.3719583206301, + "learning_rate": 8.799999999999999e-07, + "loss": 5.3168, + "step": 88 + }, + { + "epoch": 0.03367067057599546, + "grad_norm": 121.82713201522955, + "learning_rate": 8.9e-07, + "loss": 5.3945, + "step": 89 + }, + { + "epoch": 0.03404899271729878, + "grad_norm": 582.7969295558685, + "learning_rate": 9e-07, + "loss": 5.3863, + "step": 90 + }, + { + "epoch": 0.0344273148586021, + "grad_norm": 217.6434706478821, + "learning_rate": 9.1e-07, + "loss": 5.2662, + "step": 91 + }, + { + "epoch": 0.034805636999905416, + "grad_norm": 374.4674448505233, + "learning_rate": 9.2e-07, + "loss": 5.2355, + "step": 92 + }, + { + "epoch": 0.03518395914120874, + "grad_norm": 218.23465312606612, + "learning_rate": 9.3e-07, + "loss": 5.1486, + "step": 93 + }, + { + "epoch": 0.03556228128251206, + "grad_norm": 98.81927420372956, + "learning_rate": 9.399999999999999e-07, + "loss": 5.0807, + "step": 94 + }, + { + "epoch": 0.03594060342381538, + "grad_norm": 211.12146153212487, + "learning_rate": 9.499999999999999e-07, + "loss": 5.0853, + "step": 95 + }, + { + "epoch": 0.0363189255651187, + "grad_norm": 190.3736868117524, + "learning_rate": 9.6e-07, + "loss": 5.0756, + "step": 96 + }, + { + "epoch": 0.03669724770642202, + "grad_norm": 122.03862248450174, + "learning_rate": 9.7e-07, + "loss": 4.9252, + "step": 97 + }, + { + "epoch": 0.03707556984772534, + "grad_norm": 410.81026410608786, + "learning_rate": 9.8e-07, + "loss": 5.0664, + "step": 98 + }, + { + "epoch": 0.03745389198902866, + "grad_norm": 269.97951212839484, + "learning_rate": 9.9e-07, + "loss": 4.9091, + "step": 99 + }, + { + "epoch": 0.03783221413033198, + "grad_norm": 260.7212338620472, + "learning_rate": 1e-06, + "loss": 4.8821, + "step": 100 + }, + { + "epoch": 0.0382105362716353, + "grad_norm": 165.92539323350238, + "learning_rate": 1.0099999999999999e-06, + "loss": 4.7469, + "step": 101 + }, + { + "epoch": 0.03858885841293862, + "grad_norm": 281.9862388742268, + "learning_rate": 1.02e-06, + "loss": 4.7974, + "step": 102 + }, + { + "epoch": 0.038967180554241934, + "grad_norm": 164.28597977866295, + "learning_rate": 1.0299999999999999e-06, + "loss": 4.6513, + "step": 103 + }, + { + "epoch": 0.03934550269554526, + "grad_norm": 315.7550450358392, + "learning_rate": 1.04e-06, + "loss": 4.7021, + "step": 104 + }, + { + "epoch": 0.039723824836848574, + "grad_norm": 202.93065604656107, + "learning_rate": 1.05e-06, + "loss": 4.5712, + "step": 105 + }, + { + "epoch": 0.0401021469781519, + "grad_norm": 210.26805622762828, + "learning_rate": 1.06e-06, + "loss": 4.6196, + "step": 106 + }, + { + "epoch": 0.040480469119455215, + "grad_norm": 187.14917857744504, + "learning_rate": 1.07e-06, + "loss": 4.5484, + "step": 107 + }, + { + "epoch": 0.04085879126075854, + "grad_norm": 155.43076076847103, + "learning_rate": 1.08e-06, + "loss": 4.4144, + "step": 108 + }, + { + "epoch": 0.041237113402061855, + "grad_norm": 154.98829996861681, + "learning_rate": 1.09e-06, + "loss": 4.3404, + "step": 109 + }, + { + "epoch": 0.04161543554336518, + "grad_norm": 141.595366217918, + "learning_rate": 1.1e-06, + "loss": 4.3111, + "step": 110 + }, + { + "epoch": 0.041993757684668495, + "grad_norm": 134.27240833451944, + "learning_rate": 1.11e-06, + "loss": 4.1952, + "step": 111 + }, + { + "epoch": 0.04237207982597181, + "grad_norm": 95.65375597330166, + "learning_rate": 1.12e-06, + "loss": 4.0809, + "step": 112 + }, + { + "epoch": 0.042750401967275135, + "grad_norm": 109.07352101322023, + "learning_rate": 1.1299999999999998e-06, + "loss": 4.0286, + "step": 113 + }, + { + "epoch": 0.04312872410857845, + "grad_norm": 114.47547920727833, + "learning_rate": 1.1399999999999999e-06, + "loss": 3.9147, + "step": 114 + }, + { + "epoch": 0.043507046249881776, + "grad_norm": 105.22542090856187, + "learning_rate": 1.1499999999999998e-06, + "loss": 3.888, + "step": 115 + }, + { + "epoch": 0.04388536839118509, + "grad_norm": 170.85609503557524, + "learning_rate": 1.16e-06, + "loss": 3.7806, + "step": 116 + }, + { + "epoch": 0.044263690532488416, + "grad_norm": 132.60484964177928, + "learning_rate": 1.1699999999999998e-06, + "loss": 3.7388, + "step": 117 + }, + { + "epoch": 0.04464201267379173, + "grad_norm": 817.4981900388101, + "learning_rate": 1.18e-06, + "loss": 3.8085, + "step": 118 + }, + { + "epoch": 0.045020334815095056, + "grad_norm": 277.2968095396992, + "learning_rate": 1.1899999999999998e-06, + "loss": 3.7519, + "step": 119 + }, + { + "epoch": 0.04539865695639837, + "grad_norm": 242.3036172020571, + "learning_rate": 1.2e-06, + "loss": 3.6811, + "step": 120 + }, + { + "epoch": 0.045776979097701696, + "grad_norm": 147.12958250512, + "learning_rate": 1.2099999999999998e-06, + "loss": 3.5537, + "step": 121 + }, + { + "epoch": 0.04615530123900501, + "grad_norm": 304.91416915276426, + "learning_rate": 1.22e-06, + "loss": 3.5308, + "step": 122 + }, + { + "epoch": 0.04653362338030833, + "grad_norm": 228.8092972324273, + "learning_rate": 1.2299999999999999e-06, + "loss": 3.4916, + "step": 123 + }, + { + "epoch": 0.04691194552161165, + "grad_norm": 197.353832945714, + "learning_rate": 1.24e-06, + "loss": 3.4215, + "step": 124 + }, + { + "epoch": 0.04729026766291497, + "grad_norm": 228.72368996651358, + "learning_rate": 1.2499999999999999e-06, + "loss": 3.371, + "step": 125 + }, + { + "epoch": 0.04766858980421829, + "grad_norm": 164.2731725612326, + "learning_rate": 1.26e-06, + "loss": 3.3909, + "step": 126 + }, + { + "epoch": 0.04804691194552161, + "grad_norm": 186.5826183173996, + "learning_rate": 1.27e-06, + "loss": 3.3104, + "step": 127 + }, + { + "epoch": 0.048425234086824934, + "grad_norm": 139.94786192019586, + "learning_rate": 1.28e-06, + "loss": 3.2437, + "step": 128 + }, + { + "epoch": 0.04880355622812825, + "grad_norm": 170.89837594203516, + "learning_rate": 1.29e-06, + "loss": 3.2145, + "step": 129 + }, + { + "epoch": 0.049181878369431574, + "grad_norm": 124.04755267516651, + "learning_rate": 1.3e-06, + "loss": 3.1275, + "step": 130 + }, + { + "epoch": 0.04956020051073489, + "grad_norm": 112.7475091581948, + "learning_rate": 1.31e-06, + "loss": 3.1021, + "step": 131 + }, + { + "epoch": 0.04993852265203821, + "grad_norm": 483.6676734928997, + "learning_rate": 1.32e-06, + "loss": 3.0251, + "step": 132 + }, + { + "epoch": 0.05031684479334153, + "grad_norm": 131.48794283663062, + "learning_rate": 1.33e-06, + "loss": 3.0474, + "step": 133 + }, + { + "epoch": 0.05031684479334153, + "eval_loss": 3.0402355194091797, + "eval_runtime": 26.8305, + "eval_samples_per_second": 32.985, + "eval_steps_per_second": 1.044, + "step": 133 + }, + { + "epoch": 0.05031684479334153, + "eval_bench_accuracy_arc_challenge": 0.2714285714285714, + "eval_bench_accuracy_hellaswag": 0.22, + "eval_bench_accuracy_mmlu": 0.23478260869565218, + "eval_bench_average_accuracy": 0.2420703933747412, + "eval_bench_loss": 6.577301560786733, + "eval_bench_total_accuracy": 0.23956043956043957, + "step": 133 + }, + { + "epoch": 0.05069516693464485, + "grad_norm": 664.2692049220283, + "learning_rate": 1.34e-06, + "loss": 3.0489, + "step": 134 + }, + { + "epoch": 0.05107348907594817, + "grad_norm": 164.70902413028506, + "learning_rate": 1.35e-06, + "loss": 3.0729, + "step": 135 + }, + { + "epoch": 0.05145181121725149, + "grad_norm": 778.4019675411471, + "learning_rate": 1.3600000000000001e-06, + "loss": 2.9025, + "step": 136 + }, + { + "epoch": 0.05183013335855481, + "grad_norm": 141.784859477734, + "learning_rate": 1.37e-06, + "loss": 2.9153, + "step": 137 + }, + { + "epoch": 0.05220845549985813, + "grad_norm": 815.6337164546584, + "learning_rate": 1.38e-06, + "loss": 2.9767, + "step": 138 + }, + { + "epoch": 0.05258677764116145, + "grad_norm": 387.14144869932585, + "learning_rate": 1.3899999999999998e-06, + "loss": 2.9545, + "step": 139 + }, + { + "epoch": 0.05296509978246477, + "grad_norm": 1286.7446765387322, + "learning_rate": 1.4e-06, + "loss": 2.9779, + "step": 140 + }, + { + "epoch": 0.05334342192376809, + "grad_norm": 170.85639571110613, + "learning_rate": 1.4099999999999998e-06, + "loss": 2.8642, + "step": 141 + }, + { + "epoch": 0.05372174406507141, + "grad_norm": 375.24244542748465, + "learning_rate": 1.42e-06, + "loss": 2.7942, + "step": 142 + }, + { + "epoch": 0.054100066206374725, + "grad_norm": 154.53620941237315, + "learning_rate": 1.4299999999999999e-06, + "loss": 2.7527, + "step": 143 + }, + { + "epoch": 0.05447838834767805, + "grad_norm": 188.97826644064364, + "learning_rate": 1.44e-06, + "loss": 2.7492, + "step": 144 + }, + { + "epoch": 0.054856710488981365, + "grad_norm": 103.19619548153565, + "learning_rate": 1.4499999999999999e-06, + "loss": 2.6708, + "step": 145 + }, + { + "epoch": 0.05523503263028469, + "grad_norm": 125.47407228350237, + "learning_rate": 1.46e-06, + "loss": 2.6737, + "step": 146 + }, + { + "epoch": 0.055613354771588006, + "grad_norm": 71.31808903587059, + "learning_rate": 1.47e-06, + "loss": 2.6175, + "step": 147 + }, + { + "epoch": 0.05599167691289133, + "grad_norm": 158.4470726659215, + "learning_rate": 1.48e-06, + "loss": 2.5772, + "step": 148 + }, + { + "epoch": 0.056369999054194646, + "grad_norm": 213.54517556280484, + "learning_rate": 1.49e-06, + "loss": 2.5397, + "step": 149 + }, + { + "epoch": 0.05674832119549797, + "grad_norm": 94.87447540886092, + "learning_rate": 1.5e-06, + "loss": 2.5007, + "step": 150 + }, + { + "epoch": 0.057126643336801286, + "grad_norm": 140.6331701396571, + "learning_rate": 1.51e-06, + "loss": 2.4911, + "step": 151 + }, + { + "epoch": 0.0575049654781046, + "grad_norm": 71.42229734282893, + "learning_rate": 1.5199999999999998e-06, + "loss": 2.3964, + "step": 152 + }, + { + "epoch": 0.057883287619407926, + "grad_norm": 100.92797990716835, + "learning_rate": 1.53e-06, + "loss": 2.3796, + "step": 153 + }, + { + "epoch": 0.05826160976071124, + "grad_norm": 69.12965458867137, + "learning_rate": 1.5399999999999999e-06, + "loss": 2.4147, + "step": 154 + }, + { + "epoch": 0.058639931902014567, + "grad_norm": 68.31144568523656, + "learning_rate": 1.55e-06, + "loss": 2.285, + "step": 155 + }, + { + "epoch": 0.05901825404331788, + "grad_norm": 63.86407191747168, + "learning_rate": 1.5599999999999999e-06, + "loss": 2.2905, + "step": 156 + }, + { + "epoch": 0.05939657618462121, + "grad_norm": 89.9702991999028, + "learning_rate": 1.57e-06, + "loss": 2.2642, + "step": 157 + }, + { + "epoch": 0.05977489832592452, + "grad_norm": 38.70583191014119, + "learning_rate": 1.58e-06, + "loss": 2.1927, + "step": 158 + }, + { + "epoch": 0.06015322046722785, + "grad_norm": 150.0176513817121, + "learning_rate": 1.59e-06, + "loss": 2.2046, + "step": 159 + }, + { + "epoch": 0.060531542608531164, + "grad_norm": 85.38752600608713, + "learning_rate": 1.6e-06, + "loss": 2.1777, + "step": 160 + }, + { + "epoch": 0.06090986474983449, + "grad_norm": 108.46382637315519, + "learning_rate": 1.61e-06, + "loss": 2.0947, + "step": 161 + }, + { + "epoch": 0.061288186891137804, + "grad_norm": 72.33751976980996, + "learning_rate": 1.62e-06, + "loss": 2.1455, + "step": 162 + }, + { + "epoch": 0.06166650903244112, + "grad_norm": 254.7588636023186, + "learning_rate": 1.6299999999999999e-06, + "loss": 2.0967, + "step": 163 + }, + { + "epoch": 0.062044831173744444, + "grad_norm": 143.3727693773649, + "learning_rate": 1.6399999999999998e-06, + "loss": 2.0443, + "step": 164 + }, + { + "epoch": 0.06242315331504776, + "grad_norm": 672.6219381081797, + "learning_rate": 1.6499999999999999e-06, + "loss": 2.2139, + "step": 165 + }, + { + "epoch": 0.06280147545635108, + "grad_norm": 89.69156829747156, + "learning_rate": 1.6599999999999998e-06, + "loss": 2.0433, + "step": 166 + }, + { + "epoch": 0.06317979759765441, + "grad_norm": 47.054580203479496, + "learning_rate": 1.6699999999999999e-06, + "loss": 1.9805, + "step": 167 + }, + { + "epoch": 0.06355811973895772, + "grad_norm": 53.90193516042071, + "learning_rate": 1.6799999999999998e-06, + "loss": 1.8572, + "step": 168 + }, + { + "epoch": 0.06393644188026104, + "grad_norm": 55.351958687059195, + "learning_rate": 1.69e-06, + "loss": 1.8879, + "step": 169 + }, + { + "epoch": 0.06431476402156436, + "grad_norm": 30.956994176305464, + "learning_rate": 1.6999999999999998e-06, + "loss": 1.8335, + "step": 170 + }, + { + "epoch": 0.06469308616286769, + "grad_norm": 81.23380900946358, + "learning_rate": 1.71e-06, + "loss": 1.8101, + "step": 171 + }, + { + "epoch": 0.065071408304171, + "grad_norm": 46.43733520396148, + "learning_rate": 1.7199999999999998e-06, + "loss": 1.8177, + "step": 172 + }, + { + "epoch": 0.06544973044547432, + "grad_norm": 46.90830376181402, + "learning_rate": 1.73e-06, + "loss": 1.7543, + "step": 173 + }, + { + "epoch": 0.06582805258677764, + "grad_norm": 69.19161149417722, + "learning_rate": 1.7399999999999999e-06, + "loss": 1.7712, + "step": 174 + }, + { + "epoch": 0.06620637472808096, + "grad_norm": 46.99692135130498, + "learning_rate": 1.75e-06, + "loss": 1.7728, + "step": 175 + }, + { + "epoch": 0.06658469686938429, + "grad_norm": 85.68605330443327, + "learning_rate": 1.7599999999999999e-06, + "loss": 1.7186, + "step": 176 + }, + { + "epoch": 0.0669630190106876, + "grad_norm": 48.57963404347663, + "learning_rate": 1.77e-06, + "loss": 1.6979, + "step": 177 + }, + { + "epoch": 0.06734134115199092, + "grad_norm": 111.44637207499896, + "learning_rate": 1.78e-06, + "loss": 1.734, + "step": 178 + }, + { + "epoch": 0.06771966329329424, + "grad_norm": 83.89157732570692, + "learning_rate": 1.79e-06, + "loss": 1.6947, + "step": 179 + }, + { + "epoch": 0.06809798543459757, + "grad_norm": 50.66006983599147, + "learning_rate": 1.8e-06, + "loss": 1.6385, + "step": 180 + }, + { + "epoch": 0.06847630757590088, + "grad_norm": 47.32959657636825, + "learning_rate": 1.81e-06, + "loss": 1.5717, + "step": 181 + }, + { + "epoch": 0.0688546297172042, + "grad_norm": 71.70671420810187, + "learning_rate": 1.82e-06, + "loss": 1.5167, + "step": 182 + }, + { + "epoch": 0.06923295185850752, + "grad_norm": 48.11379424928171, + "learning_rate": 1.83e-06, + "loss": 1.5992, + "step": 183 + }, + { + "epoch": 0.06961127399981083, + "grad_norm": 54.01731463177801, + "learning_rate": 1.84e-06, + "loss": 1.5217, + "step": 184 + }, + { + "epoch": 0.06998959614111416, + "grad_norm": 39.52299725178149, + "learning_rate": 1.85e-06, + "loss": 1.5009, + "step": 185 + }, + { + "epoch": 0.07036791828241748, + "grad_norm": 63.37058186080119, + "learning_rate": 1.86e-06, + "loss": 1.5853, + "step": 186 + }, + { + "epoch": 0.0707462404237208, + "grad_norm": 44.5116426583779, + "learning_rate": 1.87e-06, + "loss": 1.4865, + "step": 187 + }, + { + "epoch": 0.07112456256502411, + "grad_norm": 40.56409454228496, + "learning_rate": 1.8799999999999998e-06, + "loss": 1.4732, + "step": 188 + }, + { + "epoch": 0.07150288470632744, + "grad_norm": 31.923505092753718, + "learning_rate": 1.89e-06, + "loss": 1.4519, + "step": 189 + }, + { + "epoch": 0.07188120684763076, + "grad_norm": 34.50709112981039, + "learning_rate": 1.8999999999999998e-06, + "loss": 1.4205, + "step": 190 + }, + { + "epoch": 0.07225952898893408, + "grad_norm": 22.09682402936458, + "learning_rate": 1.91e-06, + "loss": 1.38, + "step": 191 + }, + { + "epoch": 0.0726378511302374, + "grad_norm": 25.3767669172789, + "learning_rate": 1.92e-06, + "loss": 1.3879, + "step": 192 + }, + { + "epoch": 0.07301617327154071, + "grad_norm": 29.51813748066488, + "learning_rate": 1.9299999999999997e-06, + "loss": 1.3506, + "step": 193 + }, + { + "epoch": 0.07339449541284404, + "grad_norm": 21.76501410574832, + "learning_rate": 1.94e-06, + "loss": 1.3237, + "step": 194 + }, + { + "epoch": 0.07377281755414736, + "grad_norm": 20.74781891582525, + "learning_rate": 1.95e-06, + "loss": 1.3639, + "step": 195 + }, + { + "epoch": 0.07415113969545067, + "grad_norm": 27.66733930317673, + "learning_rate": 1.96e-06, + "loss": 1.3061, + "step": 196 + }, + { + "epoch": 0.07452946183675399, + "grad_norm": 21.087698250942193, + "learning_rate": 1.9699999999999998e-06, + "loss": 1.375, + "step": 197 + }, + { + "epoch": 0.07490778397805732, + "grad_norm": 22.065927379036225, + "learning_rate": 1.98e-06, + "loss": 1.3219, + "step": 198 + }, + { + "epoch": 0.07528610611936064, + "grad_norm": 37.132637966902955, + "learning_rate": 1.99e-06, + "loss": 1.2424, + "step": 199 + }, + { + "epoch": 0.07566442826066395, + "grad_norm": 20.85100061426098, + "learning_rate": 2e-06, + "loss": 1.2973, + "step": 200 + }, + { + "epoch": 0.07604275040196727, + "grad_norm": 19.748272671220768, + "learning_rate": 2.01e-06, + "loss": 1.2371, + "step": 201 + }, + { + "epoch": 0.0764210725432706, + "grad_norm": 24.073543088140834, + "learning_rate": 2.0199999999999997e-06, + "loss": 1.252, + "step": 202 + }, + { + "epoch": 0.07679939468457392, + "grad_norm": 34.22154387867275, + "learning_rate": 2.0299999999999996e-06, + "loss": 1.2911, + "step": 203 + }, + { + "epoch": 0.07717771682587724, + "grad_norm": 16.511181722757403, + "learning_rate": 2.04e-06, + "loss": 1.2321, + "step": 204 + }, + { + "epoch": 0.07755603896718055, + "grad_norm": 12.872226386234452, + "learning_rate": 2.05e-06, + "loss": 1.1767, + "step": 205 + }, + { + "epoch": 0.07793436110848387, + "grad_norm": 15.436365816346868, + "learning_rate": 2.0599999999999998e-06, + "loss": 1.1955, + "step": 206 + }, + { + "epoch": 0.0783126832497872, + "grad_norm": 12.062107586682833, + "learning_rate": 2.0699999999999997e-06, + "loss": 1.1799, + "step": 207 + }, + { + "epoch": 0.07869100539109052, + "grad_norm": 49.38765930014822, + "learning_rate": 2.08e-06, + "loss": 1.1762, + "step": 208 + }, + { + "epoch": 0.07906932753239383, + "grad_norm": 23.38441549316206, + "learning_rate": 2.09e-06, + "loss": 1.1831, + "step": 209 + }, + { + "epoch": 0.07944764967369715, + "grad_norm": 22.28035230836217, + "learning_rate": 2.1e-06, + "loss": 1.1858, + "step": 210 + }, + { + "epoch": 0.07982597181500048, + "grad_norm": 43.05138932031075, + "learning_rate": 2.1099999999999997e-06, + "loss": 1.2106, + "step": 211 + }, + { + "epoch": 0.0802042939563038, + "grad_norm": 22.919581037837645, + "learning_rate": 2.12e-06, + "loss": 1.1872, + "step": 212 + }, + { + "epoch": 0.08058261609760711, + "grad_norm": 106.27528509092721, + "learning_rate": 2.13e-06, + "loss": 1.1807, + "step": 213 + }, + { + "epoch": 0.08096093823891043, + "grad_norm": 62.766496496977574, + "learning_rate": 2.14e-06, + "loss": 1.1932, + "step": 214 + }, + { + "epoch": 0.08133926038021375, + "grad_norm": 66.54674237816508, + "learning_rate": 2.1499999999999997e-06, + "loss": 1.1328, + "step": 215 + }, + { + "epoch": 0.08171758252151708, + "grad_norm": 66.81453157766589, + "learning_rate": 2.16e-06, + "loss": 1.1613, + "step": 216 + }, + { + "epoch": 0.0820959046628204, + "grad_norm": 35.57901795776919, + "learning_rate": 2.17e-06, + "loss": 1.1821, + "step": 217 + }, + { + "epoch": 0.08247422680412371, + "grad_norm": 10.30900211340774, + "learning_rate": 2.18e-06, + "loss": 1.1023, + "step": 218 + }, + { + "epoch": 0.08285254894542703, + "grad_norm": 29.533042017371177, + "learning_rate": 2.1899999999999998e-06, + "loss": 1.1669, + "step": 219 + }, + { + "epoch": 0.08323087108673036, + "grad_norm": 22.47096674174166, + "learning_rate": 2.2e-06, + "loss": 1.1612, + "step": 220 + }, + { + "epoch": 0.08360919322803367, + "grad_norm": 13.583126551810135, + "learning_rate": 2.21e-06, + "loss": 1.0867, + "step": 221 + }, + { + "epoch": 0.08398751536933699, + "grad_norm": 9.91479302526445, + "learning_rate": 2.22e-06, + "loss": 1.0916, + "step": 222 + }, + { + "epoch": 0.0843658375106403, + "grad_norm": 11.269431287067826, + "learning_rate": 2.23e-06, + "loss": 1.1264, + "step": 223 + }, + { + "epoch": 0.08474415965194362, + "grad_norm": 7.7465735801712805, + "learning_rate": 2.24e-06, + "loss": 1.136, + "step": 224 + }, + { + "epoch": 0.08512248179324695, + "grad_norm": 8.687635755465738, + "learning_rate": 2.25e-06, + "loss": 1.0803, + "step": 225 + }, + { + "epoch": 0.08550080393455027, + "grad_norm": 11.628437205512707, + "learning_rate": 2.2599999999999995e-06, + "loss": 1.1646, + "step": 226 + }, + { + "epoch": 0.08587912607585359, + "grad_norm": 9.268721256498573, + "learning_rate": 2.27e-06, + "loss": 1.1015, + "step": 227 + }, + { + "epoch": 0.0862574482171569, + "grad_norm": 6.187500026884083, + "learning_rate": 2.2799999999999998e-06, + "loss": 1.0662, + "step": 228 + }, + { + "epoch": 0.08663577035846023, + "grad_norm": 8.62028463677054, + "learning_rate": 2.29e-06, + "loss": 1.052, + "step": 229 + }, + { + "epoch": 0.08701409249976355, + "grad_norm": 9.674790887814405, + "learning_rate": 2.2999999999999996e-06, + "loss": 1.0978, + "step": 230 + }, + { + "epoch": 0.08739241464106687, + "grad_norm": 8.326705028491853, + "learning_rate": 2.31e-06, + "loss": 1.0184, + "step": 231 + }, + { + "epoch": 0.08777073678237018, + "grad_norm": 7.318027642173224, + "learning_rate": 2.32e-06, + "loss": 1.0509, + "step": 232 + }, + { + "epoch": 0.0881490589236735, + "grad_norm": 12.85041462496061, + "learning_rate": 2.33e-06, + "loss": 1.0556, + "step": 233 + }, + { + "epoch": 0.08852738106497683, + "grad_norm": 9.328207044954535, + "learning_rate": 2.3399999999999996e-06, + "loss": 1.0816, + "step": 234 + }, + { + "epoch": 0.08890570320628015, + "grad_norm": 7.022150416570471, + "learning_rate": 2.35e-06, + "loss": 1.0466, + "step": 235 + }, + { + "epoch": 0.08928402534758346, + "grad_norm": 8.86057501782776, + "learning_rate": 2.36e-06, + "loss": 1.04, + "step": 236 + }, + { + "epoch": 0.08966234748888678, + "grad_norm": 9.072613041437753, + "learning_rate": 2.37e-06, + "loss": 1.039, + "step": 237 + }, + { + "epoch": 0.09004066963019011, + "grad_norm": 11.561198612520238, + "learning_rate": 2.3799999999999997e-06, + "loss": 1.025, + "step": 238 + }, + { + "epoch": 0.09041899177149343, + "grad_norm": 5.796410505813014, + "learning_rate": 2.39e-06, + "loss": 1.0007, + "step": 239 + }, + { + "epoch": 0.09079731391279675, + "grad_norm": 13.451590053171754, + "learning_rate": 2.4e-06, + "loss": 1.0051, + "step": 240 + }, + { + "epoch": 0.09117563605410006, + "grad_norm": 8.917436837849364, + "learning_rate": 2.4100000000000002e-06, + "loss": 1.0866, + "step": 241 + }, + { + "epoch": 0.09155395819540339, + "grad_norm": 4.792174398814023, + "learning_rate": 2.4199999999999997e-06, + "loss": 1.0022, + "step": 242 + }, + { + "epoch": 0.09193228033670671, + "grad_norm": 6.487991210049911, + "learning_rate": 2.43e-06, + "loss": 0.976, + "step": 243 + }, + { + "epoch": 0.09231060247801003, + "grad_norm": 9.885175529767102, + "learning_rate": 2.44e-06, + "loss": 1.0038, + "step": 244 + }, + { + "epoch": 0.09268892461931334, + "grad_norm": 5.6067215406645134, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.0559, + "step": 245 + }, + { + "epoch": 0.09306724676061666, + "grad_norm": 14.632584569195519, + "learning_rate": 2.4599999999999997e-06, + "loss": 1.0229, + "step": 246 + }, + { + "epoch": 0.09344556890191999, + "grad_norm": 6.406784955802286, + "learning_rate": 2.47e-06, + "loss": 1.0252, + "step": 247 + }, + { + "epoch": 0.0938238910432233, + "grad_norm": 7.547314965665046, + "learning_rate": 2.48e-06, + "loss": 0.9838, + "step": 248 + }, + { + "epoch": 0.09420221318452662, + "grad_norm": 6.44920071987235, + "learning_rate": 2.4900000000000003e-06, + "loss": 0.9664, + "step": 249 + }, + { + "epoch": 0.09458053532582994, + "grad_norm": 5.4686676744513765, + "learning_rate": 2.4999999999999998e-06, + "loss": 0.9781, + "step": 250 + }, + { + "epoch": 0.09495885746713327, + "grad_norm": 5.951563165398436, + "learning_rate": 2.5099999999999997e-06, + "loss": 0.9953, + "step": 251 + }, + { + "epoch": 0.09533717960843659, + "grad_norm": 5.7316411610727105, + "learning_rate": 2.52e-06, + "loss": 1.0431, + "step": 252 + }, + { + "epoch": 0.0957155017497399, + "grad_norm": 4.90373215304178, + "learning_rate": 2.5299999999999995e-06, + "loss": 0.9738, + "step": 253 + }, + { + "epoch": 0.09609382389104322, + "grad_norm": 4.018027173598048, + "learning_rate": 2.54e-06, + "loss": 1.0113, + "step": 254 + }, + { + "epoch": 0.09647214603234654, + "grad_norm": 6.869682846334475, + "learning_rate": 2.5499999999999997e-06, + "loss": 0.9812, + "step": 255 + }, + { + "epoch": 0.09685046817364987, + "grad_norm": 5.959477622367862, + "learning_rate": 2.56e-06, + "loss": 1.0031, + "step": 256 + }, + { + "epoch": 0.09722879031495318, + "grad_norm": 4.231167141984737, + "learning_rate": 2.5699999999999995e-06, + "loss": 1.0319, + "step": 257 + }, + { + "epoch": 0.0976071124562565, + "grad_norm": 6.714523011394094, + "learning_rate": 2.58e-06, + "loss": 0.9851, + "step": 258 + }, + { + "epoch": 0.09798543459755982, + "grad_norm": 6.020515136070658, + "learning_rate": 2.5899999999999998e-06, + "loss": 0.9782, + "step": 259 + }, + { + "epoch": 0.09836375673886315, + "grad_norm": 4.681331319695956, + "learning_rate": 2.6e-06, + "loss": 1.014, + "step": 260 + }, + { + "epoch": 0.09874207888016646, + "grad_norm": 7.4305112606450905, + "learning_rate": 2.6099999999999996e-06, + "loss": 0.9751, + "step": 261 + }, + { + "epoch": 0.09912040102146978, + "grad_norm": 3.819753600694035, + "learning_rate": 2.62e-06, + "loss": 0.968, + "step": 262 + }, + { + "epoch": 0.0994987231627731, + "grad_norm": 5.789415532330102, + "learning_rate": 2.63e-06, + "loss": 0.9529, + "step": 263 + }, + { + "epoch": 0.09987704530407641, + "grad_norm": 4.539898474801753, + "learning_rate": 2.64e-06, + "loss": 0.978, + "step": 264 + }, + { + "epoch": 0.10025536744537974, + "grad_norm": 3.2389391663703306, + "learning_rate": 2.6499999999999996e-06, + "loss": 0.9833, + "step": 265 + }, + { + "epoch": 0.10063368958668306, + "grad_norm": 5.4718084763112556, + "learning_rate": 2.66e-06, + "loss": 0.9714, + "step": 266 + }, + { + "epoch": 0.10063368958668306, + "eval_loss": 0.9851981997489929, + "eval_runtime": 27.2115, + "eval_samples_per_second": 32.523, + "eval_steps_per_second": 1.029, + "step": 266 + }, + { + "epoch": 0.10063368958668306, + "eval_bench_accuracy_arc_challenge": 0.29285714285714287, + "eval_bench_accuracy_hellaswag": 0.215, + "eval_bench_accuracy_mmlu": 0.3826086956521739, + "eval_bench_average_accuracy": 0.29682194616977225, + "eval_bench_loss": 6.3663490696957235, + "eval_bench_total_accuracy": 0.2813186813186813, + "step": 266 + }, + { + "epoch": 0.10101201172798638, + "grad_norm": 4.736473735176666, + "learning_rate": 2.67e-06, + "loss": 1.0245, + "step": 267 + }, + { + "epoch": 0.1013903338692897, + "grad_norm": 2.927740836124029, + "learning_rate": 2.68e-06, + "loss": 0.9906, + "step": 268 + }, + { + "epoch": 0.10176865601059303, + "grad_norm": 4.622383990826824, + "learning_rate": 2.6899999999999997e-06, + "loss": 0.9679, + "step": 269 + }, + { + "epoch": 0.10214697815189634, + "grad_norm": 3.8746535383849836, + "learning_rate": 2.7e-06, + "loss": 0.9211, + "step": 270 + }, + { + "epoch": 0.10252530029319966, + "grad_norm": 4.361727224982868, + "learning_rate": 2.71e-06, + "loss": 0.9779, + "step": 271 + }, + { + "epoch": 0.10290362243450298, + "grad_norm": 3.2847575684010795, + "learning_rate": 2.7200000000000002e-06, + "loss": 0.969, + "step": 272 + }, + { + "epoch": 0.1032819445758063, + "grad_norm": 2.946259099361567, + "learning_rate": 2.7299999999999997e-06, + "loss": 0.9374, + "step": 273 + }, + { + "epoch": 0.10366026671710962, + "grad_norm": 3.5163454504687364, + "learning_rate": 2.74e-06, + "loss": 0.9809, + "step": 274 + }, + { + "epoch": 0.10403858885841294, + "grad_norm": 4.1448737340815045, + "learning_rate": 2.75e-06, + "loss": 0.9816, + "step": 275 + }, + { + "epoch": 0.10441691099971626, + "grad_norm": 3.345900089125294, + "learning_rate": 2.76e-06, + "loss": 0.94, + "step": 276 + }, + { + "epoch": 0.10479523314101957, + "grad_norm": 4.756231356260067, + "learning_rate": 2.7699999999999997e-06, + "loss": 0.9948, + "step": 277 + }, + { + "epoch": 0.1051735552823229, + "grad_norm": 3.395795830645774, + "learning_rate": 2.7799999999999996e-06, + "loss": 0.9852, + "step": 278 + }, + { + "epoch": 0.10555187742362622, + "grad_norm": 3.7361359597792085, + "learning_rate": 2.79e-06, + "loss": 0.9705, + "step": 279 + }, + { + "epoch": 0.10593019956492954, + "grad_norm": 2.9021780470974536, + "learning_rate": 2.8e-06, + "loss": 0.9517, + "step": 280 + }, + { + "epoch": 0.10630852170623285, + "grad_norm": 3.3140561096891408, + "learning_rate": 2.8099999999999998e-06, + "loss": 0.9518, + "step": 281 + }, + { + "epoch": 0.10668684384753618, + "grad_norm": 4.955772041684827, + "learning_rate": 2.8199999999999997e-06, + "loss": 0.949, + "step": 282 + }, + { + "epoch": 0.1070651659888395, + "grad_norm": 2.7495737336593447, + "learning_rate": 2.83e-06, + "loss": 0.9637, + "step": 283 + }, + { + "epoch": 0.10744348813014282, + "grad_norm": 5.5808851538998745, + "learning_rate": 2.84e-06, + "loss": 0.9149, + "step": 284 + }, + { + "epoch": 0.10782181027144613, + "grad_norm": 3.2461608503776582, + "learning_rate": 2.85e-06, + "loss": 0.9562, + "step": 285 + }, + { + "epoch": 0.10820013241274945, + "grad_norm": 3.016464443847612, + "learning_rate": 2.8599999999999997e-06, + "loss": 0.9635, + "step": 286 + }, + { + "epoch": 0.10857845455405278, + "grad_norm": 3.1653672708590936, + "learning_rate": 2.87e-06, + "loss": 1.0064, + "step": 287 + }, + { + "epoch": 0.1089567766953561, + "grad_norm": 2.1243065072255907, + "learning_rate": 2.88e-06, + "loss": 0.9279, + "step": 288 + }, + { + "epoch": 0.10933509883665941, + "grad_norm": 3.4080159282806712, + "learning_rate": 2.89e-06, + "loss": 0.9759, + "step": 289 + }, + { + "epoch": 0.10971342097796273, + "grad_norm": 2.610557409129719, + "learning_rate": 2.8999999999999998e-06, + "loss": 0.9787, + "step": 290 + }, + { + "epoch": 0.11009174311926606, + "grad_norm": 2.2107636510154176, + "learning_rate": 2.91e-06, + "loss": 0.9296, + "step": 291 + }, + { + "epoch": 0.11047006526056938, + "grad_norm": 4.245908140335627, + "learning_rate": 2.92e-06, + "loss": 0.9273, + "step": 292 + }, + { + "epoch": 0.1108483874018727, + "grad_norm": 2.895847446673922, + "learning_rate": 2.93e-06, + "loss": 0.9383, + "step": 293 + }, + { + "epoch": 0.11122670954317601, + "grad_norm": 2.704339168426421, + "learning_rate": 2.94e-06, + "loss": 0.9153, + "step": 294 + }, + { + "epoch": 0.11160503168447933, + "grad_norm": 2.701813364341608, + "learning_rate": 2.95e-06, + "loss": 0.9299, + "step": 295 + }, + { + "epoch": 0.11198335382578266, + "grad_norm": 2.948359459278812, + "learning_rate": 2.96e-06, + "loss": 0.9702, + "step": 296 + }, + { + "epoch": 0.11236167596708597, + "grad_norm": 3.377595158199111, + "learning_rate": 2.97e-06, + "loss": 0.9554, + "step": 297 + }, + { + "epoch": 0.11273999810838929, + "grad_norm": 2.5213378940105415, + "learning_rate": 2.98e-06, + "loss": 0.9312, + "step": 298 + }, + { + "epoch": 0.11311832024969261, + "grad_norm": 4.796315482527464, + "learning_rate": 2.99e-06, + "loss": 0.9294, + "step": 299 + }, + { + "epoch": 0.11349664239099594, + "grad_norm": 2.161917946044457, + "learning_rate": 3e-06, + "loss": 0.9603, + "step": 300 + }, + { + "epoch": 0.11387496453229926, + "grad_norm": 4.2290402280104145, + "learning_rate": 3.0099999999999996e-06, + "loss": 0.9079, + "step": 301 + }, + { + "epoch": 0.11425328667360257, + "grad_norm": 2.7667893528721867, + "learning_rate": 3.02e-06, + "loss": 0.953, + "step": 302 + }, + { + "epoch": 0.11463160881490589, + "grad_norm": 9.065359561610483, + "learning_rate": 3.03e-06, + "loss": 0.9891, + "step": 303 + }, + { + "epoch": 0.1150099309562092, + "grad_norm": 3.629194869203107, + "learning_rate": 3.0399999999999997e-06, + "loss": 0.9434, + "step": 304 + }, + { + "epoch": 0.11538825309751254, + "grad_norm": 3.2434020969746182, + "learning_rate": 3.0499999999999996e-06, + "loss": 0.9289, + "step": 305 + }, + { + "epoch": 0.11576657523881585, + "grad_norm": 3.266784032620147, + "learning_rate": 3.06e-06, + "loss": 0.941, + "step": 306 + }, + { + "epoch": 0.11614489738011917, + "grad_norm": 2.2252097372145627, + "learning_rate": 3.07e-06, + "loss": 0.9197, + "step": 307 + }, + { + "epoch": 0.11652321952142249, + "grad_norm": 2.2906797269719683, + "learning_rate": 3.0799999999999997e-06, + "loss": 0.9278, + "step": 308 + }, + { + "epoch": 0.11690154166272582, + "grad_norm": 2.899028879345415, + "learning_rate": 3.0899999999999996e-06, + "loss": 0.9177, + "step": 309 + }, + { + "epoch": 0.11727986380402913, + "grad_norm": 1.9374921205584867, + "learning_rate": 3.1e-06, + "loss": 0.9049, + "step": 310 + }, + { + "epoch": 0.11765818594533245, + "grad_norm": 1.90674843142603, + "learning_rate": 3.11e-06, + "loss": 0.9563, + "step": 311 + }, + { + "epoch": 0.11803650808663577, + "grad_norm": 1.878846884674951, + "learning_rate": 3.1199999999999998e-06, + "loss": 0.9139, + "step": 312 + }, + { + "epoch": 0.1184148302279391, + "grad_norm": 1.8411547245015762, + "learning_rate": 3.1299999999999997e-06, + "loss": 0.947, + "step": 313 + }, + { + "epoch": 0.11879315236924241, + "grad_norm": 1.6495211524540856, + "learning_rate": 3.14e-06, + "loss": 0.8994, + "step": 314 + }, + { + "epoch": 0.11917147451054573, + "grad_norm": 1.979339834494396, + "learning_rate": 3.15e-06, + "loss": 0.9425, + "step": 315 + }, + { + "epoch": 0.11954979665184905, + "grad_norm": 1.6881739152797177, + "learning_rate": 3.16e-06, + "loss": 0.9079, + "step": 316 + }, + { + "epoch": 0.11992811879315236, + "grad_norm": 1.7476621404963093, + "learning_rate": 3.1699999999999997e-06, + "loss": 0.9342, + "step": 317 + }, + { + "epoch": 0.1203064409344557, + "grad_norm": 1.7825714782443438, + "learning_rate": 3.18e-06, + "loss": 0.9736, + "step": 318 + }, + { + "epoch": 0.12068476307575901, + "grad_norm": 1.7904157984440023, + "learning_rate": 3.19e-06, + "loss": 0.8904, + "step": 319 + }, + { + "epoch": 0.12106308521706233, + "grad_norm": 1.8488826023075036, + "learning_rate": 3.2e-06, + "loss": 0.9374, + "step": 320 + }, + { + "epoch": 0.12144140735836564, + "grad_norm": 1.7466001202181465, + "learning_rate": 3.2099999999999998e-06, + "loss": 0.9506, + "step": 321 + }, + { + "epoch": 0.12181972949966897, + "grad_norm": 1.9022275763429817, + "learning_rate": 3.22e-06, + "loss": 0.9452, + "step": 322 + }, + { + "epoch": 0.12219805164097229, + "grad_norm": 1.62671365850624, + "learning_rate": 3.23e-06, + "loss": 0.9063, + "step": 323 + }, + { + "epoch": 0.12257637378227561, + "grad_norm": 1.537323535673334, + "learning_rate": 3.24e-06, + "loss": 0.892, + "step": 324 + }, + { + "epoch": 0.12295469592357892, + "grad_norm": 1.6088280546082747, + "learning_rate": 3.25e-06, + "loss": 0.9055, + "step": 325 + }, + { + "epoch": 0.12333301806488224, + "grad_norm": 1.754864511511676, + "learning_rate": 3.2599999999999997e-06, + "loss": 0.9982, + "step": 326 + }, + { + "epoch": 0.12371134020618557, + "grad_norm": 1.7110520395582398, + "learning_rate": 3.27e-06, + "loss": 0.8869, + "step": 327 + }, + { + "epoch": 0.12408966234748889, + "grad_norm": 2.2210658284362976, + "learning_rate": 3.2799999999999995e-06, + "loss": 0.9, + "step": 328 + }, + { + "epoch": 0.1244679844887922, + "grad_norm": 2.0718951481844337, + "learning_rate": 3.29e-06, + "loss": 0.9474, + "step": 329 + }, + { + "epoch": 0.12484630663009552, + "grad_norm": 1.6483777638825354, + "learning_rate": 3.2999999999999997e-06, + "loss": 0.9193, + "step": 330 + }, + { + "epoch": 0.12522462877139884, + "grad_norm": 1.8408500351694481, + "learning_rate": 3.31e-06, + "loss": 0.9331, + "step": 331 + }, + { + "epoch": 0.12560295091270215, + "grad_norm": 1.5886399601274244, + "learning_rate": 3.3199999999999996e-06, + "loss": 0.9181, + "step": 332 + }, + { + "epoch": 0.1259812730540055, + "grad_norm": 1.5415700759277726, + "learning_rate": 3.33e-06, + "loss": 0.9078, + "step": 333 + }, + { + "epoch": 0.12635959519530882, + "grad_norm": 1.5699378541238653, + "learning_rate": 3.3399999999999998e-06, + "loss": 0.9415, + "step": 334 + }, + { + "epoch": 0.12673791733661213, + "grad_norm": 1.4355378270145513, + "learning_rate": 3.35e-06, + "loss": 0.9328, + "step": 335 + }, + { + "epoch": 0.12711623947791545, + "grad_norm": 1.4472036059899498, + "learning_rate": 3.3599999999999996e-06, + "loss": 0.9235, + "step": 336 + }, + { + "epoch": 0.12749456161921877, + "grad_norm": 1.493466705425371, + "learning_rate": 3.37e-06, + "loss": 0.917, + "step": 337 + }, + { + "epoch": 0.12787288376052208, + "grad_norm": 1.725222957788955, + "learning_rate": 3.38e-06, + "loss": 0.9229, + "step": 338 + }, + { + "epoch": 0.1282512059018254, + "grad_norm": 1.829546156665469, + "learning_rate": 3.39e-06, + "loss": 0.9199, + "step": 339 + }, + { + "epoch": 0.12862952804312872, + "grad_norm": 1.562404556848645, + "learning_rate": 3.3999999999999996e-06, + "loss": 0.9258, + "step": 340 + }, + { + "epoch": 0.12900785018443203, + "grad_norm": 1.5503184849860385, + "learning_rate": 3.41e-06, + "loss": 0.9056, + "step": 341 + }, + { + "epoch": 0.12938617232573538, + "grad_norm": 2.093643266825353, + "learning_rate": 3.42e-06, + "loss": 0.9151, + "step": 342 + }, + { + "epoch": 0.1297644944670387, + "grad_norm": 1.5470351610527242, + "learning_rate": 3.43e-06, + "loss": 0.9295, + "step": 343 + }, + { + "epoch": 0.130142816608342, + "grad_norm": 1.6415927498606424, + "learning_rate": 3.4399999999999997e-06, + "loss": 0.9227, + "step": 344 + }, + { + "epoch": 0.13052113874964533, + "grad_norm": 1.501364967749395, + "learning_rate": 3.45e-06, + "loss": 0.9196, + "step": 345 + }, + { + "epoch": 0.13089946089094864, + "grad_norm": 1.4667926955996313, + "learning_rate": 3.46e-06, + "loss": 0.9875, + "step": 346 + }, + { + "epoch": 0.13127778303225196, + "grad_norm": 1.4015397895960147, + "learning_rate": 3.4700000000000002e-06, + "loss": 0.9174, + "step": 347 + }, + { + "epoch": 0.13165610517355528, + "grad_norm": 1.6317901839112616, + "learning_rate": 3.4799999999999997e-06, + "loss": 0.9022, + "step": 348 + }, + { + "epoch": 0.1320344273148586, + "grad_norm": 1.5495030641920218, + "learning_rate": 3.49e-06, + "loss": 0.9056, + "step": 349 + }, + { + "epoch": 0.1324127494561619, + "grad_norm": 1.4169162437828007, + "learning_rate": 3.5e-06, + "loss": 0.9125, + "step": 350 + }, + { + "epoch": 0.13279107159746525, + "grad_norm": 1.5269510878366184, + "learning_rate": 3.5099999999999994e-06, + "loss": 0.9325, + "step": 351 + }, + { + "epoch": 0.13316939373876857, + "grad_norm": 1.4845731562408333, + "learning_rate": 3.5199999999999998e-06, + "loss": 0.9119, + "step": 352 + }, + { + "epoch": 0.1335477158800719, + "grad_norm": 1.2998342684154016, + "learning_rate": 3.5299999999999997e-06, + "loss": 0.8989, + "step": 353 + }, + { + "epoch": 0.1339260380213752, + "grad_norm": 1.4867481861923495, + "learning_rate": 3.54e-06, + "loss": 0.9201, + "step": 354 + }, + { + "epoch": 0.13430436016267852, + "grad_norm": 1.4212824059163913, + "learning_rate": 3.5499999999999995e-06, + "loss": 0.9288, + "step": 355 + }, + { + "epoch": 0.13468268230398184, + "grad_norm": 1.3588961307618976, + "learning_rate": 3.56e-06, + "loss": 0.9117, + "step": 356 + }, + { + "epoch": 0.13506100444528515, + "grad_norm": 1.4097313807539793, + "learning_rate": 3.5699999999999997e-06, + "loss": 0.9139, + "step": 357 + }, + { + "epoch": 0.13543932658658847, + "grad_norm": 1.490782064831479, + "learning_rate": 3.58e-06, + "loss": 0.938, + "step": 358 + }, + { + "epoch": 0.1358176487278918, + "grad_norm": 1.2930048652835795, + "learning_rate": 3.5899999999999995e-06, + "loss": 0.9023, + "step": 359 + }, + { + "epoch": 0.13619597086919513, + "grad_norm": 1.824182436515982, + "learning_rate": 3.6e-06, + "loss": 0.9343, + "step": 360 + }, + { + "epoch": 0.13657429301049845, + "grad_norm": 1.4837219324976698, + "learning_rate": 3.6099999999999997e-06, + "loss": 0.9418, + "step": 361 + }, + { + "epoch": 0.13695261515180177, + "grad_norm": 1.3718729917310193, + "learning_rate": 3.62e-06, + "loss": 0.9231, + "step": 362 + }, + { + "epoch": 0.13733093729310508, + "grad_norm": 1.3644818822127356, + "learning_rate": 3.6299999999999995e-06, + "loss": 0.9093, + "step": 363 + }, + { + "epoch": 0.1377092594344084, + "grad_norm": 1.4274881326706697, + "learning_rate": 3.64e-06, + "loss": 0.9077, + "step": 364 + }, + { + "epoch": 0.13808758157571172, + "grad_norm": 1.3169195252885812, + "learning_rate": 3.6499999999999998e-06, + "loss": 0.8772, + "step": 365 + }, + { + "epoch": 0.13846590371701503, + "grad_norm": 1.3505673564506786, + "learning_rate": 3.66e-06, + "loss": 0.8729, + "step": 366 + }, + { + "epoch": 0.13884422585831835, + "grad_norm": 1.3728815922981648, + "learning_rate": 3.6699999999999996e-06, + "loss": 0.91, + "step": 367 + }, + { + "epoch": 0.13922254799962167, + "grad_norm": 1.4225979847364822, + "learning_rate": 3.68e-06, + "loss": 0.8862, + "step": 368 + }, + { + "epoch": 0.139600870140925, + "grad_norm": 1.3363118705656714, + "learning_rate": 3.69e-06, + "loss": 0.9322, + "step": 369 + }, + { + "epoch": 0.13997919228222833, + "grad_norm": 1.318614371056809, + "learning_rate": 3.7e-06, + "loss": 0.926, + "step": 370 + }, + { + "epoch": 0.14035751442353164, + "grad_norm": 1.330484253084181, + "learning_rate": 3.7099999999999996e-06, + "loss": 0.9456, + "step": 371 + }, + { + "epoch": 0.14073583656483496, + "grad_norm": 1.3318506320691512, + "learning_rate": 3.72e-06, + "loss": 0.9017, + "step": 372 + }, + { + "epoch": 0.14111415870613828, + "grad_norm": 1.3759434761704756, + "learning_rate": 3.73e-06, + "loss": 0.8881, + "step": 373 + }, + { + "epoch": 0.1414924808474416, + "grad_norm": 1.3957619030952084, + "learning_rate": 3.74e-06, + "loss": 0.9121, + "step": 374 + }, + { + "epoch": 0.1418708029887449, + "grad_norm": 1.3427799016571502, + "learning_rate": 3.7499999999999997e-06, + "loss": 0.9106, + "step": 375 + }, + { + "epoch": 0.14224912513004823, + "grad_norm": 44.30080368963616, + "learning_rate": 3.7599999999999996e-06, + "loss": 0.8911, + "step": 376 + }, + { + "epoch": 0.14262744727135154, + "grad_norm": 2.2669972347416127, + "learning_rate": 3.77e-06, + "loss": 0.933, + "step": 377 + }, + { + "epoch": 0.1430057694126549, + "grad_norm": 1.4829201626961606, + "learning_rate": 3.78e-06, + "loss": 0.901, + "step": 378 + }, + { + "epoch": 0.1433840915539582, + "grad_norm": 4.064663928049432, + "learning_rate": 3.7899999999999997e-06, + "loss": 0.8942, + "step": 379 + }, + { + "epoch": 0.14376241369526152, + "grad_norm": 1.8169275430345828, + "learning_rate": 3.7999999999999996e-06, + "loss": 0.88, + "step": 380 + }, + { + "epoch": 0.14414073583656484, + "grad_norm": 1.903257571166488, + "learning_rate": 3.81e-06, + "loss": 0.9286, + "step": 381 + }, + { + "epoch": 0.14451905797786815, + "grad_norm": 1.662557610937424, + "learning_rate": 3.82e-06, + "loss": 0.8947, + "step": 382 + }, + { + "epoch": 0.14489738011917147, + "grad_norm": 1.3504615763712993, + "learning_rate": 3.83e-06, + "loss": 0.9081, + "step": 383 + }, + { + "epoch": 0.1452757022604748, + "grad_norm": 2.083053759282353, + "learning_rate": 3.84e-06, + "loss": 0.9229, + "step": 384 + }, + { + "epoch": 0.1456540244017781, + "grad_norm": 1.5724819369725127, + "learning_rate": 3.8499999999999996e-06, + "loss": 0.9019, + "step": 385 + }, + { + "epoch": 0.14603234654308142, + "grad_norm": 1.2833291006046557, + "learning_rate": 3.8599999999999995e-06, + "loss": 0.8943, + "step": 386 + }, + { + "epoch": 0.14641066868438476, + "grad_norm": 1.6810072820257926, + "learning_rate": 3.87e-06, + "loss": 0.9469, + "step": 387 + }, + { + "epoch": 0.14678899082568808, + "grad_norm": 1.462137670239198, + "learning_rate": 3.88e-06, + "loss": 0.885, + "step": 388 + }, + { + "epoch": 0.1471673129669914, + "grad_norm": 1.3544773507596952, + "learning_rate": 3.89e-06, + "loss": 0.9223, + "step": 389 + }, + { + "epoch": 0.14754563510829471, + "grad_norm": 1.305788748108731, + "learning_rate": 3.9e-06, + "loss": 0.9085, + "step": 390 + }, + { + "epoch": 0.14792395724959803, + "grad_norm": 1.4728433076805145, + "learning_rate": 3.91e-06, + "loss": 0.9111, + "step": 391 + }, + { + "epoch": 0.14830227939090135, + "grad_norm": 1.3023289374881166, + "learning_rate": 3.92e-06, + "loss": 0.9082, + "step": 392 + }, + { + "epoch": 0.14868060153220466, + "grad_norm": 1.528856941817902, + "learning_rate": 3.93e-06, + "loss": 0.8583, + "step": 393 + }, + { + "epoch": 0.14905892367350798, + "grad_norm": 1.2279025499674738, + "learning_rate": 3.9399999999999995e-06, + "loss": 0.8943, + "step": 394 + }, + { + "epoch": 0.1494372458148113, + "grad_norm": 1.5480907504889059, + "learning_rate": 3.95e-06, + "loss": 0.858, + "step": 395 + }, + { + "epoch": 0.14981556795611464, + "grad_norm": 1.3146063824478018, + "learning_rate": 3.96e-06, + "loss": 0.8618, + "step": 396 + }, + { + "epoch": 0.15019389009741796, + "grad_norm": 1.334057857690303, + "learning_rate": 3.97e-06, + "loss": 0.9243, + "step": 397 + }, + { + "epoch": 0.15057221223872128, + "grad_norm": 1.3866128005645164, + "learning_rate": 3.98e-06, + "loss": 0.9274, + "step": 398 + }, + { + "epoch": 0.1509505343800246, + "grad_norm": 1.2955294219171367, + "learning_rate": 3.99e-06, + "loss": 0.9173, + "step": 399 + }, + { + "epoch": 0.1509505343800246, + "eval_loss": 0.9027320742607117, + "eval_runtime": 27.0581, + "eval_samples_per_second": 32.707, + "eval_steps_per_second": 1.035, + "step": 399 + }, + { + "epoch": 0.1509505343800246, + "eval_bench_accuracy_arc_challenge": 0.24285714285714285, + "eval_bench_accuracy_hellaswag": 0.24, + "eval_bench_accuracy_mmlu": 0.3739130434782609, + "eval_bench_average_accuracy": 0.2855900621118012, + "eval_bench_loss": 4.885084721080044, + "eval_bench_total_accuracy": 0.27472527472527475, + "step": 399 + }, + { + "epoch": 0.1513288565213279, + "grad_norm": 1.4867956471987611, + "learning_rate": 4e-06, + "loss": 0.8442, + "step": 400 + }, + { + "epoch": 0.15170717866263123, + "grad_norm": 1.4418482940385888, + "learning_rate": 4.01e-06, + "loss": 0.8851, + "step": 401 + }, + { + "epoch": 0.15208550080393454, + "grad_norm": 1.2367816437008439, + "learning_rate": 4.02e-06, + "loss": 0.9016, + "step": 402 + }, + { + "epoch": 0.15246382294523786, + "grad_norm": 1.3381669970164036, + "learning_rate": 4.03e-06, + "loss": 0.8967, + "step": 403 + }, + { + "epoch": 0.1528421450865412, + "grad_norm": 1.178040710244701, + "learning_rate": 4.0399999999999994e-06, + "loss": 0.9052, + "step": 404 + }, + { + "epoch": 0.15322046722784452, + "grad_norm": 1.354680203607332, + "learning_rate": 4.049999999999999e-06, + "loss": 0.916, + "step": 405 + }, + { + "epoch": 0.15359878936914784, + "grad_norm": 1.2478760852613116, + "learning_rate": 4.059999999999999e-06, + "loss": 0.8918, + "step": 406 + }, + { + "epoch": 0.15397711151045115, + "grad_norm": 1.3580886429686791, + "learning_rate": 4.07e-06, + "loss": 0.8769, + "step": 407 + }, + { + "epoch": 0.15435543365175447, + "grad_norm": 1.4849252692119392, + "learning_rate": 4.08e-06, + "loss": 0.8985, + "step": 408 + }, + { + "epoch": 0.1547337557930578, + "grad_norm": 1.234446053198778, + "learning_rate": 4.09e-06, + "loss": 0.8681, + "step": 409 + }, + { + "epoch": 0.1551120779343611, + "grad_norm": 1.4907001456714162, + "learning_rate": 4.1e-06, + "loss": 0.9035, + "step": 410 + }, + { + "epoch": 0.15549040007566442, + "grad_norm": 1.1935520171507346, + "learning_rate": 4.1100000000000005e-06, + "loss": 0.8939, + "step": 411 + }, + { + "epoch": 0.15586872221696774, + "grad_norm": 1.3431797561411594, + "learning_rate": 4.1199999999999995e-06, + "loss": 0.8892, + "step": 412 + }, + { + "epoch": 0.15624704435827108, + "grad_norm": 1.1858701499867044, + "learning_rate": 4.129999999999999e-06, + "loss": 0.8952, + "step": 413 + }, + { + "epoch": 0.1566253664995744, + "grad_norm": 1.3160462921208504, + "learning_rate": 4.139999999999999e-06, + "loss": 0.9104, + "step": 414 + }, + { + "epoch": 0.15700368864087771, + "grad_norm": 1.205303163621962, + "learning_rate": 4.15e-06, + "loss": 0.8989, + "step": 415 + }, + { + "epoch": 0.15738201078218103, + "grad_norm": 1.2116662309617274, + "learning_rate": 4.16e-06, + "loss": 0.9178, + "step": 416 + }, + { + "epoch": 0.15776033292348435, + "grad_norm": 1.1758637546414648, + "learning_rate": 4.17e-06, + "loss": 0.8792, + "step": 417 + }, + { + "epoch": 0.15813865506478766, + "grad_norm": 1.2552462548629688, + "learning_rate": 4.18e-06, + "loss": 0.8981, + "step": 418 + }, + { + "epoch": 0.15851697720609098, + "grad_norm": 1.206264514397755, + "learning_rate": 4.1900000000000005e-06, + "loss": 0.9058, + "step": 419 + }, + { + "epoch": 0.1588952993473943, + "grad_norm": 1.2231014501429258, + "learning_rate": 4.2e-06, + "loss": 0.899, + "step": 420 + }, + { + "epoch": 0.15927362148869761, + "grad_norm": 1.2120070273790158, + "learning_rate": 4.2099999999999995e-06, + "loss": 0.8449, + "step": 421 + }, + { + "epoch": 0.15965194363000096, + "grad_norm": 1.225434870357441, + "learning_rate": 4.219999999999999e-06, + "loss": 0.8925, + "step": 422 + }, + { + "epoch": 0.16003026577130428, + "grad_norm": 1.2700536143173544, + "learning_rate": 4.23e-06, + "loss": 0.8948, + "step": 423 + }, + { + "epoch": 0.1604085879126076, + "grad_norm": 1.327617668860312, + "learning_rate": 4.24e-06, + "loss": 0.8808, + "step": 424 + }, + { + "epoch": 0.1607869100539109, + "grad_norm": 1.2286005573930583, + "learning_rate": 4.25e-06, + "loss": 0.8885, + "step": 425 + }, + { + "epoch": 0.16116523219521423, + "grad_norm": 1.265158345195646, + "learning_rate": 4.26e-06, + "loss": 0.8973, + "step": 426 + }, + { + "epoch": 0.16154355433651754, + "grad_norm": 1.2113247771231779, + "learning_rate": 4.27e-06, + "loss": 0.88, + "step": 427 + }, + { + "epoch": 0.16192187647782086, + "grad_norm": 1.1981923822069018, + "learning_rate": 4.28e-06, + "loss": 0.8812, + "step": 428 + }, + { + "epoch": 0.16230019861912418, + "grad_norm": 1.269210905108754, + "learning_rate": 4.29e-06, + "loss": 0.951, + "step": 429 + }, + { + "epoch": 0.1626785207604275, + "grad_norm": 1.270040077896289, + "learning_rate": 4.2999999999999995e-06, + "loss": 0.8502, + "step": 430 + }, + { + "epoch": 0.16305684290173084, + "grad_norm": 1.2459835235482208, + "learning_rate": 4.309999999999999e-06, + "loss": 0.9249, + "step": 431 + }, + { + "epoch": 0.16343516504303415, + "grad_norm": 1.2065849160511677, + "learning_rate": 4.32e-06, + "loss": 0.8569, + "step": 432 + }, + { + "epoch": 0.16381348718433747, + "grad_norm": 1.3240957525319628, + "learning_rate": 4.33e-06, + "loss": 0.8378, + "step": 433 + }, + { + "epoch": 0.1641918093256408, + "grad_norm": 1.308494624204772, + "learning_rate": 4.34e-06, + "loss": 0.8853, + "step": 434 + }, + { + "epoch": 0.1645701314669441, + "grad_norm": 1.2876226830148083, + "learning_rate": 4.35e-06, + "loss": 0.8999, + "step": 435 + }, + { + "epoch": 0.16494845360824742, + "grad_norm": 1.3895344761060464, + "learning_rate": 4.36e-06, + "loss": 0.8995, + "step": 436 + }, + { + "epoch": 0.16532677574955074, + "grad_norm": 1.2397074052657744, + "learning_rate": 4.37e-06, + "loss": 0.8787, + "step": 437 + }, + { + "epoch": 0.16570509789085405, + "grad_norm": 1.2286411029399464, + "learning_rate": 4.3799999999999996e-06, + "loss": 0.8968, + "step": 438 + }, + { + "epoch": 0.16608342003215737, + "grad_norm": 1.231038186520652, + "learning_rate": 4.3899999999999995e-06, + "loss": 0.8781, + "step": 439 + }, + { + "epoch": 0.16646174217346071, + "grad_norm": 1.2138487844408843, + "learning_rate": 4.4e-06, + "loss": 0.8698, + "step": 440 + }, + { + "epoch": 0.16684006431476403, + "grad_norm": 1.3027744892443913, + "learning_rate": 4.41e-06, + "loss": 0.9253, + "step": 441 + }, + { + "epoch": 0.16721838645606735, + "grad_norm": 1.2467659827353952, + "learning_rate": 4.42e-06, + "loss": 0.9121, + "step": 442 + }, + { + "epoch": 0.16759670859737066, + "grad_norm": 1.1589200132022377, + "learning_rate": 4.43e-06, + "loss": 0.8803, + "step": 443 + }, + { + "epoch": 0.16797503073867398, + "grad_norm": 1.2200621136986902, + "learning_rate": 4.44e-06, + "loss": 0.9079, + "step": 444 + }, + { + "epoch": 0.1683533528799773, + "grad_norm": 1.1747935123553643, + "learning_rate": 4.45e-06, + "loss": 0.8766, + "step": 445 + }, + { + "epoch": 0.1687316750212806, + "grad_norm": 1.1865214460906777, + "learning_rate": 4.46e-06, + "loss": 0.9068, + "step": 446 + }, + { + "epoch": 0.16910999716258393, + "grad_norm": 1.2579950961305297, + "learning_rate": 4.4699999999999996e-06, + "loss": 0.8815, + "step": 447 + }, + { + "epoch": 0.16948831930388725, + "grad_norm": 1.226665097174107, + "learning_rate": 4.48e-06, + "loss": 0.9327, + "step": 448 + }, + { + "epoch": 0.1698666414451906, + "grad_norm": 1.1931395850546989, + "learning_rate": 4.49e-06, + "loss": 0.8796, + "step": 449 + }, + { + "epoch": 0.1702449635864939, + "grad_norm": 1.202501530652917, + "learning_rate": 4.5e-06, + "loss": 0.8931, + "step": 450 + }, + { + "epoch": 0.17062328572779722, + "grad_norm": 1.1807025967685065, + "learning_rate": 4.509999999999999e-06, + "loss": 0.8887, + "step": 451 + }, + { + "epoch": 0.17100160786910054, + "grad_norm": 1.219222521929812, + "learning_rate": 4.519999999999999e-06, + "loss": 0.8999, + "step": 452 + }, + { + "epoch": 0.17137993001040386, + "grad_norm": 1.234613051649134, + "learning_rate": 4.53e-06, + "loss": 0.8439, + "step": 453 + }, + { + "epoch": 0.17175825215170717, + "grad_norm": 1.2268814413232634, + "learning_rate": 4.54e-06, + "loss": 0.8679, + "step": 454 + }, + { + "epoch": 0.1721365742930105, + "grad_norm": 1.2687792576706662, + "learning_rate": 4.55e-06, + "loss": 0.9137, + "step": 455 + }, + { + "epoch": 0.1725148964343138, + "grad_norm": 1.259597511238193, + "learning_rate": 4.5599999999999995e-06, + "loss": 0.8929, + "step": 456 + }, + { + "epoch": 0.17289321857561712, + "grad_norm": 1.1601209722807053, + "learning_rate": 4.57e-06, + "loss": 0.8989, + "step": 457 + }, + { + "epoch": 0.17327154071692047, + "grad_norm": 1.1337571129482695, + "learning_rate": 4.58e-06, + "loss": 0.8867, + "step": 458 + }, + { + "epoch": 0.17364986285822379, + "grad_norm": 1.2315099804928107, + "learning_rate": 4.589999999999999e-06, + "loss": 0.8766, + "step": 459 + }, + { + "epoch": 0.1740281849995271, + "grad_norm": 1.1590598116825013, + "learning_rate": 4.599999999999999e-06, + "loss": 0.8996, + "step": 460 + }, + { + "epoch": 0.17440650714083042, + "grad_norm": 1.2223724961641853, + "learning_rate": 4.61e-06, + "loss": 0.8885, + "step": 461 + }, + { + "epoch": 0.17478482928213374, + "grad_norm": 1.2563659855924223, + "learning_rate": 4.62e-06, + "loss": 0.9316, + "step": 462 + }, + { + "epoch": 0.17516315142343705, + "grad_norm": 1.2219308373205684, + "learning_rate": 4.63e-06, + "loss": 0.9402, + "step": 463 + }, + { + "epoch": 0.17554147356474037, + "grad_norm": 1.2529933281060042, + "learning_rate": 4.64e-06, + "loss": 0.8425, + "step": 464 + }, + { + "epoch": 0.17591979570604369, + "grad_norm": 1.1519152308086784, + "learning_rate": 4.65e-06, + "loss": 0.8335, + "step": 465 + }, + { + "epoch": 0.176298117847347, + "grad_norm": 1.1993447663063845, + "learning_rate": 4.66e-06, + "loss": 0.8423, + "step": 466 + }, + { + "epoch": 0.17667643998865035, + "grad_norm": 1.2393551988442821, + "learning_rate": 4.669999999999999e-06, + "loss": 0.8766, + "step": 467 + }, + { + "epoch": 0.17705476212995366, + "grad_norm": 1.1568166146377072, + "learning_rate": 4.679999999999999e-06, + "loss": 0.913, + "step": 468 + }, + { + "epoch": 0.17743308427125698, + "grad_norm": 1.2535994832897241, + "learning_rate": 4.69e-06, + "loss": 0.8611, + "step": 469 + }, + { + "epoch": 0.1778114064125603, + "grad_norm": 1.2581510292576754, + "learning_rate": 4.7e-06, + "loss": 0.852, + "step": 470 + }, + { + "epoch": 0.1781897285538636, + "grad_norm": 1.185843568335289, + "learning_rate": 4.71e-06, + "loss": 0.8712, + "step": 471 + }, + { + "epoch": 0.17856805069516693, + "grad_norm": 1.1762961141384334, + "learning_rate": 4.72e-06, + "loss": 0.8848, + "step": 472 + }, + { + "epoch": 0.17894637283647025, + "grad_norm": 1.2378038953878985, + "learning_rate": 4.7300000000000005e-06, + "loss": 0.89, + "step": 473 + }, + { + "epoch": 0.17932469497777356, + "grad_norm": 1.2303598909876003, + "learning_rate": 4.74e-06, + "loss": 0.9019, + "step": 474 + }, + { + "epoch": 0.1797030171190769, + "grad_norm": 1.3055168080029775, + "learning_rate": 4.749999999999999e-06, + "loss": 0.8886, + "step": 475 + }, + { + "epoch": 0.18008133926038022, + "grad_norm": 1.263816208541402, + "learning_rate": 4.759999999999999e-06, + "loss": 0.8934, + "step": 476 + }, + { + "epoch": 0.18045966140168354, + "grad_norm": 1.2304160263194301, + "learning_rate": 4.769999999999999e-06, + "loss": 0.8334, + "step": 477 + }, + { + "epoch": 0.18083798354298686, + "grad_norm": 1.16427739617554, + "learning_rate": 4.78e-06, + "loss": 0.8933, + "step": 478 + }, + { + "epoch": 0.18121630568429017, + "grad_norm": 1.2928340654165948, + "learning_rate": 4.79e-06, + "loss": 0.9091, + "step": 479 + }, + { + "epoch": 0.1815946278255935, + "grad_norm": 1.2237270548636812, + "learning_rate": 4.8e-06, + "loss": 0.8894, + "step": 480 + }, + { + "epoch": 0.1819729499668968, + "grad_norm": 1.2973745239107866, + "learning_rate": 4.81e-06, + "loss": 0.8827, + "step": 481 + }, + { + "epoch": 0.18235127210820012, + "grad_norm": 1.2192171355443393, + "learning_rate": 4.8200000000000004e-06, + "loss": 0.842, + "step": 482 + }, + { + "epoch": 0.18272959424950344, + "grad_norm": 1.1825464816429376, + "learning_rate": 4.8299999999999995e-06, + "loss": 0.8974, + "step": 483 + }, + { + "epoch": 0.18310791639080679, + "grad_norm": 1.2357877717915002, + "learning_rate": 4.839999999999999e-06, + "loss": 0.8713, + "step": 484 + }, + { + "epoch": 0.1834862385321101, + "grad_norm": 1.2724832467234655, + "learning_rate": 4.849999999999999e-06, + "loss": 0.8916, + "step": 485 + }, + { + "epoch": 0.18386456067341342, + "grad_norm": 1.2402819428437333, + "learning_rate": 4.86e-06, + "loss": 0.9006, + "step": 486 + }, + { + "epoch": 0.18424288281471674, + "grad_norm": 1.253080289206958, + "learning_rate": 4.87e-06, + "loss": 0.8552, + "step": 487 + }, + { + "epoch": 0.18462120495602005, + "grad_norm": 1.20114987062819, + "learning_rate": 4.88e-06, + "loss": 0.8646, + "step": 488 + }, + { + "epoch": 0.18499952709732337, + "grad_norm": 1.2698388666443412, + "learning_rate": 4.89e-06, + "loss": 0.9058, + "step": 489 + }, + { + "epoch": 0.18537784923862669, + "grad_norm": 1.255138008138629, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.9045, + "step": 490 + }, + { + "epoch": 0.18575617137993, + "grad_norm": 1.173366935458501, + "learning_rate": 4.91e-06, + "loss": 0.8653, + "step": 491 + }, + { + "epoch": 0.18613449352123332, + "grad_norm": 1.2544859383454867, + "learning_rate": 4.9199999999999995e-06, + "loss": 0.8577, + "step": 492 + }, + { + "epoch": 0.18651281566253666, + "grad_norm": 1.1732808685881084, + "learning_rate": 4.929999999999999e-06, + "loss": 0.8551, + "step": 493 + }, + { + "epoch": 0.18689113780383998, + "grad_norm": 1.2265764031917046, + "learning_rate": 4.94e-06, + "loss": 0.8726, + "step": 494 + }, + { + "epoch": 0.1872694599451433, + "grad_norm": 1.2234524388802157, + "learning_rate": 4.95e-06, + "loss": 0.8833, + "step": 495 + }, + { + "epoch": 0.1876477820864466, + "grad_norm": 1.2488343163013593, + "learning_rate": 4.96e-06, + "loss": 0.8704, + "step": 496 + }, + { + "epoch": 0.18802610422774993, + "grad_norm": 1.1667370629188312, + "learning_rate": 4.97e-06, + "loss": 0.8637, + "step": 497 + }, + { + "epoch": 0.18840442636905325, + "grad_norm": 1.1300202443780525, + "learning_rate": 4.980000000000001e-06, + "loss": 0.8222, + "step": 498 + }, + { + "epoch": 0.18878274851035656, + "grad_norm": 1.2105094043051028, + "learning_rate": 4.99e-06, + "loss": 0.8172, + "step": 499 + }, + { + "epoch": 0.18916107065165988, + "grad_norm": 1.147109513607525, + "learning_rate": 4.9999999999999996e-06, + "loss": 0.8718, + "step": 500 + }, + { + "epoch": 0.1895393927929632, + "grad_norm": 1.186254501579871, + "learning_rate": 5.0099999999999995e-06, + "loss": 0.8672, + "step": 501 + }, + { + "epoch": 0.18991771493426654, + "grad_norm": 1.1921470006777564, + "learning_rate": 5.019999999999999e-06, + "loss": 0.8984, + "step": 502 + }, + { + "epoch": 0.19029603707556986, + "grad_norm": 1.204441588496536, + "learning_rate": 5.03e-06, + "loss": 0.8933, + "step": 503 + }, + { + "epoch": 0.19067435921687317, + "grad_norm": 1.176488402672726, + "learning_rate": 5.04e-06, + "loss": 0.8179, + "step": 504 + }, + { + "epoch": 0.1910526813581765, + "grad_norm": 1.1591890939118275, + "learning_rate": 5.05e-06, + "loss": 0.8994, + "step": 505 + }, + { + "epoch": 0.1914310034994798, + "grad_norm": 1.1844780849489716, + "learning_rate": 5.059999999999999e-06, + "loss": 0.9002, + "step": 506 + }, + { + "epoch": 0.19180932564078312, + "grad_norm": 1.1340897482563235, + "learning_rate": 5.07e-06, + "loss": 0.8629, + "step": 507 + }, + { + "epoch": 0.19218764778208644, + "grad_norm": 1.242695087632576, + "learning_rate": 5.08e-06, + "loss": 0.893, + "step": 508 + }, + { + "epoch": 0.19256596992338976, + "grad_norm": 1.21618537349293, + "learning_rate": 5.0899999999999995e-06, + "loss": 0.8874, + "step": 509 + }, + { + "epoch": 0.19294429206469307, + "grad_norm": 1.2081469798752933, + "learning_rate": 5.0999999999999995e-06, + "loss": 0.8672, + "step": 510 + }, + { + "epoch": 0.19332261420599642, + "grad_norm": 1.1486757711757551, + "learning_rate": 5.11e-06, + "loss": 0.8445, + "step": 511 + }, + { + "epoch": 0.19370093634729973, + "grad_norm": 1.160176382154706, + "learning_rate": 5.12e-06, + "loss": 0.8689, + "step": 512 + }, + { + "epoch": 0.19407925848860305, + "grad_norm": 1.1842115955863446, + "learning_rate": 5.13e-06, + "loss": 0.887, + "step": 513 + }, + { + "epoch": 0.19445758062990637, + "grad_norm": 1.1622953235550992, + "learning_rate": 5.139999999999999e-06, + "loss": 0.8891, + "step": 514 + }, + { + "epoch": 0.19483590277120968, + "grad_norm": 1.2278834007146076, + "learning_rate": 5.15e-06, + "loss": 0.9542, + "step": 515 + }, + { + "epoch": 0.195214224912513, + "grad_norm": 1.1688897803585725, + "learning_rate": 5.16e-06, + "loss": 0.842, + "step": 516 + }, + { + "epoch": 0.19559254705381632, + "grad_norm": 1.169443235508946, + "learning_rate": 5.17e-06, + "loss": 0.926, + "step": 517 + }, + { + "epoch": 0.19597086919511963, + "grad_norm": 1.190101722103473, + "learning_rate": 5.1799999999999995e-06, + "loss": 0.9012, + "step": 518 + }, + { + "epoch": 0.19634919133642295, + "grad_norm": 1.1139938105404836, + "learning_rate": 5.19e-06, + "loss": 0.8355, + "step": 519 + }, + { + "epoch": 0.1967275134777263, + "grad_norm": 1.1644272208548614, + "learning_rate": 5.2e-06, + "loss": 0.8508, + "step": 520 + }, + { + "epoch": 0.1971058356190296, + "grad_norm": 1.188005585447595, + "learning_rate": 5.21e-06, + "loss": 0.8884, + "step": 521 + }, + { + "epoch": 0.19748415776033293, + "grad_norm": 1.162381129570287, + "learning_rate": 5.219999999999999e-06, + "loss": 0.8494, + "step": 522 + }, + { + "epoch": 0.19786247990163625, + "grad_norm": 1.1379792376540319, + "learning_rate": 5.23e-06, + "loss": 0.8427, + "step": 523 + }, + { + "epoch": 0.19824080204293956, + "grad_norm": 1.163441860737916, + "learning_rate": 5.24e-06, + "loss": 0.8831, + "step": 524 + }, + { + "epoch": 0.19861912418424288, + "grad_norm": 1.1604063632172568, + "learning_rate": 5.25e-06, + "loss": 0.8898, + "step": 525 + }, + { + "epoch": 0.1989974463255462, + "grad_norm": 1.1325670759545932, + "learning_rate": 5.26e-06, + "loss": 0.8735, + "step": 526 + }, + { + "epoch": 0.1993757684668495, + "grad_norm": 1.1790821072251718, + "learning_rate": 5.2699999999999995e-06, + "loss": 0.8343, + "step": 527 + }, + { + "epoch": 0.19975409060815283, + "grad_norm": 1.1453742135606537, + "learning_rate": 5.28e-06, + "loss": 0.8566, + "step": 528 + }, + { + "epoch": 0.20013241274945617, + "grad_norm": 1.13296207138768, + "learning_rate": 5.29e-06, + "loss": 0.8659, + "step": 529 + }, + { + "epoch": 0.2005107348907595, + "grad_norm": 1.1666609028219261, + "learning_rate": 5.299999999999999e-06, + "loss": 0.8853, + "step": 530 + }, + { + "epoch": 0.2008890570320628, + "grad_norm": 1.1656374685369397, + "learning_rate": 5.309999999999999e-06, + "loss": 0.9086, + "step": 531 + }, + { + "epoch": 0.20126737917336612, + "grad_norm": 1.1343885551812507, + "learning_rate": 5.32e-06, + "loss": 0.8379, + "step": 532 + }, + { + "epoch": 0.20126737917336612, + "eval_loss": 0.8767463564872742, + "eval_runtime": 26.8872, + "eval_samples_per_second": 32.915, + "eval_steps_per_second": 1.041, + "step": 532 + }, + { + "epoch": 0.20126737917336612, + "eval_bench_accuracy_arc_challenge": 0.24285714285714285, + "eval_bench_accuracy_hellaswag": 0.275, + "eval_bench_accuracy_mmlu": 0.3391304347826087, + "eval_bench_average_accuracy": 0.2856625258799172, + "eval_bench_loss": 5.605643824527138, + "eval_bench_total_accuracy": 0.2813186813186813, + "step": 532 + }, + { + "epoch": 0.20164570131466944, + "grad_norm": 1.1898287763707267, + "learning_rate": 5.33e-06, + "loss": 0.8633, + "step": 533 + }, + { + "epoch": 0.20202402345597276, + "grad_norm": 1.2061752853772802, + "learning_rate": 5.34e-06, + "loss": 0.8537, + "step": 534 + }, + { + "epoch": 0.20240234559727607, + "grad_norm": 1.1524730070815266, + "learning_rate": 5.35e-06, + "loss": 0.8658, + "step": 535 + }, + { + "epoch": 0.2027806677385794, + "grad_norm": 1.2112053959243978, + "learning_rate": 5.36e-06, + "loss": 0.8658, + "step": 536 + }, + { + "epoch": 0.2031589898798827, + "grad_norm": 1.1062007713391508, + "learning_rate": 5.37e-06, + "loss": 0.8695, + "step": 537 + }, + { + "epoch": 0.20353731202118605, + "grad_norm": 1.1454209056836882, + "learning_rate": 5.379999999999999e-06, + "loss": 0.8411, + "step": 538 + }, + { + "epoch": 0.20391563416248937, + "grad_norm": 1.1969213700372077, + "learning_rate": 5.389999999999999e-06, + "loss": 0.8262, + "step": 539 + }, + { + "epoch": 0.20429395630379268, + "grad_norm": 1.1817755878296146, + "learning_rate": 5.4e-06, + "loss": 0.8928, + "step": 540 + }, + { + "epoch": 0.204672278445096, + "grad_norm": 1.2881214697120862, + "learning_rate": 5.41e-06, + "loss": 0.8755, + "step": 541 + }, + { + "epoch": 0.20505060058639932, + "grad_norm": 1.1803409039809667, + "learning_rate": 5.42e-06, + "loss": 0.8728, + "step": 542 + }, + { + "epoch": 0.20542892272770263, + "grad_norm": 1.2147547833072705, + "learning_rate": 5.43e-06, + "loss": 0.8673, + "step": 543 + }, + { + "epoch": 0.20580724486900595, + "grad_norm": 1.111022507543289, + "learning_rate": 5.4400000000000004e-06, + "loss": 0.8572, + "step": 544 + }, + { + "epoch": 0.20618556701030927, + "grad_norm": 1.229625708529713, + "learning_rate": 5.45e-06, + "loss": 0.9064, + "step": 545 + }, + { + "epoch": 0.2065638891516126, + "grad_norm": 1.1293738392645483, + "learning_rate": 5.459999999999999e-06, + "loss": 0.8504, + "step": 546 + }, + { + "epoch": 0.20694221129291593, + "grad_norm": 1.1526707564326522, + "learning_rate": 5.469999999999999e-06, + "loss": 0.8722, + "step": 547 + }, + { + "epoch": 0.20732053343421925, + "grad_norm": 1.1056906302195102, + "learning_rate": 5.48e-06, + "loss": 0.8253, + "step": 548 + }, + { + "epoch": 0.20769885557552256, + "grad_norm": 1.1541954114677542, + "learning_rate": 5.49e-06, + "loss": 0.8475, + "step": 549 + }, + { + "epoch": 0.20807717771682588, + "grad_norm": 1.151670600398325, + "learning_rate": 5.5e-06, + "loss": 0.8372, + "step": 550 + }, + { + "epoch": 0.2084554998581292, + "grad_norm": 1.157820909806914, + "learning_rate": 5.51e-06, + "loss": 0.8595, + "step": 551 + }, + { + "epoch": 0.2088338219994325, + "grad_norm": 1.1605316476134264, + "learning_rate": 5.52e-06, + "loss": 0.8595, + "step": 552 + }, + { + "epoch": 0.20921214414073583, + "grad_norm": 1.1898854269979218, + "learning_rate": 5.53e-06, + "loss": 0.8499, + "step": 553 + }, + { + "epoch": 0.20959046628203915, + "grad_norm": 1.1432985309555297, + "learning_rate": 5.5399999999999995e-06, + "loss": 0.9105, + "step": 554 + }, + { + "epoch": 0.2099687884233425, + "grad_norm": 1.1991072095190312, + "learning_rate": 5.549999999999999e-06, + "loss": 0.9184, + "step": 555 + }, + { + "epoch": 0.2103471105646458, + "grad_norm": 1.140264913482887, + "learning_rate": 5.559999999999999e-06, + "loss": 0.8663, + "step": 556 + }, + { + "epoch": 0.21072543270594912, + "grad_norm": 1.1185725137493638, + "learning_rate": 5.57e-06, + "loss": 0.9098, + "step": 557 + }, + { + "epoch": 0.21110375484725244, + "grad_norm": 1.156695278835195, + "learning_rate": 5.58e-06, + "loss": 0.8781, + "step": 558 + }, + { + "epoch": 0.21148207698855576, + "grad_norm": 1.145333592771482, + "learning_rate": 5.59e-06, + "loss": 0.882, + "step": 559 + }, + { + "epoch": 0.21186039912985907, + "grad_norm": 1.1762140502072864, + "learning_rate": 5.6e-06, + "loss": 0.8269, + "step": 560 + }, + { + "epoch": 0.2122387212711624, + "grad_norm": 1.1607104680787836, + "learning_rate": 5.61e-06, + "loss": 0.8718, + "step": 561 + }, + { + "epoch": 0.2126170434124657, + "grad_norm": 1.1469573147450298, + "learning_rate": 5.6199999999999996e-06, + "loss": 0.9056, + "step": 562 + }, + { + "epoch": 0.21299536555376902, + "grad_norm": 1.1193447632576843, + "learning_rate": 5.6299999999999995e-06, + "loss": 0.8501, + "step": 563 + }, + { + "epoch": 0.21337368769507237, + "grad_norm": 1.136879874832253, + "learning_rate": 5.639999999999999e-06, + "loss": 0.8124, + "step": 564 + }, + { + "epoch": 0.21375200983637568, + "grad_norm": 1.1284818158744658, + "learning_rate": 5.65e-06, + "loss": 0.8676, + "step": 565 + }, + { + "epoch": 0.214130331977679, + "grad_norm": 1.2698716712465286, + "learning_rate": 5.66e-06, + "loss": 0.8661, + "step": 566 + }, + { + "epoch": 0.21450865411898232, + "grad_norm": 1.153073394080358, + "learning_rate": 5.67e-06, + "loss": 0.8164, + "step": 567 + }, + { + "epoch": 0.21488697626028563, + "grad_norm": 1.187929464303015, + "learning_rate": 5.68e-06, + "loss": 0.8803, + "step": 568 + }, + { + "epoch": 0.21526529840158895, + "grad_norm": 1.1011027732459755, + "learning_rate": 5.69e-06, + "loss": 0.8709, + "step": 569 + }, + { + "epoch": 0.21564362054289227, + "grad_norm": 1.104661943825339, + "learning_rate": 5.7e-06, + "loss": 0.8408, + "step": 570 + }, + { + "epoch": 0.21602194268419558, + "grad_norm": 1.1237999429331513, + "learning_rate": 5.7099999999999995e-06, + "loss": 0.8316, + "step": 571 + }, + { + "epoch": 0.2164002648254989, + "grad_norm": 1.188002832097036, + "learning_rate": 5.7199999999999994e-06, + "loss": 0.8431, + "step": 572 + }, + { + "epoch": 0.21677858696680224, + "grad_norm": 1.1510459825305048, + "learning_rate": 5.73e-06, + "loss": 0.8847, + "step": 573 + }, + { + "epoch": 0.21715690910810556, + "grad_norm": 1.0954180332540966, + "learning_rate": 5.74e-06, + "loss": 0.8544, + "step": 574 + }, + { + "epoch": 0.21753523124940888, + "grad_norm": 1.1472545717374318, + "learning_rate": 5.75e-06, + "loss": 0.8249, + "step": 575 + }, + { + "epoch": 0.2179135533907122, + "grad_norm": 1.175641095732617, + "learning_rate": 5.76e-06, + "loss": 0.8614, + "step": 576 + }, + { + "epoch": 0.2182918755320155, + "grad_norm": 1.116355053736543, + "learning_rate": 5.769999999999999e-06, + "loss": 0.8405, + "step": 577 + }, + { + "epoch": 0.21867019767331883, + "grad_norm": 1.1157321259442492, + "learning_rate": 5.78e-06, + "loss": 0.8786, + "step": 578 + }, + { + "epoch": 0.21904851981462214, + "grad_norm": 1.1931582815103652, + "learning_rate": 5.79e-06, + "loss": 0.8904, + "step": 579 + }, + { + "epoch": 0.21942684195592546, + "grad_norm": 1.184066717780273, + "learning_rate": 5.7999999999999995e-06, + "loss": 0.8508, + "step": 580 + }, + { + "epoch": 0.21980516409722878, + "grad_norm": 1.161154664599336, + "learning_rate": 5.8099999999999994e-06, + "loss": 0.9202, + "step": 581 + }, + { + "epoch": 0.22018348623853212, + "grad_norm": 1.2235874832602252, + "learning_rate": 5.82e-06, + "loss": 0.8361, + "step": 582 + }, + { + "epoch": 0.22056180837983544, + "grad_norm": 1.1262137082837416, + "learning_rate": 5.83e-06, + "loss": 0.8566, + "step": 583 + }, + { + "epoch": 0.22094013052113876, + "grad_norm": 1.2072112047436216, + "learning_rate": 5.84e-06, + "loss": 0.8632, + "step": 584 + }, + { + "epoch": 0.22131845266244207, + "grad_norm": 1.1490940800541938, + "learning_rate": 5.849999999999999e-06, + "loss": 0.8593, + "step": 585 + }, + { + "epoch": 0.2216967748037454, + "grad_norm": 1.207791799143847, + "learning_rate": 5.86e-06, + "loss": 0.8556, + "step": 586 + }, + { + "epoch": 0.2220750969450487, + "grad_norm": 1.1526196801211563, + "learning_rate": 5.87e-06, + "loss": 0.8606, + "step": 587 + }, + { + "epoch": 0.22245341908635202, + "grad_norm": 1.1397609148470536, + "learning_rate": 5.88e-06, + "loss": 0.8469, + "step": 588 + }, + { + "epoch": 0.22283174122765534, + "grad_norm": 1.1785117139043815, + "learning_rate": 5.8899999999999995e-06, + "loss": 0.9147, + "step": 589 + }, + { + "epoch": 0.22321006336895866, + "grad_norm": 1.1858125002539965, + "learning_rate": 5.9e-06, + "loss": 0.8849, + "step": 590 + }, + { + "epoch": 0.223588385510262, + "grad_norm": 1.1941323389502188, + "learning_rate": 5.91e-06, + "loss": 0.869, + "step": 591 + }, + { + "epoch": 0.22396670765156532, + "grad_norm": 1.1418623190210022, + "learning_rate": 5.92e-06, + "loss": 0.8308, + "step": 592 + }, + { + "epoch": 0.22434502979286863, + "grad_norm": 1.0743417979986591, + "learning_rate": 5.929999999999999e-06, + "loss": 0.843, + "step": 593 + }, + { + "epoch": 0.22472335193417195, + "grad_norm": 1.1529208818856194, + "learning_rate": 5.94e-06, + "loss": 0.8235, + "step": 594 + }, + { + "epoch": 0.22510167407547527, + "grad_norm": 1.0767273225154363, + "learning_rate": 5.95e-06, + "loss": 0.8247, + "step": 595 + }, + { + "epoch": 0.22547999621677858, + "grad_norm": 1.1070019054712885, + "learning_rate": 5.96e-06, + "loss": 0.8426, + "step": 596 + }, + { + "epoch": 0.2258583183580819, + "grad_norm": 1.166373551635366, + "learning_rate": 5.97e-06, + "loss": 0.8732, + "step": 597 + }, + { + "epoch": 0.22623664049938522, + "grad_norm": 1.123857925375413, + "learning_rate": 5.98e-06, + "loss": 0.8464, + "step": 598 + }, + { + "epoch": 0.22661496264068853, + "grad_norm": 1.08557960856811, + "learning_rate": 5.99e-06, + "loss": 0.821, + "step": 599 + }, + { + "epoch": 0.22699328478199188, + "grad_norm": 1.1164890662505647, + "learning_rate": 6e-06, + "loss": 0.8846, + "step": 600 + }, + { + "epoch": 0.2273716069232952, + "grad_norm": 1.1514037573784872, + "learning_rate": 6.009999999999999e-06, + "loss": 0.8552, + "step": 601 + }, + { + "epoch": 0.2277499290645985, + "grad_norm": 1.1511174146769416, + "learning_rate": 6.019999999999999e-06, + "loss": 0.9014, + "step": 602 + }, + { + "epoch": 0.22812825120590183, + "grad_norm": 1.1696423261594386, + "learning_rate": 6.03e-06, + "loss": 0.8605, + "step": 603 + }, + { + "epoch": 0.22850657334720514, + "grad_norm": 1.1207706559785515, + "learning_rate": 6.04e-06, + "loss": 0.8382, + "step": 604 + }, + { + "epoch": 0.22888489548850846, + "grad_norm": 1.1767521633404514, + "learning_rate": 6.05e-06, + "loss": 0.9206, + "step": 605 + }, + { + "epoch": 0.22926321762981178, + "grad_norm": 1.1758374604143937, + "learning_rate": 6.06e-06, + "loss": 0.8883, + "step": 606 + }, + { + "epoch": 0.2296415397711151, + "grad_norm": 1.148791521470335, + "learning_rate": 6.07e-06, + "loss": 0.9091, + "step": 607 + }, + { + "epoch": 0.2300198619124184, + "grad_norm": 1.1533752302256568, + "learning_rate": 6.079999999999999e-06, + "loss": 0.915, + "step": 608 + }, + { + "epoch": 0.23039818405372176, + "grad_norm": 1.1082862913426186, + "learning_rate": 6.089999999999999e-06, + "loss": 0.8259, + "step": 609 + }, + { + "epoch": 0.23077650619502507, + "grad_norm": 1.1400168808816862, + "learning_rate": 6.099999999999999e-06, + "loss": 0.8417, + "step": 610 + }, + { + "epoch": 0.2311548283363284, + "grad_norm": 1.149922499835282, + "learning_rate": 6.11e-06, + "loss": 0.8736, + "step": 611 + }, + { + "epoch": 0.2315331504776317, + "grad_norm": 1.1611344187938348, + "learning_rate": 6.12e-06, + "loss": 0.8376, + "step": 612 + }, + { + "epoch": 0.23191147261893502, + "grad_norm": 1.1787603376828737, + "learning_rate": 6.13e-06, + "loss": 0.8558, + "step": 613 + }, + { + "epoch": 0.23228979476023834, + "grad_norm": 1.155525289243939, + "learning_rate": 6.14e-06, + "loss": 0.8463, + "step": 614 + }, + { + "epoch": 0.23266811690154166, + "grad_norm": 1.1589832886045384, + "learning_rate": 6.15e-06, + "loss": 0.8182, + "step": 615 + }, + { + "epoch": 0.23304643904284497, + "grad_norm": 1.1033596458549921, + "learning_rate": 6.1599999999999995e-06, + "loss": 0.8324, + "step": 616 + }, + { + "epoch": 0.23342476118414832, + "grad_norm": 1.2358470403500466, + "learning_rate": 6.169999999999999e-06, + "loss": 0.8682, + "step": 617 + }, + { + "epoch": 0.23380308332545163, + "grad_norm": 1.0984535537652391, + "learning_rate": 6.179999999999999e-06, + "loss": 0.8332, + "step": 618 + }, + { + "epoch": 0.23418140546675495, + "grad_norm": 1.2128396124349823, + "learning_rate": 6.19e-06, + "loss": 0.8747, + "step": 619 + }, + { + "epoch": 0.23455972760805827, + "grad_norm": 1.2275794235621071, + "learning_rate": 6.2e-06, + "loss": 0.8953, + "step": 620 + }, + { + "epoch": 0.23493804974936158, + "grad_norm": 1.2542101409168016, + "learning_rate": 6.21e-06, + "loss": 0.8892, + "step": 621 + }, + { + "epoch": 0.2353163718906649, + "grad_norm": 1.204474995156125, + "learning_rate": 6.22e-06, + "loss": 0.8491, + "step": 622 + }, + { + "epoch": 0.23569469403196822, + "grad_norm": 1.1548886283677673, + "learning_rate": 6.2300000000000005e-06, + "loss": 0.8581, + "step": 623 + }, + { + "epoch": 0.23607301617327153, + "grad_norm": 1.251297532099902, + "learning_rate": 6.2399999999999995e-06, + "loss": 0.851, + "step": 624 + }, + { + "epoch": 0.23645133831457485, + "grad_norm": 1.218716341983368, + "learning_rate": 6.2499999999999995e-06, + "loss": 0.917, + "step": 625 + }, + { + "epoch": 0.2368296604558782, + "grad_norm": 1.1845662251647084, + "learning_rate": 6.259999999999999e-06, + "loss": 0.9132, + "step": 626 + }, + { + "epoch": 0.2372079825971815, + "grad_norm": 1.1620810200029381, + "learning_rate": 6.269999999999999e-06, + "loss": 0.8652, + "step": 627 + }, + { + "epoch": 0.23758630473848483, + "grad_norm": 1.1563059559969693, + "learning_rate": 6.28e-06, + "loss": 0.8474, + "step": 628 + }, + { + "epoch": 0.23796462687978814, + "grad_norm": 1.1388389502769878, + "learning_rate": 6.29e-06, + "loss": 0.8314, + "step": 629 + }, + { + "epoch": 0.23834294902109146, + "grad_norm": 1.1551456623854715, + "learning_rate": 6.3e-06, + "loss": 0.8902, + "step": 630 + }, + { + "epoch": 0.23872127116239478, + "grad_norm": 1.1459750574525491, + "learning_rate": 6.31e-06, + "loss": 0.8505, + "step": 631 + }, + { + "epoch": 0.2390995933036981, + "grad_norm": 1.0925608036319805, + "learning_rate": 6.32e-06, + "loss": 0.8651, + "step": 632 + }, + { + "epoch": 0.2394779154450014, + "grad_norm": 1.1607966985031983, + "learning_rate": 6.3299999999999995e-06, + "loss": 0.8156, + "step": 633 + }, + { + "epoch": 0.23985623758630473, + "grad_norm": 1.112649862871437, + "learning_rate": 6.3399999999999994e-06, + "loss": 0.823, + "step": 634 + }, + { + "epoch": 0.24023455972760807, + "grad_norm": 1.1213541389814015, + "learning_rate": 6.349999999999999e-06, + "loss": 0.8397, + "step": 635 + }, + { + "epoch": 0.2406128818689114, + "grad_norm": 1.134629038613528, + "learning_rate": 6.36e-06, + "loss": 0.8503, + "step": 636 + }, + { + "epoch": 0.2409912040102147, + "grad_norm": 1.1342734785655144, + "learning_rate": 6.37e-06, + "loss": 0.8497, + "step": 637 + }, + { + "epoch": 0.24136952615151802, + "grad_norm": 1.1277526276470056, + "learning_rate": 6.38e-06, + "loss": 0.8348, + "step": 638 + }, + { + "epoch": 0.24174784829282134, + "grad_norm": 1.1313262215365258, + "learning_rate": 6.39e-06, + "loss": 0.8746, + "step": 639 + }, + { + "epoch": 0.24212617043412465, + "grad_norm": 1.0984126709233168, + "learning_rate": 6.4e-06, + "loss": 0.8296, + "step": 640 + }, + { + "epoch": 0.24250449257542797, + "grad_norm": 1.0888784783993595, + "learning_rate": 6.41e-06, + "loss": 0.8129, + "step": 641 + }, + { + "epoch": 0.2428828147167313, + "grad_norm": 1.1461818324642985, + "learning_rate": 6.4199999999999995e-06, + "loss": 0.8834, + "step": 642 + }, + { + "epoch": 0.2432611368580346, + "grad_norm": 1.1427506153934843, + "learning_rate": 6.429999999999999e-06, + "loss": 0.8706, + "step": 643 + }, + { + "epoch": 0.24363945899933795, + "grad_norm": 1.144102199065487, + "learning_rate": 6.44e-06, + "loss": 0.8877, + "step": 644 + }, + { + "epoch": 0.24401778114064127, + "grad_norm": 1.1231424595451174, + "learning_rate": 6.45e-06, + "loss": 0.8939, + "step": 645 + }, + { + "epoch": 0.24439610328194458, + "grad_norm": 1.1218026132749124, + "learning_rate": 6.46e-06, + "loss": 0.8366, + "step": 646 + }, + { + "epoch": 0.2447744254232479, + "grad_norm": 1.2086540508049943, + "learning_rate": 6.469999999999999e-06, + "loss": 0.892, + "step": 647 + }, + { + "epoch": 0.24515274756455122, + "grad_norm": 1.0868363589750187, + "learning_rate": 6.48e-06, + "loss": 0.8581, + "step": 648 + }, + { + "epoch": 0.24553106970585453, + "grad_norm": 1.1504181380058272, + "learning_rate": 6.49e-06, + "loss": 0.8942, + "step": 649 + }, + { + "epoch": 0.24590939184715785, + "grad_norm": 1.1874832509790985, + "learning_rate": 6.5e-06, + "loss": 0.8379, + "step": 650 + }, + { + "epoch": 0.24628771398846117, + "grad_norm": 1.1066886977698138, + "learning_rate": 6.5099999999999995e-06, + "loss": 0.8645, + "step": 651 + }, + { + "epoch": 0.24666603612976448, + "grad_norm": 1.1091171121306154, + "learning_rate": 6.519999999999999e-06, + "loss": 0.8866, + "step": 652 + }, + { + "epoch": 0.24704435827106783, + "grad_norm": 1.1168392333785764, + "learning_rate": 6.53e-06, + "loss": 0.8377, + "step": 653 + }, + { + "epoch": 0.24742268041237114, + "grad_norm": 1.1333024723334617, + "learning_rate": 6.54e-06, + "loss": 0.8404, + "step": 654 + }, + { + "epoch": 0.24780100255367446, + "grad_norm": 1.1624311607412376, + "learning_rate": 6.549999999999999e-06, + "loss": 0.8578, + "step": 655 + }, + { + "epoch": 0.24817932469497778, + "grad_norm": 1.140510520926876, + "learning_rate": 6.559999999999999e-06, + "loss": 0.7948, + "step": 656 + }, + { + "epoch": 0.2485576468362811, + "grad_norm": 1.1241297695775005, + "learning_rate": 6.57e-06, + "loss": 0.8455, + "step": 657 + }, + { + "epoch": 0.2489359689775844, + "grad_norm": 1.1171688585786779, + "learning_rate": 6.58e-06, + "loss": 0.8347, + "step": 658 + }, + { + "epoch": 0.24931429111888773, + "grad_norm": 1.131716974118065, + "learning_rate": 6.59e-06, + "loss": 0.8624, + "step": 659 + }, + { + "epoch": 0.24969261326019104, + "grad_norm": 1.1586113355227856, + "learning_rate": 6.5999999999999995e-06, + "loss": 0.8937, + "step": 660 + }, + { + "epoch": 0.2500709354014944, + "grad_norm": 1.186938370866149, + "learning_rate": 6.61e-06, + "loss": 0.8523, + "step": 661 + }, + { + "epoch": 0.2504492575427977, + "grad_norm": 1.1500652838613878, + "learning_rate": 6.62e-06, + "loss": 0.8537, + "step": 662 + }, + { + "epoch": 0.250827579684101, + "grad_norm": 1.2121811392488833, + "learning_rate": 6.629999999999999e-06, + "loss": 0.8477, + "step": 663 + }, + { + "epoch": 0.2512059018254043, + "grad_norm": 1.1348675624901883, + "learning_rate": 6.639999999999999e-06, + "loss": 0.8502, + "step": 664 + }, + { + "epoch": 0.25158422396670765, + "grad_norm": 1.102535269461347, + "learning_rate": 6.65e-06, + "loss": 0.8745, + "step": 665 + }, + { + "epoch": 0.25158422396670765, + "eval_loss": 0.8625780940055847, + "eval_runtime": 27.0021, + "eval_samples_per_second": 32.775, + "eval_steps_per_second": 1.037, + "step": 665 + }, + { + "epoch": 0.25158422396670765, + "eval_bench_accuracy_arc_challenge": 0.24285714285714285, + "eval_bench_accuracy_hellaswag": 0.225, + "eval_bench_accuracy_mmlu": 0.2782608695652174, + "eval_bench_average_accuracy": 0.24870600414078672, + "eval_bench_loss": 5.327823571991503, + "eval_bench_total_accuracy": 0.24395604395604395, + "step": 665 + }, + { + "epoch": 0.251962546108011, + "grad_norm": 1.149499114356956, + "learning_rate": 6.66e-06, + "loss": 0.8693, + "step": 666 + }, + { + "epoch": 0.2523408682493143, + "grad_norm": 1.161075438749712, + "learning_rate": 6.67e-06, + "loss": 0.9075, + "step": 667 + }, + { + "epoch": 0.25271919039061763, + "grad_norm": 1.141541764628487, + "learning_rate": 6.6799999999999996e-06, + "loss": 0.8643, + "step": 668 + }, + { + "epoch": 0.2530975125319209, + "grad_norm": 1.1390764097501647, + "learning_rate": 6.69e-06, + "loss": 0.8752, + "step": 669 + }, + { + "epoch": 0.25347583467322427, + "grad_norm": 1.1198865085900025, + "learning_rate": 6.7e-06, + "loss": 0.8403, + "step": 670 + }, + { + "epoch": 0.25385415681452755, + "grad_norm": 1.143235453200182, + "learning_rate": 6.709999999999999e-06, + "loss": 0.8347, + "step": 671 + }, + { + "epoch": 0.2542324789558309, + "grad_norm": 1.105054342960603, + "learning_rate": 6.719999999999999e-06, + "loss": 0.877, + "step": 672 + }, + { + "epoch": 0.2546108010971342, + "grad_norm": 1.1899413861555337, + "learning_rate": 6.73e-06, + "loss": 0.8239, + "step": 673 + }, + { + "epoch": 0.25498912323843753, + "grad_norm": 1.1305008415556128, + "learning_rate": 6.74e-06, + "loss": 0.8598, + "step": 674 + }, + { + "epoch": 0.2553674453797409, + "grad_norm": 1.168034799536073, + "learning_rate": 6.75e-06, + "loss": 0.8294, + "step": 675 + }, + { + "epoch": 0.25574576752104416, + "grad_norm": 1.1472097884900647, + "learning_rate": 6.76e-06, + "loss": 0.9007, + "step": 676 + }, + { + "epoch": 0.2561240896623475, + "grad_norm": 1.0931411919432397, + "learning_rate": 6.7699999999999996e-06, + "loss": 0.8326, + "step": 677 + }, + { + "epoch": 0.2565024118036508, + "grad_norm": 1.1510688024969498, + "learning_rate": 6.78e-06, + "loss": 0.8828, + "step": 678 + }, + { + "epoch": 0.25688073394495414, + "grad_norm": 1.1191461068866526, + "learning_rate": 6.789999999999999e-06, + "loss": 0.8461, + "step": 679 + }, + { + "epoch": 0.25725905608625743, + "grad_norm": 1.1041404496614182, + "learning_rate": 6.799999999999999e-06, + "loss": 0.8285, + "step": 680 + }, + { + "epoch": 0.2576373782275608, + "grad_norm": 1.1012877673575499, + "learning_rate": 6.809999999999999e-06, + "loss": 0.8548, + "step": 681 + }, + { + "epoch": 0.25801570036886406, + "grad_norm": 1.1057501522176822, + "learning_rate": 6.82e-06, + "loss": 0.8591, + "step": 682 + }, + { + "epoch": 0.2583940225101674, + "grad_norm": 1.1498742481849225, + "learning_rate": 6.83e-06, + "loss": 0.8661, + "step": 683 + }, + { + "epoch": 0.25877234465147075, + "grad_norm": 1.1378178315852814, + "learning_rate": 6.84e-06, + "loss": 0.8759, + "step": 684 + }, + { + "epoch": 0.25915066679277404, + "grad_norm": 1.1011069671017035, + "learning_rate": 6.85e-06, + "loss": 0.823, + "step": 685 + }, + { + "epoch": 0.2595289889340774, + "grad_norm": 1.160807734407358, + "learning_rate": 6.86e-06, + "loss": 0.8732, + "step": 686 + }, + { + "epoch": 0.2599073110753807, + "grad_norm": 1.0867868118261128, + "learning_rate": 6.8699999999999994e-06, + "loss": 0.8367, + "step": 687 + }, + { + "epoch": 0.260285633216684, + "grad_norm": 1.0969221739263768, + "learning_rate": 6.879999999999999e-06, + "loss": 0.8647, + "step": 688 + }, + { + "epoch": 0.2606639553579873, + "grad_norm": 1.0995292401504533, + "learning_rate": 6.889999999999999e-06, + "loss": 0.8524, + "step": 689 + }, + { + "epoch": 0.26104227749929065, + "grad_norm": 1.1692507904848903, + "learning_rate": 6.9e-06, + "loss": 0.8519, + "step": 690 + }, + { + "epoch": 0.26142059964059394, + "grad_norm": 1.0998400071794445, + "learning_rate": 6.91e-06, + "loss": 0.8287, + "step": 691 + }, + { + "epoch": 0.2617989217818973, + "grad_norm": 1.1968950530047644, + "learning_rate": 6.92e-06, + "loss": 0.8138, + "step": 692 + }, + { + "epoch": 0.26217724392320063, + "grad_norm": 1.095854905073934, + "learning_rate": 6.93e-06, + "loss": 0.8568, + "step": 693 + }, + { + "epoch": 0.2625555660645039, + "grad_norm": 1.1079273378796317, + "learning_rate": 6.9400000000000005e-06, + "loss": 0.8353, + "step": 694 + }, + { + "epoch": 0.26293388820580726, + "grad_norm": 1.1606191819435765, + "learning_rate": 6.9499999999999995e-06, + "loss": 0.8561, + "step": 695 + }, + { + "epoch": 0.26331221034711055, + "grad_norm": 1.0902425837878627, + "learning_rate": 6.9599999999999994e-06, + "loss": 0.8391, + "step": 696 + }, + { + "epoch": 0.2636905324884139, + "grad_norm": 1.1206727493642596, + "learning_rate": 6.969999999999999e-06, + "loss": 0.8233, + "step": 697 + }, + { + "epoch": 0.2640688546297172, + "grad_norm": 1.0982647837307586, + "learning_rate": 6.98e-06, + "loss": 0.8602, + "step": 698 + }, + { + "epoch": 0.26444717677102053, + "grad_norm": 1.0871328583668558, + "learning_rate": 6.99e-06, + "loss": 0.8299, + "step": 699 + }, + { + "epoch": 0.2648254989123238, + "grad_norm": 1.1008815238203256, + "learning_rate": 7e-06, + "loss": 0.8341, + "step": 700 + }, + { + "epoch": 0.26520382105362716, + "grad_norm": 1.1750095526723472, + "learning_rate": 7.01e-06, + "loss": 0.8682, + "step": 701 + }, + { + "epoch": 0.2655821431949305, + "grad_norm": 1.1415931541767914, + "learning_rate": 7.019999999999999e-06, + "loss": 0.8932, + "step": 702 + }, + { + "epoch": 0.2659604653362338, + "grad_norm": 1.0981715817655127, + "learning_rate": 7.03e-06, + "loss": 0.838, + "step": 703 + }, + { + "epoch": 0.26633878747753714, + "grad_norm": 1.0986067356062597, + "learning_rate": 7.0399999999999995e-06, + "loss": 0.8503, + "step": 704 + }, + { + "epoch": 0.26671710961884043, + "grad_norm": 1.1084347528867848, + "learning_rate": 7.049999999999999e-06, + "loss": 0.8958, + "step": 705 + }, + { + "epoch": 0.2670954317601438, + "grad_norm": 1.1475294765378516, + "learning_rate": 7.059999999999999e-06, + "loss": 0.8496, + "step": 706 + }, + { + "epoch": 0.26747375390144706, + "grad_norm": 1.117143691203432, + "learning_rate": 7.07e-06, + "loss": 0.875, + "step": 707 + }, + { + "epoch": 0.2678520760427504, + "grad_norm": 1.1331250955748378, + "learning_rate": 7.08e-06, + "loss": 0.854, + "step": 708 + }, + { + "epoch": 0.2682303981840537, + "grad_norm": 1.0837995640069416, + "learning_rate": 7.09e-06, + "loss": 0.8461, + "step": 709 + }, + { + "epoch": 0.26860872032535704, + "grad_norm": 1.0933867992273585, + "learning_rate": 7.099999999999999e-06, + "loss": 0.8383, + "step": 710 + }, + { + "epoch": 0.2689870424666604, + "grad_norm": 1.0862191237112888, + "learning_rate": 7.11e-06, + "loss": 0.7976, + "step": 711 + }, + { + "epoch": 0.2693653646079637, + "grad_norm": 1.1151836826262986, + "learning_rate": 7.12e-06, + "loss": 0.8224, + "step": 712 + }, + { + "epoch": 0.269743686749267, + "grad_norm": 1.189062828656012, + "learning_rate": 7.1299999999999995e-06, + "loss": 0.8917, + "step": 713 + }, + { + "epoch": 0.2701220088905703, + "grad_norm": 1.1119181389921133, + "learning_rate": 7.139999999999999e-06, + "loss": 0.8291, + "step": 714 + }, + { + "epoch": 0.27050033103187365, + "grad_norm": 1.114538144475484, + "learning_rate": 7.15e-06, + "loss": 0.8996, + "step": 715 + }, + { + "epoch": 0.27087865317317694, + "grad_norm": 1.1005437857491667, + "learning_rate": 7.16e-06, + "loss": 0.7888, + "step": 716 + }, + { + "epoch": 0.2712569753144803, + "grad_norm": 1.1146994809955666, + "learning_rate": 7.17e-06, + "loss": 0.8878, + "step": 717 + }, + { + "epoch": 0.2716352974557836, + "grad_norm": 1.0936279250904897, + "learning_rate": 7.179999999999999e-06, + "loss": 0.8672, + "step": 718 + }, + { + "epoch": 0.2720136195970869, + "grad_norm": 1.1366251894998205, + "learning_rate": 7.19e-06, + "loss": 0.8858, + "step": 719 + }, + { + "epoch": 0.27239194173839026, + "grad_norm": 1.1195931324613553, + "learning_rate": 7.2e-06, + "loss": 0.8507, + "step": 720 + }, + { + "epoch": 0.27277026387969355, + "grad_norm": 1.0935327911384591, + "learning_rate": 7.21e-06, + "loss": 0.8424, + "step": 721 + }, + { + "epoch": 0.2731485860209969, + "grad_norm": 1.0953372322434138, + "learning_rate": 7.2199999999999995e-06, + "loss": 0.8831, + "step": 722 + }, + { + "epoch": 0.2735269081623002, + "grad_norm": 1.0904032768722667, + "learning_rate": 7.23e-06, + "loss": 0.8334, + "step": 723 + }, + { + "epoch": 0.27390523030360353, + "grad_norm": 1.1346874176897102, + "learning_rate": 7.24e-06, + "loss": 0.8506, + "step": 724 + }, + { + "epoch": 0.2742835524449068, + "grad_norm": 1.154262444900059, + "learning_rate": 7.25e-06, + "loss": 0.8393, + "step": 725 + }, + { + "epoch": 0.27466187458621016, + "grad_norm": 1.1336981217637951, + "learning_rate": 7.259999999999999e-06, + "loss": 0.8371, + "step": 726 + }, + { + "epoch": 0.27504019672751345, + "grad_norm": 1.1530922109530841, + "learning_rate": 7.269999999999999e-06, + "loss": 0.9141, + "step": 727 + }, + { + "epoch": 0.2754185188688168, + "grad_norm": 1.1414400257725132, + "learning_rate": 7.28e-06, + "loss": 0.8615, + "step": 728 + }, + { + "epoch": 0.27579684101012014, + "grad_norm": 1.0747602134856014, + "learning_rate": 7.29e-06, + "loss": 0.8507, + "step": 729 + }, + { + "epoch": 0.27617516315142343, + "grad_norm": 1.1341332656767107, + "learning_rate": 7.2999999999999996e-06, + "loss": 0.8771, + "step": 730 + }, + { + "epoch": 0.2765534852927268, + "grad_norm": 1.127774756748704, + "learning_rate": 7.3099999999999995e-06, + "loss": 0.8559, + "step": 731 + }, + { + "epoch": 0.27693180743403006, + "grad_norm": 1.106246473020497, + "learning_rate": 7.32e-06, + "loss": 0.8333, + "step": 732 + }, + { + "epoch": 0.2773101295753334, + "grad_norm": 1.072619886572064, + "learning_rate": 7.33e-06, + "loss": 0.8138, + "step": 733 + }, + { + "epoch": 0.2776884517166367, + "grad_norm": 1.1053237591292755, + "learning_rate": 7.339999999999999e-06, + "loss": 0.8929, + "step": 734 + }, + { + "epoch": 0.27806677385794004, + "grad_norm": 1.0590657569440343, + "learning_rate": 7.349999999999999e-06, + "loss": 0.8657, + "step": 735 + }, + { + "epoch": 0.27844509599924333, + "grad_norm": 1.0990511323540157, + "learning_rate": 7.36e-06, + "loss": 0.831, + "step": 736 + }, + { + "epoch": 0.2788234181405467, + "grad_norm": 1.0960494967933392, + "learning_rate": 7.37e-06, + "loss": 0.8672, + "step": 737 + }, + { + "epoch": 0.27920174028185, + "grad_norm": 1.0923972930315522, + "learning_rate": 7.38e-06, + "loss": 0.8359, + "step": 738 + }, + { + "epoch": 0.2795800624231533, + "grad_norm": 1.117398170352597, + "learning_rate": 7.3899999999999995e-06, + "loss": 0.8678, + "step": 739 + }, + { + "epoch": 0.27995838456445665, + "grad_norm": 1.0964334876514574, + "learning_rate": 7.4e-06, + "loss": 0.8175, + "step": 740 + }, + { + "epoch": 0.28033670670575994, + "grad_norm": 1.137429209179925, + "learning_rate": 7.41e-06, + "loss": 0.8469, + "step": 741 + }, + { + "epoch": 0.2807150288470633, + "grad_norm": 1.1550309848051612, + "learning_rate": 7.419999999999999e-06, + "loss": 0.8326, + "step": 742 + }, + { + "epoch": 0.2810933509883666, + "grad_norm": 1.1935237789558146, + "learning_rate": 7.429999999999999e-06, + "loss": 0.8568, + "step": 743 + }, + { + "epoch": 0.2814716731296699, + "grad_norm": 1.1694982973025607, + "learning_rate": 7.44e-06, + "loss": 0.8869, + "step": 744 + }, + { + "epoch": 0.2818499952709732, + "grad_norm": 1.1920139094347593, + "learning_rate": 7.45e-06, + "loss": 0.8487, + "step": 745 + }, + { + "epoch": 0.28222831741227655, + "grad_norm": 1.1367845567285337, + "learning_rate": 7.46e-06, + "loss": 0.8554, + "step": 746 + }, + { + "epoch": 0.2826066395535799, + "grad_norm": 1.1505063717374056, + "learning_rate": 7.47e-06, + "loss": 0.8371, + "step": 747 + }, + { + "epoch": 0.2829849616948832, + "grad_norm": 1.1339987287473563, + "learning_rate": 7.48e-06, + "loss": 0.8256, + "step": 748 + }, + { + "epoch": 0.28336328383618653, + "grad_norm": 1.158977003616627, + "learning_rate": 7.49e-06, + "loss": 0.8913, + "step": 749 + }, + { + "epoch": 0.2837416059774898, + "grad_norm": 1.1022707433616572, + "learning_rate": 7.499999999999999e-06, + "loss": 0.8117, + "step": 750 + }, + { + "epoch": 0.28411992811879316, + "grad_norm": 1.1550634309139105, + "learning_rate": 7.509999999999999e-06, + "loss": 0.8906, + "step": 751 + }, + { + "epoch": 0.28449825026009645, + "grad_norm": 1.090317910646282, + "learning_rate": 7.519999999999999e-06, + "loss": 0.8799, + "step": 752 + }, + { + "epoch": 0.2848765724013998, + "grad_norm": 1.0677643984555838, + "learning_rate": 7.53e-06, + "loss": 0.8653, + "step": 753 + }, + { + "epoch": 0.2852548945427031, + "grad_norm": 1.1663544994037678, + "learning_rate": 7.54e-06, + "loss": 0.8737, + "step": 754 + }, + { + "epoch": 0.28563321668400643, + "grad_norm": 1.0973153975053445, + "learning_rate": 7.55e-06, + "loss": 0.8485, + "step": 755 + }, + { + "epoch": 0.2860115388253098, + "grad_norm": 1.0761549351444184, + "learning_rate": 7.56e-06, + "loss": 0.8284, + "step": 756 + }, + { + "epoch": 0.28638986096661306, + "grad_norm": 1.1355050591654032, + "learning_rate": 7.5699999999999995e-06, + "loss": 0.8348, + "step": 757 + }, + { + "epoch": 0.2867681831079164, + "grad_norm": 1.116699730612722, + "learning_rate": 7.5799999999999994e-06, + "loss": 0.8405, + "step": 758 + }, + { + "epoch": 0.2871465052492197, + "grad_norm": 1.1037588379626753, + "learning_rate": 7.589999999999999e-06, + "loss": 0.8652, + "step": 759 + }, + { + "epoch": 0.28752482739052304, + "grad_norm": 1.092569661781677, + "learning_rate": 7.599999999999999e-06, + "loss": 0.8786, + "step": 760 + }, + { + "epoch": 0.28790314953182633, + "grad_norm": 1.1079207038423997, + "learning_rate": 7.61e-06, + "loss": 0.8731, + "step": 761 + }, + { + "epoch": 0.2882814716731297, + "grad_norm": 1.0840455559100046, + "learning_rate": 7.62e-06, + "loss": 0.8533, + "step": 762 + }, + { + "epoch": 0.28865979381443296, + "grad_norm": 1.1088308729059055, + "learning_rate": 7.63e-06, + "loss": 0.8407, + "step": 763 + }, + { + "epoch": 0.2890381159557363, + "grad_norm": 1.070788168887275, + "learning_rate": 7.64e-06, + "loss": 0.8919, + "step": 764 + }, + { + "epoch": 0.28941643809703965, + "grad_norm": 1.060969292922543, + "learning_rate": 7.65e-06, + "loss": 0.812, + "step": 765 + }, + { + "epoch": 0.28979476023834294, + "grad_norm": 1.1301219505514637, + "learning_rate": 7.66e-06, + "loss": 0.8336, + "step": 766 + }, + { + "epoch": 0.2901730823796463, + "grad_norm": 1.0534794694384884, + "learning_rate": 7.67e-06, + "loss": 0.8329, + "step": 767 + }, + { + "epoch": 0.2905514045209496, + "grad_norm": 1.1347313685498166, + "learning_rate": 7.68e-06, + "loss": 0.8793, + "step": 768 + }, + { + "epoch": 0.2909297266622529, + "grad_norm": 1.1475444842715925, + "learning_rate": 7.69e-06, + "loss": 0.8508, + "step": 769 + }, + { + "epoch": 0.2913080488035562, + "grad_norm": 1.131952349011137, + "learning_rate": 7.699999999999999e-06, + "loss": 0.845, + "step": 770 + }, + { + "epoch": 0.29168637094485955, + "grad_norm": 1.1447781586459667, + "learning_rate": 7.709999999999999e-06, + "loss": 0.8726, + "step": 771 + }, + { + "epoch": 0.29206469308616284, + "grad_norm": 1.1327583004535982, + "learning_rate": 7.719999999999999e-06, + "loss": 0.8104, + "step": 772 + }, + { + "epoch": 0.2924430152274662, + "grad_norm": 1.128617220703407, + "learning_rate": 7.73e-06, + "loss": 0.8176, + "step": 773 + }, + { + "epoch": 0.29282133736876953, + "grad_norm": 1.1023174787003673, + "learning_rate": 7.74e-06, + "loss": 0.8428, + "step": 774 + }, + { + "epoch": 0.2931996595100728, + "grad_norm": 1.1676360521088707, + "learning_rate": 7.75e-06, + "loss": 0.8811, + "step": 775 + }, + { + "epoch": 0.29357798165137616, + "grad_norm": 1.1926785192763554, + "learning_rate": 7.76e-06, + "loss": 0.8814, + "step": 776 + }, + { + "epoch": 0.29395630379267945, + "grad_norm": 1.0926242154672956, + "learning_rate": 7.769999999999998e-06, + "loss": 0.8697, + "step": 777 + }, + { + "epoch": 0.2943346259339828, + "grad_norm": 1.1477061183634145, + "learning_rate": 7.78e-06, + "loss": 0.883, + "step": 778 + }, + { + "epoch": 0.2947129480752861, + "grad_norm": 1.0524242129666213, + "learning_rate": 7.79e-06, + "loss": 0.8285, + "step": 779 + }, + { + "epoch": 0.29509127021658943, + "grad_norm": 1.1003220338231798, + "learning_rate": 7.8e-06, + "loss": 0.873, + "step": 780 + }, + { + "epoch": 0.2954695923578927, + "grad_norm": 1.0924766297335016, + "learning_rate": 7.81e-06, + "loss": 0.8388, + "step": 781 + }, + { + "epoch": 0.29584791449919606, + "grad_norm": 1.0905974324189436, + "learning_rate": 7.82e-06, + "loss": 0.8456, + "step": 782 + }, + { + "epoch": 0.2962262366404994, + "grad_norm": 1.0784036223330382, + "learning_rate": 7.83e-06, + "loss": 0.8732, + "step": 783 + }, + { + "epoch": 0.2966045587818027, + "grad_norm": 1.0471596415042548, + "learning_rate": 7.84e-06, + "loss": 0.8396, + "step": 784 + }, + { + "epoch": 0.29698288092310604, + "grad_norm": 1.080443491875735, + "learning_rate": 7.85e-06, + "loss": 0.8458, + "step": 785 + }, + { + "epoch": 0.29736120306440933, + "grad_norm": 1.0828576066417819, + "learning_rate": 7.86e-06, + "loss": 0.813, + "step": 786 + }, + { + "epoch": 0.2977395252057127, + "grad_norm": 1.0752539748255008, + "learning_rate": 7.87e-06, + "loss": 0.8564, + "step": 787 + }, + { + "epoch": 0.29811784734701596, + "grad_norm": 1.0994217833391198, + "learning_rate": 7.879999999999999e-06, + "loss": 0.8263, + "step": 788 + }, + { + "epoch": 0.2984961694883193, + "grad_norm": 1.086381772786406, + "learning_rate": 7.889999999999999e-06, + "loss": 0.8563, + "step": 789 + }, + { + "epoch": 0.2988744916296226, + "grad_norm": 1.1088374241291266, + "learning_rate": 7.9e-06, + "loss": 0.864, + "step": 790 + }, + { + "epoch": 0.29925281377092594, + "grad_norm": 1.1571412075379082, + "learning_rate": 7.91e-06, + "loss": 0.8171, + "step": 791 + }, + { + "epoch": 0.2996311359122293, + "grad_norm": 1.1203389931533279, + "learning_rate": 7.92e-06, + "loss": 0.8441, + "step": 792 + }, + { + "epoch": 0.3000094580535326, + "grad_norm": 1.0955306189611171, + "learning_rate": 7.929999999999999e-06, + "loss": 0.8367, + "step": 793 + }, + { + "epoch": 0.3003877801948359, + "grad_norm": 1.0518036198212661, + "learning_rate": 7.94e-06, + "loss": 0.8115, + "step": 794 + }, + { + "epoch": 0.3007661023361392, + "grad_norm": 1.1024545203471212, + "learning_rate": 7.95e-06, + "loss": 0.8981, + "step": 795 + }, + { + "epoch": 0.30114442447744255, + "grad_norm": 1.1408707488859684, + "learning_rate": 7.96e-06, + "loss": 0.8574, + "step": 796 + }, + { + "epoch": 0.30152274661874584, + "grad_norm": 1.0664606162956756, + "learning_rate": 7.97e-06, + "loss": 0.851, + "step": 797 + }, + { + "epoch": 0.3019010687600492, + "grad_norm": 1.1045392245613144, + "learning_rate": 7.98e-06, + "loss": 0.8472, + "step": 798 + }, + { + "epoch": 0.3019010687600492, + "eval_loss": 0.850925862789154, + "eval_runtime": 26.6744, + "eval_samples_per_second": 33.178, + "eval_steps_per_second": 1.05, + "step": 798 + }, + { + "epoch": 0.3019010687600492, + "eval_bench_accuracy_arc_challenge": 0.21428571428571427, + "eval_bench_accuracy_hellaswag": 0.235, + "eval_bench_accuracy_mmlu": 0.28695652173913044, + "eval_bench_average_accuracy": 0.24541407867494824, + "eval_bench_loss": 4.9830322265625, + "eval_bench_total_accuracy": 0.24175824175824176, + "step": 798 + }, + { + "epoch": 0.30227939090135253, + "grad_norm": 1.1188259399602403, + "learning_rate": 7.99e-06, + "loss": 0.8468, + "step": 799 + }, + { + "epoch": 0.3026577130426558, + "grad_norm": 1.1431484110606045, + "learning_rate": 8e-06, + "loss": 0.8401, + "step": 800 + }, + { + "epoch": 0.30303603518395916, + "grad_norm": 1.083646592987573, + "learning_rate": 7.999999611606006e-06, + "loss": 0.8062, + "step": 801 + }, + { + "epoch": 0.30341435732526245, + "grad_norm": 1.1319556143394125, + "learning_rate": 7.999998446424103e-06, + "loss": 0.8818, + "step": 802 + }, + { + "epoch": 0.3037926794665658, + "grad_norm": 1.0994025822887656, + "learning_rate": 7.999996504454512e-06, + "loss": 0.8509, + "step": 803 + }, + { + "epoch": 0.3041710016078691, + "grad_norm": 1.0755886346693961, + "learning_rate": 7.999993785697617e-06, + "loss": 0.8004, + "step": 804 + }, + { + "epoch": 0.30454932374917243, + "grad_norm": 1.1441919264010905, + "learning_rate": 7.99999029015394e-06, + "loss": 0.808, + "step": 805 + }, + { + "epoch": 0.3049276458904757, + "grad_norm": 1.1065610412104439, + "learning_rate": 7.999986017824165e-06, + "loss": 0.8549, + "step": 806 + }, + { + "epoch": 0.30530596803177906, + "grad_norm": 1.0882701082696518, + "learning_rate": 7.999980968709117e-06, + "loss": 0.8468, + "step": 807 + }, + { + "epoch": 0.3056842901730824, + "grad_norm": 1.1088124295992208, + "learning_rate": 7.999975142809778e-06, + "loss": 0.8736, + "step": 808 + }, + { + "epoch": 0.3060626123143857, + "grad_norm": 1.1033663016693673, + "learning_rate": 7.99996854012728e-06, + "loss": 0.8476, + "step": 809 + }, + { + "epoch": 0.30644093445568904, + "grad_norm": 1.13603689058083, + "learning_rate": 7.999961160662905e-06, + "loss": 0.8445, + "step": 810 + }, + { + "epoch": 0.30681925659699233, + "grad_norm": 1.160741078547518, + "learning_rate": 7.999953004418086e-06, + "loss": 0.8858, + "step": 811 + }, + { + "epoch": 0.3071975787382957, + "grad_norm": 1.1137885301105297, + "learning_rate": 7.999944071394408e-06, + "loss": 0.8468, + "step": 812 + }, + { + "epoch": 0.30757590087959896, + "grad_norm": 1.0950922126362728, + "learning_rate": 7.999934361593606e-06, + "loss": 0.8277, + "step": 813 + }, + { + "epoch": 0.3079542230209023, + "grad_norm": 1.0705498486629084, + "learning_rate": 7.999923875017561e-06, + "loss": 0.8542, + "step": 814 + }, + { + "epoch": 0.3083325451622056, + "grad_norm": 1.0320443969916053, + "learning_rate": 7.999912611668314e-06, + "loss": 0.8311, + "step": 815 + }, + { + "epoch": 0.30871086730350894, + "grad_norm": 1.1098560201406311, + "learning_rate": 7.999900571548054e-06, + "loss": 0.8285, + "step": 816 + }, + { + "epoch": 0.3090891894448123, + "grad_norm": 1.117956788545042, + "learning_rate": 7.999887754659112e-06, + "loss": 0.8062, + "step": 817 + }, + { + "epoch": 0.3094675115861156, + "grad_norm": 1.0815055115388574, + "learning_rate": 7.999874161003984e-06, + "loss": 0.825, + "step": 818 + }, + { + "epoch": 0.3098458337274189, + "grad_norm": 1.1258610055051623, + "learning_rate": 7.999859790585307e-06, + "loss": 0.8544, + "step": 819 + }, + { + "epoch": 0.3102241558687222, + "grad_norm": 1.0792203366803435, + "learning_rate": 7.99984464340587e-06, + "loss": 0.8371, + "step": 820 + }, + { + "epoch": 0.31060247801002555, + "grad_norm": 1.0857066217255478, + "learning_rate": 7.999828719468619e-06, + "loss": 0.8025, + "step": 821 + }, + { + "epoch": 0.31098080015132884, + "grad_norm": 1.0345681012946357, + "learning_rate": 7.999812018776642e-06, + "loss": 0.7961, + "step": 822 + }, + { + "epoch": 0.3113591222926322, + "grad_norm": 1.0880871394519303, + "learning_rate": 7.999794541333184e-06, + "loss": 0.867, + "step": 823 + }, + { + "epoch": 0.3117374444339355, + "grad_norm": 1.0734362647252, + "learning_rate": 7.99977628714164e-06, + "loss": 0.8504, + "step": 824 + }, + { + "epoch": 0.3121157665752388, + "grad_norm": 1.0651195855212972, + "learning_rate": 7.999757256205554e-06, + "loss": 0.836, + "step": 825 + }, + { + "epoch": 0.31249408871654216, + "grad_norm": 1.0952088927990486, + "learning_rate": 7.99973744852862e-06, + "loss": 0.8685, + "step": 826 + }, + { + "epoch": 0.31287241085784545, + "grad_norm": 1.1189908995835645, + "learning_rate": 7.999716864114687e-06, + "loss": 0.8612, + "step": 827 + }, + { + "epoch": 0.3132507329991488, + "grad_norm": 1.1107627441762915, + "learning_rate": 7.999695502967753e-06, + "loss": 0.887, + "step": 828 + }, + { + "epoch": 0.3136290551404521, + "grad_norm": 1.0910830318775155, + "learning_rate": 7.999673365091965e-06, + "loss": 0.8149, + "step": 829 + }, + { + "epoch": 0.31400737728175543, + "grad_norm": 1.0878738960197105, + "learning_rate": 7.99965045049162e-06, + "loss": 0.8543, + "step": 830 + }, + { + "epoch": 0.3143856994230587, + "grad_norm": 1.1304840925957875, + "learning_rate": 7.999626759171173e-06, + "loss": 0.8607, + "step": 831 + }, + { + "epoch": 0.31476402156436206, + "grad_norm": 1.0977832972523356, + "learning_rate": 7.99960229113522e-06, + "loss": 0.8238, + "step": 832 + }, + { + "epoch": 0.31514234370566535, + "grad_norm": 1.1056029713906521, + "learning_rate": 7.999577046388514e-06, + "loss": 0.8449, + "step": 833 + }, + { + "epoch": 0.3155206658469687, + "grad_norm": 1.1263279045653014, + "learning_rate": 7.999551024935959e-06, + "loss": 0.8996, + "step": 834 + }, + { + "epoch": 0.31589898798827204, + "grad_norm": 1.1023495304424114, + "learning_rate": 7.999524226782608e-06, + "loss": 0.8059, + "step": 835 + }, + { + "epoch": 0.31627731012957533, + "grad_norm": 1.0710753056086557, + "learning_rate": 7.999496651933662e-06, + "loss": 0.8364, + "step": 836 + }, + { + "epoch": 0.3166556322708787, + "grad_norm": 1.1628408036471776, + "learning_rate": 7.999468300394481e-06, + "loss": 0.8491, + "step": 837 + }, + { + "epoch": 0.31703395441218196, + "grad_norm": 1.1011205956685801, + "learning_rate": 7.999439172170566e-06, + "loss": 0.8371, + "step": 838 + }, + { + "epoch": 0.3174122765534853, + "grad_norm": 1.067716374321139, + "learning_rate": 7.999409267267577e-06, + "loss": 0.8257, + "step": 839 + }, + { + "epoch": 0.3177905986947886, + "grad_norm": 1.1358374860128349, + "learning_rate": 7.99937858569132e-06, + "loss": 0.8317, + "step": 840 + }, + { + "epoch": 0.31816892083609194, + "grad_norm": 1.0779959631518108, + "learning_rate": 7.999347127447752e-06, + "loss": 0.7981, + "step": 841 + }, + { + "epoch": 0.31854724297739523, + "grad_norm": 1.1254796876535107, + "learning_rate": 7.999314892542985e-06, + "loss": 0.8971, + "step": 842 + }, + { + "epoch": 0.3189255651186986, + "grad_norm": 1.0901729922813403, + "learning_rate": 7.999281880983277e-06, + "loss": 0.8506, + "step": 843 + }, + { + "epoch": 0.3193038872600019, + "grad_norm": 1.0709160400913234, + "learning_rate": 7.999248092775039e-06, + "loss": 0.8468, + "step": 844 + }, + { + "epoch": 0.3196822094013052, + "grad_norm": 1.1223182444160262, + "learning_rate": 7.999213527924831e-06, + "loss": 0.8217, + "step": 845 + }, + { + "epoch": 0.32006053154260855, + "grad_norm": 1.1033066311400137, + "learning_rate": 7.99917818643937e-06, + "loss": 0.8646, + "step": 846 + }, + { + "epoch": 0.32043885368391184, + "grad_norm": 1.1122943393613496, + "learning_rate": 7.999142068325514e-06, + "loss": 0.8343, + "step": 847 + }, + { + "epoch": 0.3208171758252152, + "grad_norm": 1.1197740571480894, + "learning_rate": 7.999105173590281e-06, + "loss": 0.8408, + "step": 848 + }, + { + "epoch": 0.3211954979665185, + "grad_norm": 1.0680302459683109, + "learning_rate": 7.999067502240835e-06, + "loss": 0.8527, + "step": 849 + }, + { + "epoch": 0.3215738201078218, + "grad_norm": 1.0872491602723373, + "learning_rate": 7.99902905428449e-06, + "loss": 0.8417, + "step": 850 + }, + { + "epoch": 0.3219521422491251, + "grad_norm": 1.106663351318103, + "learning_rate": 7.998989829728712e-06, + "loss": 0.8055, + "step": 851 + }, + { + "epoch": 0.32233046439042845, + "grad_norm": 1.0809694317490106, + "learning_rate": 7.998949828581122e-06, + "loss": 0.8614, + "step": 852 + }, + { + "epoch": 0.3227087865317318, + "grad_norm": 1.102190346138006, + "learning_rate": 7.998909050849484e-06, + "loss": 0.8716, + "step": 853 + }, + { + "epoch": 0.3230871086730351, + "grad_norm": 1.0436133036323463, + "learning_rate": 7.998867496541719e-06, + "loss": 0.8575, + "step": 854 + }, + { + "epoch": 0.32346543081433843, + "grad_norm": 1.0545933388006492, + "learning_rate": 7.998825165665894e-06, + "loss": 0.8208, + "step": 855 + }, + { + "epoch": 0.3238437529556417, + "grad_norm": 1.066597036199654, + "learning_rate": 7.998782058230237e-06, + "loss": 0.7723, + "step": 856 + }, + { + "epoch": 0.32422207509694506, + "grad_norm": 1.053365311188067, + "learning_rate": 7.998738174243111e-06, + "loss": 0.8102, + "step": 857 + }, + { + "epoch": 0.32460039723824835, + "grad_norm": 1.0581107038361595, + "learning_rate": 7.99869351371304e-06, + "loss": 0.7999, + "step": 858 + }, + { + "epoch": 0.3249787193795517, + "grad_norm": 1.1008953546338276, + "learning_rate": 7.998648076648702e-06, + "loss": 0.8568, + "step": 859 + }, + { + "epoch": 0.325357041520855, + "grad_norm": 1.1417115474045594, + "learning_rate": 7.998601863058915e-06, + "loss": 0.8183, + "step": 860 + }, + { + "epoch": 0.32573536366215833, + "grad_norm": 1.0221082409435902, + "learning_rate": 7.998554872952656e-06, + "loss": 0.8236, + "step": 861 + }, + { + "epoch": 0.3261136858034617, + "grad_norm": 1.0319653291858766, + "learning_rate": 7.99850710633905e-06, + "loss": 0.8268, + "step": 862 + }, + { + "epoch": 0.32649200794476496, + "grad_norm": 1.0741619232930077, + "learning_rate": 7.998458563227374e-06, + "loss": 0.8635, + "step": 863 + }, + { + "epoch": 0.3268703300860683, + "grad_norm": 1.084988318258729, + "learning_rate": 7.998409243627051e-06, + "loss": 0.807, + "step": 864 + }, + { + "epoch": 0.3272486522273716, + "grad_norm": 1.0687498037098355, + "learning_rate": 7.998359147547665e-06, + "loss": 0.852, + "step": 865 + }, + { + "epoch": 0.32762697436867494, + "grad_norm": 1.125647258256957, + "learning_rate": 7.99830827499894e-06, + "loss": 0.8153, + "step": 866 + }, + { + "epoch": 0.32800529650997823, + "grad_norm": 1.1182770611625017, + "learning_rate": 7.998256625990756e-06, + "loss": 0.8103, + "step": 867 + }, + { + "epoch": 0.3283836186512816, + "grad_norm": 1.0564435912408205, + "learning_rate": 7.998204200533144e-06, + "loss": 0.8119, + "step": 868 + }, + { + "epoch": 0.32876194079258486, + "grad_norm": 1.1460131223742922, + "learning_rate": 7.998150998636284e-06, + "loss": 0.8289, + "step": 869 + }, + { + "epoch": 0.3291402629338882, + "grad_norm": 1.0575674306051868, + "learning_rate": 7.998097020310509e-06, + "loss": 0.8428, + "step": 870 + }, + { + "epoch": 0.32951858507519155, + "grad_norm": 1.1137833102998567, + "learning_rate": 7.9980422655663e-06, + "loss": 0.8218, + "step": 871 + }, + { + "epoch": 0.32989690721649484, + "grad_norm": 1.1107427833797017, + "learning_rate": 7.997986734414291e-06, + "loss": 0.851, + "step": 872 + }, + { + "epoch": 0.3302752293577982, + "grad_norm": 1.1272405856822123, + "learning_rate": 7.997930426865266e-06, + "loss": 0.8604, + "step": 873 + }, + { + "epoch": 0.3306535514991015, + "grad_norm": 1.0539626107226423, + "learning_rate": 7.997873342930158e-06, + "loss": 0.8531, + "step": 874 + }, + { + "epoch": 0.3310318736404048, + "grad_norm": 1.0696538969484604, + "learning_rate": 7.997815482620057e-06, + "loss": 0.838, + "step": 875 + }, + { + "epoch": 0.3314101957817081, + "grad_norm": 1.1460143163401961, + "learning_rate": 7.997756845946193e-06, + "loss": 0.7944, + "step": 876 + }, + { + "epoch": 0.33178851792301145, + "grad_norm": 1.1082280014219137, + "learning_rate": 7.997697432919957e-06, + "loss": 0.9019, + "step": 877 + }, + { + "epoch": 0.33216684006431474, + "grad_norm": 1.0841358926479614, + "learning_rate": 7.997637243552888e-06, + "loss": 0.7975, + "step": 878 + }, + { + "epoch": 0.3325451622056181, + "grad_norm": 1.056009898365743, + "learning_rate": 7.997576277856674e-06, + "loss": 0.8574, + "step": 879 + }, + { + "epoch": 0.33292348434692143, + "grad_norm": 1.0802951235255627, + "learning_rate": 7.99751453584315e-06, + "loss": 0.8155, + "step": 880 + }, + { + "epoch": 0.3333018064882247, + "grad_norm": 1.077889148763545, + "learning_rate": 7.99745201752431e-06, + "loss": 0.7963, + "step": 881 + }, + { + "epoch": 0.33368012862952806, + "grad_norm": 1.1621065299950686, + "learning_rate": 7.997388722912295e-06, + "loss": 0.8548, + "step": 882 + }, + { + "epoch": 0.33405845077083135, + "grad_norm": 1.1322105218350456, + "learning_rate": 7.997324652019394e-06, + "loss": 0.8795, + "step": 883 + }, + { + "epoch": 0.3344367729121347, + "grad_norm": 1.136478913491314, + "learning_rate": 7.997259804858054e-06, + "loss": 0.8053, + "step": 884 + }, + { + "epoch": 0.334815095053438, + "grad_norm": 1.132941842896281, + "learning_rate": 7.997194181440863e-06, + "loss": 0.8753, + "step": 885 + }, + { + "epoch": 0.3351934171947413, + "grad_norm": 1.072088751980564, + "learning_rate": 7.997127781780567e-06, + "loss": 0.8471, + "step": 886 + }, + { + "epoch": 0.3355717393360446, + "grad_norm": 1.136959198020949, + "learning_rate": 7.997060605890062e-06, + "loss": 0.8805, + "step": 887 + }, + { + "epoch": 0.33595006147734796, + "grad_norm": 1.1411444801682626, + "learning_rate": 7.996992653782392e-06, + "loss": 0.8241, + "step": 888 + }, + { + "epoch": 0.3363283836186513, + "grad_norm": 1.0911333474121823, + "learning_rate": 7.996923925470752e-06, + "loss": 0.8134, + "step": 889 + }, + { + "epoch": 0.3367067057599546, + "grad_norm": 1.0929540349841498, + "learning_rate": 7.996854420968492e-06, + "loss": 0.8362, + "step": 890 + }, + { + "epoch": 0.33708502790125794, + "grad_norm": 1.1142134518728692, + "learning_rate": 7.996784140289106e-06, + "loss": 0.8583, + "step": 891 + }, + { + "epoch": 0.3374633500425612, + "grad_norm": 1.0776120467255657, + "learning_rate": 7.996713083446245e-06, + "loss": 0.8405, + "step": 892 + }, + { + "epoch": 0.33784167218386457, + "grad_norm": 1.0315550349351374, + "learning_rate": 7.996641250453707e-06, + "loss": 0.8233, + "step": 893 + }, + { + "epoch": 0.33821999432516786, + "grad_norm": 1.1320956870150307, + "learning_rate": 7.996568641325441e-06, + "loss": 0.8497, + "step": 894 + }, + { + "epoch": 0.3385983164664712, + "grad_norm": 1.0891148355471727, + "learning_rate": 7.996495256075548e-06, + "loss": 0.8338, + "step": 895 + }, + { + "epoch": 0.3389766386077745, + "grad_norm": 1.1104610577848222, + "learning_rate": 7.99642109471828e-06, + "loss": 0.8166, + "step": 896 + }, + { + "epoch": 0.33935496074907784, + "grad_norm": 1.0961276245110951, + "learning_rate": 7.996346157268037e-06, + "loss": 0.8213, + "step": 897 + }, + { + "epoch": 0.3397332828903812, + "grad_norm": 1.053397674073016, + "learning_rate": 7.996270443739375e-06, + "loss": 0.8269, + "step": 898 + }, + { + "epoch": 0.34011160503168447, + "grad_norm": 1.05985869383675, + "learning_rate": 7.996193954146995e-06, + "loss": 0.8632, + "step": 899 + }, + { + "epoch": 0.3404899271729878, + "grad_norm": 1.0747332831609127, + "learning_rate": 7.996116688505749e-06, + "loss": 0.8308, + "step": 900 + }, + { + "epoch": 0.3408682493142911, + "grad_norm": 1.0617958908539586, + "learning_rate": 7.996038646830645e-06, + "loss": 0.8003, + "step": 901 + }, + { + "epoch": 0.34124657145559445, + "grad_norm": 1.0595674189471762, + "learning_rate": 7.995959829136837e-06, + "loss": 0.7948, + "step": 902 + }, + { + "epoch": 0.34162489359689774, + "grad_norm": 1.0753382871745762, + "learning_rate": 7.995880235439632e-06, + "loss": 0.8399, + "step": 903 + }, + { + "epoch": 0.3420032157382011, + "grad_norm": 1.1183441140434693, + "learning_rate": 7.995799865754487e-06, + "loss": 0.8221, + "step": 904 + }, + { + "epoch": 0.34238153787950437, + "grad_norm": 1.0929766123596374, + "learning_rate": 7.995718720097011e-06, + "loss": 0.8309, + "step": 905 + }, + { + "epoch": 0.3427598600208077, + "grad_norm": 1.0179073548109145, + "learning_rate": 7.995636798482959e-06, + "loss": 0.8355, + "step": 906 + }, + { + "epoch": 0.34313818216211106, + "grad_norm": 1.1183732645745317, + "learning_rate": 7.99555410092824e-06, + "loss": 0.8376, + "step": 907 + }, + { + "epoch": 0.34351650430341435, + "grad_norm": 1.165733705514543, + "learning_rate": 7.995470627448915e-06, + "loss": 0.86, + "step": 908 + }, + { + "epoch": 0.3438948264447177, + "grad_norm": 1.0552618018743587, + "learning_rate": 7.995386378061196e-06, + "loss": 0.8468, + "step": 909 + }, + { + "epoch": 0.344273148586021, + "grad_norm": 1.131651010498469, + "learning_rate": 7.995301352781439e-06, + "loss": 0.8489, + "step": 910 + }, + { + "epoch": 0.3446514707273243, + "grad_norm": 1.1028826199732988, + "learning_rate": 7.995215551626162e-06, + "loss": 0.8721, + "step": 911 + }, + { + "epoch": 0.3450297928686276, + "grad_norm": 1.1380255943103783, + "learning_rate": 7.995128974612022e-06, + "loss": 0.8484, + "step": 912 + }, + { + "epoch": 0.34540811500993096, + "grad_norm": 1.0659393620350812, + "learning_rate": 7.995041621755835e-06, + "loss": 0.8198, + "step": 913 + }, + { + "epoch": 0.34578643715123425, + "grad_norm": 1.059819166817385, + "learning_rate": 7.994953493074562e-06, + "loss": 0.8601, + "step": 914 + }, + { + "epoch": 0.3461647592925376, + "grad_norm": 1.1168724106612267, + "learning_rate": 7.994864588585323e-06, + "loss": 0.8314, + "step": 915 + }, + { + "epoch": 0.34654308143384094, + "grad_norm": 1.0696755810222651, + "learning_rate": 7.994774908305377e-06, + "loss": 0.8488, + "step": 916 + }, + { + "epoch": 0.3469214035751442, + "grad_norm": 1.1571812110459856, + "learning_rate": 7.99468445225214e-06, + "loss": 0.8157, + "step": 917 + }, + { + "epoch": 0.34729972571644757, + "grad_norm": 1.114611745775756, + "learning_rate": 7.994593220443181e-06, + "loss": 0.8368, + "step": 918 + }, + { + "epoch": 0.34767804785775086, + "grad_norm": 1.152864146273239, + "learning_rate": 7.994501212896218e-06, + "loss": 0.861, + "step": 919 + }, + { + "epoch": 0.3480563699990542, + "grad_norm": 1.1345158690879138, + "learning_rate": 7.994408429629113e-06, + "loss": 0.8163, + "step": 920 + }, + { + "epoch": 0.3484346921403575, + "grad_norm": 1.0577940861565938, + "learning_rate": 7.994314870659892e-06, + "loss": 0.7803, + "step": 921 + }, + { + "epoch": 0.34881301428166084, + "grad_norm": 1.04106331488491, + "learning_rate": 7.994220536006717e-06, + "loss": 0.8291, + "step": 922 + }, + { + "epoch": 0.3491913364229641, + "grad_norm": 1.0394935151014175, + "learning_rate": 7.99412542568791e-06, + "loss": 0.7819, + "step": 923 + }, + { + "epoch": 0.34956965856426747, + "grad_norm": 1.1306507694533081, + "learning_rate": 7.994029539721941e-06, + "loss": 0.8594, + "step": 924 + }, + { + "epoch": 0.3499479807055708, + "grad_norm": 1.0984697906601044, + "learning_rate": 7.993932878127433e-06, + "loss": 0.872, + "step": 925 + }, + { + "epoch": 0.3503263028468741, + "grad_norm": 1.0848529154386723, + "learning_rate": 7.993835440923154e-06, + "loss": 0.8668, + "step": 926 + }, + { + "epoch": 0.35070462498817745, + "grad_norm": 1.074249076888769, + "learning_rate": 7.993737228128028e-06, + "loss": 0.88, + "step": 927 + }, + { + "epoch": 0.35108294712948074, + "grad_norm": 1.0595559434730502, + "learning_rate": 7.993638239761127e-06, + "loss": 0.8448, + "step": 928 + }, + { + "epoch": 0.3514612692707841, + "grad_norm": 1.0586225742216135, + "learning_rate": 7.993538475841674e-06, + "loss": 0.806, + "step": 929 + }, + { + "epoch": 0.35183959141208737, + "grad_norm": 1.0965639423993851, + "learning_rate": 7.993437936389045e-06, + "loss": 0.8532, + "step": 930 + }, + { + "epoch": 0.3522179135533907, + "grad_norm": 1.0635648509605742, + "learning_rate": 7.99333662142276e-06, + "loss": 0.8659, + "step": 931 + }, + { + "epoch": 0.3522179135533907, + "eval_loss": 0.8405433893203735, + "eval_runtime": 26.7827, + "eval_samples_per_second": 33.044, + "eval_steps_per_second": 1.045, + "step": 931 + }, + { + "epoch": 0.3522179135533907, + "eval_bench_accuracy_arc_challenge": 0.2, + "eval_bench_accuracy_hellaswag": 0.265, + "eval_bench_accuracy_mmlu": 0.20869565217391303, + "eval_bench_average_accuracy": 0.22456521739130433, + "eval_bench_loss": 4.116911503306606, + "eval_bench_total_accuracy": 0.23076923076923078, + "step": 931 + }, + { + "epoch": 0.352596235694694, + "grad_norm": 1.071445968085627, + "learning_rate": 7.993234530962498e-06, + "loss": 0.8349, + "step": 932 + }, + { + "epoch": 0.35297455783599735, + "grad_norm": 1.1138872222419933, + "learning_rate": 7.993131665028082e-06, + "loss": 0.8369, + "step": 933 + }, + { + "epoch": 0.3533528799773007, + "grad_norm": 1.034081458809988, + "learning_rate": 7.993028023639493e-06, + "loss": 0.8302, + "step": 934 + }, + { + "epoch": 0.353731202118604, + "grad_norm": 1.0615568247982479, + "learning_rate": 7.992923606816852e-06, + "loss": 0.7956, + "step": 935 + }, + { + "epoch": 0.3541095242599073, + "grad_norm": 1.0966324306911683, + "learning_rate": 7.992818414580439e-06, + "loss": 0.8157, + "step": 936 + }, + { + "epoch": 0.3544878464012106, + "grad_norm": 1.0499428116789347, + "learning_rate": 7.992712446950682e-06, + "loss": 0.8448, + "step": 937 + }, + { + "epoch": 0.35486616854251396, + "grad_norm": 1.0929166781794446, + "learning_rate": 7.99260570394816e-06, + "loss": 0.838, + "step": 938 + }, + { + "epoch": 0.35524449068381725, + "grad_norm": 1.0784478665113866, + "learning_rate": 7.9924981855936e-06, + "loss": 0.8477, + "step": 939 + }, + { + "epoch": 0.3556228128251206, + "grad_norm": 1.112873701673093, + "learning_rate": 7.992389891907885e-06, + "loss": 0.837, + "step": 940 + }, + { + "epoch": 0.3560011349664239, + "grad_norm": 1.0396578216523251, + "learning_rate": 7.992280822912044e-06, + "loss": 0.7867, + "step": 941 + }, + { + "epoch": 0.3563794571077272, + "grad_norm": 1.1025438788531285, + "learning_rate": 7.992170978627258e-06, + "loss": 0.8588, + "step": 942 + }, + { + "epoch": 0.35675777924903057, + "grad_norm": 1.0567533995232752, + "learning_rate": 7.992060359074857e-06, + "loss": 0.8415, + "step": 943 + }, + { + "epoch": 0.35713610139033386, + "grad_norm": 1.0876544163342308, + "learning_rate": 7.991948964276324e-06, + "loss": 0.8139, + "step": 944 + }, + { + "epoch": 0.3575144235316372, + "grad_norm": 1.1119965568409491, + "learning_rate": 7.991836794253291e-06, + "loss": 0.8236, + "step": 945 + }, + { + "epoch": 0.3578927456729405, + "grad_norm": 1.050449035576396, + "learning_rate": 7.991723849027543e-06, + "loss": 0.8683, + "step": 946 + }, + { + "epoch": 0.35827106781424384, + "grad_norm": 1.0727809938491701, + "learning_rate": 7.991610128621012e-06, + "loss": 0.8637, + "step": 947 + }, + { + "epoch": 0.3586493899555471, + "grad_norm": 1.1142250081446294, + "learning_rate": 7.991495633055782e-06, + "loss": 0.8173, + "step": 948 + }, + { + "epoch": 0.35902771209685047, + "grad_norm": 1.0422992081938323, + "learning_rate": 7.99138036235409e-06, + "loss": 0.8247, + "step": 949 + }, + { + "epoch": 0.3594060342381538, + "grad_norm": 1.0683985452632145, + "learning_rate": 7.991264316538315e-06, + "loss": 0.7835, + "step": 950 + }, + { + "epoch": 0.3597843563794571, + "grad_norm": 1.1389275468673155, + "learning_rate": 7.991147495631001e-06, + "loss": 0.8263, + "step": 951 + }, + { + "epoch": 0.36016267852076045, + "grad_norm": 1.0300732494637694, + "learning_rate": 7.99102989965483e-06, + "loss": 0.8382, + "step": 952 + }, + { + "epoch": 0.36054100066206374, + "grad_norm": 1.1134877059951171, + "learning_rate": 7.990911528632637e-06, + "loss": 0.8301, + "step": 953 + }, + { + "epoch": 0.3609193228033671, + "grad_norm": 1.1556214956120872, + "learning_rate": 7.990792382587413e-06, + "loss": 0.8339, + "step": 954 + }, + { + "epoch": 0.36129764494467037, + "grad_norm": 1.0496596260111375, + "learning_rate": 7.990672461542295e-06, + "loss": 0.855, + "step": 955 + }, + { + "epoch": 0.3616759670859737, + "grad_norm": 1.0631933354628074, + "learning_rate": 7.99055176552057e-06, + "loss": 0.8028, + "step": 956 + }, + { + "epoch": 0.362054289227277, + "grad_norm": 1.112630845203049, + "learning_rate": 7.990430294545676e-06, + "loss": 0.8324, + "step": 957 + }, + { + "epoch": 0.36243261136858035, + "grad_norm": 1.047199242259213, + "learning_rate": 7.990308048641205e-06, + "loss": 0.8113, + "step": 958 + }, + { + "epoch": 0.3628109335098837, + "grad_norm": 1.027441822648717, + "learning_rate": 7.990185027830895e-06, + "loss": 0.818, + "step": 959 + }, + { + "epoch": 0.363189255651187, + "grad_norm": 1.1215384265121908, + "learning_rate": 7.990061232138636e-06, + "loss": 0.8105, + "step": 960 + }, + { + "epoch": 0.3635675777924903, + "grad_norm": 1.068442952320319, + "learning_rate": 7.989936661588471e-06, + "loss": 0.7921, + "step": 961 + }, + { + "epoch": 0.3639458999337936, + "grad_norm": 1.1092839541563482, + "learning_rate": 7.989811316204588e-06, + "loss": 0.8604, + "step": 962 + }, + { + "epoch": 0.36432422207509696, + "grad_norm": 1.071801311807864, + "learning_rate": 7.989685196011332e-06, + "loss": 0.8309, + "step": 963 + }, + { + "epoch": 0.36470254421640025, + "grad_norm": 1.0755045364863953, + "learning_rate": 7.989558301033193e-06, + "loss": 0.8281, + "step": 964 + }, + { + "epoch": 0.3650808663577036, + "grad_norm": 1.0267320983799983, + "learning_rate": 7.989430631294813e-06, + "loss": 0.8354, + "step": 965 + }, + { + "epoch": 0.3654591884990069, + "grad_norm": 1.137253491825624, + "learning_rate": 7.98930218682099e-06, + "loss": 0.879, + "step": 966 + }, + { + "epoch": 0.3658375106403102, + "grad_norm": 1.078336142946193, + "learning_rate": 7.989172967636661e-06, + "loss": 0.7937, + "step": 967 + }, + { + "epoch": 0.36621583278161357, + "grad_norm": 1.249220122221408, + "learning_rate": 7.98904297376692e-06, + "loss": 0.8719, + "step": 968 + }, + { + "epoch": 0.36659415492291686, + "grad_norm": 1.0553052489470098, + "learning_rate": 7.988912205237018e-06, + "loss": 0.8343, + "step": 969 + }, + { + "epoch": 0.3669724770642202, + "grad_norm": 1.0825650361601242, + "learning_rate": 7.988780662072345e-06, + "loss": 0.8708, + "step": 970 + }, + { + "epoch": 0.3673507992055235, + "grad_norm": 1.0492113257783737, + "learning_rate": 7.988648344298449e-06, + "loss": 0.8158, + "step": 971 + }, + { + "epoch": 0.36772912134682684, + "grad_norm": 1.1098170719484017, + "learning_rate": 7.988515251941022e-06, + "loss": 0.8072, + "step": 972 + }, + { + "epoch": 0.3681074434881301, + "grad_norm": 1.0470408388006793, + "learning_rate": 7.988381385025913e-06, + "loss": 0.8254, + "step": 973 + }, + { + "epoch": 0.36848576562943347, + "grad_norm": 1.1223023650314936, + "learning_rate": 7.988246743579118e-06, + "loss": 0.8422, + "step": 974 + }, + { + "epoch": 0.36886408777073676, + "grad_norm": 1.0378189816707217, + "learning_rate": 7.988111327626781e-06, + "loss": 0.7986, + "step": 975 + }, + { + "epoch": 0.3692424099120401, + "grad_norm": 1.0879026599404655, + "learning_rate": 7.987975137195206e-06, + "loss": 0.8239, + "step": 976 + }, + { + "epoch": 0.36962073205334345, + "grad_norm": 1.0445944467404071, + "learning_rate": 7.987838172310836e-06, + "loss": 0.7856, + "step": 977 + }, + { + "epoch": 0.36999905419464674, + "grad_norm": 1.0952504464513027, + "learning_rate": 7.987700433000268e-06, + "loss": 0.8474, + "step": 978 + }, + { + "epoch": 0.3703773763359501, + "grad_norm": 1.0976482765823483, + "learning_rate": 7.987561919290254e-06, + "loss": 0.8067, + "step": 979 + }, + { + "epoch": 0.37075569847725337, + "grad_norm": 1.0673215016151512, + "learning_rate": 7.987422631207691e-06, + "loss": 0.7747, + "step": 980 + }, + { + "epoch": 0.3711340206185567, + "grad_norm": 1.1205110055136513, + "learning_rate": 7.98728256877963e-06, + "loss": 0.7892, + "step": 981 + }, + { + "epoch": 0.37151234275986, + "grad_norm": 1.092436787430483, + "learning_rate": 7.987141732033268e-06, + "loss": 0.8332, + "step": 982 + }, + { + "epoch": 0.37189066490116335, + "grad_norm": 1.091564370951629, + "learning_rate": 7.987000120995958e-06, + "loss": 0.8318, + "step": 983 + }, + { + "epoch": 0.37226898704246664, + "grad_norm": 1.0840271784135682, + "learning_rate": 7.986857735695197e-06, + "loss": 0.8343, + "step": 984 + }, + { + "epoch": 0.37264730918377, + "grad_norm": 1.1224128911012572, + "learning_rate": 7.98671457615864e-06, + "loss": 0.8084, + "step": 985 + }, + { + "epoch": 0.3730256313250733, + "grad_norm": 1.0744788507306402, + "learning_rate": 7.986570642414086e-06, + "loss": 0.8468, + "step": 986 + }, + { + "epoch": 0.3734039534663766, + "grad_norm": 1.0627524449061605, + "learning_rate": 7.986425934489486e-06, + "loss": 0.794, + "step": 987 + }, + { + "epoch": 0.37378227560767996, + "grad_norm": 1.1606049685680029, + "learning_rate": 7.986280452412942e-06, + "loss": 0.8599, + "step": 988 + }, + { + "epoch": 0.37416059774898325, + "grad_norm": 1.1453346028219251, + "learning_rate": 7.986134196212707e-06, + "loss": 0.839, + "step": 989 + }, + { + "epoch": 0.3745389198902866, + "grad_norm": 1.047560845313498, + "learning_rate": 7.985987165917182e-06, + "loss": 0.838, + "step": 990 + }, + { + "epoch": 0.3749172420315899, + "grad_norm": 1.0691648190671164, + "learning_rate": 7.985839361554922e-06, + "loss": 0.8349, + "step": 991 + }, + { + "epoch": 0.3752955641728932, + "grad_norm": 1.0728147519090105, + "learning_rate": 7.985690783154628e-06, + "loss": 0.8082, + "step": 992 + }, + { + "epoch": 0.3756738863141965, + "grad_norm": 1.0710609346244502, + "learning_rate": 7.985541430745155e-06, + "loss": 0.8367, + "step": 993 + }, + { + "epoch": 0.37605220845549986, + "grad_norm": 1.0345097180466358, + "learning_rate": 7.985391304355508e-06, + "loss": 0.8235, + "step": 994 + }, + { + "epoch": 0.3764305305968032, + "grad_norm": 1.0627329252549442, + "learning_rate": 7.985240404014836e-06, + "loss": 0.8361, + "step": 995 + }, + { + "epoch": 0.3768088527381065, + "grad_norm": 1.055170154515539, + "learning_rate": 7.98508872975245e-06, + "loss": 0.7913, + "step": 996 + }, + { + "epoch": 0.37718717487940984, + "grad_norm": 1.0799095201174227, + "learning_rate": 7.9849362815978e-06, + "loss": 0.8143, + "step": 997 + }, + { + "epoch": 0.3775654970207131, + "grad_norm": 1.1004168575034028, + "learning_rate": 7.984783059580493e-06, + "loss": 0.8325, + "step": 998 + }, + { + "epoch": 0.37794381916201647, + "grad_norm": 1.064297565177233, + "learning_rate": 7.984629063730284e-06, + "loss": 0.7825, + "step": 999 + }, + { + "epoch": 0.37832214130331976, + "grad_norm": 1.0635329039354893, + "learning_rate": 7.984474294077078e-06, + "loss": 0.843, + "step": 1000 + }, + { + "epoch": 0.3787004634446231, + "grad_norm": 1.0134149947950788, + "learning_rate": 7.98431875065093e-06, + "loss": 0.8407, + "step": 1001 + }, + { + "epoch": 0.3790787855859264, + "grad_norm": 1.1003240739229772, + "learning_rate": 7.984162433482048e-06, + "loss": 0.8757, + "step": 1002 + }, + { + "epoch": 0.37945710772722974, + "grad_norm": 1.0704123729576063, + "learning_rate": 7.984005342600789e-06, + "loss": 0.8385, + "step": 1003 + }, + { + "epoch": 0.3798354298685331, + "grad_norm": 1.082489049237877, + "learning_rate": 7.983847478037655e-06, + "loss": 0.8494, + "step": 1004 + }, + { + "epoch": 0.38021375200983637, + "grad_norm": 1.080752264367249, + "learning_rate": 7.983688839823308e-06, + "loss": 0.8609, + "step": 1005 + }, + { + "epoch": 0.3805920741511397, + "grad_norm": 1.1968418204384677, + "learning_rate": 7.983529427988552e-06, + "loss": 0.8564, + "step": 1006 + }, + { + "epoch": 0.380970396292443, + "grad_norm": 1.061469890379153, + "learning_rate": 7.983369242564346e-06, + "loss": 0.7891, + "step": 1007 + }, + { + "epoch": 0.38134871843374635, + "grad_norm": 1.0621745023983624, + "learning_rate": 7.983208283581796e-06, + "loss": 0.864, + "step": 1008 + }, + { + "epoch": 0.38172704057504964, + "grad_norm": 1.1002758271639341, + "learning_rate": 7.98304655107216e-06, + "loss": 0.8511, + "step": 1009 + }, + { + "epoch": 0.382105362716353, + "grad_norm": 1.2982365803931801, + "learning_rate": 7.982884045066848e-06, + "loss": 0.8707, + "step": 1010 + }, + { + "epoch": 0.38248368485765627, + "grad_norm": 1.0481998500890215, + "learning_rate": 7.982720765597416e-06, + "loss": 0.808, + "step": 1011 + }, + { + "epoch": 0.3828620069989596, + "grad_norm": 1.0843657280284922, + "learning_rate": 7.982556712695573e-06, + "loss": 0.8033, + "step": 1012 + }, + { + "epoch": 0.38324032914026296, + "grad_norm": 1.056797859890995, + "learning_rate": 7.982391886393176e-06, + "loss": 0.8109, + "step": 1013 + }, + { + "epoch": 0.38361865128156625, + "grad_norm": 1.060307047043872, + "learning_rate": 7.982226286722239e-06, + "loss": 0.8485, + "step": 1014 + }, + { + "epoch": 0.3839969734228696, + "grad_norm": 1.0880414860647125, + "learning_rate": 7.982059913714915e-06, + "loss": 0.829, + "step": 1015 + }, + { + "epoch": 0.3843752955641729, + "grad_norm": 1.0647653565219015, + "learning_rate": 7.981892767403516e-06, + "loss": 0.831, + "step": 1016 + }, + { + "epoch": 0.3847536177054762, + "grad_norm": 1.1245340497823308, + "learning_rate": 7.9817248478205e-06, + "loss": 0.8633, + "step": 1017 + }, + { + "epoch": 0.3851319398467795, + "grad_norm": 1.083643967559738, + "learning_rate": 7.981556154998477e-06, + "loss": 0.8694, + "step": 1018 + }, + { + "epoch": 0.38551026198808286, + "grad_norm": 1.0892685401414424, + "learning_rate": 7.981386688970209e-06, + "loss": 0.8455, + "step": 1019 + }, + { + "epoch": 0.38588858412938615, + "grad_norm": 1.080573813534876, + "learning_rate": 7.981216449768603e-06, + "loss": 0.8028, + "step": 1020 + }, + { + "epoch": 0.3862669062706895, + "grad_norm": 1.0697257333484091, + "learning_rate": 7.981045437426718e-06, + "loss": 0.8254, + "step": 1021 + }, + { + "epoch": 0.38664522841199284, + "grad_norm": 1.1482898982014345, + "learning_rate": 7.980873651977768e-06, + "loss": 0.8434, + "step": 1022 + }, + { + "epoch": 0.3870235505532961, + "grad_norm": 1.066295131291774, + "learning_rate": 7.98070109345511e-06, + "loss": 0.7966, + "step": 1023 + }, + { + "epoch": 0.38740187269459947, + "grad_norm": 1.0329631074824188, + "learning_rate": 7.980527761892255e-06, + "loss": 0.7914, + "step": 1024 + }, + { + "epoch": 0.38778019483590276, + "grad_norm": 1.0857069666875103, + "learning_rate": 7.980353657322863e-06, + "loss": 0.8622, + "step": 1025 + }, + { + "epoch": 0.3881585169772061, + "grad_norm": 1.060211010001084, + "learning_rate": 7.980178779780747e-06, + "loss": 0.8381, + "step": 1026 + }, + { + "epoch": 0.3885368391185094, + "grad_norm": 1.0543634996329088, + "learning_rate": 7.980003129299865e-06, + "loss": 0.8378, + "step": 1027 + }, + { + "epoch": 0.38891516125981274, + "grad_norm": 1.1081388338013471, + "learning_rate": 7.979826705914328e-06, + "loss": 0.8338, + "step": 1028 + }, + { + "epoch": 0.389293483401116, + "grad_norm": 1.104557100267363, + "learning_rate": 7.9796495096584e-06, + "loss": 0.795, + "step": 1029 + }, + { + "epoch": 0.38967180554241937, + "grad_norm": 1.0655072241835162, + "learning_rate": 7.979471540566489e-06, + "loss": 0.8237, + "step": 1030 + }, + { + "epoch": 0.3900501276837227, + "grad_norm": 1.0796326933387017, + "learning_rate": 7.979292798673156e-06, + "loss": 0.8556, + "step": 1031 + }, + { + "epoch": 0.390428449825026, + "grad_norm": 1.0380712383913533, + "learning_rate": 7.979113284013114e-06, + "loss": 0.839, + "step": 1032 + }, + { + "epoch": 0.39080677196632935, + "grad_norm": 1.085425876568373, + "learning_rate": 7.97893299662122e-06, + "loss": 0.8516, + "step": 1033 + }, + { + "epoch": 0.39118509410763264, + "grad_norm": 1.2207322749435598, + "learning_rate": 7.978751936532491e-06, + "loss": 0.8549, + "step": 1034 + }, + { + "epoch": 0.391563416248936, + "grad_norm": 1.088319428223248, + "learning_rate": 7.978570103782086e-06, + "loss": 0.8573, + "step": 1035 + }, + { + "epoch": 0.39194173839023927, + "grad_norm": 1.0545678177926456, + "learning_rate": 7.978387498405317e-06, + "loss": 0.8325, + "step": 1036 + }, + { + "epoch": 0.3923200605315426, + "grad_norm": 1.0921146086499482, + "learning_rate": 7.978204120437641e-06, + "loss": 0.7912, + "step": 1037 + }, + { + "epoch": 0.3926983826728459, + "grad_norm": 1.1156394836322963, + "learning_rate": 7.978019969914676e-06, + "loss": 0.8344, + "step": 1038 + }, + { + "epoch": 0.39307670481414925, + "grad_norm": 1.1163141481746923, + "learning_rate": 7.97783504687218e-06, + "loss": 0.8039, + "step": 1039 + }, + { + "epoch": 0.3934550269554526, + "grad_norm": 1.1055832393565042, + "learning_rate": 7.977649351346065e-06, + "loss": 0.8098, + "step": 1040 + }, + { + "epoch": 0.3938333490967559, + "grad_norm": 1.0475102246909884, + "learning_rate": 7.97746288337239e-06, + "loss": 0.7868, + "step": 1041 + }, + { + "epoch": 0.3942116712380592, + "grad_norm": 1.0630199431469338, + "learning_rate": 7.977275642987371e-06, + "loss": 0.7965, + "step": 1042 + }, + { + "epoch": 0.3945899933793625, + "grad_norm": 1.1096476912788604, + "learning_rate": 7.977087630227368e-06, + "loss": 0.8052, + "step": 1043 + }, + { + "epoch": 0.39496831552066586, + "grad_norm": 1.0863091134871783, + "learning_rate": 7.976898845128891e-06, + "loss": 0.8435, + "step": 1044 + }, + { + "epoch": 0.39534663766196915, + "grad_norm": 1.0492836175021802, + "learning_rate": 7.976709287728602e-06, + "loss": 0.8083, + "step": 1045 + }, + { + "epoch": 0.3957249598032725, + "grad_norm": 1.0529300466346392, + "learning_rate": 7.976518958063315e-06, + "loss": 0.8274, + "step": 1046 + }, + { + "epoch": 0.3961032819445758, + "grad_norm": 1.070473727548606, + "learning_rate": 7.976327856169989e-06, + "loss": 0.7971, + "step": 1047 + }, + { + "epoch": 0.3964816040858791, + "grad_norm": 1.0617092300636013, + "learning_rate": 7.976135982085734e-06, + "loss": 0.8536, + "step": 1048 + }, + { + "epoch": 0.39685992622718247, + "grad_norm": 1.0606504595804507, + "learning_rate": 7.975943335847815e-06, + "loss": 0.777, + "step": 1049 + }, + { + "epoch": 0.39723824836848576, + "grad_norm": 1.1335961432026964, + "learning_rate": 7.97574991749364e-06, + "loss": 0.8707, + "step": 1050 + }, + { + "epoch": 0.3976165705097891, + "grad_norm": 1.0932495202458485, + "learning_rate": 7.975555727060773e-06, + "loss": 0.8476, + "step": 1051 + }, + { + "epoch": 0.3979948926510924, + "grad_norm": 1.0904729718461323, + "learning_rate": 7.975360764586923e-06, + "loss": 0.8325, + "step": 1052 + }, + { + "epoch": 0.39837321479239574, + "grad_norm": 1.060481887356713, + "learning_rate": 7.975165030109953e-06, + "loss": 0.8293, + "step": 1053 + }, + { + "epoch": 0.398751536933699, + "grad_norm": 1.0594136483291037, + "learning_rate": 7.974968523667874e-06, + "loss": 0.8333, + "step": 1054 + }, + { + "epoch": 0.39912985907500237, + "grad_norm": 1.072066755016977, + "learning_rate": 7.974771245298845e-06, + "loss": 0.8588, + "step": 1055 + }, + { + "epoch": 0.39950818121630566, + "grad_norm": 1.0407488984374065, + "learning_rate": 7.974573195041179e-06, + "loss": 0.8119, + "step": 1056 + }, + { + "epoch": 0.399886503357609, + "grad_norm": 1.0897696384583164, + "learning_rate": 7.974374372933333e-06, + "loss": 0.8729, + "step": 1057 + }, + { + "epoch": 0.40026482549891235, + "grad_norm": 1.0395716067441272, + "learning_rate": 7.974174779013923e-06, + "loss": 0.844, + "step": 1058 + }, + { + "epoch": 0.40064314764021564, + "grad_norm": 1.0440432063315428, + "learning_rate": 7.973974413321706e-06, + "loss": 0.8311, + "step": 1059 + }, + { + "epoch": 0.401021469781519, + "grad_norm": 1.085811930524537, + "learning_rate": 7.973773275895593e-06, + "loss": 0.8506, + "step": 1060 + }, + { + "epoch": 0.40139979192282227, + "grad_norm": 1.017123583458792, + "learning_rate": 7.973571366774646e-06, + "loss": 0.7491, + "step": 1061 + }, + { + "epoch": 0.4017781140641256, + "grad_norm": 1.041022717188848, + "learning_rate": 7.973368685998074e-06, + "loss": 0.8189, + "step": 1062 + }, + { + "epoch": 0.4021564362054289, + "grad_norm": 1.0150607929017172, + "learning_rate": 7.973165233605234e-06, + "loss": 0.814, + "step": 1063 + }, + { + "epoch": 0.40253475834673225, + "grad_norm": 1.0458554860554623, + "learning_rate": 7.972961009635642e-06, + "loss": 0.8123, + "step": 1064 + }, + { + "epoch": 0.40253475834673225, + "eval_loss": 0.8304316997528076, + "eval_runtime": 26.6669, + "eval_samples_per_second": 33.187, + "eval_steps_per_second": 1.05, + "step": 1064 + }, + { + "epoch": 0.40253475834673225, + "eval_bench_accuracy_arc_challenge": 0.25, + "eval_bench_accuracy_hellaswag": 0.285, + "eval_bench_accuracy_mmlu": 0.2782608695652174, + "eval_bench_average_accuracy": 0.2710869565217391, + "eval_bench_loss": 4.517480147512336, + "eval_bench_total_accuracy": 0.2725274725274725, + "step": 1064 + }, + { + "epoch": 0.40291308048803554, + "grad_norm": 1.037409138160307, + "learning_rate": 7.972756014128952e-06, + "loss": 0.8159, + "step": 1065 + }, + { + "epoch": 0.4032914026293389, + "grad_norm": 1.0836167448402902, + "learning_rate": 7.972550247124976e-06, + "loss": 0.8131, + "step": 1066 + }, + { + "epoch": 0.4036697247706422, + "grad_norm": 1.0933137283571555, + "learning_rate": 7.972343708663674e-06, + "loss": 0.8183, + "step": 1067 + }, + { + "epoch": 0.4040480469119455, + "grad_norm": 1.03216484709328, + "learning_rate": 7.972136398785154e-06, + "loss": 0.8569, + "step": 1068 + }, + { + "epoch": 0.40442636905324886, + "grad_norm": 1.0656155608965763, + "learning_rate": 7.971928317529676e-06, + "loss": 0.8453, + "step": 1069 + }, + { + "epoch": 0.40480469119455215, + "grad_norm": 1.0708238570639999, + "learning_rate": 7.971719464937647e-06, + "loss": 0.8367, + "step": 1070 + }, + { + "epoch": 0.4051830133358555, + "grad_norm": 1.0621498480602682, + "learning_rate": 7.971509841049628e-06, + "loss": 0.8589, + "step": 1071 + }, + { + "epoch": 0.4055613354771588, + "grad_norm": 1.0072315129856741, + "learning_rate": 7.971299445906324e-06, + "loss": 0.8379, + "step": 1072 + }, + { + "epoch": 0.4059396576184621, + "grad_norm": 1.033456153626471, + "learning_rate": 7.971088279548597e-06, + "loss": 0.8079, + "step": 1073 + }, + { + "epoch": 0.4063179797597654, + "grad_norm": 1.0079272901425842, + "learning_rate": 7.970876342017452e-06, + "loss": 0.7868, + "step": 1074 + }, + { + "epoch": 0.40669630190106876, + "grad_norm": 1.0073805003714849, + "learning_rate": 7.970663633354047e-06, + "loss": 0.7988, + "step": 1075 + }, + { + "epoch": 0.4070746240423721, + "grad_norm": 1.0708487426838318, + "learning_rate": 7.97045015359969e-06, + "loss": 0.8026, + "step": 1076 + }, + { + "epoch": 0.4074529461836754, + "grad_norm": 1.069671541329999, + "learning_rate": 7.970235902795838e-06, + "loss": 0.8462, + "step": 1077 + }, + { + "epoch": 0.40783126832497874, + "grad_norm": 1.0250427566221285, + "learning_rate": 7.9700208809841e-06, + "loss": 0.819, + "step": 1078 + }, + { + "epoch": 0.408209590466282, + "grad_norm": 1.035811754086645, + "learning_rate": 7.969805088206226e-06, + "loss": 0.8192, + "step": 1079 + }, + { + "epoch": 0.40858791260758537, + "grad_norm": 1.0919846226041652, + "learning_rate": 7.96958852450413e-06, + "loss": 0.8463, + "step": 1080 + }, + { + "epoch": 0.40896623474888866, + "grad_norm": 1.0922304905923719, + "learning_rate": 7.969371189919865e-06, + "loss": 0.8505, + "step": 1081 + }, + { + "epoch": 0.409344556890192, + "grad_norm": 1.0327335666733615, + "learning_rate": 7.969153084495636e-06, + "loss": 0.8054, + "step": 1082 + }, + { + "epoch": 0.4097228790314953, + "grad_norm": 1.069756821894608, + "learning_rate": 7.968934208273798e-06, + "loss": 0.8348, + "step": 1083 + }, + { + "epoch": 0.41010120117279864, + "grad_norm": 1.0472686446394408, + "learning_rate": 7.968714561296859e-06, + "loss": 0.8302, + "step": 1084 + }, + { + "epoch": 0.410479523314102, + "grad_norm": 1.0462638623089058, + "learning_rate": 7.96849414360747e-06, + "loss": 0.8249, + "step": 1085 + }, + { + "epoch": 0.41085784545540527, + "grad_norm": 1.0056327093077677, + "learning_rate": 7.96827295524844e-06, + "loss": 0.7795, + "step": 1086 + }, + { + "epoch": 0.4112361675967086, + "grad_norm": 1.0244037556207601, + "learning_rate": 7.968050996262716e-06, + "loss": 0.7905, + "step": 1087 + }, + { + "epoch": 0.4116144897380119, + "grad_norm": 1.0346973741005767, + "learning_rate": 7.967828266693409e-06, + "loss": 0.8371, + "step": 1088 + }, + { + "epoch": 0.41199281187931525, + "grad_norm": 1.0958021967982934, + "learning_rate": 7.96760476658377e-06, + "loss": 0.8479, + "step": 1089 + }, + { + "epoch": 0.41237113402061853, + "grad_norm": 1.0136255102022522, + "learning_rate": 7.967380495977201e-06, + "loss": 0.8055, + "step": 1090 + }, + { + "epoch": 0.4127494561619219, + "grad_norm": 1.0687414316917077, + "learning_rate": 7.967155454917255e-06, + "loss": 0.8481, + "step": 1091 + }, + { + "epoch": 0.4131277783032252, + "grad_norm": 1.0765456661292323, + "learning_rate": 7.966929643447634e-06, + "loss": 0.8115, + "step": 1092 + }, + { + "epoch": 0.4135061004445285, + "grad_norm": 1.078258124622418, + "learning_rate": 7.966703061612192e-06, + "loss": 0.8319, + "step": 1093 + }, + { + "epoch": 0.41388442258583186, + "grad_norm": 1.0491237525414794, + "learning_rate": 7.966475709454928e-06, + "loss": 0.8592, + "step": 1094 + }, + { + "epoch": 0.41426274472713515, + "grad_norm": 1.0719668981104609, + "learning_rate": 7.966247587019994e-06, + "loss": 0.821, + "step": 1095 + }, + { + "epoch": 0.4146410668684385, + "grad_norm": 1.026254989024167, + "learning_rate": 7.966018694351691e-06, + "loss": 0.8168, + "step": 1096 + }, + { + "epoch": 0.4150193890097418, + "grad_norm": 1.0321711854785867, + "learning_rate": 7.96578903149447e-06, + "loss": 0.8255, + "step": 1097 + }, + { + "epoch": 0.4153977111510451, + "grad_norm": 1.0513898483857722, + "learning_rate": 7.965558598492929e-06, + "loss": 0.7748, + "step": 1098 + }, + { + "epoch": 0.4157760332923484, + "grad_norm": 1.0364175851458883, + "learning_rate": 7.965327395391819e-06, + "loss": 0.7978, + "step": 1099 + }, + { + "epoch": 0.41615435543365176, + "grad_norm": 0.985307760157813, + "learning_rate": 7.965095422236038e-06, + "loss": 0.801, + "step": 1100 + }, + { + "epoch": 0.4165326775749551, + "grad_norm": 1.0813628193591218, + "learning_rate": 7.964862679070634e-06, + "loss": 0.845, + "step": 1101 + }, + { + "epoch": 0.4169109997162584, + "grad_norm": 1.0734207809402587, + "learning_rate": 7.964629165940808e-06, + "loss": 0.8817, + "step": 1102 + }, + { + "epoch": 0.41728932185756173, + "grad_norm": 1.0599230797124688, + "learning_rate": 7.964394882891904e-06, + "loss": 0.8085, + "step": 1103 + }, + { + "epoch": 0.417667643998865, + "grad_norm": 1.078793670107089, + "learning_rate": 7.96415982996942e-06, + "loss": 0.7938, + "step": 1104 + }, + { + "epoch": 0.41804596614016837, + "grad_norm": 1.0350357122236093, + "learning_rate": 7.963924007219002e-06, + "loss": 0.8207, + "step": 1105 + }, + { + "epoch": 0.41842428828147166, + "grad_norm": 1.041240999715739, + "learning_rate": 7.963687414686449e-06, + "loss": 0.7737, + "step": 1106 + }, + { + "epoch": 0.418802610422775, + "grad_norm": 1.1066667842190356, + "learning_rate": 7.963450052417703e-06, + "loss": 0.8191, + "step": 1107 + }, + { + "epoch": 0.4191809325640783, + "grad_norm": 1.0866062695241046, + "learning_rate": 7.963211920458863e-06, + "loss": 0.8098, + "step": 1108 + }, + { + "epoch": 0.41955925470538163, + "grad_norm": 1.0628974307927237, + "learning_rate": 7.962973018856169e-06, + "loss": 0.836, + "step": 1109 + }, + { + "epoch": 0.419937576846685, + "grad_norm": 1.0490148472801595, + "learning_rate": 7.962733347656018e-06, + "loss": 0.8074, + "step": 1110 + }, + { + "epoch": 0.42031589898798827, + "grad_norm": 1.056521276681419, + "learning_rate": 7.962492906904953e-06, + "loss": 0.7798, + "step": 1111 + }, + { + "epoch": 0.4206942211292916, + "grad_norm": 1.0568484786859005, + "learning_rate": 7.962251696649665e-06, + "loss": 0.832, + "step": 1112 + }, + { + "epoch": 0.4210725432705949, + "grad_norm": 1.022548771593414, + "learning_rate": 7.962009716937e-06, + "loss": 0.8576, + "step": 1113 + }, + { + "epoch": 0.42145086541189825, + "grad_norm": 1.0376517279626776, + "learning_rate": 7.961766967813946e-06, + "loss": 0.7709, + "step": 1114 + }, + { + "epoch": 0.42182918755320153, + "grad_norm": 1.057176802372392, + "learning_rate": 7.961523449327646e-06, + "loss": 0.8684, + "step": 1115 + }, + { + "epoch": 0.4222075096945049, + "grad_norm": 1.0278310719203412, + "learning_rate": 7.961279161525389e-06, + "loss": 0.7934, + "step": 1116 + }, + { + "epoch": 0.42258583183580817, + "grad_norm": 1.0116937469277474, + "learning_rate": 7.961034104454618e-06, + "loss": 0.8288, + "step": 1117 + }, + { + "epoch": 0.4229641539771115, + "grad_norm": 1.0791508367529585, + "learning_rate": 7.960788278162918e-06, + "loss": 0.8295, + "step": 1118 + }, + { + "epoch": 0.42334247611841486, + "grad_norm": 1.0482664569638203, + "learning_rate": 7.960541682698034e-06, + "loss": 0.8044, + "step": 1119 + }, + { + "epoch": 0.42372079825971815, + "grad_norm": 1.026033507367731, + "learning_rate": 7.960294318107847e-06, + "loss": 0.8086, + "step": 1120 + }, + { + "epoch": 0.4240991204010215, + "grad_norm": 1.0713832704640005, + "learning_rate": 7.960046184440399e-06, + "loss": 0.8421, + "step": 1121 + }, + { + "epoch": 0.4244774425423248, + "grad_norm": 1.0635267452769637, + "learning_rate": 7.959797281743876e-06, + "loss": 0.8452, + "step": 1122 + }, + { + "epoch": 0.4248557646836281, + "grad_norm": 1.046318335512741, + "learning_rate": 7.959547610066613e-06, + "loss": 0.7944, + "step": 1123 + }, + { + "epoch": 0.4252340868249314, + "grad_norm": 1.0788089412291229, + "learning_rate": 7.959297169457097e-06, + "loss": 0.8338, + "step": 1124 + }, + { + "epoch": 0.42561240896623476, + "grad_norm": 1.0582140885008549, + "learning_rate": 7.959045959963962e-06, + "loss": 0.7914, + "step": 1125 + }, + { + "epoch": 0.42599073110753805, + "grad_norm": 1.0773203264262958, + "learning_rate": 7.958793981635991e-06, + "loss": 0.8549, + "step": 1126 + }, + { + "epoch": 0.4263690532488414, + "grad_norm": 1.0738918058139102, + "learning_rate": 7.958541234522119e-06, + "loss": 0.7836, + "step": 1127 + }, + { + "epoch": 0.42674737539014473, + "grad_norm": 1.0307363548970123, + "learning_rate": 7.958287718671429e-06, + "loss": 0.829, + "step": 1128 + }, + { + "epoch": 0.427125697531448, + "grad_norm": 1.0223432647328048, + "learning_rate": 7.958033434133152e-06, + "loss": 0.8421, + "step": 1129 + }, + { + "epoch": 0.42750401967275137, + "grad_norm": 1.0402584891579054, + "learning_rate": 7.95777838095667e-06, + "loss": 0.7836, + "step": 1130 + }, + { + "epoch": 0.42788234181405466, + "grad_norm": 1.0761841482760737, + "learning_rate": 7.957522559191514e-06, + "loss": 0.7933, + "step": 1131 + }, + { + "epoch": 0.428260663955358, + "grad_norm": 1.0391476619745978, + "learning_rate": 7.957265968887361e-06, + "loss": 0.811, + "step": 1132 + }, + { + "epoch": 0.4286389860966613, + "grad_norm": 1.026814188051067, + "learning_rate": 7.957008610094043e-06, + "loss": 0.8078, + "step": 1133 + }, + { + "epoch": 0.42901730823796463, + "grad_norm": 1.0406330571564124, + "learning_rate": 7.956750482861538e-06, + "loss": 0.8359, + "step": 1134 + }, + { + "epoch": 0.4293956303792679, + "grad_norm": 1.0642979501183267, + "learning_rate": 7.956491587239971e-06, + "loss": 0.8045, + "step": 1135 + }, + { + "epoch": 0.42977395252057127, + "grad_norm": 1.0393212545559525, + "learning_rate": 7.956231923279624e-06, + "loss": 0.8348, + "step": 1136 + }, + { + "epoch": 0.4301522746618746, + "grad_norm": 1.0470124602821342, + "learning_rate": 7.955971491030917e-06, + "loss": 0.8148, + "step": 1137 + }, + { + "epoch": 0.4305305968031779, + "grad_norm": 1.0676455383028118, + "learning_rate": 7.955710290544428e-06, + "loss": 0.8336, + "step": 1138 + }, + { + "epoch": 0.43090891894448125, + "grad_norm": 1.0721667527067038, + "learning_rate": 7.955448321870882e-06, + "loss": 0.831, + "step": 1139 + }, + { + "epoch": 0.43128724108578453, + "grad_norm": 1.064318000094558, + "learning_rate": 7.955185585061151e-06, + "loss": 0.8335, + "step": 1140 + }, + { + "epoch": 0.4316655632270879, + "grad_norm": 1.0302584817777816, + "learning_rate": 7.95492208016626e-06, + "loss": 0.791, + "step": 1141 + }, + { + "epoch": 0.43204388536839117, + "grad_norm": 1.0256366632375336, + "learning_rate": 7.954657807237379e-06, + "loss": 0.8253, + "step": 1142 + }, + { + "epoch": 0.4324222075096945, + "grad_norm": 1.0251051777197329, + "learning_rate": 7.954392766325828e-06, + "loss": 0.8223, + "step": 1143 + }, + { + "epoch": 0.4328005296509978, + "grad_norm": 1.045445405795435, + "learning_rate": 7.954126957483077e-06, + "loss": 0.7606, + "step": 1144 + }, + { + "epoch": 0.43317885179230115, + "grad_norm": 1.0425200750958303, + "learning_rate": 7.95386038076075e-06, + "loss": 0.8537, + "step": 1145 + }, + { + "epoch": 0.4335571739336045, + "grad_norm": 1.0419269404142824, + "learning_rate": 7.953593036210611e-06, + "loss": 0.8277, + "step": 1146 + }, + { + "epoch": 0.4339354960749078, + "grad_norm": 1.084574429840746, + "learning_rate": 7.953324923884578e-06, + "loss": 0.803, + "step": 1147 + }, + { + "epoch": 0.4343138182162111, + "grad_norm": 1.0419638253671073, + "learning_rate": 7.953056043834717e-06, + "loss": 0.8334, + "step": 1148 + }, + { + "epoch": 0.4346921403575144, + "grad_norm": 1.0168098031537844, + "learning_rate": 7.952786396113248e-06, + "loss": 0.7849, + "step": 1149 + }, + { + "epoch": 0.43507046249881776, + "grad_norm": 1.0391261866313206, + "learning_rate": 7.95251598077253e-06, + "loss": 0.792, + "step": 1150 + }, + { + "epoch": 0.43544878464012104, + "grad_norm": 1.0145928185391837, + "learning_rate": 7.95224479786508e-06, + "loss": 0.8069, + "step": 1151 + }, + { + "epoch": 0.4358271067814244, + "grad_norm": 1.0145834983924735, + "learning_rate": 7.951972847443561e-06, + "loss": 0.8045, + "step": 1152 + }, + { + "epoch": 0.4362054289227277, + "grad_norm": 1.0385429868897398, + "learning_rate": 7.951700129560786e-06, + "loss": 0.8091, + "step": 1153 + }, + { + "epoch": 0.436583751064031, + "grad_norm": 1.0484204110539974, + "learning_rate": 7.951426644269712e-06, + "loss": 0.8118, + "step": 1154 + }, + { + "epoch": 0.43696207320533437, + "grad_norm": 1.059201104727976, + "learning_rate": 7.951152391623452e-06, + "loss": 0.8335, + "step": 1155 + }, + { + "epoch": 0.43734039534663766, + "grad_norm": 1.0061721443896443, + "learning_rate": 7.950877371675265e-06, + "loss": 0.7489, + "step": 1156 + }, + { + "epoch": 0.437718717487941, + "grad_norm": 1.0920232553881484, + "learning_rate": 7.950601584478557e-06, + "loss": 0.8012, + "step": 1157 + }, + { + "epoch": 0.4380970396292443, + "grad_norm": 1.0519115174631195, + "learning_rate": 7.950325030086889e-06, + "loss": 0.7923, + "step": 1158 + }, + { + "epoch": 0.43847536177054763, + "grad_norm": 1.0813679052789027, + "learning_rate": 7.950047708553962e-06, + "loss": 0.8313, + "step": 1159 + }, + { + "epoch": 0.4388536839118509, + "grad_norm": 1.0854599046397435, + "learning_rate": 7.949769619933634e-06, + "loss": 0.8616, + "step": 1160 + }, + { + "epoch": 0.43923200605315427, + "grad_norm": 1.1104488658598137, + "learning_rate": 7.94949076427991e-06, + "loss": 0.7878, + "step": 1161 + }, + { + "epoch": 0.43961032819445756, + "grad_norm": 1.1346641422155257, + "learning_rate": 7.949211141646941e-06, + "loss": 0.8287, + "step": 1162 + }, + { + "epoch": 0.4399886503357609, + "grad_norm": 1.0632008460543734, + "learning_rate": 7.948930752089029e-06, + "loss": 0.8278, + "step": 1163 + }, + { + "epoch": 0.44036697247706424, + "grad_norm": 1.0770714736885665, + "learning_rate": 7.948649595660626e-06, + "loss": 0.794, + "step": 1164 + }, + { + "epoch": 0.44074529461836753, + "grad_norm": 1.0320296674718166, + "learning_rate": 7.948367672416329e-06, + "loss": 0.7973, + "step": 1165 + }, + { + "epoch": 0.4411236167596709, + "grad_norm": 1.037195297637391, + "learning_rate": 7.94808498241089e-06, + "loss": 0.8124, + "step": 1166 + }, + { + "epoch": 0.44150193890097417, + "grad_norm": 1.07174382564237, + "learning_rate": 7.947801525699204e-06, + "loss": 0.8501, + "step": 1167 + }, + { + "epoch": 0.4418802610422775, + "grad_norm": 1.0423383360705205, + "learning_rate": 7.947517302336321e-06, + "loss": 0.8023, + "step": 1168 + }, + { + "epoch": 0.4422585831835808, + "grad_norm": 1.0225149206809994, + "learning_rate": 7.947232312377431e-06, + "loss": 0.8082, + "step": 1169 + }, + { + "epoch": 0.44263690532488414, + "grad_norm": 1.0490213514112987, + "learning_rate": 7.946946555877883e-06, + "loss": 0.8553, + "step": 1170 + }, + { + "epoch": 0.44301522746618743, + "grad_norm": 1.0565295484573578, + "learning_rate": 7.946660032893168e-06, + "loss": 0.8334, + "step": 1171 + }, + { + "epoch": 0.4433935496074908, + "grad_norm": 1.096379949923879, + "learning_rate": 7.946372743478928e-06, + "loss": 0.7885, + "step": 1172 + }, + { + "epoch": 0.4437718717487941, + "grad_norm": 1.0635010257740696, + "learning_rate": 7.946084687690952e-06, + "loss": 0.867, + "step": 1173 + }, + { + "epoch": 0.4441501938900974, + "grad_norm": 1.046045957242929, + "learning_rate": 7.945795865585184e-06, + "loss": 0.7794, + "step": 1174 + }, + { + "epoch": 0.44452851603140076, + "grad_norm": 1.1358219370976814, + "learning_rate": 7.945506277217707e-06, + "loss": 0.8048, + "step": 1175 + }, + { + "epoch": 0.44490683817270404, + "grad_norm": 1.0850391747638126, + "learning_rate": 7.945215922644764e-06, + "loss": 0.8056, + "step": 1176 + }, + { + "epoch": 0.4452851603140074, + "grad_norm": 1.1532691295951847, + "learning_rate": 7.944924801922734e-06, + "loss": 0.8176, + "step": 1177 + }, + { + "epoch": 0.4456634824553107, + "grad_norm": 1.0915907522482993, + "learning_rate": 7.944632915108158e-06, + "loss": 0.7994, + "step": 1178 + }, + { + "epoch": 0.446041804596614, + "grad_norm": 1.0282978902411528, + "learning_rate": 7.944340262257718e-06, + "loss": 0.8263, + "step": 1179 + }, + { + "epoch": 0.4464201267379173, + "grad_norm": 1.1021567277496518, + "learning_rate": 7.944046843428244e-06, + "loss": 0.829, + "step": 1180 + }, + { + "epoch": 0.44679844887922066, + "grad_norm": 1.0694612963890957, + "learning_rate": 7.94375265867672e-06, + "loss": 0.8565, + "step": 1181 + }, + { + "epoch": 0.447176771020524, + "grad_norm": 1.0750903881599976, + "learning_rate": 7.943457708060272e-06, + "loss": 0.8396, + "step": 1182 + }, + { + "epoch": 0.4475550931618273, + "grad_norm": 1.0453024844416716, + "learning_rate": 7.943161991636183e-06, + "loss": 0.8096, + "step": 1183 + }, + { + "epoch": 0.44793341530313063, + "grad_norm": 1.0657511458371332, + "learning_rate": 7.942865509461879e-06, + "loss": 0.7964, + "step": 1184 + }, + { + "epoch": 0.4483117374444339, + "grad_norm": 1.0565556737130861, + "learning_rate": 7.942568261594931e-06, + "loss": 0.8254, + "step": 1185 + }, + { + "epoch": 0.44869005958573727, + "grad_norm": 1.0811193147116154, + "learning_rate": 7.942270248093072e-06, + "loss": 0.8741, + "step": 1186 + }, + { + "epoch": 0.44906838172704056, + "grad_norm": 1.0468093016525521, + "learning_rate": 7.941971469014168e-06, + "loss": 0.8379, + "step": 1187 + }, + { + "epoch": 0.4494467038683439, + "grad_norm": 1.06315933336805, + "learning_rate": 7.941671924416245e-06, + "loss": 0.8294, + "step": 1188 + }, + { + "epoch": 0.4498250260096472, + "grad_norm": 1.044215685157516, + "learning_rate": 7.941371614357473e-06, + "loss": 0.8093, + "step": 1189 + }, + { + "epoch": 0.45020334815095053, + "grad_norm": 1.0172723595558777, + "learning_rate": 7.941070538896172e-06, + "loss": 0.777, + "step": 1190 + }, + { + "epoch": 0.4505816702922539, + "grad_norm": 1.0750120304696666, + "learning_rate": 7.940768698090809e-06, + "loss": 0.8105, + "step": 1191 + }, + { + "epoch": 0.45095999243355717, + "grad_norm": 1.0440692979176232, + "learning_rate": 7.940466091999999e-06, + "loss": 0.8537, + "step": 1192 + }, + { + "epoch": 0.4513383145748605, + "grad_norm": 1.031643540251273, + "learning_rate": 7.940162720682508e-06, + "loss": 0.8362, + "step": 1193 + }, + { + "epoch": 0.4517166367161638, + "grad_norm": 1.0019678147671374, + "learning_rate": 7.939858584197252e-06, + "loss": 0.8142, + "step": 1194 + }, + { + "epoch": 0.45209495885746714, + "grad_norm": 1.060840824446392, + "learning_rate": 7.939553682603292e-06, + "loss": 0.7826, + "step": 1195 + }, + { + "epoch": 0.45247328099877043, + "grad_norm": 1.0604407355830034, + "learning_rate": 7.939248015959839e-06, + "loss": 0.8276, + "step": 1196 + }, + { + "epoch": 0.4528516031400738, + "grad_norm": 1.0445689437408072, + "learning_rate": 7.938941584326251e-06, + "loss": 0.7994, + "step": 1197 + }, + { + "epoch": 0.4528516031400738, + "eval_loss": 0.8220446705818176, + "eval_runtime": 26.7666, + "eval_samples_per_second": 33.064, + "eval_steps_per_second": 1.046, + "step": 1197 + }, + { + "epoch": 0.4528516031400738, + "eval_bench_accuracy_arc_challenge": 0.2571428571428571, + "eval_bench_accuracy_hellaswag": 0.225, + "eval_bench_accuracy_mmlu": 0.23478260869565218, + "eval_bench_average_accuracy": 0.23897515527950308, + "eval_bench_loss": 5.286834716796875, + "eval_bench_total_accuracy": 0.23736263736263735, + "step": 1197 + }, + { + "epoch": 0.45322992528137707, + "grad_norm": 1.0158388274699295, + "learning_rate": 7.938634387762039e-06, + "loss": 0.8241, + "step": 1198 + }, + { + "epoch": 0.4536082474226804, + "grad_norm": 1.165515743538843, + "learning_rate": 7.938326426326857e-06, + "loss": 0.8526, + "step": 1199 + }, + { + "epoch": 0.45398656956398376, + "grad_norm": 1.0460295029244764, + "learning_rate": 7.938017700080514e-06, + "loss": 0.7998, + "step": 1200 + }, + { + "epoch": 0.45436489170528704, + "grad_norm": 1.0837173342344641, + "learning_rate": 7.93770820908296e-06, + "loss": 0.7997, + "step": 1201 + }, + { + "epoch": 0.4547432138465904, + "grad_norm": 1.0243169477083875, + "learning_rate": 7.937397953394296e-06, + "loss": 0.7991, + "step": 1202 + }, + { + "epoch": 0.4551215359878937, + "grad_norm": 1.0695328376321132, + "learning_rate": 7.937086933074777e-06, + "loss": 0.7884, + "step": 1203 + }, + { + "epoch": 0.455499858129197, + "grad_norm": 1.0594971537497897, + "learning_rate": 7.9367751481848e-06, + "loss": 0.793, + "step": 1204 + }, + { + "epoch": 0.4558781802705003, + "grad_norm": 1.0554812656920887, + "learning_rate": 7.936462598784913e-06, + "loss": 0.8283, + "step": 1205 + }, + { + "epoch": 0.45625650241180365, + "grad_norm": 1.0592140535117982, + "learning_rate": 7.936149284935811e-06, + "loss": 0.8323, + "step": 1206 + }, + { + "epoch": 0.45663482455310694, + "grad_norm": 1.026196033728254, + "learning_rate": 7.935835206698342e-06, + "loss": 0.8024, + "step": 1207 + }, + { + "epoch": 0.4570131466944103, + "grad_norm": 1.0292414805578125, + "learning_rate": 7.935520364133494e-06, + "loss": 0.7895, + "step": 1208 + }, + { + "epoch": 0.45739146883571363, + "grad_norm": 1.0251629830106175, + "learning_rate": 7.935204757302413e-06, + "loss": 0.8086, + "step": 1209 + }, + { + "epoch": 0.4577697909770169, + "grad_norm": 1.0757191280770386, + "learning_rate": 7.934888386266387e-06, + "loss": 0.8562, + "step": 1210 + }, + { + "epoch": 0.45814811311832027, + "grad_norm": 1.0698429731328996, + "learning_rate": 7.934571251086853e-06, + "loss": 0.8518, + "step": 1211 + }, + { + "epoch": 0.45852643525962355, + "grad_norm": 1.074189860162607, + "learning_rate": 7.934253351825402e-06, + "loss": 0.7941, + "step": 1212 + }, + { + "epoch": 0.4589047574009269, + "grad_norm": 1.0538357299975836, + "learning_rate": 7.933934688543764e-06, + "loss": 0.8394, + "step": 1213 + }, + { + "epoch": 0.4592830795422302, + "grad_norm": 1.0421117329655678, + "learning_rate": 7.933615261303826e-06, + "loss": 0.7609, + "step": 1214 + }, + { + "epoch": 0.45966140168353353, + "grad_norm": 1.0391554404129049, + "learning_rate": 7.933295070167617e-06, + "loss": 0.8257, + "step": 1215 + }, + { + "epoch": 0.4600397238248368, + "grad_norm": 1.0446148939643307, + "learning_rate": 7.93297411519732e-06, + "loss": 0.8104, + "step": 1216 + }, + { + "epoch": 0.46041804596614017, + "grad_norm": 1.0344384305012022, + "learning_rate": 7.932652396455262e-06, + "loss": 0.8044, + "step": 1217 + }, + { + "epoch": 0.4607963681074435, + "grad_norm": 1.0733053009164926, + "learning_rate": 7.932329914003919e-06, + "loss": 0.8174, + "step": 1218 + }, + { + "epoch": 0.4611746902487468, + "grad_norm": 1.0714389655461505, + "learning_rate": 7.932006667905917e-06, + "loss": 0.8255, + "step": 1219 + }, + { + "epoch": 0.46155301239005014, + "grad_norm": 1.028255926596019, + "learning_rate": 7.93168265822403e-06, + "loss": 0.8132, + "step": 1220 + }, + { + "epoch": 0.46193133453135343, + "grad_norm": 1.0523184669233379, + "learning_rate": 7.93135788502118e-06, + "loss": 0.8428, + "step": 1221 + }, + { + "epoch": 0.4623096566726568, + "grad_norm": 1.0557227987751663, + "learning_rate": 7.931032348360435e-06, + "loss": 0.8332, + "step": 1222 + }, + { + "epoch": 0.46268797881396007, + "grad_norm": 1.0609398608821474, + "learning_rate": 7.930706048305015e-06, + "loss": 0.8254, + "step": 1223 + }, + { + "epoch": 0.4630663009552634, + "grad_norm": 1.0113270947271225, + "learning_rate": 7.930378984918286e-06, + "loss": 0.8335, + "step": 1224 + }, + { + "epoch": 0.4634446230965667, + "grad_norm": 1.0131305243085915, + "learning_rate": 7.93005115826376e-06, + "loss": 0.7971, + "step": 1225 + }, + { + "epoch": 0.46382294523787004, + "grad_norm": 1.0569179946125011, + "learning_rate": 7.929722568405108e-06, + "loss": 0.8166, + "step": 1226 + }, + { + "epoch": 0.4642012673791734, + "grad_norm": 1.042578338856108, + "learning_rate": 7.929393215406131e-06, + "loss": 0.8204, + "step": 1227 + }, + { + "epoch": 0.4645795895204767, + "grad_norm": 1.0748606201799873, + "learning_rate": 7.929063099330795e-06, + "loss": 0.8152, + "step": 1228 + }, + { + "epoch": 0.46495791166178, + "grad_norm": 1.0587959397105573, + "learning_rate": 7.928732220243206e-06, + "loss": 0.8452, + "step": 1229 + }, + { + "epoch": 0.4653362338030833, + "grad_norm": 1.0914151462165957, + "learning_rate": 7.928400578207617e-06, + "loss": 0.8131, + "step": 1230 + }, + { + "epoch": 0.46571455594438665, + "grad_norm": 1.0396349529813116, + "learning_rate": 7.928068173288438e-06, + "loss": 0.8113, + "step": 1231 + }, + { + "epoch": 0.46609287808568994, + "grad_norm": 1.0607390438435043, + "learning_rate": 7.927735005550215e-06, + "loss": 0.8368, + "step": 1232 + }, + { + "epoch": 0.4664712002269933, + "grad_norm": 1.0290648955783543, + "learning_rate": 7.927401075057652e-06, + "loss": 0.808, + "step": 1233 + }, + { + "epoch": 0.46684952236829663, + "grad_norm": 1.0438273949617254, + "learning_rate": 7.927066381875595e-06, + "loss": 0.8109, + "step": 1234 + }, + { + "epoch": 0.4672278445095999, + "grad_norm": 1.0492773898494756, + "learning_rate": 7.926730926069041e-06, + "loss": 0.8263, + "step": 1235 + }, + { + "epoch": 0.46760616665090327, + "grad_norm": 1.0898615275461312, + "learning_rate": 7.926394707703133e-06, + "loss": 0.8417, + "step": 1236 + }, + { + "epoch": 0.46798448879220655, + "grad_norm": 1.0371312864392424, + "learning_rate": 7.926057726843167e-06, + "loss": 0.7853, + "step": 1237 + }, + { + "epoch": 0.4683628109335099, + "grad_norm": 1.0311331135840094, + "learning_rate": 7.925719983554582e-06, + "loss": 0.8433, + "step": 1238 + }, + { + "epoch": 0.4687411330748132, + "grad_norm": 1.0104501833340858, + "learning_rate": 7.925381477902967e-06, + "loss": 0.8246, + "step": 1239 + }, + { + "epoch": 0.46911945521611653, + "grad_norm": 1.033351900846643, + "learning_rate": 7.92504220995406e-06, + "loss": 0.801, + "step": 1240 + }, + { + "epoch": 0.4694977773574198, + "grad_norm": 1.0678576004897766, + "learning_rate": 7.92470217977374e-06, + "loss": 0.7953, + "step": 1241 + }, + { + "epoch": 0.46987609949872317, + "grad_norm": 1.049154054889686, + "learning_rate": 7.924361387428047e-06, + "loss": 0.8034, + "step": 1242 + }, + { + "epoch": 0.4702544216400265, + "grad_norm": 1.0501910151623293, + "learning_rate": 7.924019832983159e-06, + "loss": 0.8421, + "step": 1243 + }, + { + "epoch": 0.4706327437813298, + "grad_norm": 1.0265699705882914, + "learning_rate": 7.923677516505404e-06, + "loss": 0.7909, + "step": 1244 + }, + { + "epoch": 0.47101106592263314, + "grad_norm": 1.0395280931797561, + "learning_rate": 7.92333443806126e-06, + "loss": 0.8283, + "step": 1245 + }, + { + "epoch": 0.47138938806393643, + "grad_norm": 1.006365421675378, + "learning_rate": 7.922990597717352e-06, + "loss": 0.8065, + "step": 1246 + }, + { + "epoch": 0.4717677102052398, + "grad_norm": 1.0276097967827926, + "learning_rate": 7.922645995540453e-06, + "loss": 0.808, + "step": 1247 + }, + { + "epoch": 0.47214603234654307, + "grad_norm": 0.990132630477362, + "learning_rate": 7.922300631597482e-06, + "loss": 0.8006, + "step": 1248 + }, + { + "epoch": 0.4725243544878464, + "grad_norm": 1.047163368722463, + "learning_rate": 7.921954505955508e-06, + "loss": 0.7698, + "step": 1249 + }, + { + "epoch": 0.4729026766291497, + "grad_norm": 1.0735335320173403, + "learning_rate": 7.921607618681748e-06, + "loss": 0.807, + "step": 1250 + }, + { + "epoch": 0.47328099877045304, + "grad_norm": 1.0461927309518722, + "learning_rate": 7.921259969843568e-06, + "loss": 0.8158, + "step": 1251 + }, + { + "epoch": 0.4736593209117564, + "grad_norm": 1.0478396570827158, + "learning_rate": 7.920911559508476e-06, + "loss": 0.8386, + "step": 1252 + }, + { + "epoch": 0.4740376430530597, + "grad_norm": 1.0449949458790635, + "learning_rate": 7.920562387744139e-06, + "loss": 0.769, + "step": 1253 + }, + { + "epoch": 0.474415965194363, + "grad_norm": 1.0333564168358704, + "learning_rate": 7.92021245461836e-06, + "loss": 0.7821, + "step": 1254 + }, + { + "epoch": 0.4747942873356663, + "grad_norm": 1.0160573616445434, + "learning_rate": 7.919861760199095e-06, + "loss": 0.8134, + "step": 1255 + }, + { + "epoch": 0.47517260947696965, + "grad_norm": 1.113593494987971, + "learning_rate": 7.91951030455445e-06, + "loss": 0.8009, + "step": 1256 + }, + { + "epoch": 0.47555093161827294, + "grad_norm": 1.0583016464392816, + "learning_rate": 7.919158087752675e-06, + "loss": 0.8338, + "step": 1257 + }, + { + "epoch": 0.4759292537595763, + "grad_norm": 1.0274177510689335, + "learning_rate": 7.918805109862172e-06, + "loss": 0.7701, + "step": 1258 + }, + { + "epoch": 0.4763075759008796, + "grad_norm": 0.9716066799511451, + "learning_rate": 7.918451370951486e-06, + "loss": 0.7624, + "step": 1259 + }, + { + "epoch": 0.4766858980421829, + "grad_norm": 1.0417278811736634, + "learning_rate": 7.91809687108931e-06, + "loss": 0.8515, + "step": 1260 + }, + { + "epoch": 0.47706422018348627, + "grad_norm": 1.0815755118948713, + "learning_rate": 7.917741610344492e-06, + "loss": 0.826, + "step": 1261 + }, + { + "epoch": 0.47744254232478955, + "grad_norm": 0.994132013241377, + "learning_rate": 7.917385588786019e-06, + "loss": 0.8112, + "step": 1262 + }, + { + "epoch": 0.4778208644660929, + "grad_norm": 1.0835320028786077, + "learning_rate": 7.91702880648303e-06, + "loss": 0.8283, + "step": 1263 + }, + { + "epoch": 0.4781991866073962, + "grad_norm": 1.0656905256693705, + "learning_rate": 7.916671263504812e-06, + "loss": 0.8112, + "step": 1264 + }, + { + "epoch": 0.47857750874869953, + "grad_norm": 1.0642356494274112, + "learning_rate": 7.916312959920796e-06, + "loss": 0.8187, + "step": 1265 + }, + { + "epoch": 0.4789558308900028, + "grad_norm": 1.1132626507153238, + "learning_rate": 7.915953895800568e-06, + "loss": 0.8333, + "step": 1266 + }, + { + "epoch": 0.47933415303130616, + "grad_norm": 1.0964935829984281, + "learning_rate": 7.915594071213852e-06, + "loss": 0.8555, + "step": 1267 + }, + { + "epoch": 0.47971247517260945, + "grad_norm": 1.0333616049038883, + "learning_rate": 7.915233486230529e-06, + "loss": 0.8002, + "step": 1268 + }, + { + "epoch": 0.4800907973139128, + "grad_norm": 1.0938509373019147, + "learning_rate": 7.914872140920622e-06, + "loss": 0.8222, + "step": 1269 + }, + { + "epoch": 0.48046911945521614, + "grad_norm": 1.0500659271586612, + "learning_rate": 7.914510035354302e-06, + "loss": 0.7984, + "step": 1270 + }, + { + "epoch": 0.48084744159651943, + "grad_norm": 1.0412102283401292, + "learning_rate": 7.914147169601891e-06, + "loss": 0.8178, + "step": 1271 + }, + { + "epoch": 0.4812257637378228, + "grad_norm": 0.9740307673809164, + "learning_rate": 7.913783543733856e-06, + "loss": 0.7733, + "step": 1272 + }, + { + "epoch": 0.48160408587912606, + "grad_norm": 1.069013806380367, + "learning_rate": 7.91341915782081e-06, + "loss": 0.8355, + "step": 1273 + }, + { + "epoch": 0.4819824080204294, + "grad_norm": 1.020794082270209, + "learning_rate": 7.913054011933518e-06, + "loss": 0.8066, + "step": 1274 + }, + { + "epoch": 0.4823607301617327, + "grad_norm": 1.0710477291242142, + "learning_rate": 7.91268810614289e-06, + "loss": 0.822, + "step": 1275 + }, + { + "epoch": 0.48273905230303604, + "grad_norm": 1.021706668635038, + "learning_rate": 7.912321440519982e-06, + "loss": 0.8393, + "step": 1276 + }, + { + "epoch": 0.48311737444433933, + "grad_norm": 1.0381317605620335, + "learning_rate": 7.911954015136e-06, + "loss": 0.8001, + "step": 1277 + }, + { + "epoch": 0.4834956965856427, + "grad_norm": 1.0491889355455017, + "learning_rate": 7.9115858300623e-06, + "loss": 0.8424, + "step": 1278 + }, + { + "epoch": 0.483874018726946, + "grad_norm": 1.027527176211447, + "learning_rate": 7.911216885370377e-06, + "loss": 0.7934, + "step": 1279 + }, + { + "epoch": 0.4842523408682493, + "grad_norm": 1.0241159829134092, + "learning_rate": 7.910847181131883e-06, + "loss": 0.8632, + "step": 1280 + }, + { + "epoch": 0.48463066300955265, + "grad_norm": 1.050840821158761, + "learning_rate": 7.910476717418613e-06, + "loss": 0.8341, + "step": 1281 + }, + { + "epoch": 0.48500898515085594, + "grad_norm": 1.0312020050809032, + "learning_rate": 7.910105494302508e-06, + "loss": 0.8124, + "step": 1282 + }, + { + "epoch": 0.4853873072921593, + "grad_norm": 1.058895959078315, + "learning_rate": 7.90973351185566e-06, + "loss": 0.8179, + "step": 1283 + }, + { + "epoch": 0.4857656294334626, + "grad_norm": 1.0442278097312725, + "learning_rate": 7.909360770150308e-06, + "loss": 0.8251, + "step": 1284 + }, + { + "epoch": 0.4861439515747659, + "grad_norm": 1.0685857966408454, + "learning_rate": 7.908987269258834e-06, + "loss": 0.8506, + "step": 1285 + }, + { + "epoch": 0.4865222737160692, + "grad_norm": 1.1080322429830538, + "learning_rate": 7.908613009253774e-06, + "loss": 0.825, + "step": 1286 + }, + { + "epoch": 0.48690059585737255, + "grad_norm": 1.0340810208381146, + "learning_rate": 7.908237990207805e-06, + "loss": 0.7916, + "step": 1287 + }, + { + "epoch": 0.4872789179986759, + "grad_norm": 1.0420175323828418, + "learning_rate": 7.907862212193758e-06, + "loss": 0.822, + "step": 1288 + }, + { + "epoch": 0.4876572401399792, + "grad_norm": 1.0199603577395158, + "learning_rate": 7.907485675284604e-06, + "loss": 0.8082, + "step": 1289 + }, + { + "epoch": 0.48803556228128253, + "grad_norm": 1.0282638290755661, + "learning_rate": 7.907108379553467e-06, + "loss": 0.8308, + "step": 1290 + }, + { + "epoch": 0.4884138844225858, + "grad_norm": 1.0699234725043125, + "learning_rate": 7.90673032507362e-06, + "loss": 0.809, + "step": 1291 + }, + { + "epoch": 0.48879220656388916, + "grad_norm": 1.0537759557907738, + "learning_rate": 7.906351511918477e-06, + "loss": 0.8244, + "step": 1292 + }, + { + "epoch": 0.48917052870519245, + "grad_norm": 1.0220073412783424, + "learning_rate": 7.905971940161603e-06, + "loss": 0.8313, + "step": 1293 + }, + { + "epoch": 0.4895488508464958, + "grad_norm": 1.0751723455689177, + "learning_rate": 7.905591609876708e-06, + "loss": 0.8373, + "step": 1294 + }, + { + "epoch": 0.4899271729877991, + "grad_norm": 1.0162597179792359, + "learning_rate": 7.905210521137654e-06, + "loss": 0.8142, + "step": 1295 + }, + { + "epoch": 0.49030549512910243, + "grad_norm": 1.0733965520897772, + "learning_rate": 7.904828674018446e-06, + "loss": 0.8325, + "step": 1296 + }, + { + "epoch": 0.4906838172704058, + "grad_norm": 1.0275444217813758, + "learning_rate": 7.904446068593236e-06, + "loss": 0.812, + "step": 1297 + }, + { + "epoch": 0.49106213941170906, + "grad_norm": 1.0074767810899912, + "learning_rate": 7.904062704936325e-06, + "loss": 0.8072, + "step": 1298 + }, + { + "epoch": 0.4914404615530124, + "grad_norm": 1.0390065488319102, + "learning_rate": 7.903678583122165e-06, + "loss": 0.8008, + "step": 1299 + }, + { + "epoch": 0.4918187836943157, + "grad_norm": 0.9868065507715447, + "learning_rate": 7.903293703225345e-06, + "loss": 0.816, + "step": 1300 + }, + { + "epoch": 0.49219710583561904, + "grad_norm": 1.0553901493428994, + "learning_rate": 7.902908065320615e-06, + "loss": 0.835, + "step": 1301 + }, + { + "epoch": 0.49257542797692233, + "grad_norm": 1.0153758567731757, + "learning_rate": 7.902521669482858e-06, + "loss": 0.7622, + "step": 1302 + }, + { + "epoch": 0.4929537501182257, + "grad_norm": 1.039524643535567, + "learning_rate": 7.902134515787115e-06, + "loss": 0.8219, + "step": 1303 + }, + { + "epoch": 0.49333207225952896, + "grad_norm": 1.0193352620631986, + "learning_rate": 7.901746604308567e-06, + "loss": 0.7745, + "step": 1304 + }, + { + "epoch": 0.4937103944008323, + "grad_norm": 1.0237247993056149, + "learning_rate": 7.901357935122549e-06, + "loss": 0.7918, + "step": 1305 + }, + { + "epoch": 0.49408871654213565, + "grad_norm": 1.018379832975063, + "learning_rate": 7.900968508304535e-06, + "loss": 0.8111, + "step": 1306 + }, + { + "epoch": 0.49446703868343894, + "grad_norm": 1.116472085720671, + "learning_rate": 7.900578323930154e-06, + "loss": 0.7942, + "step": 1307 + }, + { + "epoch": 0.4948453608247423, + "grad_norm": 1.0587349903275387, + "learning_rate": 7.900187382075179e-06, + "loss": 0.7992, + "step": 1308 + }, + { + "epoch": 0.4952236829660456, + "grad_norm": 1.0058048161089288, + "learning_rate": 7.899795682815525e-06, + "loss": 0.7812, + "step": 1309 + }, + { + "epoch": 0.4956020051073489, + "grad_norm": 1.0466221891639538, + "learning_rate": 7.899403226227265e-06, + "loss": 0.8172, + "step": 1310 + }, + { + "epoch": 0.4959803272486522, + "grad_norm": 1.021072365800396, + "learning_rate": 7.899010012386609e-06, + "loss": 0.7917, + "step": 1311 + }, + { + "epoch": 0.49635864938995555, + "grad_norm": 1.0276680529834, + "learning_rate": 7.898616041369919e-06, + "loss": 0.806, + "step": 1312 + }, + { + "epoch": 0.49673697153125884, + "grad_norm": 1.0080935461504426, + "learning_rate": 7.898221313253703e-06, + "loss": 0.7839, + "step": 1313 + }, + { + "epoch": 0.4971152936725622, + "grad_norm": 1.045973831410194, + "learning_rate": 7.897825828114615e-06, + "loss": 0.8396, + "step": 1314 + }, + { + "epoch": 0.49749361581386553, + "grad_norm": 1.0314643332651545, + "learning_rate": 7.897429586029458e-06, + "loss": 0.845, + "step": 1315 + }, + { + "epoch": 0.4978719379551688, + "grad_norm": 1.0214806015923183, + "learning_rate": 7.897032587075181e-06, + "loss": 0.8178, + "step": 1316 + }, + { + "epoch": 0.49825026009647216, + "grad_norm": 1.0739578792818636, + "learning_rate": 7.896634831328881e-06, + "loss": 0.803, + "step": 1317 + }, + { + "epoch": 0.49862858223777545, + "grad_norm": 1.1075886688146952, + "learning_rate": 7.8962363188678e-06, + "loss": 0.7869, + "step": 1318 + }, + { + "epoch": 0.4990069043790788, + "grad_norm": 1.0212558702854573, + "learning_rate": 7.895837049769326e-06, + "loss": 0.8181, + "step": 1319 + }, + { + "epoch": 0.4993852265203821, + "grad_norm": 1.0781905029615857, + "learning_rate": 7.895437024111e-06, + "loss": 0.8469, + "step": 1320 + }, + { + "epoch": 0.49976354866168543, + "grad_norm": 1.0970231389243905, + "learning_rate": 7.895036241970501e-06, + "loss": 0.8268, + "step": 1321 + }, + { + "epoch": 0.5001418708029888, + "grad_norm": 0.9979190002347814, + "learning_rate": 7.894634703425664e-06, + "loss": 0.82, + "step": 1322 + }, + { + "epoch": 0.5005201929442921, + "grad_norm": 1.011211832148979, + "learning_rate": 7.894232408554466e-06, + "loss": 0.7793, + "step": 1323 + }, + { + "epoch": 0.5008985150855954, + "grad_norm": 1.058479892971991, + "learning_rate": 7.893829357435027e-06, + "loss": 0.8557, + "step": 1324 + }, + { + "epoch": 0.5012768372268988, + "grad_norm": 1.067675718676119, + "learning_rate": 7.893425550145624e-06, + "loss": 0.8075, + "step": 1325 + }, + { + "epoch": 0.501655159368202, + "grad_norm": 1.0748158502027498, + "learning_rate": 7.893020986764671e-06, + "loss": 0.8217, + "step": 1326 + }, + { + "epoch": 0.5020334815095053, + "grad_norm": 1.0371866926324267, + "learning_rate": 7.892615667370736e-06, + "loss": 0.786, + "step": 1327 + }, + { + "epoch": 0.5024118036508086, + "grad_norm": 1.0227845872267822, + "learning_rate": 7.892209592042528e-06, + "loss": 0.851, + "step": 1328 + }, + { + "epoch": 0.502790125792112, + "grad_norm": 1.053385595871815, + "learning_rate": 7.891802760858909e-06, + "loss": 0.8131, + "step": 1329 + }, + { + "epoch": 0.5031684479334153, + "grad_norm": 1.0858668827753901, + "learning_rate": 7.89139517389888e-06, + "loss": 0.8178, + "step": 1330 + }, + { + "epoch": 0.5031684479334153, + "eval_loss": 0.8155249357223511, + "eval_runtime": 26.9154, + "eval_samples_per_second": 32.881, + "eval_steps_per_second": 1.04, + "step": 1330 + }, + { + "epoch": 0.5031684479334153, + "eval_bench_accuracy_arc_challenge": 0.22857142857142856, + "eval_bench_accuracy_hellaswag": 0.255, + "eval_bench_accuracy_mmlu": 0.2782608695652174, + "eval_bench_average_accuracy": 0.253944099378882, + "eval_bench_loss": 5.252888461999726, + "eval_bench_total_accuracy": 0.25274725274725274, + "step": 1330 + }, + { + "epoch": 0.5035467700747186, + "grad_norm": 1.0418553186067219, + "learning_rate": 7.890986831241598e-06, + "loss": 0.7842, + "step": 1331 + }, + { + "epoch": 0.503925092216022, + "grad_norm": 1.027783298562076, + "learning_rate": 7.890577732966358e-06, + "loss": 0.7925, + "step": 1332 + }, + { + "epoch": 0.5043034143573253, + "grad_norm": 1.0399175596382164, + "learning_rate": 7.890167879152609e-06, + "loss": 0.8595, + "step": 1333 + }, + { + "epoch": 0.5046817364986286, + "grad_norm": 1.0324556300456535, + "learning_rate": 7.88975726987994e-06, + "loss": 0.8402, + "step": 1334 + }, + { + "epoch": 0.5050600586399319, + "grad_norm": 1.0669911175427689, + "learning_rate": 7.889345905228092e-06, + "loss": 0.8132, + "step": 1335 + }, + { + "epoch": 0.5054383807812353, + "grad_norm": 1.07761249948945, + "learning_rate": 7.888933785276951e-06, + "loss": 0.8122, + "step": 1336 + }, + { + "epoch": 0.5058167029225386, + "grad_norm": 1.0315582279231172, + "learning_rate": 7.888520910106548e-06, + "loss": 0.8063, + "step": 1337 + }, + { + "epoch": 0.5061950250638418, + "grad_norm": 1.028383480686869, + "learning_rate": 7.888107279797064e-06, + "loss": 0.8115, + "step": 1338 + }, + { + "epoch": 0.5065733472051451, + "grad_norm": 1.1084019164549017, + "learning_rate": 7.887692894428822e-06, + "loss": 0.8586, + "step": 1339 + }, + { + "epoch": 0.5069516693464485, + "grad_norm": 1.0246273881178, + "learning_rate": 7.887277754082298e-06, + "loss": 0.7968, + "step": 1340 + }, + { + "epoch": 0.5073299914877518, + "grad_norm": 1.0537510788483588, + "learning_rate": 7.886861858838109e-06, + "loss": 0.7794, + "step": 1341 + }, + { + "epoch": 0.5077083136290551, + "grad_norm": 1.025698434441957, + "learning_rate": 7.88644520877702e-06, + "loss": 0.7983, + "step": 1342 + }, + { + "epoch": 0.5080866357703585, + "grad_norm": 1.0480085776508747, + "learning_rate": 7.886027803979946e-06, + "loss": 0.8016, + "step": 1343 + }, + { + "epoch": 0.5084649579116618, + "grad_norm": 1.0461816558010573, + "learning_rate": 7.885609644527943e-06, + "loss": 0.8189, + "step": 1344 + }, + { + "epoch": 0.5088432800529651, + "grad_norm": 0.993326821555258, + "learning_rate": 7.885190730502215e-06, + "loss": 0.7957, + "step": 1345 + }, + { + "epoch": 0.5092216021942684, + "grad_norm": 1.0745480385635238, + "learning_rate": 7.884771061984118e-06, + "loss": 0.8019, + "step": 1346 + }, + { + "epoch": 0.5095999243355718, + "grad_norm": 1.0384805298302937, + "learning_rate": 7.884350639055147e-06, + "loss": 0.8395, + "step": 1347 + }, + { + "epoch": 0.5099782464768751, + "grad_norm": 1.020760024227472, + "learning_rate": 7.883929461796949e-06, + "loss": 0.7919, + "step": 1348 + }, + { + "epoch": 0.5103565686181784, + "grad_norm": 1.0426222802625165, + "learning_rate": 7.883507530291315e-06, + "loss": 0.8133, + "step": 1349 + }, + { + "epoch": 0.5107348907594818, + "grad_norm": 1.0236106718012763, + "learning_rate": 7.883084844620181e-06, + "loss": 0.7525, + "step": 1350 + }, + { + "epoch": 0.511113212900785, + "grad_norm": 1.0752909757757687, + "learning_rate": 7.882661404865635e-06, + "loss": 0.8363, + "step": 1351 + }, + { + "epoch": 0.5114915350420883, + "grad_norm": 1.0496011841679878, + "learning_rate": 7.882237211109903e-06, + "loss": 0.825, + "step": 1352 + }, + { + "epoch": 0.5118698571833916, + "grad_norm": 1.052905405929199, + "learning_rate": 7.881812263435365e-06, + "loss": 0.7808, + "step": 1353 + }, + { + "epoch": 0.512248179324695, + "grad_norm": 1.0383149467870931, + "learning_rate": 7.881386561924544e-06, + "loss": 0.8258, + "step": 1354 + }, + { + "epoch": 0.5126265014659983, + "grad_norm": 1.0142846574710827, + "learning_rate": 7.880960106660112e-06, + "loss": 0.832, + "step": 1355 + }, + { + "epoch": 0.5130048236073016, + "grad_norm": 1.0162105056610324, + "learning_rate": 7.880532897724882e-06, + "loss": 0.8271, + "step": 1356 + }, + { + "epoch": 0.5133831457486049, + "grad_norm": 1.0111397828819904, + "learning_rate": 7.880104935201817e-06, + "loss": 0.7716, + "step": 1357 + }, + { + "epoch": 0.5137614678899083, + "grad_norm": 1.0387312593547113, + "learning_rate": 7.879676219174028e-06, + "loss": 0.7856, + "step": 1358 + }, + { + "epoch": 0.5141397900312116, + "grad_norm": 1.0976300200992746, + "learning_rate": 7.879246749724769e-06, + "loss": 0.8214, + "step": 1359 + }, + { + "epoch": 0.5145181121725149, + "grad_norm": 1.0225148649560976, + "learning_rate": 7.878816526937443e-06, + "loss": 0.8154, + "step": 1360 + }, + { + "epoch": 0.5148964343138183, + "grad_norm": 1.0564511900500775, + "learning_rate": 7.878385550895597e-06, + "loss": 0.7706, + "step": 1361 + }, + { + "epoch": 0.5152747564551216, + "grad_norm": 1.065194818654382, + "learning_rate": 7.877953821682924e-06, + "loss": 0.7806, + "step": 1362 + }, + { + "epoch": 0.5156530785964248, + "grad_norm": 1.0318627975030588, + "learning_rate": 7.877521339383267e-06, + "loss": 0.8317, + "step": 1363 + }, + { + "epoch": 0.5160314007377281, + "grad_norm": 1.0660496042471788, + "learning_rate": 7.877088104080612e-06, + "loss": 0.8116, + "step": 1364 + }, + { + "epoch": 0.5164097228790315, + "grad_norm": 1.0084811396262128, + "learning_rate": 7.87665411585909e-06, + "loss": 0.8233, + "step": 1365 + }, + { + "epoch": 0.5167880450203348, + "grad_norm": 1.0061856631615549, + "learning_rate": 7.876219374802983e-06, + "loss": 0.8226, + "step": 1366 + }, + { + "epoch": 0.5171663671616381, + "grad_norm": 0.9962092519447693, + "learning_rate": 7.875783880996717e-06, + "loss": 0.7949, + "step": 1367 + }, + { + "epoch": 0.5175446893029415, + "grad_norm": 1.0320181154699064, + "learning_rate": 7.87534763452486e-06, + "loss": 0.8078, + "step": 1368 + }, + { + "epoch": 0.5179230114442448, + "grad_norm": 1.0366220904643662, + "learning_rate": 7.87491063547213e-06, + "loss": 0.7915, + "step": 1369 + }, + { + "epoch": 0.5183013335855481, + "grad_norm": 0.9990483570523689, + "learning_rate": 7.874472883923396e-06, + "loss": 0.7962, + "step": 1370 + }, + { + "epoch": 0.5186796557268514, + "grad_norm": 1.072712099895109, + "learning_rate": 7.874034379963663e-06, + "loss": 0.8201, + "step": 1371 + }, + { + "epoch": 0.5190579778681548, + "grad_norm": 1.0469398611990606, + "learning_rate": 7.873595123678088e-06, + "loss": 0.8295, + "step": 1372 + }, + { + "epoch": 0.5194363000094581, + "grad_norm": 1.0258466230718022, + "learning_rate": 7.873155115151976e-06, + "loss": 0.7962, + "step": 1373 + }, + { + "epoch": 0.5198146221507614, + "grad_norm": 1.0150744464405486, + "learning_rate": 7.872714354470771e-06, + "loss": 0.8091, + "step": 1374 + }, + { + "epoch": 0.5201929442920646, + "grad_norm": 1.0877815460579687, + "learning_rate": 7.87227284172007e-06, + "loss": 0.8449, + "step": 1375 + }, + { + "epoch": 0.520571266433368, + "grad_norm": 0.9989012315656198, + "learning_rate": 7.871830576985613e-06, + "loss": 0.7904, + "step": 1376 + }, + { + "epoch": 0.5209495885746713, + "grad_norm": 1.0281663493359343, + "learning_rate": 7.871387560353288e-06, + "loss": 0.8235, + "step": 1377 + }, + { + "epoch": 0.5213279107159746, + "grad_norm": 1.013255314723829, + "learning_rate": 7.870943791909124e-06, + "loss": 0.8137, + "step": 1378 + }, + { + "epoch": 0.521706232857278, + "grad_norm": 1.0404202767535178, + "learning_rate": 7.870499271739304e-06, + "loss": 0.8331, + "step": 1379 + }, + { + "epoch": 0.5220845549985813, + "grad_norm": 1.0008843854289766, + "learning_rate": 7.870053999930149e-06, + "loss": 0.7985, + "step": 1380 + }, + { + "epoch": 0.5224628771398846, + "grad_norm": 1.115907702208107, + "learning_rate": 7.869607976568131e-06, + "loss": 0.8444, + "step": 1381 + }, + { + "epoch": 0.5228411992811879, + "grad_norm": 1.0499698053880258, + "learning_rate": 7.869161201739866e-06, + "loss": 0.7875, + "step": 1382 + }, + { + "epoch": 0.5232195214224913, + "grad_norm": 1.0086891227734494, + "learning_rate": 7.868713675532115e-06, + "loss": 0.7981, + "step": 1383 + }, + { + "epoch": 0.5235978435637946, + "grad_norm": 1.0416968121742411, + "learning_rate": 7.868265398031788e-06, + "loss": 0.8082, + "step": 1384 + }, + { + "epoch": 0.5239761657050979, + "grad_norm": 0.9956171233693443, + "learning_rate": 7.86781636932594e-06, + "loss": 0.8497, + "step": 1385 + }, + { + "epoch": 0.5243544878464013, + "grad_norm": 1.0366372693126888, + "learning_rate": 7.867366589501767e-06, + "loss": 0.7878, + "step": 1386 + }, + { + "epoch": 0.5247328099877046, + "grad_norm": 1.0252929211171813, + "learning_rate": 7.86691605864662e-06, + "loss": 0.8254, + "step": 1387 + }, + { + "epoch": 0.5251111321290078, + "grad_norm": 1.0349722097719734, + "learning_rate": 7.866464776847987e-06, + "loss": 0.8092, + "step": 1388 + }, + { + "epoch": 0.5254894542703111, + "grad_norm": 1.0775801625166288, + "learning_rate": 7.866012744193508e-06, + "loss": 0.8032, + "step": 1389 + }, + { + "epoch": 0.5258677764116145, + "grad_norm": 1.025158242287074, + "learning_rate": 7.865559960770964e-06, + "loss": 0.7777, + "step": 1390 + }, + { + "epoch": 0.5262460985529178, + "grad_norm": 1.0261907345479138, + "learning_rate": 7.865106426668287e-06, + "loss": 0.7656, + "step": 1391 + }, + { + "epoch": 0.5266244206942211, + "grad_norm": 1.0119949142526334, + "learning_rate": 7.864652141973549e-06, + "loss": 0.817, + "step": 1392 + }, + { + "epoch": 0.5270027428355244, + "grad_norm": 0.9887922738590984, + "learning_rate": 7.864197106774973e-06, + "loss": 0.7871, + "step": 1393 + }, + { + "epoch": 0.5273810649768278, + "grad_norm": 1.0473369889166892, + "learning_rate": 7.863741321160924e-06, + "loss": 0.7885, + "step": 1394 + }, + { + "epoch": 0.5277593871181311, + "grad_norm": 1.021975230127612, + "learning_rate": 7.863284785219916e-06, + "loss": 0.7862, + "step": 1395 + }, + { + "epoch": 0.5281377092594344, + "grad_norm": 1.0624890686836679, + "learning_rate": 7.862827499040604e-06, + "loss": 0.8445, + "step": 1396 + }, + { + "epoch": 0.5285160314007378, + "grad_norm": 1.0159701351719927, + "learning_rate": 7.862369462711795e-06, + "loss": 0.8084, + "step": 1397 + }, + { + "epoch": 0.5288943535420411, + "grad_norm": 1.0307854419947649, + "learning_rate": 7.861910676322434e-06, + "loss": 0.7957, + "step": 1398 + }, + { + "epoch": 0.5292726756833444, + "grad_norm": 1.088274510577477, + "learning_rate": 7.861451139961622e-06, + "loss": 0.8134, + "step": 1399 + }, + { + "epoch": 0.5296509978246476, + "grad_norm": 1.1610468987478788, + "learning_rate": 7.860990853718593e-06, + "loss": 0.7706, + "step": 1400 + }, + { + "epoch": 0.530029319965951, + "grad_norm": 1.0709949089292212, + "learning_rate": 7.860529817682737e-06, + "loss": 0.839, + "step": 1401 + }, + { + "epoch": 0.5304076421072543, + "grad_norm": 1.0641189768424455, + "learning_rate": 7.860068031943586e-06, + "loss": 0.7794, + "step": 1402 + }, + { + "epoch": 0.5307859642485576, + "grad_norm": 1.0425801957230985, + "learning_rate": 7.859605496590816e-06, + "loss": 0.7982, + "step": 1403 + }, + { + "epoch": 0.531164286389861, + "grad_norm": 1.0561738214600724, + "learning_rate": 7.859142211714251e-06, + "loss": 0.8298, + "step": 1404 + }, + { + "epoch": 0.5315426085311643, + "grad_norm": 1.0034598628819673, + "learning_rate": 7.858678177403859e-06, + "loss": 0.842, + "step": 1405 + }, + { + "epoch": 0.5319209306724676, + "grad_norm": 1.0174154185360578, + "learning_rate": 7.858213393749755e-06, + "loss": 0.8024, + "step": 1406 + }, + { + "epoch": 0.5322992528137709, + "grad_norm": 1.002603647328177, + "learning_rate": 7.857747860842196e-06, + "loss": 0.8186, + "step": 1407 + }, + { + "epoch": 0.5326775749550743, + "grad_norm": 1.0285530234043798, + "learning_rate": 7.857281578771589e-06, + "loss": 0.8156, + "step": 1408 + }, + { + "epoch": 0.5330558970963776, + "grad_norm": 1.02768116084931, + "learning_rate": 7.856814547628485e-06, + "loss": 0.8165, + "step": 1409 + }, + { + "epoch": 0.5334342192376809, + "grad_norm": 1.1031829681313992, + "learning_rate": 7.85634676750358e-06, + "loss": 0.8579, + "step": 1410 + }, + { + "epoch": 0.5338125413789842, + "grad_norm": 1.027426941839886, + "learning_rate": 7.855878238487714e-06, + "loss": 0.7945, + "step": 1411 + }, + { + "epoch": 0.5341908635202876, + "grad_norm": 1.0561714395136612, + "learning_rate": 7.855408960671875e-06, + "loss": 0.7641, + "step": 1412 + }, + { + "epoch": 0.5345691856615908, + "grad_norm": 1.090238437190781, + "learning_rate": 7.854938934147195e-06, + "loss": 0.8063, + "step": 1413 + }, + { + "epoch": 0.5349475078028941, + "grad_norm": 1.2074317498906901, + "learning_rate": 7.854468159004952e-06, + "loss": 0.7921, + "step": 1414 + }, + { + "epoch": 0.5353258299441975, + "grad_norm": 1.0749934432108652, + "learning_rate": 7.85399663533657e-06, + "loss": 0.8165, + "step": 1415 + }, + { + "epoch": 0.5357041520855008, + "grad_norm": 1.0472554586470812, + "learning_rate": 7.853524363233614e-06, + "loss": 0.8232, + "step": 1416 + }, + { + "epoch": 0.5360824742268041, + "grad_norm": 1.0321608082815132, + "learning_rate": 7.853051342787802e-06, + "loss": 0.8207, + "step": 1417 + }, + { + "epoch": 0.5364607963681074, + "grad_norm": 1.010186032847584, + "learning_rate": 7.852577574090992e-06, + "loss": 0.7875, + "step": 1418 + }, + { + "epoch": 0.5368391185094108, + "grad_norm": 1.0585550633979846, + "learning_rate": 7.852103057235187e-06, + "loss": 0.7872, + "step": 1419 + }, + { + "epoch": 0.5372174406507141, + "grad_norm": 1.0424950696245099, + "learning_rate": 7.851627792312539e-06, + "loss": 0.7871, + "step": 1420 + }, + { + "epoch": 0.5375957627920174, + "grad_norm": 1.0123853847303819, + "learning_rate": 7.85115177941534e-06, + "loss": 0.7915, + "step": 1421 + }, + { + "epoch": 0.5379740849333208, + "grad_norm": 1.0357173714573609, + "learning_rate": 7.850675018636034e-06, + "loss": 0.7829, + "step": 1422 + }, + { + "epoch": 0.5383524070746241, + "grad_norm": 1.4395615442604752, + "learning_rate": 7.850197510067203e-06, + "loss": 0.8255, + "step": 1423 + }, + { + "epoch": 0.5387307292159274, + "grad_norm": 1.0121918462650672, + "learning_rate": 7.849719253801578e-06, + "loss": 0.8553, + "step": 1424 + }, + { + "epoch": 0.5391090513572306, + "grad_norm": 0.9837030660961567, + "learning_rate": 7.849240249932039e-06, + "loss": 0.7586, + "step": 1425 + }, + { + "epoch": 0.539487373498534, + "grad_norm": 1.018520798880126, + "learning_rate": 7.848760498551603e-06, + "loss": 0.8266, + "step": 1426 + }, + { + "epoch": 0.5398656956398373, + "grad_norm": 1.0215594842474691, + "learning_rate": 7.848279999753438e-06, + "loss": 0.8115, + "step": 1427 + }, + { + "epoch": 0.5402440177811406, + "grad_norm": 1.0166660418304827, + "learning_rate": 7.847798753630854e-06, + "loss": 0.7822, + "step": 1428 + }, + { + "epoch": 0.5406223399224439, + "grad_norm": 1.0027140748494623, + "learning_rate": 7.84731676027731e-06, + "loss": 0.8033, + "step": 1429 + }, + { + "epoch": 0.5410006620637473, + "grad_norm": 1.0627188785846766, + "learning_rate": 7.846834019786404e-06, + "loss": 0.8265, + "step": 1430 + }, + { + "epoch": 0.5413789842050506, + "grad_norm": 1.0264202021796238, + "learning_rate": 7.846350532251887e-06, + "loss": 0.8109, + "step": 1431 + }, + { + "epoch": 0.5417573063463539, + "grad_norm": 1.0850130197305035, + "learning_rate": 7.845866297767647e-06, + "loss": 0.8166, + "step": 1432 + }, + { + "epoch": 0.5421356284876573, + "grad_norm": 1.0443803197744415, + "learning_rate": 7.845381316427724e-06, + "loss": 0.8134, + "step": 1433 + }, + { + "epoch": 0.5425139506289606, + "grad_norm": 1.0216121613789444, + "learning_rate": 7.844895588326298e-06, + "loss": 0.8248, + "step": 1434 + }, + { + "epoch": 0.5428922727702639, + "grad_norm": 1.0528680390786613, + "learning_rate": 7.844409113557698e-06, + "loss": 0.8306, + "step": 1435 + }, + { + "epoch": 0.5432705949115672, + "grad_norm": 1.056376944389717, + "learning_rate": 7.843921892216392e-06, + "loss": 0.7733, + "step": 1436 + }, + { + "epoch": 0.5436489170528706, + "grad_norm": 1.0054617166141346, + "learning_rate": 7.843433924397002e-06, + "loss": 0.7937, + "step": 1437 + }, + { + "epoch": 0.5440272391941738, + "grad_norm": 1.0047703505362153, + "learning_rate": 7.842945210194286e-06, + "loss": 0.7923, + "step": 1438 + }, + { + "epoch": 0.5444055613354771, + "grad_norm": 1.0096110719940172, + "learning_rate": 7.842455749703151e-06, + "loss": 0.7994, + "step": 1439 + }, + { + "epoch": 0.5447838834767805, + "grad_norm": 1.0605981769829262, + "learning_rate": 7.841965543018651e-06, + "loss": 0.8085, + "step": 1440 + }, + { + "epoch": 0.5451622056180838, + "grad_norm": 1.0471718815415907, + "learning_rate": 7.841474590235981e-06, + "loss": 0.8463, + "step": 1441 + }, + { + "epoch": 0.5455405277593871, + "grad_norm": 1.0505867574083267, + "learning_rate": 7.840982891450483e-06, + "loss": 0.8242, + "step": 1442 + }, + { + "epoch": 0.5459188499006904, + "grad_norm": 1.0445952963424892, + "learning_rate": 7.840490446757645e-06, + "loss": 0.7749, + "step": 1443 + }, + { + "epoch": 0.5462971720419938, + "grad_norm": 1.0068778649332644, + "learning_rate": 7.839997256253096e-06, + "loss": 0.8116, + "step": 1444 + }, + { + "epoch": 0.5466754941832971, + "grad_norm": 1.00961692913919, + "learning_rate": 7.839503320032612e-06, + "loss": 0.7901, + "step": 1445 + }, + { + "epoch": 0.5470538163246004, + "grad_norm": 0.9780075250092127, + "learning_rate": 7.839008638192115e-06, + "loss": 0.7885, + "step": 1446 + }, + { + "epoch": 0.5474321384659037, + "grad_norm": 1.100812581357096, + "learning_rate": 7.838513210827671e-06, + "loss": 0.8001, + "step": 1447 + }, + { + "epoch": 0.5478104606072071, + "grad_norm": 1.0494389505966184, + "learning_rate": 7.83801703803549e-06, + "loss": 0.7977, + "step": 1448 + }, + { + "epoch": 0.5481887827485104, + "grad_norm": 1.034386181938751, + "learning_rate": 7.837520119911927e-06, + "loss": 0.8244, + "step": 1449 + }, + { + "epoch": 0.5485671048898136, + "grad_norm": 1.0112131883045796, + "learning_rate": 7.837022456553482e-06, + "loss": 0.7537, + "step": 1450 + }, + { + "epoch": 0.548945427031117, + "grad_norm": 1.0542214842469684, + "learning_rate": 7.836524048056801e-06, + "loss": 0.8436, + "step": 1451 + }, + { + "epoch": 0.5493237491724203, + "grad_norm": 1.0139124551358574, + "learning_rate": 7.836024894518673e-06, + "loss": 0.7765, + "step": 1452 + }, + { + "epoch": 0.5497020713137236, + "grad_norm": 1.0370438053735662, + "learning_rate": 7.835524996036031e-06, + "loss": 0.7957, + "step": 1453 + }, + { + "epoch": 0.5500803934550269, + "grad_norm": 1.0403261101993466, + "learning_rate": 7.835024352705953e-06, + "loss": 0.8082, + "step": 1454 + }, + { + "epoch": 0.5504587155963303, + "grad_norm": 1.0223772000926137, + "learning_rate": 7.834522964625665e-06, + "loss": 0.8091, + "step": 1455 + }, + { + "epoch": 0.5508370377376336, + "grad_norm": 0.9867288417868126, + "learning_rate": 7.834020831892534e-06, + "loss": 0.7971, + "step": 1456 + }, + { + "epoch": 0.5512153598789369, + "grad_norm": 1.038419907192562, + "learning_rate": 7.833517954604074e-06, + "loss": 0.7774, + "step": 1457 + }, + { + "epoch": 0.5515936820202403, + "grad_norm": 1.0143771814537008, + "learning_rate": 7.833014332857939e-06, + "loss": 0.7763, + "step": 1458 + }, + { + "epoch": 0.5519720041615436, + "grad_norm": 1.0001756819325087, + "learning_rate": 7.832509966751933e-06, + "loss": 0.7889, + "step": 1459 + }, + { + "epoch": 0.5523503263028469, + "grad_norm": 1.036257856076326, + "learning_rate": 7.832004856384001e-06, + "loss": 0.7901, + "step": 1460 + }, + { + "epoch": 0.5527286484441502, + "grad_norm": 1.0355156315068814, + "learning_rate": 7.831499001852236e-06, + "loss": 0.7742, + "step": 1461 + }, + { + "epoch": 0.5531069705854536, + "grad_norm": 1.1407334044483102, + "learning_rate": 7.830992403254873e-06, + "loss": 0.8265, + "step": 1462 + }, + { + "epoch": 0.5534852927267568, + "grad_norm": 1.0063557289156941, + "learning_rate": 7.83048506069029e-06, + "loss": 0.7994, + "step": 1463 + }, + { + "epoch": 0.5534852927267568, + "eval_loss": 0.8094308972358704, + "eval_runtime": 26.9598, + "eval_samples_per_second": 32.827, + "eval_steps_per_second": 1.039, + "step": 1463 + }, + { + "epoch": 0.5534852927267568, + "eval_bench_accuracy_arc_challenge": 0.25, + "eval_bench_accuracy_hellaswag": 0.215, + "eval_bench_accuracy_mmlu": 0.2608695652173913, + "eval_bench_average_accuracy": 0.24195652173913043, + "eval_bench_loss": 6.063661274157073, + "eval_bench_total_accuracy": 0.23736263736263735, + "step": 1463 + }, + { + "epoch": 0.5538636148680601, + "grad_norm": 1.0744841523132298, + "learning_rate": 7.829976974257012e-06, + "loss": 0.8504, + "step": 1464 + }, + { + "epoch": 0.5542419370093635, + "grad_norm": 1.0186917057516884, + "learning_rate": 7.829468144053712e-06, + "loss": 0.8052, + "step": 1465 + }, + { + "epoch": 0.5546202591506668, + "grad_norm": 1.0107687681368964, + "learning_rate": 7.828958570179196e-06, + "loss": 0.8094, + "step": 1466 + }, + { + "epoch": 0.5549985812919701, + "grad_norm": 1.0349853318053726, + "learning_rate": 7.828448252732428e-06, + "loss": 0.8303, + "step": 1467 + }, + { + "epoch": 0.5553769034332734, + "grad_norm": 1.0450694598466956, + "learning_rate": 7.827937191812508e-06, + "loss": 0.7924, + "step": 1468 + }, + { + "epoch": 0.5557552255745768, + "grad_norm": 1.0278598268440422, + "learning_rate": 7.82742538751868e-06, + "loss": 0.7701, + "step": 1469 + }, + { + "epoch": 0.5561335477158801, + "grad_norm": 1.0315097348678433, + "learning_rate": 7.826912839950338e-06, + "loss": 0.7643, + "step": 1470 + }, + { + "epoch": 0.5565118698571834, + "grad_norm": 1.0630245419936848, + "learning_rate": 7.826399549207016e-06, + "loss": 0.8334, + "step": 1471 + }, + { + "epoch": 0.5568901919984867, + "grad_norm": 1.057495631028003, + "learning_rate": 7.825885515388394e-06, + "loss": 0.8098, + "step": 1472 + }, + { + "epoch": 0.5572685141397901, + "grad_norm": 1.0485936898987425, + "learning_rate": 7.825370738594296e-06, + "loss": 0.8524, + "step": 1473 + }, + { + "epoch": 0.5576468362810933, + "grad_norm": 1.089800751911175, + "learning_rate": 7.82485521892469e-06, + "loss": 0.7807, + "step": 1474 + }, + { + "epoch": 0.5580251584223966, + "grad_norm": 1.008238694676228, + "learning_rate": 7.824338956479687e-06, + "loss": 0.7641, + "step": 1475 + }, + { + "epoch": 0.5584034805637, + "grad_norm": 0.9866356509513795, + "learning_rate": 7.823821951359546e-06, + "loss": 0.8072, + "step": 1476 + }, + { + "epoch": 0.5587818027050033, + "grad_norm": 1.0159932518028019, + "learning_rate": 7.823304203664665e-06, + "loss": 0.7563, + "step": 1477 + }, + { + "epoch": 0.5591601248463066, + "grad_norm": 1.0691391299613169, + "learning_rate": 7.82278571349559e-06, + "loss": 0.7666, + "step": 1478 + }, + { + "epoch": 0.5595384469876099, + "grad_norm": 1.069708560088697, + "learning_rate": 7.822266480953014e-06, + "loss": 0.8094, + "step": 1479 + }, + { + "epoch": 0.5599167691289133, + "grad_norm": 1.0399404229309808, + "learning_rate": 7.821746506137766e-06, + "loss": 0.8041, + "step": 1480 + }, + { + "epoch": 0.5602950912702166, + "grad_norm": 1.0528966086217326, + "learning_rate": 7.821225789150823e-06, + "loss": 0.8186, + "step": 1481 + }, + { + "epoch": 0.5606734134115199, + "grad_norm": 1.078154168587184, + "learning_rate": 7.820704330093309e-06, + "loss": 0.7697, + "step": 1482 + }, + { + "epoch": 0.5610517355528233, + "grad_norm": 0.9974199242655317, + "learning_rate": 7.82018212906649e-06, + "loss": 0.7627, + "step": 1483 + }, + { + "epoch": 0.5614300576941266, + "grad_norm": 1.0441157570327169, + "learning_rate": 7.819659186171774e-06, + "loss": 0.7637, + "step": 1484 + }, + { + "epoch": 0.5618083798354299, + "grad_norm": 1.0350192453023053, + "learning_rate": 7.819135501510717e-06, + "loss": 0.7863, + "step": 1485 + }, + { + "epoch": 0.5621867019767331, + "grad_norm": 1.0314197771080482, + "learning_rate": 7.818611075185016e-06, + "loss": 0.7761, + "step": 1486 + }, + { + "epoch": 0.5625650241180365, + "grad_norm": 1.1142918188982494, + "learning_rate": 7.818085907296514e-06, + "loss": 0.8451, + "step": 1487 + }, + { + "epoch": 0.5629433462593398, + "grad_norm": 1.0635918190610065, + "learning_rate": 7.817559997947194e-06, + "loss": 0.7987, + "step": 1488 + }, + { + "epoch": 0.5633216684006431, + "grad_norm": 1.0137296000615337, + "learning_rate": 7.817033347239188e-06, + "loss": 0.7849, + "step": 1489 + }, + { + "epoch": 0.5636999905419464, + "grad_norm": 1.0465836630867722, + "learning_rate": 7.816505955274772e-06, + "loss": 0.7609, + "step": 1490 + }, + { + "epoch": 0.5640783126832498, + "grad_norm": 1.0227869394316658, + "learning_rate": 7.81597782215636e-06, + "loss": 0.7658, + "step": 1491 + }, + { + "epoch": 0.5644566348245531, + "grad_norm": 1.025273340871076, + "learning_rate": 7.815448947986518e-06, + "loss": 0.7943, + "step": 1492 + }, + { + "epoch": 0.5648349569658564, + "grad_norm": 1.0788965118297305, + "learning_rate": 7.814919332867948e-06, + "loss": 0.7825, + "step": 1493 + }, + { + "epoch": 0.5652132791071598, + "grad_norm": 1.0290788502294095, + "learning_rate": 7.814388976903501e-06, + "loss": 0.7686, + "step": 1494 + }, + { + "epoch": 0.5655916012484631, + "grad_norm": 1.0043872677988737, + "learning_rate": 7.813857880196172e-06, + "loss": 0.765, + "step": 1495 + }, + { + "epoch": 0.5659699233897664, + "grad_norm": 1.0416556353562665, + "learning_rate": 7.813326042849096e-06, + "loss": 0.7905, + "step": 1496 + }, + { + "epoch": 0.5663482455310697, + "grad_norm": 1.0403767458597168, + "learning_rate": 7.812793464965557e-06, + "loss": 0.8392, + "step": 1497 + }, + { + "epoch": 0.5667265676723731, + "grad_norm": 1.0804135578705913, + "learning_rate": 7.812260146648978e-06, + "loss": 0.8042, + "step": 1498 + }, + { + "epoch": 0.5671048898136763, + "grad_norm": 1.0525290992619953, + "learning_rate": 7.811726088002928e-06, + "loss": 0.8125, + "step": 1499 + }, + { + "epoch": 0.5674832119549796, + "grad_norm": 1.0443809449733452, + "learning_rate": 7.81119128913112e-06, + "loss": 0.8449, + "step": 1500 + }, + { + "epoch": 0.567861534096283, + "grad_norm": 1.0484442830821317, + "learning_rate": 7.810655750137408e-06, + "loss": 0.791, + "step": 1501 + }, + { + "epoch": 0.5682398562375863, + "grad_norm": 1.0322889324418691, + "learning_rate": 7.810119471125797e-06, + "loss": 0.7638, + "step": 1502 + }, + { + "epoch": 0.5686181783788896, + "grad_norm": 1.0251619422017846, + "learning_rate": 7.809582452200428e-06, + "loss": 0.7971, + "step": 1503 + }, + { + "epoch": 0.5689965005201929, + "grad_norm": 1.0150926516902954, + "learning_rate": 7.809044693465587e-06, + "loss": 0.7734, + "step": 1504 + }, + { + "epoch": 0.5693748226614963, + "grad_norm": 1.0663474541629985, + "learning_rate": 7.808506195025707e-06, + "loss": 0.8411, + "step": 1505 + }, + { + "epoch": 0.5697531448027996, + "grad_norm": 1.0708265848333849, + "learning_rate": 7.807966956985363e-06, + "loss": 0.8428, + "step": 1506 + }, + { + "epoch": 0.5701314669441029, + "grad_norm": 1.0294311898641297, + "learning_rate": 7.807426979449273e-06, + "loss": 0.8016, + "step": 1507 + }, + { + "epoch": 0.5705097890854062, + "grad_norm": 1.072155935601359, + "learning_rate": 7.806886262522298e-06, + "loss": 0.7896, + "step": 1508 + }, + { + "epoch": 0.5708881112267096, + "grad_norm": 1.0602457428763656, + "learning_rate": 7.806344806309445e-06, + "loss": 0.8306, + "step": 1509 + }, + { + "epoch": 0.5712664333680129, + "grad_norm": 1.0410264668234372, + "learning_rate": 7.805802610915862e-06, + "loss": 0.7708, + "step": 1510 + }, + { + "epoch": 0.5716447555093161, + "grad_norm": 1.0323609766839155, + "learning_rate": 7.805259676446843e-06, + "loss": 0.7731, + "step": 1511 + }, + { + "epoch": 0.5720230776506195, + "grad_norm": 1.0629777585594808, + "learning_rate": 7.804716003007825e-06, + "loss": 0.8667, + "step": 1512 + }, + { + "epoch": 0.5724013997919228, + "grad_norm": 0.9991092397744588, + "learning_rate": 7.804171590704384e-06, + "loss": 0.8158, + "step": 1513 + }, + { + "epoch": 0.5727797219332261, + "grad_norm": 1.0691406196971251, + "learning_rate": 7.803626439642245e-06, + "loss": 0.8439, + "step": 1514 + }, + { + "epoch": 0.5731580440745294, + "grad_norm": 1.003105717691004, + "learning_rate": 7.803080549927276e-06, + "loss": 0.8294, + "step": 1515 + }, + { + "epoch": 0.5735363662158328, + "grad_norm": 1.03908547211568, + "learning_rate": 7.802533921665487e-06, + "loss": 0.7924, + "step": 1516 + }, + { + "epoch": 0.5739146883571361, + "grad_norm": 1.0879350896154778, + "learning_rate": 7.801986554963032e-06, + "loss": 0.8214, + "step": 1517 + }, + { + "epoch": 0.5742930104984394, + "grad_norm": 1.0215923317383557, + "learning_rate": 7.801438449926204e-06, + "loss": 0.7672, + "step": 1518 + }, + { + "epoch": 0.5746713326397428, + "grad_norm": 1.0667625852082359, + "learning_rate": 7.800889606661448e-06, + "loss": 0.779, + "step": 1519 + }, + { + "epoch": 0.5750496547810461, + "grad_norm": 1.0265205651578218, + "learning_rate": 7.800340025275346e-06, + "loss": 0.8048, + "step": 1520 + }, + { + "epoch": 0.5754279769223494, + "grad_norm": 1.07228233508983, + "learning_rate": 7.799789705874626e-06, + "loss": 0.7798, + "step": 1521 + }, + { + "epoch": 0.5758062990636527, + "grad_norm": 1.0864037890509946, + "learning_rate": 7.799238648566155e-06, + "loss": 0.8061, + "step": 1522 + }, + { + "epoch": 0.5761846212049561, + "grad_norm": 1.024552729289987, + "learning_rate": 7.79868685345695e-06, + "loss": 0.7923, + "step": 1523 + }, + { + "epoch": 0.5765629433462593, + "grad_norm": 1.050893206442173, + "learning_rate": 7.798134320654169e-06, + "loss": 0.7922, + "step": 1524 + }, + { + "epoch": 0.5769412654875626, + "grad_norm": 1.0361508996059923, + "learning_rate": 7.797581050265108e-06, + "loss": 0.7934, + "step": 1525 + }, + { + "epoch": 0.5773195876288659, + "grad_norm": 1.0710969406799804, + "learning_rate": 7.797027042397215e-06, + "loss": 0.8126, + "step": 1526 + }, + { + "epoch": 0.5776979097701693, + "grad_norm": 1.0658020905692465, + "learning_rate": 7.796472297158071e-06, + "loss": 0.825, + "step": 1527 + }, + { + "epoch": 0.5780762319114726, + "grad_norm": 1.0530236797299208, + "learning_rate": 7.79591681465541e-06, + "loss": 0.8297, + "step": 1528 + }, + { + "epoch": 0.5784545540527759, + "grad_norm": 1.0375398854054054, + "learning_rate": 7.795360594997107e-06, + "loss": 0.8184, + "step": 1529 + }, + { + "epoch": 0.5788328761940793, + "grad_norm": 1.0223176641231346, + "learning_rate": 7.794803638291175e-06, + "loss": 0.8081, + "step": 1530 + }, + { + "epoch": 0.5792111983353826, + "grad_norm": 1.0392507145784662, + "learning_rate": 7.794245944645772e-06, + "loss": 0.8473, + "step": 1531 + }, + { + "epoch": 0.5795895204766859, + "grad_norm": 1.022490501012432, + "learning_rate": 7.793687514169201e-06, + "loss": 0.7883, + "step": 1532 + }, + { + "epoch": 0.5799678426179892, + "grad_norm": 1.0564202458689138, + "learning_rate": 7.793128346969911e-06, + "loss": 0.7797, + "step": 1533 + }, + { + "epoch": 0.5803461647592926, + "grad_norm": 1.0741330485557585, + "learning_rate": 7.792568443156489e-06, + "loss": 0.808, + "step": 1534 + }, + { + "epoch": 0.5807244869005959, + "grad_norm": 0.9936986392860392, + "learning_rate": 7.792007802837665e-06, + "loss": 0.7748, + "step": 1535 + }, + { + "epoch": 0.5811028090418991, + "grad_norm": 1.04388957808874, + "learning_rate": 7.791446426122313e-06, + "loss": 0.8282, + "step": 1536 + }, + { + "epoch": 0.5814811311832025, + "grad_norm": 1.0718346958784504, + "learning_rate": 7.790884313119454e-06, + "loss": 0.7922, + "step": 1537 + }, + { + "epoch": 0.5818594533245058, + "grad_norm": 1.0477864953037763, + "learning_rate": 7.790321463938246e-06, + "loss": 0.8141, + "step": 1538 + }, + { + "epoch": 0.5822377754658091, + "grad_norm": 1.026774949013717, + "learning_rate": 7.789757878687995e-06, + "loss": 0.7598, + "step": 1539 + }, + { + "epoch": 0.5826160976071124, + "grad_norm": 1.015538072369435, + "learning_rate": 7.789193557478143e-06, + "loss": 0.7877, + "step": 1540 + }, + { + "epoch": 0.5829944197484158, + "grad_norm": 1.0348274415641654, + "learning_rate": 7.788628500418287e-06, + "loss": 0.8258, + "step": 1541 + }, + { + "epoch": 0.5833727418897191, + "grad_norm": 1.02268572106111, + "learning_rate": 7.788062707618151e-06, + "loss": 0.8323, + "step": 1542 + }, + { + "epoch": 0.5837510640310224, + "grad_norm": 1.0046192564851208, + "learning_rate": 7.787496179187618e-06, + "loss": 0.7522, + "step": 1543 + }, + { + "epoch": 0.5841293861723257, + "grad_norm": 1.0526322563558683, + "learning_rate": 7.7869289152367e-06, + "loss": 0.8168, + "step": 1544 + }, + { + "epoch": 0.5845077083136291, + "grad_norm": 0.9819648563646498, + "learning_rate": 7.78636091587556e-06, + "loss": 0.7441, + "step": 1545 + }, + { + "epoch": 0.5848860304549324, + "grad_norm": 1.0131957579824842, + "learning_rate": 7.785792181214504e-06, + "loss": 0.7716, + "step": 1546 + }, + { + "epoch": 0.5852643525962357, + "grad_norm": 1.0442706083972597, + "learning_rate": 7.785222711363975e-06, + "loss": 0.783, + "step": 1547 + }, + { + "epoch": 0.5856426747375391, + "grad_norm": 1.024417321524946, + "learning_rate": 7.784652506434564e-06, + "loss": 0.808, + "step": 1548 + }, + { + "epoch": 0.5860209968788423, + "grad_norm": 1.0597851794054838, + "learning_rate": 7.784081566537004e-06, + "loss": 0.8209, + "step": 1549 + }, + { + "epoch": 0.5863993190201456, + "grad_norm": 1.0122874466478462, + "learning_rate": 7.783509891782168e-06, + "loss": 0.7717, + "step": 1550 + }, + { + "epoch": 0.5867776411614489, + "grad_norm": 1.0075483569470989, + "learning_rate": 7.782937482281076e-06, + "loss": 0.7653, + "step": 1551 + }, + { + "epoch": 0.5871559633027523, + "grad_norm": 1.021446573700645, + "learning_rate": 7.782364338144885e-06, + "loss": 0.7696, + "step": 1552 + }, + { + "epoch": 0.5875342854440556, + "grad_norm": 1.0432444836660548, + "learning_rate": 7.781790459484901e-06, + "loss": 0.7933, + "step": 1553 + }, + { + "epoch": 0.5879126075853589, + "grad_norm": 1.0051174216679133, + "learning_rate": 7.781215846412565e-06, + "loss": 0.7867, + "step": 1554 + }, + { + "epoch": 0.5882909297266623, + "grad_norm": 1.0867512164890576, + "learning_rate": 7.78064049903947e-06, + "loss": 0.7725, + "step": 1555 + }, + { + "epoch": 0.5886692518679656, + "grad_norm": 1.04980942321374, + "learning_rate": 7.780064417477346e-06, + "loss": 0.8114, + "step": 1556 + }, + { + "epoch": 0.5890475740092689, + "grad_norm": 1.0617568995349125, + "learning_rate": 7.779487601838065e-06, + "loss": 0.7859, + "step": 1557 + }, + { + "epoch": 0.5894258961505722, + "grad_norm": 1.0628832051157708, + "learning_rate": 7.778910052233642e-06, + "loss": 0.8021, + "step": 1558 + }, + { + "epoch": 0.5898042182918756, + "grad_norm": 1.0898131031337233, + "learning_rate": 7.778331768776237e-06, + "loss": 0.802, + "step": 1559 + }, + { + "epoch": 0.5901825404331789, + "grad_norm": 1.0649413521341573, + "learning_rate": 7.77775275157815e-06, + "loss": 0.8217, + "step": 1560 + }, + { + "epoch": 0.5905608625744821, + "grad_norm": 1.0368511400497493, + "learning_rate": 7.777173000751825e-06, + "loss": 0.7819, + "step": 1561 + }, + { + "epoch": 0.5909391847157854, + "grad_norm": 1.020241580639323, + "learning_rate": 7.776592516409848e-06, + "loss": 0.8435, + "step": 1562 + }, + { + "epoch": 0.5913175068570888, + "grad_norm": 1.039218236167864, + "learning_rate": 7.776011298664945e-06, + "loss": 0.822, + "step": 1563 + }, + { + "epoch": 0.5916958289983921, + "grad_norm": 1.0277738056724017, + "learning_rate": 7.775429347629992e-06, + "loss": 0.7755, + "step": 1564 + }, + { + "epoch": 0.5920741511396954, + "grad_norm": 0.9767055405759969, + "learning_rate": 7.774846663417996e-06, + "loss": 0.8259, + "step": 1565 + }, + { + "epoch": 0.5924524732809988, + "grad_norm": 1.0409555633420142, + "learning_rate": 7.774263246142116e-06, + "loss": 0.7829, + "step": 1566 + }, + { + "epoch": 0.5928307954223021, + "grad_norm": 1.0275312312209073, + "learning_rate": 7.77367909591565e-06, + "loss": 0.7724, + "step": 1567 + }, + { + "epoch": 0.5932091175636054, + "grad_norm": 1.0128232786560865, + "learning_rate": 7.773094212852036e-06, + "loss": 0.778, + "step": 1568 + }, + { + "epoch": 0.5935874397049087, + "grad_norm": 1.010220293379828, + "learning_rate": 7.77250859706486e-06, + "loss": 0.8122, + "step": 1569 + }, + { + "epoch": 0.5939657618462121, + "grad_norm": 1.0377569519031766, + "learning_rate": 7.771922248667843e-06, + "loss": 0.7944, + "step": 1570 + }, + { + "epoch": 0.5943440839875154, + "grad_norm": 1.0056143743542545, + "learning_rate": 7.771335167774855e-06, + "loss": 0.8184, + "step": 1571 + }, + { + "epoch": 0.5947224061288187, + "grad_norm": 1.0823167997700618, + "learning_rate": 7.770747354499902e-06, + "loss": 0.793, + "step": 1572 + }, + { + "epoch": 0.5951007282701221, + "grad_norm": 1.005554310069684, + "learning_rate": 7.770158808957142e-06, + "loss": 0.8294, + "step": 1573 + }, + { + "epoch": 0.5954790504114253, + "grad_norm": 1.016774447299906, + "learning_rate": 7.769569531260861e-06, + "loss": 0.7916, + "step": 1574 + }, + { + "epoch": 0.5958573725527286, + "grad_norm": 0.9815704963237092, + "learning_rate": 7.7689795215255e-06, + "loss": 0.7873, + "step": 1575 + }, + { + "epoch": 0.5962356946940319, + "grad_norm": 1.054358096080715, + "learning_rate": 7.768388779865636e-06, + "loss": 0.8164, + "step": 1576 + }, + { + "epoch": 0.5966140168353353, + "grad_norm": 0.9774109882411877, + "learning_rate": 7.767797306395988e-06, + "loss": 0.791, + "step": 1577 + }, + { + "epoch": 0.5969923389766386, + "grad_norm": 1.0358457305091455, + "learning_rate": 7.76720510123142e-06, + "loss": 0.7707, + "step": 1578 + }, + { + "epoch": 0.5973706611179419, + "grad_norm": 1.0624591531096403, + "learning_rate": 7.766612164486936e-06, + "loss": 0.8472, + "step": 1579 + }, + { + "epoch": 0.5977489832592452, + "grad_norm": 0.9928836589328845, + "learning_rate": 7.766018496277682e-06, + "loss": 0.7902, + "step": 1580 + }, + { + "epoch": 0.5981273054005486, + "grad_norm": 1.0280490587815976, + "learning_rate": 7.765424096718946e-06, + "loss": 0.7841, + "step": 1581 + }, + { + "epoch": 0.5985056275418519, + "grad_norm": 0.9873621543820231, + "learning_rate": 7.76482896592616e-06, + "loss": 0.8006, + "step": 1582 + }, + { + "epoch": 0.5988839496831552, + "grad_norm": 1.0709729821860812, + "learning_rate": 7.764233104014897e-06, + "loss": 0.8682, + "step": 1583 + }, + { + "epoch": 0.5992622718244586, + "grad_norm": 0.9867939695157474, + "learning_rate": 7.76363651110087e-06, + "loss": 0.7879, + "step": 1584 + }, + { + "epoch": 0.5996405939657619, + "grad_norm": 1.0795152732921542, + "learning_rate": 7.763039187299937e-06, + "loss": 0.815, + "step": 1585 + }, + { + "epoch": 0.6000189161070651, + "grad_norm": 0.9899000945502743, + "learning_rate": 7.762441132728095e-06, + "loss": 0.7855, + "step": 1586 + }, + { + "epoch": 0.6003972382483684, + "grad_norm": 1.0252908086535142, + "learning_rate": 7.761842347501485e-06, + "loss": 0.8165, + "step": 1587 + }, + { + "epoch": 0.6007755603896718, + "grad_norm": 1.0423466115896767, + "learning_rate": 7.76124283173639e-06, + "loss": 0.8567, + "step": 1588 + }, + { + "epoch": 0.6011538825309751, + "grad_norm": 0.9948472361654808, + "learning_rate": 7.760642585549233e-06, + "loss": 0.7931, + "step": 1589 + }, + { + "epoch": 0.6015322046722784, + "grad_norm": 0.9998595808495474, + "learning_rate": 7.760041609056582e-06, + "loss": 0.7922, + "step": 1590 + }, + { + "epoch": 0.6019105268135818, + "grad_norm": 1.0113044627393564, + "learning_rate": 7.759439902375141e-06, + "loss": 0.7983, + "step": 1591 + }, + { + "epoch": 0.6022888489548851, + "grad_norm": 1.052771258939431, + "learning_rate": 7.758837465621764e-06, + "loss": 0.8088, + "step": 1592 + }, + { + "epoch": 0.6026671710961884, + "grad_norm": 1.0123858085251436, + "learning_rate": 7.758234298913439e-06, + "loss": 0.784, + "step": 1593 + }, + { + "epoch": 0.6030454932374917, + "grad_norm": 1.0337794095975905, + "learning_rate": 7.757630402367303e-06, + "loss": 0.7997, + "step": 1594 + }, + { + "epoch": 0.6034238153787951, + "grad_norm": 0.9846999031423823, + "learning_rate": 7.757025776100625e-06, + "loss": 0.7447, + "step": 1595 + }, + { + "epoch": 0.6038021375200984, + "grad_norm": 1.0462409901802558, + "learning_rate": 7.756420420230828e-06, + "loss": 0.7686, + "step": 1596 + }, + { + "epoch": 0.6038021375200984, + "eval_loss": 0.8007391691207886, + "eval_runtime": 27.0514, + "eval_samples_per_second": 32.715, + "eval_steps_per_second": 1.035, + "step": 1596 + }, + { + "epoch": 0.6038021375200984, + "eval_bench_accuracy_arc_challenge": 0.25, + "eval_bench_accuracy_hellaswag": 0.21, + "eval_bench_accuracy_mmlu": 0.25217391304347825, + "eval_bench_average_accuracy": 0.23739130434782607, + "eval_bench_loss": 6.375945509525767, + "eval_bench_total_accuracy": 0.23296703296703297, + "step": 1596 + }, + { + "epoch": 0.6041804596614017, + "grad_norm": 1.0790625835061922, + "learning_rate": 7.755814334875466e-06, + "loss": 0.8091, + "step": 1597 + }, + { + "epoch": 0.6045587818027051, + "grad_norm": 0.9802043723000299, + "learning_rate": 7.75520752015224e-06, + "loss": 0.7256, + "step": 1598 + }, + { + "epoch": 0.6049371039440083, + "grad_norm": 0.9923431981852016, + "learning_rate": 7.754599976178994e-06, + "loss": 0.8054, + "step": 1599 + }, + { + "epoch": 0.6053154260853116, + "grad_norm": 1.0242822958979938, + "learning_rate": 7.753991703073709e-06, + "loss": 0.7947, + "step": 1600 + }, + { + "epoch": 0.6056937482266149, + "grad_norm": 1.0693420250669043, + "learning_rate": 7.75338270095451e-06, + "loss": 0.7714, + "step": 1601 + }, + { + "epoch": 0.6060720703679183, + "grad_norm": 1.0393417772805222, + "learning_rate": 7.752772969939662e-06, + "loss": 0.7984, + "step": 1602 + }, + { + "epoch": 0.6064503925092216, + "grad_norm": 1.0193556335184584, + "learning_rate": 7.752162510147576e-06, + "loss": 0.7845, + "step": 1603 + }, + { + "epoch": 0.6068287146505249, + "grad_norm": 1.0439223450090194, + "learning_rate": 7.751551321696798e-06, + "loss": 0.7902, + "step": 1604 + }, + { + "epoch": 0.6072070367918282, + "grad_norm": 1.0458764132750307, + "learning_rate": 7.75093940470602e-06, + "loss": 0.8277, + "step": 1605 + }, + { + "epoch": 0.6075853589331316, + "grad_norm": 1.0304823323522874, + "learning_rate": 7.750326759294077e-06, + "loss": 0.7936, + "step": 1606 + }, + { + "epoch": 0.6079636810744349, + "grad_norm": 1.037572458907066, + "learning_rate": 7.749713385579942e-06, + "loss": 0.779, + "step": 1607 + }, + { + "epoch": 0.6083420032157382, + "grad_norm": 1.0233220079303753, + "learning_rate": 7.749099283682727e-06, + "loss": 0.7924, + "step": 1608 + }, + { + "epoch": 0.6087203253570416, + "grad_norm": 1.0490780083116327, + "learning_rate": 7.748484453721694e-06, + "loss": 0.8337, + "step": 1609 + }, + { + "epoch": 0.6090986474983449, + "grad_norm": 1.0173257743419322, + "learning_rate": 7.747868895816236e-06, + "loss": 0.7673, + "step": 1610 + }, + { + "epoch": 0.6094769696396481, + "grad_norm": 1.0573789547993953, + "learning_rate": 7.747252610085895e-06, + "loss": 0.8377, + "step": 1611 + }, + { + "epoch": 0.6098552917809514, + "grad_norm": 1.0257255841383113, + "learning_rate": 7.746635596650352e-06, + "loss": 0.7728, + "step": 1612 + }, + { + "epoch": 0.6102336139222548, + "grad_norm": 1.0160660389387, + "learning_rate": 7.746017855629429e-06, + "loss": 0.8025, + "step": 1613 + }, + { + "epoch": 0.6106119360635581, + "grad_norm": 1.0602513504043805, + "learning_rate": 7.74539938714309e-06, + "loss": 0.7925, + "step": 1614 + }, + { + "epoch": 0.6109902582048614, + "grad_norm": 1.0377020898351703, + "learning_rate": 7.744780191311437e-06, + "loss": 0.804, + "step": 1615 + }, + { + "epoch": 0.6113685803461648, + "grad_norm": 0.9962327806446186, + "learning_rate": 7.744160268254718e-06, + "loss": 0.7463, + "step": 1616 + }, + { + "epoch": 0.6117469024874681, + "grad_norm": 1.03576395621217, + "learning_rate": 7.743539618093323e-06, + "loss": 0.8125, + "step": 1617 + }, + { + "epoch": 0.6121252246287714, + "grad_norm": 1.0791330433766595, + "learning_rate": 7.742918240947774e-06, + "loss": 0.7497, + "step": 1618 + }, + { + "epoch": 0.6125035467700747, + "grad_norm": 1.0186732713870292, + "learning_rate": 7.742296136938745e-06, + "loss": 0.7715, + "step": 1619 + }, + { + "epoch": 0.6128818689113781, + "grad_norm": 1.0549459798818361, + "learning_rate": 7.741673306187047e-06, + "loss": 0.7663, + "step": 1620 + }, + { + "epoch": 0.6132601910526814, + "grad_norm": 0.9830530108058492, + "learning_rate": 7.74104974881363e-06, + "loss": 0.8146, + "step": 1621 + }, + { + "epoch": 0.6136385131939847, + "grad_norm": 1.0384186325465743, + "learning_rate": 7.74042546493959e-06, + "loss": 0.7864, + "step": 1622 + }, + { + "epoch": 0.614016835335288, + "grad_norm": 1.050915873907994, + "learning_rate": 7.739800454686156e-06, + "loss": 0.7966, + "step": 1623 + }, + { + "epoch": 0.6143951574765913, + "grad_norm": 1.0241953725880033, + "learning_rate": 7.739174718174705e-06, + "loss": 0.7659, + "step": 1624 + }, + { + "epoch": 0.6147734796178946, + "grad_norm": 1.0278047735993348, + "learning_rate": 7.738548255526757e-06, + "loss": 0.7753, + "step": 1625 + }, + { + "epoch": 0.6151518017591979, + "grad_norm": 1.0028879958633992, + "learning_rate": 7.737921066863963e-06, + "loss": 0.798, + "step": 1626 + }, + { + "epoch": 0.6155301239005013, + "grad_norm": 1.046709030024919, + "learning_rate": 7.737293152308125e-06, + "loss": 0.8318, + "step": 1627 + }, + { + "epoch": 0.6159084460418046, + "grad_norm": 1.053664353449831, + "learning_rate": 7.736664511981184e-06, + "loss": 0.8518, + "step": 1628 + }, + { + "epoch": 0.6162867681831079, + "grad_norm": 0.9978105688058767, + "learning_rate": 7.736035146005216e-06, + "loss": 0.7807, + "step": 1629 + }, + { + "epoch": 0.6166650903244112, + "grad_norm": 1.0998599207938173, + "learning_rate": 7.735405054502443e-06, + "loss": 0.8517, + "step": 1630 + }, + { + "epoch": 0.6170434124657146, + "grad_norm": 1.0347549984516864, + "learning_rate": 7.734774237595227e-06, + "loss": 0.7861, + "step": 1631 + }, + { + "epoch": 0.6174217346070179, + "grad_norm": 1.0604030894353325, + "learning_rate": 7.734142695406072e-06, + "loss": 0.8444, + "step": 1632 + }, + { + "epoch": 0.6178000567483212, + "grad_norm": 0.9995358654268639, + "learning_rate": 7.73351042805762e-06, + "loss": 0.7982, + "step": 1633 + }, + { + "epoch": 0.6181783788896246, + "grad_norm": 1.012063791302332, + "learning_rate": 7.732877435672656e-06, + "loss": 0.7891, + "step": 1634 + }, + { + "epoch": 0.6185567010309279, + "grad_norm": 1.062079535667684, + "learning_rate": 7.732243718374105e-06, + "loss": 0.7953, + "step": 1635 + }, + { + "epoch": 0.6189350231722311, + "grad_norm": 1.0049506132948145, + "learning_rate": 7.731609276285034e-06, + "loss": 0.8185, + "step": 1636 + }, + { + "epoch": 0.6193133453135344, + "grad_norm": 0.9787699976228371, + "learning_rate": 7.730974109528651e-06, + "loss": 0.8099, + "step": 1637 + }, + { + "epoch": 0.6196916674548378, + "grad_norm": 0.9716390457115083, + "learning_rate": 7.730338218228298e-06, + "loss": 0.7695, + "step": 1638 + }, + { + "epoch": 0.6200699895961411, + "grad_norm": 0.9806455110749785, + "learning_rate": 7.729701602507469e-06, + "loss": 0.7199, + "step": 1639 + }, + { + "epoch": 0.6204483117374444, + "grad_norm": 1.0303904399928674, + "learning_rate": 7.729064262489791e-06, + "loss": 0.8018, + "step": 1640 + }, + { + "epoch": 0.6208266338787477, + "grad_norm": 1.0184745198287024, + "learning_rate": 7.72842619829903e-06, + "loss": 0.8168, + "step": 1641 + }, + { + "epoch": 0.6212049560200511, + "grad_norm": 1.0350761019221557, + "learning_rate": 7.727787410059102e-06, + "loss": 0.8063, + "step": 1642 + }, + { + "epoch": 0.6215832781613544, + "grad_norm": 0.9997598615132083, + "learning_rate": 7.727147897894055e-06, + "loss": 0.7692, + "step": 1643 + }, + { + "epoch": 0.6219616003026577, + "grad_norm": 1.0317018080080016, + "learning_rate": 7.72650766192808e-06, + "loss": 0.7963, + "step": 1644 + }, + { + "epoch": 0.6223399224439611, + "grad_norm": 1.058330305743686, + "learning_rate": 7.725866702285508e-06, + "loss": 0.7778, + "step": 1645 + }, + { + "epoch": 0.6227182445852644, + "grad_norm": 1.050475543436919, + "learning_rate": 7.725225019090813e-06, + "loss": 0.8052, + "step": 1646 + }, + { + "epoch": 0.6230965667265677, + "grad_norm": 1.0381951307937078, + "learning_rate": 7.724582612468609e-06, + "loss": 0.7643, + "step": 1647 + }, + { + "epoch": 0.623474888867871, + "grad_norm": 0.9960696467209328, + "learning_rate": 7.723939482543647e-06, + "loss": 0.781, + "step": 1648 + }, + { + "epoch": 0.6238532110091743, + "grad_norm": 1.0235710160288658, + "learning_rate": 7.723295629440823e-06, + "loss": 0.7818, + "step": 1649 + }, + { + "epoch": 0.6242315331504776, + "grad_norm": 0.9987662526618373, + "learning_rate": 7.722651053285168e-06, + "loss": 0.7532, + "step": 1650 + }, + { + "epoch": 0.6246098552917809, + "grad_norm": 1.038603322649077, + "learning_rate": 7.722005754201863e-06, + "loss": 0.7995, + "step": 1651 + }, + { + "epoch": 0.6249881774330843, + "grad_norm": 1.0372844825153233, + "learning_rate": 7.721359732316216e-06, + "loss": 0.7982, + "step": 1652 + }, + { + "epoch": 0.6253664995743876, + "grad_norm": 1.0075983510701718, + "learning_rate": 7.720712987753687e-06, + "loss": 0.771, + "step": 1653 + }, + { + "epoch": 0.6257448217156909, + "grad_norm": 1.060885095951037, + "learning_rate": 7.72006552063987e-06, + "loss": 0.8095, + "step": 1654 + }, + { + "epoch": 0.6261231438569942, + "grad_norm": 1.024942261074342, + "learning_rate": 7.719417331100501e-06, + "loss": 0.8175, + "step": 1655 + }, + { + "epoch": 0.6265014659982976, + "grad_norm": 1.0259969128854978, + "learning_rate": 7.718768419261458e-06, + "loss": 0.7614, + "step": 1656 + }, + { + "epoch": 0.6268797881396009, + "grad_norm": 1.0032297451874017, + "learning_rate": 7.718118785248759e-06, + "loss": 0.7612, + "step": 1657 + }, + { + "epoch": 0.6272581102809042, + "grad_norm": 1.0210932763381098, + "learning_rate": 7.717468429188556e-06, + "loss": 0.7755, + "step": 1658 + }, + { + "epoch": 0.6276364324222075, + "grad_norm": 1.046603168853803, + "learning_rate": 7.71681735120715e-06, + "loss": 0.7888, + "step": 1659 + }, + { + "epoch": 0.6280147545635109, + "grad_norm": 1.0302944601931032, + "learning_rate": 7.716165551430978e-06, + "loss": 0.8215, + "step": 1660 + }, + { + "epoch": 0.6283930767048141, + "grad_norm": 1.0538426037667707, + "learning_rate": 7.715513029986616e-06, + "loss": 0.8277, + "step": 1661 + }, + { + "epoch": 0.6287713988461174, + "grad_norm": 1.0079131456868133, + "learning_rate": 7.714859787000784e-06, + "loss": 0.7898, + "step": 1662 + }, + { + "epoch": 0.6291497209874208, + "grad_norm": 1.0091132558305784, + "learning_rate": 7.714205822600338e-06, + "loss": 0.7628, + "step": 1663 + }, + { + "epoch": 0.6295280431287241, + "grad_norm": 1.0370707510362853, + "learning_rate": 7.713551136912277e-06, + "loss": 0.7847, + "step": 1664 + }, + { + "epoch": 0.6299063652700274, + "grad_norm": 1.0254976981220805, + "learning_rate": 7.712895730063737e-06, + "loss": 0.8251, + "step": 1665 + }, + { + "epoch": 0.6302846874113307, + "grad_norm": 1.0129086665617333, + "learning_rate": 7.712239602181998e-06, + "loss": 0.813, + "step": 1666 + }, + { + "epoch": 0.6306630095526341, + "grad_norm": 1.0211770501504658, + "learning_rate": 7.711582753394478e-06, + "loss": 0.7909, + "step": 1667 + }, + { + "epoch": 0.6310413316939374, + "grad_norm": 1.2302756712980163, + "learning_rate": 7.710925183828736e-06, + "loss": 0.782, + "step": 1668 + }, + { + "epoch": 0.6314196538352407, + "grad_norm": 1.0606820966683679, + "learning_rate": 7.710266893612468e-06, + "loss": 0.8001, + "step": 1669 + }, + { + "epoch": 0.6317979759765441, + "grad_norm": 1.0257958327969605, + "learning_rate": 7.70960788287351e-06, + "loss": 0.7715, + "step": 1670 + }, + { + "epoch": 0.6321762981178474, + "grad_norm": 1.033181617178253, + "learning_rate": 7.708948151739847e-06, + "loss": 0.7884, + "step": 1671 + }, + { + "epoch": 0.6325546202591507, + "grad_norm": 1.0142271201151716, + "learning_rate": 7.708287700339588e-06, + "loss": 0.7846, + "step": 1672 + }, + { + "epoch": 0.632932942400454, + "grad_norm": 1.0581952369577206, + "learning_rate": 7.707626528800999e-06, + "loss": 0.835, + "step": 1673 + }, + { + "epoch": 0.6333112645417573, + "grad_norm": 1.031831226064096, + "learning_rate": 7.706964637252472e-06, + "loss": 0.7808, + "step": 1674 + }, + { + "epoch": 0.6336895866830606, + "grad_norm": 1.034926042820135, + "learning_rate": 7.706302025822546e-06, + "loss": 0.8133, + "step": 1675 + }, + { + "epoch": 0.6340679088243639, + "grad_norm": 0.9974796232689039, + "learning_rate": 7.705638694639897e-06, + "loss": 0.8022, + "step": 1676 + }, + { + "epoch": 0.6344462309656672, + "grad_norm": 0.9991746871631939, + "learning_rate": 7.704974643833345e-06, + "loss": 0.7768, + "step": 1677 + }, + { + "epoch": 0.6348245531069706, + "grad_norm": 1.0647934668234986, + "learning_rate": 7.704309873531842e-06, + "loss": 0.7784, + "step": 1678 + }, + { + "epoch": 0.6352028752482739, + "grad_norm": 1.0706641503151557, + "learning_rate": 7.70364438386449e-06, + "loss": 0.7549, + "step": 1679 + }, + { + "epoch": 0.6355811973895772, + "grad_norm": 1.5575289700539314, + "learning_rate": 7.70297817496052e-06, + "loss": 0.7869, + "step": 1680 + }, + { + "epoch": 0.6359595195308806, + "grad_norm": 1.0441884975223152, + "learning_rate": 7.702311246949312e-06, + "loss": 0.8212, + "step": 1681 + }, + { + "epoch": 0.6363378416721839, + "grad_norm": 1.0184875000693254, + "learning_rate": 7.701643599960377e-06, + "loss": 0.7783, + "step": 1682 + }, + { + "epoch": 0.6367161638134872, + "grad_norm": 1.056484375092538, + "learning_rate": 7.700975234123374e-06, + "loss": 0.7997, + "step": 1683 + }, + { + "epoch": 0.6370944859547905, + "grad_norm": 1.0158431220473627, + "learning_rate": 7.700306149568096e-06, + "loss": 0.7887, + "step": 1684 + }, + { + "epoch": 0.6374728080960939, + "grad_norm": 1.005886147632736, + "learning_rate": 7.699636346424476e-06, + "loss": 0.8146, + "step": 1685 + }, + { + "epoch": 0.6378511302373971, + "grad_norm": 0.9516674282028371, + "learning_rate": 7.698965824822591e-06, + "loss": 0.7617, + "step": 1686 + }, + { + "epoch": 0.6382294523787004, + "grad_norm": 1.0354398239486777, + "learning_rate": 7.698294584892653e-06, + "loss": 0.7698, + "step": 1687 + }, + { + "epoch": 0.6386077745200038, + "grad_norm": 1.0412153778199809, + "learning_rate": 7.69762262676501e-06, + "loss": 0.7741, + "step": 1688 + }, + { + "epoch": 0.6389860966613071, + "grad_norm": 1.0038063833719368, + "learning_rate": 7.696949950570162e-06, + "loss": 0.7726, + "step": 1689 + }, + { + "epoch": 0.6393644188026104, + "grad_norm": 1.0041297661402129, + "learning_rate": 7.696276556438736e-06, + "loss": 0.8076, + "step": 1690 + }, + { + "epoch": 0.6397427409439137, + "grad_norm": 1.052469874333398, + "learning_rate": 7.695602444501503e-06, + "loss": 0.7906, + "step": 1691 + }, + { + "epoch": 0.6401210630852171, + "grad_norm": 0.9490194460452617, + "learning_rate": 7.694927614889376e-06, + "loss": 0.7188, + "step": 1692 + }, + { + "epoch": 0.6404993852265204, + "grad_norm": 0.974323163548883, + "learning_rate": 7.694252067733404e-06, + "loss": 0.753, + "step": 1693 + }, + { + "epoch": 0.6408777073678237, + "grad_norm": 1.0319007840691403, + "learning_rate": 7.693575803164774e-06, + "loss": 0.7962, + "step": 1694 + }, + { + "epoch": 0.641256029509127, + "grad_norm": 1.0299952133041577, + "learning_rate": 7.692898821314816e-06, + "loss": 0.7723, + "step": 1695 + }, + { + "epoch": 0.6416343516504304, + "grad_norm": 1.0632785008902024, + "learning_rate": 7.692221122315e-06, + "loss": 0.7536, + "step": 1696 + }, + { + "epoch": 0.6420126737917337, + "grad_norm": 1.0478356927175443, + "learning_rate": 7.69154270629693e-06, + "loss": 0.7759, + "step": 1697 + }, + { + "epoch": 0.642390995933037, + "grad_norm": 1.0207221782050084, + "learning_rate": 7.690863573392355e-06, + "loss": 0.8025, + "step": 1698 + }, + { + "epoch": 0.6427693180743403, + "grad_norm": 1.0307450911725362, + "learning_rate": 7.690183723733158e-06, + "loss": 0.8126, + "step": 1699 + }, + { + "epoch": 0.6431476402156436, + "grad_norm": 0.9558201805744811, + "learning_rate": 7.689503157451366e-06, + "loss": 0.7926, + "step": 1700 + }, + { + "epoch": 0.6435259623569469, + "grad_norm": 0.9839314509833194, + "learning_rate": 7.68882187467914e-06, + "loss": 0.7982, + "step": 1701 + }, + { + "epoch": 0.6439042844982502, + "grad_norm": 1.0446036605229558, + "learning_rate": 7.688139875548786e-06, + "loss": 0.7424, + "step": 1702 + }, + { + "epoch": 0.6442826066395536, + "grad_norm": 0.9747599328413645, + "learning_rate": 7.687457160192746e-06, + "loss": 0.7769, + "step": 1703 + }, + { + "epoch": 0.6446609287808569, + "grad_norm": 1.0017104708165576, + "learning_rate": 7.6867737287436e-06, + "loss": 0.7779, + "step": 1704 + }, + { + "epoch": 0.6450392509221602, + "grad_norm": 1.0396981093860427, + "learning_rate": 7.686089581334069e-06, + "loss": 0.7966, + "step": 1705 + }, + { + "epoch": 0.6454175730634636, + "grad_norm": 1.0077578946931687, + "learning_rate": 7.685404718097011e-06, + "loss": 0.7658, + "step": 1706 + }, + { + "epoch": 0.6457958952047669, + "grad_norm": 1.0045936301109948, + "learning_rate": 7.684719139165426e-06, + "loss": 0.8215, + "step": 1707 + }, + { + "epoch": 0.6461742173460702, + "grad_norm": 1.0059220607870412, + "learning_rate": 7.684032844672452e-06, + "loss": 0.784, + "step": 1708 + }, + { + "epoch": 0.6465525394873735, + "grad_norm": 1.002030780249217, + "learning_rate": 7.683345834751362e-06, + "loss": 0.754, + "step": 1709 + }, + { + "epoch": 0.6469308616286769, + "grad_norm": 1.0524082695853973, + "learning_rate": 7.682658109535575e-06, + "loss": 0.8141, + "step": 1710 + }, + { + "epoch": 0.6473091837699801, + "grad_norm": 1.023391717099541, + "learning_rate": 7.681969669158643e-06, + "loss": 0.8029, + "step": 1711 + }, + { + "epoch": 0.6476875059112834, + "grad_norm": 1.0537878870256816, + "learning_rate": 7.68128051375426e-06, + "loss": 0.8026, + "step": 1712 + }, + { + "epoch": 0.6480658280525867, + "grad_norm": 0.9946301646936768, + "learning_rate": 7.680590643456258e-06, + "loss": 0.8154, + "step": 1713 + }, + { + "epoch": 0.6484441501938901, + "grad_norm": 1.0129808485922718, + "learning_rate": 7.679900058398606e-06, + "loss": 0.7482, + "step": 1714 + }, + { + "epoch": 0.6488224723351934, + "grad_norm": 1.1366026781982712, + "learning_rate": 7.679208758715417e-06, + "loss": 0.7844, + "step": 1715 + }, + { + "epoch": 0.6492007944764967, + "grad_norm": 1.0252138838659255, + "learning_rate": 7.678516744540936e-06, + "loss": 0.7827, + "step": 1716 + }, + { + "epoch": 0.6495791166178001, + "grad_norm": 1.0483329033578623, + "learning_rate": 7.67782401600955e-06, + "loss": 0.7995, + "step": 1717 + }, + { + "epoch": 0.6499574387591034, + "grad_norm": 0.9954302178962173, + "learning_rate": 7.677130573255787e-06, + "loss": 0.7528, + "step": 1718 + }, + { + "epoch": 0.6503357609004067, + "grad_norm": 1.0342284002896778, + "learning_rate": 7.67643641641431e-06, + "loss": 0.7967, + "step": 1719 + }, + { + "epoch": 0.65071408304171, + "grad_norm": 1.0744541931554912, + "learning_rate": 7.675741545619926e-06, + "loss": 0.7959, + "step": 1720 + }, + { + "epoch": 0.6510924051830134, + "grad_norm": 0.9960576642926111, + "learning_rate": 7.675045961007571e-06, + "loss": 0.7644, + "step": 1721 + }, + { + "epoch": 0.6514707273243167, + "grad_norm": 1.0388432797415568, + "learning_rate": 7.674349662712328e-06, + "loss": 0.8452, + "step": 1722 + }, + { + "epoch": 0.65184904946562, + "grad_norm": 1.0809172859395315, + "learning_rate": 7.673652650869415e-06, + "loss": 0.8068, + "step": 1723 + }, + { + "epoch": 0.6522273716069233, + "grad_norm": 1.0066539502318497, + "learning_rate": 7.672954925614193e-06, + "loss": 0.7709, + "step": 1724 + }, + { + "epoch": 0.6526056937482266, + "grad_norm": 1.0418268199259764, + "learning_rate": 7.672256487082155e-06, + "loss": 0.7932, + "step": 1725 + }, + { + "epoch": 0.6529840158895299, + "grad_norm": 1.0245053090908052, + "learning_rate": 7.671557335408935e-06, + "loss": 0.798, + "step": 1726 + }, + { + "epoch": 0.6533623380308332, + "grad_norm": 1.0356795152001224, + "learning_rate": 7.670857470730309e-06, + "loss": 0.7573, + "step": 1727 + }, + { + "epoch": 0.6537406601721366, + "grad_norm": 1.0311220411463944, + "learning_rate": 7.670156893182188e-06, + "loss": 0.8159, + "step": 1728 + }, + { + "epoch": 0.6541189823134399, + "grad_norm": 0.9968740214468425, + "learning_rate": 7.66945560290062e-06, + "loss": 0.8174, + "step": 1729 + }, + { + "epoch": 0.6541189823134399, + "eval_loss": 0.7927515506744385, + "eval_runtime": 26.7774, + "eval_samples_per_second": 33.05, + "eval_steps_per_second": 1.046, + "step": 1729 + }, + { + "epoch": 0.6541189823134399, + "eval_bench_accuracy_arc_challenge": 0.0, + "eval_bench_accuracy_hellaswag": 0.21, + "eval_bench_accuracy_mmlu": 0.23478260869565218, + "eval_bench_average_accuracy": 0.1482608695652174, + "eval_bench_loss": 7.814903928522478, + "eval_bench_total_accuracy": 0.15164835164835164, + "step": 1729 + }, + { + "epoch": 0.6544973044547432, + "grad_norm": 1.0536869570872927, + "learning_rate": 7.668753600021795e-06, + "loss": 0.7894, + "step": 1730 + }, + { + "epoch": 0.6548756265960465, + "grad_norm": 1.0802849973303468, + "learning_rate": 7.66805088468204e-06, + "loss": 0.8128, + "step": 1731 + }, + { + "epoch": 0.6552539487373499, + "grad_norm": 1.0195535501035122, + "learning_rate": 7.66734745701782e-06, + "loss": 0.7698, + "step": 1732 + }, + { + "epoch": 0.6556322708786532, + "grad_norm": 0.9866819845303567, + "learning_rate": 7.666643317165737e-06, + "loss": 0.7632, + "step": 1733 + }, + { + "epoch": 0.6560105930199565, + "grad_norm": 1.0362620307566515, + "learning_rate": 7.665938465262536e-06, + "loss": 0.8242, + "step": 1734 + }, + { + "epoch": 0.6563889151612599, + "grad_norm": 1.005122320879091, + "learning_rate": 7.665232901445093e-06, + "loss": 0.8128, + "step": 1735 + }, + { + "epoch": 0.6567672373025631, + "grad_norm": 0.9968147052835493, + "learning_rate": 7.66452662585043e-06, + "loss": 0.7765, + "step": 1736 + }, + { + "epoch": 0.6571455594438664, + "grad_norm": 1.0160098359583503, + "learning_rate": 7.663819638615705e-06, + "loss": 0.769, + "step": 1737 + }, + { + "epoch": 0.6575238815851697, + "grad_norm": 0.9957799905329473, + "learning_rate": 7.663111939878207e-06, + "loss": 0.75, + "step": 1738 + }, + { + "epoch": 0.6579022037264731, + "grad_norm": 0.9817964252654222, + "learning_rate": 7.662403529775372e-06, + "loss": 0.7814, + "step": 1739 + }, + { + "epoch": 0.6582805258677764, + "grad_norm": 0.9928916742992132, + "learning_rate": 7.661694408444773e-06, + "loss": 0.7904, + "step": 1740 + }, + { + "epoch": 0.6586588480090797, + "grad_norm": 1.0410892155118083, + "learning_rate": 7.660984576024117e-06, + "loss": 0.8191, + "step": 1741 + }, + { + "epoch": 0.6590371701503831, + "grad_norm": 1.0021028586166405, + "learning_rate": 7.660274032651249e-06, + "loss": 0.7712, + "step": 1742 + }, + { + "epoch": 0.6594154922916864, + "grad_norm": 0.9990600675172764, + "learning_rate": 7.65956277846416e-06, + "loss": 0.7857, + "step": 1743 + }, + { + "epoch": 0.6597938144329897, + "grad_norm": 1.0992751750590166, + "learning_rate": 7.658850813600969e-06, + "loss": 0.7878, + "step": 1744 + }, + { + "epoch": 0.660172136574293, + "grad_norm": 1.0189976892843522, + "learning_rate": 7.65813813819994e-06, + "loss": 0.77, + "step": 1745 + }, + { + "epoch": 0.6605504587155964, + "grad_norm": 1.0468429508760897, + "learning_rate": 7.657424752399471e-06, + "loss": 0.7768, + "step": 1746 + }, + { + "epoch": 0.6609287808568997, + "grad_norm": 1.0374665153019, + "learning_rate": 7.6567106563381e-06, + "loss": 0.8103, + "step": 1747 + }, + { + "epoch": 0.661307102998203, + "grad_norm": 1.0713460469365848, + "learning_rate": 7.655995850154501e-06, + "loss": 0.7646, + "step": 1748 + }, + { + "epoch": 0.6616854251395063, + "grad_norm": 1.048711304359486, + "learning_rate": 7.655280333987491e-06, + "loss": 0.7852, + "step": 1749 + }, + { + "epoch": 0.6620637472808096, + "grad_norm": 1.0319143016049546, + "learning_rate": 7.654564107976017e-06, + "loss": 0.7979, + "step": 1750 + }, + { + "epoch": 0.6624420694221129, + "grad_norm": 1.0575930996275595, + "learning_rate": 7.653847172259169e-06, + "loss": 0.7768, + "step": 1751 + }, + { + "epoch": 0.6628203915634162, + "grad_norm": 0.9638702778680636, + "learning_rate": 7.653129526976173e-06, + "loss": 0.7979, + "step": 1752 + }, + { + "epoch": 0.6631987137047196, + "grad_norm": 0.9690337454201767, + "learning_rate": 7.652411172266398e-06, + "loss": 0.7894, + "step": 1753 + }, + { + "epoch": 0.6635770358460229, + "grad_norm": 1.0072303768845905, + "learning_rate": 7.65169210826934e-06, + "loss": 0.7302, + "step": 1754 + }, + { + "epoch": 0.6639553579873262, + "grad_norm": 1.0168462219112109, + "learning_rate": 7.650972335124644e-06, + "loss": 0.7918, + "step": 1755 + }, + { + "epoch": 0.6643336801286295, + "grad_norm": 0.9845272479814176, + "learning_rate": 7.650251852972084e-06, + "loss": 0.7798, + "step": 1756 + }, + { + "epoch": 0.6647120022699329, + "grad_norm": 1.0559359255774574, + "learning_rate": 7.649530661951578e-06, + "loss": 0.7835, + "step": 1757 + }, + { + "epoch": 0.6650903244112362, + "grad_norm": 1.0127474528668845, + "learning_rate": 7.64880876220318e-06, + "loss": 0.7566, + "step": 1758 + }, + { + "epoch": 0.6654686465525395, + "grad_norm": 1.067173774382862, + "learning_rate": 7.648086153867078e-06, + "loss": 0.7738, + "step": 1759 + }, + { + "epoch": 0.6658469686938429, + "grad_norm": 1.0262747793123224, + "learning_rate": 7.6473628370836e-06, + "loss": 0.7833, + "step": 1760 + }, + { + "epoch": 0.6662252908351461, + "grad_norm": 1.0515582564211456, + "learning_rate": 7.646638811993216e-06, + "loss": 0.7538, + "step": 1761 + }, + { + "epoch": 0.6666036129764494, + "grad_norm": 1.0329994771612065, + "learning_rate": 7.645914078736526e-06, + "loss": 0.8164, + "step": 1762 + }, + { + "epoch": 0.6669819351177527, + "grad_norm": 1.0311907540077614, + "learning_rate": 7.645188637454272e-06, + "loss": 0.7706, + "step": 1763 + }, + { + "epoch": 0.6673602572590561, + "grad_norm": 1.0409947640223565, + "learning_rate": 7.644462488287334e-06, + "loss": 0.7885, + "step": 1764 + }, + { + "epoch": 0.6677385794003594, + "grad_norm": 0.988219756000234, + "learning_rate": 7.643735631376724e-06, + "loss": 0.7408, + "step": 1765 + }, + { + "epoch": 0.6681169015416627, + "grad_norm": 1.027004288225805, + "learning_rate": 7.643008066863598e-06, + "loss": 0.8121, + "step": 1766 + }, + { + "epoch": 0.6684952236829661, + "grad_norm": 1.0184065601333092, + "learning_rate": 7.642279794889249e-06, + "loss": 0.7576, + "step": 1767 + }, + { + "epoch": 0.6688735458242694, + "grad_norm": 1.043603934502605, + "learning_rate": 7.641550815595102e-06, + "loss": 0.771, + "step": 1768 + }, + { + "epoch": 0.6692518679655727, + "grad_norm": 1.060392114018632, + "learning_rate": 7.640821129122723e-06, + "loss": 0.8247, + "step": 1769 + }, + { + "epoch": 0.669630190106876, + "grad_norm": 1.0126323816870029, + "learning_rate": 7.640090735613818e-06, + "loss": 0.8022, + "step": 1770 + }, + { + "epoch": 0.6700085122481794, + "grad_norm": 1.1648366101787067, + "learning_rate": 7.639359635210222e-06, + "loss": 0.7826, + "step": 1771 + }, + { + "epoch": 0.6703868343894827, + "grad_norm": 1.0724674686904885, + "learning_rate": 7.638627828053918e-06, + "loss": 0.7897, + "step": 1772 + }, + { + "epoch": 0.6707651565307859, + "grad_norm": 1.0540972019117152, + "learning_rate": 7.637895314287016e-06, + "loss": 0.7645, + "step": 1773 + }, + { + "epoch": 0.6711434786720892, + "grad_norm": 1.0057331810331451, + "learning_rate": 7.63716209405177e-06, + "loss": 0.816, + "step": 1774 + }, + { + "epoch": 0.6715218008133926, + "grad_norm": 0.9970921236923102, + "learning_rate": 7.63642816749057e-06, + "loss": 0.7671, + "step": 1775 + }, + { + "epoch": 0.6719001229546959, + "grad_norm": 1.002453880727358, + "learning_rate": 7.635693534745941e-06, + "loss": 0.7885, + "step": 1776 + }, + { + "epoch": 0.6722784450959992, + "grad_norm": 1.0312771975163908, + "learning_rate": 7.634958195960548e-06, + "loss": 0.7951, + "step": 1777 + }, + { + "epoch": 0.6726567672373026, + "grad_norm": 1.0177245342291783, + "learning_rate": 7.634222151277188e-06, + "loss": 0.773, + "step": 1778 + }, + { + "epoch": 0.6730350893786059, + "grad_norm": 1.060998481737934, + "learning_rate": 7.633485400838804e-06, + "loss": 0.7924, + "step": 1779 + }, + { + "epoch": 0.6734134115199092, + "grad_norm": 1.0340561242421995, + "learning_rate": 7.632747944788468e-06, + "loss": 0.8451, + "step": 1780 + }, + { + "epoch": 0.6737917336612125, + "grad_norm": 1.0461873170538059, + "learning_rate": 7.63200978326939e-06, + "loss": 0.7896, + "step": 1781 + }, + { + "epoch": 0.6741700558025159, + "grad_norm": 1.0320131696114871, + "learning_rate": 7.631270916424923e-06, + "loss": 0.7914, + "step": 1782 + }, + { + "epoch": 0.6745483779438192, + "grad_norm": 1.0291951526102714, + "learning_rate": 7.630531344398549e-06, + "loss": 0.7273, + "step": 1783 + }, + { + "epoch": 0.6749267000851225, + "grad_norm": 1.0352838518441736, + "learning_rate": 7.62979106733389e-06, + "loss": 0.8042, + "step": 1784 + }, + { + "epoch": 0.6753050222264259, + "grad_norm": 0.999179215624018, + "learning_rate": 7.629050085374709e-06, + "loss": 0.8106, + "step": 1785 + }, + { + "epoch": 0.6756833443677291, + "grad_norm": 1.002781374078623, + "learning_rate": 7.6283083986649e-06, + "loss": 0.7478, + "step": 1786 + }, + { + "epoch": 0.6760616665090324, + "grad_norm": 1.0578987973117508, + "learning_rate": 7.627566007348498e-06, + "loss": 0.767, + "step": 1787 + }, + { + "epoch": 0.6764399886503357, + "grad_norm": 1.018623825083434, + "learning_rate": 7.626822911569673e-06, + "loss": 0.7603, + "step": 1788 + }, + { + "epoch": 0.6768183107916391, + "grad_norm": 1.0691359310227244, + "learning_rate": 7.62607911147273e-06, + "loss": 0.8033, + "step": 1789 + }, + { + "epoch": 0.6771966329329424, + "grad_norm": 1.0473330500599638, + "learning_rate": 7.625334607202115e-06, + "loss": 0.799, + "step": 1790 + }, + { + "epoch": 0.6775749550742457, + "grad_norm": 1.0276960283606948, + "learning_rate": 7.624589398902408e-06, + "loss": 0.7882, + "step": 1791 + }, + { + "epoch": 0.677953277215549, + "grad_norm": 1.0216841452284737, + "learning_rate": 7.623843486718325e-06, + "loss": 0.7753, + "step": 1792 + }, + { + "epoch": 0.6783315993568524, + "grad_norm": 1.017840190852707, + "learning_rate": 7.623096870794722e-06, + "loss": 0.7944, + "step": 1793 + }, + { + "epoch": 0.6787099214981557, + "grad_norm": 1.0234534365543315, + "learning_rate": 7.6223495512765865e-06, + "loss": 0.7607, + "step": 1794 + }, + { + "epoch": 0.679088243639459, + "grad_norm": 1.0142595858519063, + "learning_rate": 7.621601528309049e-06, + "loss": 0.7665, + "step": 1795 + }, + { + "epoch": 0.6794665657807624, + "grad_norm": 1.0071219703193526, + "learning_rate": 7.620852802037371e-06, + "loss": 0.791, + "step": 1796 + }, + { + "epoch": 0.6798448879220657, + "grad_norm": 1.0031377757032336, + "learning_rate": 7.620103372606954e-06, + "loss": 0.7502, + "step": 1797 + }, + { + "epoch": 0.6802232100633689, + "grad_norm": 1.014284865797237, + "learning_rate": 7.619353240163334e-06, + "loss": 0.8012, + "step": 1798 + }, + { + "epoch": 0.6806015322046722, + "grad_norm": 1.0281456730858456, + "learning_rate": 7.618602404852186e-06, + "loss": 0.8308, + "step": 1799 + }, + { + "epoch": 0.6809798543459756, + "grad_norm": 1.0358974761664392, + "learning_rate": 7.617850866819319e-06, + "loss": 0.8116, + "step": 1800 + }, + { + "epoch": 0.6813581764872789, + "grad_norm": 1.0233639481564207, + "learning_rate": 7.61709862621068e-06, + "loss": 0.8062, + "step": 1801 + }, + { + "epoch": 0.6817364986285822, + "grad_norm": 0.9776086740367372, + "learning_rate": 7.61634568317235e-06, + "loss": 0.7926, + "step": 1802 + } + ], + "logging_steps": 1, + "max_steps": 7929, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 53, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.967375682524414e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}