diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12011 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999893708612791, + "eval_steps": 400, + "global_step": 8820, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006802648781369246, + "grad_norm": 3.334683656692505, + "learning_rate": 6.802721088435375e-07, + "loss": 2.9439, + "num_input_tokens_seen": 2359296, + "step": 6 + }, + { + "epoch": 0.0013605297562738492, + "grad_norm": 2.447828769683838, + "learning_rate": 1.360544217687075e-06, + "loss": 2.918, + "num_input_tokens_seen": 4718592, + "step": 12 + }, + { + "epoch": 0.0020407946344107738, + "grad_norm": 1.870556116104126, + "learning_rate": 2.040816326530612e-06, + "loss": 2.8392, + "num_input_tokens_seen": 7077888, + "step": 18 + }, + { + "epoch": 0.0027210595125476984, + "grad_norm": 1.367610216140747, + "learning_rate": 2.72108843537415e-06, + "loss": 2.7568, + "num_input_tokens_seen": 9437184, + "step": 24 + }, + { + "epoch": 0.003401324390684623, + "grad_norm": 1.1030510663986206, + "learning_rate": 3.4013605442176877e-06, + "loss": 2.71, + "num_input_tokens_seen": 11796480, + "step": 30 + }, + { + "epoch": 0.0040815892688215475, + "grad_norm": 0.904552161693573, + "learning_rate": 4.081632653061224e-06, + "loss": 2.6018, + "num_input_tokens_seen": 14155776, + "step": 36 + }, + { + "epoch": 0.004761854146958472, + "grad_norm": 0.7523818612098694, + "learning_rate": 4.7619047619047615e-06, + "loss": 2.5708, + "num_input_tokens_seen": 16515072, + "step": 42 + }, + { + "epoch": 0.005442119025095397, + "grad_norm": 0.6976463198661804, + "learning_rate": 5.4421768707483e-06, + "loss": 2.5212, + "num_input_tokens_seen": 18874368, + "step": 48 + }, + { + "epoch": 0.006122383903232321, + "grad_norm": 0.6655228734016418, + "learning_rate": 6.122448979591837e-06, + "loss": 2.502, + "num_input_tokens_seen": 21233664, + "step": 54 + }, + { + "epoch": 0.006802648781369246, + "grad_norm": 0.6486473679542542, + "learning_rate": 6.802721088435375e-06, + "loss": 2.529, + "num_input_tokens_seen": 23592960, + "step": 60 + }, + { + "epoch": 0.00748291365950617, + "grad_norm": 0.5861107110977173, + "learning_rate": 7.482993197278912e-06, + "loss": 2.501, + "num_input_tokens_seen": 25952256, + "step": 66 + }, + { + "epoch": 0.008163178537643095, + "grad_norm": 0.6155074834823608, + "learning_rate": 8.163265306122448e-06, + "loss": 2.4401, + "num_input_tokens_seen": 28311552, + "step": 72 + }, + { + "epoch": 0.00884344341578002, + "grad_norm": 0.6312918663024902, + "learning_rate": 8.843537414965987e-06, + "loss": 2.4007, + "num_input_tokens_seen": 30670848, + "step": 78 + }, + { + "epoch": 0.009523708293916943, + "grad_norm": 0.5730547904968262, + "learning_rate": 9.523809523809523e-06, + "loss": 2.4158, + "num_input_tokens_seen": 33030144, + "step": 84 + }, + { + "epoch": 0.01020397317205387, + "grad_norm": 0.6233570575714111, + "learning_rate": 1.0204081632653061e-05, + "loss": 2.3776, + "num_input_tokens_seen": 35389440, + "step": 90 + }, + { + "epoch": 0.010884238050190793, + "grad_norm": 0.6111224293708801, + "learning_rate": 1.08843537414966e-05, + "loss": 2.3986, + "num_input_tokens_seen": 37748736, + "step": 96 + }, + { + "epoch": 0.011564502928327718, + "grad_norm": 0.6153233647346497, + "learning_rate": 1.1564625850340138e-05, + "loss": 2.3898, + "num_input_tokens_seen": 40108032, + "step": 102 + }, + { + "epoch": 0.012244767806464642, + "grad_norm": 0.5808484554290771, + "learning_rate": 1.2244897959183674e-05, + "loss": 2.3324, + "num_input_tokens_seen": 42467328, + "step": 108 + }, + { + "epoch": 0.012925032684601566, + "grad_norm": 0.6722745299339294, + "learning_rate": 1.2925170068027212e-05, + "loss": 2.3763, + "num_input_tokens_seen": 44826624, + "step": 114 + }, + { + "epoch": 0.013605297562738492, + "grad_norm": 0.5742015242576599, + "learning_rate": 1.360544217687075e-05, + "loss": 2.341, + "num_input_tokens_seen": 47185920, + "step": 120 + }, + { + "epoch": 0.014285562440875416, + "grad_norm": 0.572271466255188, + "learning_rate": 1.4285714285714285e-05, + "loss": 2.3944, + "num_input_tokens_seen": 49545216, + "step": 126 + }, + { + "epoch": 0.01496582731901234, + "grad_norm": 0.6920814514160156, + "learning_rate": 1.4965986394557824e-05, + "loss": 2.2826, + "num_input_tokens_seen": 51904512, + "step": 132 + }, + { + "epoch": 0.015646092197149266, + "grad_norm": 0.6880550384521484, + "learning_rate": 1.5646258503401362e-05, + "loss": 2.3113, + "num_input_tokens_seen": 54263808, + "step": 138 + }, + { + "epoch": 0.01632635707528619, + "grad_norm": 0.7120464444160461, + "learning_rate": 1.6326530612244897e-05, + "loss": 2.299, + "num_input_tokens_seen": 56623104, + "step": 144 + }, + { + "epoch": 0.017006621953423114, + "grad_norm": 0.7833120226860046, + "learning_rate": 1.7006802721088435e-05, + "loss": 2.3243, + "num_input_tokens_seen": 58982400, + "step": 150 + }, + { + "epoch": 0.01768688683156004, + "grad_norm": 0.6928340792655945, + "learning_rate": 1.7687074829931973e-05, + "loss": 2.3074, + "num_input_tokens_seen": 61341696, + "step": 156 + }, + { + "epoch": 0.018367151709696963, + "grad_norm": 0.6552649736404419, + "learning_rate": 1.836734693877551e-05, + "loss": 2.2916, + "num_input_tokens_seen": 63700992, + "step": 162 + }, + { + "epoch": 0.019047416587833887, + "grad_norm": 0.776566743850708, + "learning_rate": 1.9047619047619046e-05, + "loss": 2.3148, + "num_input_tokens_seen": 66060288, + "step": 168 + }, + { + "epoch": 0.01972768146597081, + "grad_norm": 0.8164829611778259, + "learning_rate": 1.9727891156462584e-05, + "loss": 2.3097, + "num_input_tokens_seen": 68419584, + "step": 174 + }, + { + "epoch": 0.02040794634410774, + "grad_norm": 0.6100189685821533, + "learning_rate": 2.0408163265306123e-05, + "loss": 2.279, + "num_input_tokens_seen": 70778880, + "step": 180 + }, + { + "epoch": 0.021088211222244663, + "grad_norm": 0.6935499310493469, + "learning_rate": 2.108843537414966e-05, + "loss": 2.3033, + "num_input_tokens_seen": 73138176, + "step": 186 + }, + { + "epoch": 0.021768476100381587, + "grad_norm": 0.6464715600013733, + "learning_rate": 2.17687074829932e-05, + "loss": 2.29, + "num_input_tokens_seen": 75497472, + "step": 192 + }, + { + "epoch": 0.02244874097851851, + "grad_norm": 0.6630780696868896, + "learning_rate": 2.2448979591836737e-05, + "loss": 2.2882, + "num_input_tokens_seen": 77856768, + "step": 198 + }, + { + "epoch": 0.023129005856655435, + "grad_norm": 0.6190893054008484, + "learning_rate": 2.3129251700680275e-05, + "loss": 2.2754, + "num_input_tokens_seen": 80216064, + "step": 204 + }, + { + "epoch": 0.02380927073479236, + "grad_norm": 0.7775458097457886, + "learning_rate": 2.380952380952381e-05, + "loss": 2.2589, + "num_input_tokens_seen": 82575360, + "step": 210 + }, + { + "epoch": 0.024489535612929283, + "grad_norm": 0.7430821657180786, + "learning_rate": 2.448979591836735e-05, + "loss": 2.2157, + "num_input_tokens_seen": 84934656, + "step": 216 + }, + { + "epoch": 0.025169800491066208, + "grad_norm": 0.8468402624130249, + "learning_rate": 2.5170068027210887e-05, + "loss": 2.2681, + "num_input_tokens_seen": 87293952, + "step": 222 + }, + { + "epoch": 0.025850065369203132, + "grad_norm": 0.6779688000679016, + "learning_rate": 2.5850340136054425e-05, + "loss": 2.2527, + "num_input_tokens_seen": 89653248, + "step": 228 + }, + { + "epoch": 0.02653033024734006, + "grad_norm": 0.7911133170127869, + "learning_rate": 2.6530612244897963e-05, + "loss": 2.2089, + "num_input_tokens_seen": 92012544, + "step": 234 + }, + { + "epoch": 0.027210595125476984, + "grad_norm": 0.8615039587020874, + "learning_rate": 2.72108843537415e-05, + "loss": 2.2765, + "num_input_tokens_seen": 94371840, + "step": 240 + }, + { + "epoch": 0.027890860003613908, + "grad_norm": 0.8226616978645325, + "learning_rate": 2.7891156462585033e-05, + "loss": 2.2692, + "num_input_tokens_seen": 96731136, + "step": 246 + }, + { + "epoch": 0.028571124881750832, + "grad_norm": 0.7901866436004639, + "learning_rate": 2.857142857142857e-05, + "loss": 2.2716, + "num_input_tokens_seen": 99090432, + "step": 252 + }, + { + "epoch": 0.029251389759887756, + "grad_norm": 0.7447572350502014, + "learning_rate": 2.925170068027211e-05, + "loss": 2.2277, + "num_input_tokens_seen": 101449728, + "step": 258 + }, + { + "epoch": 0.02993165463802468, + "grad_norm": 0.7737066149711609, + "learning_rate": 2.9931972789115647e-05, + "loss": 2.2078, + "num_input_tokens_seen": 103809024, + "step": 264 + }, + { + "epoch": 0.030611919516161604, + "grad_norm": 0.9132639169692993, + "learning_rate": 3.061224489795919e-05, + "loss": 2.2601, + "num_input_tokens_seen": 106168320, + "step": 270 + }, + { + "epoch": 0.03129218439429853, + "grad_norm": 0.7523462772369385, + "learning_rate": 3.1292517006802724e-05, + "loss": 2.2504, + "num_input_tokens_seen": 108527616, + "step": 276 + }, + { + "epoch": 0.031972449272435456, + "grad_norm": 0.688888430595398, + "learning_rate": 3.1972789115646265e-05, + "loss": 2.2534, + "num_input_tokens_seen": 110886912, + "step": 282 + }, + { + "epoch": 0.03265271415057238, + "grad_norm": 1.0086206197738647, + "learning_rate": 3.265306122448979e-05, + "loss": 2.2697, + "num_input_tokens_seen": 113246208, + "step": 288 + }, + { + "epoch": 0.033332979028709304, + "grad_norm": 0.93113112449646, + "learning_rate": 3.3333333333333335e-05, + "loss": 2.2168, + "num_input_tokens_seen": 115605504, + "step": 294 + }, + { + "epoch": 0.03401324390684623, + "grad_norm": 1.0298339128494263, + "learning_rate": 3.401360544217687e-05, + "loss": 2.249, + "num_input_tokens_seen": 117964800, + "step": 300 + }, + { + "epoch": 0.03469350878498315, + "grad_norm": 0.807465136051178, + "learning_rate": 3.469387755102041e-05, + "loss": 2.2063, + "num_input_tokens_seen": 120324096, + "step": 306 + }, + { + "epoch": 0.03537377366312008, + "grad_norm": 0.8339959383010864, + "learning_rate": 3.5374149659863946e-05, + "loss": 2.2061, + "num_input_tokens_seen": 122683392, + "step": 312 + }, + { + "epoch": 0.036054038541257, + "grad_norm": 0.8316759467124939, + "learning_rate": 3.605442176870749e-05, + "loss": 2.2576, + "num_input_tokens_seen": 125042688, + "step": 318 + }, + { + "epoch": 0.036734303419393925, + "grad_norm": 0.7208542823791504, + "learning_rate": 3.673469387755102e-05, + "loss": 2.1918, + "num_input_tokens_seen": 127401984, + "step": 324 + }, + { + "epoch": 0.03741456829753085, + "grad_norm": 0.658276379108429, + "learning_rate": 3.7414965986394564e-05, + "loss": 2.231, + "num_input_tokens_seen": 129761280, + "step": 330 + }, + { + "epoch": 0.038094833175667774, + "grad_norm": 0.7552313208580017, + "learning_rate": 3.809523809523809e-05, + "loss": 2.1973, + "num_input_tokens_seen": 132120576, + "step": 336 + }, + { + "epoch": 0.0387750980538047, + "grad_norm": 0.6971672773361206, + "learning_rate": 3.8775510204081634e-05, + "loss": 2.2358, + "num_input_tokens_seen": 134479872, + "step": 342 + }, + { + "epoch": 0.03945536293194162, + "grad_norm": 0.9760845303535461, + "learning_rate": 3.945578231292517e-05, + "loss": 2.2392, + "num_input_tokens_seen": 136839168, + "step": 348 + }, + { + "epoch": 0.040135627810078546, + "grad_norm": 0.8042694330215454, + "learning_rate": 4.013605442176871e-05, + "loss": 2.2047, + "num_input_tokens_seen": 139198464, + "step": 354 + }, + { + "epoch": 0.04081589268821548, + "grad_norm": 0.6926929354667664, + "learning_rate": 4.0816326530612245e-05, + "loss": 2.2546, + "num_input_tokens_seen": 141557760, + "step": 360 + }, + { + "epoch": 0.0414961575663524, + "grad_norm": 0.8320844173431396, + "learning_rate": 4.149659863945579e-05, + "loss": 2.1762, + "num_input_tokens_seen": 143917056, + "step": 366 + }, + { + "epoch": 0.042176422444489325, + "grad_norm": 0.9185928106307983, + "learning_rate": 4.217687074829932e-05, + "loss": 2.2258, + "num_input_tokens_seen": 146276352, + "step": 372 + }, + { + "epoch": 0.04285668732262625, + "grad_norm": 0.9259613156318665, + "learning_rate": 4.2857142857142856e-05, + "loss": 2.1874, + "num_input_tokens_seen": 148635648, + "step": 378 + }, + { + "epoch": 0.043536952200763174, + "grad_norm": 0.7989060878753662, + "learning_rate": 4.35374149659864e-05, + "loss": 2.2364, + "num_input_tokens_seen": 150994944, + "step": 384 + }, + { + "epoch": 0.0442172170789001, + "grad_norm": 0.975271999835968, + "learning_rate": 4.421768707482993e-05, + "loss": 2.2092, + "num_input_tokens_seen": 153354240, + "step": 390 + }, + { + "epoch": 0.04489748195703702, + "grad_norm": 0.6971870064735413, + "learning_rate": 4.4897959183673474e-05, + "loss": 2.2374, + "num_input_tokens_seen": 155713536, + "step": 396 + }, + { + "epoch": 0.04535099187579497, + "eval_accuracy": 0.5588241758241759, + "eval_loss": 2.1870992183685303, + "eval_runtime": 128.6022, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 1.042, + "num_input_tokens_seen": 157286400, + "step": 400 + }, + { + "epoch": 0.045577746835173946, + "grad_norm": 0.8090885877609253, + "learning_rate": 4.557823129251701e-05, + "loss": 2.2098, + "num_input_tokens_seen": 158072832, + "step": 402 + }, + { + "epoch": 0.04625801171331087, + "grad_norm": 0.8753838539123535, + "learning_rate": 4.625850340136055e-05, + "loss": 2.1732, + "num_input_tokens_seen": 160432128, + "step": 408 + }, + { + "epoch": 0.046938276591447795, + "grad_norm": 1.0101414918899536, + "learning_rate": 4.6938775510204086e-05, + "loss": 2.1742, + "num_input_tokens_seen": 162791424, + "step": 414 + }, + { + "epoch": 0.04761854146958472, + "grad_norm": 0.841810405254364, + "learning_rate": 4.761904761904762e-05, + "loss": 2.1769, + "num_input_tokens_seen": 165150720, + "step": 420 + }, + { + "epoch": 0.04829880634772164, + "grad_norm": 0.9404070973396301, + "learning_rate": 4.8299319727891155e-05, + "loss": 2.2274, + "num_input_tokens_seen": 167510016, + "step": 426 + }, + { + "epoch": 0.04897907122585857, + "grad_norm": 0.7818936109542847, + "learning_rate": 4.89795918367347e-05, + "loss": 2.2116, + "num_input_tokens_seen": 169869312, + "step": 432 + }, + { + "epoch": 0.04965933610399549, + "grad_norm": 0.8758242130279541, + "learning_rate": 4.965986394557823e-05, + "loss": 2.2065, + "num_input_tokens_seen": 172228608, + "step": 438 + }, + { + "epoch": 0.050339600982132415, + "grad_norm": 0.8778213262557983, + "learning_rate": 4.9982098102398855e-05, + "loss": 2.1828, + "num_input_tokens_seen": 174587904, + "step": 444 + }, + { + "epoch": 0.05101986586026934, + "grad_norm": 0.8915372490882874, + "learning_rate": 4.9946294307196566e-05, + "loss": 2.1787, + "num_input_tokens_seen": 176947200, + "step": 450 + }, + { + "epoch": 0.051700130738406264, + "grad_norm": 0.8329364657402039, + "learning_rate": 4.991049051199427e-05, + "loss": 2.1746, + "num_input_tokens_seen": 179306496, + "step": 456 + }, + { + "epoch": 0.052380395616543195, + "grad_norm": 0.7898052334785461, + "learning_rate": 4.987468671679198e-05, + "loss": 2.1859, + "num_input_tokens_seen": 181665792, + "step": 462 + }, + { + "epoch": 0.05306066049468012, + "grad_norm": 0.9453828930854797, + "learning_rate": 4.9838882921589694e-05, + "loss": 2.2036, + "num_input_tokens_seen": 184025088, + "step": 468 + }, + { + "epoch": 0.05374092537281704, + "grad_norm": 0.8436469435691833, + "learning_rate": 4.98030791263874e-05, + "loss": 2.169, + "num_input_tokens_seen": 186384384, + "step": 474 + }, + { + "epoch": 0.05442119025095397, + "grad_norm": 0.9970703721046448, + "learning_rate": 4.976727533118511e-05, + "loss": 2.161, + "num_input_tokens_seen": 188743680, + "step": 480 + }, + { + "epoch": 0.05510145512909089, + "grad_norm": 0.7459275722503662, + "learning_rate": 4.9731471535982815e-05, + "loss": 2.2542, + "num_input_tokens_seen": 191102976, + "step": 486 + }, + { + "epoch": 0.055781720007227815, + "grad_norm": 0.9407626986503601, + "learning_rate": 4.969566774078053e-05, + "loss": 2.1648, + "num_input_tokens_seen": 193462272, + "step": 492 + }, + { + "epoch": 0.05646198488536474, + "grad_norm": 0.9194992780685425, + "learning_rate": 4.965986394557823e-05, + "loss": 2.142, + "num_input_tokens_seen": 195821568, + "step": 498 + }, + { + "epoch": 0.057142249763501664, + "grad_norm": 0.819237232208252, + "learning_rate": 4.9624060150375936e-05, + "loss": 2.2195, + "num_input_tokens_seen": 198180864, + "step": 504 + }, + { + "epoch": 0.05782251464163859, + "grad_norm": 0.8461591005325317, + "learning_rate": 4.958825635517365e-05, + "loss": 2.1649, + "num_input_tokens_seen": 200540160, + "step": 510 + }, + { + "epoch": 0.05850277951977551, + "grad_norm": 0.8540611267089844, + "learning_rate": 4.955245255997136e-05, + "loss": 2.1611, + "num_input_tokens_seen": 202899456, + "step": 516 + }, + { + "epoch": 0.059183044397912436, + "grad_norm": 0.767410933971405, + "learning_rate": 4.951664876476907e-05, + "loss": 2.1524, + "num_input_tokens_seen": 205258752, + "step": 522 + }, + { + "epoch": 0.05986330927604936, + "grad_norm": 1.049315333366394, + "learning_rate": 4.9480844969566776e-05, + "loss": 2.1582, + "num_input_tokens_seen": 207618048, + "step": 528 + }, + { + "epoch": 0.060543574154186285, + "grad_norm": 0.7773332595825195, + "learning_rate": 4.944504117436449e-05, + "loss": 2.193, + "num_input_tokens_seen": 209977344, + "step": 534 + }, + { + "epoch": 0.06122383903232321, + "grad_norm": 0.9237553477287292, + "learning_rate": 4.940923737916219e-05, + "loss": 2.158, + "num_input_tokens_seen": 212336640, + "step": 540 + }, + { + "epoch": 0.06190410391046013, + "grad_norm": 0.836017370223999, + "learning_rate": 4.93734335839599e-05, + "loss": 2.2233, + "num_input_tokens_seen": 214695936, + "step": 546 + }, + { + "epoch": 0.06258436878859706, + "grad_norm": 0.8589292168617249, + "learning_rate": 4.933762978875761e-05, + "loss": 2.2209, + "num_input_tokens_seen": 217055232, + "step": 552 + }, + { + "epoch": 0.06326463366673399, + "grad_norm": 0.7112890481948853, + "learning_rate": 4.930182599355532e-05, + "loss": 2.2024, + "num_input_tokens_seen": 219414528, + "step": 558 + }, + { + "epoch": 0.06394489854487091, + "grad_norm": 0.718296229839325, + "learning_rate": 4.926602219835303e-05, + "loss": 2.1465, + "num_input_tokens_seen": 221773824, + "step": 564 + }, + { + "epoch": 0.06462516342300784, + "grad_norm": 0.8518996238708496, + "learning_rate": 4.9230218403150736e-05, + "loss": 2.1673, + "num_input_tokens_seen": 224133120, + "step": 570 + }, + { + "epoch": 0.06530542830114476, + "grad_norm": 0.8261798024177551, + "learning_rate": 4.919441460794845e-05, + "loss": 2.1991, + "num_input_tokens_seen": 226492416, + "step": 576 + }, + { + "epoch": 0.06598569317928168, + "grad_norm": 0.9106934070587158, + "learning_rate": 4.915861081274615e-05, + "loss": 2.1448, + "num_input_tokens_seen": 228851712, + "step": 582 + }, + { + "epoch": 0.06666595805741861, + "grad_norm": 0.8303735256195068, + "learning_rate": 4.912280701754386e-05, + "loss": 2.1883, + "num_input_tokens_seen": 231211008, + "step": 588 + }, + { + "epoch": 0.06734622293555553, + "grad_norm": 0.7179098129272461, + "learning_rate": 4.908700322234157e-05, + "loss": 2.1744, + "num_input_tokens_seen": 233570304, + "step": 594 + }, + { + "epoch": 0.06802648781369246, + "grad_norm": 0.9411275386810303, + "learning_rate": 4.905119942713928e-05, + "loss": 2.1269, + "num_input_tokens_seen": 235929600, + "step": 600 + }, + { + "epoch": 0.06870675269182938, + "grad_norm": 0.9154955744743347, + "learning_rate": 4.901539563193699e-05, + "loss": 2.1807, + "num_input_tokens_seen": 238288896, + "step": 606 + }, + { + "epoch": 0.0693870175699663, + "grad_norm": 0.6920604705810547, + "learning_rate": 4.89795918367347e-05, + "loss": 2.1427, + "num_input_tokens_seen": 240648192, + "step": 612 + }, + { + "epoch": 0.07006728244810323, + "grad_norm": 0.6742058396339417, + "learning_rate": 4.894378804153241e-05, + "loss": 2.1443, + "num_input_tokens_seen": 243007488, + "step": 618 + }, + { + "epoch": 0.07074754732624015, + "grad_norm": 0.7783246040344238, + "learning_rate": 4.890798424633011e-05, + "loss": 2.1613, + "num_input_tokens_seen": 245366784, + "step": 624 + }, + { + "epoch": 0.07142781220437708, + "grad_norm": 0.9674144983291626, + "learning_rate": 4.887218045112782e-05, + "loss": 2.1811, + "num_input_tokens_seen": 247726080, + "step": 630 + }, + { + "epoch": 0.072108077082514, + "grad_norm": 0.7616820335388184, + "learning_rate": 4.883637665592553e-05, + "loss": 2.1264, + "num_input_tokens_seen": 250085376, + "step": 636 + }, + { + "epoch": 0.07278834196065093, + "grad_norm": 0.7255170941352844, + "learning_rate": 4.8800572860723234e-05, + "loss": 2.1, + "num_input_tokens_seen": 252444672, + "step": 642 + }, + { + "epoch": 0.07346860683878785, + "grad_norm": 0.7598791718482971, + "learning_rate": 4.8764769065520946e-05, + "loss": 2.1683, + "num_input_tokens_seen": 254803968, + "step": 648 + }, + { + "epoch": 0.07414887171692477, + "grad_norm": 0.6518032550811768, + "learning_rate": 4.872896527031866e-05, + "loss": 2.1476, + "num_input_tokens_seen": 257163264, + "step": 654 + }, + { + "epoch": 0.0748291365950617, + "grad_norm": 0.6598438024520874, + "learning_rate": 4.869316147511637e-05, + "loss": 2.1574, + "num_input_tokens_seen": 259522560, + "step": 660 + }, + { + "epoch": 0.07550940147319862, + "grad_norm": 0.7716994881629944, + "learning_rate": 4.8657357679914074e-05, + "loss": 2.1797, + "num_input_tokens_seen": 261881856, + "step": 666 + }, + { + "epoch": 0.07618966635133555, + "grad_norm": 0.6889157295227051, + "learning_rate": 4.862155388471178e-05, + "loss": 2.126, + "num_input_tokens_seen": 264241152, + "step": 672 + }, + { + "epoch": 0.07686993122947247, + "grad_norm": 0.7756158113479614, + "learning_rate": 4.858575008950949e-05, + "loss": 2.1199, + "num_input_tokens_seen": 266600448, + "step": 678 + }, + { + "epoch": 0.0775501961076094, + "grad_norm": 0.7093831896781921, + "learning_rate": 4.8549946294307195e-05, + "loss": 2.1428, + "num_input_tokens_seen": 268959744, + "step": 684 + }, + { + "epoch": 0.07823046098574632, + "grad_norm": 0.786888599395752, + "learning_rate": 4.8514142499104906e-05, + "loss": 2.2016, + "num_input_tokens_seen": 271319040, + "step": 690 + }, + { + "epoch": 0.07891072586388324, + "grad_norm": 1.1578270196914673, + "learning_rate": 4.847833870390262e-05, + "loss": 2.1278, + "num_input_tokens_seen": 273678336, + "step": 696 + }, + { + "epoch": 0.07959099074202017, + "grad_norm": 0.8472400307655334, + "learning_rate": 4.844253490870033e-05, + "loss": 2.1288, + "num_input_tokens_seen": 276037632, + "step": 702 + }, + { + "epoch": 0.08027125562015709, + "grad_norm": 0.708258867263794, + "learning_rate": 4.8406731113498034e-05, + "loss": 2.1609, + "num_input_tokens_seen": 278396928, + "step": 708 + }, + { + "epoch": 0.08095152049829403, + "grad_norm": 0.9472253918647766, + "learning_rate": 4.837092731829574e-05, + "loss": 2.1452, + "num_input_tokens_seen": 280756224, + "step": 714 + }, + { + "epoch": 0.08163178537643095, + "grad_norm": 0.7842580080032349, + "learning_rate": 4.833512352309345e-05, + "loss": 2.1191, + "num_input_tokens_seen": 283115520, + "step": 720 + }, + { + "epoch": 0.08231205025456788, + "grad_norm": 0.6376339793205261, + "learning_rate": 4.8299319727891155e-05, + "loss": 2.1575, + "num_input_tokens_seen": 285474816, + "step": 726 + }, + { + "epoch": 0.0829923151327048, + "grad_norm": 0.6511639952659607, + "learning_rate": 4.826351593268887e-05, + "loss": 2.1064, + "num_input_tokens_seen": 287834112, + "step": 732 + }, + { + "epoch": 0.08367258001084173, + "grad_norm": 0.599183976650238, + "learning_rate": 4.822771213748657e-05, + "loss": 2.1406, + "num_input_tokens_seen": 290193408, + "step": 738 + }, + { + "epoch": 0.08435284488897865, + "grad_norm": 0.6168906092643738, + "learning_rate": 4.819190834228429e-05, + "loss": 2.1267, + "num_input_tokens_seen": 292552704, + "step": 744 + }, + { + "epoch": 0.08503310976711558, + "grad_norm": 0.7475244998931885, + "learning_rate": 4.8156104547081995e-05, + "loss": 2.1511, + "num_input_tokens_seen": 294912000, + "step": 750 + }, + { + "epoch": 0.0857133746452525, + "grad_norm": 0.7636436223983765, + "learning_rate": 4.81203007518797e-05, + "loss": 2.1664, + "num_input_tokens_seen": 297271296, + "step": 756 + }, + { + "epoch": 0.08639363952338942, + "grad_norm": 0.8825888633728027, + "learning_rate": 4.808449695667741e-05, + "loss": 2.1587, + "num_input_tokens_seen": 299630592, + "step": 762 + }, + { + "epoch": 0.08707390440152635, + "grad_norm": 0.8732916712760925, + "learning_rate": 4.8048693161475116e-05, + "loss": 2.1548, + "num_input_tokens_seen": 301989888, + "step": 768 + }, + { + "epoch": 0.08775416927966327, + "grad_norm": 0.9067391157150269, + "learning_rate": 4.801288936627283e-05, + "loss": 2.1383, + "num_input_tokens_seen": 304349184, + "step": 774 + }, + { + "epoch": 0.0884344341578002, + "grad_norm": 0.795757532119751, + "learning_rate": 4.797708557107053e-05, + "loss": 2.1138, + "num_input_tokens_seen": 306708480, + "step": 780 + }, + { + "epoch": 0.08911469903593712, + "grad_norm": 0.8759602904319763, + "learning_rate": 4.7941281775868244e-05, + "loss": 2.1481, + "num_input_tokens_seen": 309067776, + "step": 786 + }, + { + "epoch": 0.08979496391407404, + "grad_norm": 0.6154273748397827, + "learning_rate": 4.7905477980665955e-05, + "loss": 2.1156, + "num_input_tokens_seen": 311427072, + "step": 792 + }, + { + "epoch": 0.09047522879221097, + "grad_norm": 0.6875632405281067, + "learning_rate": 4.786967418546366e-05, + "loss": 2.143, + "num_input_tokens_seen": 313786368, + "step": 798 + }, + { + "epoch": 0.09070198375158994, + "eval_accuracy": 0.5665372405372405, + "eval_loss": 2.1336145401000977, + "eval_runtime": 130.3356, + "eval_samples_per_second": 3.069, + "eval_steps_per_second": 1.028, + "num_input_tokens_seen": 314572800, + "step": 800 + }, + { + "epoch": 0.09115549367034789, + "grad_norm": 0.6513155698776245, + "learning_rate": 4.783387039026137e-05, + "loss": 2.1974, + "num_input_tokens_seen": 316145664, + "step": 804 + }, + { + "epoch": 0.09183575854848482, + "grad_norm": 0.6210876703262329, + "learning_rate": 4.7798066595059076e-05, + "loss": 2.1321, + "num_input_tokens_seen": 318504960, + "step": 810 + }, + { + "epoch": 0.09251602342662174, + "grad_norm": 0.6908721327781677, + "learning_rate": 4.776226279985679e-05, + "loss": 2.1574, + "num_input_tokens_seen": 320864256, + "step": 816 + }, + { + "epoch": 0.09319628830475866, + "grad_norm": 0.8170259594917297, + "learning_rate": 4.772645900465449e-05, + "loss": 2.0947, + "num_input_tokens_seen": 323223552, + "step": 822 + }, + { + "epoch": 0.09387655318289559, + "grad_norm": 0.7803713083267212, + "learning_rate": 4.7690655209452204e-05, + "loss": 2.1412, + "num_input_tokens_seen": 325582848, + "step": 828 + }, + { + "epoch": 0.09455681806103251, + "grad_norm": 0.9013774394989014, + "learning_rate": 4.7654851414249916e-05, + "loss": 2.1511, + "num_input_tokens_seen": 327942144, + "step": 834 + }, + { + "epoch": 0.09523708293916944, + "grad_norm": 0.691776692867279, + "learning_rate": 4.761904761904762e-05, + "loss": 2.149, + "num_input_tokens_seen": 330301440, + "step": 840 + }, + { + "epoch": 0.09591734781730636, + "grad_norm": 0.7903074622154236, + "learning_rate": 4.758324382384533e-05, + "loss": 2.1061, + "num_input_tokens_seen": 332660736, + "step": 846 + }, + { + "epoch": 0.09659761269544329, + "grad_norm": 0.7019173502922058, + "learning_rate": 4.754744002864304e-05, + "loss": 2.1225, + "num_input_tokens_seen": 335020032, + "step": 852 + }, + { + "epoch": 0.09727787757358021, + "grad_norm": 0.7324870824813843, + "learning_rate": 4.751163623344075e-05, + "loss": 2.0793, + "num_input_tokens_seen": 337379328, + "step": 858 + }, + { + "epoch": 0.09795814245171713, + "grad_norm": 0.6702744960784912, + "learning_rate": 4.747583243823845e-05, + "loss": 2.1023, + "num_input_tokens_seen": 339738624, + "step": 864 + }, + { + "epoch": 0.09863840732985406, + "grad_norm": 0.7916101217269897, + "learning_rate": 4.7440028643036165e-05, + "loss": 2.1885, + "num_input_tokens_seen": 342097920, + "step": 870 + }, + { + "epoch": 0.09931867220799098, + "grad_norm": 0.7214677929878235, + "learning_rate": 4.740422484783387e-05, + "loss": 2.2039, + "num_input_tokens_seen": 344457216, + "step": 876 + }, + { + "epoch": 0.0999989370861279, + "grad_norm": 0.9506244659423828, + "learning_rate": 4.736842105263158e-05, + "loss": 2.1401, + "num_input_tokens_seen": 346816512, + "step": 882 + }, + { + "epoch": 0.10067920196426483, + "grad_norm": 0.8334141969680786, + "learning_rate": 4.733261725742929e-05, + "loss": 2.1155, + "num_input_tokens_seen": 349175808, + "step": 888 + }, + { + "epoch": 0.10135946684240175, + "grad_norm": 0.8239167928695679, + "learning_rate": 4.7296813462227e-05, + "loss": 2.1143, + "num_input_tokens_seen": 351535104, + "step": 894 + }, + { + "epoch": 0.10203973172053868, + "grad_norm": 0.6935220956802368, + "learning_rate": 4.726100966702471e-05, + "loss": 2.1481, + "num_input_tokens_seen": 353894400, + "step": 900 + }, + { + "epoch": 0.1027199965986756, + "grad_norm": 0.6344029307365417, + "learning_rate": 4.7225205871822413e-05, + "loss": 2.1337, + "num_input_tokens_seen": 356253696, + "step": 906 + }, + { + "epoch": 0.10340026147681253, + "grad_norm": 0.9699512720108032, + "learning_rate": 4.7189402076620125e-05, + "loss": 2.1522, + "num_input_tokens_seen": 358612992, + "step": 912 + }, + { + "epoch": 0.10408052635494947, + "grad_norm": 0.6322435736656189, + "learning_rate": 4.715359828141783e-05, + "loss": 2.1133, + "num_input_tokens_seen": 360972288, + "step": 918 + }, + { + "epoch": 0.10476079123308639, + "grad_norm": 0.755022406578064, + "learning_rate": 4.711779448621554e-05, + "loss": 2.1191, + "num_input_tokens_seen": 363331584, + "step": 924 + }, + { + "epoch": 0.10544105611122331, + "grad_norm": 0.6669276356697083, + "learning_rate": 4.708199069101325e-05, + "loss": 2.1255, + "num_input_tokens_seen": 365690880, + "step": 930 + }, + { + "epoch": 0.10612132098936024, + "grad_norm": 0.7509146928787231, + "learning_rate": 4.704618689581096e-05, + "loss": 2.1711, + "num_input_tokens_seen": 368050176, + "step": 936 + }, + { + "epoch": 0.10680158586749716, + "grad_norm": 0.8903239369392395, + "learning_rate": 4.701038310060867e-05, + "loss": 2.1675, + "num_input_tokens_seen": 370409472, + "step": 942 + }, + { + "epoch": 0.10748185074563409, + "grad_norm": 0.6709368824958801, + "learning_rate": 4.6974579305406374e-05, + "loss": 2.1266, + "num_input_tokens_seen": 372768768, + "step": 948 + }, + { + "epoch": 0.10816211562377101, + "grad_norm": 0.6461692452430725, + "learning_rate": 4.6938775510204086e-05, + "loss": 2.1098, + "num_input_tokens_seen": 375128064, + "step": 954 + }, + { + "epoch": 0.10884238050190793, + "grad_norm": 0.8384061455726624, + "learning_rate": 4.690297171500179e-05, + "loss": 2.1397, + "num_input_tokens_seen": 377487360, + "step": 960 + }, + { + "epoch": 0.10952264538004486, + "grad_norm": 0.6946293115615845, + "learning_rate": 4.6867167919799495e-05, + "loss": 2.1413, + "num_input_tokens_seen": 379846656, + "step": 966 + }, + { + "epoch": 0.11020291025818178, + "grad_norm": 0.7992385625839233, + "learning_rate": 4.6831364124597213e-05, + "loss": 2.1188, + "num_input_tokens_seen": 382205952, + "step": 972 + }, + { + "epoch": 0.1108831751363187, + "grad_norm": 0.6177113056182861, + "learning_rate": 4.679556032939492e-05, + "loss": 2.1798, + "num_input_tokens_seen": 384565248, + "step": 978 + }, + { + "epoch": 0.11156344001445563, + "grad_norm": 0.6821500062942505, + "learning_rate": 4.675975653419263e-05, + "loss": 2.1062, + "num_input_tokens_seen": 386924544, + "step": 984 + }, + { + "epoch": 0.11224370489259256, + "grad_norm": 0.6839202642440796, + "learning_rate": 4.6723952738990334e-05, + "loss": 2.1472, + "num_input_tokens_seen": 389283840, + "step": 990 + }, + { + "epoch": 0.11292396977072948, + "grad_norm": 0.5608601570129395, + "learning_rate": 4.6688148943788046e-05, + "loss": 2.0845, + "num_input_tokens_seen": 391643136, + "step": 996 + }, + { + "epoch": 0.1136042346488664, + "grad_norm": 0.7359477877616882, + "learning_rate": 4.665234514858575e-05, + "loss": 2.1426, + "num_input_tokens_seen": 394002432, + "step": 1002 + }, + { + "epoch": 0.11428449952700333, + "grad_norm": 0.6624149084091187, + "learning_rate": 4.6616541353383456e-05, + "loss": 2.1188, + "num_input_tokens_seen": 396361728, + "step": 1008 + }, + { + "epoch": 0.11496476440514025, + "grad_norm": 0.6561130285263062, + "learning_rate": 4.658073755818117e-05, + "loss": 2.1141, + "num_input_tokens_seen": 398721024, + "step": 1014 + }, + { + "epoch": 0.11564502928327718, + "grad_norm": 0.76801598072052, + "learning_rate": 4.654493376297888e-05, + "loss": 2.1074, + "num_input_tokens_seen": 401080320, + "step": 1020 + }, + { + "epoch": 0.1163252941614141, + "grad_norm": 0.7016099095344543, + "learning_rate": 4.650912996777659e-05, + "loss": 2.116, + "num_input_tokens_seen": 403439616, + "step": 1026 + }, + { + "epoch": 0.11700555903955102, + "grad_norm": 0.7845112681388855, + "learning_rate": 4.6473326172574295e-05, + "loss": 2.0639, + "num_input_tokens_seen": 405798912, + "step": 1032 + }, + { + "epoch": 0.11768582391768795, + "grad_norm": 0.7502654194831848, + "learning_rate": 4.6437522377372007e-05, + "loss": 2.0549, + "num_input_tokens_seen": 408158208, + "step": 1038 + }, + { + "epoch": 0.11836608879582487, + "grad_norm": 0.8195413947105408, + "learning_rate": 4.640171858216971e-05, + "loss": 2.1463, + "num_input_tokens_seen": 410517504, + "step": 1044 + }, + { + "epoch": 0.1190463536739618, + "grad_norm": 0.6572406888008118, + "learning_rate": 4.6365914786967416e-05, + "loss": 2.1043, + "num_input_tokens_seen": 412876800, + "step": 1050 + }, + { + "epoch": 0.11972661855209872, + "grad_norm": 0.6787090301513672, + "learning_rate": 4.633011099176513e-05, + "loss": 2.1287, + "num_input_tokens_seen": 415236096, + "step": 1056 + }, + { + "epoch": 0.12040688343023564, + "grad_norm": 0.6975082755088806, + "learning_rate": 4.629430719656284e-05, + "loss": 2.1439, + "num_input_tokens_seen": 417595392, + "step": 1062 + }, + { + "epoch": 0.12108714830837257, + "grad_norm": 1.1566354036331177, + "learning_rate": 4.625850340136055e-05, + "loss": 2.1331, + "num_input_tokens_seen": 419954688, + "step": 1068 + }, + { + "epoch": 0.1217674131865095, + "grad_norm": 0.804976224899292, + "learning_rate": 4.6222699606158255e-05, + "loss": 2.1842, + "num_input_tokens_seen": 422313984, + "step": 1074 + }, + { + "epoch": 0.12244767806464642, + "grad_norm": 0.7782629728317261, + "learning_rate": 4.618689581095597e-05, + "loss": 2.1258, + "num_input_tokens_seen": 424673280, + "step": 1080 + }, + { + "epoch": 0.12312794294278334, + "grad_norm": 0.7431383728981018, + "learning_rate": 4.615109201575367e-05, + "loss": 2.0759, + "num_input_tokens_seen": 427032576, + "step": 1086 + }, + { + "epoch": 0.12380820782092027, + "grad_norm": 0.6219275593757629, + "learning_rate": 4.6115288220551377e-05, + "loss": 2.1158, + "num_input_tokens_seen": 429391872, + "step": 1092 + }, + { + "epoch": 0.12448847269905719, + "grad_norm": 0.7471150755882263, + "learning_rate": 4.607948442534909e-05, + "loss": 2.1602, + "num_input_tokens_seen": 431751168, + "step": 1098 + }, + { + "epoch": 0.12516873757719413, + "grad_norm": 0.788198709487915, + "learning_rate": 4.604368063014679e-05, + "loss": 2.1178, + "num_input_tokens_seen": 434110464, + "step": 1104 + }, + { + "epoch": 0.12584900245533104, + "grad_norm": 0.7239183187484741, + "learning_rate": 4.6007876834944504e-05, + "loss": 2.1157, + "num_input_tokens_seen": 436469760, + "step": 1110 + }, + { + "epoch": 0.12652926733346798, + "grad_norm": 0.6211867332458496, + "learning_rate": 4.5972073039742216e-05, + "loss": 2.1705, + "num_input_tokens_seen": 438829056, + "step": 1116 + }, + { + "epoch": 0.1272095322116049, + "grad_norm": 0.7338197231292725, + "learning_rate": 4.593626924453993e-05, + "loss": 2.1271, + "num_input_tokens_seen": 441188352, + "step": 1122 + }, + { + "epoch": 0.12788979708974182, + "grad_norm": 0.7123642563819885, + "learning_rate": 4.590046544933763e-05, + "loss": 2.0573, + "num_input_tokens_seen": 443547648, + "step": 1128 + }, + { + "epoch": 0.12857006196787873, + "grad_norm": 0.648991048336029, + "learning_rate": 4.586466165413534e-05, + "loss": 2.1645, + "num_input_tokens_seen": 445906944, + "step": 1134 + }, + { + "epoch": 0.12925032684601567, + "grad_norm": 0.727215051651001, + "learning_rate": 4.582885785893305e-05, + "loss": 2.1009, + "num_input_tokens_seen": 448266240, + "step": 1140 + }, + { + "epoch": 0.12993059172415258, + "grad_norm": 0.7532079219818115, + "learning_rate": 4.5793054063730753e-05, + "loss": 2.1067, + "num_input_tokens_seen": 450625536, + "step": 1146 + }, + { + "epoch": 0.13061085660228952, + "grad_norm": 0.7537828683853149, + "learning_rate": 4.5757250268528465e-05, + "loss": 2.1201, + "num_input_tokens_seen": 452984832, + "step": 1152 + }, + { + "epoch": 0.13129112148042643, + "grad_norm": 0.7720354795455933, + "learning_rate": 4.5721446473326176e-05, + "loss": 2.1422, + "num_input_tokens_seen": 455344128, + "step": 1158 + }, + { + "epoch": 0.13197138635856337, + "grad_norm": 0.9617815613746643, + "learning_rate": 4.568564267812389e-05, + "loss": 2.1267, + "num_input_tokens_seen": 457703424, + "step": 1164 + }, + { + "epoch": 0.13265165123670028, + "grad_norm": 0.6180392503738403, + "learning_rate": 4.564983888292159e-05, + "loss": 2.1077, + "num_input_tokens_seen": 460062720, + "step": 1170 + }, + { + "epoch": 0.13333191611483722, + "grad_norm": 0.6402847170829773, + "learning_rate": 4.56140350877193e-05, + "loss": 2.084, + "num_input_tokens_seen": 462422016, + "step": 1176 + }, + { + "epoch": 0.13401218099297413, + "grad_norm": 0.727862536907196, + "learning_rate": 4.557823129251701e-05, + "loss": 2.1177, + "num_input_tokens_seen": 464781312, + "step": 1182 + }, + { + "epoch": 0.13469244587111107, + "grad_norm": 0.6989423036575317, + "learning_rate": 4.5542427497314714e-05, + "loss": 2.1041, + "num_input_tokens_seen": 467140608, + "step": 1188 + }, + { + "epoch": 0.13537271074924798, + "grad_norm": 0.8146799206733704, + "learning_rate": 4.5506623702112425e-05, + "loss": 2.0976, + "num_input_tokens_seen": 469499904, + "step": 1194 + }, + { + "epoch": 0.13605297562738491, + "grad_norm": 0.7785530686378479, + "learning_rate": 4.547081990691014e-05, + "loss": 2.1272, + "num_input_tokens_seen": 471859200, + "step": 1200 + }, + { + "epoch": 0.13605297562738491, + "eval_accuracy": 0.5698174603174603, + "eval_loss": 2.109198808670044, + "eval_runtime": 129.4137, + "eval_samples_per_second": 3.091, + "eval_steps_per_second": 1.035, + "num_input_tokens_seen": 471859200, + "step": 1200 + }, + { + "epoch": 0.13673324050552182, + "grad_norm": 0.6469578742980957, + "learning_rate": 4.543501611170785e-05, + "loss": 2.1522, + "num_input_tokens_seen": 474218496, + "step": 1206 + }, + { + "epoch": 0.13741350538365876, + "grad_norm": 1.0499253273010254, + "learning_rate": 4.539921231650555e-05, + "loss": 2.1576, + "num_input_tokens_seen": 476577792, + "step": 1212 + }, + { + "epoch": 0.13809377026179567, + "grad_norm": 0.6888744235038757, + "learning_rate": 4.536340852130326e-05, + "loss": 2.1386, + "num_input_tokens_seen": 478937088, + "step": 1218 + }, + { + "epoch": 0.1387740351399326, + "grad_norm": 0.6668254733085632, + "learning_rate": 4.532760472610097e-05, + "loss": 2.1211, + "num_input_tokens_seen": 481296384, + "step": 1224 + }, + { + "epoch": 0.13945430001806955, + "grad_norm": 0.5561350584030151, + "learning_rate": 4.5291800930898674e-05, + "loss": 2.056, + "num_input_tokens_seen": 483655680, + "step": 1230 + }, + { + "epoch": 0.14013456489620646, + "grad_norm": 0.6395593285560608, + "learning_rate": 4.5255997135696386e-05, + "loss": 2.1032, + "num_input_tokens_seen": 486014976, + "step": 1236 + }, + { + "epoch": 0.1408148297743434, + "grad_norm": 0.5906882882118225, + "learning_rate": 4.522019334049409e-05, + "loss": 2.1373, + "num_input_tokens_seen": 488374272, + "step": 1242 + }, + { + "epoch": 0.1414950946524803, + "grad_norm": 0.776069700717926, + "learning_rate": 4.51843895452918e-05, + "loss": 2.1563, + "num_input_tokens_seen": 490733568, + "step": 1248 + }, + { + "epoch": 0.14217535953061725, + "grad_norm": 0.6770499348640442, + "learning_rate": 4.5148585750089514e-05, + "loss": 2.1556, + "num_input_tokens_seen": 493092864, + "step": 1254 + }, + { + "epoch": 0.14285562440875416, + "grad_norm": 0.6341859698295593, + "learning_rate": 4.511278195488722e-05, + "loss": 2.16, + "num_input_tokens_seen": 495452160, + "step": 1260 + }, + { + "epoch": 0.1435358892868911, + "grad_norm": 0.6789543032646179, + "learning_rate": 4.507697815968493e-05, + "loss": 2.1314, + "num_input_tokens_seen": 497811456, + "step": 1266 + }, + { + "epoch": 0.144216154165028, + "grad_norm": 0.5745943784713745, + "learning_rate": 4.5041174364482635e-05, + "loss": 2.0845, + "num_input_tokens_seen": 500170752, + "step": 1272 + }, + { + "epoch": 0.14489641904316494, + "grad_norm": 0.6102567315101624, + "learning_rate": 4.5005370569280346e-05, + "loss": 2.0663, + "num_input_tokens_seen": 502530048, + "step": 1278 + }, + { + "epoch": 0.14557668392130185, + "grad_norm": 0.5677859783172607, + "learning_rate": 4.496956677407805e-05, + "loss": 2.1446, + "num_input_tokens_seen": 504889344, + "step": 1284 + }, + { + "epoch": 0.1462569487994388, + "grad_norm": 0.7098356485366821, + "learning_rate": 4.493376297887576e-05, + "loss": 2.076, + "num_input_tokens_seen": 507248640, + "step": 1290 + }, + { + "epoch": 0.1469372136775757, + "grad_norm": 0.7611458897590637, + "learning_rate": 4.4897959183673474e-05, + "loss": 2.0869, + "num_input_tokens_seen": 509607936, + "step": 1296 + }, + { + "epoch": 0.14761747855571264, + "grad_norm": 0.7817174196243286, + "learning_rate": 4.486215538847118e-05, + "loss": 2.1251, + "num_input_tokens_seen": 511967232, + "step": 1302 + }, + { + "epoch": 0.14829774343384955, + "grad_norm": 0.7138420343399048, + "learning_rate": 4.482635159326889e-05, + "loss": 2.1547, + "num_input_tokens_seen": 514326528, + "step": 1308 + }, + { + "epoch": 0.1489780083119865, + "grad_norm": 0.6586819887161255, + "learning_rate": 4.4790547798066595e-05, + "loss": 2.0944, + "num_input_tokens_seen": 516685824, + "step": 1314 + }, + { + "epoch": 0.1496582731901234, + "grad_norm": 0.7534651160240173, + "learning_rate": 4.475474400286431e-05, + "loss": 2.1, + "num_input_tokens_seen": 519045120, + "step": 1320 + }, + { + "epoch": 0.15033853806826034, + "grad_norm": 0.677528440952301, + "learning_rate": 4.471894020766201e-05, + "loss": 2.096, + "num_input_tokens_seen": 521404416, + "step": 1326 + }, + { + "epoch": 0.15101880294639725, + "grad_norm": 0.5919771790504456, + "learning_rate": 4.468313641245972e-05, + "loss": 2.1184, + "num_input_tokens_seen": 523763712, + "step": 1332 + }, + { + "epoch": 0.15169906782453418, + "grad_norm": 0.6883030533790588, + "learning_rate": 4.464733261725743e-05, + "loss": 2.118, + "num_input_tokens_seen": 526123008, + "step": 1338 + }, + { + "epoch": 0.1523793327026711, + "grad_norm": 0.7062236666679382, + "learning_rate": 4.461152882205514e-05, + "loss": 2.1439, + "num_input_tokens_seen": 528482304, + "step": 1344 + }, + { + "epoch": 0.15305959758080803, + "grad_norm": 0.6822494268417358, + "learning_rate": 4.457572502685285e-05, + "loss": 2.1023, + "num_input_tokens_seen": 530841600, + "step": 1350 + }, + { + "epoch": 0.15373986245894494, + "grad_norm": 0.6365748047828674, + "learning_rate": 4.4539921231650556e-05, + "loss": 2.0743, + "num_input_tokens_seen": 533200896, + "step": 1356 + }, + { + "epoch": 0.15442012733708188, + "grad_norm": 0.6446681022644043, + "learning_rate": 4.450411743644827e-05, + "loss": 2.0933, + "num_input_tokens_seen": 535560192, + "step": 1362 + }, + { + "epoch": 0.1551003922152188, + "grad_norm": 0.6867052912712097, + "learning_rate": 4.446831364124597e-05, + "loss": 2.1392, + "num_input_tokens_seen": 537919488, + "step": 1368 + }, + { + "epoch": 0.15578065709335573, + "grad_norm": 0.6548677086830139, + "learning_rate": 4.4432509846043684e-05, + "loss": 2.1549, + "num_input_tokens_seen": 540278784, + "step": 1374 + }, + { + "epoch": 0.15646092197149264, + "grad_norm": 0.660763144493103, + "learning_rate": 4.439670605084139e-05, + "loss": 2.111, + "num_input_tokens_seen": 542638080, + "step": 1380 + }, + { + "epoch": 0.15714118684962958, + "grad_norm": 0.7522821426391602, + "learning_rate": 4.43609022556391e-05, + "loss": 2.0979, + "num_input_tokens_seen": 544997376, + "step": 1386 + }, + { + "epoch": 0.1578214517277665, + "grad_norm": 0.7142075896263123, + "learning_rate": 4.432509846043681e-05, + "loss": 2.0947, + "num_input_tokens_seen": 547356672, + "step": 1392 + }, + { + "epoch": 0.15850171660590343, + "grad_norm": 0.684587836265564, + "learning_rate": 4.4289294665234516e-05, + "loss": 2.1263, + "num_input_tokens_seen": 549715968, + "step": 1398 + }, + { + "epoch": 0.15918198148404034, + "grad_norm": 0.62205970287323, + "learning_rate": 4.425349087003223e-05, + "loss": 2.0615, + "num_input_tokens_seen": 552075264, + "step": 1404 + }, + { + "epoch": 0.15986224636217727, + "grad_norm": 0.6591596603393555, + "learning_rate": 4.421768707482993e-05, + "loss": 2.1085, + "num_input_tokens_seen": 554434560, + "step": 1410 + }, + { + "epoch": 0.16054251124031418, + "grad_norm": 0.6393697261810303, + "learning_rate": 4.4181883279627644e-05, + "loss": 2.1273, + "num_input_tokens_seen": 556793856, + "step": 1416 + }, + { + "epoch": 0.16122277611845112, + "grad_norm": 0.7339461445808411, + "learning_rate": 4.414607948442535e-05, + "loss": 2.078, + "num_input_tokens_seen": 559153152, + "step": 1422 + }, + { + "epoch": 0.16190304099658806, + "grad_norm": 0.5903000235557556, + "learning_rate": 4.411027568922306e-05, + "loss": 2.0781, + "num_input_tokens_seen": 561512448, + "step": 1428 + }, + { + "epoch": 0.16258330587472497, + "grad_norm": 0.5981512069702148, + "learning_rate": 4.407447189402077e-05, + "loss": 2.0697, + "num_input_tokens_seen": 563871744, + "step": 1434 + }, + { + "epoch": 0.1632635707528619, + "grad_norm": 0.7107566595077515, + "learning_rate": 4.403866809881848e-05, + "loss": 2.1106, + "num_input_tokens_seen": 566231040, + "step": 1440 + }, + { + "epoch": 0.16394383563099882, + "grad_norm": 0.66408371925354, + "learning_rate": 4.400286430361619e-05, + "loss": 2.178, + "num_input_tokens_seen": 568590336, + "step": 1446 + }, + { + "epoch": 0.16462410050913576, + "grad_norm": 0.7157317399978638, + "learning_rate": 4.396706050841389e-05, + "loss": 2.0805, + "num_input_tokens_seen": 570949632, + "step": 1452 + }, + { + "epoch": 0.16530436538727267, + "grad_norm": 0.5517193078994751, + "learning_rate": 4.3931256713211605e-05, + "loss": 2.0886, + "num_input_tokens_seen": 573308928, + "step": 1458 + }, + { + "epoch": 0.1659846302654096, + "grad_norm": 0.6534057259559631, + "learning_rate": 4.389545291800931e-05, + "loss": 2.0757, + "num_input_tokens_seen": 575668224, + "step": 1464 + }, + { + "epoch": 0.16666489514354652, + "grad_norm": 0.6548903584480286, + "learning_rate": 4.3859649122807014e-05, + "loss": 2.061, + "num_input_tokens_seen": 578027520, + "step": 1470 + }, + { + "epoch": 0.16734516002168345, + "grad_norm": 0.7592008113861084, + "learning_rate": 4.3823845327604726e-05, + "loss": 2.1042, + "num_input_tokens_seen": 580386816, + "step": 1476 + }, + { + "epoch": 0.16802542489982036, + "grad_norm": 0.6569022536277771, + "learning_rate": 4.378804153240244e-05, + "loss": 2.1117, + "num_input_tokens_seen": 582746112, + "step": 1482 + }, + { + "epoch": 0.1687056897779573, + "grad_norm": 0.666001558303833, + "learning_rate": 4.375223773720015e-05, + "loss": 2.1153, + "num_input_tokens_seen": 585105408, + "step": 1488 + }, + { + "epoch": 0.1693859546560942, + "grad_norm": 0.726992666721344, + "learning_rate": 4.3716433941997854e-05, + "loss": 2.1045, + "num_input_tokens_seen": 587464704, + "step": 1494 + }, + { + "epoch": 0.17006621953423115, + "grad_norm": 0.7073400020599365, + "learning_rate": 4.3680630146795565e-05, + "loss": 2.107, + "num_input_tokens_seen": 589824000, + "step": 1500 + }, + { + "epoch": 0.17074648441236806, + "grad_norm": 0.7716240286827087, + "learning_rate": 4.364482635159327e-05, + "loss": 2.0693, + "num_input_tokens_seen": 592183296, + "step": 1506 + }, + { + "epoch": 0.171426749290505, + "grad_norm": 0.6214017271995544, + "learning_rate": 4.3609022556390975e-05, + "loss": 2.0959, + "num_input_tokens_seen": 594542592, + "step": 1512 + }, + { + "epoch": 0.1721070141686419, + "grad_norm": 0.6559828519821167, + "learning_rate": 4.3573218761188686e-05, + "loss": 2.0871, + "num_input_tokens_seen": 596901888, + "step": 1518 + }, + { + "epoch": 0.17278727904677885, + "grad_norm": 0.5939403176307678, + "learning_rate": 4.35374149659864e-05, + "loss": 2.09, + "num_input_tokens_seen": 599261184, + "step": 1524 + }, + { + "epoch": 0.17346754392491576, + "grad_norm": 0.680909276008606, + "learning_rate": 4.350161117078411e-05, + "loss": 2.0679, + "num_input_tokens_seen": 601620480, + "step": 1530 + }, + { + "epoch": 0.1741478088030527, + "grad_norm": 0.6251941919326782, + "learning_rate": 4.3465807375581814e-05, + "loss": 2.0517, + "num_input_tokens_seen": 603979776, + "step": 1536 + }, + { + "epoch": 0.1748280736811896, + "grad_norm": 0.7877122759819031, + "learning_rate": 4.3430003580379526e-05, + "loss": 2.1092, + "num_input_tokens_seen": 606339072, + "step": 1542 + }, + { + "epoch": 0.17550833855932654, + "grad_norm": 0.6591320037841797, + "learning_rate": 4.339419978517723e-05, + "loss": 2.1308, + "num_input_tokens_seen": 608698368, + "step": 1548 + }, + { + "epoch": 0.17618860343746345, + "grad_norm": 0.6894817352294922, + "learning_rate": 4.3358395989974935e-05, + "loss": 2.0316, + "num_input_tokens_seen": 611057664, + "step": 1554 + }, + { + "epoch": 0.1768688683156004, + "grad_norm": 0.6120206713676453, + "learning_rate": 4.332259219477265e-05, + "loss": 2.1528, + "num_input_tokens_seen": 613416960, + "step": 1560 + }, + { + "epoch": 0.1775491331937373, + "grad_norm": 0.6873424649238586, + "learning_rate": 4.328678839957035e-05, + "loss": 2.12, + "num_input_tokens_seen": 615776256, + "step": 1566 + }, + { + "epoch": 0.17822939807187424, + "grad_norm": 0.6133244037628174, + "learning_rate": 4.325098460436807e-05, + "loss": 2.104, + "num_input_tokens_seen": 618135552, + "step": 1572 + }, + { + "epoch": 0.17890966295001115, + "grad_norm": 0.5738610029220581, + "learning_rate": 4.3215180809165775e-05, + "loss": 2.0827, + "num_input_tokens_seen": 620494848, + "step": 1578 + }, + { + "epoch": 0.1795899278281481, + "grad_norm": 0.6590917706489563, + "learning_rate": 4.3179377013963486e-05, + "loss": 2.1426, + "num_input_tokens_seen": 622854144, + "step": 1584 + }, + { + "epoch": 0.180270192706285, + "grad_norm": 0.7815598249435425, + "learning_rate": 4.314357321876119e-05, + "loss": 2.0688, + "num_input_tokens_seen": 625213440, + "step": 1590 + }, + { + "epoch": 0.18095045758442194, + "grad_norm": 0.6743267774581909, + "learning_rate": 4.3107769423558896e-05, + "loss": 2.1243, + "num_input_tokens_seen": 627572736, + "step": 1596 + }, + { + "epoch": 0.18140396750317989, + "eval_accuracy": 0.5724584859584859, + "eval_loss": 2.0929105281829834, + "eval_runtime": 128.5597, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 1.042, + "num_input_tokens_seen": 629145600, + "step": 1600 + }, + { + "epoch": 0.18163072246255885, + "grad_norm": 0.7218141555786133, + "learning_rate": 4.307196562835661e-05, + "loss": 2.1026, + "num_input_tokens_seen": 629932032, + "step": 1602 + }, + { + "epoch": 0.18231098734069578, + "grad_norm": 0.5616850852966309, + "learning_rate": 4.303616183315431e-05, + "loss": 2.0563, + "num_input_tokens_seen": 632291328, + "step": 1608 + }, + { + "epoch": 0.1829912522188327, + "grad_norm": 0.8092398643493652, + "learning_rate": 4.3000358037952024e-05, + "loss": 2.1115, + "num_input_tokens_seen": 634650624, + "step": 1614 + }, + { + "epoch": 0.18367151709696963, + "grad_norm": 0.8262616395950317, + "learning_rate": 4.2964554242749735e-05, + "loss": 2.1083, + "num_input_tokens_seen": 637009920, + "step": 1620 + }, + { + "epoch": 0.18435178197510654, + "grad_norm": 0.6983737349510193, + "learning_rate": 4.292875044754745e-05, + "loss": 2.0547, + "num_input_tokens_seen": 639369216, + "step": 1626 + }, + { + "epoch": 0.18503204685324348, + "grad_norm": 0.7725507616996765, + "learning_rate": 4.289294665234515e-05, + "loss": 2.0215, + "num_input_tokens_seen": 641728512, + "step": 1632 + }, + { + "epoch": 0.18571231173138042, + "grad_norm": 0.6409133672714233, + "learning_rate": 4.2857142857142856e-05, + "loss": 2.131, + "num_input_tokens_seen": 644087808, + "step": 1638 + }, + { + "epoch": 0.18639257660951733, + "grad_norm": 0.7413092851638794, + "learning_rate": 4.282133906194057e-05, + "loss": 2.1055, + "num_input_tokens_seen": 646447104, + "step": 1644 + }, + { + "epoch": 0.18707284148765427, + "grad_norm": 0.6062273383140564, + "learning_rate": 4.278553526673827e-05, + "loss": 2.1227, + "num_input_tokens_seen": 648806400, + "step": 1650 + }, + { + "epoch": 0.18775310636579118, + "grad_norm": 0.5928088426589966, + "learning_rate": 4.2749731471535984e-05, + "loss": 2.0063, + "num_input_tokens_seen": 651165696, + "step": 1656 + }, + { + "epoch": 0.18843337124392812, + "grad_norm": 0.7456128001213074, + "learning_rate": 4.2713927676333696e-05, + "loss": 2.1229, + "num_input_tokens_seen": 653524992, + "step": 1662 + }, + { + "epoch": 0.18911363612206503, + "grad_norm": 0.634148359298706, + "learning_rate": 4.267812388113141e-05, + "loss": 2.0924, + "num_input_tokens_seen": 655884288, + "step": 1668 + }, + { + "epoch": 0.18979390100020196, + "grad_norm": 0.5960593223571777, + "learning_rate": 4.264232008592911e-05, + "loss": 2.0759, + "num_input_tokens_seen": 658243584, + "step": 1674 + }, + { + "epoch": 0.19047416587833887, + "grad_norm": 0.5249684453010559, + "learning_rate": 4.260651629072682e-05, + "loss": 2.1217, + "num_input_tokens_seen": 660602880, + "step": 1680 + }, + { + "epoch": 0.1911544307564758, + "grad_norm": 0.6345716118812561, + "learning_rate": 4.257071249552453e-05, + "loss": 2.0945, + "num_input_tokens_seen": 662962176, + "step": 1686 + }, + { + "epoch": 0.19183469563461272, + "grad_norm": 0.6189055442810059, + "learning_rate": 4.253490870032223e-05, + "loss": 2.1206, + "num_input_tokens_seen": 665321472, + "step": 1692 + }, + { + "epoch": 0.19251496051274966, + "grad_norm": 0.6294938325881958, + "learning_rate": 4.2499104905119945e-05, + "loss": 2.0698, + "num_input_tokens_seen": 667680768, + "step": 1698 + }, + { + "epoch": 0.19319522539088657, + "grad_norm": 0.6409788131713867, + "learning_rate": 4.246330110991765e-05, + "loss": 2.0606, + "num_input_tokens_seen": 670040064, + "step": 1704 + }, + { + "epoch": 0.1938754902690235, + "grad_norm": 0.7358625531196594, + "learning_rate": 4.242749731471536e-05, + "loss": 2.1144, + "num_input_tokens_seen": 672399360, + "step": 1710 + }, + { + "epoch": 0.19455575514716042, + "grad_norm": 0.6153339743614197, + "learning_rate": 4.239169351951307e-05, + "loss": 2.119, + "num_input_tokens_seen": 674758656, + "step": 1716 + }, + { + "epoch": 0.19523602002529736, + "grad_norm": 0.6902744174003601, + "learning_rate": 4.235588972431078e-05, + "loss": 2.0636, + "num_input_tokens_seen": 677117952, + "step": 1722 + }, + { + "epoch": 0.19591628490343427, + "grad_norm": 0.7145854830741882, + "learning_rate": 4.232008592910849e-05, + "loss": 2.0937, + "num_input_tokens_seen": 679477248, + "step": 1728 + }, + { + "epoch": 0.1965965497815712, + "grad_norm": 0.7076539397239685, + "learning_rate": 4.2284282133906194e-05, + "loss": 2.1325, + "num_input_tokens_seen": 681836544, + "step": 1734 + }, + { + "epoch": 0.19727681465970812, + "grad_norm": 0.6653849482536316, + "learning_rate": 4.2248478338703905e-05, + "loss": 2.1266, + "num_input_tokens_seen": 684195840, + "step": 1740 + }, + { + "epoch": 0.19795707953784505, + "grad_norm": 0.7376857399940491, + "learning_rate": 4.221267454350161e-05, + "loss": 2.1273, + "num_input_tokens_seen": 686555136, + "step": 1746 + }, + { + "epoch": 0.19863734441598196, + "grad_norm": 0.6721606254577637, + "learning_rate": 4.217687074829932e-05, + "loss": 2.0921, + "num_input_tokens_seen": 688914432, + "step": 1752 + }, + { + "epoch": 0.1993176092941189, + "grad_norm": 0.8767059445381165, + "learning_rate": 4.214106695309703e-05, + "loss": 2.1027, + "num_input_tokens_seen": 691273728, + "step": 1758 + }, + { + "epoch": 0.1999978741722558, + "grad_norm": 0.6245223879814148, + "learning_rate": 4.210526315789474e-05, + "loss": 2.083, + "num_input_tokens_seen": 693633024, + "step": 1764 + }, + { + "epoch": 0.20067813905039275, + "grad_norm": 0.6684584021568298, + "learning_rate": 4.206945936269245e-05, + "loss": 2.0892, + "num_input_tokens_seen": 695992320, + "step": 1770 + }, + { + "epoch": 0.20135840392852966, + "grad_norm": 0.676654577255249, + "learning_rate": 4.2033655567490154e-05, + "loss": 2.1068, + "num_input_tokens_seen": 698351616, + "step": 1776 + }, + { + "epoch": 0.2020386688066666, + "grad_norm": 0.5355855226516724, + "learning_rate": 4.1997851772287866e-05, + "loss": 2.0531, + "num_input_tokens_seen": 700710912, + "step": 1782 + }, + { + "epoch": 0.2027189336848035, + "grad_norm": 0.6555802226066589, + "learning_rate": 4.196204797708557e-05, + "loss": 2.0901, + "num_input_tokens_seen": 703070208, + "step": 1788 + }, + { + "epoch": 0.20339919856294045, + "grad_norm": 0.6298051476478577, + "learning_rate": 4.192624418188328e-05, + "loss": 2.0877, + "num_input_tokens_seen": 705429504, + "step": 1794 + }, + { + "epoch": 0.20407946344107736, + "grad_norm": 0.720937192440033, + "learning_rate": 4.1890440386680994e-05, + "loss": 2.108, + "num_input_tokens_seen": 707788800, + "step": 1800 + }, + { + "epoch": 0.2047597283192143, + "grad_norm": 0.7328784465789795, + "learning_rate": 4.18546365914787e-05, + "loss": 2.0899, + "num_input_tokens_seen": 710148096, + "step": 1806 + }, + { + "epoch": 0.2054399931973512, + "grad_norm": 0.6752752065658569, + "learning_rate": 4.181883279627641e-05, + "loss": 2.1424, + "num_input_tokens_seen": 712507392, + "step": 1812 + }, + { + "epoch": 0.20612025807548814, + "grad_norm": 0.7588052153587341, + "learning_rate": 4.1783029001074115e-05, + "loss": 2.134, + "num_input_tokens_seen": 714866688, + "step": 1818 + }, + { + "epoch": 0.20680052295362505, + "grad_norm": 0.6495354771614075, + "learning_rate": 4.1747225205871826e-05, + "loss": 2.1091, + "num_input_tokens_seen": 717225984, + "step": 1824 + }, + { + "epoch": 0.207480787831762, + "grad_norm": 0.6155287623405457, + "learning_rate": 4.171142141066953e-05, + "loss": 2.1057, + "num_input_tokens_seen": 719585280, + "step": 1830 + }, + { + "epoch": 0.20816105270989893, + "grad_norm": 0.5426910519599915, + "learning_rate": 4.167561761546724e-05, + "loss": 2.0465, + "num_input_tokens_seen": 721944576, + "step": 1836 + }, + { + "epoch": 0.20884131758803584, + "grad_norm": 0.6535930633544922, + "learning_rate": 4.163981382026495e-05, + "loss": 2.046, + "num_input_tokens_seen": 724303872, + "step": 1842 + }, + { + "epoch": 0.20952158246617278, + "grad_norm": 0.639935314655304, + "learning_rate": 4.160401002506266e-05, + "loss": 2.1035, + "num_input_tokens_seen": 726663168, + "step": 1848 + }, + { + "epoch": 0.2102018473443097, + "grad_norm": 0.5828704833984375, + "learning_rate": 4.156820622986037e-05, + "loss": 2.0643, + "num_input_tokens_seen": 729022464, + "step": 1854 + }, + { + "epoch": 0.21088211222244663, + "grad_norm": 0.579765796661377, + "learning_rate": 4.1532402434658075e-05, + "loss": 2.1099, + "num_input_tokens_seen": 731381760, + "step": 1860 + }, + { + "epoch": 0.21156237710058354, + "grad_norm": 0.6833761930465698, + "learning_rate": 4.149659863945579e-05, + "loss": 2.0967, + "num_input_tokens_seen": 733741056, + "step": 1866 + }, + { + "epoch": 0.21224264197872048, + "grad_norm": 0.6318493485450745, + "learning_rate": 4.146079484425349e-05, + "loss": 2.1049, + "num_input_tokens_seen": 736100352, + "step": 1872 + }, + { + "epoch": 0.21292290685685739, + "grad_norm": 0.6708328127861023, + "learning_rate": 4.14249910490512e-05, + "loss": 2.0954, + "num_input_tokens_seen": 738459648, + "step": 1878 + }, + { + "epoch": 0.21360317173499432, + "grad_norm": 0.6389116644859314, + "learning_rate": 4.138918725384891e-05, + "loss": 2.113, + "num_input_tokens_seen": 740818944, + "step": 1884 + }, + { + "epoch": 0.21428343661313123, + "grad_norm": 0.6693724393844604, + "learning_rate": 4.135338345864662e-05, + "loss": 2.0786, + "num_input_tokens_seen": 743178240, + "step": 1890 + }, + { + "epoch": 0.21496370149126817, + "grad_norm": 0.6880051493644714, + "learning_rate": 4.131757966344433e-05, + "loss": 2.101, + "num_input_tokens_seen": 745537536, + "step": 1896 + }, + { + "epoch": 0.21564396636940508, + "grad_norm": 0.7772538065910339, + "learning_rate": 4.1281775868242036e-05, + "loss": 2.0912, + "num_input_tokens_seen": 747896832, + "step": 1902 + }, + { + "epoch": 0.21632423124754202, + "grad_norm": 0.5820342898368835, + "learning_rate": 4.124597207303975e-05, + "loss": 2.0999, + "num_input_tokens_seen": 750256128, + "step": 1908 + }, + { + "epoch": 0.21700449612567893, + "grad_norm": 0.6889671087265015, + "learning_rate": 4.121016827783745e-05, + "loss": 2.1452, + "num_input_tokens_seen": 752615424, + "step": 1914 + }, + { + "epoch": 0.21768476100381587, + "grad_norm": 0.7460409998893738, + "learning_rate": 4.1174364482635163e-05, + "loss": 2.0545, + "num_input_tokens_seen": 754974720, + "step": 1920 + }, + { + "epoch": 0.21836502588195278, + "grad_norm": 0.6621735692024231, + "learning_rate": 4.113856068743287e-05, + "loss": 2.1114, + "num_input_tokens_seen": 757334016, + "step": 1926 + }, + { + "epoch": 0.21904529076008972, + "grad_norm": 0.6911535859107971, + "learning_rate": 4.110275689223057e-05, + "loss": 2.0746, + "num_input_tokens_seen": 759693312, + "step": 1932 + }, + { + "epoch": 0.21972555563822663, + "grad_norm": 0.7786504626274109, + "learning_rate": 4.1066953097028285e-05, + "loss": 2.1343, + "num_input_tokens_seen": 762052608, + "step": 1938 + }, + { + "epoch": 0.22040582051636357, + "grad_norm": 0.6110914349555969, + "learning_rate": 4.1031149301825996e-05, + "loss": 2.1723, + "num_input_tokens_seen": 764411904, + "step": 1944 + }, + { + "epoch": 0.22108608539450048, + "grad_norm": 0.7057865858078003, + "learning_rate": 4.099534550662371e-05, + "loss": 2.1187, + "num_input_tokens_seen": 766771200, + "step": 1950 + }, + { + "epoch": 0.2217663502726374, + "grad_norm": 0.6199769973754883, + "learning_rate": 4.095954171142141e-05, + "loss": 2.0733, + "num_input_tokens_seen": 769130496, + "step": 1956 + }, + { + "epoch": 0.22244661515077432, + "grad_norm": 0.7107540965080261, + "learning_rate": 4.0923737916219124e-05, + "loss": 2.0964, + "num_input_tokens_seen": 771489792, + "step": 1962 + }, + { + "epoch": 0.22312688002891126, + "grad_norm": 0.6034384369850159, + "learning_rate": 4.088793412101683e-05, + "loss": 2.1232, + "num_input_tokens_seen": 773849088, + "step": 1968 + }, + { + "epoch": 0.22380714490704817, + "grad_norm": 0.6471470594406128, + "learning_rate": 4.0852130325814534e-05, + "loss": 2.0799, + "num_input_tokens_seen": 776208384, + "step": 1974 + }, + { + "epoch": 0.2244874097851851, + "grad_norm": 0.6443119049072266, + "learning_rate": 4.0816326530612245e-05, + "loss": 2.0843, + "num_input_tokens_seen": 778567680, + "step": 1980 + }, + { + "epoch": 0.22516767466332202, + "grad_norm": 0.6607959866523743, + "learning_rate": 4.0780522735409957e-05, + "loss": 2.1408, + "num_input_tokens_seen": 780926976, + "step": 1986 + }, + { + "epoch": 0.22584793954145896, + "grad_norm": 0.6692774891853333, + "learning_rate": 4.074471894020767e-05, + "loss": 2.0871, + "num_input_tokens_seen": 783286272, + "step": 1992 + }, + { + "epoch": 0.22652820441959587, + "grad_norm": 0.7502838969230652, + "learning_rate": 4.070891514500537e-05, + "loss": 2.1021, + "num_input_tokens_seen": 785645568, + "step": 1998 + }, + { + "epoch": 0.22675495937897486, + "eval_accuracy": 0.5746526251526252, + "eval_loss": 2.079448699951172, + "eval_runtime": 129.4769, + "eval_samples_per_second": 3.089, + "eval_steps_per_second": 1.035, + "num_input_tokens_seen": 786432000, + "step": 2000 + }, + { + "epoch": 0.2272084692977328, + "grad_norm": 0.6747561693191528, + "learning_rate": 4.0673111349803084e-05, + "loss": 2.0141, + "num_input_tokens_seen": 788004864, + "step": 2004 + }, + { + "epoch": 0.22788873417586972, + "grad_norm": 0.6549056172370911, + "learning_rate": 4.063730755460079e-05, + "loss": 2.1014, + "num_input_tokens_seen": 790364160, + "step": 2010 + }, + { + "epoch": 0.22856899905400666, + "grad_norm": 0.7539930939674377, + "learning_rate": 4.0601503759398494e-05, + "loss": 2.1268, + "num_input_tokens_seen": 792723456, + "step": 2016 + }, + { + "epoch": 0.22924926393214357, + "grad_norm": 0.5937004089355469, + "learning_rate": 4.0565699964196206e-05, + "loss": 2.0426, + "num_input_tokens_seen": 795082752, + "step": 2022 + }, + { + "epoch": 0.2299295288102805, + "grad_norm": 0.5992699861526489, + "learning_rate": 4.052989616899392e-05, + "loss": 2.1089, + "num_input_tokens_seen": 797442048, + "step": 2028 + }, + { + "epoch": 0.2306097936884174, + "grad_norm": 0.5451076030731201, + "learning_rate": 4.049409237379163e-05, + "loss": 2.0499, + "num_input_tokens_seen": 799801344, + "step": 2034 + }, + { + "epoch": 0.23129005856655435, + "grad_norm": 0.5855215787887573, + "learning_rate": 4.0458288578589333e-05, + "loss": 2.0501, + "num_input_tokens_seen": 802160640, + "step": 2040 + }, + { + "epoch": 0.2319703234446913, + "grad_norm": 0.6797962784767151, + "learning_rate": 4.0422484783387045e-05, + "loss": 2.1079, + "num_input_tokens_seen": 804519936, + "step": 2046 + }, + { + "epoch": 0.2326505883228282, + "grad_norm": 0.5858785510063171, + "learning_rate": 4.038668098818475e-05, + "loss": 2.1117, + "num_input_tokens_seen": 806879232, + "step": 2052 + }, + { + "epoch": 0.23333085320096514, + "grad_norm": 0.6085060834884644, + "learning_rate": 4.0350877192982455e-05, + "loss": 2.0706, + "num_input_tokens_seen": 809238528, + "step": 2058 + }, + { + "epoch": 0.23401111807910205, + "grad_norm": 0.5851722359657288, + "learning_rate": 4.0315073397780166e-05, + "loss": 2.1303, + "num_input_tokens_seen": 811597824, + "step": 2064 + }, + { + "epoch": 0.234691382957239, + "grad_norm": 0.6054412722587585, + "learning_rate": 4.027926960257787e-05, + "loss": 2.1186, + "num_input_tokens_seen": 813957120, + "step": 2070 + }, + { + "epoch": 0.2353716478353759, + "grad_norm": 0.6723355054855347, + "learning_rate": 4.024346580737558e-05, + "loss": 2.0892, + "num_input_tokens_seen": 816316416, + "step": 2076 + }, + { + "epoch": 0.23605191271351283, + "grad_norm": 0.6768056154251099, + "learning_rate": 4.0207662012173294e-05, + "loss": 2.0591, + "num_input_tokens_seen": 818675712, + "step": 2082 + }, + { + "epoch": 0.23673217759164975, + "grad_norm": 0.5856552124023438, + "learning_rate": 4.0171858216971005e-05, + "loss": 2.0735, + "num_input_tokens_seen": 821035008, + "step": 2088 + }, + { + "epoch": 0.23741244246978668, + "grad_norm": 0.7292026281356812, + "learning_rate": 4.013605442176871e-05, + "loss": 2.0721, + "num_input_tokens_seen": 823394304, + "step": 2094 + }, + { + "epoch": 0.2380927073479236, + "grad_norm": 0.6172975301742554, + "learning_rate": 4.0100250626566415e-05, + "loss": 2.0728, + "num_input_tokens_seen": 825753600, + "step": 2100 + }, + { + "epoch": 0.23877297222606053, + "grad_norm": 0.7551843523979187, + "learning_rate": 4.0064446831364127e-05, + "loss": 2.1213, + "num_input_tokens_seen": 828112896, + "step": 2106 + }, + { + "epoch": 0.23945323710419744, + "grad_norm": 0.625471830368042, + "learning_rate": 4.002864303616183e-05, + "loss": 2.066, + "num_input_tokens_seen": 830472192, + "step": 2112 + }, + { + "epoch": 0.24013350198233438, + "grad_norm": 0.6531856656074524, + "learning_rate": 3.999283924095954e-05, + "loss": 2.0666, + "num_input_tokens_seen": 832831488, + "step": 2118 + }, + { + "epoch": 0.2408137668604713, + "grad_norm": 0.652446448802948, + "learning_rate": 3.9957035445757254e-05, + "loss": 2.1193, + "num_input_tokens_seen": 835190784, + "step": 2124 + }, + { + "epoch": 0.24149403173860823, + "grad_norm": 0.6203518509864807, + "learning_rate": 3.9921231650554966e-05, + "loss": 2.1025, + "num_input_tokens_seen": 837550080, + "step": 2130 + }, + { + "epoch": 0.24217429661674514, + "grad_norm": 0.6497722268104553, + "learning_rate": 3.988542785535267e-05, + "loss": 2.0418, + "num_input_tokens_seen": 839909376, + "step": 2136 + }, + { + "epoch": 0.24285456149488208, + "grad_norm": 0.661279559135437, + "learning_rate": 3.9849624060150376e-05, + "loss": 2.0558, + "num_input_tokens_seen": 842268672, + "step": 2142 + }, + { + "epoch": 0.243534826373019, + "grad_norm": 0.5917189717292786, + "learning_rate": 3.981382026494809e-05, + "loss": 2.1041, + "num_input_tokens_seen": 844627968, + "step": 2148 + }, + { + "epoch": 0.24421509125115592, + "grad_norm": 0.8539558053016663, + "learning_rate": 3.977801646974579e-05, + "loss": 2.1188, + "num_input_tokens_seen": 846987264, + "step": 2154 + }, + { + "epoch": 0.24489535612929283, + "grad_norm": 0.5865846276283264, + "learning_rate": 3.97422126745435e-05, + "loss": 2.1069, + "num_input_tokens_seen": 849346560, + "step": 2160 + }, + { + "epoch": 0.24557562100742977, + "grad_norm": 0.6616944670677185, + "learning_rate": 3.970640887934121e-05, + "loss": 2.0633, + "num_input_tokens_seen": 851705856, + "step": 2166 + }, + { + "epoch": 0.24625588588556668, + "grad_norm": 0.5569839477539062, + "learning_rate": 3.9670605084138926e-05, + "loss": 2.0857, + "num_input_tokens_seen": 854065152, + "step": 2172 + }, + { + "epoch": 0.24693615076370362, + "grad_norm": 0.5691688060760498, + "learning_rate": 3.963480128893663e-05, + "loss": 2.061, + "num_input_tokens_seen": 856424448, + "step": 2178 + }, + { + "epoch": 0.24761641564184053, + "grad_norm": 0.607754647731781, + "learning_rate": 3.9598997493734336e-05, + "loss": 2.1715, + "num_input_tokens_seen": 858783744, + "step": 2184 + }, + { + "epoch": 0.24829668051997747, + "grad_norm": 0.7856176495552063, + "learning_rate": 3.956319369853205e-05, + "loss": 2.0697, + "num_input_tokens_seen": 861143040, + "step": 2190 + }, + { + "epoch": 0.24897694539811438, + "grad_norm": 0.7349157333374023, + "learning_rate": 3.952738990332975e-05, + "loss": 2.1124, + "num_input_tokens_seen": 863502336, + "step": 2196 + }, + { + "epoch": 0.24965721027625132, + "grad_norm": 0.6067531108856201, + "learning_rate": 3.9491586108127464e-05, + "loss": 2.0844, + "num_input_tokens_seen": 865861632, + "step": 2202 + }, + { + "epoch": 0.25033747515438826, + "grad_norm": 0.6353740692138672, + "learning_rate": 3.945578231292517e-05, + "loss": 2.0714, + "num_input_tokens_seen": 868220928, + "step": 2208 + }, + { + "epoch": 0.25101774003252514, + "grad_norm": 0.6207152605056763, + "learning_rate": 3.941997851772288e-05, + "loss": 2.1135, + "num_input_tokens_seen": 870580224, + "step": 2214 + }, + { + "epoch": 0.2516980049106621, + "grad_norm": 0.6664757132530212, + "learning_rate": 3.938417472252059e-05, + "loss": 2.0755, + "num_input_tokens_seen": 872939520, + "step": 2220 + }, + { + "epoch": 0.252378269788799, + "grad_norm": 0.6741634011268616, + "learning_rate": 3.9348370927318297e-05, + "loss": 2.1249, + "num_input_tokens_seen": 875298816, + "step": 2226 + }, + { + "epoch": 0.25305853466693595, + "grad_norm": 0.7361227869987488, + "learning_rate": 3.931256713211601e-05, + "loss": 2.0656, + "num_input_tokens_seen": 877658112, + "step": 2232 + }, + { + "epoch": 0.25373879954507284, + "grad_norm": 0.864486038684845, + "learning_rate": 3.927676333691371e-05, + "loss": 2.0808, + "num_input_tokens_seen": 880017408, + "step": 2238 + }, + { + "epoch": 0.2544190644232098, + "grad_norm": 0.817509114742279, + "learning_rate": 3.9240959541711424e-05, + "loss": 2.0223, + "num_input_tokens_seen": 882376704, + "step": 2244 + }, + { + "epoch": 0.2550993293013467, + "grad_norm": 0.6295050382614136, + "learning_rate": 3.920515574650913e-05, + "loss": 2.1069, + "num_input_tokens_seen": 884736000, + "step": 2250 + }, + { + "epoch": 0.25577959417948365, + "grad_norm": 0.557656466960907, + "learning_rate": 3.916935195130684e-05, + "loss": 2.0621, + "num_input_tokens_seen": 887095296, + "step": 2256 + }, + { + "epoch": 0.2564598590576206, + "grad_norm": 0.5819247364997864, + "learning_rate": 3.913354815610455e-05, + "loss": 2.0804, + "num_input_tokens_seen": 889454592, + "step": 2262 + }, + { + "epoch": 0.25714012393575747, + "grad_norm": 0.6297056674957275, + "learning_rate": 3.909774436090226e-05, + "loss": 2.0566, + "num_input_tokens_seen": 891813888, + "step": 2268 + }, + { + "epoch": 0.2578203888138944, + "grad_norm": 0.6011530756950378, + "learning_rate": 3.906194056569997e-05, + "loss": 2.0731, + "num_input_tokens_seen": 894173184, + "step": 2274 + }, + { + "epoch": 0.25850065369203135, + "grad_norm": 0.5878785252571106, + "learning_rate": 3.902613677049767e-05, + "loss": 2.115, + "num_input_tokens_seen": 896532480, + "step": 2280 + }, + { + "epoch": 0.2591809185701683, + "grad_norm": 0.6470881104469299, + "learning_rate": 3.8990332975295385e-05, + "loss": 2.0653, + "num_input_tokens_seen": 898891776, + "step": 2286 + }, + { + "epoch": 0.25986118344830517, + "grad_norm": 0.6201193332672119, + "learning_rate": 3.895452918009309e-05, + "loss": 2.0936, + "num_input_tokens_seen": 901251072, + "step": 2292 + }, + { + "epoch": 0.2605414483264421, + "grad_norm": 0.5656684637069702, + "learning_rate": 3.89187253848908e-05, + "loss": 2.1008, + "num_input_tokens_seen": 903610368, + "step": 2298 + }, + { + "epoch": 0.26122171320457904, + "grad_norm": 0.5908628106117249, + "learning_rate": 3.8882921589688506e-05, + "loss": 2.0602, + "num_input_tokens_seen": 905969664, + "step": 2304 + }, + { + "epoch": 0.261901978082716, + "grad_norm": 0.660382866859436, + "learning_rate": 3.884711779448622e-05, + "loss": 2.0933, + "num_input_tokens_seen": 908328960, + "step": 2310 + }, + { + "epoch": 0.26258224296085286, + "grad_norm": 0.5603790283203125, + "learning_rate": 3.881131399928393e-05, + "loss": 2.0898, + "num_input_tokens_seen": 910688256, + "step": 2316 + }, + { + "epoch": 0.2632625078389898, + "grad_norm": 0.6598983407020569, + "learning_rate": 3.8775510204081634e-05, + "loss": 2.0715, + "num_input_tokens_seen": 913047552, + "step": 2322 + }, + { + "epoch": 0.26394277271712674, + "grad_norm": 0.5827348828315735, + "learning_rate": 3.8739706408879345e-05, + "loss": 2.1179, + "num_input_tokens_seen": 915406848, + "step": 2328 + }, + { + "epoch": 0.2646230375952637, + "grad_norm": 0.7159097194671631, + "learning_rate": 3.870390261367705e-05, + "loss": 2.0344, + "num_input_tokens_seen": 917766144, + "step": 2334 + }, + { + "epoch": 0.26530330247340056, + "grad_norm": 0.6752398014068604, + "learning_rate": 3.866809881847476e-05, + "loss": 2.0513, + "num_input_tokens_seen": 920125440, + "step": 2340 + }, + { + "epoch": 0.2659835673515375, + "grad_norm": 0.598101794719696, + "learning_rate": 3.8632295023272466e-05, + "loss": 2.0701, + "num_input_tokens_seen": 922484736, + "step": 2346 + }, + { + "epoch": 0.26666383222967444, + "grad_norm": 0.6286051273345947, + "learning_rate": 3.859649122807018e-05, + "loss": 2.0854, + "num_input_tokens_seen": 924844032, + "step": 2352 + }, + { + "epoch": 0.2673440971078114, + "grad_norm": 0.6396269202232361, + "learning_rate": 3.856068743286789e-05, + "loss": 2.1089, + "num_input_tokens_seen": 927203328, + "step": 2358 + }, + { + "epoch": 0.26802436198594826, + "grad_norm": 0.6398798823356628, + "learning_rate": 3.8524883637665594e-05, + "loss": 2.0501, + "num_input_tokens_seen": 929562624, + "step": 2364 + }, + { + "epoch": 0.2687046268640852, + "grad_norm": 0.6426295042037964, + "learning_rate": 3.8489079842463306e-05, + "loss": 2.0617, + "num_input_tokens_seen": 931921920, + "step": 2370 + }, + { + "epoch": 0.26938489174222213, + "grad_norm": 0.6402562856674194, + "learning_rate": 3.845327604726101e-05, + "loss": 2.0715, + "num_input_tokens_seen": 934281216, + "step": 2376 + }, + { + "epoch": 0.27006515662035907, + "grad_norm": 0.699862539768219, + "learning_rate": 3.841747225205872e-05, + "loss": 2.13, + "num_input_tokens_seen": 936640512, + "step": 2382 + }, + { + "epoch": 0.27074542149849595, + "grad_norm": 0.8998868465423584, + "learning_rate": 3.838166845685643e-05, + "loss": 2.1084, + "num_input_tokens_seen": 938999808, + "step": 2388 + }, + { + "epoch": 0.2714256863766329, + "grad_norm": 0.665034294128418, + "learning_rate": 3.834586466165413e-05, + "loss": 2.0959, + "num_input_tokens_seen": 941359104, + "step": 2394 + }, + { + "epoch": 0.27210595125476983, + "grad_norm": 0.745847225189209, + "learning_rate": 3.831006086645185e-05, + "loss": 2.0794, + "num_input_tokens_seen": 943718400, + "step": 2400 + }, + { + "epoch": 0.27210595125476983, + "eval_accuracy": 0.5762228327228327, + "eval_loss": 2.068709135055542, + "eval_runtime": 128.4911, + "eval_samples_per_second": 3.113, + "eval_steps_per_second": 1.043, + "num_input_tokens_seen": 943718400, + "step": 2400 + }, + { + "epoch": 0.27278621613290677, + "grad_norm": 0.6324106454849243, + "learning_rate": 3.8274257071249555e-05, + "loss": 2.0734, + "num_input_tokens_seen": 946077696, + "step": 2406 + }, + { + "epoch": 0.27346648101104365, + "grad_norm": 0.7810145020484924, + "learning_rate": 3.8238453276047266e-05, + "loss": 2.0681, + "num_input_tokens_seen": 948436992, + "step": 2412 + }, + { + "epoch": 0.2741467458891806, + "grad_norm": 0.6391826272010803, + "learning_rate": 3.820264948084497e-05, + "loss": 2.0964, + "num_input_tokens_seen": 950796288, + "step": 2418 + }, + { + "epoch": 0.2748270107673175, + "grad_norm": 0.6988577842712402, + "learning_rate": 3.816684568564268e-05, + "loss": 2.1252, + "num_input_tokens_seen": 953155584, + "step": 2424 + }, + { + "epoch": 0.27550727564545446, + "grad_norm": 0.5647233128547668, + "learning_rate": 3.813104189044039e-05, + "loss": 2.0881, + "num_input_tokens_seen": 955514880, + "step": 2430 + }, + { + "epoch": 0.27618754052359135, + "grad_norm": 0.5780855417251587, + "learning_rate": 3.809523809523809e-05, + "loss": 2.0835, + "num_input_tokens_seen": 957874176, + "step": 2436 + }, + { + "epoch": 0.2768678054017283, + "grad_norm": 0.6789732575416565, + "learning_rate": 3.8059434300035804e-05, + "loss": 2.0732, + "num_input_tokens_seen": 960233472, + "step": 2442 + }, + { + "epoch": 0.2775480702798652, + "grad_norm": 0.6763067245483398, + "learning_rate": 3.8023630504833515e-05, + "loss": 2.0457, + "num_input_tokens_seen": 962592768, + "step": 2448 + }, + { + "epoch": 0.27822833515800216, + "grad_norm": 0.5905190110206604, + "learning_rate": 3.798782670963123e-05, + "loss": 2.0186, + "num_input_tokens_seen": 964952064, + "step": 2454 + }, + { + "epoch": 0.2789086000361391, + "grad_norm": 0.6527414321899414, + "learning_rate": 3.795202291442893e-05, + "loss": 2.0841, + "num_input_tokens_seen": 967311360, + "step": 2460 + }, + { + "epoch": 0.279588864914276, + "grad_norm": 0.5765488743782043, + "learning_rate": 3.791621911922664e-05, + "loss": 2.0811, + "num_input_tokens_seen": 969670656, + "step": 2466 + }, + { + "epoch": 0.2802691297924129, + "grad_norm": 0.6708554625511169, + "learning_rate": 3.788041532402435e-05, + "loss": 2.0625, + "num_input_tokens_seen": 972029952, + "step": 2472 + }, + { + "epoch": 0.28094939467054986, + "grad_norm": 0.6201637983322144, + "learning_rate": 3.784461152882205e-05, + "loss": 2.1125, + "num_input_tokens_seen": 974389248, + "step": 2478 + }, + { + "epoch": 0.2816296595486868, + "grad_norm": 0.6302900314331055, + "learning_rate": 3.7808807733619764e-05, + "loss": 2.1199, + "num_input_tokens_seen": 976748544, + "step": 2484 + }, + { + "epoch": 0.2823099244268237, + "grad_norm": 0.7140418887138367, + "learning_rate": 3.7773003938417476e-05, + "loss": 2.1241, + "num_input_tokens_seen": 979107840, + "step": 2490 + }, + { + "epoch": 0.2829901893049606, + "grad_norm": 0.5913351774215698, + "learning_rate": 3.773720014321519e-05, + "loss": 2.0307, + "num_input_tokens_seen": 981467136, + "step": 2496 + }, + { + "epoch": 0.28367045418309755, + "grad_norm": 0.5941835045814514, + "learning_rate": 3.770139634801289e-05, + "loss": 2.069, + "num_input_tokens_seen": 983826432, + "step": 2502 + }, + { + "epoch": 0.2843507190612345, + "grad_norm": 0.6847456097602844, + "learning_rate": 3.7665592552810604e-05, + "loss": 2.0953, + "num_input_tokens_seen": 986185728, + "step": 2508 + }, + { + "epoch": 0.2850309839393714, + "grad_norm": 0.6352680325508118, + "learning_rate": 3.762978875760831e-05, + "loss": 2.0734, + "num_input_tokens_seen": 988545024, + "step": 2514 + }, + { + "epoch": 0.2857112488175083, + "grad_norm": 0.6623321771621704, + "learning_rate": 3.759398496240601e-05, + "loss": 2.0512, + "num_input_tokens_seen": 990904320, + "step": 2520 + }, + { + "epoch": 0.28639151369564525, + "grad_norm": 0.718250572681427, + "learning_rate": 3.7558181167203725e-05, + "loss": 2.0888, + "num_input_tokens_seen": 993263616, + "step": 2526 + }, + { + "epoch": 0.2870717785737822, + "grad_norm": 0.5607486367225647, + "learning_rate": 3.752237737200143e-05, + "loss": 2.1202, + "num_input_tokens_seen": 995622912, + "step": 2532 + }, + { + "epoch": 0.28775204345191907, + "grad_norm": 0.653218150138855, + "learning_rate": 3.748657357679914e-05, + "loss": 2.1099, + "num_input_tokens_seen": 997982208, + "step": 2538 + }, + { + "epoch": 0.288432308330056, + "grad_norm": 0.6100384593009949, + "learning_rate": 3.745076978159685e-05, + "loss": 2.0464, + "num_input_tokens_seen": 1000341504, + "step": 2544 + }, + { + "epoch": 0.28911257320819295, + "grad_norm": 0.6485652327537537, + "learning_rate": 3.7414965986394564e-05, + "loss": 2.0631, + "num_input_tokens_seen": 1002700800, + "step": 2550 + }, + { + "epoch": 0.2897928380863299, + "grad_norm": 0.6714969873428345, + "learning_rate": 3.737916219119227e-05, + "loss": 2.0325, + "num_input_tokens_seen": 1005060096, + "step": 2556 + }, + { + "epoch": 0.29047310296446677, + "grad_norm": 0.629289448261261, + "learning_rate": 3.7343358395989974e-05, + "loss": 2.0345, + "num_input_tokens_seen": 1007419392, + "step": 2562 + }, + { + "epoch": 0.2911533678426037, + "grad_norm": 0.6530044078826904, + "learning_rate": 3.7307554600787685e-05, + "loss": 2.1037, + "num_input_tokens_seen": 1009778688, + "step": 2568 + }, + { + "epoch": 0.29183363272074064, + "grad_norm": 0.6162053942680359, + "learning_rate": 3.727175080558539e-05, + "loss": 2.0594, + "num_input_tokens_seen": 1012137984, + "step": 2574 + }, + { + "epoch": 0.2925138975988776, + "grad_norm": 0.6271448731422424, + "learning_rate": 3.72359470103831e-05, + "loss": 2.0737, + "num_input_tokens_seen": 1014497280, + "step": 2580 + }, + { + "epoch": 0.29319416247701446, + "grad_norm": 0.5966920256614685, + "learning_rate": 3.720014321518081e-05, + "loss": 2.1086, + "num_input_tokens_seen": 1016856576, + "step": 2586 + }, + { + "epoch": 0.2938744273551514, + "grad_norm": 0.6952504515647888, + "learning_rate": 3.7164339419978525e-05, + "loss": 2.0835, + "num_input_tokens_seen": 1019215872, + "step": 2592 + }, + { + "epoch": 0.29455469223328834, + "grad_norm": 0.6622751951217651, + "learning_rate": 3.712853562477623e-05, + "loss": 2.1187, + "num_input_tokens_seen": 1021575168, + "step": 2598 + }, + { + "epoch": 0.2952349571114253, + "grad_norm": 0.7054808139801025, + "learning_rate": 3.7092731829573934e-05, + "loss": 2.101, + "num_input_tokens_seen": 1023934464, + "step": 2604 + }, + { + "epoch": 0.29591522198956216, + "grad_norm": 0.5338059663772583, + "learning_rate": 3.7056928034371646e-05, + "loss": 2.0577, + "num_input_tokens_seen": 1026293760, + "step": 2610 + }, + { + "epoch": 0.2965954868676991, + "grad_norm": 0.6121593117713928, + "learning_rate": 3.702112423916935e-05, + "loss": 2.0464, + "num_input_tokens_seen": 1028653056, + "step": 2616 + }, + { + "epoch": 0.29727575174583604, + "grad_norm": 0.6173185706138611, + "learning_rate": 3.698532044396706e-05, + "loss": 2.0741, + "num_input_tokens_seen": 1031012352, + "step": 2622 + }, + { + "epoch": 0.297956016623973, + "grad_norm": 0.5515555739402771, + "learning_rate": 3.6949516648764774e-05, + "loss": 2.0617, + "num_input_tokens_seen": 1033371648, + "step": 2628 + }, + { + "epoch": 0.29863628150210986, + "grad_norm": 0.6501288414001465, + "learning_rate": 3.6913712853562485e-05, + "loss": 2.1319, + "num_input_tokens_seen": 1035730944, + "step": 2634 + }, + { + "epoch": 0.2993165463802468, + "grad_norm": 0.6460755467414856, + "learning_rate": 3.687790905836019e-05, + "loss": 2.0581, + "num_input_tokens_seen": 1038090240, + "step": 2640 + }, + { + "epoch": 0.29999681125838373, + "grad_norm": 0.5400772094726562, + "learning_rate": 3.6842105263157895e-05, + "loss": 2.0937, + "num_input_tokens_seen": 1040449536, + "step": 2646 + }, + { + "epoch": 0.30067707613652067, + "grad_norm": 0.7050911784172058, + "learning_rate": 3.6806301467955606e-05, + "loss": 2.0414, + "num_input_tokens_seen": 1042808832, + "step": 2652 + }, + { + "epoch": 0.3013573410146576, + "grad_norm": 0.502206563949585, + "learning_rate": 3.677049767275331e-05, + "loss": 2.0569, + "num_input_tokens_seen": 1045168128, + "step": 2658 + }, + { + "epoch": 0.3020376058927945, + "grad_norm": 0.6481841206550598, + "learning_rate": 3.673469387755102e-05, + "loss": 2.0846, + "num_input_tokens_seen": 1047527424, + "step": 2664 + }, + { + "epoch": 0.30271787077093143, + "grad_norm": 0.6112203598022461, + "learning_rate": 3.669889008234873e-05, + "loss": 2.0746, + "num_input_tokens_seen": 1049886720, + "step": 2670 + }, + { + "epoch": 0.30339813564906837, + "grad_norm": 0.6601382493972778, + "learning_rate": 3.666308628714644e-05, + "loss": 2.052, + "num_input_tokens_seen": 1052246016, + "step": 2676 + }, + { + "epoch": 0.3040784005272053, + "grad_norm": 0.7059093713760376, + "learning_rate": 3.662728249194415e-05, + "loss": 2.046, + "num_input_tokens_seen": 1054605312, + "step": 2682 + }, + { + "epoch": 0.3047586654053422, + "grad_norm": 0.7588717341423035, + "learning_rate": 3.6591478696741855e-05, + "loss": 2.0597, + "num_input_tokens_seen": 1056964608, + "step": 2688 + }, + { + "epoch": 0.3054389302834791, + "grad_norm": 0.7248372435569763, + "learning_rate": 3.655567490153957e-05, + "loss": 2.0673, + "num_input_tokens_seen": 1059323904, + "step": 2694 + }, + { + "epoch": 0.30611919516161606, + "grad_norm": 0.581738293170929, + "learning_rate": 3.651987110633727e-05, + "loss": 2.084, + "num_input_tokens_seen": 1061683200, + "step": 2700 + }, + { + "epoch": 0.306799460039753, + "grad_norm": 0.5930314064025879, + "learning_rate": 3.648406731113498e-05, + "loss": 2.1012, + "num_input_tokens_seen": 1064042496, + "step": 2706 + }, + { + "epoch": 0.3074797249178899, + "grad_norm": 0.584109365940094, + "learning_rate": 3.644826351593269e-05, + "loss": 2.0744, + "num_input_tokens_seen": 1066401792, + "step": 2712 + }, + { + "epoch": 0.3081599897960268, + "grad_norm": 0.5961458086967468, + "learning_rate": 3.64124597207304e-05, + "loss": 2.0837, + "num_input_tokens_seen": 1068761088, + "step": 2718 + }, + { + "epoch": 0.30884025467416376, + "grad_norm": 0.6335872411727905, + "learning_rate": 3.637665592552811e-05, + "loss": 2.0681, + "num_input_tokens_seen": 1071120384, + "step": 2724 + }, + { + "epoch": 0.3095205195523007, + "grad_norm": 0.6117258071899414, + "learning_rate": 3.6340852130325816e-05, + "loss": 2.1143, + "num_input_tokens_seen": 1073479680, + "step": 2730 + }, + { + "epoch": 0.3102007844304376, + "grad_norm": 0.5619468688964844, + "learning_rate": 3.630504833512353e-05, + "loss": 2.0558, + "num_input_tokens_seen": 1075838976, + "step": 2736 + }, + { + "epoch": 0.3108810493085745, + "grad_norm": 0.555188000202179, + "learning_rate": 3.626924453992123e-05, + "loss": 2.0814, + "num_input_tokens_seen": 1078198272, + "step": 2742 + }, + { + "epoch": 0.31156131418671146, + "grad_norm": 0.5773251056671143, + "learning_rate": 3.6233440744718944e-05, + "loss": 2.0728, + "num_input_tokens_seen": 1080557568, + "step": 2748 + }, + { + "epoch": 0.3122415790648484, + "grad_norm": 0.6792175769805908, + "learning_rate": 3.619763694951665e-05, + "loss": 2.0708, + "num_input_tokens_seen": 1082916864, + "step": 2754 + }, + { + "epoch": 0.3129218439429853, + "grad_norm": 0.6672898530960083, + "learning_rate": 3.616183315431436e-05, + "loss": 2.0112, + "num_input_tokens_seen": 1085276160, + "step": 2760 + }, + { + "epoch": 0.3136021088211222, + "grad_norm": 0.6736769676208496, + "learning_rate": 3.6126029359112065e-05, + "loss": 2.0495, + "num_input_tokens_seen": 1087635456, + "step": 2766 + }, + { + "epoch": 0.31428237369925915, + "grad_norm": 0.6413402557373047, + "learning_rate": 3.6090225563909776e-05, + "loss": 1.9964, + "num_input_tokens_seen": 1089994752, + "step": 2772 + }, + { + "epoch": 0.3149626385773961, + "grad_norm": 0.5596314668655396, + "learning_rate": 3.605442176870749e-05, + "loss": 2.0394, + "num_input_tokens_seen": 1092354048, + "step": 2778 + }, + { + "epoch": 0.315642903455533, + "grad_norm": 0.5517847537994385, + "learning_rate": 3.601861797350519e-05, + "loss": 2.0687, + "num_input_tokens_seen": 1094713344, + "step": 2784 + }, + { + "epoch": 0.3163231683336699, + "grad_norm": 0.6080681085586548, + "learning_rate": 3.5982814178302904e-05, + "loss": 2.1019, + "num_input_tokens_seen": 1097072640, + "step": 2790 + }, + { + "epoch": 0.31700343321180685, + "grad_norm": 0.6057153344154358, + "learning_rate": 3.594701038310061e-05, + "loss": 2.0843, + "num_input_tokens_seen": 1099431936, + "step": 2796 + }, + { + "epoch": 0.3174569431305648, + "eval_accuracy": 0.5775622710622711, + "eval_loss": 2.0592379570007324, + "eval_runtime": 128.8803, + "eval_samples_per_second": 3.104, + "eval_steps_per_second": 1.04, + "num_input_tokens_seen": 1101004800, + "step": 2800 + }, + { + "epoch": 0.3176836980899438, + "grad_norm": 0.6708900332450867, + "learning_rate": 3.591120658789832e-05, + "loss": 2.1171, + "num_input_tokens_seen": 1101791232, + "step": 2802 + }, + { + "epoch": 0.31836396296808067, + "grad_norm": 0.5367056727409363, + "learning_rate": 3.5875402792696025e-05, + "loss": 2.0848, + "num_input_tokens_seen": 1104150528, + "step": 2808 + }, + { + "epoch": 0.3190442278462176, + "grad_norm": 0.6883641481399536, + "learning_rate": 3.583959899749374e-05, + "loss": 2.1015, + "num_input_tokens_seen": 1106509824, + "step": 2814 + }, + { + "epoch": 0.31972449272435455, + "grad_norm": 0.6446415781974792, + "learning_rate": 3.580379520229145e-05, + "loss": 2.079, + "num_input_tokens_seen": 1108869120, + "step": 2820 + }, + { + "epoch": 0.3204047576024915, + "grad_norm": 0.642508864402771, + "learning_rate": 3.576799140708915e-05, + "loss": 2.1132, + "num_input_tokens_seen": 1111228416, + "step": 2826 + }, + { + "epoch": 0.32108502248062837, + "grad_norm": 0.5669949054718018, + "learning_rate": 3.5732187611886865e-05, + "loss": 2.0901, + "num_input_tokens_seen": 1113587712, + "step": 2832 + }, + { + "epoch": 0.3217652873587653, + "grad_norm": 0.7657294869422913, + "learning_rate": 3.569638381668457e-05, + "loss": 2.0556, + "num_input_tokens_seen": 1115947008, + "step": 2838 + }, + { + "epoch": 0.32244555223690224, + "grad_norm": 0.7742637991905212, + "learning_rate": 3.5660580021482274e-05, + "loss": 2.0113, + "num_input_tokens_seen": 1118306304, + "step": 2844 + }, + { + "epoch": 0.3231258171150392, + "grad_norm": 0.7039967179298401, + "learning_rate": 3.5624776226279986e-05, + "loss": 2.079, + "num_input_tokens_seen": 1120665600, + "step": 2850 + }, + { + "epoch": 0.3238060819931761, + "grad_norm": 0.580337643623352, + "learning_rate": 3.55889724310777e-05, + "loss": 2.0946, + "num_input_tokens_seen": 1123024896, + "step": 2856 + }, + { + "epoch": 0.324486346871313, + "grad_norm": 0.5866253972053528, + "learning_rate": 3.555316863587541e-05, + "loss": 2.086, + "num_input_tokens_seen": 1125384192, + "step": 2862 + }, + { + "epoch": 0.32516661174944994, + "grad_norm": 0.5165377259254456, + "learning_rate": 3.5517364840673114e-05, + "loss": 2.0419, + "num_input_tokens_seen": 1127743488, + "step": 2868 + }, + { + "epoch": 0.3258468766275869, + "grad_norm": 0.5327121615409851, + "learning_rate": 3.5481561045470825e-05, + "loss": 2.064, + "num_input_tokens_seen": 1130102784, + "step": 2874 + }, + { + "epoch": 0.3265271415057238, + "grad_norm": 0.7180930972099304, + "learning_rate": 3.544575725026853e-05, + "loss": 1.9961, + "num_input_tokens_seen": 1132462080, + "step": 2880 + }, + { + "epoch": 0.3272074063838607, + "grad_norm": 0.5961750745773315, + "learning_rate": 3.5409953455066235e-05, + "loss": 2.031, + "num_input_tokens_seen": 1134821376, + "step": 2886 + }, + { + "epoch": 0.32788767126199764, + "grad_norm": 0.6628397107124329, + "learning_rate": 3.5374149659863946e-05, + "loss": 1.9875, + "num_input_tokens_seen": 1137180672, + "step": 2892 + }, + { + "epoch": 0.3285679361401346, + "grad_norm": 0.606051504611969, + "learning_rate": 3.533834586466165e-05, + "loss": 2.0553, + "num_input_tokens_seen": 1139539968, + "step": 2898 + }, + { + "epoch": 0.3292482010182715, + "grad_norm": 0.6407272219657898, + "learning_rate": 3.530254206945936e-05, + "loss": 2.0333, + "num_input_tokens_seen": 1141899264, + "step": 2904 + }, + { + "epoch": 0.3299284658964084, + "grad_norm": 0.5641146302223206, + "learning_rate": 3.5266738274257074e-05, + "loss": 2.1093, + "num_input_tokens_seen": 1144258560, + "step": 2910 + }, + { + "epoch": 0.33060873077454533, + "grad_norm": 0.6447109580039978, + "learning_rate": 3.5230934479054786e-05, + "loss": 2.0108, + "num_input_tokens_seen": 1146617856, + "step": 2916 + }, + { + "epoch": 0.33128899565268227, + "grad_norm": 0.6956091523170471, + "learning_rate": 3.519513068385249e-05, + "loss": 2.0884, + "num_input_tokens_seen": 1148977152, + "step": 2922 + }, + { + "epoch": 0.3319692605308192, + "grad_norm": 0.6706202626228333, + "learning_rate": 3.5159326888650195e-05, + "loss": 2.0462, + "num_input_tokens_seen": 1151336448, + "step": 2928 + }, + { + "epoch": 0.3326495254089561, + "grad_norm": 0.5899391174316406, + "learning_rate": 3.512352309344791e-05, + "loss": 2.0629, + "num_input_tokens_seen": 1153695744, + "step": 2934 + }, + { + "epoch": 0.33332979028709303, + "grad_norm": 0.695925772190094, + "learning_rate": 3.508771929824561e-05, + "loss": 2.0594, + "num_input_tokens_seen": 1156055040, + "step": 2940 + }, + { + "epoch": 0.33401005516522997, + "grad_norm": 0.5403394103050232, + "learning_rate": 3.505191550304332e-05, + "loss": 2.0885, + "num_input_tokens_seen": 1158414336, + "step": 2946 + }, + { + "epoch": 0.3346903200433669, + "grad_norm": 0.6385943293571472, + "learning_rate": 3.5016111707841035e-05, + "loss": 2.0986, + "num_input_tokens_seen": 1160773632, + "step": 2952 + }, + { + "epoch": 0.3353705849215038, + "grad_norm": 0.5981218218803406, + "learning_rate": 3.4980307912638746e-05, + "loss": 2.0232, + "num_input_tokens_seen": 1163132928, + "step": 2958 + }, + { + "epoch": 0.3360508497996407, + "grad_norm": 0.6498490571975708, + "learning_rate": 3.494450411743645e-05, + "loss": 2.0837, + "num_input_tokens_seen": 1165492224, + "step": 2964 + }, + { + "epoch": 0.33673111467777767, + "grad_norm": 0.5568425059318542, + "learning_rate": 3.4908700322234156e-05, + "loss": 2.0792, + "num_input_tokens_seen": 1167851520, + "step": 2970 + }, + { + "epoch": 0.3374113795559146, + "grad_norm": 0.5944088697433472, + "learning_rate": 3.487289652703187e-05, + "loss": 2.0658, + "num_input_tokens_seen": 1170210816, + "step": 2976 + }, + { + "epoch": 0.3380916444340515, + "grad_norm": 0.6015023589134216, + "learning_rate": 3.483709273182957e-05, + "loss": 2.0746, + "num_input_tokens_seen": 1172570112, + "step": 2982 + }, + { + "epoch": 0.3387719093121884, + "grad_norm": 0.783666729927063, + "learning_rate": 3.4801288936627283e-05, + "loss": 2.0295, + "num_input_tokens_seen": 1174929408, + "step": 2988 + }, + { + "epoch": 0.33945217419032536, + "grad_norm": 0.5756369829177856, + "learning_rate": 3.476548514142499e-05, + "loss": 2.0684, + "num_input_tokens_seen": 1177288704, + "step": 2994 + }, + { + "epoch": 0.3401324390684623, + "grad_norm": 0.6056890487670898, + "learning_rate": 3.4729681346222707e-05, + "loss": 2.0383, + "num_input_tokens_seen": 1179648000, + "step": 3000 + }, + { + "epoch": 0.3408127039465992, + "grad_norm": 0.8019888997077942, + "learning_rate": 3.469387755102041e-05, + "loss": 2.0801, + "num_input_tokens_seen": 1182007296, + "step": 3006 + }, + { + "epoch": 0.3414929688247361, + "grad_norm": 0.6044601202011108, + "learning_rate": 3.4658073755818116e-05, + "loss": 2.0798, + "num_input_tokens_seen": 1184366592, + "step": 3012 + }, + { + "epoch": 0.34217323370287306, + "grad_norm": 0.5998896360397339, + "learning_rate": 3.462226996061583e-05, + "loss": 2.0317, + "num_input_tokens_seen": 1186725888, + "step": 3018 + }, + { + "epoch": 0.34285349858101, + "grad_norm": 0.5555676221847534, + "learning_rate": 3.458646616541353e-05, + "loss": 1.9898, + "num_input_tokens_seen": 1189085184, + "step": 3024 + }, + { + "epoch": 0.3435337634591469, + "grad_norm": 0.5591822862625122, + "learning_rate": 3.4550662370211244e-05, + "loss": 2.0605, + "num_input_tokens_seen": 1191444480, + "step": 3030 + }, + { + "epoch": 0.3442140283372838, + "grad_norm": 0.6183376908302307, + "learning_rate": 3.451485857500895e-05, + "loss": 2.1107, + "num_input_tokens_seen": 1193803776, + "step": 3036 + }, + { + "epoch": 0.34489429321542076, + "grad_norm": 0.6081872582435608, + "learning_rate": 3.447905477980666e-05, + "loss": 2.0465, + "num_input_tokens_seen": 1196163072, + "step": 3042 + }, + { + "epoch": 0.3455745580935577, + "grad_norm": 0.5790855288505554, + "learning_rate": 3.444325098460437e-05, + "loss": 2.0326, + "num_input_tokens_seen": 1198522368, + "step": 3048 + }, + { + "epoch": 0.34625482297169463, + "grad_norm": 0.7046033143997192, + "learning_rate": 3.440744718940208e-05, + "loss": 2.0282, + "num_input_tokens_seen": 1200881664, + "step": 3054 + }, + { + "epoch": 0.3469350878498315, + "grad_norm": 0.6874545812606812, + "learning_rate": 3.437164339419979e-05, + "loss": 2.0915, + "num_input_tokens_seen": 1203240960, + "step": 3060 + }, + { + "epoch": 0.34761535272796845, + "grad_norm": 0.5742839574813843, + "learning_rate": 3.433583959899749e-05, + "loss": 2.097, + "num_input_tokens_seen": 1205600256, + "step": 3066 + }, + { + "epoch": 0.3482956176061054, + "grad_norm": 0.5930187702178955, + "learning_rate": 3.4300035803795204e-05, + "loss": 2.1049, + "num_input_tokens_seen": 1207959552, + "step": 3072 + }, + { + "epoch": 0.34897588248424233, + "grad_norm": 0.5856387615203857, + "learning_rate": 3.426423200859291e-05, + "loss": 2.0913, + "num_input_tokens_seen": 1210318848, + "step": 3078 + }, + { + "epoch": 0.3496561473623792, + "grad_norm": 0.6059959530830383, + "learning_rate": 3.422842821339062e-05, + "loss": 2.0664, + "num_input_tokens_seen": 1212678144, + "step": 3084 + }, + { + "epoch": 0.35033641224051615, + "grad_norm": 0.6338859796524048, + "learning_rate": 3.419262441818833e-05, + "loss": 2.0543, + "num_input_tokens_seen": 1215037440, + "step": 3090 + }, + { + "epoch": 0.3510166771186531, + "grad_norm": 0.6134727001190186, + "learning_rate": 3.415682062298604e-05, + "loss": 2.0758, + "num_input_tokens_seen": 1217396736, + "step": 3096 + }, + { + "epoch": 0.35169694199679, + "grad_norm": 0.7190840244293213, + "learning_rate": 3.412101682778375e-05, + "loss": 2.0468, + "num_input_tokens_seen": 1219756032, + "step": 3102 + }, + { + "epoch": 0.3523772068749269, + "grad_norm": 0.6040173768997192, + "learning_rate": 3.4085213032581453e-05, + "loss": 2.0888, + "num_input_tokens_seen": 1222115328, + "step": 3108 + }, + { + "epoch": 0.35305747175306385, + "grad_norm": 0.6416704058647156, + "learning_rate": 3.4049409237379165e-05, + "loss": 2.0423, + "num_input_tokens_seen": 1224474624, + "step": 3114 + }, + { + "epoch": 0.3537377366312008, + "grad_norm": 0.6157965064048767, + "learning_rate": 3.401360544217687e-05, + "loss": 2.0746, + "num_input_tokens_seen": 1226833920, + "step": 3120 + }, + { + "epoch": 0.3544180015093377, + "grad_norm": 0.6185963153839111, + "learning_rate": 3.397780164697458e-05, + "loss": 2.0451, + "num_input_tokens_seen": 1229193216, + "step": 3126 + }, + { + "epoch": 0.3550982663874746, + "grad_norm": 0.5963800549507141, + "learning_rate": 3.3941997851772286e-05, + "loss": 2.0392, + "num_input_tokens_seen": 1231552512, + "step": 3132 + }, + { + "epoch": 0.35577853126561154, + "grad_norm": 0.6368474960327148, + "learning_rate": 3.390619405657e-05, + "loss": 2.0505, + "num_input_tokens_seen": 1233911808, + "step": 3138 + }, + { + "epoch": 0.3564587961437485, + "grad_norm": 0.675567090511322, + "learning_rate": 3.387039026136771e-05, + "loss": 2.0833, + "num_input_tokens_seen": 1236271104, + "step": 3144 + }, + { + "epoch": 0.3571390610218854, + "grad_norm": 0.6852293014526367, + "learning_rate": 3.3834586466165414e-05, + "loss": 2.0318, + "num_input_tokens_seen": 1238630400, + "step": 3150 + }, + { + "epoch": 0.3578193259000223, + "grad_norm": 0.7064585089683533, + "learning_rate": 3.3798782670963125e-05, + "loss": 2.0657, + "num_input_tokens_seen": 1240989696, + "step": 3156 + }, + { + "epoch": 0.35849959077815924, + "grad_norm": 0.6410323977470398, + "learning_rate": 3.376297887576083e-05, + "loss": 2.079, + "num_input_tokens_seen": 1243348992, + "step": 3162 + }, + { + "epoch": 0.3591798556562962, + "grad_norm": 0.7537684440612793, + "learning_rate": 3.372717508055854e-05, + "loss": 2.0817, + "num_input_tokens_seen": 1245708288, + "step": 3168 + }, + { + "epoch": 0.3598601205344331, + "grad_norm": 0.7127799391746521, + "learning_rate": 3.3691371285356247e-05, + "loss": 2.0786, + "num_input_tokens_seen": 1248067584, + "step": 3174 + }, + { + "epoch": 0.36054038541257, + "grad_norm": 0.5921429991722107, + "learning_rate": 3.365556749015396e-05, + "loss": 2.094, + "num_input_tokens_seen": 1250426880, + "step": 3180 + }, + { + "epoch": 0.36122065029070693, + "grad_norm": 0.5365628600120544, + "learning_rate": 3.361976369495167e-05, + "loss": 2.0306, + "num_input_tokens_seen": 1252786176, + "step": 3186 + }, + { + "epoch": 0.3619009151688439, + "grad_norm": 0.591437816619873, + "learning_rate": 3.3583959899749374e-05, + "loss": 2.0486, + "num_input_tokens_seen": 1255145472, + "step": 3192 + }, + { + "epoch": 0.3625811800469808, + "grad_norm": 0.5986304879188538, + "learning_rate": 3.3548156104547086e-05, + "loss": 2.0571, + "num_input_tokens_seen": 1257504768, + "step": 3198 + }, + { + "epoch": 0.36280793500635977, + "eval_accuracy": 0.5792582417582418, + "eval_loss": 2.0506937503814697, + "eval_runtime": 129.3292, + "eval_samples_per_second": 3.093, + "eval_steps_per_second": 1.036, + "num_input_tokens_seen": 1258291200, + "step": 3200 + }, + { + "epoch": 0.3632614449251177, + "grad_norm": 0.6536886096000671, + "learning_rate": 3.351235230934479e-05, + "loss": 2.0306, + "num_input_tokens_seen": 1259864064, + "step": 3204 + }, + { + "epoch": 0.36394170980325463, + "grad_norm": 0.6367084980010986, + "learning_rate": 3.34765485141425e-05, + "loss": 2.0495, + "num_input_tokens_seen": 1262223360, + "step": 3210 + }, + { + "epoch": 0.36462197468139157, + "grad_norm": 0.5505596995353699, + "learning_rate": 3.344074471894021e-05, + "loss": 2.0235, + "num_input_tokens_seen": 1264582656, + "step": 3216 + }, + { + "epoch": 0.3653022395595285, + "grad_norm": 0.6911424398422241, + "learning_rate": 3.340494092373791e-05, + "loss": 2.0436, + "num_input_tokens_seen": 1266941952, + "step": 3222 + }, + { + "epoch": 0.3659825044376654, + "grad_norm": 0.8652951717376709, + "learning_rate": 3.336913712853563e-05, + "loss": 2.043, + "num_input_tokens_seen": 1269301248, + "step": 3228 + }, + { + "epoch": 0.36666276931580233, + "grad_norm": 0.7431092262268066, + "learning_rate": 3.3333333333333335e-05, + "loss": 2.0535, + "num_input_tokens_seen": 1271660544, + "step": 3234 + }, + { + "epoch": 0.36734303419393927, + "grad_norm": 0.5794050097465515, + "learning_rate": 3.3297529538131046e-05, + "loss": 2.0574, + "num_input_tokens_seen": 1274019840, + "step": 3240 + }, + { + "epoch": 0.3680232990720762, + "grad_norm": 0.6215260624885559, + "learning_rate": 3.326172574292875e-05, + "loss": 2.086, + "num_input_tokens_seen": 1276379136, + "step": 3246 + }, + { + "epoch": 0.3687035639502131, + "grad_norm": 0.5912430286407471, + "learning_rate": 3.322592194772646e-05, + "loss": 2.079, + "num_input_tokens_seen": 1278738432, + "step": 3252 + }, + { + "epoch": 0.36938382882835, + "grad_norm": 0.5438397526741028, + "learning_rate": 3.319011815252417e-05, + "loss": 2.0822, + "num_input_tokens_seen": 1281097728, + "step": 3258 + }, + { + "epoch": 0.37006409370648696, + "grad_norm": 0.585678219795227, + "learning_rate": 3.315431435732187e-05, + "loss": 2.0288, + "num_input_tokens_seen": 1283457024, + "step": 3264 + }, + { + "epoch": 0.3707443585846239, + "grad_norm": 0.6816807985305786, + "learning_rate": 3.3118510562119584e-05, + "loss": 2.0905, + "num_input_tokens_seen": 1285816320, + "step": 3270 + }, + { + "epoch": 0.37142462346276084, + "grad_norm": 0.6669751405715942, + "learning_rate": 3.3082706766917295e-05, + "loss": 2.0476, + "num_input_tokens_seen": 1288175616, + "step": 3276 + }, + { + "epoch": 0.3721048883408977, + "grad_norm": 0.6388877034187317, + "learning_rate": 3.304690297171501e-05, + "loss": 2.0814, + "num_input_tokens_seen": 1290534912, + "step": 3282 + }, + { + "epoch": 0.37278515321903466, + "grad_norm": 0.6471198797225952, + "learning_rate": 3.301109917651271e-05, + "loss": 2.0667, + "num_input_tokens_seen": 1292894208, + "step": 3288 + }, + { + "epoch": 0.3734654180971716, + "grad_norm": 0.5172976851463318, + "learning_rate": 3.297529538131042e-05, + "loss": 2.0514, + "num_input_tokens_seen": 1295253504, + "step": 3294 + }, + { + "epoch": 0.37414568297530854, + "grad_norm": 0.7585137486457825, + "learning_rate": 3.293949158610813e-05, + "loss": 2.0004, + "num_input_tokens_seen": 1297612800, + "step": 3300 + }, + { + "epoch": 0.3748259478534454, + "grad_norm": 0.5259309411048889, + "learning_rate": 3.290368779090583e-05, + "loss": 2.0757, + "num_input_tokens_seen": 1299972096, + "step": 3306 + }, + { + "epoch": 0.37550621273158236, + "grad_norm": 0.58619225025177, + "learning_rate": 3.2867883995703544e-05, + "loss": 2.0677, + "num_input_tokens_seen": 1302331392, + "step": 3312 + }, + { + "epoch": 0.3761864776097193, + "grad_norm": 0.5706315636634827, + "learning_rate": 3.2832080200501256e-05, + "loss": 2.0797, + "num_input_tokens_seen": 1304690688, + "step": 3318 + }, + { + "epoch": 0.37686674248785623, + "grad_norm": 0.5927962064743042, + "learning_rate": 3.279627640529897e-05, + "loss": 2.079, + "num_input_tokens_seen": 1307049984, + "step": 3324 + }, + { + "epoch": 0.3775470073659931, + "grad_norm": 0.6606913208961487, + "learning_rate": 3.276047261009667e-05, + "loss": 2.1193, + "num_input_tokens_seen": 1309409280, + "step": 3330 + }, + { + "epoch": 0.37822727224413005, + "grad_norm": 0.585796058177948, + "learning_rate": 3.2724668814894384e-05, + "loss": 2.0842, + "num_input_tokens_seen": 1311768576, + "step": 3336 + }, + { + "epoch": 0.378907537122267, + "grad_norm": 0.6631506681442261, + "learning_rate": 3.268886501969209e-05, + "loss": 2.055, + "num_input_tokens_seen": 1314127872, + "step": 3342 + }, + { + "epoch": 0.37958780200040393, + "grad_norm": 0.577156126499176, + "learning_rate": 3.265306122448979e-05, + "loss": 2.0251, + "num_input_tokens_seen": 1316487168, + "step": 3348 + }, + { + "epoch": 0.3802680668785408, + "grad_norm": 0.5798112154006958, + "learning_rate": 3.2617257429287505e-05, + "loss": 2.0066, + "num_input_tokens_seen": 1318846464, + "step": 3354 + }, + { + "epoch": 0.38094833175667775, + "grad_norm": 0.5928402543067932, + "learning_rate": 3.258145363408521e-05, + "loss": 2.0594, + "num_input_tokens_seen": 1321205760, + "step": 3360 + }, + { + "epoch": 0.3816285966348147, + "grad_norm": 0.5730792880058289, + "learning_rate": 3.254564983888292e-05, + "loss": 2.014, + "num_input_tokens_seen": 1323565056, + "step": 3366 + }, + { + "epoch": 0.3823088615129516, + "grad_norm": 0.606386661529541, + "learning_rate": 3.250984604368063e-05, + "loss": 2.0674, + "num_input_tokens_seen": 1325924352, + "step": 3372 + }, + { + "epoch": 0.3829891263910885, + "grad_norm": 0.6056506037712097, + "learning_rate": 3.2474042248478344e-05, + "loss": 2.0847, + "num_input_tokens_seen": 1328283648, + "step": 3378 + }, + { + "epoch": 0.38366939126922545, + "grad_norm": 0.547749936580658, + "learning_rate": 3.243823845327605e-05, + "loss": 2.0519, + "num_input_tokens_seen": 1330642944, + "step": 3384 + }, + { + "epoch": 0.3843496561473624, + "grad_norm": 0.5810631513595581, + "learning_rate": 3.2402434658073754e-05, + "loss": 2.0552, + "num_input_tokens_seen": 1333002240, + "step": 3390 + }, + { + "epoch": 0.3850299210254993, + "grad_norm": 0.7057521343231201, + "learning_rate": 3.2366630862871465e-05, + "loss": 2.1079, + "num_input_tokens_seen": 1335361536, + "step": 3396 + }, + { + "epoch": 0.3857101859036362, + "grad_norm": 0.6711981892585754, + "learning_rate": 3.233082706766917e-05, + "loss": 2.0651, + "num_input_tokens_seen": 1337720832, + "step": 3402 + }, + { + "epoch": 0.38639045078177314, + "grad_norm": 0.7233543992042542, + "learning_rate": 3.229502327246688e-05, + "loss": 2.05, + "num_input_tokens_seen": 1340080128, + "step": 3408 + }, + { + "epoch": 0.3870707156599101, + "grad_norm": 0.6906174421310425, + "learning_rate": 3.225921947726459e-05, + "loss": 2.0953, + "num_input_tokens_seen": 1342439424, + "step": 3414 + }, + { + "epoch": 0.387750980538047, + "grad_norm": 0.634935736656189, + "learning_rate": 3.2223415682062305e-05, + "loss": 2.0757, + "num_input_tokens_seen": 1344798720, + "step": 3420 + }, + { + "epoch": 0.3884312454161839, + "grad_norm": 0.6409153342247009, + "learning_rate": 3.218761188686001e-05, + "loss": 2.0493, + "num_input_tokens_seen": 1347158016, + "step": 3426 + }, + { + "epoch": 0.38911151029432084, + "grad_norm": 0.5605142116546631, + "learning_rate": 3.2151808091657714e-05, + "loss": 2.0471, + "num_input_tokens_seen": 1349517312, + "step": 3432 + }, + { + "epoch": 0.3897917751724578, + "grad_norm": 0.5918275117874146, + "learning_rate": 3.2116004296455426e-05, + "loss": 2.0703, + "num_input_tokens_seen": 1351876608, + "step": 3438 + }, + { + "epoch": 0.3904720400505947, + "grad_norm": 0.583743155002594, + "learning_rate": 3.208020050125313e-05, + "loss": 2.0535, + "num_input_tokens_seen": 1354235904, + "step": 3444 + }, + { + "epoch": 0.3911523049287316, + "grad_norm": 0.5591037273406982, + "learning_rate": 3.204439670605084e-05, + "loss": 2.0863, + "num_input_tokens_seen": 1356595200, + "step": 3450 + }, + { + "epoch": 0.39183256980686854, + "grad_norm": 0.5802523493766785, + "learning_rate": 3.2008592910848554e-05, + "loss": 2.1016, + "num_input_tokens_seen": 1358954496, + "step": 3456 + }, + { + "epoch": 0.3925128346850055, + "grad_norm": 0.5013401508331299, + "learning_rate": 3.1972789115646265e-05, + "loss": 2.0506, + "num_input_tokens_seen": 1361313792, + "step": 3462 + }, + { + "epoch": 0.3931930995631424, + "grad_norm": 0.6218020915985107, + "learning_rate": 3.193698532044397e-05, + "loss": 2.079, + "num_input_tokens_seen": 1363673088, + "step": 3468 + }, + { + "epoch": 0.39387336444127935, + "grad_norm": 0.591705858707428, + "learning_rate": 3.1901181525241675e-05, + "loss": 2.0478, + "num_input_tokens_seen": 1366032384, + "step": 3474 + }, + { + "epoch": 0.39455362931941623, + "grad_norm": 0.6699190139770508, + "learning_rate": 3.1865377730039386e-05, + "loss": 2.0473, + "num_input_tokens_seen": 1368391680, + "step": 3480 + }, + { + "epoch": 0.39523389419755317, + "grad_norm": 0.6555076241493225, + "learning_rate": 3.182957393483709e-05, + "loss": 2.0559, + "num_input_tokens_seen": 1370750976, + "step": 3486 + }, + { + "epoch": 0.3959141590756901, + "grad_norm": 0.7014250159263611, + "learning_rate": 3.17937701396348e-05, + "loss": 2.0714, + "num_input_tokens_seen": 1373110272, + "step": 3492 + }, + { + "epoch": 0.39659442395382705, + "grad_norm": 0.588550329208374, + "learning_rate": 3.175796634443251e-05, + "loss": 2.0885, + "num_input_tokens_seen": 1375469568, + "step": 3498 + }, + { + "epoch": 0.39727468883196393, + "grad_norm": 0.572462260723114, + "learning_rate": 3.172216254923022e-05, + "loss": 2.0506, + "num_input_tokens_seen": 1377828864, + "step": 3504 + }, + { + "epoch": 0.39795495371010087, + "grad_norm": 0.534345805644989, + "learning_rate": 3.168635875402793e-05, + "loss": 2.0648, + "num_input_tokens_seen": 1380188160, + "step": 3510 + }, + { + "epoch": 0.3986352185882378, + "grad_norm": 0.6382195949554443, + "learning_rate": 3.1650554958825635e-05, + "loss": 2.0084, + "num_input_tokens_seen": 1382547456, + "step": 3516 + }, + { + "epoch": 0.39931548346637474, + "grad_norm": 0.6456411480903625, + "learning_rate": 3.161475116362335e-05, + "loss": 2.0935, + "num_input_tokens_seen": 1384906752, + "step": 3522 + }, + { + "epoch": 0.3999957483445116, + "grad_norm": 0.6360987424850464, + "learning_rate": 3.157894736842105e-05, + "loss": 2.0757, + "num_input_tokens_seen": 1387266048, + "step": 3528 + }, + { + "epoch": 0.40067601322264856, + "grad_norm": 0.6245688199996948, + "learning_rate": 3.154314357321876e-05, + "loss": 2.0381, + "num_input_tokens_seen": 1389625344, + "step": 3534 + }, + { + "epoch": 0.4013562781007855, + "grad_norm": 0.6057738661766052, + "learning_rate": 3.150733977801647e-05, + "loss": 2.0865, + "num_input_tokens_seen": 1391984640, + "step": 3540 + }, + { + "epoch": 0.40203654297892244, + "grad_norm": 0.6015221476554871, + "learning_rate": 3.147153598281418e-05, + "loss": 2.0725, + "num_input_tokens_seen": 1394343936, + "step": 3546 + }, + { + "epoch": 0.4027168078570593, + "grad_norm": 0.6431640982627869, + "learning_rate": 3.143573218761189e-05, + "loss": 2.0358, + "num_input_tokens_seen": 1396703232, + "step": 3552 + }, + { + "epoch": 0.40339707273519626, + "grad_norm": 0.5379701256752014, + "learning_rate": 3.1399928392409596e-05, + "loss": 2.0898, + "num_input_tokens_seen": 1399062528, + "step": 3558 + }, + { + "epoch": 0.4040773376133332, + "grad_norm": 0.647147536277771, + "learning_rate": 3.136412459720731e-05, + "loss": 2.0025, + "num_input_tokens_seen": 1401421824, + "step": 3564 + }, + { + "epoch": 0.40475760249147014, + "grad_norm": 0.547764003276825, + "learning_rate": 3.132832080200501e-05, + "loss": 2.0452, + "num_input_tokens_seen": 1403781120, + "step": 3570 + }, + { + "epoch": 0.405437867369607, + "grad_norm": 0.7669888734817505, + "learning_rate": 3.1292517006802724e-05, + "loss": 2.0367, + "num_input_tokens_seen": 1406140416, + "step": 3576 + }, + { + "epoch": 0.40611813224774396, + "grad_norm": 0.5902658700942993, + "learning_rate": 3.125671321160043e-05, + "loss": 2.0263, + "num_input_tokens_seen": 1408499712, + "step": 3582 + }, + { + "epoch": 0.4067983971258809, + "grad_norm": 0.5572285056114197, + "learning_rate": 3.122090941639814e-05, + "loss": 2.0315, + "num_input_tokens_seen": 1410859008, + "step": 3588 + }, + { + "epoch": 0.40747866200401783, + "grad_norm": 0.58447265625, + "learning_rate": 3.1185105621195845e-05, + "loss": 2.1052, + "num_input_tokens_seen": 1413218304, + "step": 3594 + }, + { + "epoch": 0.4081589268821547, + "grad_norm": 0.5623412132263184, + "learning_rate": 3.1149301825993556e-05, + "loss": 2.0841, + "num_input_tokens_seen": 1415577600, + "step": 3600 + }, + { + "epoch": 0.4081589268821547, + "eval_accuracy": 0.5801733821733822, + "eval_loss": 2.0434608459472656, + "eval_runtime": 128.4268, + "eval_samples_per_second": 3.115, + "eval_steps_per_second": 1.043, + "num_input_tokens_seen": 1415577600, + "step": 3600 + }, + { + "epoch": 0.40883919176029165, + "grad_norm": 0.5253978371620178, + "learning_rate": 3.111349803079127e-05, + "loss": 2.04, + "num_input_tokens_seen": 1417936896, + "step": 3606 + }, + { + "epoch": 0.4095194566384286, + "grad_norm": 0.5712242722511292, + "learning_rate": 3.107769423558897e-05, + "loss": 2.0483, + "num_input_tokens_seen": 1420296192, + "step": 3612 + }, + { + "epoch": 0.41019972151656553, + "grad_norm": 0.5923311710357666, + "learning_rate": 3.1041890440386684e-05, + "loss": 2.0499, + "num_input_tokens_seen": 1422655488, + "step": 3618 + }, + { + "epoch": 0.4108799863947024, + "grad_norm": 0.8339553475379944, + "learning_rate": 3.100608664518439e-05, + "loss": 2.1001, + "num_input_tokens_seen": 1425014784, + "step": 3624 + }, + { + "epoch": 0.41156025127283935, + "grad_norm": 0.6857354044914246, + "learning_rate": 3.09702828499821e-05, + "loss": 1.9921, + "num_input_tokens_seen": 1427374080, + "step": 3630 + }, + { + "epoch": 0.4122405161509763, + "grad_norm": 0.6165304183959961, + "learning_rate": 3.0934479054779805e-05, + "loss": 2.0732, + "num_input_tokens_seen": 1429733376, + "step": 3636 + }, + { + "epoch": 0.4129207810291132, + "grad_norm": 0.528439998626709, + "learning_rate": 3.089867525957752e-05, + "loss": 2.0696, + "num_input_tokens_seen": 1432092672, + "step": 3642 + }, + { + "epoch": 0.4136010459072501, + "grad_norm": 0.5932063460350037, + "learning_rate": 3.086287146437523e-05, + "loss": 2.0249, + "num_input_tokens_seen": 1434451968, + "step": 3648 + }, + { + "epoch": 0.41428131078538705, + "grad_norm": 0.6371628046035767, + "learning_rate": 3.082706766917293e-05, + "loss": 2.0799, + "num_input_tokens_seen": 1436811264, + "step": 3654 + }, + { + "epoch": 0.414961575663524, + "grad_norm": 0.5642787218093872, + "learning_rate": 3.0791263873970645e-05, + "loss": 2.0326, + "num_input_tokens_seen": 1439170560, + "step": 3660 + }, + { + "epoch": 0.4156418405416609, + "grad_norm": 0.6325972080230713, + "learning_rate": 3.075546007876835e-05, + "loss": 2.0502, + "num_input_tokens_seen": 1441529856, + "step": 3666 + }, + { + "epoch": 0.41632210541979786, + "grad_norm": 0.557271420955658, + "learning_rate": 3.071965628356606e-05, + "loss": 2.0377, + "num_input_tokens_seen": 1443889152, + "step": 3672 + }, + { + "epoch": 0.41700237029793474, + "grad_norm": 0.5825797319412231, + "learning_rate": 3.0683852488363766e-05, + "loss": 2.0536, + "num_input_tokens_seen": 1446248448, + "step": 3678 + }, + { + "epoch": 0.4176826351760717, + "grad_norm": 0.6249643564224243, + "learning_rate": 3.064804869316148e-05, + "loss": 2.0838, + "num_input_tokens_seen": 1448607744, + "step": 3684 + }, + { + "epoch": 0.4183629000542086, + "grad_norm": 0.6836763024330139, + "learning_rate": 3.061224489795919e-05, + "loss": 2.0301, + "num_input_tokens_seen": 1450967040, + "step": 3690 + }, + { + "epoch": 0.41904316493234556, + "grad_norm": 0.5793729424476624, + "learning_rate": 3.0576441102756894e-05, + "loss": 2.0429, + "num_input_tokens_seen": 1453326336, + "step": 3696 + }, + { + "epoch": 0.41972342981048244, + "grad_norm": 0.6290580034255981, + "learning_rate": 3.0540637307554605e-05, + "loss": 2.0906, + "num_input_tokens_seen": 1455685632, + "step": 3702 + }, + { + "epoch": 0.4204036946886194, + "grad_norm": 0.6509286165237427, + "learning_rate": 3.050483351235231e-05, + "loss": 2.0252, + "num_input_tokens_seen": 1458044928, + "step": 3708 + }, + { + "epoch": 0.4210839595667563, + "grad_norm": 0.5808912515640259, + "learning_rate": 3.046902971715002e-05, + "loss": 2.0958, + "num_input_tokens_seen": 1460404224, + "step": 3714 + }, + { + "epoch": 0.42176422444489325, + "grad_norm": 0.6550482511520386, + "learning_rate": 3.0433225921947726e-05, + "loss": 2.0413, + "num_input_tokens_seen": 1462763520, + "step": 3720 + }, + { + "epoch": 0.42244448932303014, + "grad_norm": 0.6474577784538269, + "learning_rate": 3.0397422126745434e-05, + "loss": 2.0362, + "num_input_tokens_seen": 1465122816, + "step": 3726 + }, + { + "epoch": 0.4231247542011671, + "grad_norm": 0.6413889527320862, + "learning_rate": 3.0361618331543146e-05, + "loss": 2.0385, + "num_input_tokens_seen": 1467482112, + "step": 3732 + }, + { + "epoch": 0.423805019079304, + "grad_norm": 0.5291987061500549, + "learning_rate": 3.032581453634085e-05, + "loss": 2.0211, + "num_input_tokens_seen": 1469841408, + "step": 3738 + }, + { + "epoch": 0.42448528395744095, + "grad_norm": 0.5267509818077087, + "learning_rate": 3.0290010741138562e-05, + "loss": 2.0544, + "num_input_tokens_seen": 1472200704, + "step": 3744 + }, + { + "epoch": 0.42516554883557783, + "grad_norm": 0.5063323974609375, + "learning_rate": 3.025420694593627e-05, + "loss": 2.112, + "num_input_tokens_seen": 1474560000, + "step": 3750 + }, + { + "epoch": 0.42584581371371477, + "grad_norm": 0.5626435875892639, + "learning_rate": 3.0218403150733982e-05, + "loss": 2.0111, + "num_input_tokens_seen": 1476919296, + "step": 3756 + }, + { + "epoch": 0.4265260785918517, + "grad_norm": 0.5254883766174316, + "learning_rate": 3.0182599355531687e-05, + "loss": 2.0665, + "num_input_tokens_seen": 1479278592, + "step": 3762 + }, + { + "epoch": 0.42720634346998865, + "grad_norm": 0.6676201224327087, + "learning_rate": 3.0146795560329395e-05, + "loss": 2.0553, + "num_input_tokens_seen": 1481637888, + "step": 3768 + }, + { + "epoch": 0.42788660834812553, + "grad_norm": 0.618036150932312, + "learning_rate": 3.0110991765127106e-05, + "loss": 2.0896, + "num_input_tokens_seen": 1483997184, + "step": 3774 + }, + { + "epoch": 0.42856687322626247, + "grad_norm": 0.5902726054191589, + "learning_rate": 3.007518796992481e-05, + "loss": 2.0941, + "num_input_tokens_seen": 1486356480, + "step": 3780 + }, + { + "epoch": 0.4292471381043994, + "grad_norm": 0.6541236639022827, + "learning_rate": 3.0039384174722523e-05, + "loss": 2.0441, + "num_input_tokens_seen": 1488715776, + "step": 3786 + }, + { + "epoch": 0.42992740298253634, + "grad_norm": 0.6734746098518372, + "learning_rate": 3.000358037952023e-05, + "loss": 2.098, + "num_input_tokens_seen": 1491075072, + "step": 3792 + }, + { + "epoch": 0.4306076678606732, + "grad_norm": 0.690733015537262, + "learning_rate": 2.9967776584317943e-05, + "loss": 2.0626, + "num_input_tokens_seen": 1493434368, + "step": 3798 + }, + { + "epoch": 0.43128793273881016, + "grad_norm": 0.6429844498634338, + "learning_rate": 2.9931972789115647e-05, + "loss": 2.0556, + "num_input_tokens_seen": 1495793664, + "step": 3804 + }, + { + "epoch": 0.4319681976169471, + "grad_norm": 0.692583441734314, + "learning_rate": 2.9896168993913355e-05, + "loss": 2.041, + "num_input_tokens_seen": 1498152960, + "step": 3810 + }, + { + "epoch": 0.43264846249508404, + "grad_norm": 0.5887177586555481, + "learning_rate": 2.9860365198711067e-05, + "loss": 2.0964, + "num_input_tokens_seen": 1500512256, + "step": 3816 + }, + { + "epoch": 0.4333287273732209, + "grad_norm": 0.5106215476989746, + "learning_rate": 2.9824561403508772e-05, + "loss": 1.9808, + "num_input_tokens_seen": 1502871552, + "step": 3822 + }, + { + "epoch": 0.43400899225135786, + "grad_norm": 0.648137629032135, + "learning_rate": 2.9788757608306483e-05, + "loss": 2.0234, + "num_input_tokens_seen": 1505230848, + "step": 3828 + }, + { + "epoch": 0.4346892571294948, + "grad_norm": 0.5356113314628601, + "learning_rate": 2.9752953813104188e-05, + "loss": 2.0467, + "num_input_tokens_seen": 1507590144, + "step": 3834 + }, + { + "epoch": 0.43536952200763174, + "grad_norm": 0.5586897134780884, + "learning_rate": 2.9717150017901903e-05, + "loss": 2.0798, + "num_input_tokens_seen": 1509949440, + "step": 3840 + }, + { + "epoch": 0.4360497868857686, + "grad_norm": 0.5449038743972778, + "learning_rate": 2.9681346222699608e-05, + "loss": 2.074, + "num_input_tokens_seen": 1512308736, + "step": 3846 + }, + { + "epoch": 0.43673005176390556, + "grad_norm": 0.610542893409729, + "learning_rate": 2.9645542427497313e-05, + "loss": 2.0539, + "num_input_tokens_seen": 1514668032, + "step": 3852 + }, + { + "epoch": 0.4374103166420425, + "grad_norm": 0.591969907283783, + "learning_rate": 2.9609738632295024e-05, + "loss": 2.0768, + "num_input_tokens_seen": 1517027328, + "step": 3858 + }, + { + "epoch": 0.43809058152017943, + "grad_norm": 0.6223018765449524, + "learning_rate": 2.9573934837092732e-05, + "loss": 1.9882, + "num_input_tokens_seen": 1519386624, + "step": 3864 + }, + { + "epoch": 0.43877084639831637, + "grad_norm": 0.5089840292930603, + "learning_rate": 2.9538131041890444e-05, + "loss": 2.03, + "num_input_tokens_seen": 1521745920, + "step": 3870 + }, + { + "epoch": 0.43945111127645325, + "grad_norm": 0.6258916854858398, + "learning_rate": 2.950232724668815e-05, + "loss": 2.0287, + "num_input_tokens_seen": 1524105216, + "step": 3876 + }, + { + "epoch": 0.4401313761545902, + "grad_norm": 0.7169709205627441, + "learning_rate": 2.946652345148586e-05, + "loss": 2.0542, + "num_input_tokens_seen": 1526464512, + "step": 3882 + }, + { + "epoch": 0.44081164103272713, + "grad_norm": 0.7032943367958069, + "learning_rate": 2.9430719656283568e-05, + "loss": 2.0616, + "num_input_tokens_seen": 1528823808, + "step": 3888 + }, + { + "epoch": 0.44149190591086407, + "grad_norm": 0.7222307324409485, + "learning_rate": 2.9394915861081273e-05, + "loss": 2.0536, + "num_input_tokens_seen": 1531183104, + "step": 3894 + }, + { + "epoch": 0.44217217078900095, + "grad_norm": 0.6075944304466248, + "learning_rate": 2.9359112065878985e-05, + "loss": 2.0727, + "num_input_tokens_seen": 1533542400, + "step": 3900 + }, + { + "epoch": 0.4428524356671379, + "grad_norm": 0.5314520597457886, + "learning_rate": 2.9323308270676693e-05, + "loss": 2.0807, + "num_input_tokens_seen": 1535901696, + "step": 3906 + }, + { + "epoch": 0.4435327005452748, + "grad_norm": 0.5990427136421204, + "learning_rate": 2.9287504475474404e-05, + "loss": 2.0537, + "num_input_tokens_seen": 1538260992, + "step": 3912 + }, + { + "epoch": 0.44421296542341177, + "grad_norm": 0.6096120476722717, + "learning_rate": 2.925170068027211e-05, + "loss": 2.0997, + "num_input_tokens_seen": 1540620288, + "step": 3918 + }, + { + "epoch": 0.44489323030154865, + "grad_norm": 0.5830526351928711, + "learning_rate": 2.921589688506982e-05, + "loss": 2.0595, + "num_input_tokens_seen": 1542979584, + "step": 3924 + }, + { + "epoch": 0.4455734951796856, + "grad_norm": 0.5455676913261414, + "learning_rate": 2.918009308986753e-05, + "loss": 2.0565, + "num_input_tokens_seen": 1545338880, + "step": 3930 + }, + { + "epoch": 0.4462537600578225, + "grad_norm": 0.5473060011863708, + "learning_rate": 2.9144289294665234e-05, + "loss": 2.0598, + "num_input_tokens_seen": 1547698176, + "step": 3936 + }, + { + "epoch": 0.44693402493595946, + "grad_norm": 0.5502248406410217, + "learning_rate": 2.9108485499462945e-05, + "loss": 2.0327, + "num_input_tokens_seen": 1550057472, + "step": 3942 + }, + { + "epoch": 0.44761428981409634, + "grad_norm": 0.5692510008811951, + "learning_rate": 2.907268170426065e-05, + "loss": 2.0605, + "num_input_tokens_seen": 1552416768, + "step": 3948 + }, + { + "epoch": 0.4482945546922333, + "grad_norm": 0.5818192362785339, + "learning_rate": 2.9036877909058365e-05, + "loss": 2.058, + "num_input_tokens_seen": 1554776064, + "step": 3954 + }, + { + "epoch": 0.4489748195703702, + "grad_norm": 0.6013736128807068, + "learning_rate": 2.900107411385607e-05, + "loss": 2.0503, + "num_input_tokens_seen": 1557135360, + "step": 3960 + }, + { + "epoch": 0.44965508444850716, + "grad_norm": 0.5566754341125488, + "learning_rate": 2.896527031865378e-05, + "loss": 2.0796, + "num_input_tokens_seen": 1559494656, + "step": 3966 + }, + { + "epoch": 0.45033534932664404, + "grad_norm": 0.6082068085670471, + "learning_rate": 2.8929466523451486e-05, + "loss": 2.0633, + "num_input_tokens_seen": 1561853952, + "step": 3972 + }, + { + "epoch": 0.451015614204781, + "grad_norm": 0.5426793694496155, + "learning_rate": 2.8893662728249194e-05, + "loss": 2.0964, + "num_input_tokens_seen": 1564213248, + "step": 3978 + }, + { + "epoch": 0.4516958790829179, + "grad_norm": 0.549892246723175, + "learning_rate": 2.8857858933046906e-05, + "loss": 2.0454, + "num_input_tokens_seen": 1566572544, + "step": 3984 + }, + { + "epoch": 0.45237614396105486, + "grad_norm": 0.5879752039909363, + "learning_rate": 2.882205513784461e-05, + "loss": 2.0256, + "num_input_tokens_seen": 1568931840, + "step": 3990 + }, + { + "epoch": 0.45305640883919174, + "grad_norm": 0.6315425038337708, + "learning_rate": 2.8786251342642322e-05, + "loss": 2.0484, + "num_input_tokens_seen": 1571291136, + "step": 3996 + }, + { + "epoch": 0.4535099187579497, + "eval_accuracy": 0.5812625152625153, + "eval_loss": 2.036273241043091, + "eval_runtime": 128.5473, + "eval_samples_per_second": 3.112, + "eval_steps_per_second": 1.042, + "num_input_tokens_seen": 1572864000, + "step": 4000 + }, + { + "epoch": 0.4537366737173287, + "grad_norm": 0.5952754616737366, + "learning_rate": 2.875044754744003e-05, + "loss": 2.0647, + "num_input_tokens_seen": 1573650432, + "step": 4002 + }, + { + "epoch": 0.4544169385954656, + "grad_norm": 0.6178935766220093, + "learning_rate": 2.871464375223774e-05, + "loss": 2.0479, + "num_input_tokens_seen": 1576009728, + "step": 4008 + }, + { + "epoch": 0.45509720347360255, + "grad_norm": 0.5785337090492249, + "learning_rate": 2.8678839957035446e-05, + "loss": 2.0627, + "num_input_tokens_seen": 1578369024, + "step": 4014 + }, + { + "epoch": 0.45577746835173943, + "grad_norm": 0.7288320064544678, + "learning_rate": 2.8643036161833155e-05, + "loss": 2.0494, + "num_input_tokens_seen": 1580728320, + "step": 4020 + }, + { + "epoch": 0.4564577332298764, + "grad_norm": 0.5112663507461548, + "learning_rate": 2.8607232366630866e-05, + "loss": 2.0734, + "num_input_tokens_seen": 1583087616, + "step": 4026 + }, + { + "epoch": 0.4571379981080133, + "grad_norm": 0.6311584115028381, + "learning_rate": 2.857142857142857e-05, + "loss": 2.0358, + "num_input_tokens_seen": 1585446912, + "step": 4032 + }, + { + "epoch": 0.45781826298615025, + "grad_norm": 0.7229108214378357, + "learning_rate": 2.8535624776226282e-05, + "loss": 2.1127, + "num_input_tokens_seen": 1587806208, + "step": 4038 + }, + { + "epoch": 0.45849852786428713, + "grad_norm": 0.6722413301467896, + "learning_rate": 2.849982098102399e-05, + "loss": 2.0238, + "num_input_tokens_seen": 1590165504, + "step": 4044 + }, + { + "epoch": 0.45917879274242407, + "grad_norm": 0.5801773071289062, + "learning_rate": 2.8464017185821702e-05, + "loss": 2.0511, + "num_input_tokens_seen": 1592524800, + "step": 4050 + }, + { + "epoch": 0.459859057620561, + "grad_norm": 0.5483337044715881, + "learning_rate": 2.8428213390619407e-05, + "loss": 2.0307, + "num_input_tokens_seen": 1594884096, + "step": 4056 + }, + { + "epoch": 0.46053932249869795, + "grad_norm": 0.6021159887313843, + "learning_rate": 2.839240959541711e-05, + "loss": 2.0612, + "num_input_tokens_seen": 1597243392, + "step": 4062 + }, + { + "epoch": 0.4612195873768348, + "grad_norm": 0.6574224233627319, + "learning_rate": 2.8356605800214827e-05, + "loss": 2.1141, + "num_input_tokens_seen": 1599602688, + "step": 4068 + }, + { + "epoch": 0.46189985225497177, + "grad_norm": 0.6547892093658447, + "learning_rate": 2.832080200501253e-05, + "loss": 2.028, + "num_input_tokens_seen": 1601961984, + "step": 4074 + }, + { + "epoch": 0.4625801171331087, + "grad_norm": 0.5769033432006836, + "learning_rate": 2.8284998209810243e-05, + "loss": 2.0161, + "num_input_tokens_seen": 1604321280, + "step": 4080 + }, + { + "epoch": 0.46326038201124564, + "grad_norm": 0.5694014430046082, + "learning_rate": 2.8249194414607948e-05, + "loss": 2.0506, + "num_input_tokens_seen": 1606680576, + "step": 4086 + }, + { + "epoch": 0.4639406468893826, + "grad_norm": 0.522310733795166, + "learning_rate": 2.821339061940566e-05, + "loss": 2.0775, + "num_input_tokens_seen": 1609039872, + "step": 4092 + }, + { + "epoch": 0.46462091176751946, + "grad_norm": 0.5985013842582703, + "learning_rate": 2.8177586824203367e-05, + "loss": 2.0341, + "num_input_tokens_seen": 1611399168, + "step": 4098 + }, + { + "epoch": 0.4653011766456564, + "grad_norm": 0.6218437552452087, + "learning_rate": 2.8141783029001072e-05, + "loss": 2.0579, + "num_input_tokens_seen": 1613758464, + "step": 4104 + }, + { + "epoch": 0.46598144152379334, + "grad_norm": 0.6592255234718323, + "learning_rate": 2.8105979233798784e-05, + "loss": 2.0348, + "num_input_tokens_seen": 1616117760, + "step": 4110 + }, + { + "epoch": 0.4666617064019303, + "grad_norm": 0.583972692489624, + "learning_rate": 2.8070175438596492e-05, + "loss": 2.0909, + "num_input_tokens_seen": 1618477056, + "step": 4116 + }, + { + "epoch": 0.46734197128006716, + "grad_norm": 0.5974957346916199, + "learning_rate": 2.8034371643394203e-05, + "loss": 2.0778, + "num_input_tokens_seen": 1620836352, + "step": 4122 + }, + { + "epoch": 0.4680222361582041, + "grad_norm": 0.6519222855567932, + "learning_rate": 2.7998567848191908e-05, + "loss": 2.1053, + "num_input_tokens_seen": 1623195648, + "step": 4128 + }, + { + "epoch": 0.46870250103634103, + "grad_norm": 0.6389775276184082, + "learning_rate": 2.796276405298962e-05, + "loss": 2.0589, + "num_input_tokens_seen": 1625554944, + "step": 4134 + }, + { + "epoch": 0.469382765914478, + "grad_norm": 0.559096097946167, + "learning_rate": 2.7926960257787328e-05, + "loss": 2.0486, + "num_input_tokens_seen": 1627914240, + "step": 4140 + }, + { + "epoch": 0.47006303079261486, + "grad_norm": 0.6475938558578491, + "learning_rate": 2.7891156462585033e-05, + "loss": 2.0917, + "num_input_tokens_seen": 1630273536, + "step": 4146 + }, + { + "epoch": 0.4707432956707518, + "grad_norm": 0.5632593035697937, + "learning_rate": 2.7855352667382744e-05, + "loss": 2.1186, + "num_input_tokens_seen": 1632632832, + "step": 4152 + }, + { + "epoch": 0.47142356054888873, + "grad_norm": 0.6311493515968323, + "learning_rate": 2.7819548872180452e-05, + "loss": 2.0449, + "num_input_tokens_seen": 1634992128, + "step": 4158 + }, + { + "epoch": 0.47210382542702567, + "grad_norm": 0.5263134837150574, + "learning_rate": 2.7783745076978164e-05, + "loss": 2.0727, + "num_input_tokens_seen": 1637351424, + "step": 4164 + }, + { + "epoch": 0.47278409030516255, + "grad_norm": 0.6025407910346985, + "learning_rate": 2.774794128177587e-05, + "loss": 2.0727, + "num_input_tokens_seen": 1639710720, + "step": 4170 + }, + { + "epoch": 0.4734643551832995, + "grad_norm": 0.5921410322189331, + "learning_rate": 2.771213748657358e-05, + "loss": 2.0652, + "num_input_tokens_seen": 1642070016, + "step": 4176 + }, + { + "epoch": 0.47414462006143643, + "grad_norm": 0.631074845790863, + "learning_rate": 2.767633369137129e-05, + "loss": 2.0445, + "num_input_tokens_seen": 1644429312, + "step": 4182 + }, + { + "epoch": 0.47482488493957337, + "grad_norm": 0.6067950129508972, + "learning_rate": 2.7640529896168993e-05, + "loss": 2.0518, + "num_input_tokens_seen": 1646788608, + "step": 4188 + }, + { + "epoch": 0.47550514981771025, + "grad_norm": 0.7098994851112366, + "learning_rate": 2.7604726100966705e-05, + "loss": 2.0632, + "num_input_tokens_seen": 1649147904, + "step": 4194 + }, + { + "epoch": 0.4761854146958472, + "grad_norm": 0.59510338306427, + "learning_rate": 2.756892230576441e-05, + "loss": 2.0555, + "num_input_tokens_seen": 1651507200, + "step": 4200 + }, + { + "epoch": 0.4768656795739841, + "grad_norm": 0.5363790392875671, + "learning_rate": 2.753311851056212e-05, + "loss": 2.0648, + "num_input_tokens_seen": 1653866496, + "step": 4206 + }, + { + "epoch": 0.47754594445212106, + "grad_norm": 0.6022222638130188, + "learning_rate": 2.749731471535983e-05, + "loss": 2.0435, + "num_input_tokens_seen": 1656225792, + "step": 4212 + }, + { + "epoch": 0.47822620933025795, + "grad_norm": 0.6179582476615906, + "learning_rate": 2.746151092015754e-05, + "loss": 2.0574, + "num_input_tokens_seen": 1658585088, + "step": 4218 + }, + { + "epoch": 0.4789064742083949, + "grad_norm": 0.5979219079017639, + "learning_rate": 2.7425707124955245e-05, + "loss": 2.0891, + "num_input_tokens_seen": 1660944384, + "step": 4224 + }, + { + "epoch": 0.4795867390865318, + "grad_norm": 0.5890262722969055, + "learning_rate": 2.7389903329752954e-05, + "loss": 2.0579, + "num_input_tokens_seen": 1663303680, + "step": 4230 + }, + { + "epoch": 0.48026700396466876, + "grad_norm": 0.5914406776428223, + "learning_rate": 2.7354099534550665e-05, + "loss": 2.0666, + "num_input_tokens_seen": 1665662976, + "step": 4236 + }, + { + "epoch": 0.48094726884280564, + "grad_norm": 0.5765048861503601, + "learning_rate": 2.731829573934837e-05, + "loss": 2.0057, + "num_input_tokens_seen": 1668022272, + "step": 4242 + }, + { + "epoch": 0.4816275337209426, + "grad_norm": 0.5583755970001221, + "learning_rate": 2.728249194414608e-05, + "loss": 2.0303, + "num_input_tokens_seen": 1670381568, + "step": 4248 + }, + { + "epoch": 0.4823077985990795, + "grad_norm": 0.5724276900291443, + "learning_rate": 2.724668814894379e-05, + "loss": 2.0575, + "num_input_tokens_seen": 1672740864, + "step": 4254 + }, + { + "epoch": 0.48298806347721646, + "grad_norm": 0.6578513979911804, + "learning_rate": 2.72108843537415e-05, + "loss": 2.0213, + "num_input_tokens_seen": 1675100160, + "step": 4260 + }, + { + "epoch": 0.48366832835535334, + "grad_norm": 0.6713804602622986, + "learning_rate": 2.7175080558539206e-05, + "loss": 2.0428, + "num_input_tokens_seen": 1677459456, + "step": 4266 + }, + { + "epoch": 0.4843485932334903, + "grad_norm": 0.5634715557098389, + "learning_rate": 2.7139276763336914e-05, + "loss": 2.0461, + "num_input_tokens_seen": 1679818752, + "step": 4272 + }, + { + "epoch": 0.4850288581116272, + "grad_norm": 0.6080957055091858, + "learning_rate": 2.7103472968134626e-05, + "loss": 2.1146, + "num_input_tokens_seen": 1682178048, + "step": 4278 + }, + { + "epoch": 0.48570912298976415, + "grad_norm": 0.5606207251548767, + "learning_rate": 2.706766917293233e-05, + "loss": 2.0456, + "num_input_tokens_seen": 1684537344, + "step": 4284 + }, + { + "epoch": 0.4863893878679011, + "grad_norm": 0.6365392208099365, + "learning_rate": 2.7031865377730042e-05, + "loss": 2.0354, + "num_input_tokens_seen": 1686896640, + "step": 4290 + }, + { + "epoch": 0.487069652746038, + "grad_norm": 0.6958995461463928, + "learning_rate": 2.699606158252775e-05, + "loss": 2.0047, + "num_input_tokens_seen": 1689255936, + "step": 4296 + }, + { + "epoch": 0.4877499176241749, + "grad_norm": 0.7712015509605408, + "learning_rate": 2.6960257787325462e-05, + "loss": 2.0329, + "num_input_tokens_seen": 1691615232, + "step": 4302 + }, + { + "epoch": 0.48843018250231185, + "grad_norm": 0.5757302045822144, + "learning_rate": 2.6924453992123166e-05, + "loss": 2.0256, + "num_input_tokens_seen": 1693974528, + "step": 4308 + }, + { + "epoch": 0.4891104473804488, + "grad_norm": 0.5976963639259338, + "learning_rate": 2.688865019692087e-05, + "loss": 2.0844, + "num_input_tokens_seen": 1696333824, + "step": 4314 + }, + { + "epoch": 0.48979071225858567, + "grad_norm": 0.5346413254737854, + "learning_rate": 2.6852846401718583e-05, + "loss": 2.0371, + "num_input_tokens_seen": 1698693120, + "step": 4320 + }, + { + "epoch": 0.4904709771367226, + "grad_norm": 0.5894768238067627, + "learning_rate": 2.681704260651629e-05, + "loss": 2.0424, + "num_input_tokens_seen": 1701052416, + "step": 4326 + }, + { + "epoch": 0.49115124201485955, + "grad_norm": 0.5185033679008484, + "learning_rate": 2.6781238811314003e-05, + "loss": 2.0221, + "num_input_tokens_seen": 1703411712, + "step": 4332 + }, + { + "epoch": 0.4918315068929965, + "grad_norm": 0.708967924118042, + "learning_rate": 2.6745435016111707e-05, + "loss": 2.0596, + "num_input_tokens_seen": 1705771008, + "step": 4338 + }, + { + "epoch": 0.49251177177113337, + "grad_norm": 0.539934515953064, + "learning_rate": 2.670963122090942e-05, + "loss": 2.0724, + "num_input_tokens_seen": 1708130304, + "step": 4344 + }, + { + "epoch": 0.4931920366492703, + "grad_norm": 0.536572277545929, + "learning_rate": 2.6673827425707127e-05, + "loss": 2.0064, + "num_input_tokens_seen": 1710489600, + "step": 4350 + }, + { + "epoch": 0.49387230152740724, + "grad_norm": 0.6322241425514221, + "learning_rate": 2.6638023630504832e-05, + "loss": 2.0661, + "num_input_tokens_seen": 1712848896, + "step": 4356 + }, + { + "epoch": 0.4945525664055442, + "grad_norm": 0.6483719348907471, + "learning_rate": 2.6602219835302543e-05, + "loss": 2.0301, + "num_input_tokens_seen": 1715208192, + "step": 4362 + }, + { + "epoch": 0.49523283128368106, + "grad_norm": 0.7183097004890442, + "learning_rate": 2.656641604010025e-05, + "loss": 2.0372, + "num_input_tokens_seen": 1717567488, + "step": 4368 + }, + { + "epoch": 0.495913096161818, + "grad_norm": 0.5163341760635376, + "learning_rate": 2.6530612244897963e-05, + "loss": 2.0536, + "num_input_tokens_seen": 1719926784, + "step": 4374 + }, + { + "epoch": 0.49659336103995494, + "grad_norm": 0.5748982429504395, + "learning_rate": 2.6494808449695668e-05, + "loss": 2.0573, + "num_input_tokens_seen": 1722286080, + "step": 4380 + }, + { + "epoch": 0.4972736259180919, + "grad_norm": 0.5292128324508667, + "learning_rate": 2.645900465449338e-05, + "loss": 2.0675, + "num_input_tokens_seen": 1724645376, + "step": 4386 + }, + { + "epoch": 0.49795389079622876, + "grad_norm": 0.5424016714096069, + "learning_rate": 2.6423200859291087e-05, + "loss": 2.0624, + "num_input_tokens_seen": 1727004672, + "step": 4392 + }, + { + "epoch": 0.4986341556743657, + "grad_norm": 0.5220805406570435, + "learning_rate": 2.6387397064088792e-05, + "loss": 2.0199, + "num_input_tokens_seen": 1729363968, + "step": 4398 + }, + { + "epoch": 0.49886091063374466, + "eval_accuracy": 0.5820293040293041, + "eval_loss": 2.0314505100250244, + "eval_runtime": 129.9151, + "eval_samples_per_second": 3.079, + "eval_steps_per_second": 1.031, + "num_input_tokens_seen": 1730150400, + "step": 4400 + }, + { + "epoch": 0.49931442055250264, + "grad_norm": 0.5268684029579163, + "learning_rate": 2.6351593268886504e-05, + "loss": 2.0301, + "num_input_tokens_seen": 1731723264, + "step": 4404 + }, + { + "epoch": 0.4999946854306396, + "grad_norm": 0.53367018699646, + "learning_rate": 2.6315789473684212e-05, + "loss": 2.0371, + "num_input_tokens_seen": 1734082560, + "step": 4410 + }, + { + "epoch": 0.5006749503087765, + "grad_norm": 0.6249188184738159, + "learning_rate": 2.6279985678481924e-05, + "loss": 2.0451, + "num_input_tokens_seen": 1736441856, + "step": 4416 + }, + { + "epoch": 0.5013552151869134, + "grad_norm": 0.5434116125106812, + "learning_rate": 2.6244181883279628e-05, + "loss": 2.0524, + "num_input_tokens_seen": 1738801152, + "step": 4422 + }, + { + "epoch": 0.5020354800650503, + "grad_norm": 0.6205160021781921, + "learning_rate": 2.620837808807734e-05, + "loss": 2.0484, + "num_input_tokens_seen": 1741160448, + "step": 4428 + }, + { + "epoch": 0.5027157449431873, + "grad_norm": 0.6665350198745728, + "learning_rate": 2.6172574292875045e-05, + "loss": 2.0277, + "num_input_tokens_seen": 1743519744, + "step": 4434 + }, + { + "epoch": 0.5033960098213242, + "grad_norm": 0.6074947714805603, + "learning_rate": 2.6136770497672753e-05, + "loss": 1.9797, + "num_input_tokens_seen": 1745879040, + "step": 4440 + }, + { + "epoch": 0.5040762746994611, + "grad_norm": 0.801179051399231, + "learning_rate": 2.6100966702470464e-05, + "loss": 2.0997, + "num_input_tokens_seen": 1748238336, + "step": 4446 + }, + { + "epoch": 0.504756539577598, + "grad_norm": 0.6840958595275879, + "learning_rate": 2.606516290726817e-05, + "loss": 2.0205, + "num_input_tokens_seen": 1750597632, + "step": 4452 + }, + { + "epoch": 0.5054368044557349, + "grad_norm": 0.6576579809188843, + "learning_rate": 2.602935911206588e-05, + "loss": 2.0203, + "num_input_tokens_seen": 1752956928, + "step": 4458 + }, + { + "epoch": 0.5061170693338719, + "grad_norm": 0.6868447065353394, + "learning_rate": 2.599355531686359e-05, + "loss": 1.9909, + "num_input_tokens_seen": 1755316224, + "step": 4464 + }, + { + "epoch": 0.5067973342120088, + "grad_norm": 0.5453672409057617, + "learning_rate": 2.59577515216613e-05, + "loss": 2.006, + "num_input_tokens_seen": 1757675520, + "step": 4470 + }, + { + "epoch": 0.5074775990901457, + "grad_norm": 0.6682925224304199, + "learning_rate": 2.5921947726459005e-05, + "loss": 2.0254, + "num_input_tokens_seen": 1760034816, + "step": 4476 + }, + { + "epoch": 0.5081578639682827, + "grad_norm": 0.5425339937210083, + "learning_rate": 2.5886143931256713e-05, + "loss": 2.0437, + "num_input_tokens_seen": 1762394112, + "step": 4482 + }, + { + "epoch": 0.5088381288464195, + "grad_norm": 0.5697811245918274, + "learning_rate": 2.5850340136054425e-05, + "loss": 2.0491, + "num_input_tokens_seen": 1764753408, + "step": 4488 + }, + { + "epoch": 0.5095183937245565, + "grad_norm": 0.5228165984153748, + "learning_rate": 2.581453634085213e-05, + "loss": 2.0265, + "num_input_tokens_seen": 1767112704, + "step": 4494 + }, + { + "epoch": 0.5101986586026934, + "grad_norm": 0.5687726736068726, + "learning_rate": 2.577873254564984e-05, + "loss": 2.0485, + "num_input_tokens_seen": 1769472000, + "step": 4500 + }, + { + "epoch": 0.5108789234808303, + "grad_norm": 0.5901363492012024, + "learning_rate": 2.574292875044755e-05, + "loss": 2.0407, + "num_input_tokens_seen": 1771831296, + "step": 4506 + }, + { + "epoch": 0.5115591883589673, + "grad_norm": 0.585297703742981, + "learning_rate": 2.570712495524526e-05, + "loss": 2.0564, + "num_input_tokens_seen": 1774190592, + "step": 4512 + }, + { + "epoch": 0.5122394532371042, + "grad_norm": 0.5718687772750854, + "learning_rate": 2.5671321160042966e-05, + "loss": 2.0269, + "num_input_tokens_seen": 1776549888, + "step": 4518 + }, + { + "epoch": 0.5129197181152412, + "grad_norm": 0.5592496991157532, + "learning_rate": 2.5635517364840674e-05, + "loss": 2.038, + "num_input_tokens_seen": 1778909184, + "step": 4524 + }, + { + "epoch": 0.5135999829933781, + "grad_norm": 0.5649601817131042, + "learning_rate": 2.5599713569638385e-05, + "loss": 2.0191, + "num_input_tokens_seen": 1781268480, + "step": 4530 + }, + { + "epoch": 0.5142802478715149, + "grad_norm": 0.5326777696609497, + "learning_rate": 2.556390977443609e-05, + "loss": 2.0632, + "num_input_tokens_seen": 1783627776, + "step": 4536 + }, + { + "epoch": 0.5149605127496519, + "grad_norm": 0.6131651401519775, + "learning_rate": 2.55281059792338e-05, + "loss": 2.0646, + "num_input_tokens_seen": 1785987072, + "step": 4542 + }, + { + "epoch": 0.5156407776277888, + "grad_norm": 0.6253523230552673, + "learning_rate": 2.5492302184031506e-05, + "loss": 2.023, + "num_input_tokens_seen": 1788346368, + "step": 4548 + }, + { + "epoch": 0.5163210425059257, + "grad_norm": 0.5869733095169067, + "learning_rate": 2.545649838882922e-05, + "loss": 2.0196, + "num_input_tokens_seen": 1790705664, + "step": 4554 + }, + { + "epoch": 0.5170013073840627, + "grad_norm": 0.6161871552467346, + "learning_rate": 2.5420694593626926e-05, + "loss": 2.0352, + "num_input_tokens_seen": 1793064960, + "step": 4560 + }, + { + "epoch": 0.5176815722621996, + "grad_norm": 0.5765447020530701, + "learning_rate": 2.538489079842463e-05, + "loss": 2.0604, + "num_input_tokens_seen": 1795424256, + "step": 4566 + }, + { + "epoch": 0.5183618371403366, + "grad_norm": 0.5514522194862366, + "learning_rate": 2.5349087003222342e-05, + "loss": 2.0117, + "num_input_tokens_seen": 1797783552, + "step": 4572 + }, + { + "epoch": 0.5190421020184735, + "grad_norm": 0.6634048223495483, + "learning_rate": 2.531328320802005e-05, + "loss": 2.0526, + "num_input_tokens_seen": 1800142848, + "step": 4578 + }, + { + "epoch": 0.5197223668966103, + "grad_norm": 0.5515666604042053, + "learning_rate": 2.5277479412817762e-05, + "loss": 2.0145, + "num_input_tokens_seen": 1802502144, + "step": 4584 + }, + { + "epoch": 0.5204026317747473, + "grad_norm": 0.5209513902664185, + "learning_rate": 2.5241675617615467e-05, + "loss": 2.0392, + "num_input_tokens_seen": 1804861440, + "step": 4590 + }, + { + "epoch": 0.5210828966528842, + "grad_norm": 0.586448609828949, + "learning_rate": 2.520587182241318e-05, + "loss": 2.0467, + "num_input_tokens_seen": 1807220736, + "step": 4596 + }, + { + "epoch": 0.5217631615310211, + "grad_norm": 0.6192159652709961, + "learning_rate": 2.5170068027210887e-05, + "loss": 2.0113, + "num_input_tokens_seen": 1809580032, + "step": 4602 + }, + { + "epoch": 0.5224434264091581, + "grad_norm": 0.5666912794113159, + "learning_rate": 2.513426423200859e-05, + "loss": 2.0656, + "num_input_tokens_seen": 1811939328, + "step": 4608 + }, + { + "epoch": 0.523123691287295, + "grad_norm": 0.553439199924469, + "learning_rate": 2.5098460436806303e-05, + "loss": 2.0829, + "num_input_tokens_seen": 1814298624, + "step": 4614 + }, + { + "epoch": 0.523803956165432, + "grad_norm": 0.5798554420471191, + "learning_rate": 2.506265664160401e-05, + "loss": 2.0018, + "num_input_tokens_seen": 1816657920, + "step": 4620 + }, + { + "epoch": 0.5244842210435688, + "grad_norm": 0.5392002463340759, + "learning_rate": 2.5026852846401723e-05, + "loss": 2.0356, + "num_input_tokens_seen": 1819017216, + "step": 4626 + }, + { + "epoch": 0.5251644859217057, + "grad_norm": 0.5326321721076965, + "learning_rate": 2.4991049051199427e-05, + "loss": 2.0293, + "num_input_tokens_seen": 1821376512, + "step": 4632 + }, + { + "epoch": 0.5258447507998427, + "grad_norm": 0.5811095237731934, + "learning_rate": 2.4955245255997136e-05, + "loss": 2.0293, + "num_input_tokens_seen": 1823735808, + "step": 4638 + }, + { + "epoch": 0.5265250156779796, + "grad_norm": 0.6148865222930908, + "learning_rate": 2.4919441460794847e-05, + "loss": 2.0708, + "num_input_tokens_seen": 1826095104, + "step": 4644 + }, + { + "epoch": 0.5272052805561165, + "grad_norm": 0.552721381187439, + "learning_rate": 2.4883637665592555e-05, + "loss": 2.0677, + "num_input_tokens_seen": 1828454400, + "step": 4650 + }, + { + "epoch": 0.5278855454342535, + "grad_norm": 0.7011052966117859, + "learning_rate": 2.4847833870390263e-05, + "loss": 2.0427, + "num_input_tokens_seen": 1830813696, + "step": 4656 + }, + { + "epoch": 0.5285658103123904, + "grad_norm": 0.698103129863739, + "learning_rate": 2.4812030075187968e-05, + "loss": 2.0446, + "num_input_tokens_seen": 1833172992, + "step": 4662 + }, + { + "epoch": 0.5292460751905274, + "grad_norm": 0.6698234677314758, + "learning_rate": 2.477622627998568e-05, + "loss": 1.989, + "num_input_tokens_seen": 1835532288, + "step": 4668 + }, + { + "epoch": 0.5299263400686642, + "grad_norm": 0.686160683631897, + "learning_rate": 2.4740422484783388e-05, + "loss": 2.0177, + "num_input_tokens_seen": 1837891584, + "step": 4674 + }, + { + "epoch": 0.5306066049468011, + "grad_norm": 0.7087785601615906, + "learning_rate": 2.4704618689581096e-05, + "loss": 1.9913, + "num_input_tokens_seen": 1840250880, + "step": 4680 + }, + { + "epoch": 0.5312868698249381, + "grad_norm": 0.7636250257492065, + "learning_rate": 2.4668814894378804e-05, + "loss": 2.0415, + "num_input_tokens_seen": 1842610176, + "step": 4686 + }, + { + "epoch": 0.531967134703075, + "grad_norm": 0.6679571270942688, + "learning_rate": 2.4633011099176516e-05, + "loss": 2.0405, + "num_input_tokens_seen": 1844969472, + "step": 4692 + }, + { + "epoch": 0.532647399581212, + "grad_norm": 0.6184023022651672, + "learning_rate": 2.4597207303974224e-05, + "loss": 2.0398, + "num_input_tokens_seen": 1847328768, + "step": 4698 + }, + { + "epoch": 0.5333276644593489, + "grad_norm": 0.6215579509735107, + "learning_rate": 2.456140350877193e-05, + "loss": 2.0421, + "num_input_tokens_seen": 1849688064, + "step": 4704 + }, + { + "epoch": 0.5340079293374858, + "grad_norm": 0.6236794590950012, + "learning_rate": 2.452559971356964e-05, + "loss": 2.0267, + "num_input_tokens_seen": 1852047360, + "step": 4710 + }, + { + "epoch": 0.5346881942156227, + "grad_norm": 0.5402464270591736, + "learning_rate": 2.448979591836735e-05, + "loss": 2.0491, + "num_input_tokens_seen": 1854406656, + "step": 4716 + }, + { + "epoch": 0.5353684590937596, + "grad_norm": 0.6620278358459473, + "learning_rate": 2.4453992123165057e-05, + "loss": 2.0231, + "num_input_tokens_seen": 1856765952, + "step": 4722 + }, + { + "epoch": 0.5360487239718965, + "grad_norm": 0.6001939177513123, + "learning_rate": 2.4418188327962765e-05, + "loss": 2.07, + "num_input_tokens_seen": 1859125248, + "step": 4728 + }, + { + "epoch": 0.5367289888500335, + "grad_norm": 0.5543425679206848, + "learning_rate": 2.4382384532760473e-05, + "loss": 2.0106, + "num_input_tokens_seen": 1861484544, + "step": 4734 + }, + { + "epoch": 0.5374092537281704, + "grad_norm": 0.4775249660015106, + "learning_rate": 2.4346580737558184e-05, + "loss": 2.0754, + "num_input_tokens_seen": 1863843840, + "step": 4740 + }, + { + "epoch": 0.5380895186063074, + "grad_norm": 0.5671461224555969, + "learning_rate": 2.431077694235589e-05, + "loss": 2.0448, + "num_input_tokens_seen": 1866203136, + "step": 4746 + }, + { + "epoch": 0.5387697834844443, + "grad_norm": 0.6002800464630127, + "learning_rate": 2.4274973147153597e-05, + "loss": 2.0081, + "num_input_tokens_seen": 1868562432, + "step": 4752 + }, + { + "epoch": 0.5394500483625811, + "grad_norm": 0.5340938568115234, + "learning_rate": 2.423916935195131e-05, + "loss": 2.0068, + "num_input_tokens_seen": 1870921728, + "step": 4758 + }, + { + "epoch": 0.5401303132407181, + "grad_norm": 0.550006628036499, + "learning_rate": 2.4203365556749017e-05, + "loss": 2.0779, + "num_input_tokens_seen": 1873281024, + "step": 4764 + }, + { + "epoch": 0.540810578118855, + "grad_norm": 0.6014347672462463, + "learning_rate": 2.4167561761546725e-05, + "loss": 2.0408, + "num_input_tokens_seen": 1875640320, + "step": 4770 + }, + { + "epoch": 0.5414908429969919, + "grad_norm": 0.6230180859565735, + "learning_rate": 2.4131757966344433e-05, + "loss": 2.0508, + "num_input_tokens_seen": 1877999616, + "step": 4776 + }, + { + "epoch": 0.5421711078751289, + "grad_norm": 0.570754885673523, + "learning_rate": 2.4095954171142145e-05, + "loss": 2.0594, + "num_input_tokens_seen": 1880358912, + "step": 4782 + }, + { + "epoch": 0.5428513727532658, + "grad_norm": 0.5309892892837524, + "learning_rate": 2.406015037593985e-05, + "loss": 2.0985, + "num_input_tokens_seen": 1882718208, + "step": 4788 + }, + { + "epoch": 0.5435316376314028, + "grad_norm": 0.6809681057929993, + "learning_rate": 2.4024346580737558e-05, + "loss": 2.0498, + "num_input_tokens_seen": 1885077504, + "step": 4794 + }, + { + "epoch": 0.5442119025095397, + "grad_norm": 0.5176597237586975, + "learning_rate": 2.3988542785535266e-05, + "loss": 2.0361, + "num_input_tokens_seen": 1887436800, + "step": 4800 + }, + { + "epoch": 0.5442119025095397, + "eval_accuracy": 0.582943833943834, + "eval_loss": 2.0261168479919434, + "eval_runtime": 128.4528, + "eval_samples_per_second": 3.114, + "eval_steps_per_second": 1.043, + "num_input_tokens_seen": 1887436800, + "step": 4800 + }, + { + "epoch": 0.5448921673876765, + "grad_norm": 0.6304104924201965, + "learning_rate": 2.3952738990332978e-05, + "loss": 2.051, + "num_input_tokens_seen": 1889796096, + "step": 4806 + }, + { + "epoch": 0.5455724322658135, + "grad_norm": 0.6736240983009338, + "learning_rate": 2.3916935195130686e-05, + "loss": 2.0604, + "num_input_tokens_seen": 1892155392, + "step": 4812 + }, + { + "epoch": 0.5462526971439504, + "grad_norm": 0.6046845316886902, + "learning_rate": 2.3881131399928394e-05, + "loss": 2.0382, + "num_input_tokens_seen": 1894514688, + "step": 4818 + }, + { + "epoch": 0.5469329620220873, + "grad_norm": 0.6192537546157837, + "learning_rate": 2.3845327604726102e-05, + "loss": 2.017, + "num_input_tokens_seen": 1896873984, + "step": 4824 + }, + { + "epoch": 0.5476132269002243, + "grad_norm": 0.6411442160606384, + "learning_rate": 2.380952380952381e-05, + "loss": 2.0302, + "num_input_tokens_seen": 1899233280, + "step": 4830 + }, + { + "epoch": 0.5482934917783612, + "grad_norm": 0.5721175670623779, + "learning_rate": 2.377372001432152e-05, + "loss": 2.0338, + "num_input_tokens_seen": 1901592576, + "step": 4836 + }, + { + "epoch": 0.5489737566564982, + "grad_norm": 0.5743176937103271, + "learning_rate": 2.3737916219119226e-05, + "loss": 2.0426, + "num_input_tokens_seen": 1903951872, + "step": 4842 + }, + { + "epoch": 0.549654021534635, + "grad_norm": 0.5680631995201111, + "learning_rate": 2.3702112423916935e-05, + "loss": 1.9743, + "num_input_tokens_seen": 1906311168, + "step": 4848 + }, + { + "epoch": 0.5503342864127719, + "grad_norm": 0.5353610515594482, + "learning_rate": 2.3666308628714646e-05, + "loss": 2.0359, + "num_input_tokens_seen": 1908670464, + "step": 4854 + }, + { + "epoch": 0.5510145512909089, + "grad_norm": 0.5486804842948914, + "learning_rate": 2.3630504833512354e-05, + "loss": 2.0598, + "num_input_tokens_seen": 1911029760, + "step": 4860 + }, + { + "epoch": 0.5516948161690458, + "grad_norm": 0.5187994241714478, + "learning_rate": 2.3594701038310063e-05, + "loss": 2.0122, + "num_input_tokens_seen": 1913389056, + "step": 4866 + }, + { + "epoch": 0.5523750810471827, + "grad_norm": 0.5575273036956787, + "learning_rate": 2.355889724310777e-05, + "loss": 1.9224, + "num_input_tokens_seen": 1915748352, + "step": 4872 + }, + { + "epoch": 0.5530553459253197, + "grad_norm": 0.5445749163627625, + "learning_rate": 2.352309344790548e-05, + "loss": 2.0599, + "num_input_tokens_seen": 1918107648, + "step": 4878 + }, + { + "epoch": 0.5537356108034566, + "grad_norm": 0.555401086807251, + "learning_rate": 2.3487289652703187e-05, + "loss": 1.9883, + "num_input_tokens_seen": 1920466944, + "step": 4884 + }, + { + "epoch": 0.5544158756815936, + "grad_norm": 0.6415968537330627, + "learning_rate": 2.3451485857500895e-05, + "loss": 2.0529, + "num_input_tokens_seen": 1922826240, + "step": 4890 + }, + { + "epoch": 0.5550961405597304, + "grad_norm": 0.5108083486557007, + "learning_rate": 2.3415682062298607e-05, + "loss": 2.0445, + "num_input_tokens_seen": 1925185536, + "step": 4896 + }, + { + "epoch": 0.5557764054378673, + "grad_norm": 0.5858399868011475, + "learning_rate": 2.3379878267096315e-05, + "loss": 2.0457, + "num_input_tokens_seen": 1927544832, + "step": 4902 + }, + { + "epoch": 0.5564566703160043, + "grad_norm": 0.5217518210411072, + "learning_rate": 2.3344074471894023e-05, + "loss": 2.091, + "num_input_tokens_seen": 1929904128, + "step": 4908 + }, + { + "epoch": 0.5571369351941412, + "grad_norm": 0.596930980682373, + "learning_rate": 2.3308270676691728e-05, + "loss": 1.9999, + "num_input_tokens_seen": 1932263424, + "step": 4914 + }, + { + "epoch": 0.5578172000722782, + "grad_norm": 0.5927145481109619, + "learning_rate": 2.327246688148944e-05, + "loss": 2.0268, + "num_input_tokens_seen": 1934622720, + "step": 4920 + }, + { + "epoch": 0.5584974649504151, + "grad_norm": 0.5516043305397034, + "learning_rate": 2.3236663086287147e-05, + "loss": 1.9878, + "num_input_tokens_seen": 1936982016, + "step": 4926 + }, + { + "epoch": 0.559177729828552, + "grad_norm": 0.627019464969635, + "learning_rate": 2.3200859291084856e-05, + "loss": 2.0346, + "num_input_tokens_seen": 1939341312, + "step": 4932 + }, + { + "epoch": 0.559857994706689, + "grad_norm": 0.5447899699211121, + "learning_rate": 2.3165055495882564e-05, + "loss": 2.0513, + "num_input_tokens_seen": 1941700608, + "step": 4938 + }, + { + "epoch": 0.5605382595848258, + "grad_norm": 0.5802628397941589, + "learning_rate": 2.3129251700680275e-05, + "loss": 2.0973, + "num_input_tokens_seen": 1944059904, + "step": 4944 + }, + { + "epoch": 0.5612185244629627, + "grad_norm": 0.5924973487854004, + "learning_rate": 2.3093447905477984e-05, + "loss": 2.0682, + "num_input_tokens_seen": 1946419200, + "step": 4950 + }, + { + "epoch": 0.5618987893410997, + "grad_norm": 0.7050002813339233, + "learning_rate": 2.3057644110275688e-05, + "loss": 2.0724, + "num_input_tokens_seen": 1948778496, + "step": 4956 + }, + { + "epoch": 0.5625790542192366, + "grad_norm": 0.5495327711105347, + "learning_rate": 2.3021840315073396e-05, + "loss": 1.9644, + "num_input_tokens_seen": 1951137792, + "step": 4962 + }, + { + "epoch": 0.5632593190973736, + "grad_norm": 0.5360766649246216, + "learning_rate": 2.2986036519871108e-05, + "loss": 1.9898, + "num_input_tokens_seen": 1953497088, + "step": 4968 + }, + { + "epoch": 0.5639395839755105, + "grad_norm": 0.5336543321609497, + "learning_rate": 2.2950232724668816e-05, + "loss": 2.0032, + "num_input_tokens_seen": 1955856384, + "step": 4974 + }, + { + "epoch": 0.5646198488536474, + "grad_norm": 0.6477164626121521, + "learning_rate": 2.2914428929466524e-05, + "loss": 2.0446, + "num_input_tokens_seen": 1958215680, + "step": 4980 + }, + { + "epoch": 0.5653001137317843, + "grad_norm": 0.6914204955101013, + "learning_rate": 2.2878625134264232e-05, + "loss": 2.0031, + "num_input_tokens_seen": 1960574976, + "step": 4986 + }, + { + "epoch": 0.5659803786099212, + "grad_norm": 0.6375141143798828, + "learning_rate": 2.2842821339061944e-05, + "loss": 2.0626, + "num_input_tokens_seen": 1962934272, + "step": 4992 + }, + { + "epoch": 0.5666606434880581, + "grad_norm": 0.6484731435775757, + "learning_rate": 2.280701754385965e-05, + "loss": 1.9985, + "num_input_tokens_seen": 1965293568, + "step": 4998 + }, + { + "epoch": 0.5673409083661951, + "grad_norm": 0.6331252455711365, + "learning_rate": 2.2771213748657357e-05, + "loss": 2.0994, + "num_input_tokens_seen": 1967652864, + "step": 5004 + }, + { + "epoch": 0.568021173244332, + "grad_norm": 0.5034027695655823, + "learning_rate": 2.273540995345507e-05, + "loss": 2.0355, + "num_input_tokens_seen": 1970012160, + "step": 5010 + }, + { + "epoch": 0.568701438122469, + "grad_norm": 0.5032246708869934, + "learning_rate": 2.2699606158252777e-05, + "loss": 2.0454, + "num_input_tokens_seen": 1972371456, + "step": 5016 + }, + { + "epoch": 0.5693817030006059, + "grad_norm": 0.6073436737060547, + "learning_rate": 2.2663802363050485e-05, + "loss": 2.0395, + "num_input_tokens_seen": 1974730752, + "step": 5022 + }, + { + "epoch": 0.5700619678787427, + "grad_norm": 0.5456060767173767, + "learning_rate": 2.2627998567848193e-05, + "loss": 2.0063, + "num_input_tokens_seen": 1977090048, + "step": 5028 + }, + { + "epoch": 0.5707422327568797, + "grad_norm": 0.505669891834259, + "learning_rate": 2.25921947726459e-05, + "loss": 1.9756, + "num_input_tokens_seen": 1979449344, + "step": 5034 + }, + { + "epoch": 0.5714224976350166, + "grad_norm": 0.4892086088657379, + "learning_rate": 2.255639097744361e-05, + "loss": 2.0305, + "num_input_tokens_seen": 1981808640, + "step": 5040 + }, + { + "epoch": 0.5721027625131535, + "grad_norm": 0.5138855576515198, + "learning_rate": 2.2520587182241317e-05, + "loss": 2.0669, + "num_input_tokens_seen": 1984167936, + "step": 5046 + }, + { + "epoch": 0.5727830273912905, + "grad_norm": 0.5779445767402649, + "learning_rate": 2.2484783387039026e-05, + "loss": 2.0237, + "num_input_tokens_seen": 1986527232, + "step": 5052 + }, + { + "epoch": 0.5734632922694274, + "grad_norm": 0.6262017488479614, + "learning_rate": 2.2448979591836737e-05, + "loss": 2.0587, + "num_input_tokens_seen": 1988886528, + "step": 5058 + }, + { + "epoch": 0.5741435571475644, + "grad_norm": 0.5465866923332214, + "learning_rate": 2.2413175796634445e-05, + "loss": 2.0295, + "num_input_tokens_seen": 1991245824, + "step": 5064 + }, + { + "epoch": 0.5748238220257013, + "grad_norm": 0.653732419013977, + "learning_rate": 2.2377372001432153e-05, + "loss": 2.0325, + "num_input_tokens_seen": 1993605120, + "step": 5070 + }, + { + "epoch": 0.5755040869038381, + "grad_norm": 0.5579174160957336, + "learning_rate": 2.234156820622986e-05, + "loss": 2.0289, + "num_input_tokens_seen": 1995964416, + "step": 5076 + }, + { + "epoch": 0.5761843517819751, + "grad_norm": 0.5739534497261047, + "learning_rate": 2.230576441102757e-05, + "loss": 2.0592, + "num_input_tokens_seen": 1998323712, + "step": 5082 + }, + { + "epoch": 0.576864616660112, + "grad_norm": 0.5563738346099854, + "learning_rate": 2.2269960615825278e-05, + "loss": 2.0624, + "num_input_tokens_seen": 2000683008, + "step": 5088 + }, + { + "epoch": 0.577544881538249, + "grad_norm": 0.6019315719604492, + "learning_rate": 2.2234156820622986e-05, + "loss": 2.0466, + "num_input_tokens_seen": 2003042304, + "step": 5094 + }, + { + "epoch": 0.5782251464163859, + "grad_norm": 0.7558180689811707, + "learning_rate": 2.2198353025420694e-05, + "loss": 2.0494, + "num_input_tokens_seen": 2005401600, + "step": 5100 + }, + { + "epoch": 0.5789054112945228, + "grad_norm": 0.7054827213287354, + "learning_rate": 2.2162549230218406e-05, + "loss": 2.0022, + "num_input_tokens_seen": 2007760896, + "step": 5106 + }, + { + "epoch": 0.5795856761726598, + "grad_norm": 0.5728718042373657, + "learning_rate": 2.2126745435016114e-05, + "loss": 2.0443, + "num_input_tokens_seen": 2010120192, + "step": 5112 + }, + { + "epoch": 0.5802659410507967, + "grad_norm": 0.6196462512016296, + "learning_rate": 2.2090941639813822e-05, + "loss": 2.0502, + "num_input_tokens_seen": 2012479488, + "step": 5118 + }, + { + "epoch": 0.5809462059289335, + "grad_norm": 0.5134831070899963, + "learning_rate": 2.205513784461153e-05, + "loss": 1.9629, + "num_input_tokens_seen": 2014838784, + "step": 5124 + }, + { + "epoch": 0.5816264708070705, + "grad_norm": 0.5899059176445007, + "learning_rate": 2.201933404940924e-05, + "loss": 2.0479, + "num_input_tokens_seen": 2017198080, + "step": 5130 + }, + { + "epoch": 0.5823067356852074, + "grad_norm": 0.5743027329444885, + "learning_rate": 2.1983530254206947e-05, + "loss": 2.0633, + "num_input_tokens_seen": 2019557376, + "step": 5136 + }, + { + "epoch": 0.5829870005633444, + "grad_norm": 0.6599460244178772, + "learning_rate": 2.1947726459004655e-05, + "loss": 2.0429, + "num_input_tokens_seen": 2021916672, + "step": 5142 + }, + { + "epoch": 0.5836672654414813, + "grad_norm": 0.5353802442550659, + "learning_rate": 2.1911922663802363e-05, + "loss": 2.0422, + "num_input_tokens_seen": 2024275968, + "step": 5148 + }, + { + "epoch": 0.5843475303196182, + "grad_norm": 0.6173189282417297, + "learning_rate": 2.1876118868600074e-05, + "loss": 2.0345, + "num_input_tokens_seen": 2026635264, + "step": 5154 + }, + { + "epoch": 0.5850277951977552, + "grad_norm": 0.5757061839103699, + "learning_rate": 2.1840315073397783e-05, + "loss": 2.061, + "num_input_tokens_seen": 2028994560, + "step": 5160 + }, + { + "epoch": 0.585708060075892, + "grad_norm": 0.537259042263031, + "learning_rate": 2.1804511278195487e-05, + "loss": 2.067, + "num_input_tokens_seen": 2031353856, + "step": 5166 + }, + { + "epoch": 0.5863883249540289, + "grad_norm": 0.6066441535949707, + "learning_rate": 2.17687074829932e-05, + "loss": 2.0059, + "num_input_tokens_seen": 2033713152, + "step": 5172 + }, + { + "epoch": 0.5870685898321659, + "grad_norm": 0.5652830600738525, + "learning_rate": 2.1732903687790907e-05, + "loss": 2.0117, + "num_input_tokens_seen": 2036072448, + "step": 5178 + }, + { + "epoch": 0.5877488547103028, + "grad_norm": 0.5732890367507935, + "learning_rate": 2.1697099892588615e-05, + "loss": 2.0529, + "num_input_tokens_seen": 2038431744, + "step": 5184 + }, + { + "epoch": 0.5884291195884398, + "grad_norm": 0.5537346601486206, + "learning_rate": 2.1661296097386323e-05, + "loss": 2.0331, + "num_input_tokens_seen": 2040791040, + "step": 5190 + }, + { + "epoch": 0.5891093844665767, + "grad_norm": 0.5528678894042969, + "learning_rate": 2.1625492302184035e-05, + "loss": 2.057, + "num_input_tokens_seen": 2043150336, + "step": 5196 + }, + { + "epoch": 0.5895628943853346, + "eval_accuracy": 0.5838162393162393, + "eval_loss": 2.0207207202911377, + "eval_runtime": 128.1654, + "eval_samples_per_second": 3.121, + "eval_steps_per_second": 1.046, + "num_input_tokens_seen": 2044723200, + "step": 5200 + }, + { + "epoch": 0.5897896493447136, + "grad_norm": 0.6346768140792847, + "learning_rate": 2.1589688506981743e-05, + "loss": 2.0408, + "num_input_tokens_seen": 2045509632, + "step": 5202 + }, + { + "epoch": 0.5904699142228506, + "grad_norm": 0.64287930727005, + "learning_rate": 2.1553884711779448e-05, + "loss": 1.9892, + "num_input_tokens_seen": 2047868928, + "step": 5208 + }, + { + "epoch": 0.5911501791009874, + "grad_norm": 0.5813894271850586, + "learning_rate": 2.1518080916577156e-05, + "loss": 2.041, + "num_input_tokens_seen": 2050228224, + "step": 5214 + }, + { + "epoch": 0.5918304439791243, + "grad_norm": 0.5855168104171753, + "learning_rate": 2.1482277121374868e-05, + "loss": 1.9966, + "num_input_tokens_seen": 2052587520, + "step": 5220 + }, + { + "epoch": 0.5925107088572613, + "grad_norm": 0.5594427585601807, + "learning_rate": 2.1446473326172576e-05, + "loss": 2.0597, + "num_input_tokens_seen": 2054946816, + "step": 5226 + }, + { + "epoch": 0.5931909737353982, + "grad_norm": 0.5417963266372681, + "learning_rate": 2.1410669530970284e-05, + "loss": 2.0369, + "num_input_tokens_seen": 2057306112, + "step": 5232 + }, + { + "epoch": 0.5938712386135352, + "grad_norm": 0.5939909815788269, + "learning_rate": 2.1374865735767992e-05, + "loss": 2.0407, + "num_input_tokens_seen": 2059665408, + "step": 5238 + }, + { + "epoch": 0.5945515034916721, + "grad_norm": 0.5187436938285828, + "learning_rate": 2.1339061940565704e-05, + "loss": 2.0203, + "num_input_tokens_seen": 2062024704, + "step": 5244 + }, + { + "epoch": 0.595231768369809, + "grad_norm": 0.5559435486793518, + "learning_rate": 2.130325814536341e-05, + "loss": 1.9577, + "num_input_tokens_seen": 2064384000, + "step": 5250 + }, + { + "epoch": 0.595912033247946, + "grad_norm": 0.5728279948234558, + "learning_rate": 2.1267454350161117e-05, + "loss": 1.9924, + "num_input_tokens_seen": 2066743296, + "step": 5256 + }, + { + "epoch": 0.5965922981260828, + "grad_norm": 0.5480873584747314, + "learning_rate": 2.1231650554958825e-05, + "loss": 2.0437, + "num_input_tokens_seen": 2069102592, + "step": 5262 + }, + { + "epoch": 0.5972725630042197, + "grad_norm": 0.5689738392829895, + "learning_rate": 2.1195846759756536e-05, + "loss": 2.0233, + "num_input_tokens_seen": 2071461888, + "step": 5268 + }, + { + "epoch": 0.5979528278823567, + "grad_norm": 0.5396390557289124, + "learning_rate": 2.1160042964554244e-05, + "loss": 2.0224, + "num_input_tokens_seen": 2073821184, + "step": 5274 + }, + { + "epoch": 0.5986330927604936, + "grad_norm": 0.6151924133300781, + "learning_rate": 2.1124239169351953e-05, + "loss": 2.0475, + "num_input_tokens_seen": 2076180480, + "step": 5280 + }, + { + "epoch": 0.5993133576386306, + "grad_norm": 0.5532135367393494, + "learning_rate": 2.108843537414966e-05, + "loss": 2.0383, + "num_input_tokens_seen": 2078539776, + "step": 5286 + }, + { + "epoch": 0.5999936225167675, + "grad_norm": 0.5925424695014954, + "learning_rate": 2.105263157894737e-05, + "loss": 2.001, + "num_input_tokens_seen": 2080899072, + "step": 5292 + }, + { + "epoch": 0.6006738873949043, + "grad_norm": 0.5219939947128296, + "learning_rate": 2.1016827783745077e-05, + "loss": 2.0261, + "num_input_tokens_seen": 2083258368, + "step": 5298 + }, + { + "epoch": 0.6013541522730413, + "grad_norm": 0.5546817779541016, + "learning_rate": 2.0981023988542785e-05, + "loss": 2.0535, + "num_input_tokens_seen": 2085617664, + "step": 5304 + }, + { + "epoch": 0.6020344171511782, + "grad_norm": 0.5580465793609619, + "learning_rate": 2.0945220193340497e-05, + "loss": 2.0498, + "num_input_tokens_seen": 2087976960, + "step": 5310 + }, + { + "epoch": 0.6027146820293152, + "grad_norm": 0.5796703696250916, + "learning_rate": 2.0909416398138205e-05, + "loss": 2.0512, + "num_input_tokens_seen": 2090336256, + "step": 5316 + }, + { + "epoch": 0.6033949469074521, + "grad_norm": 0.6007897257804871, + "learning_rate": 2.0873612602935913e-05, + "loss": 2.0236, + "num_input_tokens_seen": 2092695552, + "step": 5322 + }, + { + "epoch": 0.604075211785589, + "grad_norm": 0.5929319262504578, + "learning_rate": 2.083780880773362e-05, + "loss": 2.0342, + "num_input_tokens_seen": 2095054848, + "step": 5328 + }, + { + "epoch": 0.604755476663726, + "grad_norm": 0.6018472909927368, + "learning_rate": 2.080200501253133e-05, + "loss": 1.9576, + "num_input_tokens_seen": 2097414144, + "step": 5334 + }, + { + "epoch": 0.6054357415418629, + "grad_norm": 0.546468198299408, + "learning_rate": 2.0766201217329038e-05, + "loss": 2.0191, + "num_input_tokens_seen": 2099773440, + "step": 5340 + }, + { + "epoch": 0.6061160064199997, + "grad_norm": 0.5679252743721008, + "learning_rate": 2.0730397422126746e-05, + "loss": 2.0272, + "num_input_tokens_seen": 2102132736, + "step": 5346 + }, + { + "epoch": 0.6067962712981367, + "grad_norm": 0.5521571040153503, + "learning_rate": 2.0694593626924454e-05, + "loss": 2.0423, + "num_input_tokens_seen": 2104492032, + "step": 5352 + }, + { + "epoch": 0.6074765361762736, + "grad_norm": 0.597798228263855, + "learning_rate": 2.0658789831722165e-05, + "loss": 2.0251, + "num_input_tokens_seen": 2106851328, + "step": 5358 + }, + { + "epoch": 0.6081568010544106, + "grad_norm": 0.557311475276947, + "learning_rate": 2.0622986036519874e-05, + "loss": 2.0391, + "num_input_tokens_seen": 2109210624, + "step": 5364 + }, + { + "epoch": 0.6088370659325475, + "grad_norm": 0.6349292397499084, + "learning_rate": 2.0587182241317582e-05, + "loss": 2.0279, + "num_input_tokens_seen": 2111569920, + "step": 5370 + }, + { + "epoch": 0.6095173308106844, + "grad_norm": 0.5559099912643433, + "learning_rate": 2.0551378446115287e-05, + "loss": 2.01, + "num_input_tokens_seen": 2113929216, + "step": 5376 + }, + { + "epoch": 0.6101975956888214, + "grad_norm": 0.6606104969978333, + "learning_rate": 2.0515574650912998e-05, + "loss": 2.058, + "num_input_tokens_seen": 2116288512, + "step": 5382 + }, + { + "epoch": 0.6108778605669583, + "grad_norm": 0.5794185996055603, + "learning_rate": 2.0479770855710706e-05, + "loss": 2.0456, + "num_input_tokens_seen": 2118647808, + "step": 5388 + }, + { + "epoch": 0.6115581254450951, + "grad_norm": 0.5368028879165649, + "learning_rate": 2.0443967060508414e-05, + "loss": 2.003, + "num_input_tokens_seen": 2121007104, + "step": 5394 + }, + { + "epoch": 0.6122383903232321, + "grad_norm": 0.6016758680343628, + "learning_rate": 2.0408163265306123e-05, + "loss": 2.0776, + "num_input_tokens_seen": 2123366400, + "step": 5400 + }, + { + "epoch": 0.612918655201369, + "grad_norm": 0.5522080659866333, + "learning_rate": 2.0372359470103834e-05, + "loss": 2.0433, + "num_input_tokens_seen": 2125725696, + "step": 5406 + }, + { + "epoch": 0.613598920079506, + "grad_norm": 0.622534990310669, + "learning_rate": 2.0336555674901542e-05, + "loss": 2.0225, + "num_input_tokens_seen": 2128084992, + "step": 5412 + }, + { + "epoch": 0.6142791849576429, + "grad_norm": 0.5342540144920349, + "learning_rate": 2.0300751879699247e-05, + "loss": 2.0334, + "num_input_tokens_seen": 2130444288, + "step": 5418 + }, + { + "epoch": 0.6149594498357798, + "grad_norm": 0.5722180604934692, + "learning_rate": 2.026494808449696e-05, + "loss": 2.0581, + "num_input_tokens_seen": 2132803584, + "step": 5424 + }, + { + "epoch": 0.6156397147139168, + "grad_norm": 0.6203873157501221, + "learning_rate": 2.0229144289294667e-05, + "loss": 2.0202, + "num_input_tokens_seen": 2135162880, + "step": 5430 + }, + { + "epoch": 0.6163199795920536, + "grad_norm": 0.5414242148399353, + "learning_rate": 2.0193340494092375e-05, + "loss": 2.0302, + "num_input_tokens_seen": 2137522176, + "step": 5436 + }, + { + "epoch": 0.6170002444701905, + "grad_norm": 0.6533239483833313, + "learning_rate": 2.0157536698890083e-05, + "loss": 2.046, + "num_input_tokens_seen": 2139881472, + "step": 5442 + }, + { + "epoch": 0.6176805093483275, + "grad_norm": 0.647675096988678, + "learning_rate": 2.012173290368779e-05, + "loss": 2.0384, + "num_input_tokens_seen": 2142240768, + "step": 5448 + }, + { + "epoch": 0.6183607742264644, + "grad_norm": 0.7028170228004456, + "learning_rate": 2.0085929108485503e-05, + "loss": 2.0393, + "num_input_tokens_seen": 2144600064, + "step": 5454 + }, + { + "epoch": 0.6190410391046014, + "grad_norm": 0.6044926047325134, + "learning_rate": 2.0050125313283208e-05, + "loss": 2.0437, + "num_input_tokens_seen": 2146959360, + "step": 5460 + }, + { + "epoch": 0.6197213039827383, + "grad_norm": 0.5385059714317322, + "learning_rate": 2.0014321518080916e-05, + "loss": 2.016, + "num_input_tokens_seen": 2149318656, + "step": 5466 + }, + { + "epoch": 0.6204015688608752, + "grad_norm": 0.6397769451141357, + "learning_rate": 1.9978517722878627e-05, + "loss": 2.0065, + "num_input_tokens_seen": 2151677952, + "step": 5472 + }, + { + "epoch": 0.6210818337390122, + "grad_norm": 0.6114969253540039, + "learning_rate": 1.9942713927676335e-05, + "loss": 2.0456, + "num_input_tokens_seen": 2154037248, + "step": 5478 + }, + { + "epoch": 0.621762098617149, + "grad_norm": 0.585610568523407, + "learning_rate": 1.9906910132474044e-05, + "loss": 2.0395, + "num_input_tokens_seen": 2156396544, + "step": 5484 + }, + { + "epoch": 0.6224423634952859, + "grad_norm": 0.5144538879394531, + "learning_rate": 1.987110633727175e-05, + "loss": 2.0605, + "num_input_tokens_seen": 2158755840, + "step": 5490 + }, + { + "epoch": 0.6231226283734229, + "grad_norm": 0.5797079205513, + "learning_rate": 1.9835302542069463e-05, + "loss": 2.0206, + "num_input_tokens_seen": 2161115136, + "step": 5496 + }, + { + "epoch": 0.6238028932515598, + "grad_norm": 0.49237367510795593, + "learning_rate": 1.9799498746867168e-05, + "loss": 1.9534, + "num_input_tokens_seen": 2163474432, + "step": 5502 + }, + { + "epoch": 0.6244831581296968, + "grad_norm": 0.5623591542243958, + "learning_rate": 1.9763694951664876e-05, + "loss": 2.0229, + "num_input_tokens_seen": 2165833728, + "step": 5508 + }, + { + "epoch": 0.6251634230078337, + "grad_norm": 0.5550060868263245, + "learning_rate": 1.9727891156462584e-05, + "loss": 2.0442, + "num_input_tokens_seen": 2168193024, + "step": 5514 + }, + { + "epoch": 0.6258436878859706, + "grad_norm": 0.5482363104820251, + "learning_rate": 1.9692087361260296e-05, + "loss": 2.0418, + "num_input_tokens_seen": 2170552320, + "step": 5520 + }, + { + "epoch": 0.6265239527641076, + "grad_norm": 0.612348198890686, + "learning_rate": 1.9656283566058004e-05, + "loss": 1.9943, + "num_input_tokens_seen": 2172911616, + "step": 5526 + }, + { + "epoch": 0.6272042176422444, + "grad_norm": 0.5311436057090759, + "learning_rate": 1.9620479770855712e-05, + "loss": 2.0369, + "num_input_tokens_seen": 2175270912, + "step": 5532 + }, + { + "epoch": 0.6278844825203814, + "grad_norm": 0.5449828505516052, + "learning_rate": 1.958467597565342e-05, + "loss": 2.0169, + "num_input_tokens_seen": 2177630208, + "step": 5538 + }, + { + "epoch": 0.6285647473985183, + "grad_norm": 0.5630024671554565, + "learning_rate": 1.954887218045113e-05, + "loss": 2.0441, + "num_input_tokens_seen": 2179989504, + "step": 5544 + }, + { + "epoch": 0.6292450122766552, + "grad_norm": 0.5129250288009644, + "learning_rate": 1.9513068385248837e-05, + "loss": 1.9904, + "num_input_tokens_seen": 2182348800, + "step": 5550 + }, + { + "epoch": 0.6299252771547922, + "grad_norm": 0.6263514161109924, + "learning_rate": 1.9477264590046545e-05, + "loss": 2.0084, + "num_input_tokens_seen": 2184708096, + "step": 5556 + }, + { + "epoch": 0.6306055420329291, + "grad_norm": 0.5702618360519409, + "learning_rate": 1.9441460794844253e-05, + "loss": 2.0487, + "num_input_tokens_seen": 2187067392, + "step": 5562 + }, + { + "epoch": 0.631285806911066, + "grad_norm": 0.6130457520484924, + "learning_rate": 1.9405656999641965e-05, + "loss": 2.047, + "num_input_tokens_seen": 2189426688, + "step": 5568 + }, + { + "epoch": 0.6319660717892029, + "grad_norm": 0.5357660055160522, + "learning_rate": 1.9369853204439673e-05, + "loss": 1.9605, + "num_input_tokens_seen": 2191785984, + "step": 5574 + }, + { + "epoch": 0.6326463366673398, + "grad_norm": 0.515099287033081, + "learning_rate": 1.933404940923738e-05, + "loss": 1.9558, + "num_input_tokens_seen": 2194145280, + "step": 5580 + }, + { + "epoch": 0.6333266015454768, + "grad_norm": 0.5823177695274353, + "learning_rate": 1.929824561403509e-05, + "loss": 2.0505, + "num_input_tokens_seen": 2196504576, + "step": 5586 + }, + { + "epoch": 0.6340068664236137, + "grad_norm": 0.5302935838699341, + "learning_rate": 1.9262441818832797e-05, + "loss": 2.0165, + "num_input_tokens_seen": 2198863872, + "step": 5592 + }, + { + "epoch": 0.6346871313017506, + "grad_norm": 0.6928517818450928, + "learning_rate": 1.9226638023630505e-05, + "loss": 2.0234, + "num_input_tokens_seen": 2201223168, + "step": 5598 + }, + { + "epoch": 0.6349138862611295, + "eval_accuracy": 0.5844572649572649, + "eval_loss": 2.0162546634674072, + "eval_runtime": 129.1053, + "eval_samples_per_second": 3.098, + "eval_steps_per_second": 1.038, + "num_input_tokens_seen": 2202009600, + "step": 5600 + }, + { + "epoch": 0.6353673961798876, + "grad_norm": 0.6656593084335327, + "learning_rate": 1.9190834228428213e-05, + "loss": 2.0049, + "num_input_tokens_seen": 2203582464, + "step": 5604 + }, + { + "epoch": 0.6360476610580245, + "grad_norm": 0.5688000917434692, + "learning_rate": 1.9155030433225925e-05, + "loss": 2.0513, + "num_input_tokens_seen": 2205941760, + "step": 5610 + }, + { + "epoch": 0.6367279259361613, + "grad_norm": 0.5396568179130554, + "learning_rate": 1.9119226638023633e-05, + "loss": 2.0563, + "num_input_tokens_seen": 2208301056, + "step": 5616 + }, + { + "epoch": 0.6374081908142983, + "grad_norm": 0.6275331974029541, + "learning_rate": 1.908342284282134e-05, + "loss": 2.0474, + "num_input_tokens_seen": 2210660352, + "step": 5622 + }, + { + "epoch": 0.6380884556924352, + "grad_norm": 0.53306645154953, + "learning_rate": 1.9047619047619046e-05, + "loss": 2.0287, + "num_input_tokens_seen": 2213019648, + "step": 5628 + }, + { + "epoch": 0.6387687205705722, + "grad_norm": 0.5248289704322815, + "learning_rate": 1.9011815252416758e-05, + "loss": 1.9989, + "num_input_tokens_seen": 2215378944, + "step": 5634 + }, + { + "epoch": 0.6394489854487091, + "grad_norm": 0.5750051140785217, + "learning_rate": 1.8976011457214466e-05, + "loss": 2.103, + "num_input_tokens_seen": 2217738240, + "step": 5640 + }, + { + "epoch": 0.640129250326846, + "grad_norm": 0.6214660406112671, + "learning_rate": 1.8940207662012174e-05, + "loss": 2.0476, + "num_input_tokens_seen": 2220097536, + "step": 5646 + }, + { + "epoch": 0.640809515204983, + "grad_norm": 0.6341087222099304, + "learning_rate": 1.8904403866809882e-05, + "loss": 2.0424, + "num_input_tokens_seen": 2222456832, + "step": 5652 + }, + { + "epoch": 0.6414897800831199, + "grad_norm": 0.5591261386871338, + "learning_rate": 1.8868600071607594e-05, + "loss": 2.0624, + "num_input_tokens_seen": 2224816128, + "step": 5658 + }, + { + "epoch": 0.6421700449612567, + "grad_norm": 0.538550615310669, + "learning_rate": 1.8832796276405302e-05, + "loss": 2.0207, + "num_input_tokens_seen": 2227175424, + "step": 5664 + }, + { + "epoch": 0.6428503098393937, + "grad_norm": 0.5393409729003906, + "learning_rate": 1.8796992481203007e-05, + "loss": 2.0775, + "num_input_tokens_seen": 2229534720, + "step": 5670 + }, + { + "epoch": 0.6435305747175306, + "grad_norm": 0.6216705441474915, + "learning_rate": 1.8761188686000715e-05, + "loss": 2.0119, + "num_input_tokens_seen": 2231894016, + "step": 5676 + }, + { + "epoch": 0.6442108395956676, + "grad_norm": 0.7083945274353027, + "learning_rate": 1.8725384890798426e-05, + "loss": 2.0221, + "num_input_tokens_seen": 2234253312, + "step": 5682 + }, + { + "epoch": 0.6448911044738045, + "grad_norm": 0.6334338784217834, + "learning_rate": 1.8689581095596134e-05, + "loss": 2.025, + "num_input_tokens_seen": 2236612608, + "step": 5688 + }, + { + "epoch": 0.6455713693519414, + "grad_norm": 0.8451031446456909, + "learning_rate": 1.8653777300393843e-05, + "loss": 2.0488, + "num_input_tokens_seen": 2238971904, + "step": 5694 + }, + { + "epoch": 0.6462516342300784, + "grad_norm": 0.6180372834205627, + "learning_rate": 1.861797350519155e-05, + "loss": 2.0468, + "num_input_tokens_seen": 2241331200, + "step": 5700 + }, + { + "epoch": 0.6469318991082152, + "grad_norm": 0.5460434556007385, + "learning_rate": 1.8582169709989262e-05, + "loss": 2.0523, + "num_input_tokens_seen": 2243690496, + "step": 5706 + }, + { + "epoch": 0.6476121639863522, + "grad_norm": 0.54164719581604, + "learning_rate": 1.8546365914786967e-05, + "loss": 2.0406, + "num_input_tokens_seen": 2246049792, + "step": 5712 + }, + { + "epoch": 0.6482924288644891, + "grad_norm": 0.5062336921691895, + "learning_rate": 1.8510562119584675e-05, + "loss": 2.0353, + "num_input_tokens_seen": 2248409088, + "step": 5718 + }, + { + "epoch": 0.648972693742626, + "grad_norm": 0.5514745712280273, + "learning_rate": 1.8474758324382387e-05, + "loss": 2.0479, + "num_input_tokens_seen": 2250768384, + "step": 5724 + }, + { + "epoch": 0.649652958620763, + "grad_norm": 0.6137591600418091, + "learning_rate": 1.8438954529180095e-05, + "loss": 2.0144, + "num_input_tokens_seen": 2253127680, + "step": 5730 + }, + { + "epoch": 0.6503332234988999, + "grad_norm": 0.6638593077659607, + "learning_rate": 1.8403150733977803e-05, + "loss": 2.0213, + "num_input_tokens_seen": 2255486976, + "step": 5736 + }, + { + "epoch": 0.6510134883770368, + "grad_norm": 0.5813100337982178, + "learning_rate": 1.836734693877551e-05, + "loss": 1.9889, + "num_input_tokens_seen": 2257846272, + "step": 5742 + }, + { + "epoch": 0.6516937532551738, + "grad_norm": 0.5840685963630676, + "learning_rate": 1.833154314357322e-05, + "loss": 1.9623, + "num_input_tokens_seen": 2260205568, + "step": 5748 + }, + { + "epoch": 0.6523740181333106, + "grad_norm": 0.6277685761451721, + "learning_rate": 1.8295739348370928e-05, + "loss": 2.0894, + "num_input_tokens_seen": 2262564864, + "step": 5754 + }, + { + "epoch": 0.6530542830114476, + "grad_norm": 0.5178935527801514, + "learning_rate": 1.8259935553168636e-05, + "loss": 2.0359, + "num_input_tokens_seen": 2264924160, + "step": 5760 + }, + { + "epoch": 0.6537345478895845, + "grad_norm": 0.5926587581634521, + "learning_rate": 1.8224131757966344e-05, + "loss": 1.9869, + "num_input_tokens_seen": 2267283456, + "step": 5766 + }, + { + "epoch": 0.6544148127677214, + "grad_norm": 0.5860345959663391, + "learning_rate": 1.8188327962764055e-05, + "loss": 2.0953, + "num_input_tokens_seen": 2269642752, + "step": 5772 + }, + { + "epoch": 0.6550950776458584, + "grad_norm": 0.5178937315940857, + "learning_rate": 1.8152524167561764e-05, + "loss": 2.0017, + "num_input_tokens_seen": 2272002048, + "step": 5778 + }, + { + "epoch": 0.6557753425239953, + "grad_norm": 0.5575287938117981, + "learning_rate": 1.8116720372359472e-05, + "loss": 1.9847, + "num_input_tokens_seen": 2274361344, + "step": 5784 + }, + { + "epoch": 0.6564556074021322, + "grad_norm": 0.7367034554481506, + "learning_rate": 1.808091657715718e-05, + "loss": 2.0555, + "num_input_tokens_seen": 2276720640, + "step": 5790 + }, + { + "epoch": 0.6571358722802692, + "grad_norm": 0.5715209245681763, + "learning_rate": 1.8045112781954888e-05, + "loss": 2.0645, + "num_input_tokens_seen": 2279079936, + "step": 5796 + }, + { + "epoch": 0.657816137158406, + "grad_norm": 0.5563541054725647, + "learning_rate": 1.8009308986752596e-05, + "loss": 1.9926, + "num_input_tokens_seen": 2281439232, + "step": 5802 + }, + { + "epoch": 0.658496402036543, + "grad_norm": 0.6194490194320679, + "learning_rate": 1.7973505191550304e-05, + "loss": 2.0258, + "num_input_tokens_seen": 2283798528, + "step": 5808 + }, + { + "epoch": 0.6591766669146799, + "grad_norm": 0.548372209072113, + "learning_rate": 1.7937701396348013e-05, + "loss": 2.0432, + "num_input_tokens_seen": 2286157824, + "step": 5814 + }, + { + "epoch": 0.6598569317928168, + "grad_norm": 0.5206018090248108, + "learning_rate": 1.7901897601145724e-05, + "loss": 2.0205, + "num_input_tokens_seen": 2288517120, + "step": 5820 + }, + { + "epoch": 0.6605371966709538, + "grad_norm": 0.5887606739997864, + "learning_rate": 1.7866093805943432e-05, + "loss": 2.0464, + "num_input_tokens_seen": 2290876416, + "step": 5826 + }, + { + "epoch": 0.6612174615490907, + "grad_norm": 0.5260190367698669, + "learning_rate": 1.7830290010741137e-05, + "loss": 2.0031, + "num_input_tokens_seen": 2293235712, + "step": 5832 + }, + { + "epoch": 0.6618977264272276, + "grad_norm": 0.5165619850158691, + "learning_rate": 1.779448621553885e-05, + "loss": 2.0114, + "num_input_tokens_seen": 2295595008, + "step": 5838 + }, + { + "epoch": 0.6625779913053645, + "grad_norm": 0.5297482013702393, + "learning_rate": 1.7758682420336557e-05, + "loss": 2.0397, + "num_input_tokens_seen": 2297954304, + "step": 5844 + }, + { + "epoch": 0.6632582561835014, + "grad_norm": 0.5319347977638245, + "learning_rate": 1.7722878625134265e-05, + "loss": 1.9881, + "num_input_tokens_seen": 2300313600, + "step": 5850 + }, + { + "epoch": 0.6639385210616384, + "grad_norm": 0.5887789726257324, + "learning_rate": 1.7687074829931973e-05, + "loss": 2.0377, + "num_input_tokens_seen": 2302672896, + "step": 5856 + }, + { + "epoch": 0.6646187859397753, + "grad_norm": 0.5922223925590515, + "learning_rate": 1.765127103472968e-05, + "loss": 2.0109, + "num_input_tokens_seen": 2305032192, + "step": 5862 + }, + { + "epoch": 0.6652990508179122, + "grad_norm": 0.5542311072349548, + "learning_rate": 1.7615467239527393e-05, + "loss": 2.0284, + "num_input_tokens_seen": 2307391488, + "step": 5868 + }, + { + "epoch": 0.6659793156960492, + "grad_norm": 0.5875877141952515, + "learning_rate": 1.7579663444325098e-05, + "loss": 1.9765, + "num_input_tokens_seen": 2309750784, + "step": 5874 + }, + { + "epoch": 0.6666595805741861, + "grad_norm": 0.5613130927085876, + "learning_rate": 1.7543859649122806e-05, + "loss": 2.0177, + "num_input_tokens_seen": 2312110080, + "step": 5880 + }, + { + "epoch": 0.6673398454523229, + "grad_norm": 0.6007198691368103, + "learning_rate": 1.7508055853920517e-05, + "loss": 2.0009, + "num_input_tokens_seen": 2314469376, + "step": 5886 + }, + { + "epoch": 0.6680201103304599, + "grad_norm": 0.5905076861381531, + "learning_rate": 1.7472252058718225e-05, + "loss": 2.0161, + "num_input_tokens_seen": 2316828672, + "step": 5892 + }, + { + "epoch": 0.6687003752085968, + "grad_norm": 0.5711119771003723, + "learning_rate": 1.7436448263515934e-05, + "loss": 2.0024, + "num_input_tokens_seen": 2319187968, + "step": 5898 + }, + { + "epoch": 0.6693806400867338, + "grad_norm": 0.5723757147789001, + "learning_rate": 1.7400644468313642e-05, + "loss": 2.0639, + "num_input_tokens_seen": 2321547264, + "step": 5904 + }, + { + "epoch": 0.6700609049648707, + "grad_norm": 0.6024678349494934, + "learning_rate": 1.7364840673111353e-05, + "loss": 1.9558, + "num_input_tokens_seen": 2323906560, + "step": 5910 + }, + { + "epoch": 0.6707411698430076, + "grad_norm": 0.5753939151763916, + "learning_rate": 1.7329036877909058e-05, + "loss": 2.066, + "num_input_tokens_seen": 2326265856, + "step": 5916 + }, + { + "epoch": 0.6714214347211446, + "grad_norm": 0.6269899606704712, + "learning_rate": 1.7293233082706766e-05, + "loss": 1.9689, + "num_input_tokens_seen": 2328625152, + "step": 5922 + }, + { + "epoch": 0.6721016995992815, + "grad_norm": 0.5666351318359375, + "learning_rate": 1.7257429287504474e-05, + "loss": 1.9687, + "num_input_tokens_seen": 2330984448, + "step": 5928 + }, + { + "epoch": 0.6727819644774184, + "grad_norm": 0.5650635957717896, + "learning_rate": 1.7221625492302186e-05, + "loss": 2.0503, + "num_input_tokens_seen": 2333343744, + "step": 5934 + }, + { + "epoch": 0.6734622293555553, + "grad_norm": 0.5674002766609192, + "learning_rate": 1.7185821697099894e-05, + "loss": 2.0113, + "num_input_tokens_seen": 2335703040, + "step": 5940 + }, + { + "epoch": 0.6741424942336922, + "grad_norm": 0.5696431398391724, + "learning_rate": 1.7150017901897602e-05, + "loss": 2.0004, + "num_input_tokens_seen": 2338062336, + "step": 5946 + }, + { + "epoch": 0.6748227591118292, + "grad_norm": 0.5952620506286621, + "learning_rate": 1.711421410669531e-05, + "loss": 2.0284, + "num_input_tokens_seen": 2340421632, + "step": 5952 + }, + { + "epoch": 0.6755030239899661, + "grad_norm": 0.5755632519721985, + "learning_rate": 1.707841031149302e-05, + "loss": 2.0401, + "num_input_tokens_seen": 2342780928, + "step": 5958 + }, + { + "epoch": 0.676183288868103, + "grad_norm": 0.5462335348129272, + "learning_rate": 1.7042606516290727e-05, + "loss": 2.0531, + "num_input_tokens_seen": 2345140224, + "step": 5964 + }, + { + "epoch": 0.67686355374624, + "grad_norm": 0.5699030756950378, + "learning_rate": 1.7006802721088435e-05, + "loss": 2.0432, + "num_input_tokens_seen": 2347499520, + "step": 5970 + }, + { + "epoch": 0.6775438186243768, + "grad_norm": 0.546146035194397, + "learning_rate": 1.6970998925886143e-05, + "loss": 2.0535, + "num_input_tokens_seen": 2349858816, + "step": 5976 + }, + { + "epoch": 0.6782240835025138, + "grad_norm": 0.5467692613601685, + "learning_rate": 1.6935195130683855e-05, + "loss": 2.054, + "num_input_tokens_seen": 2352218112, + "step": 5982 + }, + { + "epoch": 0.6789043483806507, + "grad_norm": 0.5051277875900269, + "learning_rate": 1.6899391335481563e-05, + "loss": 2.0584, + "num_input_tokens_seen": 2354577408, + "step": 5988 + }, + { + "epoch": 0.6795846132587876, + "grad_norm": 0.5340428352355957, + "learning_rate": 1.686358754027927e-05, + "loss": 2.0183, + "num_input_tokens_seen": 2356936704, + "step": 5994 + }, + { + "epoch": 0.6802648781369246, + "grad_norm": 0.5289435386657715, + "learning_rate": 1.682778374507698e-05, + "loss": 2.073, + "num_input_tokens_seen": 2359296000, + "step": 6000 + }, + { + "epoch": 0.6802648781369246, + "eval_accuracy": 0.5850433455433456, + "eval_loss": 2.012032985687256, + "eval_runtime": 128.1029, + "eval_samples_per_second": 3.122, + "eval_steps_per_second": 1.046, + "num_input_tokens_seen": 2359296000, + "step": 6000 + }, + { + "epoch": 0.6809451430150615, + "grad_norm": 0.5148223638534546, + "learning_rate": 1.6791979949874687e-05, + "loss": 2.0406, + "num_input_tokens_seen": 2361655296, + "step": 6006 + }, + { + "epoch": 0.6816254078931984, + "grad_norm": 0.5186204314231873, + "learning_rate": 1.6756176154672395e-05, + "loss": 2.0097, + "num_input_tokens_seen": 2364014592, + "step": 6012 + }, + { + "epoch": 0.6823056727713354, + "grad_norm": 0.5499780774116516, + "learning_rate": 1.6720372359470104e-05, + "loss": 1.9898, + "num_input_tokens_seen": 2366373888, + "step": 6018 + }, + { + "epoch": 0.6829859376494722, + "grad_norm": 0.5377045273780823, + "learning_rate": 1.6684568564267815e-05, + "loss": 2.0034, + "num_input_tokens_seen": 2368733184, + "step": 6024 + }, + { + "epoch": 0.6836662025276092, + "grad_norm": 0.6097836494445801, + "learning_rate": 1.6648764769065523e-05, + "loss": 2.048, + "num_input_tokens_seen": 2371092480, + "step": 6030 + }, + { + "epoch": 0.6843464674057461, + "grad_norm": 0.5289618968963623, + "learning_rate": 1.661296097386323e-05, + "loss": 2.0785, + "num_input_tokens_seen": 2373451776, + "step": 6036 + }, + { + "epoch": 0.685026732283883, + "grad_norm": 0.522239089012146, + "learning_rate": 1.6577157178660936e-05, + "loss": 2.0472, + "num_input_tokens_seen": 2375811072, + "step": 6042 + }, + { + "epoch": 0.68570699716202, + "grad_norm": 0.5830532908439636, + "learning_rate": 1.6541353383458648e-05, + "loss": 1.9969, + "num_input_tokens_seen": 2378170368, + "step": 6048 + }, + { + "epoch": 0.6863872620401569, + "grad_norm": 0.5812861919403076, + "learning_rate": 1.6505549588256356e-05, + "loss": 2.0162, + "num_input_tokens_seen": 2380529664, + "step": 6054 + }, + { + "epoch": 0.6870675269182938, + "grad_norm": 0.607001543045044, + "learning_rate": 1.6469745793054064e-05, + "loss": 1.977, + "num_input_tokens_seen": 2382888960, + "step": 6060 + }, + { + "epoch": 0.6877477917964308, + "grad_norm": 0.5279623866081238, + "learning_rate": 1.6433941997851772e-05, + "loss": 2.0245, + "num_input_tokens_seen": 2385248256, + "step": 6066 + }, + { + "epoch": 0.6884280566745676, + "grad_norm": 0.49770334362983704, + "learning_rate": 1.6398138202649484e-05, + "loss": 2.0936, + "num_input_tokens_seen": 2387607552, + "step": 6072 + }, + { + "epoch": 0.6891083215527046, + "grad_norm": 0.5788572430610657, + "learning_rate": 1.6362334407447192e-05, + "loss": 2.0201, + "num_input_tokens_seen": 2389966848, + "step": 6078 + }, + { + "epoch": 0.6897885864308415, + "grad_norm": 0.5910390019416809, + "learning_rate": 1.6326530612244897e-05, + "loss": 2.0595, + "num_input_tokens_seen": 2392326144, + "step": 6084 + }, + { + "epoch": 0.6904688513089784, + "grad_norm": 0.5049883127212524, + "learning_rate": 1.6290726817042605e-05, + "loss": 2.0315, + "num_input_tokens_seen": 2394685440, + "step": 6090 + }, + { + "epoch": 0.6911491161871154, + "grad_norm": 0.6210893988609314, + "learning_rate": 1.6254923021840316e-05, + "loss": 2.0042, + "num_input_tokens_seen": 2397044736, + "step": 6096 + }, + { + "epoch": 0.6918293810652523, + "grad_norm": 0.5483914017677307, + "learning_rate": 1.6219119226638025e-05, + "loss": 1.9958, + "num_input_tokens_seen": 2399404032, + "step": 6102 + }, + { + "epoch": 0.6925096459433893, + "grad_norm": 0.5364962220191956, + "learning_rate": 1.6183315431435733e-05, + "loss": 2.0369, + "num_input_tokens_seen": 2401763328, + "step": 6108 + }, + { + "epoch": 0.6931899108215261, + "grad_norm": 0.5008904933929443, + "learning_rate": 1.614751163623344e-05, + "loss": 2.0351, + "num_input_tokens_seen": 2404122624, + "step": 6114 + }, + { + "epoch": 0.693870175699663, + "grad_norm": 0.5553967356681824, + "learning_rate": 1.6111707841031152e-05, + "loss": 2.0215, + "num_input_tokens_seen": 2406481920, + "step": 6120 + }, + { + "epoch": 0.6945504405778, + "grad_norm": 0.5240505337715149, + "learning_rate": 1.6075904045828857e-05, + "loss": 2.0129, + "num_input_tokens_seen": 2408841216, + "step": 6126 + }, + { + "epoch": 0.6952307054559369, + "grad_norm": 0.5032373070716858, + "learning_rate": 1.6040100250626565e-05, + "loss": 2.003, + "num_input_tokens_seen": 2411200512, + "step": 6132 + }, + { + "epoch": 0.6959109703340738, + "grad_norm": 0.5194136500358582, + "learning_rate": 1.6004296455424277e-05, + "loss": 1.9717, + "num_input_tokens_seen": 2413559808, + "step": 6138 + }, + { + "epoch": 0.6965912352122108, + "grad_norm": 0.6916829943656921, + "learning_rate": 1.5968492660221985e-05, + "loss": 2.0228, + "num_input_tokens_seen": 2415919104, + "step": 6144 + }, + { + "epoch": 0.6972715000903477, + "grad_norm": 0.5243424773216248, + "learning_rate": 1.5932688865019693e-05, + "loss": 2.0121, + "num_input_tokens_seen": 2418278400, + "step": 6150 + }, + { + "epoch": 0.6979517649684847, + "grad_norm": 0.6381689310073853, + "learning_rate": 1.58968850698174e-05, + "loss": 2.0556, + "num_input_tokens_seen": 2420637696, + "step": 6156 + }, + { + "epoch": 0.6986320298466215, + "grad_norm": 0.537339448928833, + "learning_rate": 1.586108127461511e-05, + "loss": 1.9974, + "num_input_tokens_seen": 2422996992, + "step": 6162 + }, + { + "epoch": 0.6993122947247584, + "grad_norm": 0.5361027717590332, + "learning_rate": 1.5825277479412818e-05, + "loss": 2.0205, + "num_input_tokens_seen": 2425356288, + "step": 6168 + }, + { + "epoch": 0.6999925596028954, + "grad_norm": 0.5492175817489624, + "learning_rate": 1.5789473684210526e-05, + "loss": 2.0168, + "num_input_tokens_seen": 2427715584, + "step": 6174 + }, + { + "epoch": 0.7006728244810323, + "grad_norm": 0.5616779923439026, + "learning_rate": 1.5753669889008234e-05, + "loss": 2.0008, + "num_input_tokens_seen": 2430074880, + "step": 6180 + }, + { + "epoch": 0.7013530893591692, + "grad_norm": 0.5484799146652222, + "learning_rate": 1.5717866093805946e-05, + "loss": 1.9296, + "num_input_tokens_seen": 2432434176, + "step": 6186 + }, + { + "epoch": 0.7020333542373062, + "grad_norm": 0.6379416584968567, + "learning_rate": 1.5682062298603654e-05, + "loss": 2.0134, + "num_input_tokens_seen": 2434793472, + "step": 6192 + }, + { + "epoch": 0.7027136191154431, + "grad_norm": 0.582015872001648, + "learning_rate": 1.5646258503401362e-05, + "loss": 2.04, + "num_input_tokens_seen": 2437152768, + "step": 6198 + }, + { + "epoch": 0.70339388399358, + "grad_norm": 0.545238196849823, + "learning_rate": 1.561045470819907e-05, + "loss": 2.0462, + "num_input_tokens_seen": 2439512064, + "step": 6204 + }, + { + "epoch": 0.7040741488717169, + "grad_norm": 0.6547414660453796, + "learning_rate": 1.5574650912996778e-05, + "loss": 2.0875, + "num_input_tokens_seen": 2441871360, + "step": 6210 + }, + { + "epoch": 0.7047544137498538, + "grad_norm": 0.5639871954917908, + "learning_rate": 1.5538847117794486e-05, + "loss": 2.0379, + "num_input_tokens_seen": 2444230656, + "step": 6216 + }, + { + "epoch": 0.7054346786279908, + "grad_norm": 0.5656554698944092, + "learning_rate": 1.5503043322592194e-05, + "loss": 2.0478, + "num_input_tokens_seen": 2446589952, + "step": 6222 + }, + { + "epoch": 0.7061149435061277, + "grad_norm": 0.5386386513710022, + "learning_rate": 1.5467239527389903e-05, + "loss": 2.0567, + "num_input_tokens_seen": 2448949248, + "step": 6228 + }, + { + "epoch": 0.7067952083842646, + "grad_norm": 0.5181793570518494, + "learning_rate": 1.5431435732187614e-05, + "loss": 2.0285, + "num_input_tokens_seen": 2451308544, + "step": 6234 + }, + { + "epoch": 0.7074754732624016, + "grad_norm": 0.5418627858161926, + "learning_rate": 1.5395631936985322e-05, + "loss": 2.0252, + "num_input_tokens_seen": 2453667840, + "step": 6240 + }, + { + "epoch": 0.7081557381405384, + "grad_norm": 0.5028561353683472, + "learning_rate": 1.535982814178303e-05, + "loss": 2.0651, + "num_input_tokens_seen": 2456027136, + "step": 6246 + }, + { + "epoch": 0.7088360030186754, + "grad_norm": 0.5279743075370789, + "learning_rate": 1.532402434658074e-05, + "loss": 1.9913, + "num_input_tokens_seen": 2458386432, + "step": 6252 + }, + { + "epoch": 0.7095162678968123, + "grad_norm": 0.5399214029312134, + "learning_rate": 1.5288220551378447e-05, + "loss": 2.0098, + "num_input_tokens_seen": 2460745728, + "step": 6258 + }, + { + "epoch": 0.7101965327749492, + "grad_norm": 0.5421512722969055, + "learning_rate": 1.5252416756176155e-05, + "loss": 2.0065, + "num_input_tokens_seen": 2463105024, + "step": 6264 + }, + { + "epoch": 0.7108767976530862, + "grad_norm": 0.5487905740737915, + "learning_rate": 1.5216612960973863e-05, + "loss": 2.0431, + "num_input_tokens_seen": 2465464320, + "step": 6270 + }, + { + "epoch": 0.7115570625312231, + "grad_norm": 0.5177443623542786, + "learning_rate": 1.5180809165771573e-05, + "loss": 2.032, + "num_input_tokens_seen": 2467823616, + "step": 6276 + }, + { + "epoch": 0.71223732740936, + "grad_norm": 0.5425601601600647, + "learning_rate": 1.5145005370569281e-05, + "loss": 2.0493, + "num_input_tokens_seen": 2470182912, + "step": 6282 + }, + { + "epoch": 0.712917592287497, + "grad_norm": 0.5033071041107178, + "learning_rate": 1.5109201575366991e-05, + "loss": 1.9629, + "num_input_tokens_seen": 2472542208, + "step": 6288 + }, + { + "epoch": 0.7135978571656338, + "grad_norm": 0.5855656862258911, + "learning_rate": 1.5073397780164697e-05, + "loss": 1.9704, + "num_input_tokens_seen": 2474901504, + "step": 6294 + }, + { + "epoch": 0.7142781220437708, + "grad_norm": 0.5889589786529541, + "learning_rate": 1.5037593984962406e-05, + "loss": 1.9958, + "num_input_tokens_seen": 2477260800, + "step": 6300 + }, + { + "epoch": 0.7149583869219077, + "grad_norm": 0.5564432740211487, + "learning_rate": 1.5001790189760115e-05, + "loss": 2.017, + "num_input_tokens_seen": 2479620096, + "step": 6306 + }, + { + "epoch": 0.7156386518000446, + "grad_norm": 0.5233476161956787, + "learning_rate": 1.4965986394557824e-05, + "loss": 2.0426, + "num_input_tokens_seen": 2481979392, + "step": 6312 + }, + { + "epoch": 0.7163189166781816, + "grad_norm": 0.5455360412597656, + "learning_rate": 1.4930182599355533e-05, + "loss": 1.9938, + "num_input_tokens_seen": 2484338688, + "step": 6318 + }, + { + "epoch": 0.7169991815563185, + "grad_norm": 0.4952971935272217, + "learning_rate": 1.4894378804153242e-05, + "loss": 1.9942, + "num_input_tokens_seen": 2486697984, + "step": 6324 + }, + { + "epoch": 0.7176794464344555, + "grad_norm": 0.5228383541107178, + "learning_rate": 1.4858575008950952e-05, + "loss": 1.9894, + "num_input_tokens_seen": 2489057280, + "step": 6330 + }, + { + "epoch": 0.7183597113125924, + "grad_norm": 0.5237627029418945, + "learning_rate": 1.4822771213748656e-05, + "loss": 2.0579, + "num_input_tokens_seen": 2491416576, + "step": 6336 + }, + { + "epoch": 0.7190399761907292, + "grad_norm": 0.541063666343689, + "learning_rate": 1.4786967418546366e-05, + "loss": 2.0231, + "num_input_tokens_seen": 2493775872, + "step": 6342 + }, + { + "epoch": 0.7197202410688662, + "grad_norm": 0.5005678534507751, + "learning_rate": 1.4751163623344074e-05, + "loss": 2.0143, + "num_input_tokens_seen": 2496135168, + "step": 6348 + }, + { + "epoch": 0.7204005059470031, + "grad_norm": 0.5186320543289185, + "learning_rate": 1.4715359828141784e-05, + "loss": 2.0299, + "num_input_tokens_seen": 2498494464, + "step": 6354 + }, + { + "epoch": 0.72108077082514, + "grad_norm": 0.5344519019126892, + "learning_rate": 1.4679556032939492e-05, + "loss": 2.0408, + "num_input_tokens_seen": 2500853760, + "step": 6360 + }, + { + "epoch": 0.721761035703277, + "grad_norm": 0.5525063276290894, + "learning_rate": 1.4643752237737202e-05, + "loss": 2.0227, + "num_input_tokens_seen": 2503213056, + "step": 6366 + }, + { + "epoch": 0.7224413005814139, + "grad_norm": 0.5622419118881226, + "learning_rate": 1.460794844253491e-05, + "loss": 2.0103, + "num_input_tokens_seen": 2505572352, + "step": 6372 + }, + { + "epoch": 0.7231215654595509, + "grad_norm": 0.5473782420158386, + "learning_rate": 1.4572144647332617e-05, + "loss": 2.0163, + "num_input_tokens_seen": 2507931648, + "step": 6378 + }, + { + "epoch": 0.7238018303376877, + "grad_norm": 0.5566105246543884, + "learning_rate": 1.4536340852130325e-05, + "loss": 2.0512, + "num_input_tokens_seen": 2510290944, + "step": 6384 + }, + { + "epoch": 0.7244820952158246, + "grad_norm": 0.5519588589668274, + "learning_rate": 1.4500537056928035e-05, + "loss": 2.0518, + "num_input_tokens_seen": 2512650240, + "step": 6390 + }, + { + "epoch": 0.7251623600939616, + "grad_norm": 0.6528738737106323, + "learning_rate": 1.4464733261725743e-05, + "loss": 2.058, + "num_input_tokens_seen": 2515009536, + "step": 6396 + }, + { + "epoch": 0.7256158700127195, + "eval_accuracy": 0.5862222222222222, + "eval_loss": 2.0074377059936523, + "eval_runtime": 129.8788, + "eval_samples_per_second": 3.08, + "eval_steps_per_second": 1.032, + "num_input_tokens_seen": 2516582400, + "step": 6400 + }, + { + "epoch": 0.7258426249720985, + "grad_norm": 0.5393079519271851, + "learning_rate": 1.4428929466523453e-05, + "loss": 1.9774, + "num_input_tokens_seen": 2517368832, + "step": 6402 + }, + { + "epoch": 0.7265228898502354, + "grad_norm": 0.6304501295089722, + "learning_rate": 1.4393125671321161e-05, + "loss": 2.0694, + "num_input_tokens_seen": 2519728128, + "step": 6408 + }, + { + "epoch": 0.7272031547283724, + "grad_norm": 0.5596165060997009, + "learning_rate": 1.435732187611887e-05, + "loss": 2.0378, + "num_input_tokens_seen": 2522087424, + "step": 6414 + }, + { + "epoch": 0.7278834196065093, + "grad_norm": 0.5285012125968933, + "learning_rate": 1.4321518080916577e-05, + "loss": 2.0108, + "num_input_tokens_seen": 2524446720, + "step": 6420 + }, + { + "epoch": 0.7285636844846463, + "grad_norm": 0.5212527513504028, + "learning_rate": 1.4285714285714285e-05, + "loss": 2.0467, + "num_input_tokens_seen": 2526806016, + "step": 6426 + }, + { + "epoch": 0.7292439493627831, + "grad_norm": 0.5333656072616577, + "learning_rate": 1.4249910490511995e-05, + "loss": 2.0218, + "num_input_tokens_seen": 2529165312, + "step": 6432 + }, + { + "epoch": 0.72992421424092, + "grad_norm": 0.5523655414581299, + "learning_rate": 1.4214106695309703e-05, + "loss": 1.9989, + "num_input_tokens_seen": 2531524608, + "step": 6438 + }, + { + "epoch": 0.730604479119057, + "grad_norm": 0.5648514032363892, + "learning_rate": 1.4178302900107413e-05, + "loss": 2.0348, + "num_input_tokens_seen": 2533883904, + "step": 6444 + }, + { + "epoch": 0.7312847439971939, + "grad_norm": 0.5992633700370789, + "learning_rate": 1.4142499104905121e-05, + "loss": 2.0292, + "num_input_tokens_seen": 2536243200, + "step": 6450 + }, + { + "epoch": 0.7319650088753308, + "grad_norm": 0.5971994400024414, + "learning_rate": 1.410669530970283e-05, + "loss": 1.9372, + "num_input_tokens_seen": 2538602496, + "step": 6456 + }, + { + "epoch": 0.7326452737534678, + "grad_norm": 0.5448790192604065, + "learning_rate": 1.4070891514500536e-05, + "loss": 2.0675, + "num_input_tokens_seen": 2540961792, + "step": 6462 + }, + { + "epoch": 0.7333255386316047, + "grad_norm": 0.542102038860321, + "learning_rate": 1.4035087719298246e-05, + "loss": 2.0221, + "num_input_tokens_seen": 2543321088, + "step": 6468 + }, + { + "epoch": 0.7340058035097417, + "grad_norm": 0.5358943939208984, + "learning_rate": 1.3999283924095954e-05, + "loss": 1.9774, + "num_input_tokens_seen": 2545680384, + "step": 6474 + }, + { + "epoch": 0.7346860683878785, + "grad_norm": 0.532248318195343, + "learning_rate": 1.3963480128893664e-05, + "loss": 2.0362, + "num_input_tokens_seen": 2548039680, + "step": 6480 + }, + { + "epoch": 0.7353663332660154, + "grad_norm": 0.52365642786026, + "learning_rate": 1.3927676333691372e-05, + "loss": 2.0386, + "num_input_tokens_seen": 2550398976, + "step": 6486 + }, + { + "epoch": 0.7360465981441524, + "grad_norm": 0.5019295811653137, + "learning_rate": 1.3891872538489082e-05, + "loss": 2.0007, + "num_input_tokens_seen": 2552758272, + "step": 6492 + }, + { + "epoch": 0.7367268630222893, + "grad_norm": 0.5861442685127258, + "learning_rate": 1.385606874328679e-05, + "loss": 2.0565, + "num_input_tokens_seen": 2555117568, + "step": 6498 + }, + { + "epoch": 0.7374071279004262, + "grad_norm": 0.5421295762062073, + "learning_rate": 1.3820264948084497e-05, + "loss": 2.0346, + "num_input_tokens_seen": 2557476864, + "step": 6504 + }, + { + "epoch": 0.7380873927785632, + "grad_norm": 0.5109795331954956, + "learning_rate": 1.3784461152882205e-05, + "loss": 2.0375, + "num_input_tokens_seen": 2559836160, + "step": 6510 + }, + { + "epoch": 0.7387676576567, + "grad_norm": 0.5438470840454102, + "learning_rate": 1.3748657357679915e-05, + "loss": 2.041, + "num_input_tokens_seen": 2562195456, + "step": 6516 + }, + { + "epoch": 0.739447922534837, + "grad_norm": 0.568148672580719, + "learning_rate": 1.3712853562477623e-05, + "loss": 2.0456, + "num_input_tokens_seen": 2564554752, + "step": 6522 + }, + { + "epoch": 0.7401281874129739, + "grad_norm": 0.5753766298294067, + "learning_rate": 1.3677049767275333e-05, + "loss": 2.0265, + "num_input_tokens_seen": 2566914048, + "step": 6528 + }, + { + "epoch": 0.7408084522911108, + "grad_norm": 0.5368342399597168, + "learning_rate": 1.364124597207304e-05, + "loss": 2.0367, + "num_input_tokens_seen": 2569273344, + "step": 6534 + }, + { + "epoch": 0.7414887171692478, + "grad_norm": 0.5538693070411682, + "learning_rate": 1.360544217687075e-05, + "loss": 2.0138, + "num_input_tokens_seen": 2571632640, + "step": 6540 + }, + { + "epoch": 0.7421689820473847, + "grad_norm": 0.5635451674461365, + "learning_rate": 1.3569638381668457e-05, + "loss": 1.9975, + "num_input_tokens_seen": 2573991936, + "step": 6546 + }, + { + "epoch": 0.7428492469255217, + "grad_norm": 0.6103095412254333, + "learning_rate": 1.3533834586466165e-05, + "loss": 2.0166, + "num_input_tokens_seen": 2576351232, + "step": 6552 + }, + { + "epoch": 0.7435295118036586, + "grad_norm": 0.576130211353302, + "learning_rate": 1.3498030791263875e-05, + "loss": 2.0627, + "num_input_tokens_seen": 2578710528, + "step": 6558 + }, + { + "epoch": 0.7442097766817954, + "grad_norm": 0.5822626948356628, + "learning_rate": 1.3462226996061583e-05, + "loss": 2.0108, + "num_input_tokens_seen": 2581069824, + "step": 6564 + }, + { + "epoch": 0.7448900415599324, + "grad_norm": 0.5234280824661255, + "learning_rate": 1.3426423200859291e-05, + "loss": 1.9447, + "num_input_tokens_seen": 2583429120, + "step": 6570 + }, + { + "epoch": 0.7455703064380693, + "grad_norm": 0.544247031211853, + "learning_rate": 1.3390619405657001e-05, + "loss": 1.9903, + "num_input_tokens_seen": 2585788416, + "step": 6576 + }, + { + "epoch": 0.7462505713162062, + "grad_norm": 0.5101417899131775, + "learning_rate": 1.335481561045471e-05, + "loss": 2.0365, + "num_input_tokens_seen": 2588147712, + "step": 6582 + }, + { + "epoch": 0.7469308361943432, + "grad_norm": 0.498687744140625, + "learning_rate": 1.3319011815252416e-05, + "loss": 2.0617, + "num_input_tokens_seen": 2590507008, + "step": 6588 + }, + { + "epoch": 0.7476111010724801, + "grad_norm": 0.49293607473373413, + "learning_rate": 1.3283208020050126e-05, + "loss": 2.0262, + "num_input_tokens_seen": 2592866304, + "step": 6594 + }, + { + "epoch": 0.7482913659506171, + "grad_norm": 0.544154703617096, + "learning_rate": 1.3247404224847834e-05, + "loss": 2.0151, + "num_input_tokens_seen": 2595225600, + "step": 6600 + }, + { + "epoch": 0.748971630828754, + "grad_norm": 0.5422555208206177, + "learning_rate": 1.3211600429645544e-05, + "loss": 2.0348, + "num_input_tokens_seen": 2597584896, + "step": 6606 + }, + { + "epoch": 0.7496518957068908, + "grad_norm": 0.5072320699691772, + "learning_rate": 1.3175796634443252e-05, + "loss": 2.0011, + "num_input_tokens_seen": 2599944192, + "step": 6612 + }, + { + "epoch": 0.7503321605850278, + "grad_norm": 0.6141318678855896, + "learning_rate": 1.3139992839240962e-05, + "loss": 2.043, + "num_input_tokens_seen": 2602303488, + "step": 6618 + }, + { + "epoch": 0.7510124254631647, + "grad_norm": 0.5565091967582703, + "learning_rate": 1.310418904403867e-05, + "loss": 2.0446, + "num_input_tokens_seen": 2604662784, + "step": 6624 + }, + { + "epoch": 0.7516926903413016, + "grad_norm": 0.5500200986862183, + "learning_rate": 1.3068385248836376e-05, + "loss": 2.0254, + "num_input_tokens_seen": 2607022080, + "step": 6630 + }, + { + "epoch": 0.7523729552194386, + "grad_norm": 0.6047897338867188, + "learning_rate": 1.3032581453634085e-05, + "loss": 2.0415, + "num_input_tokens_seen": 2609381376, + "step": 6636 + }, + { + "epoch": 0.7530532200975755, + "grad_norm": 0.6070099472999573, + "learning_rate": 1.2996777658431794e-05, + "loss": 2.0615, + "num_input_tokens_seen": 2611740672, + "step": 6642 + }, + { + "epoch": 0.7537334849757125, + "grad_norm": 0.525489866733551, + "learning_rate": 1.2960973863229503e-05, + "loss": 2.0465, + "num_input_tokens_seen": 2614099968, + "step": 6648 + }, + { + "epoch": 0.7544137498538493, + "grad_norm": 0.4968653917312622, + "learning_rate": 1.2925170068027212e-05, + "loss": 1.9749, + "num_input_tokens_seen": 2616459264, + "step": 6654 + }, + { + "epoch": 0.7550940147319862, + "grad_norm": 0.5281318426132202, + "learning_rate": 1.288936627282492e-05, + "loss": 1.9575, + "num_input_tokens_seen": 2618818560, + "step": 6660 + }, + { + "epoch": 0.7557742796101232, + "grad_norm": 0.5236896872520447, + "learning_rate": 1.285356247762263e-05, + "loss": 2.0634, + "num_input_tokens_seen": 2621177856, + "step": 6666 + }, + { + "epoch": 0.7564545444882601, + "grad_norm": 0.6024266481399536, + "learning_rate": 1.2817758682420337e-05, + "loss": 1.9841, + "num_input_tokens_seen": 2623537152, + "step": 6672 + }, + { + "epoch": 0.757134809366397, + "grad_norm": 0.5235931277275085, + "learning_rate": 1.2781954887218045e-05, + "loss": 1.9866, + "num_input_tokens_seen": 2625896448, + "step": 6678 + }, + { + "epoch": 0.757815074244534, + "grad_norm": 0.5041958093643188, + "learning_rate": 1.2746151092015753e-05, + "loss": 2.0075, + "num_input_tokens_seen": 2628255744, + "step": 6684 + }, + { + "epoch": 0.7584953391226709, + "grad_norm": 0.5709572434425354, + "learning_rate": 1.2710347296813463e-05, + "loss": 2.0231, + "num_input_tokens_seen": 2630615040, + "step": 6690 + }, + { + "epoch": 0.7591756040008079, + "grad_norm": 0.5276849269866943, + "learning_rate": 1.2674543501611171e-05, + "loss": 2.0314, + "num_input_tokens_seen": 2632974336, + "step": 6696 + }, + { + "epoch": 0.7598558688789447, + "grad_norm": 0.5111777186393738, + "learning_rate": 1.2638739706408881e-05, + "loss": 2.0542, + "num_input_tokens_seen": 2635333632, + "step": 6702 + }, + { + "epoch": 0.7605361337570816, + "grad_norm": 0.5633344650268555, + "learning_rate": 1.260293591120659e-05, + "loss": 2.0438, + "num_input_tokens_seen": 2637692928, + "step": 6708 + }, + { + "epoch": 0.7612163986352186, + "grad_norm": 0.5294421315193176, + "learning_rate": 1.2567132116004296e-05, + "loss": 2.0113, + "num_input_tokens_seen": 2640052224, + "step": 6714 + }, + { + "epoch": 0.7618966635133555, + "grad_norm": 0.5252106189727783, + "learning_rate": 1.2531328320802006e-05, + "loss": 1.9799, + "num_input_tokens_seen": 2642411520, + "step": 6720 + }, + { + "epoch": 0.7625769283914925, + "grad_norm": 0.5746698379516602, + "learning_rate": 1.2495524525599714e-05, + "loss": 2.0742, + "num_input_tokens_seen": 2644770816, + "step": 6726 + }, + { + "epoch": 0.7632571932696294, + "grad_norm": 0.5127720236778259, + "learning_rate": 1.2459720730397424e-05, + "loss": 1.9997, + "num_input_tokens_seen": 2647130112, + "step": 6732 + }, + { + "epoch": 0.7639374581477663, + "grad_norm": 0.654504120349884, + "learning_rate": 1.2423916935195132e-05, + "loss": 2.0319, + "num_input_tokens_seen": 2649489408, + "step": 6738 + }, + { + "epoch": 0.7646177230259033, + "grad_norm": 0.5886629819869995, + "learning_rate": 1.238811313999284e-05, + "loss": 2.0316, + "num_input_tokens_seen": 2651848704, + "step": 6744 + }, + { + "epoch": 0.7652979879040401, + "grad_norm": 0.6034631133079529, + "learning_rate": 1.2352309344790548e-05, + "loss": 2.0575, + "num_input_tokens_seen": 2654208000, + "step": 6750 + }, + { + "epoch": 0.765978252782177, + "grad_norm": 0.5659487247467041, + "learning_rate": 1.2316505549588258e-05, + "loss": 2.0641, + "num_input_tokens_seen": 2656567296, + "step": 6756 + }, + { + "epoch": 0.766658517660314, + "grad_norm": 0.5753200650215149, + "learning_rate": 1.2280701754385964e-05, + "loss": 2.0554, + "num_input_tokens_seen": 2658926592, + "step": 6762 + }, + { + "epoch": 0.7673387825384509, + "grad_norm": 0.553452730178833, + "learning_rate": 1.2244897959183674e-05, + "loss": 2.0957, + "num_input_tokens_seen": 2661285888, + "step": 6768 + }, + { + "epoch": 0.7680190474165879, + "grad_norm": 0.5258597731590271, + "learning_rate": 1.2209094163981382e-05, + "loss": 2.0221, + "num_input_tokens_seen": 2663645184, + "step": 6774 + }, + { + "epoch": 0.7686993122947248, + "grad_norm": 0.5694190859794617, + "learning_rate": 1.2173290368779092e-05, + "loss": 2.0153, + "num_input_tokens_seen": 2666004480, + "step": 6780 + }, + { + "epoch": 0.7693795771728617, + "grad_norm": 0.5532529354095459, + "learning_rate": 1.2137486573576799e-05, + "loss": 2.0087, + "num_input_tokens_seen": 2668363776, + "step": 6786 + }, + { + "epoch": 0.7700598420509986, + "grad_norm": 0.5136593580245972, + "learning_rate": 1.2101682778374509e-05, + "loss": 2.0324, + "num_input_tokens_seen": 2670723072, + "step": 6792 + }, + { + "epoch": 0.7707401069291355, + "grad_norm": 0.5418703556060791, + "learning_rate": 1.2065878983172217e-05, + "loss": 2.0253, + "num_input_tokens_seen": 2673082368, + "step": 6798 + }, + { + "epoch": 0.7709668618885145, + "eval_accuracy": 0.5866208791208791, + "eval_loss": 2.00406551361084, + "eval_runtime": 129.6133, + "eval_samples_per_second": 3.086, + "eval_steps_per_second": 1.034, + "num_input_tokens_seen": 2673868800, + "step": 6800 + }, + { + "epoch": 0.7714203718072724, + "grad_norm": 0.6343456506729126, + "learning_rate": 1.2030075187969925e-05, + "loss": 2.0459, + "num_input_tokens_seen": 2675441664, + "step": 6804 + }, + { + "epoch": 0.7721006366854094, + "grad_norm": 0.5664966702461243, + "learning_rate": 1.1994271392767633e-05, + "loss": 2.0402, + "num_input_tokens_seen": 2677800960, + "step": 6810 + }, + { + "epoch": 0.7727809015635463, + "grad_norm": 0.5292795300483704, + "learning_rate": 1.1958467597565343e-05, + "loss": 2.0636, + "num_input_tokens_seen": 2680160256, + "step": 6816 + }, + { + "epoch": 0.7734611664416833, + "grad_norm": 0.5384446978569031, + "learning_rate": 1.1922663802363051e-05, + "loss": 2.0319, + "num_input_tokens_seen": 2682519552, + "step": 6822 + }, + { + "epoch": 0.7741414313198202, + "grad_norm": 0.6125785112380981, + "learning_rate": 1.188686000716076e-05, + "loss": 2.0176, + "num_input_tokens_seen": 2684878848, + "step": 6828 + }, + { + "epoch": 0.774821696197957, + "grad_norm": 0.5301167368888855, + "learning_rate": 1.1851056211958467e-05, + "loss": 2.0285, + "num_input_tokens_seen": 2687238144, + "step": 6834 + }, + { + "epoch": 0.775501961076094, + "grad_norm": 0.5614597201347351, + "learning_rate": 1.1815252416756177e-05, + "loss": 2.0672, + "num_input_tokens_seen": 2689597440, + "step": 6840 + }, + { + "epoch": 0.7761822259542309, + "grad_norm": 0.5375152826309204, + "learning_rate": 1.1779448621553885e-05, + "loss": 1.9599, + "num_input_tokens_seen": 2691956736, + "step": 6846 + }, + { + "epoch": 0.7768624908323678, + "grad_norm": 0.5689718127250671, + "learning_rate": 1.1743644826351593e-05, + "loss": 2.0173, + "num_input_tokens_seen": 2694316032, + "step": 6852 + }, + { + "epoch": 0.7775427557105048, + "grad_norm": 0.5268839597702026, + "learning_rate": 1.1707841031149303e-05, + "loss": 2.0727, + "num_input_tokens_seen": 2696675328, + "step": 6858 + }, + { + "epoch": 0.7782230205886417, + "grad_norm": 0.5965040326118469, + "learning_rate": 1.1672037235947012e-05, + "loss": 2.0151, + "num_input_tokens_seen": 2699034624, + "step": 6864 + }, + { + "epoch": 0.7789032854667787, + "grad_norm": 0.5147624611854553, + "learning_rate": 1.163623344074472e-05, + "loss": 1.9906, + "num_input_tokens_seen": 2701393920, + "step": 6870 + }, + { + "epoch": 0.7795835503449156, + "grad_norm": 0.5790501832962036, + "learning_rate": 1.1600429645542428e-05, + "loss": 1.959, + "num_input_tokens_seen": 2703753216, + "step": 6876 + }, + { + "epoch": 0.7802638152230524, + "grad_norm": 0.5348291993141174, + "learning_rate": 1.1564625850340138e-05, + "loss": 2.025, + "num_input_tokens_seen": 2706112512, + "step": 6882 + }, + { + "epoch": 0.7809440801011894, + "grad_norm": 0.5875257849693298, + "learning_rate": 1.1528822055137844e-05, + "loss": 2.0391, + "num_input_tokens_seen": 2708471808, + "step": 6888 + }, + { + "epoch": 0.7816243449793263, + "grad_norm": 0.5611306428909302, + "learning_rate": 1.1493018259935554e-05, + "loss": 2.0412, + "num_input_tokens_seen": 2710831104, + "step": 6894 + }, + { + "epoch": 0.7823046098574632, + "grad_norm": 0.6583240628242493, + "learning_rate": 1.1457214464733262e-05, + "loss": 2.0196, + "num_input_tokens_seen": 2713190400, + "step": 6900 + }, + { + "epoch": 0.7829848747356002, + "grad_norm": 0.5052137970924377, + "learning_rate": 1.1421410669530972e-05, + "loss": 1.978, + "num_input_tokens_seen": 2715549696, + "step": 6906 + }, + { + "epoch": 0.7836651396137371, + "grad_norm": 0.5078434348106384, + "learning_rate": 1.1385606874328678e-05, + "loss": 1.9985, + "num_input_tokens_seen": 2717908992, + "step": 6912 + }, + { + "epoch": 0.7843454044918741, + "grad_norm": 0.5650710463523865, + "learning_rate": 1.1349803079126388e-05, + "loss": 2.0562, + "num_input_tokens_seen": 2720268288, + "step": 6918 + }, + { + "epoch": 0.785025669370011, + "grad_norm": 0.5435272455215454, + "learning_rate": 1.1313999283924096e-05, + "loss": 1.969, + "num_input_tokens_seen": 2722627584, + "step": 6924 + }, + { + "epoch": 0.7857059342481478, + "grad_norm": 0.5620574951171875, + "learning_rate": 1.1278195488721805e-05, + "loss": 2.0299, + "num_input_tokens_seen": 2724986880, + "step": 6930 + }, + { + "epoch": 0.7863861991262848, + "grad_norm": 0.5396995544433594, + "learning_rate": 1.1242391693519513e-05, + "loss": 2.0016, + "num_input_tokens_seen": 2727346176, + "step": 6936 + }, + { + "epoch": 0.7870664640044217, + "grad_norm": 0.5387789011001587, + "learning_rate": 1.1206587898317223e-05, + "loss": 1.9991, + "num_input_tokens_seen": 2729705472, + "step": 6942 + }, + { + "epoch": 0.7877467288825587, + "grad_norm": 0.5884684920310974, + "learning_rate": 1.117078410311493e-05, + "loss": 2.0051, + "num_input_tokens_seen": 2732064768, + "step": 6948 + }, + { + "epoch": 0.7884269937606956, + "grad_norm": 0.5044008493423462, + "learning_rate": 1.1134980307912639e-05, + "loss": 2.0211, + "num_input_tokens_seen": 2734424064, + "step": 6954 + }, + { + "epoch": 0.7891072586388325, + "grad_norm": 0.5228249430656433, + "learning_rate": 1.1099176512710347e-05, + "loss": 2.0181, + "num_input_tokens_seen": 2736783360, + "step": 6960 + }, + { + "epoch": 0.7897875235169695, + "grad_norm": 0.5564016103744507, + "learning_rate": 1.1063372717508057e-05, + "loss": 1.9769, + "num_input_tokens_seen": 2739142656, + "step": 6966 + }, + { + "epoch": 0.7904677883951063, + "grad_norm": 0.505305826663971, + "learning_rate": 1.1027568922305765e-05, + "loss": 2.013, + "num_input_tokens_seen": 2741501952, + "step": 6972 + }, + { + "epoch": 0.7911480532732432, + "grad_norm": 0.5639991760253906, + "learning_rate": 1.0991765127103473e-05, + "loss": 2.0399, + "num_input_tokens_seen": 2743861248, + "step": 6978 + }, + { + "epoch": 0.7918283181513802, + "grad_norm": 0.583869218826294, + "learning_rate": 1.0955961331901181e-05, + "loss": 2.0013, + "num_input_tokens_seen": 2746220544, + "step": 6984 + }, + { + "epoch": 0.7925085830295171, + "grad_norm": 0.538934051990509, + "learning_rate": 1.0920157536698891e-05, + "loss": 1.9875, + "num_input_tokens_seen": 2748579840, + "step": 6990 + }, + { + "epoch": 0.7931888479076541, + "grad_norm": 0.5135723948478699, + "learning_rate": 1.08843537414966e-05, + "loss": 1.9953, + "num_input_tokens_seen": 2750939136, + "step": 6996 + }, + { + "epoch": 0.793869112785791, + "grad_norm": 0.4978064298629761, + "learning_rate": 1.0848549946294308e-05, + "loss": 2.0216, + "num_input_tokens_seen": 2753298432, + "step": 7002 + }, + { + "epoch": 0.7945493776639279, + "grad_norm": 0.5002549886703491, + "learning_rate": 1.0812746151092017e-05, + "loss": 2.0746, + "num_input_tokens_seen": 2755657728, + "step": 7008 + }, + { + "epoch": 0.7952296425420649, + "grad_norm": 0.5456427335739136, + "learning_rate": 1.0776942355889724e-05, + "loss": 1.9826, + "num_input_tokens_seen": 2758017024, + "step": 7014 + }, + { + "epoch": 0.7959099074202017, + "grad_norm": 0.5168840885162354, + "learning_rate": 1.0741138560687434e-05, + "loss": 1.9527, + "num_input_tokens_seen": 2760376320, + "step": 7020 + }, + { + "epoch": 0.7965901722983386, + "grad_norm": 0.6147148013114929, + "learning_rate": 1.0705334765485142e-05, + "loss": 1.9762, + "num_input_tokens_seen": 2762735616, + "step": 7026 + }, + { + "epoch": 0.7972704371764756, + "grad_norm": 0.5876660346984863, + "learning_rate": 1.0669530970282852e-05, + "loss": 1.9757, + "num_input_tokens_seen": 2765094912, + "step": 7032 + }, + { + "epoch": 0.7979507020546125, + "grad_norm": 0.5405025482177734, + "learning_rate": 1.0633727175080558e-05, + "loss": 2.0329, + "num_input_tokens_seen": 2767454208, + "step": 7038 + }, + { + "epoch": 0.7986309669327495, + "grad_norm": 0.5711649656295776, + "learning_rate": 1.0597923379878268e-05, + "loss": 2.0332, + "num_input_tokens_seen": 2769813504, + "step": 7044 + }, + { + "epoch": 0.7993112318108864, + "grad_norm": 0.5035630464553833, + "learning_rate": 1.0562119584675976e-05, + "loss": 1.9839, + "num_input_tokens_seen": 2772172800, + "step": 7050 + }, + { + "epoch": 0.7999914966890233, + "grad_norm": 0.6048845052719116, + "learning_rate": 1.0526315789473684e-05, + "loss": 2.0051, + "num_input_tokens_seen": 2774532096, + "step": 7056 + }, + { + "epoch": 0.8006717615671602, + "grad_norm": 0.528458297252655, + "learning_rate": 1.0490511994271393e-05, + "loss": 2.0181, + "num_input_tokens_seen": 2776891392, + "step": 7062 + }, + { + "epoch": 0.8013520264452971, + "grad_norm": 0.5959348678588867, + "learning_rate": 1.0454708199069102e-05, + "loss": 2.0368, + "num_input_tokens_seen": 2779250688, + "step": 7068 + }, + { + "epoch": 0.802032291323434, + "grad_norm": 0.6448953151702881, + "learning_rate": 1.041890440386681e-05, + "loss": 2.0584, + "num_input_tokens_seen": 2781609984, + "step": 7074 + }, + { + "epoch": 0.802712556201571, + "grad_norm": 0.5180519223213196, + "learning_rate": 1.0383100608664519e-05, + "loss": 1.9999, + "num_input_tokens_seen": 2783969280, + "step": 7080 + }, + { + "epoch": 0.8033928210797079, + "grad_norm": 0.5477824807167053, + "learning_rate": 1.0347296813462227e-05, + "loss": 2.0164, + "num_input_tokens_seen": 2786328576, + "step": 7086 + }, + { + "epoch": 0.8040730859578449, + "grad_norm": 0.5789219737052917, + "learning_rate": 1.0311493018259937e-05, + "loss": 2.0155, + "num_input_tokens_seen": 2788687872, + "step": 7092 + }, + { + "epoch": 0.8047533508359818, + "grad_norm": 0.5404716730117798, + "learning_rate": 1.0275689223057643e-05, + "loss": 1.9827, + "num_input_tokens_seen": 2791047168, + "step": 7098 + }, + { + "epoch": 0.8054336157141186, + "grad_norm": 0.583008885383606, + "learning_rate": 1.0239885427855353e-05, + "loss": 2.0253, + "num_input_tokens_seen": 2793406464, + "step": 7104 + }, + { + "epoch": 0.8061138805922556, + "grad_norm": 0.5732628703117371, + "learning_rate": 1.0204081632653061e-05, + "loss": 2.0036, + "num_input_tokens_seen": 2795765760, + "step": 7110 + }, + { + "epoch": 0.8067941454703925, + "grad_norm": 0.48751312494277954, + "learning_rate": 1.0168277837450771e-05, + "loss": 1.997, + "num_input_tokens_seen": 2798125056, + "step": 7116 + }, + { + "epoch": 0.8074744103485294, + "grad_norm": 0.4976810812950134, + "learning_rate": 1.013247404224848e-05, + "loss": 1.9776, + "num_input_tokens_seen": 2800484352, + "step": 7122 + }, + { + "epoch": 0.8081546752266664, + "grad_norm": 0.5391010642051697, + "learning_rate": 1.0096670247046187e-05, + "loss": 2.03, + "num_input_tokens_seen": 2802843648, + "step": 7128 + }, + { + "epoch": 0.8088349401048033, + "grad_norm": 0.5493925213813782, + "learning_rate": 1.0060866451843896e-05, + "loss": 2.0477, + "num_input_tokens_seen": 2805202944, + "step": 7134 + }, + { + "epoch": 0.8095152049829403, + "grad_norm": 0.544150173664093, + "learning_rate": 1.0025062656641604e-05, + "loss": 1.9951, + "num_input_tokens_seen": 2807562240, + "step": 7140 + }, + { + "epoch": 0.8101954698610772, + "grad_norm": 0.48092713952064514, + "learning_rate": 9.989258861439314e-06, + "loss": 2.017, + "num_input_tokens_seen": 2809921536, + "step": 7146 + }, + { + "epoch": 0.810875734739214, + "grad_norm": 0.5483365058898926, + "learning_rate": 9.953455066237022e-06, + "loss": 1.9781, + "num_input_tokens_seen": 2812280832, + "step": 7152 + }, + { + "epoch": 0.811555999617351, + "grad_norm": 0.5313854813575745, + "learning_rate": 9.917651271034732e-06, + "loss": 2.0425, + "num_input_tokens_seen": 2814640128, + "step": 7158 + }, + { + "epoch": 0.8122362644954879, + "grad_norm": 0.4918869137763977, + "learning_rate": 9.881847475832438e-06, + "loss": 2.0208, + "num_input_tokens_seen": 2816999424, + "step": 7164 + }, + { + "epoch": 0.8129165293736249, + "grad_norm": 0.5176813006401062, + "learning_rate": 9.846043680630148e-06, + "loss": 1.9843, + "num_input_tokens_seen": 2819358720, + "step": 7170 + }, + { + "epoch": 0.8135967942517618, + "grad_norm": 0.5130747556686401, + "learning_rate": 9.810239885427856e-06, + "loss": 1.9857, + "num_input_tokens_seen": 2821718016, + "step": 7176 + }, + { + "epoch": 0.8142770591298987, + "grad_norm": 0.5297340750694275, + "learning_rate": 9.774436090225564e-06, + "loss": 1.9911, + "num_input_tokens_seen": 2824077312, + "step": 7182 + }, + { + "epoch": 0.8149573240080357, + "grad_norm": 0.5061428546905518, + "learning_rate": 9.738632295023272e-06, + "loss": 1.9687, + "num_input_tokens_seen": 2826436608, + "step": 7188 + }, + { + "epoch": 0.8156375888861725, + "grad_norm": 0.5404644012451172, + "learning_rate": 9.702828499820982e-06, + "loss": 1.9658, + "num_input_tokens_seen": 2828795904, + "step": 7194 + }, + { + "epoch": 0.8163178537643094, + "grad_norm": 0.5056130886077881, + "learning_rate": 9.66702470461869e-06, + "loss": 1.995, + "num_input_tokens_seen": 2831155200, + "step": 7200 + }, + { + "epoch": 0.8163178537643094, + "eval_accuracy": 0.5872002442002442, + "eval_loss": 2.000960350036621, + "eval_runtime": 128.4298, + "eval_samples_per_second": 3.115, + "eval_steps_per_second": 1.043, + "num_input_tokens_seen": 2831155200, + "step": 7200 + }, + { + "epoch": 0.8169981186424464, + "grad_norm": 0.5106950402259827, + "learning_rate": 9.631220909416399e-06, + "loss": 2.0265, + "num_input_tokens_seen": 2833514496, + "step": 7206 + }, + { + "epoch": 0.8176783835205833, + "grad_norm": 0.5333806276321411, + "learning_rate": 9.595417114214107e-06, + "loss": 1.9495, + "num_input_tokens_seen": 2835873792, + "step": 7212 + }, + { + "epoch": 0.8183586483987203, + "grad_norm": 0.5510848164558411, + "learning_rate": 9.559613319011817e-06, + "loss": 2.0107, + "num_input_tokens_seen": 2838233088, + "step": 7218 + }, + { + "epoch": 0.8190389132768572, + "grad_norm": 0.5609148144721985, + "learning_rate": 9.523809523809523e-06, + "loss": 2.0433, + "num_input_tokens_seen": 2840592384, + "step": 7224 + }, + { + "epoch": 0.8197191781549941, + "grad_norm": 0.5448800921440125, + "learning_rate": 9.488005728607233e-06, + "loss": 1.9898, + "num_input_tokens_seen": 2842951680, + "step": 7230 + }, + { + "epoch": 0.8203994430331311, + "grad_norm": 0.5326699018478394, + "learning_rate": 9.452201933404941e-06, + "loss": 1.9797, + "num_input_tokens_seen": 2845310976, + "step": 7236 + }, + { + "epoch": 0.8210797079112679, + "grad_norm": 0.5335074663162231, + "learning_rate": 9.416398138202651e-06, + "loss": 2.0008, + "num_input_tokens_seen": 2847670272, + "step": 7242 + }, + { + "epoch": 0.8217599727894048, + "grad_norm": 0.49363961815834045, + "learning_rate": 9.380594343000357e-06, + "loss": 1.9962, + "num_input_tokens_seen": 2850029568, + "step": 7248 + }, + { + "epoch": 0.8224402376675418, + "grad_norm": 0.5354363918304443, + "learning_rate": 9.344790547798067e-06, + "loss": 2.0384, + "num_input_tokens_seen": 2852388864, + "step": 7254 + }, + { + "epoch": 0.8231205025456787, + "grad_norm": 0.5777958631515503, + "learning_rate": 9.308986752595775e-06, + "loss": 1.9839, + "num_input_tokens_seen": 2854748160, + "step": 7260 + }, + { + "epoch": 0.8238007674238157, + "grad_norm": 0.5475658178329468, + "learning_rate": 9.273182957393484e-06, + "loss": 1.9714, + "num_input_tokens_seen": 2857107456, + "step": 7266 + }, + { + "epoch": 0.8244810323019526, + "grad_norm": 0.5318132638931274, + "learning_rate": 9.237379162191193e-06, + "loss": 1.9882, + "num_input_tokens_seen": 2859466752, + "step": 7272 + }, + { + "epoch": 0.8251612971800895, + "grad_norm": 0.5123252272605896, + "learning_rate": 9.201575366988902e-06, + "loss": 2.0005, + "num_input_tokens_seen": 2861826048, + "step": 7278 + }, + { + "epoch": 0.8258415620582265, + "grad_norm": 0.5346453785896301, + "learning_rate": 9.16577157178661e-06, + "loss": 1.9623, + "num_input_tokens_seen": 2864185344, + "step": 7284 + }, + { + "epoch": 0.8265218269363633, + "grad_norm": 0.5490689277648926, + "learning_rate": 9.129967776584318e-06, + "loss": 2.0012, + "num_input_tokens_seen": 2866544640, + "step": 7290 + }, + { + "epoch": 0.8272020918145002, + "grad_norm": 0.5043258666992188, + "learning_rate": 9.094163981382028e-06, + "loss": 1.9961, + "num_input_tokens_seen": 2868903936, + "step": 7296 + }, + { + "epoch": 0.8278823566926372, + "grad_norm": 0.5261684060096741, + "learning_rate": 9.058360186179736e-06, + "loss": 2.0281, + "num_input_tokens_seen": 2871263232, + "step": 7302 + }, + { + "epoch": 0.8285626215707741, + "grad_norm": 0.503143846988678, + "learning_rate": 9.022556390977444e-06, + "loss": 1.9774, + "num_input_tokens_seen": 2873622528, + "step": 7308 + }, + { + "epoch": 0.8292428864489111, + "grad_norm": 0.548707127571106, + "learning_rate": 8.986752595775152e-06, + "loss": 1.9829, + "num_input_tokens_seen": 2875981824, + "step": 7314 + }, + { + "epoch": 0.829923151327048, + "grad_norm": 0.5377416610717773, + "learning_rate": 8.950948800572862e-06, + "loss": 2.002, + "num_input_tokens_seen": 2878341120, + "step": 7320 + }, + { + "epoch": 0.8306034162051849, + "grad_norm": 0.5520649552345276, + "learning_rate": 8.915145005370569e-06, + "loss": 1.9893, + "num_input_tokens_seen": 2880700416, + "step": 7326 + }, + { + "epoch": 0.8312836810833218, + "grad_norm": 0.6141591668128967, + "learning_rate": 8.879341210168278e-06, + "loss": 2.0882, + "num_input_tokens_seen": 2883059712, + "step": 7332 + }, + { + "epoch": 0.8319639459614587, + "grad_norm": 0.5497307181358337, + "learning_rate": 8.843537414965987e-06, + "loss": 2.0063, + "num_input_tokens_seen": 2885419008, + "step": 7338 + }, + { + "epoch": 0.8326442108395957, + "grad_norm": 0.5077412724494934, + "learning_rate": 8.807733619763696e-06, + "loss": 2.0364, + "num_input_tokens_seen": 2887778304, + "step": 7344 + }, + { + "epoch": 0.8333244757177326, + "grad_norm": 0.5644519925117493, + "learning_rate": 8.771929824561403e-06, + "loss": 2.0403, + "num_input_tokens_seen": 2890137600, + "step": 7350 + }, + { + "epoch": 0.8340047405958695, + "grad_norm": 0.5899609327316284, + "learning_rate": 8.736126029359113e-06, + "loss": 2.0182, + "num_input_tokens_seen": 2892496896, + "step": 7356 + }, + { + "epoch": 0.8346850054740065, + "grad_norm": 0.5211791396141052, + "learning_rate": 8.700322234156821e-06, + "loss": 2.0668, + "num_input_tokens_seen": 2894856192, + "step": 7362 + }, + { + "epoch": 0.8353652703521434, + "grad_norm": 0.5184838771820068, + "learning_rate": 8.664518438954529e-06, + "loss": 2.0001, + "num_input_tokens_seen": 2897215488, + "step": 7368 + }, + { + "epoch": 0.8360455352302802, + "grad_norm": 0.49509575963020325, + "learning_rate": 8.628714643752237e-06, + "loss": 1.9863, + "num_input_tokens_seen": 2899574784, + "step": 7374 + }, + { + "epoch": 0.8367258001084172, + "grad_norm": 0.6435425877571106, + "learning_rate": 8.592910848549947e-06, + "loss": 2.0189, + "num_input_tokens_seen": 2901934080, + "step": 7380 + }, + { + "epoch": 0.8374060649865541, + "grad_norm": 0.6035661697387695, + "learning_rate": 8.557107053347655e-06, + "loss": 2.0365, + "num_input_tokens_seen": 2904293376, + "step": 7386 + }, + { + "epoch": 0.8380863298646911, + "grad_norm": 0.5593310594558716, + "learning_rate": 8.521303258145363e-06, + "loss": 2.0101, + "num_input_tokens_seen": 2906652672, + "step": 7392 + }, + { + "epoch": 0.838766594742828, + "grad_norm": 0.5158206820487976, + "learning_rate": 8.485499462943072e-06, + "loss": 2.0323, + "num_input_tokens_seen": 2909011968, + "step": 7398 + }, + { + "epoch": 0.8394468596209649, + "grad_norm": 0.519759476184845, + "learning_rate": 8.449695667740781e-06, + "loss": 2.0185, + "num_input_tokens_seen": 2911371264, + "step": 7404 + }, + { + "epoch": 0.8401271244991019, + "grad_norm": 0.5219857096672058, + "learning_rate": 8.41389187253849e-06, + "loss": 2.0214, + "num_input_tokens_seen": 2913730560, + "step": 7410 + }, + { + "epoch": 0.8408073893772388, + "grad_norm": 0.5320020914077759, + "learning_rate": 8.378088077336198e-06, + "loss": 2.0125, + "num_input_tokens_seen": 2916089856, + "step": 7416 + }, + { + "epoch": 0.8414876542553756, + "grad_norm": 0.5405346751213074, + "learning_rate": 8.342284282133908e-06, + "loss": 1.9639, + "num_input_tokens_seen": 2918449152, + "step": 7422 + }, + { + "epoch": 0.8421679191335126, + "grad_norm": 0.5031660199165344, + "learning_rate": 8.306480486931616e-06, + "loss": 1.9817, + "num_input_tokens_seen": 2920808448, + "step": 7428 + }, + { + "epoch": 0.8428481840116495, + "grad_norm": 0.5186684727668762, + "learning_rate": 8.270676691729324e-06, + "loss": 2.0954, + "num_input_tokens_seen": 2923167744, + "step": 7434 + }, + { + "epoch": 0.8435284488897865, + "grad_norm": 0.5321431756019592, + "learning_rate": 8.234872896527032e-06, + "loss": 2.0269, + "num_input_tokens_seen": 2925527040, + "step": 7440 + }, + { + "epoch": 0.8442087137679234, + "grad_norm": 0.49513474106788635, + "learning_rate": 8.199069101324742e-06, + "loss": 1.9744, + "num_input_tokens_seen": 2927886336, + "step": 7446 + }, + { + "epoch": 0.8448889786460603, + "grad_norm": 0.5057438015937805, + "learning_rate": 8.163265306122448e-06, + "loss": 1.986, + "num_input_tokens_seen": 2930245632, + "step": 7452 + }, + { + "epoch": 0.8455692435241973, + "grad_norm": 0.5168727040290833, + "learning_rate": 8.127461510920158e-06, + "loss": 2.0049, + "num_input_tokens_seen": 2932604928, + "step": 7458 + }, + { + "epoch": 0.8462495084023341, + "grad_norm": 0.584082841873169, + "learning_rate": 8.091657715717866e-06, + "loss": 2.0321, + "num_input_tokens_seen": 2934964224, + "step": 7464 + }, + { + "epoch": 0.846929773280471, + "grad_norm": 0.49962136149406433, + "learning_rate": 8.055853920515576e-06, + "loss": 2.0121, + "num_input_tokens_seen": 2937323520, + "step": 7470 + }, + { + "epoch": 0.847610038158608, + "grad_norm": 0.5198308825492859, + "learning_rate": 8.020050125313283e-06, + "loss": 2.0272, + "num_input_tokens_seen": 2939682816, + "step": 7476 + }, + { + "epoch": 0.8482903030367449, + "grad_norm": 0.5608158707618713, + "learning_rate": 7.984246330110993e-06, + "loss": 2.0204, + "num_input_tokens_seen": 2942042112, + "step": 7482 + }, + { + "epoch": 0.8489705679148819, + "grad_norm": 0.5206415057182312, + "learning_rate": 7.9484425349087e-06, + "loss": 1.9897, + "num_input_tokens_seen": 2944401408, + "step": 7488 + }, + { + "epoch": 0.8496508327930188, + "grad_norm": 0.4968629777431488, + "learning_rate": 7.912638739706409e-06, + "loss": 2.0773, + "num_input_tokens_seen": 2946760704, + "step": 7494 + }, + { + "epoch": 0.8503310976711557, + "grad_norm": 0.4873516857624054, + "learning_rate": 7.876834944504117e-06, + "loss": 1.9779, + "num_input_tokens_seen": 2949120000, + "step": 7500 + }, + { + "epoch": 0.8510113625492927, + "grad_norm": 0.5486623644828796, + "learning_rate": 7.841031149301827e-06, + "loss": 1.957, + "num_input_tokens_seen": 2951479296, + "step": 7506 + }, + { + "epoch": 0.8516916274274295, + "grad_norm": 0.6163302659988403, + "learning_rate": 7.805227354099535e-06, + "loss": 1.9882, + "num_input_tokens_seen": 2953838592, + "step": 7512 + }, + { + "epoch": 0.8523718923055664, + "grad_norm": 0.5529779195785522, + "learning_rate": 7.769423558897243e-06, + "loss": 2.0252, + "num_input_tokens_seen": 2956197888, + "step": 7518 + }, + { + "epoch": 0.8530521571837034, + "grad_norm": 0.5484551787376404, + "learning_rate": 7.733619763694951e-06, + "loss": 2.0415, + "num_input_tokens_seen": 2958557184, + "step": 7524 + }, + { + "epoch": 0.8537324220618403, + "grad_norm": 0.49321115016937256, + "learning_rate": 7.697815968492661e-06, + "loss": 2.0534, + "num_input_tokens_seen": 2960916480, + "step": 7530 + }, + { + "epoch": 0.8544126869399773, + "grad_norm": 0.4970216751098633, + "learning_rate": 7.66201217329037e-06, + "loss": 2.0491, + "num_input_tokens_seen": 2963275776, + "step": 7536 + }, + { + "epoch": 0.8550929518181142, + "grad_norm": 0.6006478667259216, + "learning_rate": 7.6262083780880775e-06, + "loss": 2.0457, + "num_input_tokens_seen": 2965635072, + "step": 7542 + }, + { + "epoch": 0.8557732166962511, + "grad_norm": 0.5233898162841797, + "learning_rate": 7.5904045828857865e-06, + "loss": 2.047, + "num_input_tokens_seen": 2967994368, + "step": 7548 + }, + { + "epoch": 0.856453481574388, + "grad_norm": 0.5446822047233582, + "learning_rate": 7.5546007876834955e-06, + "loss": 2.0136, + "num_input_tokens_seen": 2970353664, + "step": 7554 + }, + { + "epoch": 0.8571337464525249, + "grad_norm": 0.5539310574531555, + "learning_rate": 7.518796992481203e-06, + "loss": 1.9784, + "num_input_tokens_seen": 2972712960, + "step": 7560 + }, + { + "epoch": 0.8578140113306619, + "grad_norm": 0.5015861988067627, + "learning_rate": 7.482993197278912e-06, + "loss": 1.9579, + "num_input_tokens_seen": 2975072256, + "step": 7566 + }, + { + "epoch": 0.8584942762087988, + "grad_norm": 0.4794093072414398, + "learning_rate": 7.447189402076621e-06, + "loss": 2.0009, + "num_input_tokens_seen": 2977431552, + "step": 7572 + }, + { + "epoch": 0.8591745410869357, + "grad_norm": 0.5329228639602661, + "learning_rate": 7.411385606874328e-06, + "loss": 2.0169, + "num_input_tokens_seen": 2979790848, + "step": 7578 + }, + { + "epoch": 0.8598548059650727, + "grad_norm": 0.4937734305858612, + "learning_rate": 7.375581811672037e-06, + "loss": 2.0231, + "num_input_tokens_seen": 2982150144, + "step": 7584 + }, + { + "epoch": 0.8605350708432096, + "grad_norm": 0.5106194615364075, + "learning_rate": 7.339778016469746e-06, + "loss": 2.0039, + "num_input_tokens_seen": 2984509440, + "step": 7590 + }, + { + "epoch": 0.8612153357213465, + "grad_norm": 0.5017894506454468, + "learning_rate": 7.303974221267455e-06, + "loss": 1.9735, + "num_input_tokens_seen": 2986868736, + "step": 7596 + }, + { + "epoch": 0.8616688456401045, + "eval_accuracy": 0.587537851037851, + "eval_loss": 1.9987263679504395, + "eval_runtime": 129.4571, + "eval_samples_per_second": 3.09, + "eval_steps_per_second": 1.035, + "num_input_tokens_seen": 2988441600, + "step": 7600 + }, + { + "epoch": 0.8618956005994834, + "grad_norm": 0.5046854019165039, + "learning_rate": 7.2681704260651625e-06, + "loss": 2.0324, + "num_input_tokens_seen": 2989228032, + "step": 7602 + }, + { + "epoch": 0.8625758654776203, + "grad_norm": 0.541846513748169, + "learning_rate": 7.2323666308628715e-06, + "loss": 2.0062, + "num_input_tokens_seen": 2991587328, + "step": 7608 + }, + { + "epoch": 0.8632561303557573, + "grad_norm": 0.4783530533313751, + "learning_rate": 7.1965628356605805e-06, + "loss": 2.0239, + "num_input_tokens_seen": 2993946624, + "step": 7614 + }, + { + "epoch": 0.8639363952338942, + "grad_norm": 0.587407648563385, + "learning_rate": 7.160759040458289e-06, + "loss": 2.0063, + "num_input_tokens_seen": 2996305920, + "step": 7620 + }, + { + "epoch": 0.8646166601120311, + "grad_norm": 0.5383691191673279, + "learning_rate": 7.124955245255998e-06, + "loss": 2.03, + "num_input_tokens_seen": 2998665216, + "step": 7626 + }, + { + "epoch": 0.8652969249901681, + "grad_norm": 0.5405200719833374, + "learning_rate": 7.089151450053707e-06, + "loss": 2.0637, + "num_input_tokens_seen": 3001024512, + "step": 7632 + }, + { + "epoch": 0.865977189868305, + "grad_norm": 0.4942198395729065, + "learning_rate": 7.053347654851415e-06, + "loss": 1.9617, + "num_input_tokens_seen": 3003383808, + "step": 7638 + }, + { + "epoch": 0.8666574547464418, + "grad_norm": 0.5435467958450317, + "learning_rate": 7.017543859649123e-06, + "loss": 2.0243, + "num_input_tokens_seen": 3005743104, + "step": 7644 + }, + { + "epoch": 0.8673377196245788, + "grad_norm": 0.48755842447280884, + "learning_rate": 6.981740064446832e-06, + "loss": 1.9578, + "num_input_tokens_seen": 3008102400, + "step": 7650 + }, + { + "epoch": 0.8680179845027157, + "grad_norm": 0.4815945625305176, + "learning_rate": 6.945936269244541e-06, + "loss": 1.9798, + "num_input_tokens_seen": 3010461696, + "step": 7656 + }, + { + "epoch": 0.8686982493808527, + "grad_norm": 0.5009135007858276, + "learning_rate": 6.910132474042248e-06, + "loss": 2.0082, + "num_input_tokens_seen": 3012820992, + "step": 7662 + }, + { + "epoch": 0.8693785142589896, + "grad_norm": 0.492590069770813, + "learning_rate": 6.874328678839957e-06, + "loss": 2.0236, + "num_input_tokens_seen": 3015180288, + "step": 7668 + }, + { + "epoch": 0.8700587791371265, + "grad_norm": 0.4939536452293396, + "learning_rate": 6.838524883637666e-06, + "loss": 1.9894, + "num_input_tokens_seen": 3017539584, + "step": 7674 + }, + { + "epoch": 0.8707390440152635, + "grad_norm": 0.5177844166755676, + "learning_rate": 6.802721088435375e-06, + "loss": 2.0053, + "num_input_tokens_seen": 3019898880, + "step": 7680 + }, + { + "epoch": 0.8714193088934004, + "grad_norm": 0.5024730563163757, + "learning_rate": 6.766917293233083e-06, + "loss": 1.9954, + "num_input_tokens_seen": 3022258176, + "step": 7686 + }, + { + "epoch": 0.8720995737715372, + "grad_norm": 0.5295082330703735, + "learning_rate": 6.731113498030792e-06, + "loss": 2.0063, + "num_input_tokens_seen": 3024617472, + "step": 7692 + }, + { + "epoch": 0.8727798386496742, + "grad_norm": 0.5884028673171997, + "learning_rate": 6.695309702828501e-06, + "loss": 1.9762, + "num_input_tokens_seen": 3026976768, + "step": 7698 + }, + { + "epoch": 0.8734601035278111, + "grad_norm": 0.510733425617218, + "learning_rate": 6.659505907626208e-06, + "loss": 2.0053, + "num_input_tokens_seen": 3029336064, + "step": 7704 + }, + { + "epoch": 0.8741403684059481, + "grad_norm": 0.49165260791778564, + "learning_rate": 6.623702112423917e-06, + "loss": 1.9905, + "num_input_tokens_seen": 3031695360, + "step": 7710 + }, + { + "epoch": 0.874820633284085, + "grad_norm": 0.5171453356742859, + "learning_rate": 6.587898317221626e-06, + "loss": 1.9589, + "num_input_tokens_seen": 3034054656, + "step": 7716 + }, + { + "epoch": 0.8755008981622219, + "grad_norm": 0.5841086506843567, + "learning_rate": 6.552094522019335e-06, + "loss": 2.013, + "num_input_tokens_seen": 3036413952, + "step": 7722 + }, + { + "epoch": 0.8761811630403589, + "grad_norm": 0.5813525915145874, + "learning_rate": 6.516290726817042e-06, + "loss": 1.9854, + "num_input_tokens_seen": 3038773248, + "step": 7728 + }, + { + "epoch": 0.8768614279184958, + "grad_norm": 0.5193366408348083, + "learning_rate": 6.480486931614751e-06, + "loss": 2.0316, + "num_input_tokens_seen": 3041132544, + "step": 7734 + }, + { + "epoch": 0.8775416927966327, + "grad_norm": 0.5028855800628662, + "learning_rate": 6.44468313641246e-06, + "loss": 1.9995, + "num_input_tokens_seen": 3043491840, + "step": 7740 + }, + { + "epoch": 0.8782219576747696, + "grad_norm": 0.5069683194160461, + "learning_rate": 6.4088793412101684e-06, + "loss": 1.9518, + "num_input_tokens_seen": 3045851136, + "step": 7746 + }, + { + "epoch": 0.8789022225529065, + "grad_norm": 0.4742577373981476, + "learning_rate": 6.373075546007877e-06, + "loss": 2.0423, + "num_input_tokens_seen": 3048210432, + "step": 7752 + }, + { + "epoch": 0.8795824874310435, + "grad_norm": 0.528330385684967, + "learning_rate": 6.337271750805586e-06, + "loss": 2.0014, + "num_input_tokens_seen": 3050569728, + "step": 7758 + }, + { + "epoch": 0.8802627523091804, + "grad_norm": 0.47722598910331726, + "learning_rate": 6.301467955603295e-06, + "loss": 2.0389, + "num_input_tokens_seen": 3052929024, + "step": 7764 + }, + { + "epoch": 0.8809430171873173, + "grad_norm": 0.5158604383468628, + "learning_rate": 6.265664160401003e-06, + "loss": 2.032, + "num_input_tokens_seen": 3055288320, + "step": 7770 + }, + { + "epoch": 0.8816232820654543, + "grad_norm": 0.555617094039917, + "learning_rate": 6.229860365198712e-06, + "loss": 2.0911, + "num_input_tokens_seen": 3057647616, + "step": 7776 + }, + { + "epoch": 0.8823035469435911, + "grad_norm": 0.5554957389831543, + "learning_rate": 6.19405656999642e-06, + "loss": 1.9768, + "num_input_tokens_seen": 3060006912, + "step": 7782 + }, + { + "epoch": 0.8829838118217281, + "grad_norm": 0.5055182576179504, + "learning_rate": 6.158252774794129e-06, + "loss": 1.9987, + "num_input_tokens_seen": 3062366208, + "step": 7788 + }, + { + "epoch": 0.883664076699865, + "grad_norm": 0.5182470083236694, + "learning_rate": 6.122448979591837e-06, + "loss": 2.0091, + "num_input_tokens_seen": 3064725504, + "step": 7794 + }, + { + "epoch": 0.8843443415780019, + "grad_norm": 0.507174551486969, + "learning_rate": 6.086645184389546e-06, + "loss": 1.9631, + "num_input_tokens_seen": 3067084800, + "step": 7800 + }, + { + "epoch": 0.8850246064561389, + "grad_norm": 0.46559634804725647, + "learning_rate": 6.050841389187254e-06, + "loss": 2.0352, + "num_input_tokens_seen": 3069444096, + "step": 7806 + }, + { + "epoch": 0.8857048713342758, + "grad_norm": 0.5257137417793274, + "learning_rate": 6.015037593984962e-06, + "loss": 1.9883, + "num_input_tokens_seen": 3071803392, + "step": 7812 + }, + { + "epoch": 0.8863851362124127, + "grad_norm": 0.4962034225463867, + "learning_rate": 5.9792337987826714e-06, + "loss": 2.0039, + "num_input_tokens_seen": 3074162688, + "step": 7818 + }, + { + "epoch": 0.8870654010905497, + "grad_norm": 0.5294592380523682, + "learning_rate": 5.94343000358038e-06, + "loss": 1.9959, + "num_input_tokens_seen": 3076521984, + "step": 7824 + }, + { + "epoch": 0.8877456659686865, + "grad_norm": 0.5304044485092163, + "learning_rate": 5.907626208378089e-06, + "loss": 1.9609, + "num_input_tokens_seen": 3078881280, + "step": 7830 + }, + { + "epoch": 0.8884259308468235, + "grad_norm": 0.5610164403915405, + "learning_rate": 5.871822413175797e-06, + "loss": 2.0117, + "num_input_tokens_seen": 3081240576, + "step": 7836 + }, + { + "epoch": 0.8891061957249604, + "grad_norm": 0.5142529010772705, + "learning_rate": 5.836018617973506e-06, + "loss": 2.0373, + "num_input_tokens_seen": 3083599872, + "step": 7842 + }, + { + "epoch": 0.8897864606030973, + "grad_norm": 0.49102193117141724, + "learning_rate": 5.800214822771214e-06, + "loss": 2.0585, + "num_input_tokens_seen": 3085959168, + "step": 7848 + }, + { + "epoch": 0.8904667254812343, + "grad_norm": 0.5983024835586548, + "learning_rate": 5.764411027568922e-06, + "loss": 2.025, + "num_input_tokens_seen": 3088318464, + "step": 7854 + }, + { + "epoch": 0.8911469903593712, + "grad_norm": 0.5602377653121948, + "learning_rate": 5.728607232366631e-06, + "loss": 2.0028, + "num_input_tokens_seen": 3090677760, + "step": 7860 + }, + { + "epoch": 0.891827255237508, + "grad_norm": 0.4956376552581787, + "learning_rate": 5.692803437164339e-06, + "loss": 2.0413, + "num_input_tokens_seen": 3093037056, + "step": 7866 + }, + { + "epoch": 0.892507520115645, + "grad_norm": 0.46066755056381226, + "learning_rate": 5.656999641962048e-06, + "loss": 2.0162, + "num_input_tokens_seen": 3095396352, + "step": 7872 + }, + { + "epoch": 0.8931877849937819, + "grad_norm": 0.48907607793807983, + "learning_rate": 5.621195846759756e-06, + "loss": 1.9745, + "num_input_tokens_seen": 3097755648, + "step": 7878 + }, + { + "epoch": 0.8938680498719189, + "grad_norm": 0.4798557758331299, + "learning_rate": 5.585392051557465e-06, + "loss": 2.0045, + "num_input_tokens_seen": 3100114944, + "step": 7884 + }, + { + "epoch": 0.8945483147500558, + "grad_norm": 0.523992657661438, + "learning_rate": 5.5495882563551736e-06, + "loss": 2.0096, + "num_input_tokens_seen": 3102474240, + "step": 7890 + }, + { + "epoch": 0.8952285796281927, + "grad_norm": 0.48234423995018005, + "learning_rate": 5.5137844611528826e-06, + "loss": 1.9822, + "num_input_tokens_seen": 3104833536, + "step": 7896 + }, + { + "epoch": 0.8959088445063297, + "grad_norm": 0.51031494140625, + "learning_rate": 5.477980665950591e-06, + "loss": 2.0232, + "num_input_tokens_seen": 3107192832, + "step": 7902 + }, + { + "epoch": 0.8965891093844666, + "grad_norm": 0.5131000876426697, + "learning_rate": 5.4421768707483e-06, + "loss": 2.0059, + "num_input_tokens_seen": 3109552128, + "step": 7908 + }, + { + "epoch": 0.8972693742626034, + "grad_norm": 0.510401725769043, + "learning_rate": 5.406373075546009e-06, + "loss": 1.966, + "num_input_tokens_seen": 3111911424, + "step": 7914 + }, + { + "epoch": 0.8979496391407404, + "grad_norm": 0.541610062122345, + "learning_rate": 5.370569280343717e-06, + "loss": 1.9652, + "num_input_tokens_seen": 3114270720, + "step": 7920 + }, + { + "epoch": 0.8986299040188773, + "grad_norm": 0.5096346735954285, + "learning_rate": 5.334765485141426e-06, + "loss": 1.9736, + "num_input_tokens_seen": 3116630016, + "step": 7926 + }, + { + "epoch": 0.8993101688970143, + "grad_norm": 0.5285272002220154, + "learning_rate": 5.298961689939134e-06, + "loss": 1.9764, + "num_input_tokens_seen": 3118989312, + "step": 7932 + }, + { + "epoch": 0.8999904337751512, + "grad_norm": 0.4984615743160248, + "learning_rate": 5.263157894736842e-06, + "loss": 2.0364, + "num_input_tokens_seen": 3121348608, + "step": 7938 + }, + { + "epoch": 0.9006706986532881, + "grad_norm": 0.517405092716217, + "learning_rate": 5.227354099534551e-06, + "loss": 1.9954, + "num_input_tokens_seen": 3123707904, + "step": 7944 + }, + { + "epoch": 0.9013509635314251, + "grad_norm": 0.5145347118377686, + "learning_rate": 5.191550304332259e-06, + "loss": 2.0138, + "num_input_tokens_seen": 3126067200, + "step": 7950 + }, + { + "epoch": 0.902031228409562, + "grad_norm": 0.5413515567779541, + "learning_rate": 5.155746509129968e-06, + "loss": 1.9974, + "num_input_tokens_seen": 3128426496, + "step": 7956 + }, + { + "epoch": 0.902711493287699, + "grad_norm": 0.5247104167938232, + "learning_rate": 5.1199427139276766e-06, + "loss": 2.0054, + "num_input_tokens_seen": 3130785792, + "step": 7962 + }, + { + "epoch": 0.9033917581658358, + "grad_norm": 0.5259600281715393, + "learning_rate": 5.0841389187253856e-06, + "loss": 1.9812, + "num_input_tokens_seen": 3133145088, + "step": 7968 + }, + { + "epoch": 0.9040720230439727, + "grad_norm": 0.537581205368042, + "learning_rate": 5.048335123523094e-06, + "loss": 2.0135, + "num_input_tokens_seen": 3135504384, + "step": 7974 + }, + { + "epoch": 0.9047522879221097, + "grad_norm": 0.5331296920776367, + "learning_rate": 5.012531328320802e-06, + "loss": 2.0023, + "num_input_tokens_seen": 3137863680, + "step": 7980 + }, + { + "epoch": 0.9054325528002466, + "grad_norm": 0.5150538086891174, + "learning_rate": 4.976727533118511e-06, + "loss": 2.046, + "num_input_tokens_seen": 3140222976, + "step": 7986 + }, + { + "epoch": 0.9061128176783835, + "grad_norm": 0.5423092842102051, + "learning_rate": 4.940923737916219e-06, + "loss": 2.0323, + "num_input_tokens_seen": 3142582272, + "step": 7992 + }, + { + "epoch": 0.9067930825565205, + "grad_norm": 0.5528409481048584, + "learning_rate": 4.905119942713928e-06, + "loss": 1.9799, + "num_input_tokens_seen": 3144941568, + "step": 7998 + }, + { + "epoch": 0.9070198375158994, + "eval_accuracy": 0.5879627594627594, + "eval_loss": 1.996025800704956, + "eval_runtime": 129.4235, + "eval_samples_per_second": 3.091, + "eval_steps_per_second": 1.035, + "num_input_tokens_seen": 3145728000, + "step": 8000 + }, + { + "epoch": 0.9074733474346574, + "grad_norm": 0.4775083661079407, + "learning_rate": 4.869316147511636e-06, + "loss": 2.0185, + "num_input_tokens_seen": 3147300864, + "step": 8004 + }, + { + "epoch": 0.9081536123127943, + "grad_norm": 0.5261006355285645, + "learning_rate": 4.833512352309345e-06, + "loss": 1.9824, + "num_input_tokens_seen": 3149660160, + "step": 8010 + }, + { + "epoch": 0.9088338771909312, + "grad_norm": 0.4982771575450897, + "learning_rate": 4.797708557107053e-06, + "loss": 2.0035, + "num_input_tokens_seen": 3152019456, + "step": 8016 + }, + { + "epoch": 0.9095141420690681, + "grad_norm": 0.5401104092597961, + "learning_rate": 4.7619047619047615e-06, + "loss": 1.9991, + "num_input_tokens_seen": 3154378752, + "step": 8022 + }, + { + "epoch": 0.9101944069472051, + "grad_norm": 0.4819372594356537, + "learning_rate": 4.7261009667024705e-06, + "loss": 2.0375, + "num_input_tokens_seen": 3156738048, + "step": 8028 + }, + { + "epoch": 0.910874671825342, + "grad_norm": 0.51005619764328, + "learning_rate": 4.690297171500179e-06, + "loss": 2.0353, + "num_input_tokens_seen": 3159097344, + "step": 8034 + }, + { + "epoch": 0.9115549367034789, + "grad_norm": 0.49865275621414185, + "learning_rate": 4.654493376297888e-06, + "loss": 2.0177, + "num_input_tokens_seen": 3161456640, + "step": 8040 + }, + { + "epoch": 0.9122352015816159, + "grad_norm": 0.4954957962036133, + "learning_rate": 4.618689581095597e-06, + "loss": 2.0235, + "num_input_tokens_seen": 3163815936, + "step": 8046 + }, + { + "epoch": 0.9129154664597527, + "grad_norm": 0.48068705201148987, + "learning_rate": 4.582885785893305e-06, + "loss": 2.0415, + "num_input_tokens_seen": 3166175232, + "step": 8052 + }, + { + "epoch": 0.9135957313378897, + "grad_norm": 0.5091089606285095, + "learning_rate": 4.547081990691014e-06, + "loss": 2.0128, + "num_input_tokens_seen": 3168534528, + "step": 8058 + }, + { + "epoch": 0.9142759962160266, + "grad_norm": 0.48053333163261414, + "learning_rate": 4.511278195488722e-06, + "loss": 2.1089, + "num_input_tokens_seen": 3170893824, + "step": 8064 + }, + { + "epoch": 0.9149562610941635, + "grad_norm": 0.506682276725769, + "learning_rate": 4.475474400286431e-06, + "loss": 1.9918, + "num_input_tokens_seen": 3173253120, + "step": 8070 + }, + { + "epoch": 0.9156365259723005, + "grad_norm": 0.47464847564697266, + "learning_rate": 4.439670605084139e-06, + "loss": 2.0194, + "num_input_tokens_seen": 3175612416, + "step": 8076 + }, + { + "epoch": 0.9163167908504374, + "grad_norm": 0.4886178970336914, + "learning_rate": 4.403866809881848e-06, + "loss": 2.0147, + "num_input_tokens_seen": 3177971712, + "step": 8082 + }, + { + "epoch": 0.9169970557285743, + "grad_norm": 0.5083957314491272, + "learning_rate": 4.368063014679556e-06, + "loss": 1.9979, + "num_input_tokens_seen": 3180331008, + "step": 8088 + }, + { + "epoch": 0.9176773206067113, + "grad_norm": 0.5344352126121521, + "learning_rate": 4.3322592194772645e-06, + "loss": 2.0719, + "num_input_tokens_seen": 3182690304, + "step": 8094 + }, + { + "epoch": 0.9183575854848481, + "grad_norm": 0.4968712627887726, + "learning_rate": 4.2964554242749735e-06, + "loss": 1.987, + "num_input_tokens_seen": 3185049600, + "step": 8100 + }, + { + "epoch": 0.9190378503629851, + "grad_norm": 0.5581889748573303, + "learning_rate": 4.260651629072682e-06, + "loss": 2.0171, + "num_input_tokens_seen": 3187408896, + "step": 8106 + }, + { + "epoch": 0.919718115241122, + "grad_norm": 0.5023228526115417, + "learning_rate": 4.224847833870391e-06, + "loss": 1.9605, + "num_input_tokens_seen": 3189768192, + "step": 8112 + }, + { + "epoch": 0.9203983801192589, + "grad_norm": 0.5072777271270752, + "learning_rate": 4.189044038668099e-06, + "loss": 2.0555, + "num_input_tokens_seen": 3192127488, + "step": 8118 + }, + { + "epoch": 0.9210786449973959, + "grad_norm": 0.501773476600647, + "learning_rate": 4.153240243465808e-06, + "loss": 2.0703, + "num_input_tokens_seen": 3194486784, + "step": 8124 + }, + { + "epoch": 0.9217589098755328, + "grad_norm": 0.5245522856712341, + "learning_rate": 4.117436448263516e-06, + "loss": 2.0284, + "num_input_tokens_seen": 3196846080, + "step": 8130 + }, + { + "epoch": 0.9224391747536697, + "grad_norm": 0.5125513672828674, + "learning_rate": 4.081632653061224e-06, + "loss": 1.9894, + "num_input_tokens_seen": 3199205376, + "step": 8136 + }, + { + "epoch": 0.9231194396318066, + "grad_norm": 0.5277597904205322, + "learning_rate": 4.045828857858933e-06, + "loss": 1.9765, + "num_input_tokens_seen": 3201564672, + "step": 8142 + }, + { + "epoch": 0.9237997045099435, + "grad_norm": 0.48302364349365234, + "learning_rate": 4.010025062656641e-06, + "loss": 1.9909, + "num_input_tokens_seen": 3203923968, + "step": 8148 + }, + { + "epoch": 0.9244799693880805, + "grad_norm": 0.4815656244754791, + "learning_rate": 3.97422126745435e-06, + "loss": 2.069, + "num_input_tokens_seen": 3206283264, + "step": 8154 + }, + { + "epoch": 0.9251602342662174, + "grad_norm": 0.4820353090763092, + "learning_rate": 3.9384174722520585e-06, + "loss": 1.9816, + "num_input_tokens_seen": 3208642560, + "step": 8160 + }, + { + "epoch": 0.9258404991443543, + "grad_norm": 0.4983651041984558, + "learning_rate": 3.9026136770497675e-06, + "loss": 2.0272, + "num_input_tokens_seen": 3211001856, + "step": 8166 + }, + { + "epoch": 0.9265207640224913, + "grad_norm": 0.4833485186100006, + "learning_rate": 3.866809881847476e-06, + "loss": 2.0169, + "num_input_tokens_seen": 3213361152, + "step": 8172 + }, + { + "epoch": 0.9272010289006282, + "grad_norm": 0.5084331631660461, + "learning_rate": 3.831006086645185e-06, + "loss": 1.9514, + "num_input_tokens_seen": 3215720448, + "step": 8178 + }, + { + "epoch": 0.9278812937787652, + "grad_norm": 0.5215640068054199, + "learning_rate": 3.7952022914428932e-06, + "loss": 2.0084, + "num_input_tokens_seen": 3218079744, + "step": 8184 + }, + { + "epoch": 0.928561558656902, + "grad_norm": 0.48457303643226624, + "learning_rate": 3.7593984962406014e-06, + "loss": 1.99, + "num_input_tokens_seen": 3220439040, + "step": 8190 + }, + { + "epoch": 0.9292418235350389, + "grad_norm": 0.4908931851387024, + "learning_rate": 3.7235947010383104e-06, + "loss": 2.031, + "num_input_tokens_seen": 3222798336, + "step": 8196 + }, + { + "epoch": 0.9299220884131759, + "grad_norm": 0.486664354801178, + "learning_rate": 3.6877909058360186e-06, + "loss": 2.0522, + "num_input_tokens_seen": 3225157632, + "step": 8202 + }, + { + "epoch": 0.9306023532913128, + "grad_norm": 0.5011295676231384, + "learning_rate": 3.6519871106337276e-06, + "loss": 1.9818, + "num_input_tokens_seen": 3227516928, + "step": 8208 + }, + { + "epoch": 0.9312826181694497, + "grad_norm": 0.5124307870864868, + "learning_rate": 3.6161833154314357e-06, + "loss": 2.0587, + "num_input_tokens_seen": 3229876224, + "step": 8214 + }, + { + "epoch": 0.9319628830475867, + "grad_norm": 0.5010582804679871, + "learning_rate": 3.5803795202291443e-06, + "loss": 2.0345, + "num_input_tokens_seen": 3232235520, + "step": 8220 + }, + { + "epoch": 0.9326431479257236, + "grad_norm": 0.5208165645599365, + "learning_rate": 3.5445757250268533e-06, + "loss": 1.9941, + "num_input_tokens_seen": 3234594816, + "step": 8226 + }, + { + "epoch": 0.9333234128038606, + "grad_norm": 0.5112361311912537, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.9566, + "num_input_tokens_seen": 3236954112, + "step": 8232 + }, + { + "epoch": 0.9340036776819974, + "grad_norm": 0.48557788133621216, + "learning_rate": 3.4729681346222705e-06, + "loss": 1.9702, + "num_input_tokens_seen": 3239313408, + "step": 8238 + }, + { + "epoch": 0.9346839425601343, + "grad_norm": 0.5145829319953918, + "learning_rate": 3.4371643394199786e-06, + "loss": 2.0183, + "num_input_tokens_seen": 3241672704, + "step": 8244 + }, + { + "epoch": 0.9353642074382713, + "grad_norm": 0.44660866260528564, + "learning_rate": 3.4013605442176877e-06, + "loss": 2.0032, + "num_input_tokens_seen": 3244032000, + "step": 8250 + }, + { + "epoch": 0.9360444723164082, + "grad_norm": 0.5104228258132935, + "learning_rate": 3.365556749015396e-06, + "loss": 2.0313, + "num_input_tokens_seen": 3246391296, + "step": 8256 + }, + { + "epoch": 0.9367247371945451, + "grad_norm": 0.5160300731658936, + "learning_rate": 3.329752953813104e-06, + "loss": 1.9679, + "num_input_tokens_seen": 3248750592, + "step": 8262 + }, + { + "epoch": 0.9374050020726821, + "grad_norm": 0.4720374643802643, + "learning_rate": 3.293949158610813e-06, + "loss": 1.9988, + "num_input_tokens_seen": 3251109888, + "step": 8268 + }, + { + "epoch": 0.938085266950819, + "grad_norm": 0.4732125699520111, + "learning_rate": 3.258145363408521e-06, + "loss": 2.0359, + "num_input_tokens_seen": 3253469184, + "step": 8274 + }, + { + "epoch": 0.938765531828956, + "grad_norm": 0.4820111095905304, + "learning_rate": 3.22234156820623e-06, + "loss": 1.9772, + "num_input_tokens_seen": 3255828480, + "step": 8280 + }, + { + "epoch": 0.9394457967070928, + "grad_norm": 0.48187270760536194, + "learning_rate": 3.1865377730039383e-06, + "loss": 1.9989, + "num_input_tokens_seen": 3258187776, + "step": 8286 + }, + { + "epoch": 0.9401260615852297, + "grad_norm": 0.47333669662475586, + "learning_rate": 3.1507339778016473e-06, + "loss": 1.9811, + "num_input_tokens_seen": 3260547072, + "step": 8292 + }, + { + "epoch": 0.9408063264633667, + "grad_norm": 0.5094246864318848, + "learning_rate": 3.114930182599356e-06, + "loss": 1.9739, + "num_input_tokens_seen": 3262906368, + "step": 8298 + }, + { + "epoch": 0.9414865913415036, + "grad_norm": 0.5325969457626343, + "learning_rate": 3.0791263873970645e-06, + "loss": 2.015, + "num_input_tokens_seen": 3265265664, + "step": 8304 + }, + { + "epoch": 0.9421668562196405, + "grad_norm": 0.4827982485294342, + "learning_rate": 3.043322592194773e-06, + "loss": 1.9949, + "num_input_tokens_seen": 3267624960, + "step": 8310 + }, + { + "epoch": 0.9428471210977775, + "grad_norm": 0.4823977053165436, + "learning_rate": 3.007518796992481e-06, + "loss": 1.9807, + "num_input_tokens_seen": 3269984256, + "step": 8316 + }, + { + "epoch": 0.9435273859759143, + "grad_norm": 0.4721021354198456, + "learning_rate": 2.97171500179019e-06, + "loss": 1.9597, + "num_input_tokens_seen": 3272343552, + "step": 8322 + }, + { + "epoch": 0.9442076508540513, + "grad_norm": 0.4703858494758606, + "learning_rate": 2.9359112065878984e-06, + "loss": 1.9861, + "num_input_tokens_seen": 3274702848, + "step": 8328 + }, + { + "epoch": 0.9448879157321882, + "grad_norm": 0.5197435021400452, + "learning_rate": 2.900107411385607e-06, + "loss": 2.0625, + "num_input_tokens_seen": 3277062144, + "step": 8334 + }, + { + "epoch": 0.9455681806103251, + "grad_norm": 0.47608399391174316, + "learning_rate": 2.8643036161833155e-06, + "loss": 2.0277, + "num_input_tokens_seen": 3279421440, + "step": 8340 + }, + { + "epoch": 0.9462484454884621, + "grad_norm": 0.5438135266304016, + "learning_rate": 2.828499820981024e-06, + "loss": 1.9903, + "num_input_tokens_seen": 3281780736, + "step": 8346 + }, + { + "epoch": 0.946928710366599, + "grad_norm": 0.48217347264289856, + "learning_rate": 2.7926960257787327e-06, + "loss": 2.0009, + "num_input_tokens_seen": 3284140032, + "step": 8352 + }, + { + "epoch": 0.947608975244736, + "grad_norm": 0.47104737162590027, + "learning_rate": 2.7568922305764413e-06, + "loss": 2.0084, + "num_input_tokens_seen": 3286499328, + "step": 8358 + }, + { + "epoch": 0.9482892401228729, + "grad_norm": 0.5058236718177795, + "learning_rate": 2.72108843537415e-06, + "loss": 1.9667, + "num_input_tokens_seen": 3288858624, + "step": 8364 + }, + { + "epoch": 0.9489695050010097, + "grad_norm": 0.4855674207210541, + "learning_rate": 2.6852846401718585e-06, + "loss": 1.9953, + "num_input_tokens_seen": 3291217920, + "step": 8370 + }, + { + "epoch": 0.9496497698791467, + "grad_norm": 0.49368613958358765, + "learning_rate": 2.649480844969567e-06, + "loss": 2.0019, + "num_input_tokens_seen": 3293577216, + "step": 8376 + }, + { + "epoch": 0.9503300347572836, + "grad_norm": 0.4895451068878174, + "learning_rate": 2.6136770497672756e-06, + "loss": 2.0552, + "num_input_tokens_seen": 3295936512, + "step": 8382 + }, + { + "epoch": 0.9510102996354205, + "grad_norm": 0.4846164882183075, + "learning_rate": 2.577873254564984e-06, + "loss": 2.0175, + "num_input_tokens_seen": 3298295808, + "step": 8388 + }, + { + "epoch": 0.9516905645135575, + "grad_norm": 0.4728488028049469, + "learning_rate": 2.5420694593626928e-06, + "loss": 2.0314, + "num_input_tokens_seen": 3300655104, + "step": 8394 + }, + { + "epoch": 0.9523708293916944, + "grad_norm": 0.47742366790771484, + "learning_rate": 2.506265664160401e-06, + "loss": 2.0056, + "num_input_tokens_seen": 3303014400, + "step": 8400 + }, + { + "epoch": 0.9523708293916944, + "eval_accuracy": 0.5882161172161172, + "eval_loss": 1.9941824674606323, + "eval_runtime": 129.6127, + "eval_samples_per_second": 3.086, + "eval_steps_per_second": 1.034, + "num_input_tokens_seen": 3303014400, + "step": 8400 + }, + { + "epoch": 0.9530510942698314, + "grad_norm": 0.49103352427482605, + "learning_rate": 2.4704618689581095e-06, + "loss": 2.0074, + "num_input_tokens_seen": 3305373696, + "step": 8406 + }, + { + "epoch": 0.9537313591479682, + "grad_norm": 0.47667092084884644, + "learning_rate": 2.434658073755818e-06, + "loss": 2.0055, + "num_input_tokens_seen": 3307732992, + "step": 8412 + }, + { + "epoch": 0.9544116240261051, + "grad_norm": 0.5088315606117249, + "learning_rate": 2.3988542785535267e-06, + "loss": 2.0492, + "num_input_tokens_seen": 3310092288, + "step": 8418 + }, + { + "epoch": 0.9550918889042421, + "grad_norm": 0.5331157445907593, + "learning_rate": 2.3630504833512353e-06, + "loss": 1.991, + "num_input_tokens_seen": 3312451584, + "step": 8424 + }, + { + "epoch": 0.955772153782379, + "grad_norm": 0.4914342164993286, + "learning_rate": 2.327246688148944e-06, + "loss": 2.05, + "num_input_tokens_seen": 3314810880, + "step": 8430 + }, + { + "epoch": 0.9564524186605159, + "grad_norm": 0.5580516457557678, + "learning_rate": 2.2914428929466524e-06, + "loss": 2.0284, + "num_input_tokens_seen": 3317170176, + "step": 8436 + }, + { + "epoch": 0.9571326835386529, + "grad_norm": 0.5167604088783264, + "learning_rate": 2.255639097744361e-06, + "loss": 1.9835, + "num_input_tokens_seen": 3319529472, + "step": 8442 + }, + { + "epoch": 0.9578129484167898, + "grad_norm": 0.46328479051589966, + "learning_rate": 2.2198353025420696e-06, + "loss": 2.0001, + "num_input_tokens_seen": 3321888768, + "step": 8448 + }, + { + "epoch": 0.9584932132949268, + "grad_norm": 0.489848256111145, + "learning_rate": 2.184031507339778e-06, + "loss": 1.9874, + "num_input_tokens_seen": 3324248064, + "step": 8454 + }, + { + "epoch": 0.9591734781730636, + "grad_norm": 0.4731234312057495, + "learning_rate": 2.1482277121374868e-06, + "loss": 2.0577, + "num_input_tokens_seen": 3326607360, + "step": 8460 + }, + { + "epoch": 0.9598537430512005, + "grad_norm": 0.46996498107910156, + "learning_rate": 2.1124239169351953e-06, + "loss": 1.958, + "num_input_tokens_seen": 3328966656, + "step": 8466 + }, + { + "epoch": 0.9605340079293375, + "grad_norm": 0.4455466866493225, + "learning_rate": 2.076620121732904e-06, + "loss": 2.0721, + "num_input_tokens_seen": 3331325952, + "step": 8472 + }, + { + "epoch": 0.9612142728074744, + "grad_norm": 0.483164519071579, + "learning_rate": 2.040816326530612e-06, + "loss": 1.997, + "num_input_tokens_seen": 3333685248, + "step": 8478 + }, + { + "epoch": 0.9618945376856113, + "grad_norm": 0.46224308013916016, + "learning_rate": 2.0050125313283207e-06, + "loss": 1.9534, + "num_input_tokens_seen": 3336044544, + "step": 8484 + }, + { + "epoch": 0.9625748025637483, + "grad_norm": 0.5407201647758484, + "learning_rate": 1.9692087361260292e-06, + "loss": 2.0635, + "num_input_tokens_seen": 3338403840, + "step": 8490 + }, + { + "epoch": 0.9632550674418852, + "grad_norm": 0.49724259972572327, + "learning_rate": 1.933404940923738e-06, + "loss": 2.0165, + "num_input_tokens_seen": 3340763136, + "step": 8496 + }, + { + "epoch": 0.9639353323200222, + "grad_norm": 0.4704829156398773, + "learning_rate": 1.8976011457214466e-06, + "loss": 1.995, + "num_input_tokens_seen": 3343122432, + "step": 8502 + }, + { + "epoch": 0.964615597198159, + "grad_norm": 0.4864175319671631, + "learning_rate": 1.8617973505191552e-06, + "loss": 2.0195, + "num_input_tokens_seen": 3345481728, + "step": 8508 + }, + { + "epoch": 0.9652958620762959, + "grad_norm": 0.5042557120323181, + "learning_rate": 1.8259935553168638e-06, + "loss": 1.9928, + "num_input_tokens_seen": 3347841024, + "step": 8514 + }, + { + "epoch": 0.9659761269544329, + "grad_norm": 0.5622674822807312, + "learning_rate": 1.7901897601145722e-06, + "loss": 2.0166, + "num_input_tokens_seen": 3350200320, + "step": 8520 + }, + { + "epoch": 0.9666563918325698, + "grad_norm": 0.4886009395122528, + "learning_rate": 1.7543859649122807e-06, + "loss": 2.0512, + "num_input_tokens_seen": 3352559616, + "step": 8526 + }, + { + "epoch": 0.9673366567107067, + "grad_norm": 0.48748981952667236, + "learning_rate": 1.7185821697099893e-06, + "loss": 2.0336, + "num_input_tokens_seen": 3354918912, + "step": 8532 + }, + { + "epoch": 0.9680169215888437, + "grad_norm": 0.4899289608001709, + "learning_rate": 1.682778374507698e-06, + "loss": 2.0008, + "num_input_tokens_seen": 3357278208, + "step": 8538 + }, + { + "epoch": 0.9686971864669806, + "grad_norm": 0.465916246175766, + "learning_rate": 1.6469745793054065e-06, + "loss": 1.9239, + "num_input_tokens_seen": 3359637504, + "step": 8544 + }, + { + "epoch": 0.9693774513451175, + "grad_norm": 0.5022467374801636, + "learning_rate": 1.611170784103115e-06, + "loss": 2.0017, + "num_input_tokens_seen": 3361996800, + "step": 8550 + }, + { + "epoch": 0.9700577162232544, + "grad_norm": 0.47663870453834534, + "learning_rate": 1.5753669889008237e-06, + "loss": 1.9944, + "num_input_tokens_seen": 3364356096, + "step": 8556 + }, + { + "epoch": 0.9707379811013913, + "grad_norm": 0.48725882172584534, + "learning_rate": 1.5395631936985322e-06, + "loss": 1.9945, + "num_input_tokens_seen": 3366715392, + "step": 8562 + }, + { + "epoch": 0.9714182459795283, + "grad_norm": 0.47763851284980774, + "learning_rate": 1.5037593984962406e-06, + "loss": 2.0526, + "num_input_tokens_seen": 3369074688, + "step": 8568 + }, + { + "epoch": 0.9720985108576652, + "grad_norm": 0.4931076467037201, + "learning_rate": 1.4679556032939492e-06, + "loss": 1.9821, + "num_input_tokens_seen": 3371433984, + "step": 8574 + }, + { + "epoch": 0.9727787757358022, + "grad_norm": 0.4717552363872528, + "learning_rate": 1.4321518080916578e-06, + "loss": 1.9392, + "num_input_tokens_seen": 3373793280, + "step": 8580 + }, + { + "epoch": 0.9734590406139391, + "grad_norm": 0.46910974383354187, + "learning_rate": 1.3963480128893664e-06, + "loss": 2.061, + "num_input_tokens_seen": 3376152576, + "step": 8586 + }, + { + "epoch": 0.974139305492076, + "grad_norm": 0.4669153690338135, + "learning_rate": 1.360544217687075e-06, + "loss": 2.0235, + "num_input_tokens_seen": 3378511872, + "step": 8592 + }, + { + "epoch": 0.9748195703702129, + "grad_norm": 0.49458202719688416, + "learning_rate": 1.3247404224847835e-06, + "loss": 1.9982, + "num_input_tokens_seen": 3380871168, + "step": 8598 + }, + { + "epoch": 0.9754998352483498, + "grad_norm": 0.4920654892921448, + "learning_rate": 1.288936627282492e-06, + "loss": 2.0098, + "num_input_tokens_seen": 3383230464, + "step": 8604 + }, + { + "epoch": 0.9761801001264867, + "grad_norm": 0.46870675683021545, + "learning_rate": 1.2531328320802005e-06, + "loss": 2.0151, + "num_input_tokens_seen": 3385589760, + "step": 8610 + }, + { + "epoch": 0.9768603650046237, + "grad_norm": 0.4873650372028351, + "learning_rate": 1.217329036877909e-06, + "loss": 2.0151, + "num_input_tokens_seen": 3387949056, + "step": 8616 + }, + { + "epoch": 0.9775406298827606, + "grad_norm": 0.4861888587474823, + "learning_rate": 1.1815252416756176e-06, + "loss": 2.0329, + "num_input_tokens_seen": 3390308352, + "step": 8622 + }, + { + "epoch": 0.9782208947608976, + "grad_norm": 0.48227134346961975, + "learning_rate": 1.1457214464733262e-06, + "loss": 2.049, + "num_input_tokens_seen": 3392667648, + "step": 8628 + }, + { + "epoch": 0.9789011596390345, + "grad_norm": 0.5111281871795654, + "learning_rate": 1.1099176512710348e-06, + "loss": 2.0235, + "num_input_tokens_seen": 3395026944, + "step": 8634 + }, + { + "epoch": 0.9795814245171713, + "grad_norm": 0.4849562644958496, + "learning_rate": 1.0741138560687434e-06, + "loss": 2.043, + "num_input_tokens_seen": 3397386240, + "step": 8640 + }, + { + "epoch": 0.9802616893953083, + "grad_norm": 0.461967408657074, + "learning_rate": 1.038310060866452e-06, + "loss": 2.0346, + "num_input_tokens_seen": 3399745536, + "step": 8646 + }, + { + "epoch": 0.9809419542734452, + "grad_norm": 0.5269701480865479, + "learning_rate": 1.0025062656641603e-06, + "loss": 1.9499, + "num_input_tokens_seen": 3402104832, + "step": 8652 + }, + { + "epoch": 0.9816222191515821, + "grad_norm": 0.4462730586528778, + "learning_rate": 9.66702470461869e-07, + "loss": 1.9948, + "num_input_tokens_seen": 3404464128, + "step": 8658 + }, + { + "epoch": 0.9823024840297191, + "grad_norm": 0.459370493888855, + "learning_rate": 9.308986752595776e-07, + "loss": 2.0511, + "num_input_tokens_seen": 3406823424, + "step": 8664 + }, + { + "epoch": 0.982982748907856, + "grad_norm": 0.49241387844085693, + "learning_rate": 8.950948800572861e-07, + "loss": 1.994, + "num_input_tokens_seen": 3409182720, + "step": 8670 + }, + { + "epoch": 0.983663013785993, + "grad_norm": 0.4557535648345947, + "learning_rate": 8.592910848549947e-07, + "loss": 2.0506, + "num_input_tokens_seen": 3411542016, + "step": 8676 + }, + { + "epoch": 0.9843432786641299, + "grad_norm": 0.48074275255203247, + "learning_rate": 8.234872896527032e-07, + "loss": 1.946, + "num_input_tokens_seen": 3413901312, + "step": 8682 + }, + { + "epoch": 0.9850235435422667, + "grad_norm": 0.4807458221912384, + "learning_rate": 7.876834944504118e-07, + "loss": 1.9977, + "num_input_tokens_seen": 3416260608, + "step": 8688 + }, + { + "epoch": 0.9857038084204037, + "grad_norm": 0.4791664183139801, + "learning_rate": 7.518796992481203e-07, + "loss": 1.9519, + "num_input_tokens_seen": 3418619904, + "step": 8694 + }, + { + "epoch": 0.9863840732985406, + "grad_norm": 0.45667803287506104, + "learning_rate": 7.160759040458289e-07, + "loss": 2.0047, + "num_input_tokens_seen": 3420979200, + "step": 8700 + }, + { + "epoch": 0.9870643381766775, + "grad_norm": 0.456066370010376, + "learning_rate": 6.802721088435375e-07, + "loss": 2.023, + "num_input_tokens_seen": 3423338496, + "step": 8706 + }, + { + "epoch": 0.9877446030548145, + "grad_norm": 0.49028295278549194, + "learning_rate": 6.44468313641246e-07, + "loss": 2.0506, + "num_input_tokens_seen": 3425697792, + "step": 8712 + }, + { + "epoch": 0.9884248679329514, + "grad_norm": 0.4853602647781372, + "learning_rate": 6.086645184389545e-07, + "loss": 2.0127, + "num_input_tokens_seen": 3428057088, + "step": 8718 + }, + { + "epoch": 0.9891051328110884, + "grad_norm": 0.4771934449672699, + "learning_rate": 5.728607232366631e-07, + "loss": 2.0125, + "num_input_tokens_seen": 3430416384, + "step": 8724 + }, + { + "epoch": 0.9897853976892252, + "grad_norm": 0.47227999567985535, + "learning_rate": 5.370569280343717e-07, + "loss": 1.9608, + "num_input_tokens_seen": 3432775680, + "step": 8730 + }, + { + "epoch": 0.9904656625673621, + "grad_norm": 0.5044968128204346, + "learning_rate": 5.012531328320802e-07, + "loss": 2.0495, + "num_input_tokens_seen": 3435134976, + "step": 8736 + }, + { + "epoch": 0.9911459274454991, + "grad_norm": 0.47824859619140625, + "learning_rate": 4.654493376297888e-07, + "loss": 2.0122, + "num_input_tokens_seen": 3437494272, + "step": 8742 + }, + { + "epoch": 0.991826192323636, + "grad_norm": 0.4707111120223999, + "learning_rate": 4.2964554242749733e-07, + "loss": 2.0036, + "num_input_tokens_seen": 3439853568, + "step": 8748 + }, + { + "epoch": 0.992506457201773, + "grad_norm": 0.47407931089401245, + "learning_rate": 3.938417472252059e-07, + "loss": 2.0537, + "num_input_tokens_seen": 3442212864, + "step": 8754 + }, + { + "epoch": 0.9931867220799099, + "grad_norm": 0.46975448727607727, + "learning_rate": 3.5803795202291444e-07, + "loss": 1.9721, + "num_input_tokens_seen": 3444572160, + "step": 8760 + }, + { + "epoch": 0.9938669869580468, + "grad_norm": 0.46711423993110657, + "learning_rate": 3.22234156820623e-07, + "loss": 2.0284, + "num_input_tokens_seen": 3446931456, + "step": 8766 + }, + { + "epoch": 0.9945472518361838, + "grad_norm": 0.4574568271636963, + "learning_rate": 2.8643036161833155e-07, + "loss": 1.9764, + "num_input_tokens_seen": 3449290752, + "step": 8772 + }, + { + "epoch": 0.9952275167143206, + "grad_norm": 0.4745030105113983, + "learning_rate": 2.506265664160401e-07, + "loss": 2.0551, + "num_input_tokens_seen": 3451650048, + "step": 8778 + }, + { + "epoch": 0.9959077815924575, + "grad_norm": 0.48132795095443726, + "learning_rate": 2.1482277121374867e-07, + "loss": 1.9993, + "num_input_tokens_seen": 3454009344, + "step": 8784 + }, + { + "epoch": 0.9965880464705945, + "grad_norm": 0.4754565954208374, + "learning_rate": 1.7901897601145722e-07, + "loss": 2.0276, + "num_input_tokens_seen": 3456368640, + "step": 8790 + }, + { + "epoch": 0.9972683113487314, + "grad_norm": 0.4709925949573517, + "learning_rate": 1.4321518080916578e-07, + "loss": 1.9961, + "num_input_tokens_seen": 3458727936, + "step": 8796 + }, + { + "epoch": 0.9977218212674893, + "eval_accuracy": 0.5884273504273504, + "eval_loss": 1.992612361907959, + "eval_runtime": 129.0731, + "eval_samples_per_second": 3.099, + "eval_steps_per_second": 1.038, + "num_input_tokens_seen": 3460300800, + "step": 8800 + }, + { + "epoch": 0.9979485762268684, + "grad_norm": 0.4582703709602356, + "learning_rate": 1.0741138560687433e-07, + "loss": 2.0105, + "num_input_tokens_seen": 3461087232, + "step": 8802 + }, + { + "epoch": 0.9986288411050053, + "grad_norm": 0.4576333463191986, + "learning_rate": 7.160759040458289e-08, + "loss": 1.974, + "num_input_tokens_seen": 3463446528, + "step": 8808 + }, + { + "epoch": 0.9993091059831422, + "grad_norm": 0.46073076128959656, + "learning_rate": 3.5803795202291444e-08, + "loss": 2.0213, + "num_input_tokens_seen": 3465805824, + "step": 8814 + }, + { + "epoch": 0.9999893708612791, + "grad_norm": 0.48334258794784546, + "learning_rate": 0.0, + "loss": 1.9662, + "num_input_tokens_seen": 3468165120, + "step": 8820 + }, + { + "epoch": 0.9999893708612791, + "num_input_tokens_seen": 3468165120, + "step": 8820, + "total_flos": 4.540784328132526e+18, + "train_loss": 2.065564124978859, + "train_runtime": 98524.0638, + "train_samples_per_second": 8.594, + "train_steps_per_second": 0.09 + } + ], + "logging_steps": 6, + "max_steps": 8820, + "num_input_tokens_seen": 3468165120, + "num_train_epochs": 1, + "save_steps": 200, + "total_flos": 4.540784328132526e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}