{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999893708612791, "eval_steps": 400, "global_step": 8820, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006802648781369246, "grad_norm": 3.334683656692505, "learning_rate": 6.802721088435375e-07, "loss": 2.9439, "num_input_tokens_seen": 2359296, "step": 6 }, { "epoch": 0.0013605297562738492, "grad_norm": 2.447828769683838, "learning_rate": 1.360544217687075e-06, "loss": 2.918, "num_input_tokens_seen": 4718592, "step": 12 }, { "epoch": 0.0020407946344107738, "grad_norm": 1.870556116104126, "learning_rate": 2.040816326530612e-06, "loss": 2.8392, "num_input_tokens_seen": 7077888, "step": 18 }, { "epoch": 0.0027210595125476984, "grad_norm": 1.367610216140747, "learning_rate": 2.72108843537415e-06, "loss": 2.7568, "num_input_tokens_seen": 9437184, "step": 24 }, { "epoch": 0.003401324390684623, "grad_norm": 1.1030510663986206, "learning_rate": 3.4013605442176877e-06, "loss": 2.71, "num_input_tokens_seen": 11796480, "step": 30 }, { "epoch": 0.0040815892688215475, "grad_norm": 0.904552161693573, "learning_rate": 4.081632653061224e-06, "loss": 2.6018, "num_input_tokens_seen": 14155776, "step": 36 }, { "epoch": 0.004761854146958472, "grad_norm": 0.7523818612098694, "learning_rate": 4.7619047619047615e-06, "loss": 2.5708, "num_input_tokens_seen": 16515072, "step": 42 }, { "epoch": 0.005442119025095397, "grad_norm": 0.6976463198661804, "learning_rate": 5.4421768707483e-06, "loss": 2.5212, "num_input_tokens_seen": 18874368, "step": 48 }, { "epoch": 0.006122383903232321, "grad_norm": 0.6655228734016418, "learning_rate": 6.122448979591837e-06, "loss": 2.502, "num_input_tokens_seen": 21233664, "step": 54 }, { "epoch": 0.006802648781369246, "grad_norm": 0.6486473679542542, "learning_rate": 6.802721088435375e-06, "loss": 2.529, "num_input_tokens_seen": 23592960, "step": 60 }, { "epoch": 0.00748291365950617, "grad_norm": 0.5861107110977173, "learning_rate": 7.482993197278912e-06, "loss": 2.501, "num_input_tokens_seen": 25952256, "step": 66 }, { "epoch": 0.008163178537643095, "grad_norm": 0.6155074834823608, "learning_rate": 8.163265306122448e-06, "loss": 2.4401, "num_input_tokens_seen": 28311552, "step": 72 }, { "epoch": 0.00884344341578002, "grad_norm": 0.6312918663024902, "learning_rate": 8.843537414965987e-06, "loss": 2.4007, "num_input_tokens_seen": 30670848, "step": 78 }, { "epoch": 0.009523708293916943, "grad_norm": 0.5730547904968262, "learning_rate": 9.523809523809523e-06, "loss": 2.4158, "num_input_tokens_seen": 33030144, "step": 84 }, { "epoch": 0.01020397317205387, "grad_norm": 0.6233570575714111, "learning_rate": 1.0204081632653061e-05, "loss": 2.3776, "num_input_tokens_seen": 35389440, "step": 90 }, { "epoch": 0.010884238050190793, "grad_norm": 0.6111224293708801, "learning_rate": 1.08843537414966e-05, "loss": 2.3986, "num_input_tokens_seen": 37748736, "step": 96 }, { "epoch": 0.011564502928327718, "grad_norm": 0.6153233647346497, "learning_rate": 1.1564625850340138e-05, "loss": 2.3898, "num_input_tokens_seen": 40108032, "step": 102 }, { "epoch": 0.012244767806464642, "grad_norm": 0.5808484554290771, "learning_rate": 1.2244897959183674e-05, "loss": 2.3324, "num_input_tokens_seen": 42467328, "step": 108 }, { "epoch": 0.012925032684601566, "grad_norm": 0.6722745299339294, "learning_rate": 1.2925170068027212e-05, "loss": 2.3763, "num_input_tokens_seen": 44826624, "step": 114 }, { "epoch": 0.013605297562738492, "grad_norm": 0.5742015242576599, "learning_rate": 1.360544217687075e-05, "loss": 2.341, "num_input_tokens_seen": 47185920, "step": 120 }, { "epoch": 0.014285562440875416, "grad_norm": 0.572271466255188, "learning_rate": 1.4285714285714285e-05, "loss": 2.3944, "num_input_tokens_seen": 49545216, "step": 126 }, { "epoch": 0.01496582731901234, "grad_norm": 0.6920814514160156, "learning_rate": 1.4965986394557824e-05, "loss": 2.2826, "num_input_tokens_seen": 51904512, "step": 132 }, { "epoch": 0.015646092197149266, "grad_norm": 0.6880550384521484, "learning_rate": 1.5646258503401362e-05, "loss": 2.3113, "num_input_tokens_seen": 54263808, "step": 138 }, { "epoch": 0.01632635707528619, "grad_norm": 0.7120464444160461, "learning_rate": 1.6326530612244897e-05, "loss": 2.299, "num_input_tokens_seen": 56623104, "step": 144 }, { "epoch": 0.017006621953423114, "grad_norm": 0.7833120226860046, "learning_rate": 1.7006802721088435e-05, "loss": 2.3243, "num_input_tokens_seen": 58982400, "step": 150 }, { "epoch": 0.01768688683156004, "grad_norm": 0.6928340792655945, "learning_rate": 1.7687074829931973e-05, "loss": 2.3074, "num_input_tokens_seen": 61341696, "step": 156 }, { "epoch": 0.018367151709696963, "grad_norm": 0.6552649736404419, "learning_rate": 1.836734693877551e-05, "loss": 2.2916, "num_input_tokens_seen": 63700992, "step": 162 }, { "epoch": 0.019047416587833887, "grad_norm": 0.776566743850708, "learning_rate": 1.9047619047619046e-05, "loss": 2.3148, "num_input_tokens_seen": 66060288, "step": 168 }, { "epoch": 0.01972768146597081, "grad_norm": 0.8164829611778259, "learning_rate": 1.9727891156462584e-05, "loss": 2.3097, "num_input_tokens_seen": 68419584, "step": 174 }, { "epoch": 0.02040794634410774, "grad_norm": 0.6100189685821533, "learning_rate": 2.0408163265306123e-05, "loss": 2.279, "num_input_tokens_seen": 70778880, "step": 180 }, { "epoch": 0.021088211222244663, "grad_norm": 0.6935499310493469, "learning_rate": 2.108843537414966e-05, "loss": 2.3033, "num_input_tokens_seen": 73138176, "step": 186 }, { "epoch": 0.021768476100381587, "grad_norm": 0.6464715600013733, "learning_rate": 2.17687074829932e-05, "loss": 2.29, "num_input_tokens_seen": 75497472, "step": 192 }, { "epoch": 0.02244874097851851, "grad_norm": 0.6630780696868896, "learning_rate": 2.2448979591836737e-05, "loss": 2.2882, "num_input_tokens_seen": 77856768, "step": 198 }, { "epoch": 0.023129005856655435, "grad_norm": 0.6190893054008484, "learning_rate": 2.3129251700680275e-05, "loss": 2.2754, "num_input_tokens_seen": 80216064, "step": 204 }, { "epoch": 0.02380927073479236, "grad_norm": 0.7775458097457886, "learning_rate": 2.380952380952381e-05, "loss": 2.2589, "num_input_tokens_seen": 82575360, "step": 210 }, { "epoch": 0.024489535612929283, "grad_norm": 0.7430821657180786, "learning_rate": 2.448979591836735e-05, "loss": 2.2157, "num_input_tokens_seen": 84934656, "step": 216 }, { "epoch": 0.025169800491066208, "grad_norm": 0.8468402624130249, "learning_rate": 2.5170068027210887e-05, "loss": 2.2681, "num_input_tokens_seen": 87293952, "step": 222 }, { "epoch": 0.025850065369203132, "grad_norm": 0.6779688000679016, "learning_rate": 2.5850340136054425e-05, "loss": 2.2527, "num_input_tokens_seen": 89653248, "step": 228 }, { "epoch": 0.02653033024734006, "grad_norm": 0.7911133170127869, "learning_rate": 2.6530612244897963e-05, "loss": 2.2089, "num_input_tokens_seen": 92012544, "step": 234 }, { "epoch": 0.027210595125476984, "grad_norm": 0.8615039587020874, "learning_rate": 2.72108843537415e-05, "loss": 2.2765, "num_input_tokens_seen": 94371840, "step": 240 }, { "epoch": 0.027890860003613908, "grad_norm": 0.8226616978645325, "learning_rate": 2.7891156462585033e-05, "loss": 2.2692, "num_input_tokens_seen": 96731136, "step": 246 }, { "epoch": 0.028571124881750832, "grad_norm": 0.7901866436004639, "learning_rate": 2.857142857142857e-05, "loss": 2.2716, "num_input_tokens_seen": 99090432, "step": 252 }, { "epoch": 0.029251389759887756, "grad_norm": 0.7447572350502014, "learning_rate": 2.925170068027211e-05, "loss": 2.2277, "num_input_tokens_seen": 101449728, "step": 258 }, { "epoch": 0.02993165463802468, "grad_norm": 0.7737066149711609, "learning_rate": 2.9931972789115647e-05, "loss": 2.2078, "num_input_tokens_seen": 103809024, "step": 264 }, { "epoch": 0.030611919516161604, "grad_norm": 0.9132639169692993, "learning_rate": 3.061224489795919e-05, "loss": 2.2601, "num_input_tokens_seen": 106168320, "step": 270 }, { "epoch": 0.03129218439429853, "grad_norm": 0.7523462772369385, "learning_rate": 3.1292517006802724e-05, "loss": 2.2504, "num_input_tokens_seen": 108527616, "step": 276 }, { "epoch": 0.031972449272435456, "grad_norm": 0.688888430595398, "learning_rate": 3.1972789115646265e-05, "loss": 2.2534, "num_input_tokens_seen": 110886912, "step": 282 }, { "epoch": 0.03265271415057238, "grad_norm": 1.0086206197738647, "learning_rate": 3.265306122448979e-05, "loss": 2.2697, "num_input_tokens_seen": 113246208, "step": 288 }, { "epoch": 0.033332979028709304, "grad_norm": 0.93113112449646, "learning_rate": 3.3333333333333335e-05, "loss": 2.2168, "num_input_tokens_seen": 115605504, "step": 294 }, { "epoch": 0.03401324390684623, "grad_norm": 1.0298339128494263, "learning_rate": 3.401360544217687e-05, "loss": 2.249, "num_input_tokens_seen": 117964800, "step": 300 }, { "epoch": 0.03469350878498315, "grad_norm": 0.807465136051178, "learning_rate": 3.469387755102041e-05, "loss": 2.2063, "num_input_tokens_seen": 120324096, "step": 306 }, { "epoch": 0.03537377366312008, "grad_norm": 0.8339959383010864, "learning_rate": 3.5374149659863946e-05, "loss": 2.2061, "num_input_tokens_seen": 122683392, "step": 312 }, { "epoch": 0.036054038541257, "grad_norm": 0.8316759467124939, "learning_rate": 3.605442176870749e-05, "loss": 2.2576, "num_input_tokens_seen": 125042688, "step": 318 }, { "epoch": 0.036734303419393925, "grad_norm": 0.7208542823791504, "learning_rate": 3.673469387755102e-05, "loss": 2.1918, "num_input_tokens_seen": 127401984, "step": 324 }, { "epoch": 0.03741456829753085, "grad_norm": 0.658276379108429, "learning_rate": 3.7414965986394564e-05, "loss": 2.231, "num_input_tokens_seen": 129761280, "step": 330 }, { "epoch": 0.038094833175667774, "grad_norm": 0.7552313208580017, "learning_rate": 3.809523809523809e-05, "loss": 2.1973, "num_input_tokens_seen": 132120576, "step": 336 }, { "epoch": 0.0387750980538047, "grad_norm": 0.6971672773361206, "learning_rate": 3.8775510204081634e-05, "loss": 2.2358, "num_input_tokens_seen": 134479872, "step": 342 }, { "epoch": 0.03945536293194162, "grad_norm": 0.9760845303535461, "learning_rate": 3.945578231292517e-05, "loss": 2.2392, "num_input_tokens_seen": 136839168, "step": 348 }, { "epoch": 0.040135627810078546, "grad_norm": 0.8042694330215454, "learning_rate": 4.013605442176871e-05, "loss": 2.2047, "num_input_tokens_seen": 139198464, "step": 354 }, { "epoch": 0.04081589268821548, "grad_norm": 0.6926929354667664, "learning_rate": 4.0816326530612245e-05, "loss": 2.2546, "num_input_tokens_seen": 141557760, "step": 360 }, { "epoch": 0.0414961575663524, "grad_norm": 0.8320844173431396, "learning_rate": 4.149659863945579e-05, "loss": 2.1762, "num_input_tokens_seen": 143917056, "step": 366 }, { "epoch": 0.042176422444489325, "grad_norm": 0.9185928106307983, "learning_rate": 4.217687074829932e-05, "loss": 2.2258, "num_input_tokens_seen": 146276352, "step": 372 }, { "epoch": 0.04285668732262625, "grad_norm": 0.9259613156318665, "learning_rate": 4.2857142857142856e-05, "loss": 2.1874, "num_input_tokens_seen": 148635648, "step": 378 }, { "epoch": 0.043536952200763174, "grad_norm": 0.7989060878753662, "learning_rate": 4.35374149659864e-05, "loss": 2.2364, "num_input_tokens_seen": 150994944, "step": 384 }, { "epoch": 0.0442172170789001, "grad_norm": 0.975271999835968, "learning_rate": 4.421768707482993e-05, "loss": 2.2092, "num_input_tokens_seen": 153354240, "step": 390 }, { "epoch": 0.04489748195703702, "grad_norm": 0.6971870064735413, "learning_rate": 4.4897959183673474e-05, "loss": 2.2374, "num_input_tokens_seen": 155713536, "step": 396 }, { "epoch": 0.04535099187579497, "eval_accuracy": 0.5588241758241759, "eval_loss": 2.1870992183685303, "eval_runtime": 128.6022, "eval_samples_per_second": 3.11, "eval_steps_per_second": 1.042, "num_input_tokens_seen": 157286400, "step": 400 }, { "epoch": 0.045577746835173946, "grad_norm": 0.8090885877609253, "learning_rate": 4.557823129251701e-05, "loss": 2.2098, "num_input_tokens_seen": 158072832, "step": 402 }, { "epoch": 0.04625801171331087, "grad_norm": 0.8753838539123535, "learning_rate": 4.625850340136055e-05, "loss": 2.1732, "num_input_tokens_seen": 160432128, "step": 408 }, { "epoch": 0.046938276591447795, "grad_norm": 1.0101414918899536, "learning_rate": 4.6938775510204086e-05, "loss": 2.1742, "num_input_tokens_seen": 162791424, "step": 414 }, { "epoch": 0.04761854146958472, "grad_norm": 0.841810405254364, "learning_rate": 4.761904761904762e-05, "loss": 2.1769, "num_input_tokens_seen": 165150720, "step": 420 }, { "epoch": 0.04829880634772164, "grad_norm": 0.9404070973396301, "learning_rate": 4.8299319727891155e-05, "loss": 2.2274, "num_input_tokens_seen": 167510016, "step": 426 }, { "epoch": 0.04897907122585857, "grad_norm": 0.7818936109542847, "learning_rate": 4.89795918367347e-05, "loss": 2.2116, "num_input_tokens_seen": 169869312, "step": 432 }, { "epoch": 0.04965933610399549, "grad_norm": 0.8758242130279541, "learning_rate": 4.965986394557823e-05, "loss": 2.2065, "num_input_tokens_seen": 172228608, "step": 438 }, { "epoch": 0.050339600982132415, "grad_norm": 0.8778213262557983, "learning_rate": 4.9982098102398855e-05, "loss": 2.1828, "num_input_tokens_seen": 174587904, "step": 444 }, { "epoch": 0.05101986586026934, "grad_norm": 0.8915372490882874, "learning_rate": 4.9946294307196566e-05, "loss": 2.1787, "num_input_tokens_seen": 176947200, "step": 450 }, { "epoch": 0.051700130738406264, "grad_norm": 0.8329364657402039, "learning_rate": 4.991049051199427e-05, "loss": 2.1746, "num_input_tokens_seen": 179306496, "step": 456 }, { "epoch": 0.052380395616543195, "grad_norm": 0.7898052334785461, "learning_rate": 4.987468671679198e-05, "loss": 2.1859, "num_input_tokens_seen": 181665792, "step": 462 }, { "epoch": 0.05306066049468012, "grad_norm": 0.9453828930854797, "learning_rate": 4.9838882921589694e-05, "loss": 2.2036, "num_input_tokens_seen": 184025088, "step": 468 }, { "epoch": 0.05374092537281704, "grad_norm": 0.8436469435691833, "learning_rate": 4.98030791263874e-05, "loss": 2.169, "num_input_tokens_seen": 186384384, "step": 474 }, { "epoch": 0.05442119025095397, "grad_norm": 0.9970703721046448, "learning_rate": 4.976727533118511e-05, "loss": 2.161, "num_input_tokens_seen": 188743680, "step": 480 }, { "epoch": 0.05510145512909089, "grad_norm": 0.7459275722503662, "learning_rate": 4.9731471535982815e-05, "loss": 2.2542, "num_input_tokens_seen": 191102976, "step": 486 }, { "epoch": 0.055781720007227815, "grad_norm": 0.9407626986503601, "learning_rate": 4.969566774078053e-05, "loss": 2.1648, "num_input_tokens_seen": 193462272, "step": 492 }, { "epoch": 0.05646198488536474, "grad_norm": 0.9194992780685425, "learning_rate": 4.965986394557823e-05, "loss": 2.142, "num_input_tokens_seen": 195821568, "step": 498 }, { "epoch": 0.057142249763501664, "grad_norm": 0.819237232208252, "learning_rate": 4.9624060150375936e-05, "loss": 2.2195, "num_input_tokens_seen": 198180864, "step": 504 }, { "epoch": 0.05782251464163859, "grad_norm": 0.8461591005325317, "learning_rate": 4.958825635517365e-05, "loss": 2.1649, "num_input_tokens_seen": 200540160, "step": 510 }, { "epoch": 0.05850277951977551, "grad_norm": 0.8540611267089844, "learning_rate": 4.955245255997136e-05, "loss": 2.1611, "num_input_tokens_seen": 202899456, "step": 516 }, { "epoch": 0.059183044397912436, "grad_norm": 0.767410933971405, "learning_rate": 4.951664876476907e-05, "loss": 2.1524, "num_input_tokens_seen": 205258752, "step": 522 }, { "epoch": 0.05986330927604936, "grad_norm": 1.049315333366394, "learning_rate": 4.9480844969566776e-05, "loss": 2.1582, "num_input_tokens_seen": 207618048, "step": 528 }, { "epoch": 0.060543574154186285, "grad_norm": 0.7773332595825195, "learning_rate": 4.944504117436449e-05, "loss": 2.193, "num_input_tokens_seen": 209977344, "step": 534 }, { "epoch": 0.06122383903232321, "grad_norm": 0.9237553477287292, "learning_rate": 4.940923737916219e-05, "loss": 2.158, "num_input_tokens_seen": 212336640, "step": 540 }, { "epoch": 0.06190410391046013, "grad_norm": 0.836017370223999, "learning_rate": 4.93734335839599e-05, "loss": 2.2233, "num_input_tokens_seen": 214695936, "step": 546 }, { "epoch": 0.06258436878859706, "grad_norm": 0.8589292168617249, "learning_rate": 4.933762978875761e-05, "loss": 2.2209, "num_input_tokens_seen": 217055232, "step": 552 }, { "epoch": 0.06326463366673399, "grad_norm": 0.7112890481948853, "learning_rate": 4.930182599355532e-05, "loss": 2.2024, "num_input_tokens_seen": 219414528, "step": 558 }, { "epoch": 0.06394489854487091, "grad_norm": 0.718296229839325, "learning_rate": 4.926602219835303e-05, "loss": 2.1465, "num_input_tokens_seen": 221773824, "step": 564 }, { "epoch": 0.06462516342300784, "grad_norm": 0.8518996238708496, "learning_rate": 4.9230218403150736e-05, "loss": 2.1673, "num_input_tokens_seen": 224133120, "step": 570 }, { "epoch": 0.06530542830114476, "grad_norm": 0.8261798024177551, "learning_rate": 4.919441460794845e-05, "loss": 2.1991, "num_input_tokens_seen": 226492416, "step": 576 }, { "epoch": 0.06598569317928168, "grad_norm": 0.9106934070587158, "learning_rate": 4.915861081274615e-05, "loss": 2.1448, "num_input_tokens_seen": 228851712, "step": 582 }, { "epoch": 0.06666595805741861, "grad_norm": 0.8303735256195068, "learning_rate": 4.912280701754386e-05, "loss": 2.1883, "num_input_tokens_seen": 231211008, "step": 588 }, { "epoch": 0.06734622293555553, "grad_norm": 0.7179098129272461, "learning_rate": 4.908700322234157e-05, "loss": 2.1744, "num_input_tokens_seen": 233570304, "step": 594 }, { "epoch": 0.06802648781369246, "grad_norm": 0.9411275386810303, "learning_rate": 4.905119942713928e-05, "loss": 2.1269, "num_input_tokens_seen": 235929600, "step": 600 }, { "epoch": 0.06870675269182938, "grad_norm": 0.9154955744743347, "learning_rate": 4.901539563193699e-05, "loss": 2.1807, "num_input_tokens_seen": 238288896, "step": 606 }, { "epoch": 0.0693870175699663, "grad_norm": 0.6920604705810547, "learning_rate": 4.89795918367347e-05, "loss": 2.1427, "num_input_tokens_seen": 240648192, "step": 612 }, { "epoch": 0.07006728244810323, "grad_norm": 0.6742058396339417, "learning_rate": 4.894378804153241e-05, "loss": 2.1443, "num_input_tokens_seen": 243007488, "step": 618 }, { "epoch": 0.07074754732624015, "grad_norm": 0.7783246040344238, "learning_rate": 4.890798424633011e-05, "loss": 2.1613, "num_input_tokens_seen": 245366784, "step": 624 }, { "epoch": 0.07142781220437708, "grad_norm": 0.9674144983291626, "learning_rate": 4.887218045112782e-05, "loss": 2.1811, "num_input_tokens_seen": 247726080, "step": 630 }, { "epoch": 0.072108077082514, "grad_norm": 0.7616820335388184, "learning_rate": 4.883637665592553e-05, "loss": 2.1264, "num_input_tokens_seen": 250085376, "step": 636 }, { "epoch": 0.07278834196065093, "grad_norm": 0.7255170941352844, "learning_rate": 4.8800572860723234e-05, "loss": 2.1, "num_input_tokens_seen": 252444672, "step": 642 }, { "epoch": 0.07346860683878785, "grad_norm": 0.7598791718482971, "learning_rate": 4.8764769065520946e-05, "loss": 2.1683, "num_input_tokens_seen": 254803968, "step": 648 }, { "epoch": 0.07414887171692477, "grad_norm": 0.6518032550811768, "learning_rate": 4.872896527031866e-05, "loss": 2.1476, "num_input_tokens_seen": 257163264, "step": 654 }, { "epoch": 0.0748291365950617, "grad_norm": 0.6598438024520874, "learning_rate": 4.869316147511637e-05, "loss": 2.1574, "num_input_tokens_seen": 259522560, "step": 660 }, { "epoch": 0.07550940147319862, "grad_norm": 0.7716994881629944, "learning_rate": 4.8657357679914074e-05, "loss": 2.1797, "num_input_tokens_seen": 261881856, "step": 666 }, { "epoch": 0.07618966635133555, "grad_norm": 0.6889157295227051, "learning_rate": 4.862155388471178e-05, "loss": 2.126, "num_input_tokens_seen": 264241152, "step": 672 }, { "epoch": 0.07686993122947247, "grad_norm": 0.7756158113479614, "learning_rate": 4.858575008950949e-05, "loss": 2.1199, "num_input_tokens_seen": 266600448, "step": 678 }, { "epoch": 0.0775501961076094, "grad_norm": 0.7093831896781921, "learning_rate": 4.8549946294307195e-05, "loss": 2.1428, "num_input_tokens_seen": 268959744, "step": 684 }, { "epoch": 0.07823046098574632, "grad_norm": 0.786888599395752, "learning_rate": 4.8514142499104906e-05, "loss": 2.2016, "num_input_tokens_seen": 271319040, "step": 690 }, { "epoch": 0.07891072586388324, "grad_norm": 1.1578270196914673, "learning_rate": 4.847833870390262e-05, "loss": 2.1278, "num_input_tokens_seen": 273678336, "step": 696 }, { "epoch": 0.07959099074202017, "grad_norm": 0.8472400307655334, "learning_rate": 4.844253490870033e-05, "loss": 2.1288, "num_input_tokens_seen": 276037632, "step": 702 }, { "epoch": 0.08027125562015709, "grad_norm": 0.708258867263794, "learning_rate": 4.8406731113498034e-05, "loss": 2.1609, "num_input_tokens_seen": 278396928, "step": 708 }, { "epoch": 0.08095152049829403, "grad_norm": 0.9472253918647766, "learning_rate": 4.837092731829574e-05, "loss": 2.1452, "num_input_tokens_seen": 280756224, "step": 714 }, { "epoch": 0.08163178537643095, "grad_norm": 0.7842580080032349, "learning_rate": 4.833512352309345e-05, "loss": 2.1191, "num_input_tokens_seen": 283115520, "step": 720 }, { "epoch": 0.08231205025456788, "grad_norm": 0.6376339793205261, "learning_rate": 4.8299319727891155e-05, "loss": 2.1575, "num_input_tokens_seen": 285474816, "step": 726 }, { "epoch": 0.0829923151327048, "grad_norm": 0.6511639952659607, "learning_rate": 4.826351593268887e-05, "loss": 2.1064, "num_input_tokens_seen": 287834112, "step": 732 }, { "epoch": 0.08367258001084173, "grad_norm": 0.599183976650238, "learning_rate": 4.822771213748657e-05, "loss": 2.1406, "num_input_tokens_seen": 290193408, "step": 738 }, { "epoch": 0.08435284488897865, "grad_norm": 0.6168906092643738, "learning_rate": 4.819190834228429e-05, "loss": 2.1267, "num_input_tokens_seen": 292552704, "step": 744 }, { "epoch": 0.08503310976711558, "grad_norm": 0.7475244998931885, "learning_rate": 4.8156104547081995e-05, "loss": 2.1511, "num_input_tokens_seen": 294912000, "step": 750 }, { "epoch": 0.0857133746452525, "grad_norm": 0.7636436223983765, "learning_rate": 4.81203007518797e-05, "loss": 2.1664, "num_input_tokens_seen": 297271296, "step": 756 }, { "epoch": 0.08639363952338942, "grad_norm": 0.8825888633728027, "learning_rate": 4.808449695667741e-05, "loss": 2.1587, "num_input_tokens_seen": 299630592, "step": 762 }, { "epoch": 0.08707390440152635, "grad_norm": 0.8732916712760925, "learning_rate": 4.8048693161475116e-05, "loss": 2.1548, "num_input_tokens_seen": 301989888, "step": 768 }, { "epoch": 0.08775416927966327, "grad_norm": 0.9067391157150269, "learning_rate": 4.801288936627283e-05, "loss": 2.1383, "num_input_tokens_seen": 304349184, "step": 774 }, { "epoch": 0.0884344341578002, "grad_norm": 0.795757532119751, "learning_rate": 4.797708557107053e-05, "loss": 2.1138, "num_input_tokens_seen": 306708480, "step": 780 }, { "epoch": 0.08911469903593712, "grad_norm": 0.8759602904319763, "learning_rate": 4.7941281775868244e-05, "loss": 2.1481, "num_input_tokens_seen": 309067776, "step": 786 }, { "epoch": 0.08979496391407404, "grad_norm": 0.6154273748397827, "learning_rate": 4.7905477980665955e-05, "loss": 2.1156, "num_input_tokens_seen": 311427072, "step": 792 }, { "epoch": 0.09047522879221097, "grad_norm": 0.6875632405281067, "learning_rate": 4.786967418546366e-05, "loss": 2.143, "num_input_tokens_seen": 313786368, "step": 798 }, { "epoch": 0.09070198375158994, "eval_accuracy": 0.5665372405372405, "eval_loss": 2.1336145401000977, "eval_runtime": 130.3356, "eval_samples_per_second": 3.069, "eval_steps_per_second": 1.028, "num_input_tokens_seen": 314572800, "step": 800 }, { "epoch": 0.09115549367034789, "grad_norm": 0.6513155698776245, "learning_rate": 4.783387039026137e-05, "loss": 2.1974, "num_input_tokens_seen": 316145664, "step": 804 }, { "epoch": 0.09183575854848482, "grad_norm": 0.6210876703262329, "learning_rate": 4.7798066595059076e-05, "loss": 2.1321, "num_input_tokens_seen": 318504960, "step": 810 }, { "epoch": 0.09251602342662174, "grad_norm": 0.6908721327781677, "learning_rate": 4.776226279985679e-05, "loss": 2.1574, "num_input_tokens_seen": 320864256, "step": 816 }, { "epoch": 0.09319628830475866, "grad_norm": 0.8170259594917297, "learning_rate": 4.772645900465449e-05, "loss": 2.0947, "num_input_tokens_seen": 323223552, "step": 822 }, { "epoch": 0.09387655318289559, "grad_norm": 0.7803713083267212, "learning_rate": 4.7690655209452204e-05, "loss": 2.1412, "num_input_tokens_seen": 325582848, "step": 828 }, { "epoch": 0.09455681806103251, "grad_norm": 0.9013774394989014, "learning_rate": 4.7654851414249916e-05, "loss": 2.1511, "num_input_tokens_seen": 327942144, "step": 834 }, { "epoch": 0.09523708293916944, "grad_norm": 0.691776692867279, "learning_rate": 4.761904761904762e-05, "loss": 2.149, "num_input_tokens_seen": 330301440, "step": 840 }, { "epoch": 0.09591734781730636, "grad_norm": 0.7903074622154236, "learning_rate": 4.758324382384533e-05, "loss": 2.1061, "num_input_tokens_seen": 332660736, "step": 846 }, { "epoch": 0.09659761269544329, "grad_norm": 0.7019173502922058, "learning_rate": 4.754744002864304e-05, "loss": 2.1225, "num_input_tokens_seen": 335020032, "step": 852 }, { "epoch": 0.09727787757358021, "grad_norm": 0.7324870824813843, "learning_rate": 4.751163623344075e-05, "loss": 2.0793, "num_input_tokens_seen": 337379328, "step": 858 }, { "epoch": 0.09795814245171713, "grad_norm": 0.6702744960784912, "learning_rate": 4.747583243823845e-05, "loss": 2.1023, "num_input_tokens_seen": 339738624, "step": 864 }, { "epoch": 0.09863840732985406, "grad_norm": 0.7916101217269897, "learning_rate": 4.7440028643036165e-05, "loss": 2.1885, "num_input_tokens_seen": 342097920, "step": 870 }, { "epoch": 0.09931867220799098, "grad_norm": 0.7214677929878235, "learning_rate": 4.740422484783387e-05, "loss": 2.2039, "num_input_tokens_seen": 344457216, "step": 876 }, { "epoch": 0.0999989370861279, "grad_norm": 0.9506244659423828, "learning_rate": 4.736842105263158e-05, "loss": 2.1401, "num_input_tokens_seen": 346816512, "step": 882 }, { "epoch": 0.10067920196426483, "grad_norm": 0.8334141969680786, "learning_rate": 4.733261725742929e-05, "loss": 2.1155, "num_input_tokens_seen": 349175808, "step": 888 }, { "epoch": 0.10135946684240175, "grad_norm": 0.8239167928695679, "learning_rate": 4.7296813462227e-05, "loss": 2.1143, "num_input_tokens_seen": 351535104, "step": 894 }, { "epoch": 0.10203973172053868, "grad_norm": 0.6935220956802368, "learning_rate": 4.726100966702471e-05, "loss": 2.1481, "num_input_tokens_seen": 353894400, "step": 900 }, { "epoch": 0.1027199965986756, "grad_norm": 0.6344029307365417, "learning_rate": 4.7225205871822413e-05, "loss": 2.1337, "num_input_tokens_seen": 356253696, "step": 906 }, { "epoch": 0.10340026147681253, "grad_norm": 0.9699512720108032, "learning_rate": 4.7189402076620125e-05, "loss": 2.1522, "num_input_tokens_seen": 358612992, "step": 912 }, { "epoch": 0.10408052635494947, "grad_norm": 0.6322435736656189, "learning_rate": 4.715359828141783e-05, "loss": 2.1133, "num_input_tokens_seen": 360972288, "step": 918 }, { "epoch": 0.10476079123308639, "grad_norm": 0.755022406578064, "learning_rate": 4.711779448621554e-05, "loss": 2.1191, "num_input_tokens_seen": 363331584, "step": 924 }, { "epoch": 0.10544105611122331, "grad_norm": 0.6669276356697083, "learning_rate": 4.708199069101325e-05, "loss": 2.1255, "num_input_tokens_seen": 365690880, "step": 930 }, { "epoch": 0.10612132098936024, "grad_norm": 0.7509146928787231, "learning_rate": 4.704618689581096e-05, "loss": 2.1711, "num_input_tokens_seen": 368050176, "step": 936 }, { "epoch": 0.10680158586749716, "grad_norm": 0.8903239369392395, "learning_rate": 4.701038310060867e-05, "loss": 2.1675, "num_input_tokens_seen": 370409472, "step": 942 }, { "epoch": 0.10748185074563409, "grad_norm": 0.6709368824958801, "learning_rate": 4.6974579305406374e-05, "loss": 2.1266, "num_input_tokens_seen": 372768768, "step": 948 }, { "epoch": 0.10816211562377101, "grad_norm": 0.6461692452430725, "learning_rate": 4.6938775510204086e-05, "loss": 2.1098, "num_input_tokens_seen": 375128064, "step": 954 }, { "epoch": 0.10884238050190793, "grad_norm": 0.8384061455726624, "learning_rate": 4.690297171500179e-05, "loss": 2.1397, "num_input_tokens_seen": 377487360, "step": 960 }, { "epoch": 0.10952264538004486, "grad_norm": 0.6946293115615845, "learning_rate": 4.6867167919799495e-05, "loss": 2.1413, "num_input_tokens_seen": 379846656, "step": 966 }, { "epoch": 0.11020291025818178, "grad_norm": 0.7992385625839233, "learning_rate": 4.6831364124597213e-05, "loss": 2.1188, "num_input_tokens_seen": 382205952, "step": 972 }, { "epoch": 0.1108831751363187, "grad_norm": 0.6177113056182861, "learning_rate": 4.679556032939492e-05, "loss": 2.1798, "num_input_tokens_seen": 384565248, "step": 978 }, { "epoch": 0.11156344001445563, "grad_norm": 0.6821500062942505, "learning_rate": 4.675975653419263e-05, "loss": 2.1062, "num_input_tokens_seen": 386924544, "step": 984 }, { "epoch": 0.11224370489259256, "grad_norm": 0.6839202642440796, "learning_rate": 4.6723952738990334e-05, "loss": 2.1472, "num_input_tokens_seen": 389283840, "step": 990 }, { "epoch": 0.11292396977072948, "grad_norm": 0.5608601570129395, "learning_rate": 4.6688148943788046e-05, "loss": 2.0845, "num_input_tokens_seen": 391643136, "step": 996 }, { "epoch": 0.1136042346488664, "grad_norm": 0.7359477877616882, "learning_rate": 4.665234514858575e-05, "loss": 2.1426, "num_input_tokens_seen": 394002432, "step": 1002 }, { "epoch": 0.11428449952700333, "grad_norm": 0.6624149084091187, "learning_rate": 4.6616541353383456e-05, "loss": 2.1188, "num_input_tokens_seen": 396361728, "step": 1008 }, { "epoch": 0.11496476440514025, "grad_norm": 0.6561130285263062, "learning_rate": 4.658073755818117e-05, "loss": 2.1141, "num_input_tokens_seen": 398721024, "step": 1014 }, { "epoch": 0.11564502928327718, "grad_norm": 0.76801598072052, "learning_rate": 4.654493376297888e-05, "loss": 2.1074, "num_input_tokens_seen": 401080320, "step": 1020 }, { "epoch": 0.1163252941614141, "grad_norm": 0.7016099095344543, "learning_rate": 4.650912996777659e-05, "loss": 2.116, "num_input_tokens_seen": 403439616, "step": 1026 }, { "epoch": 0.11700555903955102, "grad_norm": 0.7845112681388855, "learning_rate": 4.6473326172574295e-05, "loss": 2.0639, "num_input_tokens_seen": 405798912, "step": 1032 }, { "epoch": 0.11768582391768795, "grad_norm": 0.7502654194831848, "learning_rate": 4.6437522377372007e-05, "loss": 2.0549, "num_input_tokens_seen": 408158208, "step": 1038 }, { "epoch": 0.11836608879582487, "grad_norm": 0.8195413947105408, "learning_rate": 4.640171858216971e-05, "loss": 2.1463, "num_input_tokens_seen": 410517504, "step": 1044 }, { "epoch": 0.1190463536739618, "grad_norm": 0.6572406888008118, "learning_rate": 4.6365914786967416e-05, "loss": 2.1043, "num_input_tokens_seen": 412876800, "step": 1050 }, { "epoch": 0.11972661855209872, "grad_norm": 0.6787090301513672, "learning_rate": 4.633011099176513e-05, "loss": 2.1287, "num_input_tokens_seen": 415236096, "step": 1056 }, { "epoch": 0.12040688343023564, "grad_norm": 0.6975082755088806, "learning_rate": 4.629430719656284e-05, "loss": 2.1439, "num_input_tokens_seen": 417595392, "step": 1062 }, { "epoch": 0.12108714830837257, "grad_norm": 1.1566354036331177, "learning_rate": 4.625850340136055e-05, "loss": 2.1331, "num_input_tokens_seen": 419954688, "step": 1068 }, { "epoch": 0.1217674131865095, "grad_norm": 0.804976224899292, "learning_rate": 4.6222699606158255e-05, "loss": 2.1842, "num_input_tokens_seen": 422313984, "step": 1074 }, { "epoch": 0.12244767806464642, "grad_norm": 0.7782629728317261, "learning_rate": 4.618689581095597e-05, "loss": 2.1258, "num_input_tokens_seen": 424673280, "step": 1080 }, { "epoch": 0.12312794294278334, "grad_norm": 0.7431383728981018, "learning_rate": 4.615109201575367e-05, "loss": 2.0759, "num_input_tokens_seen": 427032576, "step": 1086 }, { "epoch": 0.12380820782092027, "grad_norm": 0.6219275593757629, "learning_rate": 4.6115288220551377e-05, "loss": 2.1158, "num_input_tokens_seen": 429391872, "step": 1092 }, { "epoch": 0.12448847269905719, "grad_norm": 0.7471150755882263, "learning_rate": 4.607948442534909e-05, "loss": 2.1602, "num_input_tokens_seen": 431751168, "step": 1098 }, { "epoch": 0.12516873757719413, "grad_norm": 0.788198709487915, "learning_rate": 4.604368063014679e-05, "loss": 2.1178, "num_input_tokens_seen": 434110464, "step": 1104 }, { "epoch": 0.12584900245533104, "grad_norm": 0.7239183187484741, "learning_rate": 4.6007876834944504e-05, "loss": 2.1157, "num_input_tokens_seen": 436469760, "step": 1110 }, { "epoch": 0.12652926733346798, "grad_norm": 0.6211867332458496, "learning_rate": 4.5972073039742216e-05, "loss": 2.1705, "num_input_tokens_seen": 438829056, "step": 1116 }, { "epoch": 0.1272095322116049, "grad_norm": 0.7338197231292725, "learning_rate": 4.593626924453993e-05, "loss": 2.1271, "num_input_tokens_seen": 441188352, "step": 1122 }, { "epoch": 0.12788979708974182, "grad_norm": 0.7123642563819885, "learning_rate": 4.590046544933763e-05, "loss": 2.0573, "num_input_tokens_seen": 443547648, "step": 1128 }, { "epoch": 0.12857006196787873, "grad_norm": 0.648991048336029, "learning_rate": 4.586466165413534e-05, "loss": 2.1645, "num_input_tokens_seen": 445906944, "step": 1134 }, { "epoch": 0.12925032684601567, "grad_norm": 0.727215051651001, "learning_rate": 4.582885785893305e-05, "loss": 2.1009, "num_input_tokens_seen": 448266240, "step": 1140 }, { "epoch": 0.12993059172415258, "grad_norm": 0.7532079219818115, "learning_rate": 4.5793054063730753e-05, "loss": 2.1067, "num_input_tokens_seen": 450625536, "step": 1146 }, { "epoch": 0.13061085660228952, "grad_norm": 0.7537828683853149, "learning_rate": 4.5757250268528465e-05, "loss": 2.1201, "num_input_tokens_seen": 452984832, "step": 1152 }, { "epoch": 0.13129112148042643, "grad_norm": 0.7720354795455933, "learning_rate": 4.5721446473326176e-05, "loss": 2.1422, "num_input_tokens_seen": 455344128, "step": 1158 }, { "epoch": 0.13197138635856337, "grad_norm": 0.9617815613746643, "learning_rate": 4.568564267812389e-05, "loss": 2.1267, "num_input_tokens_seen": 457703424, "step": 1164 }, { "epoch": 0.13265165123670028, "grad_norm": 0.6180392503738403, "learning_rate": 4.564983888292159e-05, "loss": 2.1077, "num_input_tokens_seen": 460062720, "step": 1170 }, { "epoch": 0.13333191611483722, "grad_norm": 0.6402847170829773, "learning_rate": 4.56140350877193e-05, "loss": 2.084, "num_input_tokens_seen": 462422016, "step": 1176 }, { "epoch": 0.13401218099297413, "grad_norm": 0.727862536907196, "learning_rate": 4.557823129251701e-05, "loss": 2.1177, "num_input_tokens_seen": 464781312, "step": 1182 }, { "epoch": 0.13469244587111107, "grad_norm": 0.6989423036575317, "learning_rate": 4.5542427497314714e-05, "loss": 2.1041, "num_input_tokens_seen": 467140608, "step": 1188 }, { "epoch": 0.13537271074924798, "grad_norm": 0.8146799206733704, "learning_rate": 4.5506623702112425e-05, "loss": 2.0976, "num_input_tokens_seen": 469499904, "step": 1194 }, { "epoch": 0.13605297562738491, "grad_norm": 0.7785530686378479, "learning_rate": 4.547081990691014e-05, "loss": 2.1272, "num_input_tokens_seen": 471859200, "step": 1200 }, { "epoch": 0.13605297562738491, "eval_accuracy": 0.5698174603174603, "eval_loss": 2.109198808670044, "eval_runtime": 129.4137, "eval_samples_per_second": 3.091, "eval_steps_per_second": 1.035, "num_input_tokens_seen": 471859200, "step": 1200 }, { "epoch": 0.13673324050552182, "grad_norm": 0.6469578742980957, "learning_rate": 4.543501611170785e-05, "loss": 2.1522, "num_input_tokens_seen": 474218496, "step": 1206 }, { "epoch": 0.13741350538365876, "grad_norm": 1.0499253273010254, "learning_rate": 4.539921231650555e-05, "loss": 2.1576, "num_input_tokens_seen": 476577792, "step": 1212 }, { "epoch": 0.13809377026179567, "grad_norm": 0.6888744235038757, "learning_rate": 4.536340852130326e-05, "loss": 2.1386, "num_input_tokens_seen": 478937088, "step": 1218 }, { "epoch": 0.1387740351399326, "grad_norm": 0.6668254733085632, "learning_rate": 4.532760472610097e-05, "loss": 2.1211, "num_input_tokens_seen": 481296384, "step": 1224 }, { "epoch": 0.13945430001806955, "grad_norm": 0.5561350584030151, "learning_rate": 4.5291800930898674e-05, "loss": 2.056, "num_input_tokens_seen": 483655680, "step": 1230 }, { "epoch": 0.14013456489620646, "grad_norm": 0.6395593285560608, "learning_rate": 4.5255997135696386e-05, "loss": 2.1032, "num_input_tokens_seen": 486014976, "step": 1236 }, { "epoch": 0.1408148297743434, "grad_norm": 0.5906882882118225, "learning_rate": 4.522019334049409e-05, "loss": 2.1373, "num_input_tokens_seen": 488374272, "step": 1242 }, { "epoch": 0.1414950946524803, "grad_norm": 0.776069700717926, "learning_rate": 4.51843895452918e-05, "loss": 2.1563, "num_input_tokens_seen": 490733568, "step": 1248 }, { "epoch": 0.14217535953061725, "grad_norm": 0.6770499348640442, "learning_rate": 4.5148585750089514e-05, "loss": 2.1556, "num_input_tokens_seen": 493092864, "step": 1254 }, { "epoch": 0.14285562440875416, "grad_norm": 0.6341859698295593, "learning_rate": 4.511278195488722e-05, "loss": 2.16, "num_input_tokens_seen": 495452160, "step": 1260 }, { "epoch": 0.1435358892868911, "grad_norm": 0.6789543032646179, "learning_rate": 4.507697815968493e-05, "loss": 2.1314, "num_input_tokens_seen": 497811456, "step": 1266 }, { "epoch": 0.144216154165028, "grad_norm": 0.5745943784713745, "learning_rate": 4.5041174364482635e-05, "loss": 2.0845, "num_input_tokens_seen": 500170752, "step": 1272 }, { "epoch": 0.14489641904316494, "grad_norm": 0.6102567315101624, "learning_rate": 4.5005370569280346e-05, "loss": 2.0663, "num_input_tokens_seen": 502530048, "step": 1278 }, { "epoch": 0.14557668392130185, "grad_norm": 0.5677859783172607, "learning_rate": 4.496956677407805e-05, "loss": 2.1446, "num_input_tokens_seen": 504889344, "step": 1284 }, { "epoch": 0.1462569487994388, "grad_norm": 0.7098356485366821, "learning_rate": 4.493376297887576e-05, "loss": 2.076, "num_input_tokens_seen": 507248640, "step": 1290 }, { "epoch": 0.1469372136775757, "grad_norm": 0.7611458897590637, "learning_rate": 4.4897959183673474e-05, "loss": 2.0869, "num_input_tokens_seen": 509607936, "step": 1296 }, { "epoch": 0.14761747855571264, "grad_norm": 0.7817174196243286, "learning_rate": 4.486215538847118e-05, "loss": 2.1251, "num_input_tokens_seen": 511967232, "step": 1302 }, { "epoch": 0.14829774343384955, "grad_norm": 0.7138420343399048, "learning_rate": 4.482635159326889e-05, "loss": 2.1547, "num_input_tokens_seen": 514326528, "step": 1308 }, { "epoch": 0.1489780083119865, "grad_norm": 0.6586819887161255, "learning_rate": 4.4790547798066595e-05, "loss": 2.0944, "num_input_tokens_seen": 516685824, "step": 1314 }, { "epoch": 0.1496582731901234, "grad_norm": 0.7534651160240173, "learning_rate": 4.475474400286431e-05, "loss": 2.1, "num_input_tokens_seen": 519045120, "step": 1320 }, { "epoch": 0.15033853806826034, "grad_norm": 0.677528440952301, "learning_rate": 4.471894020766201e-05, "loss": 2.096, "num_input_tokens_seen": 521404416, "step": 1326 }, { "epoch": 0.15101880294639725, "grad_norm": 0.5919771790504456, "learning_rate": 4.468313641245972e-05, "loss": 2.1184, "num_input_tokens_seen": 523763712, "step": 1332 }, { "epoch": 0.15169906782453418, "grad_norm": 0.6883030533790588, "learning_rate": 4.464733261725743e-05, "loss": 2.118, "num_input_tokens_seen": 526123008, "step": 1338 }, { "epoch": 0.1523793327026711, "grad_norm": 0.7062236666679382, "learning_rate": 4.461152882205514e-05, "loss": 2.1439, "num_input_tokens_seen": 528482304, "step": 1344 }, { "epoch": 0.15305959758080803, "grad_norm": 0.6822494268417358, "learning_rate": 4.457572502685285e-05, "loss": 2.1023, "num_input_tokens_seen": 530841600, "step": 1350 }, { "epoch": 0.15373986245894494, "grad_norm": 0.6365748047828674, "learning_rate": 4.4539921231650556e-05, "loss": 2.0743, "num_input_tokens_seen": 533200896, "step": 1356 }, { "epoch": 0.15442012733708188, "grad_norm": 0.6446681022644043, "learning_rate": 4.450411743644827e-05, "loss": 2.0933, "num_input_tokens_seen": 535560192, "step": 1362 }, { "epoch": 0.1551003922152188, "grad_norm": 0.6867052912712097, "learning_rate": 4.446831364124597e-05, "loss": 2.1392, "num_input_tokens_seen": 537919488, "step": 1368 }, { "epoch": 0.15578065709335573, "grad_norm": 0.6548677086830139, "learning_rate": 4.4432509846043684e-05, "loss": 2.1549, "num_input_tokens_seen": 540278784, "step": 1374 }, { "epoch": 0.15646092197149264, "grad_norm": 0.660763144493103, "learning_rate": 4.439670605084139e-05, "loss": 2.111, "num_input_tokens_seen": 542638080, "step": 1380 }, { "epoch": 0.15714118684962958, "grad_norm": 0.7522821426391602, "learning_rate": 4.43609022556391e-05, "loss": 2.0979, "num_input_tokens_seen": 544997376, "step": 1386 }, { "epoch": 0.1578214517277665, "grad_norm": 0.7142075896263123, "learning_rate": 4.432509846043681e-05, "loss": 2.0947, "num_input_tokens_seen": 547356672, "step": 1392 }, { "epoch": 0.15850171660590343, "grad_norm": 0.684587836265564, "learning_rate": 4.4289294665234516e-05, "loss": 2.1263, "num_input_tokens_seen": 549715968, "step": 1398 }, { "epoch": 0.15918198148404034, "grad_norm": 0.62205970287323, "learning_rate": 4.425349087003223e-05, "loss": 2.0615, "num_input_tokens_seen": 552075264, "step": 1404 }, { "epoch": 0.15986224636217727, "grad_norm": 0.6591596603393555, "learning_rate": 4.421768707482993e-05, "loss": 2.1085, "num_input_tokens_seen": 554434560, "step": 1410 }, { "epoch": 0.16054251124031418, "grad_norm": 0.6393697261810303, "learning_rate": 4.4181883279627644e-05, "loss": 2.1273, "num_input_tokens_seen": 556793856, "step": 1416 }, { "epoch": 0.16122277611845112, "grad_norm": 0.7339461445808411, "learning_rate": 4.414607948442535e-05, "loss": 2.078, "num_input_tokens_seen": 559153152, "step": 1422 }, { "epoch": 0.16190304099658806, "grad_norm": 0.5903000235557556, "learning_rate": 4.411027568922306e-05, "loss": 2.0781, "num_input_tokens_seen": 561512448, "step": 1428 }, { "epoch": 0.16258330587472497, "grad_norm": 0.5981512069702148, "learning_rate": 4.407447189402077e-05, "loss": 2.0697, "num_input_tokens_seen": 563871744, "step": 1434 }, { "epoch": 0.1632635707528619, "grad_norm": 0.7107566595077515, "learning_rate": 4.403866809881848e-05, "loss": 2.1106, "num_input_tokens_seen": 566231040, "step": 1440 }, { "epoch": 0.16394383563099882, "grad_norm": 0.66408371925354, "learning_rate": 4.400286430361619e-05, "loss": 2.178, "num_input_tokens_seen": 568590336, "step": 1446 }, { "epoch": 0.16462410050913576, "grad_norm": 0.7157317399978638, "learning_rate": 4.396706050841389e-05, "loss": 2.0805, "num_input_tokens_seen": 570949632, "step": 1452 }, { "epoch": 0.16530436538727267, "grad_norm": 0.5517193078994751, "learning_rate": 4.3931256713211605e-05, "loss": 2.0886, "num_input_tokens_seen": 573308928, "step": 1458 }, { "epoch": 0.1659846302654096, "grad_norm": 0.6534057259559631, "learning_rate": 4.389545291800931e-05, "loss": 2.0757, "num_input_tokens_seen": 575668224, "step": 1464 }, { "epoch": 0.16666489514354652, "grad_norm": 0.6548903584480286, "learning_rate": 4.3859649122807014e-05, "loss": 2.061, "num_input_tokens_seen": 578027520, "step": 1470 }, { "epoch": 0.16734516002168345, "grad_norm": 0.7592008113861084, "learning_rate": 4.3823845327604726e-05, "loss": 2.1042, "num_input_tokens_seen": 580386816, "step": 1476 }, { "epoch": 0.16802542489982036, "grad_norm": 0.6569022536277771, "learning_rate": 4.378804153240244e-05, "loss": 2.1117, "num_input_tokens_seen": 582746112, "step": 1482 }, { "epoch": 0.1687056897779573, "grad_norm": 0.666001558303833, "learning_rate": 4.375223773720015e-05, "loss": 2.1153, "num_input_tokens_seen": 585105408, "step": 1488 }, { "epoch": 0.1693859546560942, "grad_norm": 0.726992666721344, "learning_rate": 4.3716433941997854e-05, "loss": 2.1045, "num_input_tokens_seen": 587464704, "step": 1494 }, { "epoch": 0.17006621953423115, "grad_norm": 0.7073400020599365, "learning_rate": 4.3680630146795565e-05, "loss": 2.107, "num_input_tokens_seen": 589824000, "step": 1500 }, { "epoch": 0.17074648441236806, "grad_norm": 0.7716240286827087, "learning_rate": 4.364482635159327e-05, "loss": 2.0693, "num_input_tokens_seen": 592183296, "step": 1506 }, { "epoch": 0.171426749290505, "grad_norm": 0.6214017271995544, "learning_rate": 4.3609022556390975e-05, "loss": 2.0959, "num_input_tokens_seen": 594542592, "step": 1512 }, { "epoch": 0.1721070141686419, "grad_norm": 0.6559828519821167, "learning_rate": 4.3573218761188686e-05, "loss": 2.0871, "num_input_tokens_seen": 596901888, "step": 1518 }, { "epoch": 0.17278727904677885, "grad_norm": 0.5939403176307678, "learning_rate": 4.35374149659864e-05, "loss": 2.09, "num_input_tokens_seen": 599261184, "step": 1524 }, { "epoch": 0.17346754392491576, "grad_norm": 0.680909276008606, "learning_rate": 4.350161117078411e-05, "loss": 2.0679, "num_input_tokens_seen": 601620480, "step": 1530 }, { "epoch": 0.1741478088030527, "grad_norm": 0.6251941919326782, "learning_rate": 4.3465807375581814e-05, "loss": 2.0517, "num_input_tokens_seen": 603979776, "step": 1536 }, { "epoch": 0.1748280736811896, "grad_norm": 0.7877122759819031, "learning_rate": 4.3430003580379526e-05, "loss": 2.1092, "num_input_tokens_seen": 606339072, "step": 1542 }, { "epoch": 0.17550833855932654, "grad_norm": 0.6591320037841797, "learning_rate": 4.339419978517723e-05, "loss": 2.1308, "num_input_tokens_seen": 608698368, "step": 1548 }, { "epoch": 0.17618860343746345, "grad_norm": 0.6894817352294922, "learning_rate": 4.3358395989974935e-05, "loss": 2.0316, "num_input_tokens_seen": 611057664, "step": 1554 }, { "epoch": 0.1768688683156004, "grad_norm": 0.6120206713676453, "learning_rate": 4.332259219477265e-05, "loss": 2.1528, "num_input_tokens_seen": 613416960, "step": 1560 }, { "epoch": 0.1775491331937373, "grad_norm": 0.6873424649238586, "learning_rate": 4.328678839957035e-05, "loss": 2.12, "num_input_tokens_seen": 615776256, "step": 1566 }, { "epoch": 0.17822939807187424, "grad_norm": 0.6133244037628174, "learning_rate": 4.325098460436807e-05, "loss": 2.104, "num_input_tokens_seen": 618135552, "step": 1572 }, { "epoch": 0.17890966295001115, "grad_norm": 0.5738610029220581, "learning_rate": 4.3215180809165775e-05, "loss": 2.0827, "num_input_tokens_seen": 620494848, "step": 1578 }, { "epoch": 0.1795899278281481, "grad_norm": 0.6590917706489563, "learning_rate": 4.3179377013963486e-05, "loss": 2.1426, "num_input_tokens_seen": 622854144, "step": 1584 }, { "epoch": 0.180270192706285, "grad_norm": 0.7815598249435425, "learning_rate": 4.314357321876119e-05, "loss": 2.0688, "num_input_tokens_seen": 625213440, "step": 1590 }, { "epoch": 0.18095045758442194, "grad_norm": 0.6743267774581909, "learning_rate": 4.3107769423558896e-05, "loss": 2.1243, "num_input_tokens_seen": 627572736, "step": 1596 }, { "epoch": 0.18140396750317989, "eval_accuracy": 0.5724584859584859, "eval_loss": 2.0929105281829834, "eval_runtime": 128.5597, "eval_samples_per_second": 3.111, "eval_steps_per_second": 1.042, "num_input_tokens_seen": 629145600, "step": 1600 }, { "epoch": 0.18163072246255885, "grad_norm": 0.7218141555786133, "learning_rate": 4.307196562835661e-05, "loss": 2.1026, "num_input_tokens_seen": 629932032, "step": 1602 }, { "epoch": 0.18231098734069578, "grad_norm": 0.5616850852966309, "learning_rate": 4.303616183315431e-05, "loss": 2.0563, "num_input_tokens_seen": 632291328, "step": 1608 }, { "epoch": 0.1829912522188327, "grad_norm": 0.8092398643493652, "learning_rate": 4.3000358037952024e-05, "loss": 2.1115, "num_input_tokens_seen": 634650624, "step": 1614 }, { "epoch": 0.18367151709696963, "grad_norm": 0.8262616395950317, "learning_rate": 4.2964554242749735e-05, "loss": 2.1083, "num_input_tokens_seen": 637009920, "step": 1620 }, { "epoch": 0.18435178197510654, "grad_norm": 0.6983737349510193, "learning_rate": 4.292875044754745e-05, "loss": 2.0547, "num_input_tokens_seen": 639369216, "step": 1626 }, { "epoch": 0.18503204685324348, "grad_norm": 0.7725507616996765, "learning_rate": 4.289294665234515e-05, "loss": 2.0215, "num_input_tokens_seen": 641728512, "step": 1632 }, { "epoch": 0.18571231173138042, "grad_norm": 0.6409133672714233, "learning_rate": 4.2857142857142856e-05, "loss": 2.131, "num_input_tokens_seen": 644087808, "step": 1638 }, { "epoch": 0.18639257660951733, "grad_norm": 0.7413092851638794, "learning_rate": 4.282133906194057e-05, "loss": 2.1055, "num_input_tokens_seen": 646447104, "step": 1644 }, { "epoch": 0.18707284148765427, "grad_norm": 0.6062273383140564, "learning_rate": 4.278553526673827e-05, "loss": 2.1227, "num_input_tokens_seen": 648806400, "step": 1650 }, { "epoch": 0.18775310636579118, "grad_norm": 0.5928088426589966, "learning_rate": 4.2749731471535984e-05, "loss": 2.0063, "num_input_tokens_seen": 651165696, "step": 1656 }, { "epoch": 0.18843337124392812, "grad_norm": 0.7456128001213074, "learning_rate": 4.2713927676333696e-05, "loss": 2.1229, "num_input_tokens_seen": 653524992, "step": 1662 }, { "epoch": 0.18911363612206503, "grad_norm": 0.634148359298706, "learning_rate": 4.267812388113141e-05, "loss": 2.0924, "num_input_tokens_seen": 655884288, "step": 1668 }, { "epoch": 0.18979390100020196, "grad_norm": 0.5960593223571777, "learning_rate": 4.264232008592911e-05, "loss": 2.0759, "num_input_tokens_seen": 658243584, "step": 1674 }, { "epoch": 0.19047416587833887, "grad_norm": 0.5249684453010559, "learning_rate": 4.260651629072682e-05, "loss": 2.1217, "num_input_tokens_seen": 660602880, "step": 1680 }, { "epoch": 0.1911544307564758, "grad_norm": 0.6345716118812561, "learning_rate": 4.257071249552453e-05, "loss": 2.0945, "num_input_tokens_seen": 662962176, "step": 1686 }, { "epoch": 0.19183469563461272, "grad_norm": 0.6189055442810059, "learning_rate": 4.253490870032223e-05, "loss": 2.1206, "num_input_tokens_seen": 665321472, "step": 1692 }, { "epoch": 0.19251496051274966, "grad_norm": 0.6294938325881958, "learning_rate": 4.2499104905119945e-05, "loss": 2.0698, "num_input_tokens_seen": 667680768, "step": 1698 }, { "epoch": 0.19319522539088657, "grad_norm": 0.6409788131713867, "learning_rate": 4.246330110991765e-05, "loss": 2.0606, "num_input_tokens_seen": 670040064, "step": 1704 }, { "epoch": 0.1938754902690235, "grad_norm": 0.7358625531196594, "learning_rate": 4.242749731471536e-05, "loss": 2.1144, "num_input_tokens_seen": 672399360, "step": 1710 }, { "epoch": 0.19455575514716042, "grad_norm": 0.6153339743614197, "learning_rate": 4.239169351951307e-05, "loss": 2.119, "num_input_tokens_seen": 674758656, "step": 1716 }, { "epoch": 0.19523602002529736, "grad_norm": 0.6902744174003601, "learning_rate": 4.235588972431078e-05, "loss": 2.0636, "num_input_tokens_seen": 677117952, "step": 1722 }, { "epoch": 0.19591628490343427, "grad_norm": 0.7145854830741882, "learning_rate": 4.232008592910849e-05, "loss": 2.0937, "num_input_tokens_seen": 679477248, "step": 1728 }, { "epoch": 0.1965965497815712, "grad_norm": 0.7076539397239685, "learning_rate": 4.2284282133906194e-05, "loss": 2.1325, "num_input_tokens_seen": 681836544, "step": 1734 }, { "epoch": 0.19727681465970812, "grad_norm": 0.6653849482536316, "learning_rate": 4.2248478338703905e-05, "loss": 2.1266, "num_input_tokens_seen": 684195840, "step": 1740 }, { "epoch": 0.19795707953784505, "grad_norm": 0.7376857399940491, "learning_rate": 4.221267454350161e-05, "loss": 2.1273, "num_input_tokens_seen": 686555136, "step": 1746 }, { "epoch": 0.19863734441598196, "grad_norm": 0.6721606254577637, "learning_rate": 4.217687074829932e-05, "loss": 2.0921, "num_input_tokens_seen": 688914432, "step": 1752 }, { "epoch": 0.1993176092941189, "grad_norm": 0.8767059445381165, "learning_rate": 4.214106695309703e-05, "loss": 2.1027, "num_input_tokens_seen": 691273728, "step": 1758 }, { "epoch": 0.1999978741722558, "grad_norm": 0.6245223879814148, "learning_rate": 4.210526315789474e-05, "loss": 2.083, "num_input_tokens_seen": 693633024, "step": 1764 }, { "epoch": 0.20067813905039275, "grad_norm": 0.6684584021568298, "learning_rate": 4.206945936269245e-05, "loss": 2.0892, "num_input_tokens_seen": 695992320, "step": 1770 }, { "epoch": 0.20135840392852966, "grad_norm": 0.676654577255249, "learning_rate": 4.2033655567490154e-05, "loss": 2.1068, "num_input_tokens_seen": 698351616, "step": 1776 }, { "epoch": 0.2020386688066666, "grad_norm": 0.5355855226516724, "learning_rate": 4.1997851772287866e-05, "loss": 2.0531, "num_input_tokens_seen": 700710912, "step": 1782 }, { "epoch": 0.2027189336848035, "grad_norm": 0.6555802226066589, "learning_rate": 4.196204797708557e-05, "loss": 2.0901, "num_input_tokens_seen": 703070208, "step": 1788 }, { "epoch": 0.20339919856294045, "grad_norm": 0.6298051476478577, "learning_rate": 4.192624418188328e-05, "loss": 2.0877, "num_input_tokens_seen": 705429504, "step": 1794 }, { "epoch": 0.20407946344107736, "grad_norm": 0.720937192440033, "learning_rate": 4.1890440386680994e-05, "loss": 2.108, "num_input_tokens_seen": 707788800, "step": 1800 }, { "epoch": 0.2047597283192143, "grad_norm": 0.7328784465789795, "learning_rate": 4.18546365914787e-05, "loss": 2.0899, "num_input_tokens_seen": 710148096, "step": 1806 }, { "epoch": 0.2054399931973512, "grad_norm": 0.6752752065658569, "learning_rate": 4.181883279627641e-05, "loss": 2.1424, "num_input_tokens_seen": 712507392, "step": 1812 }, { "epoch": 0.20612025807548814, "grad_norm": 0.7588052153587341, "learning_rate": 4.1783029001074115e-05, "loss": 2.134, "num_input_tokens_seen": 714866688, "step": 1818 }, { "epoch": 0.20680052295362505, "grad_norm": 0.6495354771614075, "learning_rate": 4.1747225205871826e-05, "loss": 2.1091, "num_input_tokens_seen": 717225984, "step": 1824 }, { "epoch": 0.207480787831762, "grad_norm": 0.6155287623405457, "learning_rate": 4.171142141066953e-05, "loss": 2.1057, "num_input_tokens_seen": 719585280, "step": 1830 }, { "epoch": 0.20816105270989893, "grad_norm": 0.5426910519599915, "learning_rate": 4.167561761546724e-05, "loss": 2.0465, "num_input_tokens_seen": 721944576, "step": 1836 }, { "epoch": 0.20884131758803584, "grad_norm": 0.6535930633544922, "learning_rate": 4.163981382026495e-05, "loss": 2.046, "num_input_tokens_seen": 724303872, "step": 1842 }, { "epoch": 0.20952158246617278, "grad_norm": 0.639935314655304, "learning_rate": 4.160401002506266e-05, "loss": 2.1035, "num_input_tokens_seen": 726663168, "step": 1848 }, { "epoch": 0.2102018473443097, "grad_norm": 0.5828704833984375, "learning_rate": 4.156820622986037e-05, "loss": 2.0643, "num_input_tokens_seen": 729022464, "step": 1854 }, { "epoch": 0.21088211222244663, "grad_norm": 0.579765796661377, "learning_rate": 4.1532402434658075e-05, "loss": 2.1099, "num_input_tokens_seen": 731381760, "step": 1860 }, { "epoch": 0.21156237710058354, "grad_norm": 0.6833761930465698, "learning_rate": 4.149659863945579e-05, "loss": 2.0967, "num_input_tokens_seen": 733741056, "step": 1866 }, { "epoch": 0.21224264197872048, "grad_norm": 0.6318493485450745, "learning_rate": 4.146079484425349e-05, "loss": 2.1049, "num_input_tokens_seen": 736100352, "step": 1872 }, { "epoch": 0.21292290685685739, "grad_norm": 0.6708328127861023, "learning_rate": 4.14249910490512e-05, "loss": 2.0954, "num_input_tokens_seen": 738459648, "step": 1878 }, { "epoch": 0.21360317173499432, "grad_norm": 0.6389116644859314, "learning_rate": 4.138918725384891e-05, "loss": 2.113, "num_input_tokens_seen": 740818944, "step": 1884 }, { "epoch": 0.21428343661313123, "grad_norm": 0.6693724393844604, "learning_rate": 4.135338345864662e-05, "loss": 2.0786, "num_input_tokens_seen": 743178240, "step": 1890 }, { "epoch": 0.21496370149126817, "grad_norm": 0.6880051493644714, "learning_rate": 4.131757966344433e-05, "loss": 2.101, "num_input_tokens_seen": 745537536, "step": 1896 }, { "epoch": 0.21564396636940508, "grad_norm": 0.7772538065910339, "learning_rate": 4.1281775868242036e-05, "loss": 2.0912, "num_input_tokens_seen": 747896832, "step": 1902 }, { "epoch": 0.21632423124754202, "grad_norm": 0.5820342898368835, "learning_rate": 4.124597207303975e-05, "loss": 2.0999, "num_input_tokens_seen": 750256128, "step": 1908 }, { "epoch": 0.21700449612567893, "grad_norm": 0.6889671087265015, "learning_rate": 4.121016827783745e-05, "loss": 2.1452, "num_input_tokens_seen": 752615424, "step": 1914 }, { "epoch": 0.21768476100381587, "grad_norm": 0.7460409998893738, "learning_rate": 4.1174364482635163e-05, "loss": 2.0545, "num_input_tokens_seen": 754974720, "step": 1920 }, { "epoch": 0.21836502588195278, "grad_norm": 0.6621735692024231, "learning_rate": 4.113856068743287e-05, "loss": 2.1114, "num_input_tokens_seen": 757334016, "step": 1926 }, { "epoch": 0.21904529076008972, "grad_norm": 0.6911535859107971, "learning_rate": 4.110275689223057e-05, "loss": 2.0746, "num_input_tokens_seen": 759693312, "step": 1932 }, { "epoch": 0.21972555563822663, "grad_norm": 0.7786504626274109, "learning_rate": 4.1066953097028285e-05, "loss": 2.1343, "num_input_tokens_seen": 762052608, "step": 1938 }, { "epoch": 0.22040582051636357, "grad_norm": 0.6110914349555969, "learning_rate": 4.1031149301825996e-05, "loss": 2.1723, "num_input_tokens_seen": 764411904, "step": 1944 }, { "epoch": 0.22108608539450048, "grad_norm": 0.7057865858078003, "learning_rate": 4.099534550662371e-05, "loss": 2.1187, "num_input_tokens_seen": 766771200, "step": 1950 }, { "epoch": 0.2217663502726374, "grad_norm": 0.6199769973754883, "learning_rate": 4.095954171142141e-05, "loss": 2.0733, "num_input_tokens_seen": 769130496, "step": 1956 }, { "epoch": 0.22244661515077432, "grad_norm": 0.7107540965080261, "learning_rate": 4.0923737916219124e-05, "loss": 2.0964, "num_input_tokens_seen": 771489792, "step": 1962 }, { "epoch": 0.22312688002891126, "grad_norm": 0.6034384369850159, "learning_rate": 4.088793412101683e-05, "loss": 2.1232, "num_input_tokens_seen": 773849088, "step": 1968 }, { "epoch": 0.22380714490704817, "grad_norm": 0.6471470594406128, "learning_rate": 4.0852130325814534e-05, "loss": 2.0799, "num_input_tokens_seen": 776208384, "step": 1974 }, { "epoch": 0.2244874097851851, "grad_norm": 0.6443119049072266, "learning_rate": 4.0816326530612245e-05, "loss": 2.0843, "num_input_tokens_seen": 778567680, "step": 1980 }, { "epoch": 0.22516767466332202, "grad_norm": 0.6607959866523743, "learning_rate": 4.0780522735409957e-05, "loss": 2.1408, "num_input_tokens_seen": 780926976, "step": 1986 }, { "epoch": 0.22584793954145896, "grad_norm": 0.6692774891853333, "learning_rate": 4.074471894020767e-05, "loss": 2.0871, "num_input_tokens_seen": 783286272, "step": 1992 }, { "epoch": 0.22652820441959587, "grad_norm": 0.7502838969230652, "learning_rate": 4.070891514500537e-05, "loss": 2.1021, "num_input_tokens_seen": 785645568, "step": 1998 }, { "epoch": 0.22675495937897486, "eval_accuracy": 0.5746526251526252, "eval_loss": 2.079448699951172, "eval_runtime": 129.4769, "eval_samples_per_second": 3.089, "eval_steps_per_second": 1.035, "num_input_tokens_seen": 786432000, "step": 2000 }, { "epoch": 0.2272084692977328, "grad_norm": 0.6747561693191528, "learning_rate": 4.0673111349803084e-05, "loss": 2.0141, "num_input_tokens_seen": 788004864, "step": 2004 }, { "epoch": 0.22788873417586972, "grad_norm": 0.6549056172370911, "learning_rate": 4.063730755460079e-05, "loss": 2.1014, "num_input_tokens_seen": 790364160, "step": 2010 }, { "epoch": 0.22856899905400666, "grad_norm": 0.7539930939674377, "learning_rate": 4.0601503759398494e-05, "loss": 2.1268, "num_input_tokens_seen": 792723456, "step": 2016 }, { "epoch": 0.22924926393214357, "grad_norm": 0.5937004089355469, "learning_rate": 4.0565699964196206e-05, "loss": 2.0426, "num_input_tokens_seen": 795082752, "step": 2022 }, { "epoch": 0.2299295288102805, "grad_norm": 0.5992699861526489, "learning_rate": 4.052989616899392e-05, "loss": 2.1089, "num_input_tokens_seen": 797442048, "step": 2028 }, { "epoch": 0.2306097936884174, "grad_norm": 0.5451076030731201, "learning_rate": 4.049409237379163e-05, "loss": 2.0499, "num_input_tokens_seen": 799801344, "step": 2034 }, { "epoch": 0.23129005856655435, "grad_norm": 0.5855215787887573, "learning_rate": 4.0458288578589333e-05, "loss": 2.0501, "num_input_tokens_seen": 802160640, "step": 2040 }, { "epoch": 0.2319703234446913, "grad_norm": 0.6797962784767151, "learning_rate": 4.0422484783387045e-05, "loss": 2.1079, "num_input_tokens_seen": 804519936, "step": 2046 }, { "epoch": 0.2326505883228282, "grad_norm": 0.5858785510063171, "learning_rate": 4.038668098818475e-05, "loss": 2.1117, "num_input_tokens_seen": 806879232, "step": 2052 }, { "epoch": 0.23333085320096514, "grad_norm": 0.6085060834884644, "learning_rate": 4.0350877192982455e-05, "loss": 2.0706, "num_input_tokens_seen": 809238528, "step": 2058 }, { "epoch": 0.23401111807910205, "grad_norm": 0.5851722359657288, "learning_rate": 4.0315073397780166e-05, "loss": 2.1303, "num_input_tokens_seen": 811597824, "step": 2064 }, { "epoch": 0.234691382957239, "grad_norm": 0.6054412722587585, "learning_rate": 4.027926960257787e-05, "loss": 2.1186, "num_input_tokens_seen": 813957120, "step": 2070 }, { "epoch": 0.2353716478353759, "grad_norm": 0.6723355054855347, "learning_rate": 4.024346580737558e-05, "loss": 2.0892, "num_input_tokens_seen": 816316416, "step": 2076 }, { "epoch": 0.23605191271351283, "grad_norm": 0.6768056154251099, "learning_rate": 4.0207662012173294e-05, "loss": 2.0591, "num_input_tokens_seen": 818675712, "step": 2082 }, { "epoch": 0.23673217759164975, "grad_norm": 0.5856552124023438, "learning_rate": 4.0171858216971005e-05, "loss": 2.0735, "num_input_tokens_seen": 821035008, "step": 2088 }, { "epoch": 0.23741244246978668, "grad_norm": 0.7292026281356812, "learning_rate": 4.013605442176871e-05, "loss": 2.0721, "num_input_tokens_seen": 823394304, "step": 2094 }, { "epoch": 0.2380927073479236, "grad_norm": 0.6172975301742554, "learning_rate": 4.0100250626566415e-05, "loss": 2.0728, "num_input_tokens_seen": 825753600, "step": 2100 }, { "epoch": 0.23877297222606053, "grad_norm": 0.7551843523979187, "learning_rate": 4.0064446831364127e-05, "loss": 2.1213, "num_input_tokens_seen": 828112896, "step": 2106 }, { "epoch": 0.23945323710419744, "grad_norm": 0.625471830368042, "learning_rate": 4.002864303616183e-05, "loss": 2.066, "num_input_tokens_seen": 830472192, "step": 2112 }, { "epoch": 0.24013350198233438, "grad_norm": 0.6531856656074524, "learning_rate": 3.999283924095954e-05, "loss": 2.0666, "num_input_tokens_seen": 832831488, "step": 2118 }, { "epoch": 0.2408137668604713, "grad_norm": 0.652446448802948, "learning_rate": 3.9957035445757254e-05, "loss": 2.1193, "num_input_tokens_seen": 835190784, "step": 2124 }, { "epoch": 0.24149403173860823, "grad_norm": 0.6203518509864807, "learning_rate": 3.9921231650554966e-05, "loss": 2.1025, "num_input_tokens_seen": 837550080, "step": 2130 }, { "epoch": 0.24217429661674514, "grad_norm": 0.6497722268104553, "learning_rate": 3.988542785535267e-05, "loss": 2.0418, "num_input_tokens_seen": 839909376, "step": 2136 }, { "epoch": 0.24285456149488208, "grad_norm": 0.661279559135437, "learning_rate": 3.9849624060150376e-05, "loss": 2.0558, "num_input_tokens_seen": 842268672, "step": 2142 }, { "epoch": 0.243534826373019, "grad_norm": 0.5917189717292786, "learning_rate": 3.981382026494809e-05, "loss": 2.1041, "num_input_tokens_seen": 844627968, "step": 2148 }, { "epoch": 0.24421509125115592, "grad_norm": 0.8539558053016663, "learning_rate": 3.977801646974579e-05, "loss": 2.1188, "num_input_tokens_seen": 846987264, "step": 2154 }, { "epoch": 0.24489535612929283, "grad_norm": 0.5865846276283264, "learning_rate": 3.97422126745435e-05, "loss": 2.1069, "num_input_tokens_seen": 849346560, "step": 2160 }, { "epoch": 0.24557562100742977, "grad_norm": 0.6616944670677185, "learning_rate": 3.970640887934121e-05, "loss": 2.0633, "num_input_tokens_seen": 851705856, "step": 2166 }, { "epoch": 0.24625588588556668, "grad_norm": 0.5569839477539062, "learning_rate": 3.9670605084138926e-05, "loss": 2.0857, "num_input_tokens_seen": 854065152, "step": 2172 }, { "epoch": 0.24693615076370362, "grad_norm": 0.5691688060760498, "learning_rate": 3.963480128893663e-05, "loss": 2.061, "num_input_tokens_seen": 856424448, "step": 2178 }, { "epoch": 0.24761641564184053, "grad_norm": 0.607754647731781, "learning_rate": 3.9598997493734336e-05, "loss": 2.1715, "num_input_tokens_seen": 858783744, "step": 2184 }, { "epoch": 0.24829668051997747, "grad_norm": 0.7856176495552063, "learning_rate": 3.956319369853205e-05, "loss": 2.0697, "num_input_tokens_seen": 861143040, "step": 2190 }, { "epoch": 0.24897694539811438, "grad_norm": 0.7349157333374023, "learning_rate": 3.952738990332975e-05, "loss": 2.1124, "num_input_tokens_seen": 863502336, "step": 2196 }, { "epoch": 0.24965721027625132, "grad_norm": 0.6067531108856201, "learning_rate": 3.9491586108127464e-05, "loss": 2.0844, "num_input_tokens_seen": 865861632, "step": 2202 }, { "epoch": 0.25033747515438826, "grad_norm": 0.6353740692138672, "learning_rate": 3.945578231292517e-05, "loss": 2.0714, "num_input_tokens_seen": 868220928, "step": 2208 }, { "epoch": 0.25101774003252514, "grad_norm": 0.6207152605056763, "learning_rate": 3.941997851772288e-05, "loss": 2.1135, "num_input_tokens_seen": 870580224, "step": 2214 }, { "epoch": 0.2516980049106621, "grad_norm": 0.6664757132530212, "learning_rate": 3.938417472252059e-05, "loss": 2.0755, "num_input_tokens_seen": 872939520, "step": 2220 }, { "epoch": 0.252378269788799, "grad_norm": 0.6741634011268616, "learning_rate": 3.9348370927318297e-05, "loss": 2.1249, "num_input_tokens_seen": 875298816, "step": 2226 }, { "epoch": 0.25305853466693595, "grad_norm": 0.7361227869987488, "learning_rate": 3.931256713211601e-05, "loss": 2.0656, "num_input_tokens_seen": 877658112, "step": 2232 }, { "epoch": 0.25373879954507284, "grad_norm": 0.864486038684845, "learning_rate": 3.927676333691371e-05, "loss": 2.0808, "num_input_tokens_seen": 880017408, "step": 2238 }, { "epoch": 0.2544190644232098, "grad_norm": 0.817509114742279, "learning_rate": 3.9240959541711424e-05, "loss": 2.0223, "num_input_tokens_seen": 882376704, "step": 2244 }, { "epoch": 0.2550993293013467, "grad_norm": 0.6295050382614136, "learning_rate": 3.920515574650913e-05, "loss": 2.1069, "num_input_tokens_seen": 884736000, "step": 2250 }, { "epoch": 0.25577959417948365, "grad_norm": 0.557656466960907, "learning_rate": 3.916935195130684e-05, "loss": 2.0621, "num_input_tokens_seen": 887095296, "step": 2256 }, { "epoch": 0.2564598590576206, "grad_norm": 0.5819247364997864, "learning_rate": 3.913354815610455e-05, "loss": 2.0804, "num_input_tokens_seen": 889454592, "step": 2262 }, { "epoch": 0.25714012393575747, "grad_norm": 0.6297056674957275, "learning_rate": 3.909774436090226e-05, "loss": 2.0566, "num_input_tokens_seen": 891813888, "step": 2268 }, { "epoch": 0.2578203888138944, "grad_norm": 0.6011530756950378, "learning_rate": 3.906194056569997e-05, "loss": 2.0731, "num_input_tokens_seen": 894173184, "step": 2274 }, { "epoch": 0.25850065369203135, "grad_norm": 0.5878785252571106, "learning_rate": 3.902613677049767e-05, "loss": 2.115, "num_input_tokens_seen": 896532480, "step": 2280 }, { "epoch": 0.2591809185701683, "grad_norm": 0.6470881104469299, "learning_rate": 3.8990332975295385e-05, "loss": 2.0653, "num_input_tokens_seen": 898891776, "step": 2286 }, { "epoch": 0.25986118344830517, "grad_norm": 0.6201193332672119, "learning_rate": 3.895452918009309e-05, "loss": 2.0936, "num_input_tokens_seen": 901251072, "step": 2292 }, { "epoch": 0.2605414483264421, "grad_norm": 0.5656684637069702, "learning_rate": 3.89187253848908e-05, "loss": 2.1008, "num_input_tokens_seen": 903610368, "step": 2298 }, { "epoch": 0.26122171320457904, "grad_norm": 0.5908628106117249, "learning_rate": 3.8882921589688506e-05, "loss": 2.0602, "num_input_tokens_seen": 905969664, "step": 2304 }, { "epoch": 0.261901978082716, "grad_norm": 0.660382866859436, "learning_rate": 3.884711779448622e-05, "loss": 2.0933, "num_input_tokens_seen": 908328960, "step": 2310 }, { "epoch": 0.26258224296085286, "grad_norm": 0.5603790283203125, "learning_rate": 3.881131399928393e-05, "loss": 2.0898, "num_input_tokens_seen": 910688256, "step": 2316 }, { "epoch": 0.2632625078389898, "grad_norm": 0.6598983407020569, "learning_rate": 3.8775510204081634e-05, "loss": 2.0715, "num_input_tokens_seen": 913047552, "step": 2322 }, { "epoch": 0.26394277271712674, "grad_norm": 0.5827348828315735, "learning_rate": 3.8739706408879345e-05, "loss": 2.1179, "num_input_tokens_seen": 915406848, "step": 2328 }, { "epoch": 0.2646230375952637, "grad_norm": 0.7159097194671631, "learning_rate": 3.870390261367705e-05, "loss": 2.0344, "num_input_tokens_seen": 917766144, "step": 2334 }, { "epoch": 0.26530330247340056, "grad_norm": 0.6752398014068604, "learning_rate": 3.866809881847476e-05, "loss": 2.0513, "num_input_tokens_seen": 920125440, "step": 2340 }, { "epoch": 0.2659835673515375, "grad_norm": 0.598101794719696, "learning_rate": 3.8632295023272466e-05, "loss": 2.0701, "num_input_tokens_seen": 922484736, "step": 2346 }, { "epoch": 0.26666383222967444, "grad_norm": 0.6286051273345947, "learning_rate": 3.859649122807018e-05, "loss": 2.0854, "num_input_tokens_seen": 924844032, "step": 2352 }, { "epoch": 0.2673440971078114, "grad_norm": 0.6396269202232361, "learning_rate": 3.856068743286789e-05, "loss": 2.1089, "num_input_tokens_seen": 927203328, "step": 2358 }, { "epoch": 0.26802436198594826, "grad_norm": 0.6398798823356628, "learning_rate": 3.8524883637665594e-05, "loss": 2.0501, "num_input_tokens_seen": 929562624, "step": 2364 }, { "epoch": 0.2687046268640852, "grad_norm": 0.6426295042037964, "learning_rate": 3.8489079842463306e-05, "loss": 2.0617, "num_input_tokens_seen": 931921920, "step": 2370 }, { "epoch": 0.26938489174222213, "grad_norm": 0.6402562856674194, "learning_rate": 3.845327604726101e-05, "loss": 2.0715, "num_input_tokens_seen": 934281216, "step": 2376 }, { "epoch": 0.27006515662035907, "grad_norm": 0.699862539768219, "learning_rate": 3.841747225205872e-05, "loss": 2.13, "num_input_tokens_seen": 936640512, "step": 2382 }, { "epoch": 0.27074542149849595, "grad_norm": 0.8998868465423584, "learning_rate": 3.838166845685643e-05, "loss": 2.1084, "num_input_tokens_seen": 938999808, "step": 2388 }, { "epoch": 0.2714256863766329, "grad_norm": 0.665034294128418, "learning_rate": 3.834586466165413e-05, "loss": 2.0959, "num_input_tokens_seen": 941359104, "step": 2394 }, { "epoch": 0.27210595125476983, "grad_norm": 0.745847225189209, "learning_rate": 3.831006086645185e-05, "loss": 2.0794, "num_input_tokens_seen": 943718400, "step": 2400 }, { "epoch": 0.27210595125476983, "eval_accuracy": 0.5762228327228327, "eval_loss": 2.068709135055542, "eval_runtime": 128.4911, "eval_samples_per_second": 3.113, "eval_steps_per_second": 1.043, "num_input_tokens_seen": 943718400, "step": 2400 }, { "epoch": 0.27278621613290677, "grad_norm": 0.6324106454849243, "learning_rate": 3.8274257071249555e-05, "loss": 2.0734, "num_input_tokens_seen": 946077696, "step": 2406 }, { "epoch": 0.27346648101104365, "grad_norm": 0.7810145020484924, "learning_rate": 3.8238453276047266e-05, "loss": 2.0681, "num_input_tokens_seen": 948436992, "step": 2412 }, { "epoch": 0.2741467458891806, "grad_norm": 0.6391826272010803, "learning_rate": 3.820264948084497e-05, "loss": 2.0964, "num_input_tokens_seen": 950796288, "step": 2418 }, { "epoch": 0.2748270107673175, "grad_norm": 0.6988577842712402, "learning_rate": 3.816684568564268e-05, "loss": 2.1252, "num_input_tokens_seen": 953155584, "step": 2424 }, { "epoch": 0.27550727564545446, "grad_norm": 0.5647233128547668, "learning_rate": 3.813104189044039e-05, "loss": 2.0881, "num_input_tokens_seen": 955514880, "step": 2430 }, { "epoch": 0.27618754052359135, "grad_norm": 0.5780855417251587, "learning_rate": 3.809523809523809e-05, "loss": 2.0835, "num_input_tokens_seen": 957874176, "step": 2436 }, { "epoch": 0.2768678054017283, "grad_norm": 0.6789732575416565, "learning_rate": 3.8059434300035804e-05, "loss": 2.0732, "num_input_tokens_seen": 960233472, "step": 2442 }, { "epoch": 0.2775480702798652, "grad_norm": 0.6763067245483398, "learning_rate": 3.8023630504833515e-05, "loss": 2.0457, "num_input_tokens_seen": 962592768, "step": 2448 }, { "epoch": 0.27822833515800216, "grad_norm": 0.5905190110206604, "learning_rate": 3.798782670963123e-05, "loss": 2.0186, "num_input_tokens_seen": 964952064, "step": 2454 }, { "epoch": 0.2789086000361391, "grad_norm": 0.6527414321899414, "learning_rate": 3.795202291442893e-05, "loss": 2.0841, "num_input_tokens_seen": 967311360, "step": 2460 }, { "epoch": 0.279588864914276, "grad_norm": 0.5765488743782043, "learning_rate": 3.791621911922664e-05, "loss": 2.0811, "num_input_tokens_seen": 969670656, "step": 2466 }, { "epoch": 0.2802691297924129, "grad_norm": 0.6708554625511169, "learning_rate": 3.788041532402435e-05, "loss": 2.0625, "num_input_tokens_seen": 972029952, "step": 2472 }, { "epoch": 0.28094939467054986, "grad_norm": 0.6201637983322144, "learning_rate": 3.784461152882205e-05, "loss": 2.1125, "num_input_tokens_seen": 974389248, "step": 2478 }, { "epoch": 0.2816296595486868, "grad_norm": 0.6302900314331055, "learning_rate": 3.7808807733619764e-05, "loss": 2.1199, "num_input_tokens_seen": 976748544, "step": 2484 }, { "epoch": 0.2823099244268237, "grad_norm": 0.7140418887138367, "learning_rate": 3.7773003938417476e-05, "loss": 2.1241, "num_input_tokens_seen": 979107840, "step": 2490 }, { "epoch": 0.2829901893049606, "grad_norm": 0.5913351774215698, "learning_rate": 3.773720014321519e-05, "loss": 2.0307, "num_input_tokens_seen": 981467136, "step": 2496 }, { "epoch": 0.28367045418309755, "grad_norm": 0.5941835045814514, "learning_rate": 3.770139634801289e-05, "loss": 2.069, "num_input_tokens_seen": 983826432, "step": 2502 }, { "epoch": 0.2843507190612345, "grad_norm": 0.6847456097602844, "learning_rate": 3.7665592552810604e-05, "loss": 2.0953, "num_input_tokens_seen": 986185728, "step": 2508 }, { "epoch": 0.2850309839393714, "grad_norm": 0.6352680325508118, "learning_rate": 3.762978875760831e-05, "loss": 2.0734, "num_input_tokens_seen": 988545024, "step": 2514 }, { "epoch": 0.2857112488175083, "grad_norm": 0.6623321771621704, "learning_rate": 3.759398496240601e-05, "loss": 2.0512, "num_input_tokens_seen": 990904320, "step": 2520 }, { "epoch": 0.28639151369564525, "grad_norm": 0.718250572681427, "learning_rate": 3.7558181167203725e-05, "loss": 2.0888, "num_input_tokens_seen": 993263616, "step": 2526 }, { "epoch": 0.2870717785737822, "grad_norm": 0.5607486367225647, "learning_rate": 3.752237737200143e-05, "loss": 2.1202, "num_input_tokens_seen": 995622912, "step": 2532 }, { "epoch": 0.28775204345191907, "grad_norm": 0.653218150138855, "learning_rate": 3.748657357679914e-05, "loss": 2.1099, "num_input_tokens_seen": 997982208, "step": 2538 }, { "epoch": 0.288432308330056, "grad_norm": 0.6100384593009949, "learning_rate": 3.745076978159685e-05, "loss": 2.0464, "num_input_tokens_seen": 1000341504, "step": 2544 }, { "epoch": 0.28911257320819295, "grad_norm": 0.6485652327537537, "learning_rate": 3.7414965986394564e-05, "loss": 2.0631, "num_input_tokens_seen": 1002700800, "step": 2550 }, { "epoch": 0.2897928380863299, "grad_norm": 0.6714969873428345, "learning_rate": 3.737916219119227e-05, "loss": 2.0325, "num_input_tokens_seen": 1005060096, "step": 2556 }, { "epoch": 0.29047310296446677, "grad_norm": 0.629289448261261, "learning_rate": 3.7343358395989974e-05, "loss": 2.0345, "num_input_tokens_seen": 1007419392, "step": 2562 }, { "epoch": 0.2911533678426037, "grad_norm": 0.6530044078826904, "learning_rate": 3.7307554600787685e-05, "loss": 2.1037, "num_input_tokens_seen": 1009778688, "step": 2568 }, { "epoch": 0.29183363272074064, "grad_norm": 0.6162053942680359, "learning_rate": 3.727175080558539e-05, "loss": 2.0594, "num_input_tokens_seen": 1012137984, "step": 2574 }, { "epoch": 0.2925138975988776, "grad_norm": 0.6271448731422424, "learning_rate": 3.72359470103831e-05, "loss": 2.0737, "num_input_tokens_seen": 1014497280, "step": 2580 }, { "epoch": 0.29319416247701446, "grad_norm": 0.5966920256614685, "learning_rate": 3.720014321518081e-05, "loss": 2.1086, "num_input_tokens_seen": 1016856576, "step": 2586 }, { "epoch": 0.2938744273551514, "grad_norm": 0.6952504515647888, "learning_rate": 3.7164339419978525e-05, "loss": 2.0835, "num_input_tokens_seen": 1019215872, "step": 2592 }, { "epoch": 0.29455469223328834, "grad_norm": 0.6622751951217651, "learning_rate": 3.712853562477623e-05, "loss": 2.1187, "num_input_tokens_seen": 1021575168, "step": 2598 }, { "epoch": 0.2952349571114253, "grad_norm": 0.7054808139801025, "learning_rate": 3.7092731829573934e-05, "loss": 2.101, "num_input_tokens_seen": 1023934464, "step": 2604 }, { "epoch": 0.29591522198956216, "grad_norm": 0.5338059663772583, "learning_rate": 3.7056928034371646e-05, "loss": 2.0577, "num_input_tokens_seen": 1026293760, "step": 2610 }, { "epoch": 0.2965954868676991, "grad_norm": 0.6121593117713928, "learning_rate": 3.702112423916935e-05, "loss": 2.0464, "num_input_tokens_seen": 1028653056, "step": 2616 }, { "epoch": 0.29727575174583604, "grad_norm": 0.6173185706138611, "learning_rate": 3.698532044396706e-05, "loss": 2.0741, "num_input_tokens_seen": 1031012352, "step": 2622 }, { "epoch": 0.297956016623973, "grad_norm": 0.5515555739402771, "learning_rate": 3.6949516648764774e-05, "loss": 2.0617, "num_input_tokens_seen": 1033371648, "step": 2628 }, { "epoch": 0.29863628150210986, "grad_norm": 0.6501288414001465, "learning_rate": 3.6913712853562485e-05, "loss": 2.1319, "num_input_tokens_seen": 1035730944, "step": 2634 }, { "epoch": 0.2993165463802468, "grad_norm": 0.6460755467414856, "learning_rate": 3.687790905836019e-05, "loss": 2.0581, "num_input_tokens_seen": 1038090240, "step": 2640 }, { "epoch": 0.29999681125838373, "grad_norm": 0.5400772094726562, "learning_rate": 3.6842105263157895e-05, "loss": 2.0937, "num_input_tokens_seen": 1040449536, "step": 2646 }, { "epoch": 0.30067707613652067, "grad_norm": 0.7050911784172058, "learning_rate": 3.6806301467955606e-05, "loss": 2.0414, "num_input_tokens_seen": 1042808832, "step": 2652 }, { "epoch": 0.3013573410146576, "grad_norm": 0.502206563949585, "learning_rate": 3.677049767275331e-05, "loss": 2.0569, "num_input_tokens_seen": 1045168128, "step": 2658 }, { "epoch": 0.3020376058927945, "grad_norm": 0.6481841206550598, "learning_rate": 3.673469387755102e-05, "loss": 2.0846, "num_input_tokens_seen": 1047527424, "step": 2664 }, { "epoch": 0.30271787077093143, "grad_norm": 0.6112203598022461, "learning_rate": 3.669889008234873e-05, "loss": 2.0746, "num_input_tokens_seen": 1049886720, "step": 2670 }, { "epoch": 0.30339813564906837, "grad_norm": 0.6601382493972778, "learning_rate": 3.666308628714644e-05, "loss": 2.052, "num_input_tokens_seen": 1052246016, "step": 2676 }, { "epoch": 0.3040784005272053, "grad_norm": 0.7059093713760376, "learning_rate": 3.662728249194415e-05, "loss": 2.046, "num_input_tokens_seen": 1054605312, "step": 2682 }, { "epoch": 0.3047586654053422, "grad_norm": 0.7588717341423035, "learning_rate": 3.6591478696741855e-05, "loss": 2.0597, "num_input_tokens_seen": 1056964608, "step": 2688 }, { "epoch": 0.3054389302834791, "grad_norm": 0.7248372435569763, "learning_rate": 3.655567490153957e-05, "loss": 2.0673, "num_input_tokens_seen": 1059323904, "step": 2694 }, { "epoch": 0.30611919516161606, "grad_norm": 0.581738293170929, "learning_rate": 3.651987110633727e-05, "loss": 2.084, "num_input_tokens_seen": 1061683200, "step": 2700 }, { "epoch": 0.306799460039753, "grad_norm": 0.5930314064025879, "learning_rate": 3.648406731113498e-05, "loss": 2.1012, "num_input_tokens_seen": 1064042496, "step": 2706 }, { "epoch": 0.3074797249178899, "grad_norm": 0.584109365940094, "learning_rate": 3.644826351593269e-05, "loss": 2.0744, "num_input_tokens_seen": 1066401792, "step": 2712 }, { "epoch": 0.3081599897960268, "grad_norm": 0.5961458086967468, "learning_rate": 3.64124597207304e-05, "loss": 2.0837, "num_input_tokens_seen": 1068761088, "step": 2718 }, { "epoch": 0.30884025467416376, "grad_norm": 0.6335872411727905, "learning_rate": 3.637665592552811e-05, "loss": 2.0681, "num_input_tokens_seen": 1071120384, "step": 2724 }, { "epoch": 0.3095205195523007, "grad_norm": 0.6117258071899414, "learning_rate": 3.6340852130325816e-05, "loss": 2.1143, "num_input_tokens_seen": 1073479680, "step": 2730 }, { "epoch": 0.3102007844304376, "grad_norm": 0.5619468688964844, "learning_rate": 3.630504833512353e-05, "loss": 2.0558, "num_input_tokens_seen": 1075838976, "step": 2736 }, { "epoch": 0.3108810493085745, "grad_norm": 0.555188000202179, "learning_rate": 3.626924453992123e-05, "loss": 2.0814, "num_input_tokens_seen": 1078198272, "step": 2742 }, { "epoch": 0.31156131418671146, "grad_norm": 0.5773251056671143, "learning_rate": 3.6233440744718944e-05, "loss": 2.0728, "num_input_tokens_seen": 1080557568, "step": 2748 }, { "epoch": 0.3122415790648484, "grad_norm": 0.6792175769805908, "learning_rate": 3.619763694951665e-05, "loss": 2.0708, "num_input_tokens_seen": 1082916864, "step": 2754 }, { "epoch": 0.3129218439429853, "grad_norm": 0.6672898530960083, "learning_rate": 3.616183315431436e-05, "loss": 2.0112, "num_input_tokens_seen": 1085276160, "step": 2760 }, { "epoch": 0.3136021088211222, "grad_norm": 0.6736769676208496, "learning_rate": 3.6126029359112065e-05, "loss": 2.0495, "num_input_tokens_seen": 1087635456, "step": 2766 }, { "epoch": 0.31428237369925915, "grad_norm": 0.6413402557373047, "learning_rate": 3.6090225563909776e-05, "loss": 1.9964, "num_input_tokens_seen": 1089994752, "step": 2772 }, { "epoch": 0.3149626385773961, "grad_norm": 0.5596314668655396, "learning_rate": 3.605442176870749e-05, "loss": 2.0394, "num_input_tokens_seen": 1092354048, "step": 2778 }, { "epoch": 0.315642903455533, "grad_norm": 0.5517847537994385, "learning_rate": 3.601861797350519e-05, "loss": 2.0687, "num_input_tokens_seen": 1094713344, "step": 2784 }, { "epoch": 0.3163231683336699, "grad_norm": 0.6080681085586548, "learning_rate": 3.5982814178302904e-05, "loss": 2.1019, "num_input_tokens_seen": 1097072640, "step": 2790 }, { "epoch": 0.31700343321180685, "grad_norm": 0.6057153344154358, "learning_rate": 3.594701038310061e-05, "loss": 2.0843, "num_input_tokens_seen": 1099431936, "step": 2796 }, { "epoch": 0.3174569431305648, "eval_accuracy": 0.5775622710622711, "eval_loss": 2.0592379570007324, "eval_runtime": 128.8803, "eval_samples_per_second": 3.104, "eval_steps_per_second": 1.04, "num_input_tokens_seen": 1101004800, "step": 2800 }, { "epoch": 0.3176836980899438, "grad_norm": 0.6708900332450867, "learning_rate": 3.591120658789832e-05, "loss": 2.1171, "num_input_tokens_seen": 1101791232, "step": 2802 }, { "epoch": 0.31836396296808067, "grad_norm": 0.5367056727409363, "learning_rate": 3.5875402792696025e-05, "loss": 2.0848, "num_input_tokens_seen": 1104150528, "step": 2808 }, { "epoch": 0.3190442278462176, "grad_norm": 0.6883641481399536, "learning_rate": 3.583959899749374e-05, "loss": 2.1015, "num_input_tokens_seen": 1106509824, "step": 2814 }, { "epoch": 0.31972449272435455, "grad_norm": 0.6446415781974792, "learning_rate": 3.580379520229145e-05, "loss": 2.079, "num_input_tokens_seen": 1108869120, "step": 2820 }, { "epoch": 0.3204047576024915, "grad_norm": 0.642508864402771, "learning_rate": 3.576799140708915e-05, "loss": 2.1132, "num_input_tokens_seen": 1111228416, "step": 2826 }, { "epoch": 0.32108502248062837, "grad_norm": 0.5669949054718018, "learning_rate": 3.5732187611886865e-05, "loss": 2.0901, "num_input_tokens_seen": 1113587712, "step": 2832 }, { "epoch": 0.3217652873587653, "grad_norm": 0.7657294869422913, "learning_rate": 3.569638381668457e-05, "loss": 2.0556, "num_input_tokens_seen": 1115947008, "step": 2838 }, { "epoch": 0.32244555223690224, "grad_norm": 0.7742637991905212, "learning_rate": 3.5660580021482274e-05, "loss": 2.0113, "num_input_tokens_seen": 1118306304, "step": 2844 }, { "epoch": 0.3231258171150392, "grad_norm": 0.7039967179298401, "learning_rate": 3.5624776226279986e-05, "loss": 2.079, "num_input_tokens_seen": 1120665600, "step": 2850 }, { "epoch": 0.3238060819931761, "grad_norm": 0.580337643623352, "learning_rate": 3.55889724310777e-05, "loss": 2.0946, "num_input_tokens_seen": 1123024896, "step": 2856 }, { "epoch": 0.324486346871313, "grad_norm": 0.5866253972053528, "learning_rate": 3.555316863587541e-05, "loss": 2.086, "num_input_tokens_seen": 1125384192, "step": 2862 }, { "epoch": 0.32516661174944994, "grad_norm": 0.5165377259254456, "learning_rate": 3.5517364840673114e-05, "loss": 2.0419, "num_input_tokens_seen": 1127743488, "step": 2868 }, { "epoch": 0.3258468766275869, "grad_norm": 0.5327121615409851, "learning_rate": 3.5481561045470825e-05, "loss": 2.064, "num_input_tokens_seen": 1130102784, "step": 2874 }, { "epoch": 0.3265271415057238, "grad_norm": 0.7180930972099304, "learning_rate": 3.544575725026853e-05, "loss": 1.9961, "num_input_tokens_seen": 1132462080, "step": 2880 }, { "epoch": 0.3272074063838607, "grad_norm": 0.5961750745773315, "learning_rate": 3.5409953455066235e-05, "loss": 2.031, "num_input_tokens_seen": 1134821376, "step": 2886 }, { "epoch": 0.32788767126199764, "grad_norm": 0.6628397107124329, "learning_rate": 3.5374149659863946e-05, "loss": 1.9875, "num_input_tokens_seen": 1137180672, "step": 2892 }, { "epoch": 0.3285679361401346, "grad_norm": 0.606051504611969, "learning_rate": 3.533834586466165e-05, "loss": 2.0553, "num_input_tokens_seen": 1139539968, "step": 2898 }, { "epoch": 0.3292482010182715, "grad_norm": 0.6407272219657898, "learning_rate": 3.530254206945936e-05, "loss": 2.0333, "num_input_tokens_seen": 1141899264, "step": 2904 }, { "epoch": 0.3299284658964084, "grad_norm": 0.5641146302223206, "learning_rate": 3.5266738274257074e-05, "loss": 2.1093, "num_input_tokens_seen": 1144258560, "step": 2910 }, { "epoch": 0.33060873077454533, "grad_norm": 0.6447109580039978, "learning_rate": 3.5230934479054786e-05, "loss": 2.0108, "num_input_tokens_seen": 1146617856, "step": 2916 }, { "epoch": 0.33128899565268227, "grad_norm": 0.6956091523170471, "learning_rate": 3.519513068385249e-05, "loss": 2.0884, "num_input_tokens_seen": 1148977152, "step": 2922 }, { "epoch": 0.3319692605308192, "grad_norm": 0.6706202626228333, "learning_rate": 3.5159326888650195e-05, "loss": 2.0462, "num_input_tokens_seen": 1151336448, "step": 2928 }, { "epoch": 0.3326495254089561, "grad_norm": 0.5899391174316406, "learning_rate": 3.512352309344791e-05, "loss": 2.0629, "num_input_tokens_seen": 1153695744, "step": 2934 }, { "epoch": 0.33332979028709303, "grad_norm": 0.695925772190094, "learning_rate": 3.508771929824561e-05, "loss": 2.0594, "num_input_tokens_seen": 1156055040, "step": 2940 }, { "epoch": 0.33401005516522997, "grad_norm": 0.5403394103050232, "learning_rate": 3.505191550304332e-05, "loss": 2.0885, "num_input_tokens_seen": 1158414336, "step": 2946 }, { "epoch": 0.3346903200433669, "grad_norm": 0.6385943293571472, "learning_rate": 3.5016111707841035e-05, "loss": 2.0986, "num_input_tokens_seen": 1160773632, "step": 2952 }, { "epoch": 0.3353705849215038, "grad_norm": 0.5981218218803406, "learning_rate": 3.4980307912638746e-05, "loss": 2.0232, "num_input_tokens_seen": 1163132928, "step": 2958 }, { "epoch": 0.3360508497996407, "grad_norm": 0.6498490571975708, "learning_rate": 3.494450411743645e-05, "loss": 2.0837, "num_input_tokens_seen": 1165492224, "step": 2964 }, { "epoch": 0.33673111467777767, "grad_norm": 0.5568425059318542, "learning_rate": 3.4908700322234156e-05, "loss": 2.0792, "num_input_tokens_seen": 1167851520, "step": 2970 }, { "epoch": 0.3374113795559146, "grad_norm": 0.5944088697433472, "learning_rate": 3.487289652703187e-05, "loss": 2.0658, "num_input_tokens_seen": 1170210816, "step": 2976 }, { "epoch": 0.3380916444340515, "grad_norm": 0.6015023589134216, "learning_rate": 3.483709273182957e-05, "loss": 2.0746, "num_input_tokens_seen": 1172570112, "step": 2982 }, { "epoch": 0.3387719093121884, "grad_norm": 0.783666729927063, "learning_rate": 3.4801288936627283e-05, "loss": 2.0295, "num_input_tokens_seen": 1174929408, "step": 2988 }, { "epoch": 0.33945217419032536, "grad_norm": 0.5756369829177856, "learning_rate": 3.476548514142499e-05, "loss": 2.0684, "num_input_tokens_seen": 1177288704, "step": 2994 }, { "epoch": 0.3401324390684623, "grad_norm": 0.6056890487670898, "learning_rate": 3.4729681346222707e-05, "loss": 2.0383, "num_input_tokens_seen": 1179648000, "step": 3000 }, { "epoch": 0.3408127039465992, "grad_norm": 0.8019888997077942, "learning_rate": 3.469387755102041e-05, "loss": 2.0801, "num_input_tokens_seen": 1182007296, "step": 3006 }, { "epoch": 0.3414929688247361, "grad_norm": 0.6044601202011108, "learning_rate": 3.4658073755818116e-05, "loss": 2.0798, "num_input_tokens_seen": 1184366592, "step": 3012 }, { "epoch": 0.34217323370287306, "grad_norm": 0.5998896360397339, "learning_rate": 3.462226996061583e-05, "loss": 2.0317, "num_input_tokens_seen": 1186725888, "step": 3018 }, { "epoch": 0.34285349858101, "grad_norm": 0.5555676221847534, "learning_rate": 3.458646616541353e-05, "loss": 1.9898, "num_input_tokens_seen": 1189085184, "step": 3024 }, { "epoch": 0.3435337634591469, "grad_norm": 0.5591822862625122, "learning_rate": 3.4550662370211244e-05, "loss": 2.0605, "num_input_tokens_seen": 1191444480, "step": 3030 }, { "epoch": 0.3442140283372838, "grad_norm": 0.6183376908302307, "learning_rate": 3.451485857500895e-05, "loss": 2.1107, "num_input_tokens_seen": 1193803776, "step": 3036 }, { "epoch": 0.34489429321542076, "grad_norm": 0.6081872582435608, "learning_rate": 3.447905477980666e-05, "loss": 2.0465, "num_input_tokens_seen": 1196163072, "step": 3042 }, { "epoch": 0.3455745580935577, "grad_norm": 0.5790855288505554, "learning_rate": 3.444325098460437e-05, "loss": 2.0326, "num_input_tokens_seen": 1198522368, "step": 3048 }, { "epoch": 0.34625482297169463, "grad_norm": 0.7046033143997192, "learning_rate": 3.440744718940208e-05, "loss": 2.0282, "num_input_tokens_seen": 1200881664, "step": 3054 }, { "epoch": 0.3469350878498315, "grad_norm": 0.6874545812606812, "learning_rate": 3.437164339419979e-05, "loss": 2.0915, "num_input_tokens_seen": 1203240960, "step": 3060 }, { "epoch": 0.34761535272796845, "grad_norm": 0.5742839574813843, "learning_rate": 3.433583959899749e-05, "loss": 2.097, "num_input_tokens_seen": 1205600256, "step": 3066 }, { "epoch": 0.3482956176061054, "grad_norm": 0.5930187702178955, "learning_rate": 3.4300035803795204e-05, "loss": 2.1049, "num_input_tokens_seen": 1207959552, "step": 3072 }, { "epoch": 0.34897588248424233, "grad_norm": 0.5856387615203857, "learning_rate": 3.426423200859291e-05, "loss": 2.0913, "num_input_tokens_seen": 1210318848, "step": 3078 }, { "epoch": 0.3496561473623792, "grad_norm": 0.6059959530830383, "learning_rate": 3.422842821339062e-05, "loss": 2.0664, "num_input_tokens_seen": 1212678144, "step": 3084 }, { "epoch": 0.35033641224051615, "grad_norm": 0.6338859796524048, "learning_rate": 3.419262441818833e-05, "loss": 2.0543, "num_input_tokens_seen": 1215037440, "step": 3090 }, { "epoch": 0.3510166771186531, "grad_norm": 0.6134727001190186, "learning_rate": 3.415682062298604e-05, "loss": 2.0758, "num_input_tokens_seen": 1217396736, "step": 3096 }, { "epoch": 0.35169694199679, "grad_norm": 0.7190840244293213, "learning_rate": 3.412101682778375e-05, "loss": 2.0468, "num_input_tokens_seen": 1219756032, "step": 3102 }, { "epoch": 0.3523772068749269, "grad_norm": 0.6040173768997192, "learning_rate": 3.4085213032581453e-05, "loss": 2.0888, "num_input_tokens_seen": 1222115328, "step": 3108 }, { "epoch": 0.35305747175306385, "grad_norm": 0.6416704058647156, "learning_rate": 3.4049409237379165e-05, "loss": 2.0423, "num_input_tokens_seen": 1224474624, "step": 3114 }, { "epoch": 0.3537377366312008, "grad_norm": 0.6157965064048767, "learning_rate": 3.401360544217687e-05, "loss": 2.0746, "num_input_tokens_seen": 1226833920, "step": 3120 }, { "epoch": 0.3544180015093377, "grad_norm": 0.6185963153839111, "learning_rate": 3.397780164697458e-05, "loss": 2.0451, "num_input_tokens_seen": 1229193216, "step": 3126 }, { "epoch": 0.3550982663874746, "grad_norm": 0.5963800549507141, "learning_rate": 3.3941997851772286e-05, "loss": 2.0392, "num_input_tokens_seen": 1231552512, "step": 3132 }, { "epoch": 0.35577853126561154, "grad_norm": 0.6368474960327148, "learning_rate": 3.390619405657e-05, "loss": 2.0505, "num_input_tokens_seen": 1233911808, "step": 3138 }, { "epoch": 0.3564587961437485, "grad_norm": 0.675567090511322, "learning_rate": 3.387039026136771e-05, "loss": 2.0833, "num_input_tokens_seen": 1236271104, "step": 3144 }, { "epoch": 0.3571390610218854, "grad_norm": 0.6852293014526367, "learning_rate": 3.3834586466165414e-05, "loss": 2.0318, "num_input_tokens_seen": 1238630400, "step": 3150 }, { "epoch": 0.3578193259000223, "grad_norm": 0.7064585089683533, "learning_rate": 3.3798782670963125e-05, "loss": 2.0657, "num_input_tokens_seen": 1240989696, "step": 3156 }, { "epoch": 0.35849959077815924, "grad_norm": 0.6410323977470398, "learning_rate": 3.376297887576083e-05, "loss": 2.079, "num_input_tokens_seen": 1243348992, "step": 3162 }, { "epoch": 0.3591798556562962, "grad_norm": 0.7537684440612793, "learning_rate": 3.372717508055854e-05, "loss": 2.0817, "num_input_tokens_seen": 1245708288, "step": 3168 }, { "epoch": 0.3598601205344331, "grad_norm": 0.7127799391746521, "learning_rate": 3.3691371285356247e-05, "loss": 2.0786, "num_input_tokens_seen": 1248067584, "step": 3174 }, { "epoch": 0.36054038541257, "grad_norm": 0.5921429991722107, "learning_rate": 3.365556749015396e-05, "loss": 2.094, "num_input_tokens_seen": 1250426880, "step": 3180 }, { "epoch": 0.36122065029070693, "grad_norm": 0.5365628600120544, "learning_rate": 3.361976369495167e-05, "loss": 2.0306, "num_input_tokens_seen": 1252786176, "step": 3186 }, { "epoch": 0.3619009151688439, "grad_norm": 0.591437816619873, "learning_rate": 3.3583959899749374e-05, "loss": 2.0486, "num_input_tokens_seen": 1255145472, "step": 3192 }, { "epoch": 0.3625811800469808, "grad_norm": 0.5986304879188538, "learning_rate": 3.3548156104547086e-05, "loss": 2.0571, "num_input_tokens_seen": 1257504768, "step": 3198 }, { "epoch": 0.36280793500635977, "eval_accuracy": 0.5792582417582418, "eval_loss": 2.0506937503814697, "eval_runtime": 129.3292, "eval_samples_per_second": 3.093, "eval_steps_per_second": 1.036, "num_input_tokens_seen": 1258291200, "step": 3200 }, { "epoch": 0.3632614449251177, "grad_norm": 0.6536886096000671, "learning_rate": 3.351235230934479e-05, "loss": 2.0306, "num_input_tokens_seen": 1259864064, "step": 3204 }, { "epoch": 0.36394170980325463, "grad_norm": 0.6367084980010986, "learning_rate": 3.34765485141425e-05, "loss": 2.0495, "num_input_tokens_seen": 1262223360, "step": 3210 }, { "epoch": 0.36462197468139157, "grad_norm": 0.5505596995353699, "learning_rate": 3.344074471894021e-05, "loss": 2.0235, "num_input_tokens_seen": 1264582656, "step": 3216 }, { "epoch": 0.3653022395595285, "grad_norm": 0.6911424398422241, "learning_rate": 3.340494092373791e-05, "loss": 2.0436, "num_input_tokens_seen": 1266941952, "step": 3222 }, { "epoch": 0.3659825044376654, "grad_norm": 0.8652951717376709, "learning_rate": 3.336913712853563e-05, "loss": 2.043, "num_input_tokens_seen": 1269301248, "step": 3228 }, { "epoch": 0.36666276931580233, "grad_norm": 0.7431092262268066, "learning_rate": 3.3333333333333335e-05, "loss": 2.0535, "num_input_tokens_seen": 1271660544, "step": 3234 }, { "epoch": 0.36734303419393927, "grad_norm": 0.5794050097465515, "learning_rate": 3.3297529538131046e-05, "loss": 2.0574, "num_input_tokens_seen": 1274019840, "step": 3240 }, { "epoch": 0.3680232990720762, "grad_norm": 0.6215260624885559, "learning_rate": 3.326172574292875e-05, "loss": 2.086, "num_input_tokens_seen": 1276379136, "step": 3246 }, { "epoch": 0.3687035639502131, "grad_norm": 0.5912430286407471, "learning_rate": 3.322592194772646e-05, "loss": 2.079, "num_input_tokens_seen": 1278738432, "step": 3252 }, { "epoch": 0.36938382882835, "grad_norm": 0.5438397526741028, "learning_rate": 3.319011815252417e-05, "loss": 2.0822, "num_input_tokens_seen": 1281097728, "step": 3258 }, { "epoch": 0.37006409370648696, "grad_norm": 0.585678219795227, "learning_rate": 3.315431435732187e-05, "loss": 2.0288, "num_input_tokens_seen": 1283457024, "step": 3264 }, { "epoch": 0.3707443585846239, "grad_norm": 0.6816807985305786, "learning_rate": 3.3118510562119584e-05, "loss": 2.0905, "num_input_tokens_seen": 1285816320, "step": 3270 }, { "epoch": 0.37142462346276084, "grad_norm": 0.6669751405715942, "learning_rate": 3.3082706766917295e-05, "loss": 2.0476, "num_input_tokens_seen": 1288175616, "step": 3276 }, { "epoch": 0.3721048883408977, "grad_norm": 0.6388877034187317, "learning_rate": 3.304690297171501e-05, "loss": 2.0814, "num_input_tokens_seen": 1290534912, "step": 3282 }, { "epoch": 0.37278515321903466, "grad_norm": 0.6471198797225952, "learning_rate": 3.301109917651271e-05, "loss": 2.0667, "num_input_tokens_seen": 1292894208, "step": 3288 }, { "epoch": 0.3734654180971716, "grad_norm": 0.5172976851463318, "learning_rate": 3.297529538131042e-05, "loss": 2.0514, "num_input_tokens_seen": 1295253504, "step": 3294 }, { "epoch": 0.37414568297530854, "grad_norm": 0.7585137486457825, "learning_rate": 3.293949158610813e-05, "loss": 2.0004, "num_input_tokens_seen": 1297612800, "step": 3300 }, { "epoch": 0.3748259478534454, "grad_norm": 0.5259309411048889, "learning_rate": 3.290368779090583e-05, "loss": 2.0757, "num_input_tokens_seen": 1299972096, "step": 3306 }, { "epoch": 0.37550621273158236, "grad_norm": 0.58619225025177, "learning_rate": 3.2867883995703544e-05, "loss": 2.0677, "num_input_tokens_seen": 1302331392, "step": 3312 }, { "epoch": 0.3761864776097193, "grad_norm": 0.5706315636634827, "learning_rate": 3.2832080200501256e-05, "loss": 2.0797, "num_input_tokens_seen": 1304690688, "step": 3318 }, { "epoch": 0.37686674248785623, "grad_norm": 0.5927962064743042, "learning_rate": 3.279627640529897e-05, "loss": 2.079, "num_input_tokens_seen": 1307049984, "step": 3324 }, { "epoch": 0.3775470073659931, "grad_norm": 0.6606913208961487, "learning_rate": 3.276047261009667e-05, "loss": 2.1193, "num_input_tokens_seen": 1309409280, "step": 3330 }, { "epoch": 0.37822727224413005, "grad_norm": 0.585796058177948, "learning_rate": 3.2724668814894384e-05, "loss": 2.0842, "num_input_tokens_seen": 1311768576, "step": 3336 }, { "epoch": 0.378907537122267, "grad_norm": 0.6631506681442261, "learning_rate": 3.268886501969209e-05, "loss": 2.055, "num_input_tokens_seen": 1314127872, "step": 3342 }, { "epoch": 0.37958780200040393, "grad_norm": 0.577156126499176, "learning_rate": 3.265306122448979e-05, "loss": 2.0251, "num_input_tokens_seen": 1316487168, "step": 3348 }, { "epoch": 0.3802680668785408, "grad_norm": 0.5798112154006958, "learning_rate": 3.2617257429287505e-05, "loss": 2.0066, "num_input_tokens_seen": 1318846464, "step": 3354 }, { "epoch": 0.38094833175667775, "grad_norm": 0.5928402543067932, "learning_rate": 3.258145363408521e-05, "loss": 2.0594, "num_input_tokens_seen": 1321205760, "step": 3360 }, { "epoch": 0.3816285966348147, "grad_norm": 0.5730792880058289, "learning_rate": 3.254564983888292e-05, "loss": 2.014, "num_input_tokens_seen": 1323565056, "step": 3366 }, { "epoch": 0.3823088615129516, "grad_norm": 0.606386661529541, "learning_rate": 3.250984604368063e-05, "loss": 2.0674, "num_input_tokens_seen": 1325924352, "step": 3372 }, { "epoch": 0.3829891263910885, "grad_norm": 0.6056506037712097, "learning_rate": 3.2474042248478344e-05, "loss": 2.0847, "num_input_tokens_seen": 1328283648, "step": 3378 }, { "epoch": 0.38366939126922545, "grad_norm": 0.547749936580658, "learning_rate": 3.243823845327605e-05, "loss": 2.0519, "num_input_tokens_seen": 1330642944, "step": 3384 }, { "epoch": 0.3843496561473624, "grad_norm": 0.5810631513595581, "learning_rate": 3.2402434658073754e-05, "loss": 2.0552, "num_input_tokens_seen": 1333002240, "step": 3390 }, { "epoch": 0.3850299210254993, "grad_norm": 0.7057521343231201, "learning_rate": 3.2366630862871465e-05, "loss": 2.1079, "num_input_tokens_seen": 1335361536, "step": 3396 }, { "epoch": 0.3857101859036362, "grad_norm": 0.6711981892585754, "learning_rate": 3.233082706766917e-05, "loss": 2.0651, "num_input_tokens_seen": 1337720832, "step": 3402 }, { "epoch": 0.38639045078177314, "grad_norm": 0.7233543992042542, "learning_rate": 3.229502327246688e-05, "loss": 2.05, "num_input_tokens_seen": 1340080128, "step": 3408 }, { "epoch": 0.3870707156599101, "grad_norm": 0.6906174421310425, "learning_rate": 3.225921947726459e-05, "loss": 2.0953, "num_input_tokens_seen": 1342439424, "step": 3414 }, { "epoch": 0.387750980538047, "grad_norm": 0.634935736656189, "learning_rate": 3.2223415682062305e-05, "loss": 2.0757, "num_input_tokens_seen": 1344798720, "step": 3420 }, { "epoch": 0.3884312454161839, "grad_norm": 0.6409153342247009, "learning_rate": 3.218761188686001e-05, "loss": 2.0493, "num_input_tokens_seen": 1347158016, "step": 3426 }, { "epoch": 0.38911151029432084, "grad_norm": 0.5605142116546631, "learning_rate": 3.2151808091657714e-05, "loss": 2.0471, "num_input_tokens_seen": 1349517312, "step": 3432 }, { "epoch": 0.3897917751724578, "grad_norm": 0.5918275117874146, "learning_rate": 3.2116004296455426e-05, "loss": 2.0703, "num_input_tokens_seen": 1351876608, "step": 3438 }, { "epoch": 0.3904720400505947, "grad_norm": 0.583743155002594, "learning_rate": 3.208020050125313e-05, "loss": 2.0535, "num_input_tokens_seen": 1354235904, "step": 3444 }, { "epoch": 0.3911523049287316, "grad_norm": 0.5591037273406982, "learning_rate": 3.204439670605084e-05, "loss": 2.0863, "num_input_tokens_seen": 1356595200, "step": 3450 }, { "epoch": 0.39183256980686854, "grad_norm": 0.5802523493766785, "learning_rate": 3.2008592910848554e-05, "loss": 2.1016, "num_input_tokens_seen": 1358954496, "step": 3456 }, { "epoch": 0.3925128346850055, "grad_norm": 0.5013401508331299, "learning_rate": 3.1972789115646265e-05, "loss": 2.0506, "num_input_tokens_seen": 1361313792, "step": 3462 }, { "epoch": 0.3931930995631424, "grad_norm": 0.6218020915985107, "learning_rate": 3.193698532044397e-05, "loss": 2.079, "num_input_tokens_seen": 1363673088, "step": 3468 }, { "epoch": 0.39387336444127935, "grad_norm": 0.591705858707428, "learning_rate": 3.1901181525241675e-05, "loss": 2.0478, "num_input_tokens_seen": 1366032384, "step": 3474 }, { "epoch": 0.39455362931941623, "grad_norm": 0.6699190139770508, "learning_rate": 3.1865377730039386e-05, "loss": 2.0473, "num_input_tokens_seen": 1368391680, "step": 3480 }, { "epoch": 0.39523389419755317, "grad_norm": 0.6555076241493225, "learning_rate": 3.182957393483709e-05, "loss": 2.0559, "num_input_tokens_seen": 1370750976, "step": 3486 }, { "epoch": 0.3959141590756901, "grad_norm": 0.7014250159263611, "learning_rate": 3.17937701396348e-05, "loss": 2.0714, "num_input_tokens_seen": 1373110272, "step": 3492 }, { "epoch": 0.39659442395382705, "grad_norm": 0.588550329208374, "learning_rate": 3.175796634443251e-05, "loss": 2.0885, "num_input_tokens_seen": 1375469568, "step": 3498 }, { "epoch": 0.39727468883196393, "grad_norm": 0.572462260723114, "learning_rate": 3.172216254923022e-05, "loss": 2.0506, "num_input_tokens_seen": 1377828864, "step": 3504 }, { "epoch": 0.39795495371010087, "grad_norm": 0.534345805644989, "learning_rate": 3.168635875402793e-05, "loss": 2.0648, "num_input_tokens_seen": 1380188160, "step": 3510 }, { "epoch": 0.3986352185882378, "grad_norm": 0.6382195949554443, "learning_rate": 3.1650554958825635e-05, "loss": 2.0084, "num_input_tokens_seen": 1382547456, "step": 3516 }, { "epoch": 0.39931548346637474, "grad_norm": 0.6456411480903625, "learning_rate": 3.161475116362335e-05, "loss": 2.0935, "num_input_tokens_seen": 1384906752, "step": 3522 }, { "epoch": 0.3999957483445116, "grad_norm": 0.6360987424850464, "learning_rate": 3.157894736842105e-05, "loss": 2.0757, "num_input_tokens_seen": 1387266048, "step": 3528 }, { "epoch": 0.40067601322264856, "grad_norm": 0.6245688199996948, "learning_rate": 3.154314357321876e-05, "loss": 2.0381, "num_input_tokens_seen": 1389625344, "step": 3534 }, { "epoch": 0.4013562781007855, "grad_norm": 0.6057738661766052, "learning_rate": 3.150733977801647e-05, "loss": 2.0865, "num_input_tokens_seen": 1391984640, "step": 3540 }, { "epoch": 0.40203654297892244, "grad_norm": 0.6015221476554871, "learning_rate": 3.147153598281418e-05, "loss": 2.0725, "num_input_tokens_seen": 1394343936, "step": 3546 }, { "epoch": 0.4027168078570593, "grad_norm": 0.6431640982627869, "learning_rate": 3.143573218761189e-05, "loss": 2.0358, "num_input_tokens_seen": 1396703232, "step": 3552 }, { "epoch": 0.40339707273519626, "grad_norm": 0.5379701256752014, "learning_rate": 3.1399928392409596e-05, "loss": 2.0898, "num_input_tokens_seen": 1399062528, "step": 3558 }, { "epoch": 0.4040773376133332, "grad_norm": 0.647147536277771, "learning_rate": 3.136412459720731e-05, "loss": 2.0025, "num_input_tokens_seen": 1401421824, "step": 3564 }, { "epoch": 0.40475760249147014, "grad_norm": 0.547764003276825, "learning_rate": 3.132832080200501e-05, "loss": 2.0452, "num_input_tokens_seen": 1403781120, "step": 3570 }, { "epoch": 0.405437867369607, "grad_norm": 0.7669888734817505, "learning_rate": 3.1292517006802724e-05, "loss": 2.0367, "num_input_tokens_seen": 1406140416, "step": 3576 }, { "epoch": 0.40611813224774396, "grad_norm": 0.5902658700942993, "learning_rate": 3.125671321160043e-05, "loss": 2.0263, "num_input_tokens_seen": 1408499712, "step": 3582 }, { "epoch": 0.4067983971258809, "grad_norm": 0.5572285056114197, "learning_rate": 3.122090941639814e-05, "loss": 2.0315, "num_input_tokens_seen": 1410859008, "step": 3588 }, { "epoch": 0.40747866200401783, "grad_norm": 0.58447265625, "learning_rate": 3.1185105621195845e-05, "loss": 2.1052, "num_input_tokens_seen": 1413218304, "step": 3594 }, { "epoch": 0.4081589268821547, "grad_norm": 0.5623412132263184, "learning_rate": 3.1149301825993556e-05, "loss": 2.0841, "num_input_tokens_seen": 1415577600, "step": 3600 }, { "epoch": 0.4081589268821547, "eval_accuracy": 0.5801733821733822, "eval_loss": 2.0434608459472656, "eval_runtime": 128.4268, "eval_samples_per_second": 3.115, "eval_steps_per_second": 1.043, "num_input_tokens_seen": 1415577600, "step": 3600 }, { "epoch": 0.40883919176029165, "grad_norm": 0.5253978371620178, "learning_rate": 3.111349803079127e-05, "loss": 2.04, "num_input_tokens_seen": 1417936896, "step": 3606 }, { "epoch": 0.4095194566384286, "grad_norm": 0.5712242722511292, "learning_rate": 3.107769423558897e-05, "loss": 2.0483, "num_input_tokens_seen": 1420296192, "step": 3612 }, { "epoch": 0.41019972151656553, "grad_norm": 0.5923311710357666, "learning_rate": 3.1041890440386684e-05, "loss": 2.0499, "num_input_tokens_seen": 1422655488, "step": 3618 }, { "epoch": 0.4108799863947024, "grad_norm": 0.8339553475379944, "learning_rate": 3.100608664518439e-05, "loss": 2.1001, "num_input_tokens_seen": 1425014784, "step": 3624 }, { "epoch": 0.41156025127283935, "grad_norm": 0.6857354044914246, "learning_rate": 3.09702828499821e-05, "loss": 1.9921, "num_input_tokens_seen": 1427374080, "step": 3630 }, { "epoch": 0.4122405161509763, "grad_norm": 0.6165304183959961, "learning_rate": 3.0934479054779805e-05, "loss": 2.0732, "num_input_tokens_seen": 1429733376, "step": 3636 }, { "epoch": 0.4129207810291132, "grad_norm": 0.528439998626709, "learning_rate": 3.089867525957752e-05, "loss": 2.0696, "num_input_tokens_seen": 1432092672, "step": 3642 }, { "epoch": 0.4136010459072501, "grad_norm": 0.5932063460350037, "learning_rate": 3.086287146437523e-05, "loss": 2.0249, "num_input_tokens_seen": 1434451968, "step": 3648 }, { "epoch": 0.41428131078538705, "grad_norm": 0.6371628046035767, "learning_rate": 3.082706766917293e-05, "loss": 2.0799, "num_input_tokens_seen": 1436811264, "step": 3654 }, { "epoch": 0.414961575663524, "grad_norm": 0.5642787218093872, "learning_rate": 3.0791263873970645e-05, "loss": 2.0326, "num_input_tokens_seen": 1439170560, "step": 3660 }, { "epoch": 0.4156418405416609, "grad_norm": 0.6325972080230713, "learning_rate": 3.075546007876835e-05, "loss": 2.0502, "num_input_tokens_seen": 1441529856, "step": 3666 }, { "epoch": 0.41632210541979786, "grad_norm": 0.557271420955658, "learning_rate": 3.071965628356606e-05, "loss": 2.0377, "num_input_tokens_seen": 1443889152, "step": 3672 }, { "epoch": 0.41700237029793474, "grad_norm": 0.5825797319412231, "learning_rate": 3.0683852488363766e-05, "loss": 2.0536, "num_input_tokens_seen": 1446248448, "step": 3678 }, { "epoch": 0.4176826351760717, "grad_norm": 0.6249643564224243, "learning_rate": 3.064804869316148e-05, "loss": 2.0838, "num_input_tokens_seen": 1448607744, "step": 3684 }, { "epoch": 0.4183629000542086, "grad_norm": 0.6836763024330139, "learning_rate": 3.061224489795919e-05, "loss": 2.0301, "num_input_tokens_seen": 1450967040, "step": 3690 }, { "epoch": 0.41904316493234556, "grad_norm": 0.5793729424476624, "learning_rate": 3.0576441102756894e-05, "loss": 2.0429, "num_input_tokens_seen": 1453326336, "step": 3696 }, { "epoch": 0.41972342981048244, "grad_norm": 0.6290580034255981, "learning_rate": 3.0540637307554605e-05, "loss": 2.0906, "num_input_tokens_seen": 1455685632, "step": 3702 }, { "epoch": 0.4204036946886194, "grad_norm": 0.6509286165237427, "learning_rate": 3.050483351235231e-05, "loss": 2.0252, "num_input_tokens_seen": 1458044928, "step": 3708 }, { "epoch": 0.4210839595667563, "grad_norm": 0.5808912515640259, "learning_rate": 3.046902971715002e-05, "loss": 2.0958, "num_input_tokens_seen": 1460404224, "step": 3714 }, { "epoch": 0.42176422444489325, "grad_norm": 0.6550482511520386, "learning_rate": 3.0433225921947726e-05, "loss": 2.0413, "num_input_tokens_seen": 1462763520, "step": 3720 }, { "epoch": 0.42244448932303014, "grad_norm": 0.6474577784538269, "learning_rate": 3.0397422126745434e-05, "loss": 2.0362, "num_input_tokens_seen": 1465122816, "step": 3726 }, { "epoch": 0.4231247542011671, "grad_norm": 0.6413889527320862, "learning_rate": 3.0361618331543146e-05, "loss": 2.0385, "num_input_tokens_seen": 1467482112, "step": 3732 }, { "epoch": 0.423805019079304, "grad_norm": 0.5291987061500549, "learning_rate": 3.032581453634085e-05, "loss": 2.0211, "num_input_tokens_seen": 1469841408, "step": 3738 }, { "epoch": 0.42448528395744095, "grad_norm": 0.5267509818077087, "learning_rate": 3.0290010741138562e-05, "loss": 2.0544, "num_input_tokens_seen": 1472200704, "step": 3744 }, { "epoch": 0.42516554883557783, "grad_norm": 0.5063323974609375, "learning_rate": 3.025420694593627e-05, "loss": 2.112, "num_input_tokens_seen": 1474560000, "step": 3750 }, { "epoch": 0.42584581371371477, "grad_norm": 0.5626435875892639, "learning_rate": 3.0218403150733982e-05, "loss": 2.0111, "num_input_tokens_seen": 1476919296, "step": 3756 }, { "epoch": 0.4265260785918517, "grad_norm": 0.5254883766174316, "learning_rate": 3.0182599355531687e-05, "loss": 2.0665, "num_input_tokens_seen": 1479278592, "step": 3762 }, { "epoch": 0.42720634346998865, "grad_norm": 0.6676201224327087, "learning_rate": 3.0146795560329395e-05, "loss": 2.0553, "num_input_tokens_seen": 1481637888, "step": 3768 }, { "epoch": 0.42788660834812553, "grad_norm": 0.618036150932312, "learning_rate": 3.0110991765127106e-05, "loss": 2.0896, "num_input_tokens_seen": 1483997184, "step": 3774 }, { "epoch": 0.42856687322626247, "grad_norm": 0.5902726054191589, "learning_rate": 3.007518796992481e-05, "loss": 2.0941, "num_input_tokens_seen": 1486356480, "step": 3780 }, { "epoch": 0.4292471381043994, "grad_norm": 0.6541236639022827, "learning_rate": 3.0039384174722523e-05, "loss": 2.0441, "num_input_tokens_seen": 1488715776, "step": 3786 }, { "epoch": 0.42992740298253634, "grad_norm": 0.6734746098518372, "learning_rate": 3.000358037952023e-05, "loss": 2.098, "num_input_tokens_seen": 1491075072, "step": 3792 }, { "epoch": 0.4306076678606732, "grad_norm": 0.690733015537262, "learning_rate": 2.9967776584317943e-05, "loss": 2.0626, "num_input_tokens_seen": 1493434368, "step": 3798 }, { "epoch": 0.43128793273881016, "grad_norm": 0.6429844498634338, "learning_rate": 2.9931972789115647e-05, "loss": 2.0556, "num_input_tokens_seen": 1495793664, "step": 3804 }, { "epoch": 0.4319681976169471, "grad_norm": 0.692583441734314, "learning_rate": 2.9896168993913355e-05, "loss": 2.041, "num_input_tokens_seen": 1498152960, "step": 3810 }, { "epoch": 0.43264846249508404, "grad_norm": 0.5887177586555481, "learning_rate": 2.9860365198711067e-05, "loss": 2.0964, "num_input_tokens_seen": 1500512256, "step": 3816 }, { "epoch": 0.4333287273732209, "grad_norm": 0.5106215476989746, "learning_rate": 2.9824561403508772e-05, "loss": 1.9808, "num_input_tokens_seen": 1502871552, "step": 3822 }, { "epoch": 0.43400899225135786, "grad_norm": 0.648137629032135, "learning_rate": 2.9788757608306483e-05, "loss": 2.0234, "num_input_tokens_seen": 1505230848, "step": 3828 }, { "epoch": 0.4346892571294948, "grad_norm": 0.5356113314628601, "learning_rate": 2.9752953813104188e-05, "loss": 2.0467, "num_input_tokens_seen": 1507590144, "step": 3834 }, { "epoch": 0.43536952200763174, "grad_norm": 0.5586897134780884, "learning_rate": 2.9717150017901903e-05, "loss": 2.0798, "num_input_tokens_seen": 1509949440, "step": 3840 }, { "epoch": 0.4360497868857686, "grad_norm": 0.5449038743972778, "learning_rate": 2.9681346222699608e-05, "loss": 2.074, "num_input_tokens_seen": 1512308736, "step": 3846 }, { "epoch": 0.43673005176390556, "grad_norm": 0.610542893409729, "learning_rate": 2.9645542427497313e-05, "loss": 2.0539, "num_input_tokens_seen": 1514668032, "step": 3852 }, { "epoch": 0.4374103166420425, "grad_norm": 0.591969907283783, "learning_rate": 2.9609738632295024e-05, "loss": 2.0768, "num_input_tokens_seen": 1517027328, "step": 3858 }, { "epoch": 0.43809058152017943, "grad_norm": 0.6223018765449524, "learning_rate": 2.9573934837092732e-05, "loss": 1.9882, "num_input_tokens_seen": 1519386624, "step": 3864 }, { "epoch": 0.43877084639831637, "grad_norm": 0.5089840292930603, "learning_rate": 2.9538131041890444e-05, "loss": 2.03, "num_input_tokens_seen": 1521745920, "step": 3870 }, { "epoch": 0.43945111127645325, "grad_norm": 0.6258916854858398, "learning_rate": 2.950232724668815e-05, "loss": 2.0287, "num_input_tokens_seen": 1524105216, "step": 3876 }, { "epoch": 0.4401313761545902, "grad_norm": 0.7169709205627441, "learning_rate": 2.946652345148586e-05, "loss": 2.0542, "num_input_tokens_seen": 1526464512, "step": 3882 }, { "epoch": 0.44081164103272713, "grad_norm": 0.7032943367958069, "learning_rate": 2.9430719656283568e-05, "loss": 2.0616, "num_input_tokens_seen": 1528823808, "step": 3888 }, { "epoch": 0.44149190591086407, "grad_norm": 0.7222307324409485, "learning_rate": 2.9394915861081273e-05, "loss": 2.0536, "num_input_tokens_seen": 1531183104, "step": 3894 }, { "epoch": 0.44217217078900095, "grad_norm": 0.6075944304466248, "learning_rate": 2.9359112065878985e-05, "loss": 2.0727, "num_input_tokens_seen": 1533542400, "step": 3900 }, { "epoch": 0.4428524356671379, "grad_norm": 0.5314520597457886, "learning_rate": 2.9323308270676693e-05, "loss": 2.0807, "num_input_tokens_seen": 1535901696, "step": 3906 }, { "epoch": 0.4435327005452748, "grad_norm": 0.5990427136421204, "learning_rate": 2.9287504475474404e-05, "loss": 2.0537, "num_input_tokens_seen": 1538260992, "step": 3912 }, { "epoch": 0.44421296542341177, "grad_norm": 0.6096120476722717, "learning_rate": 2.925170068027211e-05, "loss": 2.0997, "num_input_tokens_seen": 1540620288, "step": 3918 }, { "epoch": 0.44489323030154865, "grad_norm": 0.5830526351928711, "learning_rate": 2.921589688506982e-05, "loss": 2.0595, "num_input_tokens_seen": 1542979584, "step": 3924 }, { "epoch": 0.4455734951796856, "grad_norm": 0.5455676913261414, "learning_rate": 2.918009308986753e-05, "loss": 2.0565, "num_input_tokens_seen": 1545338880, "step": 3930 }, { "epoch": 0.4462537600578225, "grad_norm": 0.5473060011863708, "learning_rate": 2.9144289294665234e-05, "loss": 2.0598, "num_input_tokens_seen": 1547698176, "step": 3936 }, { "epoch": 0.44693402493595946, "grad_norm": 0.5502248406410217, "learning_rate": 2.9108485499462945e-05, "loss": 2.0327, "num_input_tokens_seen": 1550057472, "step": 3942 }, { "epoch": 0.44761428981409634, "grad_norm": 0.5692510008811951, "learning_rate": 2.907268170426065e-05, "loss": 2.0605, "num_input_tokens_seen": 1552416768, "step": 3948 }, { "epoch": 0.4482945546922333, "grad_norm": 0.5818192362785339, "learning_rate": 2.9036877909058365e-05, "loss": 2.058, "num_input_tokens_seen": 1554776064, "step": 3954 }, { "epoch": 0.4489748195703702, "grad_norm": 0.6013736128807068, "learning_rate": 2.900107411385607e-05, "loss": 2.0503, "num_input_tokens_seen": 1557135360, "step": 3960 }, { "epoch": 0.44965508444850716, "grad_norm": 0.5566754341125488, "learning_rate": 2.896527031865378e-05, "loss": 2.0796, "num_input_tokens_seen": 1559494656, "step": 3966 }, { "epoch": 0.45033534932664404, "grad_norm": 0.6082068085670471, "learning_rate": 2.8929466523451486e-05, "loss": 2.0633, "num_input_tokens_seen": 1561853952, "step": 3972 }, { "epoch": 0.451015614204781, "grad_norm": 0.5426793694496155, "learning_rate": 2.8893662728249194e-05, "loss": 2.0964, "num_input_tokens_seen": 1564213248, "step": 3978 }, { "epoch": 0.4516958790829179, "grad_norm": 0.549892246723175, "learning_rate": 2.8857858933046906e-05, "loss": 2.0454, "num_input_tokens_seen": 1566572544, "step": 3984 }, { "epoch": 0.45237614396105486, "grad_norm": 0.5879752039909363, "learning_rate": 2.882205513784461e-05, "loss": 2.0256, "num_input_tokens_seen": 1568931840, "step": 3990 }, { "epoch": 0.45305640883919174, "grad_norm": 0.6315425038337708, "learning_rate": 2.8786251342642322e-05, "loss": 2.0484, "num_input_tokens_seen": 1571291136, "step": 3996 }, { "epoch": 0.4535099187579497, "eval_accuracy": 0.5812625152625153, "eval_loss": 2.036273241043091, "eval_runtime": 128.5473, "eval_samples_per_second": 3.112, "eval_steps_per_second": 1.042, "num_input_tokens_seen": 1572864000, "step": 4000 }, { "epoch": 0.4537366737173287, "grad_norm": 0.5952754616737366, "learning_rate": 2.875044754744003e-05, "loss": 2.0647, "num_input_tokens_seen": 1573650432, "step": 4002 }, { "epoch": 0.4544169385954656, "grad_norm": 0.6178935766220093, "learning_rate": 2.871464375223774e-05, "loss": 2.0479, "num_input_tokens_seen": 1576009728, "step": 4008 }, { "epoch": 0.45509720347360255, "grad_norm": 0.5785337090492249, "learning_rate": 2.8678839957035446e-05, "loss": 2.0627, "num_input_tokens_seen": 1578369024, "step": 4014 }, { "epoch": 0.45577746835173943, "grad_norm": 0.7288320064544678, "learning_rate": 2.8643036161833155e-05, "loss": 2.0494, "num_input_tokens_seen": 1580728320, "step": 4020 }, { "epoch": 0.4564577332298764, "grad_norm": 0.5112663507461548, "learning_rate": 2.8607232366630866e-05, "loss": 2.0734, "num_input_tokens_seen": 1583087616, "step": 4026 }, { "epoch": 0.4571379981080133, "grad_norm": 0.6311584115028381, "learning_rate": 2.857142857142857e-05, "loss": 2.0358, "num_input_tokens_seen": 1585446912, "step": 4032 }, { "epoch": 0.45781826298615025, "grad_norm": 0.7229108214378357, "learning_rate": 2.8535624776226282e-05, "loss": 2.1127, "num_input_tokens_seen": 1587806208, "step": 4038 }, { "epoch": 0.45849852786428713, "grad_norm": 0.6722413301467896, "learning_rate": 2.849982098102399e-05, "loss": 2.0238, "num_input_tokens_seen": 1590165504, "step": 4044 }, { "epoch": 0.45917879274242407, "grad_norm": 0.5801773071289062, "learning_rate": 2.8464017185821702e-05, "loss": 2.0511, "num_input_tokens_seen": 1592524800, "step": 4050 }, { "epoch": 0.459859057620561, "grad_norm": 0.5483337044715881, "learning_rate": 2.8428213390619407e-05, "loss": 2.0307, "num_input_tokens_seen": 1594884096, "step": 4056 }, { "epoch": 0.46053932249869795, "grad_norm": 0.6021159887313843, "learning_rate": 2.839240959541711e-05, "loss": 2.0612, "num_input_tokens_seen": 1597243392, "step": 4062 }, { "epoch": 0.4612195873768348, "grad_norm": 0.6574224233627319, "learning_rate": 2.8356605800214827e-05, "loss": 2.1141, "num_input_tokens_seen": 1599602688, "step": 4068 }, { "epoch": 0.46189985225497177, "grad_norm": 0.6547892093658447, "learning_rate": 2.832080200501253e-05, "loss": 2.028, "num_input_tokens_seen": 1601961984, "step": 4074 }, { "epoch": 0.4625801171331087, "grad_norm": 0.5769033432006836, "learning_rate": 2.8284998209810243e-05, "loss": 2.0161, "num_input_tokens_seen": 1604321280, "step": 4080 }, { "epoch": 0.46326038201124564, "grad_norm": 0.5694014430046082, "learning_rate": 2.8249194414607948e-05, "loss": 2.0506, "num_input_tokens_seen": 1606680576, "step": 4086 }, { "epoch": 0.4639406468893826, "grad_norm": 0.522310733795166, "learning_rate": 2.821339061940566e-05, "loss": 2.0775, "num_input_tokens_seen": 1609039872, "step": 4092 }, { "epoch": 0.46462091176751946, "grad_norm": 0.5985013842582703, "learning_rate": 2.8177586824203367e-05, "loss": 2.0341, "num_input_tokens_seen": 1611399168, "step": 4098 }, { "epoch": 0.4653011766456564, "grad_norm": 0.6218437552452087, "learning_rate": 2.8141783029001072e-05, "loss": 2.0579, "num_input_tokens_seen": 1613758464, "step": 4104 }, { "epoch": 0.46598144152379334, "grad_norm": 0.6592255234718323, "learning_rate": 2.8105979233798784e-05, "loss": 2.0348, "num_input_tokens_seen": 1616117760, "step": 4110 }, { "epoch": 0.4666617064019303, "grad_norm": 0.583972692489624, "learning_rate": 2.8070175438596492e-05, "loss": 2.0909, "num_input_tokens_seen": 1618477056, "step": 4116 }, { "epoch": 0.46734197128006716, "grad_norm": 0.5974957346916199, "learning_rate": 2.8034371643394203e-05, "loss": 2.0778, "num_input_tokens_seen": 1620836352, "step": 4122 }, { "epoch": 0.4680222361582041, "grad_norm": 0.6519222855567932, "learning_rate": 2.7998567848191908e-05, "loss": 2.1053, "num_input_tokens_seen": 1623195648, "step": 4128 }, { "epoch": 0.46870250103634103, "grad_norm": 0.6389775276184082, "learning_rate": 2.796276405298962e-05, "loss": 2.0589, "num_input_tokens_seen": 1625554944, "step": 4134 }, { "epoch": 0.469382765914478, "grad_norm": 0.559096097946167, "learning_rate": 2.7926960257787328e-05, "loss": 2.0486, "num_input_tokens_seen": 1627914240, "step": 4140 }, { "epoch": 0.47006303079261486, "grad_norm": 0.6475938558578491, "learning_rate": 2.7891156462585033e-05, "loss": 2.0917, "num_input_tokens_seen": 1630273536, "step": 4146 }, { "epoch": 0.4707432956707518, "grad_norm": 0.5632593035697937, "learning_rate": 2.7855352667382744e-05, "loss": 2.1186, "num_input_tokens_seen": 1632632832, "step": 4152 }, { "epoch": 0.47142356054888873, "grad_norm": 0.6311493515968323, "learning_rate": 2.7819548872180452e-05, "loss": 2.0449, "num_input_tokens_seen": 1634992128, "step": 4158 }, { "epoch": 0.47210382542702567, "grad_norm": 0.5263134837150574, "learning_rate": 2.7783745076978164e-05, "loss": 2.0727, "num_input_tokens_seen": 1637351424, "step": 4164 }, { "epoch": 0.47278409030516255, "grad_norm": 0.6025407910346985, "learning_rate": 2.774794128177587e-05, "loss": 2.0727, "num_input_tokens_seen": 1639710720, "step": 4170 }, { "epoch": 0.4734643551832995, "grad_norm": 0.5921410322189331, "learning_rate": 2.771213748657358e-05, "loss": 2.0652, "num_input_tokens_seen": 1642070016, "step": 4176 }, { "epoch": 0.47414462006143643, "grad_norm": 0.631074845790863, "learning_rate": 2.767633369137129e-05, "loss": 2.0445, "num_input_tokens_seen": 1644429312, "step": 4182 }, { "epoch": 0.47482488493957337, "grad_norm": 0.6067950129508972, "learning_rate": 2.7640529896168993e-05, "loss": 2.0518, "num_input_tokens_seen": 1646788608, "step": 4188 }, { "epoch": 0.47550514981771025, "grad_norm": 0.7098994851112366, "learning_rate": 2.7604726100966705e-05, "loss": 2.0632, "num_input_tokens_seen": 1649147904, "step": 4194 }, { "epoch": 0.4761854146958472, "grad_norm": 0.59510338306427, "learning_rate": 2.756892230576441e-05, "loss": 2.0555, "num_input_tokens_seen": 1651507200, "step": 4200 }, { "epoch": 0.4768656795739841, "grad_norm": 0.5363790392875671, "learning_rate": 2.753311851056212e-05, "loss": 2.0648, "num_input_tokens_seen": 1653866496, "step": 4206 }, { "epoch": 0.47754594445212106, "grad_norm": 0.6022222638130188, "learning_rate": 2.749731471535983e-05, "loss": 2.0435, "num_input_tokens_seen": 1656225792, "step": 4212 }, { "epoch": 0.47822620933025795, "grad_norm": 0.6179582476615906, "learning_rate": 2.746151092015754e-05, "loss": 2.0574, "num_input_tokens_seen": 1658585088, "step": 4218 }, { "epoch": 0.4789064742083949, "grad_norm": 0.5979219079017639, "learning_rate": 2.7425707124955245e-05, "loss": 2.0891, "num_input_tokens_seen": 1660944384, "step": 4224 }, { "epoch": 0.4795867390865318, "grad_norm": 0.5890262722969055, "learning_rate": 2.7389903329752954e-05, "loss": 2.0579, "num_input_tokens_seen": 1663303680, "step": 4230 }, { "epoch": 0.48026700396466876, "grad_norm": 0.5914406776428223, "learning_rate": 2.7354099534550665e-05, "loss": 2.0666, "num_input_tokens_seen": 1665662976, "step": 4236 }, { "epoch": 0.48094726884280564, "grad_norm": 0.5765048861503601, "learning_rate": 2.731829573934837e-05, "loss": 2.0057, "num_input_tokens_seen": 1668022272, "step": 4242 }, { "epoch": 0.4816275337209426, "grad_norm": 0.5583755970001221, "learning_rate": 2.728249194414608e-05, "loss": 2.0303, "num_input_tokens_seen": 1670381568, "step": 4248 }, { "epoch": 0.4823077985990795, "grad_norm": 0.5724276900291443, "learning_rate": 2.724668814894379e-05, "loss": 2.0575, "num_input_tokens_seen": 1672740864, "step": 4254 }, { "epoch": 0.48298806347721646, "grad_norm": 0.6578513979911804, "learning_rate": 2.72108843537415e-05, "loss": 2.0213, "num_input_tokens_seen": 1675100160, "step": 4260 }, { "epoch": 0.48366832835535334, "grad_norm": 0.6713804602622986, "learning_rate": 2.7175080558539206e-05, "loss": 2.0428, "num_input_tokens_seen": 1677459456, "step": 4266 }, { "epoch": 0.4843485932334903, "grad_norm": 0.5634715557098389, "learning_rate": 2.7139276763336914e-05, "loss": 2.0461, "num_input_tokens_seen": 1679818752, "step": 4272 }, { "epoch": 0.4850288581116272, "grad_norm": 0.6080957055091858, "learning_rate": 2.7103472968134626e-05, "loss": 2.1146, "num_input_tokens_seen": 1682178048, "step": 4278 }, { "epoch": 0.48570912298976415, "grad_norm": 0.5606207251548767, "learning_rate": 2.706766917293233e-05, "loss": 2.0456, "num_input_tokens_seen": 1684537344, "step": 4284 }, { "epoch": 0.4863893878679011, "grad_norm": 0.6365392208099365, "learning_rate": 2.7031865377730042e-05, "loss": 2.0354, "num_input_tokens_seen": 1686896640, "step": 4290 }, { "epoch": 0.487069652746038, "grad_norm": 0.6958995461463928, "learning_rate": 2.699606158252775e-05, "loss": 2.0047, "num_input_tokens_seen": 1689255936, "step": 4296 }, { "epoch": 0.4877499176241749, "grad_norm": 0.7712015509605408, "learning_rate": 2.6960257787325462e-05, "loss": 2.0329, "num_input_tokens_seen": 1691615232, "step": 4302 }, { "epoch": 0.48843018250231185, "grad_norm": 0.5757302045822144, "learning_rate": 2.6924453992123166e-05, "loss": 2.0256, "num_input_tokens_seen": 1693974528, "step": 4308 }, { "epoch": 0.4891104473804488, "grad_norm": 0.5976963639259338, "learning_rate": 2.688865019692087e-05, "loss": 2.0844, "num_input_tokens_seen": 1696333824, "step": 4314 }, { "epoch": 0.48979071225858567, "grad_norm": 0.5346413254737854, "learning_rate": 2.6852846401718583e-05, "loss": 2.0371, "num_input_tokens_seen": 1698693120, "step": 4320 }, { "epoch": 0.4904709771367226, "grad_norm": 0.5894768238067627, "learning_rate": 2.681704260651629e-05, "loss": 2.0424, "num_input_tokens_seen": 1701052416, "step": 4326 }, { "epoch": 0.49115124201485955, "grad_norm": 0.5185033679008484, "learning_rate": 2.6781238811314003e-05, "loss": 2.0221, "num_input_tokens_seen": 1703411712, "step": 4332 }, { "epoch": 0.4918315068929965, "grad_norm": 0.708967924118042, "learning_rate": 2.6745435016111707e-05, "loss": 2.0596, "num_input_tokens_seen": 1705771008, "step": 4338 }, { "epoch": 0.49251177177113337, "grad_norm": 0.539934515953064, "learning_rate": 2.670963122090942e-05, "loss": 2.0724, "num_input_tokens_seen": 1708130304, "step": 4344 }, { "epoch": 0.4931920366492703, "grad_norm": 0.536572277545929, "learning_rate": 2.6673827425707127e-05, "loss": 2.0064, "num_input_tokens_seen": 1710489600, "step": 4350 }, { "epoch": 0.49387230152740724, "grad_norm": 0.6322241425514221, "learning_rate": 2.6638023630504832e-05, "loss": 2.0661, "num_input_tokens_seen": 1712848896, "step": 4356 }, { "epoch": 0.4945525664055442, "grad_norm": 0.6483719348907471, "learning_rate": 2.6602219835302543e-05, "loss": 2.0301, "num_input_tokens_seen": 1715208192, "step": 4362 }, { "epoch": 0.49523283128368106, "grad_norm": 0.7183097004890442, "learning_rate": 2.656641604010025e-05, "loss": 2.0372, "num_input_tokens_seen": 1717567488, "step": 4368 }, { "epoch": 0.495913096161818, "grad_norm": 0.5163341760635376, "learning_rate": 2.6530612244897963e-05, "loss": 2.0536, "num_input_tokens_seen": 1719926784, "step": 4374 }, { "epoch": 0.49659336103995494, "grad_norm": 0.5748982429504395, "learning_rate": 2.6494808449695668e-05, "loss": 2.0573, "num_input_tokens_seen": 1722286080, "step": 4380 }, { "epoch": 0.4972736259180919, "grad_norm": 0.5292128324508667, "learning_rate": 2.645900465449338e-05, "loss": 2.0675, "num_input_tokens_seen": 1724645376, "step": 4386 }, { "epoch": 0.49795389079622876, "grad_norm": 0.5424016714096069, "learning_rate": 2.6423200859291087e-05, "loss": 2.0624, "num_input_tokens_seen": 1727004672, "step": 4392 }, { "epoch": 0.4986341556743657, "grad_norm": 0.5220805406570435, "learning_rate": 2.6387397064088792e-05, "loss": 2.0199, "num_input_tokens_seen": 1729363968, "step": 4398 }, { "epoch": 0.49886091063374466, "eval_accuracy": 0.5820293040293041, "eval_loss": 2.0314505100250244, "eval_runtime": 129.9151, "eval_samples_per_second": 3.079, "eval_steps_per_second": 1.031, "num_input_tokens_seen": 1730150400, "step": 4400 }, { "epoch": 0.49931442055250264, "grad_norm": 0.5268684029579163, "learning_rate": 2.6351593268886504e-05, "loss": 2.0301, "num_input_tokens_seen": 1731723264, "step": 4404 }, { "epoch": 0.4999946854306396, "grad_norm": 0.53367018699646, "learning_rate": 2.6315789473684212e-05, "loss": 2.0371, "num_input_tokens_seen": 1734082560, "step": 4410 }, { "epoch": 0.5006749503087765, "grad_norm": 0.6249188184738159, "learning_rate": 2.6279985678481924e-05, "loss": 2.0451, "num_input_tokens_seen": 1736441856, "step": 4416 }, { "epoch": 0.5013552151869134, "grad_norm": 0.5434116125106812, "learning_rate": 2.6244181883279628e-05, "loss": 2.0524, "num_input_tokens_seen": 1738801152, "step": 4422 }, { "epoch": 0.5020354800650503, "grad_norm": 0.6205160021781921, "learning_rate": 2.620837808807734e-05, "loss": 2.0484, "num_input_tokens_seen": 1741160448, "step": 4428 }, { "epoch": 0.5027157449431873, "grad_norm": 0.6665350198745728, "learning_rate": 2.6172574292875045e-05, "loss": 2.0277, "num_input_tokens_seen": 1743519744, "step": 4434 }, { "epoch": 0.5033960098213242, "grad_norm": 0.6074947714805603, "learning_rate": 2.6136770497672753e-05, "loss": 1.9797, "num_input_tokens_seen": 1745879040, "step": 4440 }, { "epoch": 0.5040762746994611, "grad_norm": 0.801179051399231, "learning_rate": 2.6100966702470464e-05, "loss": 2.0997, "num_input_tokens_seen": 1748238336, "step": 4446 }, { "epoch": 0.504756539577598, "grad_norm": 0.6840958595275879, "learning_rate": 2.606516290726817e-05, "loss": 2.0205, "num_input_tokens_seen": 1750597632, "step": 4452 }, { "epoch": 0.5054368044557349, "grad_norm": 0.6576579809188843, "learning_rate": 2.602935911206588e-05, "loss": 2.0203, "num_input_tokens_seen": 1752956928, "step": 4458 }, { "epoch": 0.5061170693338719, "grad_norm": 0.6868447065353394, "learning_rate": 2.599355531686359e-05, "loss": 1.9909, "num_input_tokens_seen": 1755316224, "step": 4464 }, { "epoch": 0.5067973342120088, "grad_norm": 0.5453672409057617, "learning_rate": 2.59577515216613e-05, "loss": 2.006, "num_input_tokens_seen": 1757675520, "step": 4470 }, { "epoch": 0.5074775990901457, "grad_norm": 0.6682925224304199, "learning_rate": 2.5921947726459005e-05, "loss": 2.0254, "num_input_tokens_seen": 1760034816, "step": 4476 }, { "epoch": 0.5081578639682827, "grad_norm": 0.5425339937210083, "learning_rate": 2.5886143931256713e-05, "loss": 2.0437, "num_input_tokens_seen": 1762394112, "step": 4482 }, { "epoch": 0.5088381288464195, "grad_norm": 0.5697811245918274, "learning_rate": 2.5850340136054425e-05, "loss": 2.0491, "num_input_tokens_seen": 1764753408, "step": 4488 }, { "epoch": 0.5095183937245565, "grad_norm": 0.5228165984153748, "learning_rate": 2.581453634085213e-05, "loss": 2.0265, "num_input_tokens_seen": 1767112704, "step": 4494 }, { "epoch": 0.5101986586026934, "grad_norm": 0.5687726736068726, "learning_rate": 2.577873254564984e-05, "loss": 2.0485, "num_input_tokens_seen": 1769472000, "step": 4500 }, { "epoch": 0.5108789234808303, "grad_norm": 0.5901363492012024, "learning_rate": 2.574292875044755e-05, "loss": 2.0407, "num_input_tokens_seen": 1771831296, "step": 4506 }, { "epoch": 0.5115591883589673, "grad_norm": 0.585297703742981, "learning_rate": 2.570712495524526e-05, "loss": 2.0564, "num_input_tokens_seen": 1774190592, "step": 4512 }, { "epoch": 0.5122394532371042, "grad_norm": 0.5718687772750854, "learning_rate": 2.5671321160042966e-05, "loss": 2.0269, "num_input_tokens_seen": 1776549888, "step": 4518 }, { "epoch": 0.5129197181152412, "grad_norm": 0.5592496991157532, "learning_rate": 2.5635517364840674e-05, "loss": 2.038, "num_input_tokens_seen": 1778909184, "step": 4524 }, { "epoch": 0.5135999829933781, "grad_norm": 0.5649601817131042, "learning_rate": 2.5599713569638385e-05, "loss": 2.0191, "num_input_tokens_seen": 1781268480, "step": 4530 }, { "epoch": 0.5142802478715149, "grad_norm": 0.5326777696609497, "learning_rate": 2.556390977443609e-05, "loss": 2.0632, "num_input_tokens_seen": 1783627776, "step": 4536 }, { "epoch": 0.5149605127496519, "grad_norm": 0.6131651401519775, "learning_rate": 2.55281059792338e-05, "loss": 2.0646, "num_input_tokens_seen": 1785987072, "step": 4542 }, { "epoch": 0.5156407776277888, "grad_norm": 0.6253523230552673, "learning_rate": 2.5492302184031506e-05, "loss": 2.023, "num_input_tokens_seen": 1788346368, "step": 4548 }, { "epoch": 0.5163210425059257, "grad_norm": 0.5869733095169067, "learning_rate": 2.545649838882922e-05, "loss": 2.0196, "num_input_tokens_seen": 1790705664, "step": 4554 }, { "epoch": 0.5170013073840627, "grad_norm": 0.6161871552467346, "learning_rate": 2.5420694593626926e-05, "loss": 2.0352, "num_input_tokens_seen": 1793064960, "step": 4560 }, { "epoch": 0.5176815722621996, "grad_norm": 0.5765447020530701, "learning_rate": 2.538489079842463e-05, "loss": 2.0604, "num_input_tokens_seen": 1795424256, "step": 4566 }, { "epoch": 0.5183618371403366, "grad_norm": 0.5514522194862366, "learning_rate": 2.5349087003222342e-05, "loss": 2.0117, "num_input_tokens_seen": 1797783552, "step": 4572 }, { "epoch": 0.5190421020184735, "grad_norm": 0.6634048223495483, "learning_rate": 2.531328320802005e-05, "loss": 2.0526, "num_input_tokens_seen": 1800142848, "step": 4578 }, { "epoch": 0.5197223668966103, "grad_norm": 0.5515666604042053, "learning_rate": 2.5277479412817762e-05, "loss": 2.0145, "num_input_tokens_seen": 1802502144, "step": 4584 }, { "epoch": 0.5204026317747473, "grad_norm": 0.5209513902664185, "learning_rate": 2.5241675617615467e-05, "loss": 2.0392, "num_input_tokens_seen": 1804861440, "step": 4590 }, { "epoch": 0.5210828966528842, "grad_norm": 0.586448609828949, "learning_rate": 2.520587182241318e-05, "loss": 2.0467, "num_input_tokens_seen": 1807220736, "step": 4596 }, { "epoch": 0.5217631615310211, "grad_norm": 0.6192159652709961, "learning_rate": 2.5170068027210887e-05, "loss": 2.0113, "num_input_tokens_seen": 1809580032, "step": 4602 }, { "epoch": 0.5224434264091581, "grad_norm": 0.5666912794113159, "learning_rate": 2.513426423200859e-05, "loss": 2.0656, "num_input_tokens_seen": 1811939328, "step": 4608 }, { "epoch": 0.523123691287295, "grad_norm": 0.553439199924469, "learning_rate": 2.5098460436806303e-05, "loss": 2.0829, "num_input_tokens_seen": 1814298624, "step": 4614 }, { "epoch": 0.523803956165432, "grad_norm": 0.5798554420471191, "learning_rate": 2.506265664160401e-05, "loss": 2.0018, "num_input_tokens_seen": 1816657920, "step": 4620 }, { "epoch": 0.5244842210435688, "grad_norm": 0.5392002463340759, "learning_rate": 2.5026852846401723e-05, "loss": 2.0356, "num_input_tokens_seen": 1819017216, "step": 4626 }, { "epoch": 0.5251644859217057, "grad_norm": 0.5326321721076965, "learning_rate": 2.4991049051199427e-05, "loss": 2.0293, "num_input_tokens_seen": 1821376512, "step": 4632 }, { "epoch": 0.5258447507998427, "grad_norm": 0.5811095237731934, "learning_rate": 2.4955245255997136e-05, "loss": 2.0293, "num_input_tokens_seen": 1823735808, "step": 4638 }, { "epoch": 0.5265250156779796, "grad_norm": 0.6148865222930908, "learning_rate": 2.4919441460794847e-05, "loss": 2.0708, "num_input_tokens_seen": 1826095104, "step": 4644 }, { "epoch": 0.5272052805561165, "grad_norm": 0.552721381187439, "learning_rate": 2.4883637665592555e-05, "loss": 2.0677, "num_input_tokens_seen": 1828454400, "step": 4650 }, { "epoch": 0.5278855454342535, "grad_norm": 0.7011052966117859, "learning_rate": 2.4847833870390263e-05, "loss": 2.0427, "num_input_tokens_seen": 1830813696, "step": 4656 }, { "epoch": 0.5285658103123904, "grad_norm": 0.698103129863739, "learning_rate": 2.4812030075187968e-05, "loss": 2.0446, "num_input_tokens_seen": 1833172992, "step": 4662 }, { "epoch": 0.5292460751905274, "grad_norm": 0.6698234677314758, "learning_rate": 2.477622627998568e-05, "loss": 1.989, "num_input_tokens_seen": 1835532288, "step": 4668 }, { "epoch": 0.5299263400686642, "grad_norm": 0.686160683631897, "learning_rate": 2.4740422484783388e-05, "loss": 2.0177, "num_input_tokens_seen": 1837891584, "step": 4674 }, { "epoch": 0.5306066049468011, "grad_norm": 0.7087785601615906, "learning_rate": 2.4704618689581096e-05, "loss": 1.9913, "num_input_tokens_seen": 1840250880, "step": 4680 }, { "epoch": 0.5312868698249381, "grad_norm": 0.7636250257492065, "learning_rate": 2.4668814894378804e-05, "loss": 2.0415, "num_input_tokens_seen": 1842610176, "step": 4686 }, { "epoch": 0.531967134703075, "grad_norm": 0.6679571270942688, "learning_rate": 2.4633011099176516e-05, "loss": 2.0405, "num_input_tokens_seen": 1844969472, "step": 4692 }, { "epoch": 0.532647399581212, "grad_norm": 0.6184023022651672, "learning_rate": 2.4597207303974224e-05, "loss": 2.0398, "num_input_tokens_seen": 1847328768, "step": 4698 }, { "epoch": 0.5333276644593489, "grad_norm": 0.6215579509735107, "learning_rate": 2.456140350877193e-05, "loss": 2.0421, "num_input_tokens_seen": 1849688064, "step": 4704 }, { "epoch": 0.5340079293374858, "grad_norm": 0.6236794590950012, "learning_rate": 2.452559971356964e-05, "loss": 2.0267, "num_input_tokens_seen": 1852047360, "step": 4710 }, { "epoch": 0.5346881942156227, "grad_norm": 0.5402464270591736, "learning_rate": 2.448979591836735e-05, "loss": 2.0491, "num_input_tokens_seen": 1854406656, "step": 4716 }, { "epoch": 0.5353684590937596, "grad_norm": 0.6620278358459473, "learning_rate": 2.4453992123165057e-05, "loss": 2.0231, "num_input_tokens_seen": 1856765952, "step": 4722 }, { "epoch": 0.5360487239718965, "grad_norm": 0.6001939177513123, "learning_rate": 2.4418188327962765e-05, "loss": 2.07, "num_input_tokens_seen": 1859125248, "step": 4728 }, { "epoch": 0.5367289888500335, "grad_norm": 0.5543425679206848, "learning_rate": 2.4382384532760473e-05, "loss": 2.0106, "num_input_tokens_seen": 1861484544, "step": 4734 }, { "epoch": 0.5374092537281704, "grad_norm": 0.4775249660015106, "learning_rate": 2.4346580737558184e-05, "loss": 2.0754, "num_input_tokens_seen": 1863843840, "step": 4740 }, { "epoch": 0.5380895186063074, "grad_norm": 0.5671461224555969, "learning_rate": 2.431077694235589e-05, "loss": 2.0448, "num_input_tokens_seen": 1866203136, "step": 4746 }, { "epoch": 0.5387697834844443, "grad_norm": 0.6002800464630127, "learning_rate": 2.4274973147153597e-05, "loss": 2.0081, "num_input_tokens_seen": 1868562432, "step": 4752 }, { "epoch": 0.5394500483625811, "grad_norm": 0.5340938568115234, "learning_rate": 2.423916935195131e-05, "loss": 2.0068, "num_input_tokens_seen": 1870921728, "step": 4758 }, { "epoch": 0.5401303132407181, "grad_norm": 0.550006628036499, "learning_rate": 2.4203365556749017e-05, "loss": 2.0779, "num_input_tokens_seen": 1873281024, "step": 4764 }, { "epoch": 0.540810578118855, "grad_norm": 0.6014347672462463, "learning_rate": 2.4167561761546725e-05, "loss": 2.0408, "num_input_tokens_seen": 1875640320, "step": 4770 }, { "epoch": 0.5414908429969919, "grad_norm": 0.6230180859565735, "learning_rate": 2.4131757966344433e-05, "loss": 2.0508, "num_input_tokens_seen": 1877999616, "step": 4776 }, { "epoch": 0.5421711078751289, "grad_norm": 0.570754885673523, "learning_rate": 2.4095954171142145e-05, "loss": 2.0594, "num_input_tokens_seen": 1880358912, "step": 4782 }, { "epoch": 0.5428513727532658, "grad_norm": 0.5309892892837524, "learning_rate": 2.406015037593985e-05, "loss": 2.0985, "num_input_tokens_seen": 1882718208, "step": 4788 }, { "epoch": 0.5435316376314028, "grad_norm": 0.6809681057929993, "learning_rate": 2.4024346580737558e-05, "loss": 2.0498, "num_input_tokens_seen": 1885077504, "step": 4794 }, { "epoch": 0.5442119025095397, "grad_norm": 0.5176597237586975, "learning_rate": 2.3988542785535266e-05, "loss": 2.0361, "num_input_tokens_seen": 1887436800, "step": 4800 }, { "epoch": 0.5442119025095397, "eval_accuracy": 0.582943833943834, "eval_loss": 2.0261168479919434, "eval_runtime": 128.4528, "eval_samples_per_second": 3.114, "eval_steps_per_second": 1.043, "num_input_tokens_seen": 1887436800, "step": 4800 }, { "epoch": 0.5448921673876765, "grad_norm": 0.6304104924201965, "learning_rate": 2.3952738990332978e-05, "loss": 2.051, "num_input_tokens_seen": 1889796096, "step": 4806 }, { "epoch": 0.5455724322658135, "grad_norm": 0.6736240983009338, "learning_rate": 2.3916935195130686e-05, "loss": 2.0604, "num_input_tokens_seen": 1892155392, "step": 4812 }, { "epoch": 0.5462526971439504, "grad_norm": 0.6046845316886902, "learning_rate": 2.3881131399928394e-05, "loss": 2.0382, "num_input_tokens_seen": 1894514688, "step": 4818 }, { "epoch": 0.5469329620220873, "grad_norm": 0.6192537546157837, "learning_rate": 2.3845327604726102e-05, "loss": 2.017, "num_input_tokens_seen": 1896873984, "step": 4824 }, { "epoch": 0.5476132269002243, "grad_norm": 0.6411442160606384, "learning_rate": 2.380952380952381e-05, "loss": 2.0302, "num_input_tokens_seen": 1899233280, "step": 4830 }, { "epoch": 0.5482934917783612, "grad_norm": 0.5721175670623779, "learning_rate": 2.377372001432152e-05, "loss": 2.0338, "num_input_tokens_seen": 1901592576, "step": 4836 }, { "epoch": 0.5489737566564982, "grad_norm": 0.5743176937103271, "learning_rate": 2.3737916219119226e-05, "loss": 2.0426, "num_input_tokens_seen": 1903951872, "step": 4842 }, { "epoch": 0.549654021534635, "grad_norm": 0.5680631995201111, "learning_rate": 2.3702112423916935e-05, "loss": 1.9743, "num_input_tokens_seen": 1906311168, "step": 4848 }, { "epoch": 0.5503342864127719, "grad_norm": 0.5353610515594482, "learning_rate": 2.3666308628714646e-05, "loss": 2.0359, "num_input_tokens_seen": 1908670464, "step": 4854 }, { "epoch": 0.5510145512909089, "grad_norm": 0.5486804842948914, "learning_rate": 2.3630504833512354e-05, "loss": 2.0598, "num_input_tokens_seen": 1911029760, "step": 4860 }, { "epoch": 0.5516948161690458, "grad_norm": 0.5187994241714478, "learning_rate": 2.3594701038310063e-05, "loss": 2.0122, "num_input_tokens_seen": 1913389056, "step": 4866 }, { "epoch": 0.5523750810471827, "grad_norm": 0.5575273036956787, "learning_rate": 2.355889724310777e-05, "loss": 1.9224, "num_input_tokens_seen": 1915748352, "step": 4872 }, { "epoch": 0.5530553459253197, "grad_norm": 0.5445749163627625, "learning_rate": 2.352309344790548e-05, "loss": 2.0599, "num_input_tokens_seen": 1918107648, "step": 4878 }, { "epoch": 0.5537356108034566, "grad_norm": 0.555401086807251, "learning_rate": 2.3487289652703187e-05, "loss": 1.9883, "num_input_tokens_seen": 1920466944, "step": 4884 }, { "epoch": 0.5544158756815936, "grad_norm": 0.6415968537330627, "learning_rate": 2.3451485857500895e-05, "loss": 2.0529, "num_input_tokens_seen": 1922826240, "step": 4890 }, { "epoch": 0.5550961405597304, "grad_norm": 0.5108083486557007, "learning_rate": 2.3415682062298607e-05, "loss": 2.0445, "num_input_tokens_seen": 1925185536, "step": 4896 }, { "epoch": 0.5557764054378673, "grad_norm": 0.5858399868011475, "learning_rate": 2.3379878267096315e-05, "loss": 2.0457, "num_input_tokens_seen": 1927544832, "step": 4902 }, { "epoch": 0.5564566703160043, "grad_norm": 0.5217518210411072, "learning_rate": 2.3344074471894023e-05, "loss": 2.091, "num_input_tokens_seen": 1929904128, "step": 4908 }, { "epoch": 0.5571369351941412, "grad_norm": 0.596930980682373, "learning_rate": 2.3308270676691728e-05, "loss": 1.9999, "num_input_tokens_seen": 1932263424, "step": 4914 }, { "epoch": 0.5578172000722782, "grad_norm": 0.5927145481109619, "learning_rate": 2.327246688148944e-05, "loss": 2.0268, "num_input_tokens_seen": 1934622720, "step": 4920 }, { "epoch": 0.5584974649504151, "grad_norm": 0.5516043305397034, "learning_rate": 2.3236663086287147e-05, "loss": 1.9878, "num_input_tokens_seen": 1936982016, "step": 4926 }, { "epoch": 0.559177729828552, "grad_norm": 0.627019464969635, "learning_rate": 2.3200859291084856e-05, "loss": 2.0346, "num_input_tokens_seen": 1939341312, "step": 4932 }, { "epoch": 0.559857994706689, "grad_norm": 0.5447899699211121, "learning_rate": 2.3165055495882564e-05, "loss": 2.0513, "num_input_tokens_seen": 1941700608, "step": 4938 }, { "epoch": 0.5605382595848258, "grad_norm": 0.5802628397941589, "learning_rate": 2.3129251700680275e-05, "loss": 2.0973, "num_input_tokens_seen": 1944059904, "step": 4944 }, { "epoch": 0.5612185244629627, "grad_norm": 0.5924973487854004, "learning_rate": 2.3093447905477984e-05, "loss": 2.0682, "num_input_tokens_seen": 1946419200, "step": 4950 }, { "epoch": 0.5618987893410997, "grad_norm": 0.7050002813339233, "learning_rate": 2.3057644110275688e-05, "loss": 2.0724, "num_input_tokens_seen": 1948778496, "step": 4956 }, { "epoch": 0.5625790542192366, "grad_norm": 0.5495327711105347, "learning_rate": 2.3021840315073396e-05, "loss": 1.9644, "num_input_tokens_seen": 1951137792, "step": 4962 }, { "epoch": 0.5632593190973736, "grad_norm": 0.5360766649246216, "learning_rate": 2.2986036519871108e-05, "loss": 1.9898, "num_input_tokens_seen": 1953497088, "step": 4968 }, { "epoch": 0.5639395839755105, "grad_norm": 0.5336543321609497, "learning_rate": 2.2950232724668816e-05, "loss": 2.0032, "num_input_tokens_seen": 1955856384, "step": 4974 }, { "epoch": 0.5646198488536474, "grad_norm": 0.6477164626121521, "learning_rate": 2.2914428929466524e-05, "loss": 2.0446, "num_input_tokens_seen": 1958215680, "step": 4980 }, { "epoch": 0.5653001137317843, "grad_norm": 0.6914204955101013, "learning_rate": 2.2878625134264232e-05, "loss": 2.0031, "num_input_tokens_seen": 1960574976, "step": 4986 }, { "epoch": 0.5659803786099212, "grad_norm": 0.6375141143798828, "learning_rate": 2.2842821339061944e-05, "loss": 2.0626, "num_input_tokens_seen": 1962934272, "step": 4992 }, { "epoch": 0.5666606434880581, "grad_norm": 0.6484731435775757, "learning_rate": 2.280701754385965e-05, "loss": 1.9985, "num_input_tokens_seen": 1965293568, "step": 4998 }, { "epoch": 0.5673409083661951, "grad_norm": 0.6331252455711365, "learning_rate": 2.2771213748657357e-05, "loss": 2.0994, "num_input_tokens_seen": 1967652864, "step": 5004 }, { "epoch": 0.568021173244332, "grad_norm": 0.5034027695655823, "learning_rate": 2.273540995345507e-05, "loss": 2.0355, "num_input_tokens_seen": 1970012160, "step": 5010 }, { "epoch": 0.568701438122469, "grad_norm": 0.5032246708869934, "learning_rate": 2.2699606158252777e-05, "loss": 2.0454, "num_input_tokens_seen": 1972371456, "step": 5016 }, { "epoch": 0.5693817030006059, "grad_norm": 0.6073436737060547, "learning_rate": 2.2663802363050485e-05, "loss": 2.0395, "num_input_tokens_seen": 1974730752, "step": 5022 }, { "epoch": 0.5700619678787427, "grad_norm": 0.5456060767173767, "learning_rate": 2.2627998567848193e-05, "loss": 2.0063, "num_input_tokens_seen": 1977090048, "step": 5028 }, { "epoch": 0.5707422327568797, "grad_norm": 0.505669891834259, "learning_rate": 2.25921947726459e-05, "loss": 1.9756, "num_input_tokens_seen": 1979449344, "step": 5034 }, { "epoch": 0.5714224976350166, "grad_norm": 0.4892086088657379, "learning_rate": 2.255639097744361e-05, "loss": 2.0305, "num_input_tokens_seen": 1981808640, "step": 5040 }, { "epoch": 0.5721027625131535, "grad_norm": 0.5138855576515198, "learning_rate": 2.2520587182241317e-05, "loss": 2.0669, "num_input_tokens_seen": 1984167936, "step": 5046 }, { "epoch": 0.5727830273912905, "grad_norm": 0.5779445767402649, "learning_rate": 2.2484783387039026e-05, "loss": 2.0237, "num_input_tokens_seen": 1986527232, "step": 5052 }, { "epoch": 0.5734632922694274, "grad_norm": 0.6262017488479614, "learning_rate": 2.2448979591836737e-05, "loss": 2.0587, "num_input_tokens_seen": 1988886528, "step": 5058 }, { "epoch": 0.5741435571475644, "grad_norm": 0.5465866923332214, "learning_rate": 2.2413175796634445e-05, "loss": 2.0295, "num_input_tokens_seen": 1991245824, "step": 5064 }, { "epoch": 0.5748238220257013, "grad_norm": 0.653732419013977, "learning_rate": 2.2377372001432153e-05, "loss": 2.0325, "num_input_tokens_seen": 1993605120, "step": 5070 }, { "epoch": 0.5755040869038381, "grad_norm": 0.5579174160957336, "learning_rate": 2.234156820622986e-05, "loss": 2.0289, "num_input_tokens_seen": 1995964416, "step": 5076 }, { "epoch": 0.5761843517819751, "grad_norm": 0.5739534497261047, "learning_rate": 2.230576441102757e-05, "loss": 2.0592, "num_input_tokens_seen": 1998323712, "step": 5082 }, { "epoch": 0.576864616660112, "grad_norm": 0.5563738346099854, "learning_rate": 2.2269960615825278e-05, "loss": 2.0624, "num_input_tokens_seen": 2000683008, "step": 5088 }, { "epoch": 0.577544881538249, "grad_norm": 0.6019315719604492, "learning_rate": 2.2234156820622986e-05, "loss": 2.0466, "num_input_tokens_seen": 2003042304, "step": 5094 }, { "epoch": 0.5782251464163859, "grad_norm": 0.7558180689811707, "learning_rate": 2.2198353025420694e-05, "loss": 2.0494, "num_input_tokens_seen": 2005401600, "step": 5100 }, { "epoch": 0.5789054112945228, "grad_norm": 0.7054827213287354, "learning_rate": 2.2162549230218406e-05, "loss": 2.0022, "num_input_tokens_seen": 2007760896, "step": 5106 }, { "epoch": 0.5795856761726598, "grad_norm": 0.5728718042373657, "learning_rate": 2.2126745435016114e-05, "loss": 2.0443, "num_input_tokens_seen": 2010120192, "step": 5112 }, { "epoch": 0.5802659410507967, "grad_norm": 0.6196462512016296, "learning_rate": 2.2090941639813822e-05, "loss": 2.0502, "num_input_tokens_seen": 2012479488, "step": 5118 }, { "epoch": 0.5809462059289335, "grad_norm": 0.5134831070899963, "learning_rate": 2.205513784461153e-05, "loss": 1.9629, "num_input_tokens_seen": 2014838784, "step": 5124 }, { "epoch": 0.5816264708070705, "grad_norm": 0.5899059176445007, "learning_rate": 2.201933404940924e-05, "loss": 2.0479, "num_input_tokens_seen": 2017198080, "step": 5130 }, { "epoch": 0.5823067356852074, "grad_norm": 0.5743027329444885, "learning_rate": 2.1983530254206947e-05, "loss": 2.0633, "num_input_tokens_seen": 2019557376, "step": 5136 }, { "epoch": 0.5829870005633444, "grad_norm": 0.6599460244178772, "learning_rate": 2.1947726459004655e-05, "loss": 2.0429, "num_input_tokens_seen": 2021916672, "step": 5142 }, { "epoch": 0.5836672654414813, "grad_norm": 0.5353802442550659, "learning_rate": 2.1911922663802363e-05, "loss": 2.0422, "num_input_tokens_seen": 2024275968, "step": 5148 }, { "epoch": 0.5843475303196182, "grad_norm": 0.6173189282417297, "learning_rate": 2.1876118868600074e-05, "loss": 2.0345, "num_input_tokens_seen": 2026635264, "step": 5154 }, { "epoch": 0.5850277951977552, "grad_norm": 0.5757061839103699, "learning_rate": 2.1840315073397783e-05, "loss": 2.061, "num_input_tokens_seen": 2028994560, "step": 5160 }, { "epoch": 0.585708060075892, "grad_norm": 0.537259042263031, "learning_rate": 2.1804511278195487e-05, "loss": 2.067, "num_input_tokens_seen": 2031353856, "step": 5166 }, { "epoch": 0.5863883249540289, "grad_norm": 0.6066441535949707, "learning_rate": 2.17687074829932e-05, "loss": 2.0059, "num_input_tokens_seen": 2033713152, "step": 5172 }, { "epoch": 0.5870685898321659, "grad_norm": 0.5652830600738525, "learning_rate": 2.1732903687790907e-05, "loss": 2.0117, "num_input_tokens_seen": 2036072448, "step": 5178 }, { "epoch": 0.5877488547103028, "grad_norm": 0.5732890367507935, "learning_rate": 2.1697099892588615e-05, "loss": 2.0529, "num_input_tokens_seen": 2038431744, "step": 5184 }, { "epoch": 0.5884291195884398, "grad_norm": 0.5537346601486206, "learning_rate": 2.1661296097386323e-05, "loss": 2.0331, "num_input_tokens_seen": 2040791040, "step": 5190 }, { "epoch": 0.5891093844665767, "grad_norm": 0.5528678894042969, "learning_rate": 2.1625492302184035e-05, "loss": 2.057, "num_input_tokens_seen": 2043150336, "step": 5196 }, { "epoch": 0.5895628943853346, "eval_accuracy": 0.5838162393162393, "eval_loss": 2.0207207202911377, "eval_runtime": 128.1654, "eval_samples_per_second": 3.121, "eval_steps_per_second": 1.046, "num_input_tokens_seen": 2044723200, "step": 5200 }, { "epoch": 0.5897896493447136, "grad_norm": 0.6346768140792847, "learning_rate": 2.1589688506981743e-05, "loss": 2.0408, "num_input_tokens_seen": 2045509632, "step": 5202 }, { "epoch": 0.5904699142228506, "grad_norm": 0.64287930727005, "learning_rate": 2.1553884711779448e-05, "loss": 1.9892, "num_input_tokens_seen": 2047868928, "step": 5208 }, { "epoch": 0.5911501791009874, "grad_norm": 0.5813894271850586, "learning_rate": 2.1518080916577156e-05, "loss": 2.041, "num_input_tokens_seen": 2050228224, "step": 5214 }, { "epoch": 0.5918304439791243, "grad_norm": 0.5855168104171753, "learning_rate": 2.1482277121374868e-05, "loss": 1.9966, "num_input_tokens_seen": 2052587520, "step": 5220 }, { "epoch": 0.5925107088572613, "grad_norm": 0.5594427585601807, "learning_rate": 2.1446473326172576e-05, "loss": 2.0597, "num_input_tokens_seen": 2054946816, "step": 5226 }, { "epoch": 0.5931909737353982, "grad_norm": 0.5417963266372681, "learning_rate": 2.1410669530970284e-05, "loss": 2.0369, "num_input_tokens_seen": 2057306112, "step": 5232 }, { "epoch": 0.5938712386135352, "grad_norm": 0.5939909815788269, "learning_rate": 2.1374865735767992e-05, "loss": 2.0407, "num_input_tokens_seen": 2059665408, "step": 5238 }, { "epoch": 0.5945515034916721, "grad_norm": 0.5187436938285828, "learning_rate": 2.1339061940565704e-05, "loss": 2.0203, "num_input_tokens_seen": 2062024704, "step": 5244 }, { "epoch": 0.595231768369809, "grad_norm": 0.5559435486793518, "learning_rate": 2.130325814536341e-05, "loss": 1.9577, "num_input_tokens_seen": 2064384000, "step": 5250 }, { "epoch": 0.595912033247946, "grad_norm": 0.5728279948234558, "learning_rate": 2.1267454350161117e-05, "loss": 1.9924, "num_input_tokens_seen": 2066743296, "step": 5256 }, { "epoch": 0.5965922981260828, "grad_norm": 0.5480873584747314, "learning_rate": 2.1231650554958825e-05, "loss": 2.0437, "num_input_tokens_seen": 2069102592, "step": 5262 }, { "epoch": 0.5972725630042197, "grad_norm": 0.5689738392829895, "learning_rate": 2.1195846759756536e-05, "loss": 2.0233, "num_input_tokens_seen": 2071461888, "step": 5268 }, { "epoch": 0.5979528278823567, "grad_norm": 0.5396390557289124, "learning_rate": 2.1160042964554244e-05, "loss": 2.0224, "num_input_tokens_seen": 2073821184, "step": 5274 }, { "epoch": 0.5986330927604936, "grad_norm": 0.6151924133300781, "learning_rate": 2.1124239169351953e-05, "loss": 2.0475, "num_input_tokens_seen": 2076180480, "step": 5280 }, { "epoch": 0.5993133576386306, "grad_norm": 0.5532135367393494, "learning_rate": 2.108843537414966e-05, "loss": 2.0383, "num_input_tokens_seen": 2078539776, "step": 5286 }, { "epoch": 0.5999936225167675, "grad_norm": 0.5925424695014954, "learning_rate": 2.105263157894737e-05, "loss": 2.001, "num_input_tokens_seen": 2080899072, "step": 5292 }, { "epoch": 0.6006738873949043, "grad_norm": 0.5219939947128296, "learning_rate": 2.1016827783745077e-05, "loss": 2.0261, "num_input_tokens_seen": 2083258368, "step": 5298 }, { "epoch": 0.6013541522730413, "grad_norm": 0.5546817779541016, "learning_rate": 2.0981023988542785e-05, "loss": 2.0535, "num_input_tokens_seen": 2085617664, "step": 5304 }, { "epoch": 0.6020344171511782, "grad_norm": 0.5580465793609619, "learning_rate": 2.0945220193340497e-05, "loss": 2.0498, "num_input_tokens_seen": 2087976960, "step": 5310 }, { "epoch": 0.6027146820293152, "grad_norm": 0.5796703696250916, "learning_rate": 2.0909416398138205e-05, "loss": 2.0512, "num_input_tokens_seen": 2090336256, "step": 5316 }, { "epoch": 0.6033949469074521, "grad_norm": 0.6007897257804871, "learning_rate": 2.0873612602935913e-05, "loss": 2.0236, "num_input_tokens_seen": 2092695552, "step": 5322 }, { "epoch": 0.604075211785589, "grad_norm": 0.5929319262504578, "learning_rate": 2.083780880773362e-05, "loss": 2.0342, "num_input_tokens_seen": 2095054848, "step": 5328 }, { "epoch": 0.604755476663726, "grad_norm": 0.6018472909927368, "learning_rate": 2.080200501253133e-05, "loss": 1.9576, "num_input_tokens_seen": 2097414144, "step": 5334 }, { "epoch": 0.6054357415418629, "grad_norm": 0.546468198299408, "learning_rate": 2.0766201217329038e-05, "loss": 2.0191, "num_input_tokens_seen": 2099773440, "step": 5340 }, { "epoch": 0.6061160064199997, "grad_norm": 0.5679252743721008, "learning_rate": 2.0730397422126746e-05, "loss": 2.0272, "num_input_tokens_seen": 2102132736, "step": 5346 }, { "epoch": 0.6067962712981367, "grad_norm": 0.5521571040153503, "learning_rate": 2.0694593626924454e-05, "loss": 2.0423, "num_input_tokens_seen": 2104492032, "step": 5352 }, { "epoch": 0.6074765361762736, "grad_norm": 0.597798228263855, "learning_rate": 2.0658789831722165e-05, "loss": 2.0251, "num_input_tokens_seen": 2106851328, "step": 5358 }, { "epoch": 0.6081568010544106, "grad_norm": 0.557311475276947, "learning_rate": 2.0622986036519874e-05, "loss": 2.0391, "num_input_tokens_seen": 2109210624, "step": 5364 }, { "epoch": 0.6088370659325475, "grad_norm": 0.6349292397499084, "learning_rate": 2.0587182241317582e-05, "loss": 2.0279, "num_input_tokens_seen": 2111569920, "step": 5370 }, { "epoch": 0.6095173308106844, "grad_norm": 0.5559099912643433, "learning_rate": 2.0551378446115287e-05, "loss": 2.01, "num_input_tokens_seen": 2113929216, "step": 5376 }, { "epoch": 0.6101975956888214, "grad_norm": 0.6606104969978333, "learning_rate": 2.0515574650912998e-05, "loss": 2.058, "num_input_tokens_seen": 2116288512, "step": 5382 }, { "epoch": 0.6108778605669583, "grad_norm": 0.5794185996055603, "learning_rate": 2.0479770855710706e-05, "loss": 2.0456, "num_input_tokens_seen": 2118647808, "step": 5388 }, { "epoch": 0.6115581254450951, "grad_norm": 0.5368028879165649, "learning_rate": 2.0443967060508414e-05, "loss": 2.003, "num_input_tokens_seen": 2121007104, "step": 5394 }, { "epoch": 0.6122383903232321, "grad_norm": 0.6016758680343628, "learning_rate": 2.0408163265306123e-05, "loss": 2.0776, "num_input_tokens_seen": 2123366400, "step": 5400 }, { "epoch": 0.612918655201369, "grad_norm": 0.5522080659866333, "learning_rate": 2.0372359470103834e-05, "loss": 2.0433, "num_input_tokens_seen": 2125725696, "step": 5406 }, { "epoch": 0.613598920079506, "grad_norm": 0.622534990310669, "learning_rate": 2.0336555674901542e-05, "loss": 2.0225, "num_input_tokens_seen": 2128084992, "step": 5412 }, { "epoch": 0.6142791849576429, "grad_norm": 0.5342540144920349, "learning_rate": 2.0300751879699247e-05, "loss": 2.0334, "num_input_tokens_seen": 2130444288, "step": 5418 }, { "epoch": 0.6149594498357798, "grad_norm": 0.5722180604934692, "learning_rate": 2.026494808449696e-05, "loss": 2.0581, "num_input_tokens_seen": 2132803584, "step": 5424 }, { "epoch": 0.6156397147139168, "grad_norm": 0.6203873157501221, "learning_rate": 2.0229144289294667e-05, "loss": 2.0202, "num_input_tokens_seen": 2135162880, "step": 5430 }, { "epoch": 0.6163199795920536, "grad_norm": 0.5414242148399353, "learning_rate": 2.0193340494092375e-05, "loss": 2.0302, "num_input_tokens_seen": 2137522176, "step": 5436 }, { "epoch": 0.6170002444701905, "grad_norm": 0.6533239483833313, "learning_rate": 2.0157536698890083e-05, "loss": 2.046, "num_input_tokens_seen": 2139881472, "step": 5442 }, { "epoch": 0.6176805093483275, "grad_norm": 0.647675096988678, "learning_rate": 2.012173290368779e-05, "loss": 2.0384, "num_input_tokens_seen": 2142240768, "step": 5448 }, { "epoch": 0.6183607742264644, "grad_norm": 0.7028170228004456, "learning_rate": 2.0085929108485503e-05, "loss": 2.0393, "num_input_tokens_seen": 2144600064, "step": 5454 }, { "epoch": 0.6190410391046014, "grad_norm": 0.6044926047325134, "learning_rate": 2.0050125313283208e-05, "loss": 2.0437, "num_input_tokens_seen": 2146959360, "step": 5460 }, { "epoch": 0.6197213039827383, "grad_norm": 0.5385059714317322, "learning_rate": 2.0014321518080916e-05, "loss": 2.016, "num_input_tokens_seen": 2149318656, "step": 5466 }, { "epoch": 0.6204015688608752, "grad_norm": 0.6397769451141357, "learning_rate": 1.9978517722878627e-05, "loss": 2.0065, "num_input_tokens_seen": 2151677952, "step": 5472 }, { "epoch": 0.6210818337390122, "grad_norm": 0.6114969253540039, "learning_rate": 1.9942713927676335e-05, "loss": 2.0456, "num_input_tokens_seen": 2154037248, "step": 5478 }, { "epoch": 0.621762098617149, "grad_norm": 0.585610568523407, "learning_rate": 1.9906910132474044e-05, "loss": 2.0395, "num_input_tokens_seen": 2156396544, "step": 5484 }, { "epoch": 0.6224423634952859, "grad_norm": 0.5144538879394531, "learning_rate": 1.987110633727175e-05, "loss": 2.0605, "num_input_tokens_seen": 2158755840, "step": 5490 }, { "epoch": 0.6231226283734229, "grad_norm": 0.5797079205513, "learning_rate": 1.9835302542069463e-05, "loss": 2.0206, "num_input_tokens_seen": 2161115136, "step": 5496 }, { "epoch": 0.6238028932515598, "grad_norm": 0.49237367510795593, "learning_rate": 1.9799498746867168e-05, "loss": 1.9534, "num_input_tokens_seen": 2163474432, "step": 5502 }, { "epoch": 0.6244831581296968, "grad_norm": 0.5623591542243958, "learning_rate": 1.9763694951664876e-05, "loss": 2.0229, "num_input_tokens_seen": 2165833728, "step": 5508 }, { "epoch": 0.6251634230078337, "grad_norm": 0.5550060868263245, "learning_rate": 1.9727891156462584e-05, "loss": 2.0442, "num_input_tokens_seen": 2168193024, "step": 5514 }, { "epoch": 0.6258436878859706, "grad_norm": 0.5482363104820251, "learning_rate": 1.9692087361260296e-05, "loss": 2.0418, "num_input_tokens_seen": 2170552320, "step": 5520 }, { "epoch": 0.6265239527641076, "grad_norm": 0.612348198890686, "learning_rate": 1.9656283566058004e-05, "loss": 1.9943, "num_input_tokens_seen": 2172911616, "step": 5526 }, { "epoch": 0.6272042176422444, "grad_norm": 0.5311436057090759, "learning_rate": 1.9620479770855712e-05, "loss": 2.0369, "num_input_tokens_seen": 2175270912, "step": 5532 }, { "epoch": 0.6278844825203814, "grad_norm": 0.5449828505516052, "learning_rate": 1.958467597565342e-05, "loss": 2.0169, "num_input_tokens_seen": 2177630208, "step": 5538 }, { "epoch": 0.6285647473985183, "grad_norm": 0.5630024671554565, "learning_rate": 1.954887218045113e-05, "loss": 2.0441, "num_input_tokens_seen": 2179989504, "step": 5544 }, { "epoch": 0.6292450122766552, "grad_norm": 0.5129250288009644, "learning_rate": 1.9513068385248837e-05, "loss": 1.9904, "num_input_tokens_seen": 2182348800, "step": 5550 }, { "epoch": 0.6299252771547922, "grad_norm": 0.6263514161109924, "learning_rate": 1.9477264590046545e-05, "loss": 2.0084, "num_input_tokens_seen": 2184708096, "step": 5556 }, { "epoch": 0.6306055420329291, "grad_norm": 0.5702618360519409, "learning_rate": 1.9441460794844253e-05, "loss": 2.0487, "num_input_tokens_seen": 2187067392, "step": 5562 }, { "epoch": 0.631285806911066, "grad_norm": 0.6130457520484924, "learning_rate": 1.9405656999641965e-05, "loss": 2.047, "num_input_tokens_seen": 2189426688, "step": 5568 }, { "epoch": 0.6319660717892029, "grad_norm": 0.5357660055160522, "learning_rate": 1.9369853204439673e-05, "loss": 1.9605, "num_input_tokens_seen": 2191785984, "step": 5574 }, { "epoch": 0.6326463366673398, "grad_norm": 0.515099287033081, "learning_rate": 1.933404940923738e-05, "loss": 1.9558, "num_input_tokens_seen": 2194145280, "step": 5580 }, { "epoch": 0.6333266015454768, "grad_norm": 0.5823177695274353, "learning_rate": 1.929824561403509e-05, "loss": 2.0505, "num_input_tokens_seen": 2196504576, "step": 5586 }, { "epoch": 0.6340068664236137, "grad_norm": 0.5302935838699341, "learning_rate": 1.9262441818832797e-05, "loss": 2.0165, "num_input_tokens_seen": 2198863872, "step": 5592 }, { "epoch": 0.6346871313017506, "grad_norm": 0.6928517818450928, "learning_rate": 1.9226638023630505e-05, "loss": 2.0234, "num_input_tokens_seen": 2201223168, "step": 5598 }, { "epoch": 0.6349138862611295, "eval_accuracy": 0.5844572649572649, "eval_loss": 2.0162546634674072, "eval_runtime": 129.1053, "eval_samples_per_second": 3.098, "eval_steps_per_second": 1.038, "num_input_tokens_seen": 2202009600, "step": 5600 }, { "epoch": 0.6353673961798876, "grad_norm": 0.6656593084335327, "learning_rate": 1.9190834228428213e-05, "loss": 2.0049, "num_input_tokens_seen": 2203582464, "step": 5604 }, { "epoch": 0.6360476610580245, "grad_norm": 0.5688000917434692, "learning_rate": 1.9155030433225925e-05, "loss": 2.0513, "num_input_tokens_seen": 2205941760, "step": 5610 }, { "epoch": 0.6367279259361613, "grad_norm": 0.5396568179130554, "learning_rate": 1.9119226638023633e-05, "loss": 2.0563, "num_input_tokens_seen": 2208301056, "step": 5616 }, { "epoch": 0.6374081908142983, "grad_norm": 0.6275331974029541, "learning_rate": 1.908342284282134e-05, "loss": 2.0474, "num_input_tokens_seen": 2210660352, "step": 5622 }, { "epoch": 0.6380884556924352, "grad_norm": 0.53306645154953, "learning_rate": 1.9047619047619046e-05, "loss": 2.0287, "num_input_tokens_seen": 2213019648, "step": 5628 }, { "epoch": 0.6387687205705722, "grad_norm": 0.5248289704322815, "learning_rate": 1.9011815252416758e-05, "loss": 1.9989, "num_input_tokens_seen": 2215378944, "step": 5634 }, { "epoch": 0.6394489854487091, "grad_norm": 0.5750051140785217, "learning_rate": 1.8976011457214466e-05, "loss": 2.103, "num_input_tokens_seen": 2217738240, "step": 5640 }, { "epoch": 0.640129250326846, "grad_norm": 0.6214660406112671, "learning_rate": 1.8940207662012174e-05, "loss": 2.0476, "num_input_tokens_seen": 2220097536, "step": 5646 }, { "epoch": 0.640809515204983, "grad_norm": 0.6341087222099304, "learning_rate": 1.8904403866809882e-05, "loss": 2.0424, "num_input_tokens_seen": 2222456832, "step": 5652 }, { "epoch": 0.6414897800831199, "grad_norm": 0.5591261386871338, "learning_rate": 1.8868600071607594e-05, "loss": 2.0624, "num_input_tokens_seen": 2224816128, "step": 5658 }, { "epoch": 0.6421700449612567, "grad_norm": 0.538550615310669, "learning_rate": 1.8832796276405302e-05, "loss": 2.0207, "num_input_tokens_seen": 2227175424, "step": 5664 }, { "epoch": 0.6428503098393937, "grad_norm": 0.5393409729003906, "learning_rate": 1.8796992481203007e-05, "loss": 2.0775, "num_input_tokens_seen": 2229534720, "step": 5670 }, { "epoch": 0.6435305747175306, "grad_norm": 0.6216705441474915, "learning_rate": 1.8761188686000715e-05, "loss": 2.0119, "num_input_tokens_seen": 2231894016, "step": 5676 }, { "epoch": 0.6442108395956676, "grad_norm": 0.7083945274353027, "learning_rate": 1.8725384890798426e-05, "loss": 2.0221, "num_input_tokens_seen": 2234253312, "step": 5682 }, { "epoch": 0.6448911044738045, "grad_norm": 0.6334338784217834, "learning_rate": 1.8689581095596134e-05, "loss": 2.025, "num_input_tokens_seen": 2236612608, "step": 5688 }, { "epoch": 0.6455713693519414, "grad_norm": 0.8451031446456909, "learning_rate": 1.8653777300393843e-05, "loss": 2.0488, "num_input_tokens_seen": 2238971904, "step": 5694 }, { "epoch": 0.6462516342300784, "grad_norm": 0.6180372834205627, "learning_rate": 1.861797350519155e-05, "loss": 2.0468, "num_input_tokens_seen": 2241331200, "step": 5700 }, { "epoch": 0.6469318991082152, "grad_norm": 0.5460434556007385, "learning_rate": 1.8582169709989262e-05, "loss": 2.0523, "num_input_tokens_seen": 2243690496, "step": 5706 }, { "epoch": 0.6476121639863522, "grad_norm": 0.54164719581604, "learning_rate": 1.8546365914786967e-05, "loss": 2.0406, "num_input_tokens_seen": 2246049792, "step": 5712 }, { "epoch": 0.6482924288644891, "grad_norm": 0.5062336921691895, "learning_rate": 1.8510562119584675e-05, "loss": 2.0353, "num_input_tokens_seen": 2248409088, "step": 5718 }, { "epoch": 0.648972693742626, "grad_norm": 0.5514745712280273, "learning_rate": 1.8474758324382387e-05, "loss": 2.0479, "num_input_tokens_seen": 2250768384, "step": 5724 }, { "epoch": 0.649652958620763, "grad_norm": 0.6137591600418091, "learning_rate": 1.8438954529180095e-05, "loss": 2.0144, "num_input_tokens_seen": 2253127680, "step": 5730 }, { "epoch": 0.6503332234988999, "grad_norm": 0.6638593077659607, "learning_rate": 1.8403150733977803e-05, "loss": 2.0213, "num_input_tokens_seen": 2255486976, "step": 5736 }, { "epoch": 0.6510134883770368, "grad_norm": 0.5813100337982178, "learning_rate": 1.836734693877551e-05, "loss": 1.9889, "num_input_tokens_seen": 2257846272, "step": 5742 }, { "epoch": 0.6516937532551738, "grad_norm": 0.5840685963630676, "learning_rate": 1.833154314357322e-05, "loss": 1.9623, "num_input_tokens_seen": 2260205568, "step": 5748 }, { "epoch": 0.6523740181333106, "grad_norm": 0.6277685761451721, "learning_rate": 1.8295739348370928e-05, "loss": 2.0894, "num_input_tokens_seen": 2262564864, "step": 5754 }, { "epoch": 0.6530542830114476, "grad_norm": 0.5178935527801514, "learning_rate": 1.8259935553168636e-05, "loss": 2.0359, "num_input_tokens_seen": 2264924160, "step": 5760 }, { "epoch": 0.6537345478895845, "grad_norm": 0.5926587581634521, "learning_rate": 1.8224131757966344e-05, "loss": 1.9869, "num_input_tokens_seen": 2267283456, "step": 5766 }, { "epoch": 0.6544148127677214, "grad_norm": 0.5860345959663391, "learning_rate": 1.8188327962764055e-05, "loss": 2.0953, "num_input_tokens_seen": 2269642752, "step": 5772 }, { "epoch": 0.6550950776458584, "grad_norm": 0.5178937315940857, "learning_rate": 1.8152524167561764e-05, "loss": 2.0017, "num_input_tokens_seen": 2272002048, "step": 5778 }, { "epoch": 0.6557753425239953, "grad_norm": 0.5575287938117981, "learning_rate": 1.8116720372359472e-05, "loss": 1.9847, "num_input_tokens_seen": 2274361344, "step": 5784 }, { "epoch": 0.6564556074021322, "grad_norm": 0.7367034554481506, "learning_rate": 1.808091657715718e-05, "loss": 2.0555, "num_input_tokens_seen": 2276720640, "step": 5790 }, { "epoch": 0.6571358722802692, "grad_norm": 0.5715209245681763, "learning_rate": 1.8045112781954888e-05, "loss": 2.0645, "num_input_tokens_seen": 2279079936, "step": 5796 }, { "epoch": 0.657816137158406, "grad_norm": 0.5563541054725647, "learning_rate": 1.8009308986752596e-05, "loss": 1.9926, "num_input_tokens_seen": 2281439232, "step": 5802 }, { "epoch": 0.658496402036543, "grad_norm": 0.6194490194320679, "learning_rate": 1.7973505191550304e-05, "loss": 2.0258, "num_input_tokens_seen": 2283798528, "step": 5808 }, { "epoch": 0.6591766669146799, "grad_norm": 0.548372209072113, "learning_rate": 1.7937701396348013e-05, "loss": 2.0432, "num_input_tokens_seen": 2286157824, "step": 5814 }, { "epoch": 0.6598569317928168, "grad_norm": 0.5206018090248108, "learning_rate": 1.7901897601145724e-05, "loss": 2.0205, "num_input_tokens_seen": 2288517120, "step": 5820 }, { "epoch": 0.6605371966709538, "grad_norm": 0.5887606739997864, "learning_rate": 1.7866093805943432e-05, "loss": 2.0464, "num_input_tokens_seen": 2290876416, "step": 5826 }, { "epoch": 0.6612174615490907, "grad_norm": 0.5260190367698669, "learning_rate": 1.7830290010741137e-05, "loss": 2.0031, "num_input_tokens_seen": 2293235712, "step": 5832 }, { "epoch": 0.6618977264272276, "grad_norm": 0.5165619850158691, "learning_rate": 1.779448621553885e-05, "loss": 2.0114, "num_input_tokens_seen": 2295595008, "step": 5838 }, { "epoch": 0.6625779913053645, "grad_norm": 0.5297482013702393, "learning_rate": 1.7758682420336557e-05, "loss": 2.0397, "num_input_tokens_seen": 2297954304, "step": 5844 }, { "epoch": 0.6632582561835014, "grad_norm": 0.5319347977638245, "learning_rate": 1.7722878625134265e-05, "loss": 1.9881, "num_input_tokens_seen": 2300313600, "step": 5850 }, { "epoch": 0.6639385210616384, "grad_norm": 0.5887789726257324, "learning_rate": 1.7687074829931973e-05, "loss": 2.0377, "num_input_tokens_seen": 2302672896, "step": 5856 }, { "epoch": 0.6646187859397753, "grad_norm": 0.5922223925590515, "learning_rate": 1.765127103472968e-05, "loss": 2.0109, "num_input_tokens_seen": 2305032192, "step": 5862 }, { "epoch": 0.6652990508179122, "grad_norm": 0.5542311072349548, "learning_rate": 1.7615467239527393e-05, "loss": 2.0284, "num_input_tokens_seen": 2307391488, "step": 5868 }, { "epoch": 0.6659793156960492, "grad_norm": 0.5875877141952515, "learning_rate": 1.7579663444325098e-05, "loss": 1.9765, "num_input_tokens_seen": 2309750784, "step": 5874 }, { "epoch": 0.6666595805741861, "grad_norm": 0.5613130927085876, "learning_rate": 1.7543859649122806e-05, "loss": 2.0177, "num_input_tokens_seen": 2312110080, "step": 5880 }, { "epoch": 0.6673398454523229, "grad_norm": 0.6007198691368103, "learning_rate": 1.7508055853920517e-05, "loss": 2.0009, "num_input_tokens_seen": 2314469376, "step": 5886 }, { "epoch": 0.6680201103304599, "grad_norm": 0.5905076861381531, "learning_rate": 1.7472252058718225e-05, "loss": 2.0161, "num_input_tokens_seen": 2316828672, "step": 5892 }, { "epoch": 0.6687003752085968, "grad_norm": 0.5711119771003723, "learning_rate": 1.7436448263515934e-05, "loss": 2.0024, "num_input_tokens_seen": 2319187968, "step": 5898 }, { "epoch": 0.6693806400867338, "grad_norm": 0.5723757147789001, "learning_rate": 1.7400644468313642e-05, "loss": 2.0639, "num_input_tokens_seen": 2321547264, "step": 5904 }, { "epoch": 0.6700609049648707, "grad_norm": 0.6024678349494934, "learning_rate": 1.7364840673111353e-05, "loss": 1.9558, "num_input_tokens_seen": 2323906560, "step": 5910 }, { "epoch": 0.6707411698430076, "grad_norm": 0.5753939151763916, "learning_rate": 1.7329036877909058e-05, "loss": 2.066, "num_input_tokens_seen": 2326265856, "step": 5916 }, { "epoch": 0.6714214347211446, "grad_norm": 0.6269899606704712, "learning_rate": 1.7293233082706766e-05, "loss": 1.9689, "num_input_tokens_seen": 2328625152, "step": 5922 }, { "epoch": 0.6721016995992815, "grad_norm": 0.5666351318359375, "learning_rate": 1.7257429287504474e-05, "loss": 1.9687, "num_input_tokens_seen": 2330984448, "step": 5928 }, { "epoch": 0.6727819644774184, "grad_norm": 0.5650635957717896, "learning_rate": 1.7221625492302186e-05, "loss": 2.0503, "num_input_tokens_seen": 2333343744, "step": 5934 }, { "epoch": 0.6734622293555553, "grad_norm": 0.5674002766609192, "learning_rate": 1.7185821697099894e-05, "loss": 2.0113, "num_input_tokens_seen": 2335703040, "step": 5940 }, { "epoch": 0.6741424942336922, "grad_norm": 0.5696431398391724, "learning_rate": 1.7150017901897602e-05, "loss": 2.0004, "num_input_tokens_seen": 2338062336, "step": 5946 }, { "epoch": 0.6748227591118292, "grad_norm": 0.5952620506286621, "learning_rate": 1.711421410669531e-05, "loss": 2.0284, "num_input_tokens_seen": 2340421632, "step": 5952 }, { "epoch": 0.6755030239899661, "grad_norm": 0.5755632519721985, "learning_rate": 1.707841031149302e-05, "loss": 2.0401, "num_input_tokens_seen": 2342780928, "step": 5958 }, { "epoch": 0.676183288868103, "grad_norm": 0.5462335348129272, "learning_rate": 1.7042606516290727e-05, "loss": 2.0531, "num_input_tokens_seen": 2345140224, "step": 5964 }, { "epoch": 0.67686355374624, "grad_norm": 0.5699030756950378, "learning_rate": 1.7006802721088435e-05, "loss": 2.0432, "num_input_tokens_seen": 2347499520, "step": 5970 }, { "epoch": 0.6775438186243768, "grad_norm": 0.546146035194397, "learning_rate": 1.6970998925886143e-05, "loss": 2.0535, "num_input_tokens_seen": 2349858816, "step": 5976 }, { "epoch": 0.6782240835025138, "grad_norm": 0.5467692613601685, "learning_rate": 1.6935195130683855e-05, "loss": 2.054, "num_input_tokens_seen": 2352218112, "step": 5982 }, { "epoch": 0.6789043483806507, "grad_norm": 0.5051277875900269, "learning_rate": 1.6899391335481563e-05, "loss": 2.0584, "num_input_tokens_seen": 2354577408, "step": 5988 }, { "epoch": 0.6795846132587876, "grad_norm": 0.5340428352355957, "learning_rate": 1.686358754027927e-05, "loss": 2.0183, "num_input_tokens_seen": 2356936704, "step": 5994 }, { "epoch": 0.6802648781369246, "grad_norm": 0.5289435386657715, "learning_rate": 1.682778374507698e-05, "loss": 2.073, "num_input_tokens_seen": 2359296000, "step": 6000 }, { "epoch": 0.6802648781369246, "eval_accuracy": 0.5850433455433456, "eval_loss": 2.012032985687256, "eval_runtime": 128.1029, "eval_samples_per_second": 3.122, "eval_steps_per_second": 1.046, "num_input_tokens_seen": 2359296000, "step": 6000 }, { "epoch": 0.6809451430150615, "grad_norm": 0.5148223638534546, "learning_rate": 1.6791979949874687e-05, "loss": 2.0406, "num_input_tokens_seen": 2361655296, "step": 6006 }, { "epoch": 0.6816254078931984, "grad_norm": 0.5186204314231873, "learning_rate": 1.6756176154672395e-05, "loss": 2.0097, "num_input_tokens_seen": 2364014592, "step": 6012 }, { "epoch": 0.6823056727713354, "grad_norm": 0.5499780774116516, "learning_rate": 1.6720372359470104e-05, "loss": 1.9898, "num_input_tokens_seen": 2366373888, "step": 6018 }, { "epoch": 0.6829859376494722, "grad_norm": 0.5377045273780823, "learning_rate": 1.6684568564267815e-05, "loss": 2.0034, "num_input_tokens_seen": 2368733184, "step": 6024 }, { "epoch": 0.6836662025276092, "grad_norm": 0.6097836494445801, "learning_rate": 1.6648764769065523e-05, "loss": 2.048, "num_input_tokens_seen": 2371092480, "step": 6030 }, { "epoch": 0.6843464674057461, "grad_norm": 0.5289618968963623, "learning_rate": 1.661296097386323e-05, "loss": 2.0785, "num_input_tokens_seen": 2373451776, "step": 6036 }, { "epoch": 0.685026732283883, "grad_norm": 0.522239089012146, "learning_rate": 1.6577157178660936e-05, "loss": 2.0472, "num_input_tokens_seen": 2375811072, "step": 6042 }, { "epoch": 0.68570699716202, "grad_norm": 0.5830532908439636, "learning_rate": 1.6541353383458648e-05, "loss": 1.9969, "num_input_tokens_seen": 2378170368, "step": 6048 }, { "epoch": 0.6863872620401569, "grad_norm": 0.5812861919403076, "learning_rate": 1.6505549588256356e-05, "loss": 2.0162, "num_input_tokens_seen": 2380529664, "step": 6054 }, { "epoch": 0.6870675269182938, "grad_norm": 0.607001543045044, "learning_rate": 1.6469745793054064e-05, "loss": 1.977, "num_input_tokens_seen": 2382888960, "step": 6060 }, { "epoch": 0.6877477917964308, "grad_norm": 0.5279623866081238, "learning_rate": 1.6433941997851772e-05, "loss": 2.0245, "num_input_tokens_seen": 2385248256, "step": 6066 }, { "epoch": 0.6884280566745676, "grad_norm": 0.49770334362983704, "learning_rate": 1.6398138202649484e-05, "loss": 2.0936, "num_input_tokens_seen": 2387607552, "step": 6072 }, { "epoch": 0.6891083215527046, "grad_norm": 0.5788572430610657, "learning_rate": 1.6362334407447192e-05, "loss": 2.0201, "num_input_tokens_seen": 2389966848, "step": 6078 }, { "epoch": 0.6897885864308415, "grad_norm": 0.5910390019416809, "learning_rate": 1.6326530612244897e-05, "loss": 2.0595, "num_input_tokens_seen": 2392326144, "step": 6084 }, { "epoch": 0.6904688513089784, "grad_norm": 0.5049883127212524, "learning_rate": 1.6290726817042605e-05, "loss": 2.0315, "num_input_tokens_seen": 2394685440, "step": 6090 }, { "epoch": 0.6911491161871154, "grad_norm": 0.6210893988609314, "learning_rate": 1.6254923021840316e-05, "loss": 2.0042, "num_input_tokens_seen": 2397044736, "step": 6096 }, { "epoch": 0.6918293810652523, "grad_norm": 0.5483914017677307, "learning_rate": 1.6219119226638025e-05, "loss": 1.9958, "num_input_tokens_seen": 2399404032, "step": 6102 }, { "epoch": 0.6925096459433893, "grad_norm": 0.5364962220191956, "learning_rate": 1.6183315431435733e-05, "loss": 2.0369, "num_input_tokens_seen": 2401763328, "step": 6108 }, { "epoch": 0.6931899108215261, "grad_norm": 0.5008904933929443, "learning_rate": 1.614751163623344e-05, "loss": 2.0351, "num_input_tokens_seen": 2404122624, "step": 6114 }, { "epoch": 0.693870175699663, "grad_norm": 0.5553967356681824, "learning_rate": 1.6111707841031152e-05, "loss": 2.0215, "num_input_tokens_seen": 2406481920, "step": 6120 }, { "epoch": 0.6945504405778, "grad_norm": 0.5240505337715149, "learning_rate": 1.6075904045828857e-05, "loss": 2.0129, "num_input_tokens_seen": 2408841216, "step": 6126 }, { "epoch": 0.6952307054559369, "grad_norm": 0.5032373070716858, "learning_rate": 1.6040100250626565e-05, "loss": 2.003, "num_input_tokens_seen": 2411200512, "step": 6132 }, { "epoch": 0.6959109703340738, "grad_norm": 0.5194136500358582, "learning_rate": 1.6004296455424277e-05, "loss": 1.9717, "num_input_tokens_seen": 2413559808, "step": 6138 }, { "epoch": 0.6965912352122108, "grad_norm": 0.6916829943656921, "learning_rate": 1.5968492660221985e-05, "loss": 2.0228, "num_input_tokens_seen": 2415919104, "step": 6144 }, { "epoch": 0.6972715000903477, "grad_norm": 0.5243424773216248, "learning_rate": 1.5932688865019693e-05, "loss": 2.0121, "num_input_tokens_seen": 2418278400, "step": 6150 }, { "epoch": 0.6979517649684847, "grad_norm": 0.6381689310073853, "learning_rate": 1.58968850698174e-05, "loss": 2.0556, "num_input_tokens_seen": 2420637696, "step": 6156 }, { "epoch": 0.6986320298466215, "grad_norm": 0.537339448928833, "learning_rate": 1.586108127461511e-05, "loss": 1.9974, "num_input_tokens_seen": 2422996992, "step": 6162 }, { "epoch": 0.6993122947247584, "grad_norm": 0.5361027717590332, "learning_rate": 1.5825277479412818e-05, "loss": 2.0205, "num_input_tokens_seen": 2425356288, "step": 6168 }, { "epoch": 0.6999925596028954, "grad_norm": 0.5492175817489624, "learning_rate": 1.5789473684210526e-05, "loss": 2.0168, "num_input_tokens_seen": 2427715584, "step": 6174 }, { "epoch": 0.7006728244810323, "grad_norm": 0.5616779923439026, "learning_rate": 1.5753669889008234e-05, "loss": 2.0008, "num_input_tokens_seen": 2430074880, "step": 6180 }, { "epoch": 0.7013530893591692, "grad_norm": 0.5484799146652222, "learning_rate": 1.5717866093805946e-05, "loss": 1.9296, "num_input_tokens_seen": 2432434176, "step": 6186 }, { "epoch": 0.7020333542373062, "grad_norm": 0.6379416584968567, "learning_rate": 1.5682062298603654e-05, "loss": 2.0134, "num_input_tokens_seen": 2434793472, "step": 6192 }, { "epoch": 0.7027136191154431, "grad_norm": 0.582015872001648, "learning_rate": 1.5646258503401362e-05, "loss": 2.04, "num_input_tokens_seen": 2437152768, "step": 6198 }, { "epoch": 0.70339388399358, "grad_norm": 0.545238196849823, "learning_rate": 1.561045470819907e-05, "loss": 2.0462, "num_input_tokens_seen": 2439512064, "step": 6204 }, { "epoch": 0.7040741488717169, "grad_norm": 0.6547414660453796, "learning_rate": 1.5574650912996778e-05, "loss": 2.0875, "num_input_tokens_seen": 2441871360, "step": 6210 }, { "epoch": 0.7047544137498538, "grad_norm": 0.5639871954917908, "learning_rate": 1.5538847117794486e-05, "loss": 2.0379, "num_input_tokens_seen": 2444230656, "step": 6216 }, { "epoch": 0.7054346786279908, "grad_norm": 0.5656554698944092, "learning_rate": 1.5503043322592194e-05, "loss": 2.0478, "num_input_tokens_seen": 2446589952, "step": 6222 }, { "epoch": 0.7061149435061277, "grad_norm": 0.5386386513710022, "learning_rate": 1.5467239527389903e-05, "loss": 2.0567, "num_input_tokens_seen": 2448949248, "step": 6228 }, { "epoch": 0.7067952083842646, "grad_norm": 0.5181793570518494, "learning_rate": 1.5431435732187614e-05, "loss": 2.0285, "num_input_tokens_seen": 2451308544, "step": 6234 }, { "epoch": 0.7074754732624016, "grad_norm": 0.5418627858161926, "learning_rate": 1.5395631936985322e-05, "loss": 2.0252, "num_input_tokens_seen": 2453667840, "step": 6240 }, { "epoch": 0.7081557381405384, "grad_norm": 0.5028561353683472, "learning_rate": 1.535982814178303e-05, "loss": 2.0651, "num_input_tokens_seen": 2456027136, "step": 6246 }, { "epoch": 0.7088360030186754, "grad_norm": 0.5279743075370789, "learning_rate": 1.532402434658074e-05, "loss": 1.9913, "num_input_tokens_seen": 2458386432, "step": 6252 }, { "epoch": 0.7095162678968123, "grad_norm": 0.5399214029312134, "learning_rate": 1.5288220551378447e-05, "loss": 2.0098, "num_input_tokens_seen": 2460745728, "step": 6258 }, { "epoch": 0.7101965327749492, "grad_norm": 0.5421512722969055, "learning_rate": 1.5252416756176155e-05, "loss": 2.0065, "num_input_tokens_seen": 2463105024, "step": 6264 }, { "epoch": 0.7108767976530862, "grad_norm": 0.5487905740737915, "learning_rate": 1.5216612960973863e-05, "loss": 2.0431, "num_input_tokens_seen": 2465464320, "step": 6270 }, { "epoch": 0.7115570625312231, "grad_norm": 0.5177443623542786, "learning_rate": 1.5180809165771573e-05, "loss": 2.032, "num_input_tokens_seen": 2467823616, "step": 6276 }, { "epoch": 0.71223732740936, "grad_norm": 0.5425601601600647, "learning_rate": 1.5145005370569281e-05, "loss": 2.0493, "num_input_tokens_seen": 2470182912, "step": 6282 }, { "epoch": 0.712917592287497, "grad_norm": 0.5033071041107178, "learning_rate": 1.5109201575366991e-05, "loss": 1.9629, "num_input_tokens_seen": 2472542208, "step": 6288 }, { "epoch": 0.7135978571656338, "grad_norm": 0.5855656862258911, "learning_rate": 1.5073397780164697e-05, "loss": 1.9704, "num_input_tokens_seen": 2474901504, "step": 6294 }, { "epoch": 0.7142781220437708, "grad_norm": 0.5889589786529541, "learning_rate": 1.5037593984962406e-05, "loss": 1.9958, "num_input_tokens_seen": 2477260800, "step": 6300 }, { "epoch": 0.7149583869219077, "grad_norm": 0.5564432740211487, "learning_rate": 1.5001790189760115e-05, "loss": 2.017, "num_input_tokens_seen": 2479620096, "step": 6306 }, { "epoch": 0.7156386518000446, "grad_norm": 0.5233476161956787, "learning_rate": 1.4965986394557824e-05, "loss": 2.0426, "num_input_tokens_seen": 2481979392, "step": 6312 }, { "epoch": 0.7163189166781816, "grad_norm": 0.5455360412597656, "learning_rate": 1.4930182599355533e-05, "loss": 1.9938, "num_input_tokens_seen": 2484338688, "step": 6318 }, { "epoch": 0.7169991815563185, "grad_norm": 0.4952971935272217, "learning_rate": 1.4894378804153242e-05, "loss": 1.9942, "num_input_tokens_seen": 2486697984, "step": 6324 }, { "epoch": 0.7176794464344555, "grad_norm": 0.5228383541107178, "learning_rate": 1.4858575008950952e-05, "loss": 1.9894, "num_input_tokens_seen": 2489057280, "step": 6330 }, { "epoch": 0.7183597113125924, "grad_norm": 0.5237627029418945, "learning_rate": 1.4822771213748656e-05, "loss": 2.0579, "num_input_tokens_seen": 2491416576, "step": 6336 }, { "epoch": 0.7190399761907292, "grad_norm": 0.541063666343689, "learning_rate": 1.4786967418546366e-05, "loss": 2.0231, "num_input_tokens_seen": 2493775872, "step": 6342 }, { "epoch": 0.7197202410688662, "grad_norm": 0.5005678534507751, "learning_rate": 1.4751163623344074e-05, "loss": 2.0143, "num_input_tokens_seen": 2496135168, "step": 6348 }, { "epoch": 0.7204005059470031, "grad_norm": 0.5186320543289185, "learning_rate": 1.4715359828141784e-05, "loss": 2.0299, "num_input_tokens_seen": 2498494464, "step": 6354 }, { "epoch": 0.72108077082514, "grad_norm": 0.5344519019126892, "learning_rate": 1.4679556032939492e-05, "loss": 2.0408, "num_input_tokens_seen": 2500853760, "step": 6360 }, { "epoch": 0.721761035703277, "grad_norm": 0.5525063276290894, "learning_rate": 1.4643752237737202e-05, "loss": 2.0227, "num_input_tokens_seen": 2503213056, "step": 6366 }, { "epoch": 0.7224413005814139, "grad_norm": 0.5622419118881226, "learning_rate": 1.460794844253491e-05, "loss": 2.0103, "num_input_tokens_seen": 2505572352, "step": 6372 }, { "epoch": 0.7231215654595509, "grad_norm": 0.5473782420158386, "learning_rate": 1.4572144647332617e-05, "loss": 2.0163, "num_input_tokens_seen": 2507931648, "step": 6378 }, { "epoch": 0.7238018303376877, "grad_norm": 0.5566105246543884, "learning_rate": 1.4536340852130325e-05, "loss": 2.0512, "num_input_tokens_seen": 2510290944, "step": 6384 }, { "epoch": 0.7244820952158246, "grad_norm": 0.5519588589668274, "learning_rate": 1.4500537056928035e-05, "loss": 2.0518, "num_input_tokens_seen": 2512650240, "step": 6390 }, { "epoch": 0.7251623600939616, "grad_norm": 0.6528738737106323, "learning_rate": 1.4464733261725743e-05, "loss": 2.058, "num_input_tokens_seen": 2515009536, "step": 6396 }, { "epoch": 0.7256158700127195, "eval_accuracy": 0.5862222222222222, "eval_loss": 2.0074377059936523, "eval_runtime": 129.8788, "eval_samples_per_second": 3.08, "eval_steps_per_second": 1.032, "num_input_tokens_seen": 2516582400, "step": 6400 }, { "epoch": 0.7258426249720985, "grad_norm": 0.5393079519271851, "learning_rate": 1.4428929466523453e-05, "loss": 1.9774, "num_input_tokens_seen": 2517368832, "step": 6402 }, { "epoch": 0.7265228898502354, "grad_norm": 0.6304501295089722, "learning_rate": 1.4393125671321161e-05, "loss": 2.0694, "num_input_tokens_seen": 2519728128, "step": 6408 }, { "epoch": 0.7272031547283724, "grad_norm": 0.5596165060997009, "learning_rate": 1.435732187611887e-05, "loss": 2.0378, "num_input_tokens_seen": 2522087424, "step": 6414 }, { "epoch": 0.7278834196065093, "grad_norm": 0.5285012125968933, "learning_rate": 1.4321518080916577e-05, "loss": 2.0108, "num_input_tokens_seen": 2524446720, "step": 6420 }, { "epoch": 0.7285636844846463, "grad_norm": 0.5212527513504028, "learning_rate": 1.4285714285714285e-05, "loss": 2.0467, "num_input_tokens_seen": 2526806016, "step": 6426 }, { "epoch": 0.7292439493627831, "grad_norm": 0.5333656072616577, "learning_rate": 1.4249910490511995e-05, "loss": 2.0218, "num_input_tokens_seen": 2529165312, "step": 6432 }, { "epoch": 0.72992421424092, "grad_norm": 0.5523655414581299, "learning_rate": 1.4214106695309703e-05, "loss": 1.9989, "num_input_tokens_seen": 2531524608, "step": 6438 }, { "epoch": 0.730604479119057, "grad_norm": 0.5648514032363892, "learning_rate": 1.4178302900107413e-05, "loss": 2.0348, "num_input_tokens_seen": 2533883904, "step": 6444 }, { "epoch": 0.7312847439971939, "grad_norm": 0.5992633700370789, "learning_rate": 1.4142499104905121e-05, "loss": 2.0292, "num_input_tokens_seen": 2536243200, "step": 6450 }, { "epoch": 0.7319650088753308, "grad_norm": 0.5971994400024414, "learning_rate": 1.410669530970283e-05, "loss": 1.9372, "num_input_tokens_seen": 2538602496, "step": 6456 }, { "epoch": 0.7326452737534678, "grad_norm": 0.5448790192604065, "learning_rate": 1.4070891514500536e-05, "loss": 2.0675, "num_input_tokens_seen": 2540961792, "step": 6462 }, { "epoch": 0.7333255386316047, "grad_norm": 0.542102038860321, "learning_rate": 1.4035087719298246e-05, "loss": 2.0221, "num_input_tokens_seen": 2543321088, "step": 6468 }, { "epoch": 0.7340058035097417, "grad_norm": 0.5358943939208984, "learning_rate": 1.3999283924095954e-05, "loss": 1.9774, "num_input_tokens_seen": 2545680384, "step": 6474 }, { "epoch": 0.7346860683878785, "grad_norm": 0.532248318195343, "learning_rate": 1.3963480128893664e-05, "loss": 2.0362, "num_input_tokens_seen": 2548039680, "step": 6480 }, { "epoch": 0.7353663332660154, "grad_norm": 0.52365642786026, "learning_rate": 1.3927676333691372e-05, "loss": 2.0386, "num_input_tokens_seen": 2550398976, "step": 6486 }, { "epoch": 0.7360465981441524, "grad_norm": 0.5019295811653137, "learning_rate": 1.3891872538489082e-05, "loss": 2.0007, "num_input_tokens_seen": 2552758272, "step": 6492 }, { "epoch": 0.7367268630222893, "grad_norm": 0.5861442685127258, "learning_rate": 1.385606874328679e-05, "loss": 2.0565, "num_input_tokens_seen": 2555117568, "step": 6498 }, { "epoch": 0.7374071279004262, "grad_norm": 0.5421295762062073, "learning_rate": 1.3820264948084497e-05, "loss": 2.0346, "num_input_tokens_seen": 2557476864, "step": 6504 }, { "epoch": 0.7380873927785632, "grad_norm": 0.5109795331954956, "learning_rate": 1.3784461152882205e-05, "loss": 2.0375, "num_input_tokens_seen": 2559836160, "step": 6510 }, { "epoch": 0.7387676576567, "grad_norm": 0.5438470840454102, "learning_rate": 1.3748657357679915e-05, "loss": 2.041, "num_input_tokens_seen": 2562195456, "step": 6516 }, { "epoch": 0.739447922534837, "grad_norm": 0.568148672580719, "learning_rate": 1.3712853562477623e-05, "loss": 2.0456, "num_input_tokens_seen": 2564554752, "step": 6522 }, { "epoch": 0.7401281874129739, "grad_norm": 0.5753766298294067, "learning_rate": 1.3677049767275333e-05, "loss": 2.0265, "num_input_tokens_seen": 2566914048, "step": 6528 }, { "epoch": 0.7408084522911108, "grad_norm": 0.5368342399597168, "learning_rate": 1.364124597207304e-05, "loss": 2.0367, "num_input_tokens_seen": 2569273344, "step": 6534 }, { "epoch": 0.7414887171692478, "grad_norm": 0.5538693070411682, "learning_rate": 1.360544217687075e-05, "loss": 2.0138, "num_input_tokens_seen": 2571632640, "step": 6540 }, { "epoch": 0.7421689820473847, "grad_norm": 0.5635451674461365, "learning_rate": 1.3569638381668457e-05, "loss": 1.9975, "num_input_tokens_seen": 2573991936, "step": 6546 }, { "epoch": 0.7428492469255217, "grad_norm": 0.6103095412254333, "learning_rate": 1.3533834586466165e-05, "loss": 2.0166, "num_input_tokens_seen": 2576351232, "step": 6552 }, { "epoch": 0.7435295118036586, "grad_norm": 0.576130211353302, "learning_rate": 1.3498030791263875e-05, "loss": 2.0627, "num_input_tokens_seen": 2578710528, "step": 6558 }, { "epoch": 0.7442097766817954, "grad_norm": 0.5822626948356628, "learning_rate": 1.3462226996061583e-05, "loss": 2.0108, "num_input_tokens_seen": 2581069824, "step": 6564 }, { "epoch": 0.7448900415599324, "grad_norm": 0.5234280824661255, "learning_rate": 1.3426423200859291e-05, "loss": 1.9447, "num_input_tokens_seen": 2583429120, "step": 6570 }, { "epoch": 0.7455703064380693, "grad_norm": 0.544247031211853, "learning_rate": 1.3390619405657001e-05, "loss": 1.9903, "num_input_tokens_seen": 2585788416, "step": 6576 }, { "epoch": 0.7462505713162062, "grad_norm": 0.5101417899131775, "learning_rate": 1.335481561045471e-05, "loss": 2.0365, "num_input_tokens_seen": 2588147712, "step": 6582 }, { "epoch": 0.7469308361943432, "grad_norm": 0.498687744140625, "learning_rate": 1.3319011815252416e-05, "loss": 2.0617, "num_input_tokens_seen": 2590507008, "step": 6588 }, { "epoch": 0.7476111010724801, "grad_norm": 0.49293607473373413, "learning_rate": 1.3283208020050126e-05, "loss": 2.0262, "num_input_tokens_seen": 2592866304, "step": 6594 }, { "epoch": 0.7482913659506171, "grad_norm": 0.544154703617096, "learning_rate": 1.3247404224847834e-05, "loss": 2.0151, "num_input_tokens_seen": 2595225600, "step": 6600 }, { "epoch": 0.748971630828754, "grad_norm": 0.5422555208206177, "learning_rate": 1.3211600429645544e-05, "loss": 2.0348, "num_input_tokens_seen": 2597584896, "step": 6606 }, { "epoch": 0.7496518957068908, "grad_norm": 0.5072320699691772, "learning_rate": 1.3175796634443252e-05, "loss": 2.0011, "num_input_tokens_seen": 2599944192, "step": 6612 }, { "epoch": 0.7503321605850278, "grad_norm": 0.6141318678855896, "learning_rate": 1.3139992839240962e-05, "loss": 2.043, "num_input_tokens_seen": 2602303488, "step": 6618 }, { "epoch": 0.7510124254631647, "grad_norm": 0.5565091967582703, "learning_rate": 1.310418904403867e-05, "loss": 2.0446, "num_input_tokens_seen": 2604662784, "step": 6624 }, { "epoch": 0.7516926903413016, "grad_norm": 0.5500200986862183, "learning_rate": 1.3068385248836376e-05, "loss": 2.0254, "num_input_tokens_seen": 2607022080, "step": 6630 }, { "epoch": 0.7523729552194386, "grad_norm": 0.6047897338867188, "learning_rate": 1.3032581453634085e-05, "loss": 2.0415, "num_input_tokens_seen": 2609381376, "step": 6636 }, { "epoch": 0.7530532200975755, "grad_norm": 0.6070099472999573, "learning_rate": 1.2996777658431794e-05, "loss": 2.0615, "num_input_tokens_seen": 2611740672, "step": 6642 }, { "epoch": 0.7537334849757125, "grad_norm": 0.525489866733551, "learning_rate": 1.2960973863229503e-05, "loss": 2.0465, "num_input_tokens_seen": 2614099968, "step": 6648 }, { "epoch": 0.7544137498538493, "grad_norm": 0.4968653917312622, "learning_rate": 1.2925170068027212e-05, "loss": 1.9749, "num_input_tokens_seen": 2616459264, "step": 6654 }, { "epoch": 0.7550940147319862, "grad_norm": 0.5281318426132202, "learning_rate": 1.288936627282492e-05, "loss": 1.9575, "num_input_tokens_seen": 2618818560, "step": 6660 }, { "epoch": 0.7557742796101232, "grad_norm": 0.5236896872520447, "learning_rate": 1.285356247762263e-05, "loss": 2.0634, "num_input_tokens_seen": 2621177856, "step": 6666 }, { "epoch": 0.7564545444882601, "grad_norm": 0.6024266481399536, "learning_rate": 1.2817758682420337e-05, "loss": 1.9841, "num_input_tokens_seen": 2623537152, "step": 6672 }, { "epoch": 0.757134809366397, "grad_norm": 0.5235931277275085, "learning_rate": 1.2781954887218045e-05, "loss": 1.9866, "num_input_tokens_seen": 2625896448, "step": 6678 }, { "epoch": 0.757815074244534, "grad_norm": 0.5041958093643188, "learning_rate": 1.2746151092015753e-05, "loss": 2.0075, "num_input_tokens_seen": 2628255744, "step": 6684 }, { "epoch": 0.7584953391226709, "grad_norm": 0.5709572434425354, "learning_rate": 1.2710347296813463e-05, "loss": 2.0231, "num_input_tokens_seen": 2630615040, "step": 6690 }, { "epoch": 0.7591756040008079, "grad_norm": 0.5276849269866943, "learning_rate": 1.2674543501611171e-05, "loss": 2.0314, "num_input_tokens_seen": 2632974336, "step": 6696 }, { "epoch": 0.7598558688789447, "grad_norm": 0.5111777186393738, "learning_rate": 1.2638739706408881e-05, "loss": 2.0542, "num_input_tokens_seen": 2635333632, "step": 6702 }, { "epoch": 0.7605361337570816, "grad_norm": 0.5633344650268555, "learning_rate": 1.260293591120659e-05, "loss": 2.0438, "num_input_tokens_seen": 2637692928, "step": 6708 }, { "epoch": 0.7612163986352186, "grad_norm": 0.5294421315193176, "learning_rate": 1.2567132116004296e-05, "loss": 2.0113, "num_input_tokens_seen": 2640052224, "step": 6714 }, { "epoch": 0.7618966635133555, "grad_norm": 0.5252106189727783, "learning_rate": 1.2531328320802006e-05, "loss": 1.9799, "num_input_tokens_seen": 2642411520, "step": 6720 }, { "epoch": 0.7625769283914925, "grad_norm": 0.5746698379516602, "learning_rate": 1.2495524525599714e-05, "loss": 2.0742, "num_input_tokens_seen": 2644770816, "step": 6726 }, { "epoch": 0.7632571932696294, "grad_norm": 0.5127720236778259, "learning_rate": 1.2459720730397424e-05, "loss": 1.9997, "num_input_tokens_seen": 2647130112, "step": 6732 }, { "epoch": 0.7639374581477663, "grad_norm": 0.654504120349884, "learning_rate": 1.2423916935195132e-05, "loss": 2.0319, "num_input_tokens_seen": 2649489408, "step": 6738 }, { "epoch": 0.7646177230259033, "grad_norm": 0.5886629819869995, "learning_rate": 1.238811313999284e-05, "loss": 2.0316, "num_input_tokens_seen": 2651848704, "step": 6744 }, { "epoch": 0.7652979879040401, "grad_norm": 0.6034631133079529, "learning_rate": 1.2352309344790548e-05, "loss": 2.0575, "num_input_tokens_seen": 2654208000, "step": 6750 }, { "epoch": 0.765978252782177, "grad_norm": 0.5659487247467041, "learning_rate": 1.2316505549588258e-05, "loss": 2.0641, "num_input_tokens_seen": 2656567296, "step": 6756 }, { "epoch": 0.766658517660314, "grad_norm": 0.5753200650215149, "learning_rate": 1.2280701754385964e-05, "loss": 2.0554, "num_input_tokens_seen": 2658926592, "step": 6762 }, { "epoch": 0.7673387825384509, "grad_norm": 0.553452730178833, "learning_rate": 1.2244897959183674e-05, "loss": 2.0957, "num_input_tokens_seen": 2661285888, "step": 6768 }, { "epoch": 0.7680190474165879, "grad_norm": 0.5258597731590271, "learning_rate": 1.2209094163981382e-05, "loss": 2.0221, "num_input_tokens_seen": 2663645184, "step": 6774 }, { "epoch": 0.7686993122947248, "grad_norm": 0.5694190859794617, "learning_rate": 1.2173290368779092e-05, "loss": 2.0153, "num_input_tokens_seen": 2666004480, "step": 6780 }, { "epoch": 0.7693795771728617, "grad_norm": 0.5532529354095459, "learning_rate": 1.2137486573576799e-05, "loss": 2.0087, "num_input_tokens_seen": 2668363776, "step": 6786 }, { "epoch": 0.7700598420509986, "grad_norm": 0.5136593580245972, "learning_rate": 1.2101682778374509e-05, "loss": 2.0324, "num_input_tokens_seen": 2670723072, "step": 6792 }, { "epoch": 0.7707401069291355, "grad_norm": 0.5418703556060791, "learning_rate": 1.2065878983172217e-05, "loss": 2.0253, "num_input_tokens_seen": 2673082368, "step": 6798 }, { "epoch": 0.7709668618885145, "eval_accuracy": 0.5866208791208791, "eval_loss": 2.00406551361084, "eval_runtime": 129.6133, "eval_samples_per_second": 3.086, "eval_steps_per_second": 1.034, "num_input_tokens_seen": 2673868800, "step": 6800 }, { "epoch": 0.7714203718072724, "grad_norm": 0.6343456506729126, "learning_rate": 1.2030075187969925e-05, "loss": 2.0459, "num_input_tokens_seen": 2675441664, "step": 6804 }, { "epoch": 0.7721006366854094, "grad_norm": 0.5664966702461243, "learning_rate": 1.1994271392767633e-05, "loss": 2.0402, "num_input_tokens_seen": 2677800960, "step": 6810 }, { "epoch": 0.7727809015635463, "grad_norm": 0.5292795300483704, "learning_rate": 1.1958467597565343e-05, "loss": 2.0636, "num_input_tokens_seen": 2680160256, "step": 6816 }, { "epoch": 0.7734611664416833, "grad_norm": 0.5384446978569031, "learning_rate": 1.1922663802363051e-05, "loss": 2.0319, "num_input_tokens_seen": 2682519552, "step": 6822 }, { "epoch": 0.7741414313198202, "grad_norm": 0.6125785112380981, "learning_rate": 1.188686000716076e-05, "loss": 2.0176, "num_input_tokens_seen": 2684878848, "step": 6828 }, { "epoch": 0.774821696197957, "grad_norm": 0.5301167368888855, "learning_rate": 1.1851056211958467e-05, "loss": 2.0285, "num_input_tokens_seen": 2687238144, "step": 6834 }, { "epoch": 0.775501961076094, "grad_norm": 0.5614597201347351, "learning_rate": 1.1815252416756177e-05, "loss": 2.0672, "num_input_tokens_seen": 2689597440, "step": 6840 }, { "epoch": 0.7761822259542309, "grad_norm": 0.5375152826309204, "learning_rate": 1.1779448621553885e-05, "loss": 1.9599, "num_input_tokens_seen": 2691956736, "step": 6846 }, { "epoch": 0.7768624908323678, "grad_norm": 0.5689718127250671, "learning_rate": 1.1743644826351593e-05, "loss": 2.0173, "num_input_tokens_seen": 2694316032, "step": 6852 }, { "epoch": 0.7775427557105048, "grad_norm": 0.5268839597702026, "learning_rate": 1.1707841031149303e-05, "loss": 2.0727, "num_input_tokens_seen": 2696675328, "step": 6858 }, { "epoch": 0.7782230205886417, "grad_norm": 0.5965040326118469, "learning_rate": 1.1672037235947012e-05, "loss": 2.0151, "num_input_tokens_seen": 2699034624, "step": 6864 }, { "epoch": 0.7789032854667787, "grad_norm": 0.5147624611854553, "learning_rate": 1.163623344074472e-05, "loss": 1.9906, "num_input_tokens_seen": 2701393920, "step": 6870 }, { "epoch": 0.7795835503449156, "grad_norm": 0.5790501832962036, "learning_rate": 1.1600429645542428e-05, "loss": 1.959, "num_input_tokens_seen": 2703753216, "step": 6876 }, { "epoch": 0.7802638152230524, "grad_norm": 0.5348291993141174, "learning_rate": 1.1564625850340138e-05, "loss": 2.025, "num_input_tokens_seen": 2706112512, "step": 6882 }, { "epoch": 0.7809440801011894, "grad_norm": 0.5875257849693298, "learning_rate": 1.1528822055137844e-05, "loss": 2.0391, "num_input_tokens_seen": 2708471808, "step": 6888 }, { "epoch": 0.7816243449793263, "grad_norm": 0.5611306428909302, "learning_rate": 1.1493018259935554e-05, "loss": 2.0412, "num_input_tokens_seen": 2710831104, "step": 6894 }, { "epoch": 0.7823046098574632, "grad_norm": 0.6583240628242493, "learning_rate": 1.1457214464733262e-05, "loss": 2.0196, "num_input_tokens_seen": 2713190400, "step": 6900 }, { "epoch": 0.7829848747356002, "grad_norm": 0.5052137970924377, "learning_rate": 1.1421410669530972e-05, "loss": 1.978, "num_input_tokens_seen": 2715549696, "step": 6906 }, { "epoch": 0.7836651396137371, "grad_norm": 0.5078434348106384, "learning_rate": 1.1385606874328678e-05, "loss": 1.9985, "num_input_tokens_seen": 2717908992, "step": 6912 }, { "epoch": 0.7843454044918741, "grad_norm": 0.5650710463523865, "learning_rate": 1.1349803079126388e-05, "loss": 2.0562, "num_input_tokens_seen": 2720268288, "step": 6918 }, { "epoch": 0.785025669370011, "grad_norm": 0.5435272455215454, "learning_rate": 1.1313999283924096e-05, "loss": 1.969, "num_input_tokens_seen": 2722627584, "step": 6924 }, { "epoch": 0.7857059342481478, "grad_norm": 0.5620574951171875, "learning_rate": 1.1278195488721805e-05, "loss": 2.0299, "num_input_tokens_seen": 2724986880, "step": 6930 }, { "epoch": 0.7863861991262848, "grad_norm": 0.5396995544433594, "learning_rate": 1.1242391693519513e-05, "loss": 2.0016, "num_input_tokens_seen": 2727346176, "step": 6936 }, { "epoch": 0.7870664640044217, "grad_norm": 0.5387789011001587, "learning_rate": 1.1206587898317223e-05, "loss": 1.9991, "num_input_tokens_seen": 2729705472, "step": 6942 }, { "epoch": 0.7877467288825587, "grad_norm": 0.5884684920310974, "learning_rate": 1.117078410311493e-05, "loss": 2.0051, "num_input_tokens_seen": 2732064768, "step": 6948 }, { "epoch": 0.7884269937606956, "grad_norm": 0.5044008493423462, "learning_rate": 1.1134980307912639e-05, "loss": 2.0211, "num_input_tokens_seen": 2734424064, "step": 6954 }, { "epoch": 0.7891072586388325, "grad_norm": 0.5228249430656433, "learning_rate": 1.1099176512710347e-05, "loss": 2.0181, "num_input_tokens_seen": 2736783360, "step": 6960 }, { "epoch": 0.7897875235169695, "grad_norm": 0.5564016103744507, "learning_rate": 1.1063372717508057e-05, "loss": 1.9769, "num_input_tokens_seen": 2739142656, "step": 6966 }, { "epoch": 0.7904677883951063, "grad_norm": 0.505305826663971, "learning_rate": 1.1027568922305765e-05, "loss": 2.013, "num_input_tokens_seen": 2741501952, "step": 6972 }, { "epoch": 0.7911480532732432, "grad_norm": 0.5639991760253906, "learning_rate": 1.0991765127103473e-05, "loss": 2.0399, "num_input_tokens_seen": 2743861248, "step": 6978 }, { "epoch": 0.7918283181513802, "grad_norm": 0.583869218826294, "learning_rate": 1.0955961331901181e-05, "loss": 2.0013, "num_input_tokens_seen": 2746220544, "step": 6984 }, { "epoch": 0.7925085830295171, "grad_norm": 0.538934051990509, "learning_rate": 1.0920157536698891e-05, "loss": 1.9875, "num_input_tokens_seen": 2748579840, "step": 6990 }, { "epoch": 0.7931888479076541, "grad_norm": 0.5135723948478699, "learning_rate": 1.08843537414966e-05, "loss": 1.9953, "num_input_tokens_seen": 2750939136, "step": 6996 }, { "epoch": 0.793869112785791, "grad_norm": 0.4978064298629761, "learning_rate": 1.0848549946294308e-05, "loss": 2.0216, "num_input_tokens_seen": 2753298432, "step": 7002 }, { "epoch": 0.7945493776639279, "grad_norm": 0.5002549886703491, "learning_rate": 1.0812746151092017e-05, "loss": 2.0746, "num_input_tokens_seen": 2755657728, "step": 7008 }, { "epoch": 0.7952296425420649, "grad_norm": 0.5456427335739136, "learning_rate": 1.0776942355889724e-05, "loss": 1.9826, "num_input_tokens_seen": 2758017024, "step": 7014 }, { "epoch": 0.7959099074202017, "grad_norm": 0.5168840885162354, "learning_rate": 1.0741138560687434e-05, "loss": 1.9527, "num_input_tokens_seen": 2760376320, "step": 7020 }, { "epoch": 0.7965901722983386, "grad_norm": 0.6147148013114929, "learning_rate": 1.0705334765485142e-05, "loss": 1.9762, "num_input_tokens_seen": 2762735616, "step": 7026 }, { "epoch": 0.7972704371764756, "grad_norm": 0.5876660346984863, "learning_rate": 1.0669530970282852e-05, "loss": 1.9757, "num_input_tokens_seen": 2765094912, "step": 7032 }, { "epoch": 0.7979507020546125, "grad_norm": 0.5405025482177734, "learning_rate": 1.0633727175080558e-05, "loss": 2.0329, "num_input_tokens_seen": 2767454208, "step": 7038 }, { "epoch": 0.7986309669327495, "grad_norm": 0.5711649656295776, "learning_rate": 1.0597923379878268e-05, "loss": 2.0332, "num_input_tokens_seen": 2769813504, "step": 7044 }, { "epoch": 0.7993112318108864, "grad_norm": 0.5035630464553833, "learning_rate": 1.0562119584675976e-05, "loss": 1.9839, "num_input_tokens_seen": 2772172800, "step": 7050 }, { "epoch": 0.7999914966890233, "grad_norm": 0.6048845052719116, "learning_rate": 1.0526315789473684e-05, "loss": 2.0051, "num_input_tokens_seen": 2774532096, "step": 7056 }, { "epoch": 0.8006717615671602, "grad_norm": 0.528458297252655, "learning_rate": 1.0490511994271393e-05, "loss": 2.0181, "num_input_tokens_seen": 2776891392, "step": 7062 }, { "epoch": 0.8013520264452971, "grad_norm": 0.5959348678588867, "learning_rate": 1.0454708199069102e-05, "loss": 2.0368, "num_input_tokens_seen": 2779250688, "step": 7068 }, { "epoch": 0.802032291323434, "grad_norm": 0.6448953151702881, "learning_rate": 1.041890440386681e-05, "loss": 2.0584, "num_input_tokens_seen": 2781609984, "step": 7074 }, { "epoch": 0.802712556201571, "grad_norm": 0.5180519223213196, "learning_rate": 1.0383100608664519e-05, "loss": 1.9999, "num_input_tokens_seen": 2783969280, "step": 7080 }, { "epoch": 0.8033928210797079, "grad_norm": 0.5477824807167053, "learning_rate": 1.0347296813462227e-05, "loss": 2.0164, "num_input_tokens_seen": 2786328576, "step": 7086 }, { "epoch": 0.8040730859578449, "grad_norm": 0.5789219737052917, "learning_rate": 1.0311493018259937e-05, "loss": 2.0155, "num_input_tokens_seen": 2788687872, "step": 7092 }, { "epoch": 0.8047533508359818, "grad_norm": 0.5404716730117798, "learning_rate": 1.0275689223057643e-05, "loss": 1.9827, "num_input_tokens_seen": 2791047168, "step": 7098 }, { "epoch": 0.8054336157141186, "grad_norm": 0.583008885383606, "learning_rate": 1.0239885427855353e-05, "loss": 2.0253, "num_input_tokens_seen": 2793406464, "step": 7104 }, { "epoch": 0.8061138805922556, "grad_norm": 0.5732628703117371, "learning_rate": 1.0204081632653061e-05, "loss": 2.0036, "num_input_tokens_seen": 2795765760, "step": 7110 }, { "epoch": 0.8067941454703925, "grad_norm": 0.48751312494277954, "learning_rate": 1.0168277837450771e-05, "loss": 1.997, "num_input_tokens_seen": 2798125056, "step": 7116 }, { "epoch": 0.8074744103485294, "grad_norm": 0.4976810812950134, "learning_rate": 1.013247404224848e-05, "loss": 1.9776, "num_input_tokens_seen": 2800484352, "step": 7122 }, { "epoch": 0.8081546752266664, "grad_norm": 0.5391010642051697, "learning_rate": 1.0096670247046187e-05, "loss": 2.03, "num_input_tokens_seen": 2802843648, "step": 7128 }, { "epoch": 0.8088349401048033, "grad_norm": 0.5493925213813782, "learning_rate": 1.0060866451843896e-05, "loss": 2.0477, "num_input_tokens_seen": 2805202944, "step": 7134 }, { "epoch": 0.8095152049829403, "grad_norm": 0.544150173664093, "learning_rate": 1.0025062656641604e-05, "loss": 1.9951, "num_input_tokens_seen": 2807562240, "step": 7140 }, { "epoch": 0.8101954698610772, "grad_norm": 0.48092713952064514, "learning_rate": 9.989258861439314e-06, "loss": 2.017, "num_input_tokens_seen": 2809921536, "step": 7146 }, { "epoch": 0.810875734739214, "grad_norm": 0.5483365058898926, "learning_rate": 9.953455066237022e-06, "loss": 1.9781, "num_input_tokens_seen": 2812280832, "step": 7152 }, { "epoch": 0.811555999617351, "grad_norm": 0.5313854813575745, "learning_rate": 9.917651271034732e-06, "loss": 2.0425, "num_input_tokens_seen": 2814640128, "step": 7158 }, { "epoch": 0.8122362644954879, "grad_norm": 0.4918869137763977, "learning_rate": 9.881847475832438e-06, "loss": 2.0208, "num_input_tokens_seen": 2816999424, "step": 7164 }, { "epoch": 0.8129165293736249, "grad_norm": 0.5176813006401062, "learning_rate": 9.846043680630148e-06, "loss": 1.9843, "num_input_tokens_seen": 2819358720, "step": 7170 }, { "epoch": 0.8135967942517618, "grad_norm": 0.5130747556686401, "learning_rate": 9.810239885427856e-06, "loss": 1.9857, "num_input_tokens_seen": 2821718016, "step": 7176 }, { "epoch": 0.8142770591298987, "grad_norm": 0.5297340750694275, "learning_rate": 9.774436090225564e-06, "loss": 1.9911, "num_input_tokens_seen": 2824077312, "step": 7182 }, { "epoch": 0.8149573240080357, "grad_norm": 0.5061428546905518, "learning_rate": 9.738632295023272e-06, "loss": 1.9687, "num_input_tokens_seen": 2826436608, "step": 7188 }, { "epoch": 0.8156375888861725, "grad_norm": 0.5404644012451172, "learning_rate": 9.702828499820982e-06, "loss": 1.9658, "num_input_tokens_seen": 2828795904, "step": 7194 }, { "epoch": 0.8163178537643094, "grad_norm": 0.5056130886077881, "learning_rate": 9.66702470461869e-06, "loss": 1.995, "num_input_tokens_seen": 2831155200, "step": 7200 }, { "epoch": 0.8163178537643094, "eval_accuracy": 0.5872002442002442, "eval_loss": 2.000960350036621, "eval_runtime": 128.4298, "eval_samples_per_second": 3.115, "eval_steps_per_second": 1.043, "num_input_tokens_seen": 2831155200, "step": 7200 }, { "epoch": 0.8169981186424464, "grad_norm": 0.5106950402259827, "learning_rate": 9.631220909416399e-06, "loss": 2.0265, "num_input_tokens_seen": 2833514496, "step": 7206 }, { "epoch": 0.8176783835205833, "grad_norm": 0.5333806276321411, "learning_rate": 9.595417114214107e-06, "loss": 1.9495, "num_input_tokens_seen": 2835873792, "step": 7212 }, { "epoch": 0.8183586483987203, "grad_norm": 0.5510848164558411, "learning_rate": 9.559613319011817e-06, "loss": 2.0107, "num_input_tokens_seen": 2838233088, "step": 7218 }, { "epoch": 0.8190389132768572, "grad_norm": 0.5609148144721985, "learning_rate": 9.523809523809523e-06, "loss": 2.0433, "num_input_tokens_seen": 2840592384, "step": 7224 }, { "epoch": 0.8197191781549941, "grad_norm": 0.5448800921440125, "learning_rate": 9.488005728607233e-06, "loss": 1.9898, "num_input_tokens_seen": 2842951680, "step": 7230 }, { "epoch": 0.8203994430331311, "grad_norm": 0.5326699018478394, "learning_rate": 9.452201933404941e-06, "loss": 1.9797, "num_input_tokens_seen": 2845310976, "step": 7236 }, { "epoch": 0.8210797079112679, "grad_norm": 0.5335074663162231, "learning_rate": 9.416398138202651e-06, "loss": 2.0008, "num_input_tokens_seen": 2847670272, "step": 7242 }, { "epoch": 0.8217599727894048, "grad_norm": 0.49363961815834045, "learning_rate": 9.380594343000357e-06, "loss": 1.9962, "num_input_tokens_seen": 2850029568, "step": 7248 }, { "epoch": 0.8224402376675418, "grad_norm": 0.5354363918304443, "learning_rate": 9.344790547798067e-06, "loss": 2.0384, "num_input_tokens_seen": 2852388864, "step": 7254 }, { "epoch": 0.8231205025456787, "grad_norm": 0.5777958631515503, "learning_rate": 9.308986752595775e-06, "loss": 1.9839, "num_input_tokens_seen": 2854748160, "step": 7260 }, { "epoch": 0.8238007674238157, "grad_norm": 0.5475658178329468, "learning_rate": 9.273182957393484e-06, "loss": 1.9714, "num_input_tokens_seen": 2857107456, "step": 7266 }, { "epoch": 0.8244810323019526, "grad_norm": 0.5318132638931274, "learning_rate": 9.237379162191193e-06, "loss": 1.9882, "num_input_tokens_seen": 2859466752, "step": 7272 }, { "epoch": 0.8251612971800895, "grad_norm": 0.5123252272605896, "learning_rate": 9.201575366988902e-06, "loss": 2.0005, "num_input_tokens_seen": 2861826048, "step": 7278 }, { "epoch": 0.8258415620582265, "grad_norm": 0.5346453785896301, "learning_rate": 9.16577157178661e-06, "loss": 1.9623, "num_input_tokens_seen": 2864185344, "step": 7284 }, { "epoch": 0.8265218269363633, "grad_norm": 0.5490689277648926, "learning_rate": 9.129967776584318e-06, "loss": 2.0012, "num_input_tokens_seen": 2866544640, "step": 7290 }, { "epoch": 0.8272020918145002, "grad_norm": 0.5043258666992188, "learning_rate": 9.094163981382028e-06, "loss": 1.9961, "num_input_tokens_seen": 2868903936, "step": 7296 }, { "epoch": 0.8278823566926372, "grad_norm": 0.5261684060096741, "learning_rate": 9.058360186179736e-06, "loss": 2.0281, "num_input_tokens_seen": 2871263232, "step": 7302 }, { "epoch": 0.8285626215707741, "grad_norm": 0.503143846988678, "learning_rate": 9.022556390977444e-06, "loss": 1.9774, "num_input_tokens_seen": 2873622528, "step": 7308 }, { "epoch": 0.8292428864489111, "grad_norm": 0.548707127571106, "learning_rate": 8.986752595775152e-06, "loss": 1.9829, "num_input_tokens_seen": 2875981824, "step": 7314 }, { "epoch": 0.829923151327048, "grad_norm": 0.5377416610717773, "learning_rate": 8.950948800572862e-06, "loss": 2.002, "num_input_tokens_seen": 2878341120, "step": 7320 }, { "epoch": 0.8306034162051849, "grad_norm": 0.5520649552345276, "learning_rate": 8.915145005370569e-06, "loss": 1.9893, "num_input_tokens_seen": 2880700416, "step": 7326 }, { "epoch": 0.8312836810833218, "grad_norm": 0.6141591668128967, "learning_rate": 8.879341210168278e-06, "loss": 2.0882, "num_input_tokens_seen": 2883059712, "step": 7332 }, { "epoch": 0.8319639459614587, "grad_norm": 0.5497307181358337, "learning_rate": 8.843537414965987e-06, "loss": 2.0063, "num_input_tokens_seen": 2885419008, "step": 7338 }, { "epoch": 0.8326442108395957, "grad_norm": 0.5077412724494934, "learning_rate": 8.807733619763696e-06, "loss": 2.0364, "num_input_tokens_seen": 2887778304, "step": 7344 }, { "epoch": 0.8333244757177326, "grad_norm": 0.5644519925117493, "learning_rate": 8.771929824561403e-06, "loss": 2.0403, "num_input_tokens_seen": 2890137600, "step": 7350 }, { "epoch": 0.8340047405958695, "grad_norm": 0.5899609327316284, "learning_rate": 8.736126029359113e-06, "loss": 2.0182, "num_input_tokens_seen": 2892496896, "step": 7356 }, { "epoch": 0.8346850054740065, "grad_norm": 0.5211791396141052, "learning_rate": 8.700322234156821e-06, "loss": 2.0668, "num_input_tokens_seen": 2894856192, "step": 7362 }, { "epoch": 0.8353652703521434, "grad_norm": 0.5184838771820068, "learning_rate": 8.664518438954529e-06, "loss": 2.0001, "num_input_tokens_seen": 2897215488, "step": 7368 }, { "epoch": 0.8360455352302802, "grad_norm": 0.49509575963020325, "learning_rate": 8.628714643752237e-06, "loss": 1.9863, "num_input_tokens_seen": 2899574784, "step": 7374 }, { "epoch": 0.8367258001084172, "grad_norm": 0.6435425877571106, "learning_rate": 8.592910848549947e-06, "loss": 2.0189, "num_input_tokens_seen": 2901934080, "step": 7380 }, { "epoch": 0.8374060649865541, "grad_norm": 0.6035661697387695, "learning_rate": 8.557107053347655e-06, "loss": 2.0365, "num_input_tokens_seen": 2904293376, "step": 7386 }, { "epoch": 0.8380863298646911, "grad_norm": 0.5593310594558716, "learning_rate": 8.521303258145363e-06, "loss": 2.0101, "num_input_tokens_seen": 2906652672, "step": 7392 }, { "epoch": 0.838766594742828, "grad_norm": 0.5158206820487976, "learning_rate": 8.485499462943072e-06, "loss": 2.0323, "num_input_tokens_seen": 2909011968, "step": 7398 }, { "epoch": 0.8394468596209649, "grad_norm": 0.519759476184845, "learning_rate": 8.449695667740781e-06, "loss": 2.0185, "num_input_tokens_seen": 2911371264, "step": 7404 }, { "epoch": 0.8401271244991019, "grad_norm": 0.5219857096672058, "learning_rate": 8.41389187253849e-06, "loss": 2.0214, "num_input_tokens_seen": 2913730560, "step": 7410 }, { "epoch": 0.8408073893772388, "grad_norm": 0.5320020914077759, "learning_rate": 8.378088077336198e-06, "loss": 2.0125, "num_input_tokens_seen": 2916089856, "step": 7416 }, { "epoch": 0.8414876542553756, "grad_norm": 0.5405346751213074, "learning_rate": 8.342284282133908e-06, "loss": 1.9639, "num_input_tokens_seen": 2918449152, "step": 7422 }, { "epoch": 0.8421679191335126, "grad_norm": 0.5031660199165344, "learning_rate": 8.306480486931616e-06, "loss": 1.9817, "num_input_tokens_seen": 2920808448, "step": 7428 }, { "epoch": 0.8428481840116495, "grad_norm": 0.5186684727668762, "learning_rate": 8.270676691729324e-06, "loss": 2.0954, "num_input_tokens_seen": 2923167744, "step": 7434 }, { "epoch": 0.8435284488897865, "grad_norm": 0.5321431756019592, "learning_rate": 8.234872896527032e-06, "loss": 2.0269, "num_input_tokens_seen": 2925527040, "step": 7440 }, { "epoch": 0.8442087137679234, "grad_norm": 0.49513474106788635, "learning_rate": 8.199069101324742e-06, "loss": 1.9744, "num_input_tokens_seen": 2927886336, "step": 7446 }, { "epoch": 0.8448889786460603, "grad_norm": 0.5057438015937805, "learning_rate": 8.163265306122448e-06, "loss": 1.986, "num_input_tokens_seen": 2930245632, "step": 7452 }, { "epoch": 0.8455692435241973, "grad_norm": 0.5168727040290833, "learning_rate": 8.127461510920158e-06, "loss": 2.0049, "num_input_tokens_seen": 2932604928, "step": 7458 }, { "epoch": 0.8462495084023341, "grad_norm": 0.584082841873169, "learning_rate": 8.091657715717866e-06, "loss": 2.0321, "num_input_tokens_seen": 2934964224, "step": 7464 }, { "epoch": 0.846929773280471, "grad_norm": 0.49962136149406433, "learning_rate": 8.055853920515576e-06, "loss": 2.0121, "num_input_tokens_seen": 2937323520, "step": 7470 }, { "epoch": 0.847610038158608, "grad_norm": 0.5198308825492859, "learning_rate": 8.020050125313283e-06, "loss": 2.0272, "num_input_tokens_seen": 2939682816, "step": 7476 }, { "epoch": 0.8482903030367449, "grad_norm": 0.5608158707618713, "learning_rate": 7.984246330110993e-06, "loss": 2.0204, "num_input_tokens_seen": 2942042112, "step": 7482 }, { "epoch": 0.8489705679148819, "grad_norm": 0.5206415057182312, "learning_rate": 7.9484425349087e-06, "loss": 1.9897, "num_input_tokens_seen": 2944401408, "step": 7488 }, { "epoch": 0.8496508327930188, "grad_norm": 0.4968629777431488, "learning_rate": 7.912638739706409e-06, "loss": 2.0773, "num_input_tokens_seen": 2946760704, "step": 7494 }, { "epoch": 0.8503310976711557, "grad_norm": 0.4873516857624054, "learning_rate": 7.876834944504117e-06, "loss": 1.9779, "num_input_tokens_seen": 2949120000, "step": 7500 }, { "epoch": 0.8510113625492927, "grad_norm": 0.5486623644828796, "learning_rate": 7.841031149301827e-06, "loss": 1.957, "num_input_tokens_seen": 2951479296, "step": 7506 }, { "epoch": 0.8516916274274295, "grad_norm": 0.6163302659988403, "learning_rate": 7.805227354099535e-06, "loss": 1.9882, "num_input_tokens_seen": 2953838592, "step": 7512 }, { "epoch": 0.8523718923055664, "grad_norm": 0.5529779195785522, "learning_rate": 7.769423558897243e-06, "loss": 2.0252, "num_input_tokens_seen": 2956197888, "step": 7518 }, { "epoch": 0.8530521571837034, "grad_norm": 0.5484551787376404, "learning_rate": 7.733619763694951e-06, "loss": 2.0415, "num_input_tokens_seen": 2958557184, "step": 7524 }, { "epoch": 0.8537324220618403, "grad_norm": 0.49321115016937256, "learning_rate": 7.697815968492661e-06, "loss": 2.0534, "num_input_tokens_seen": 2960916480, "step": 7530 }, { "epoch": 0.8544126869399773, "grad_norm": 0.4970216751098633, "learning_rate": 7.66201217329037e-06, "loss": 2.0491, "num_input_tokens_seen": 2963275776, "step": 7536 }, { "epoch": 0.8550929518181142, "grad_norm": 0.6006478667259216, "learning_rate": 7.6262083780880775e-06, "loss": 2.0457, "num_input_tokens_seen": 2965635072, "step": 7542 }, { "epoch": 0.8557732166962511, "grad_norm": 0.5233898162841797, "learning_rate": 7.5904045828857865e-06, "loss": 2.047, "num_input_tokens_seen": 2967994368, "step": 7548 }, { "epoch": 0.856453481574388, "grad_norm": 0.5446822047233582, "learning_rate": 7.5546007876834955e-06, "loss": 2.0136, "num_input_tokens_seen": 2970353664, "step": 7554 }, { "epoch": 0.8571337464525249, "grad_norm": 0.5539310574531555, "learning_rate": 7.518796992481203e-06, "loss": 1.9784, "num_input_tokens_seen": 2972712960, "step": 7560 }, { "epoch": 0.8578140113306619, "grad_norm": 0.5015861988067627, "learning_rate": 7.482993197278912e-06, "loss": 1.9579, "num_input_tokens_seen": 2975072256, "step": 7566 }, { "epoch": 0.8584942762087988, "grad_norm": 0.4794093072414398, "learning_rate": 7.447189402076621e-06, "loss": 2.0009, "num_input_tokens_seen": 2977431552, "step": 7572 }, { "epoch": 0.8591745410869357, "grad_norm": 0.5329228639602661, "learning_rate": 7.411385606874328e-06, "loss": 2.0169, "num_input_tokens_seen": 2979790848, "step": 7578 }, { "epoch": 0.8598548059650727, "grad_norm": 0.4937734305858612, "learning_rate": 7.375581811672037e-06, "loss": 2.0231, "num_input_tokens_seen": 2982150144, "step": 7584 }, { "epoch": 0.8605350708432096, "grad_norm": 0.5106194615364075, "learning_rate": 7.339778016469746e-06, "loss": 2.0039, "num_input_tokens_seen": 2984509440, "step": 7590 }, { "epoch": 0.8612153357213465, "grad_norm": 0.5017894506454468, "learning_rate": 7.303974221267455e-06, "loss": 1.9735, "num_input_tokens_seen": 2986868736, "step": 7596 }, { "epoch": 0.8616688456401045, "eval_accuracy": 0.587537851037851, "eval_loss": 1.9987263679504395, "eval_runtime": 129.4571, "eval_samples_per_second": 3.09, "eval_steps_per_second": 1.035, "num_input_tokens_seen": 2988441600, "step": 7600 }, { "epoch": 0.8618956005994834, "grad_norm": 0.5046854019165039, "learning_rate": 7.2681704260651625e-06, "loss": 2.0324, "num_input_tokens_seen": 2989228032, "step": 7602 }, { "epoch": 0.8625758654776203, "grad_norm": 0.541846513748169, "learning_rate": 7.2323666308628715e-06, "loss": 2.0062, "num_input_tokens_seen": 2991587328, "step": 7608 }, { "epoch": 0.8632561303557573, "grad_norm": 0.4783530533313751, "learning_rate": 7.1965628356605805e-06, "loss": 2.0239, "num_input_tokens_seen": 2993946624, "step": 7614 }, { "epoch": 0.8639363952338942, "grad_norm": 0.587407648563385, "learning_rate": 7.160759040458289e-06, "loss": 2.0063, "num_input_tokens_seen": 2996305920, "step": 7620 }, { "epoch": 0.8646166601120311, "grad_norm": 0.5383691191673279, "learning_rate": 7.124955245255998e-06, "loss": 2.03, "num_input_tokens_seen": 2998665216, "step": 7626 }, { "epoch": 0.8652969249901681, "grad_norm": 0.5405200719833374, "learning_rate": 7.089151450053707e-06, "loss": 2.0637, "num_input_tokens_seen": 3001024512, "step": 7632 }, { "epoch": 0.865977189868305, "grad_norm": 0.4942198395729065, "learning_rate": 7.053347654851415e-06, "loss": 1.9617, "num_input_tokens_seen": 3003383808, "step": 7638 }, { "epoch": 0.8666574547464418, "grad_norm": 0.5435467958450317, "learning_rate": 7.017543859649123e-06, "loss": 2.0243, "num_input_tokens_seen": 3005743104, "step": 7644 }, { "epoch": 0.8673377196245788, "grad_norm": 0.48755842447280884, "learning_rate": 6.981740064446832e-06, "loss": 1.9578, "num_input_tokens_seen": 3008102400, "step": 7650 }, { "epoch": 0.8680179845027157, "grad_norm": 0.4815945625305176, "learning_rate": 6.945936269244541e-06, "loss": 1.9798, "num_input_tokens_seen": 3010461696, "step": 7656 }, { "epoch": 0.8686982493808527, "grad_norm": 0.5009135007858276, "learning_rate": 6.910132474042248e-06, "loss": 2.0082, "num_input_tokens_seen": 3012820992, "step": 7662 }, { "epoch": 0.8693785142589896, "grad_norm": 0.492590069770813, "learning_rate": 6.874328678839957e-06, "loss": 2.0236, "num_input_tokens_seen": 3015180288, "step": 7668 }, { "epoch": 0.8700587791371265, "grad_norm": 0.4939536452293396, "learning_rate": 6.838524883637666e-06, "loss": 1.9894, "num_input_tokens_seen": 3017539584, "step": 7674 }, { "epoch": 0.8707390440152635, "grad_norm": 0.5177844166755676, "learning_rate": 6.802721088435375e-06, "loss": 2.0053, "num_input_tokens_seen": 3019898880, "step": 7680 }, { "epoch": 0.8714193088934004, "grad_norm": 0.5024730563163757, "learning_rate": 6.766917293233083e-06, "loss": 1.9954, "num_input_tokens_seen": 3022258176, "step": 7686 }, { "epoch": 0.8720995737715372, "grad_norm": 0.5295082330703735, "learning_rate": 6.731113498030792e-06, "loss": 2.0063, "num_input_tokens_seen": 3024617472, "step": 7692 }, { "epoch": 0.8727798386496742, "grad_norm": 0.5884028673171997, "learning_rate": 6.695309702828501e-06, "loss": 1.9762, "num_input_tokens_seen": 3026976768, "step": 7698 }, { "epoch": 0.8734601035278111, "grad_norm": 0.510733425617218, "learning_rate": 6.659505907626208e-06, "loss": 2.0053, "num_input_tokens_seen": 3029336064, "step": 7704 }, { "epoch": 0.8741403684059481, "grad_norm": 0.49165260791778564, "learning_rate": 6.623702112423917e-06, "loss": 1.9905, "num_input_tokens_seen": 3031695360, "step": 7710 }, { "epoch": 0.874820633284085, "grad_norm": 0.5171453356742859, "learning_rate": 6.587898317221626e-06, "loss": 1.9589, "num_input_tokens_seen": 3034054656, "step": 7716 }, { "epoch": 0.8755008981622219, "grad_norm": 0.5841086506843567, "learning_rate": 6.552094522019335e-06, "loss": 2.013, "num_input_tokens_seen": 3036413952, "step": 7722 }, { "epoch": 0.8761811630403589, "grad_norm": 0.5813525915145874, "learning_rate": 6.516290726817042e-06, "loss": 1.9854, "num_input_tokens_seen": 3038773248, "step": 7728 }, { "epoch": 0.8768614279184958, "grad_norm": 0.5193366408348083, "learning_rate": 6.480486931614751e-06, "loss": 2.0316, "num_input_tokens_seen": 3041132544, "step": 7734 }, { "epoch": 0.8775416927966327, "grad_norm": 0.5028855800628662, "learning_rate": 6.44468313641246e-06, "loss": 1.9995, "num_input_tokens_seen": 3043491840, "step": 7740 }, { "epoch": 0.8782219576747696, "grad_norm": 0.5069683194160461, "learning_rate": 6.4088793412101684e-06, "loss": 1.9518, "num_input_tokens_seen": 3045851136, "step": 7746 }, { "epoch": 0.8789022225529065, "grad_norm": 0.4742577373981476, "learning_rate": 6.373075546007877e-06, "loss": 2.0423, "num_input_tokens_seen": 3048210432, "step": 7752 }, { "epoch": 0.8795824874310435, "grad_norm": 0.528330385684967, "learning_rate": 6.337271750805586e-06, "loss": 2.0014, "num_input_tokens_seen": 3050569728, "step": 7758 }, { "epoch": 0.8802627523091804, "grad_norm": 0.47722598910331726, "learning_rate": 6.301467955603295e-06, "loss": 2.0389, "num_input_tokens_seen": 3052929024, "step": 7764 }, { "epoch": 0.8809430171873173, "grad_norm": 0.5158604383468628, "learning_rate": 6.265664160401003e-06, "loss": 2.032, "num_input_tokens_seen": 3055288320, "step": 7770 }, { "epoch": 0.8816232820654543, "grad_norm": 0.555617094039917, "learning_rate": 6.229860365198712e-06, "loss": 2.0911, "num_input_tokens_seen": 3057647616, "step": 7776 }, { "epoch": 0.8823035469435911, "grad_norm": 0.5554957389831543, "learning_rate": 6.19405656999642e-06, "loss": 1.9768, "num_input_tokens_seen": 3060006912, "step": 7782 }, { "epoch": 0.8829838118217281, "grad_norm": 0.5055182576179504, "learning_rate": 6.158252774794129e-06, "loss": 1.9987, "num_input_tokens_seen": 3062366208, "step": 7788 }, { "epoch": 0.883664076699865, "grad_norm": 0.5182470083236694, "learning_rate": 6.122448979591837e-06, "loss": 2.0091, "num_input_tokens_seen": 3064725504, "step": 7794 }, { "epoch": 0.8843443415780019, "grad_norm": 0.507174551486969, "learning_rate": 6.086645184389546e-06, "loss": 1.9631, "num_input_tokens_seen": 3067084800, "step": 7800 }, { "epoch": 0.8850246064561389, "grad_norm": 0.46559634804725647, "learning_rate": 6.050841389187254e-06, "loss": 2.0352, "num_input_tokens_seen": 3069444096, "step": 7806 }, { "epoch": 0.8857048713342758, "grad_norm": 0.5257137417793274, "learning_rate": 6.015037593984962e-06, "loss": 1.9883, "num_input_tokens_seen": 3071803392, "step": 7812 }, { "epoch": 0.8863851362124127, "grad_norm": 0.4962034225463867, "learning_rate": 5.9792337987826714e-06, "loss": 2.0039, "num_input_tokens_seen": 3074162688, "step": 7818 }, { "epoch": 0.8870654010905497, "grad_norm": 0.5294592380523682, "learning_rate": 5.94343000358038e-06, "loss": 1.9959, "num_input_tokens_seen": 3076521984, "step": 7824 }, { "epoch": 0.8877456659686865, "grad_norm": 0.5304044485092163, "learning_rate": 5.907626208378089e-06, "loss": 1.9609, "num_input_tokens_seen": 3078881280, "step": 7830 }, { "epoch": 0.8884259308468235, "grad_norm": 0.5610164403915405, "learning_rate": 5.871822413175797e-06, "loss": 2.0117, "num_input_tokens_seen": 3081240576, "step": 7836 }, { "epoch": 0.8891061957249604, "grad_norm": 0.5142529010772705, "learning_rate": 5.836018617973506e-06, "loss": 2.0373, "num_input_tokens_seen": 3083599872, "step": 7842 }, { "epoch": 0.8897864606030973, "grad_norm": 0.49102193117141724, "learning_rate": 5.800214822771214e-06, "loss": 2.0585, "num_input_tokens_seen": 3085959168, "step": 7848 }, { "epoch": 0.8904667254812343, "grad_norm": 0.5983024835586548, "learning_rate": 5.764411027568922e-06, "loss": 2.025, "num_input_tokens_seen": 3088318464, "step": 7854 }, { "epoch": 0.8911469903593712, "grad_norm": 0.5602377653121948, "learning_rate": 5.728607232366631e-06, "loss": 2.0028, "num_input_tokens_seen": 3090677760, "step": 7860 }, { "epoch": 0.891827255237508, "grad_norm": 0.4956376552581787, "learning_rate": 5.692803437164339e-06, "loss": 2.0413, "num_input_tokens_seen": 3093037056, "step": 7866 }, { "epoch": 0.892507520115645, "grad_norm": 0.46066755056381226, "learning_rate": 5.656999641962048e-06, "loss": 2.0162, "num_input_tokens_seen": 3095396352, "step": 7872 }, { "epoch": 0.8931877849937819, "grad_norm": 0.48907607793807983, "learning_rate": 5.621195846759756e-06, "loss": 1.9745, "num_input_tokens_seen": 3097755648, "step": 7878 }, { "epoch": 0.8938680498719189, "grad_norm": 0.4798557758331299, "learning_rate": 5.585392051557465e-06, "loss": 2.0045, "num_input_tokens_seen": 3100114944, "step": 7884 }, { "epoch": 0.8945483147500558, "grad_norm": 0.523992657661438, "learning_rate": 5.5495882563551736e-06, "loss": 2.0096, "num_input_tokens_seen": 3102474240, "step": 7890 }, { "epoch": 0.8952285796281927, "grad_norm": 0.48234423995018005, "learning_rate": 5.5137844611528826e-06, "loss": 1.9822, "num_input_tokens_seen": 3104833536, "step": 7896 }, { "epoch": 0.8959088445063297, "grad_norm": 0.51031494140625, "learning_rate": 5.477980665950591e-06, "loss": 2.0232, "num_input_tokens_seen": 3107192832, "step": 7902 }, { "epoch": 0.8965891093844666, "grad_norm": 0.5131000876426697, "learning_rate": 5.4421768707483e-06, "loss": 2.0059, "num_input_tokens_seen": 3109552128, "step": 7908 }, { "epoch": 0.8972693742626034, "grad_norm": 0.510401725769043, "learning_rate": 5.406373075546009e-06, "loss": 1.966, "num_input_tokens_seen": 3111911424, "step": 7914 }, { "epoch": 0.8979496391407404, "grad_norm": 0.541610062122345, "learning_rate": 5.370569280343717e-06, "loss": 1.9652, "num_input_tokens_seen": 3114270720, "step": 7920 }, { "epoch": 0.8986299040188773, "grad_norm": 0.5096346735954285, "learning_rate": 5.334765485141426e-06, "loss": 1.9736, "num_input_tokens_seen": 3116630016, "step": 7926 }, { "epoch": 0.8993101688970143, "grad_norm": 0.5285272002220154, "learning_rate": 5.298961689939134e-06, "loss": 1.9764, "num_input_tokens_seen": 3118989312, "step": 7932 }, { "epoch": 0.8999904337751512, "grad_norm": 0.4984615743160248, "learning_rate": 5.263157894736842e-06, "loss": 2.0364, "num_input_tokens_seen": 3121348608, "step": 7938 }, { "epoch": 0.9006706986532881, "grad_norm": 0.517405092716217, "learning_rate": 5.227354099534551e-06, "loss": 1.9954, "num_input_tokens_seen": 3123707904, "step": 7944 }, { "epoch": 0.9013509635314251, "grad_norm": 0.5145347118377686, "learning_rate": 5.191550304332259e-06, "loss": 2.0138, "num_input_tokens_seen": 3126067200, "step": 7950 }, { "epoch": 0.902031228409562, "grad_norm": 0.5413515567779541, "learning_rate": 5.155746509129968e-06, "loss": 1.9974, "num_input_tokens_seen": 3128426496, "step": 7956 }, { "epoch": 0.902711493287699, "grad_norm": 0.5247104167938232, "learning_rate": 5.1199427139276766e-06, "loss": 2.0054, "num_input_tokens_seen": 3130785792, "step": 7962 }, { "epoch": 0.9033917581658358, "grad_norm": 0.5259600281715393, "learning_rate": 5.0841389187253856e-06, "loss": 1.9812, "num_input_tokens_seen": 3133145088, "step": 7968 }, { "epoch": 0.9040720230439727, "grad_norm": 0.537581205368042, "learning_rate": 5.048335123523094e-06, "loss": 2.0135, "num_input_tokens_seen": 3135504384, "step": 7974 }, { "epoch": 0.9047522879221097, "grad_norm": 0.5331296920776367, "learning_rate": 5.012531328320802e-06, "loss": 2.0023, "num_input_tokens_seen": 3137863680, "step": 7980 }, { "epoch": 0.9054325528002466, "grad_norm": 0.5150538086891174, "learning_rate": 4.976727533118511e-06, "loss": 2.046, "num_input_tokens_seen": 3140222976, "step": 7986 }, { "epoch": 0.9061128176783835, "grad_norm": 0.5423092842102051, "learning_rate": 4.940923737916219e-06, "loss": 2.0323, "num_input_tokens_seen": 3142582272, "step": 7992 }, { "epoch": 0.9067930825565205, "grad_norm": 0.5528409481048584, "learning_rate": 4.905119942713928e-06, "loss": 1.9799, "num_input_tokens_seen": 3144941568, "step": 7998 }, { "epoch": 0.9070198375158994, "eval_accuracy": 0.5879627594627594, "eval_loss": 1.996025800704956, "eval_runtime": 129.4235, "eval_samples_per_second": 3.091, "eval_steps_per_second": 1.035, "num_input_tokens_seen": 3145728000, "step": 8000 }, { "epoch": 0.9074733474346574, "grad_norm": 0.4775083661079407, "learning_rate": 4.869316147511636e-06, "loss": 2.0185, "num_input_tokens_seen": 3147300864, "step": 8004 }, { "epoch": 0.9081536123127943, "grad_norm": 0.5261006355285645, "learning_rate": 4.833512352309345e-06, "loss": 1.9824, "num_input_tokens_seen": 3149660160, "step": 8010 }, { "epoch": 0.9088338771909312, "grad_norm": 0.4982771575450897, "learning_rate": 4.797708557107053e-06, "loss": 2.0035, "num_input_tokens_seen": 3152019456, "step": 8016 }, { "epoch": 0.9095141420690681, "grad_norm": 0.5401104092597961, "learning_rate": 4.7619047619047615e-06, "loss": 1.9991, "num_input_tokens_seen": 3154378752, "step": 8022 }, { "epoch": 0.9101944069472051, "grad_norm": 0.4819372594356537, "learning_rate": 4.7261009667024705e-06, "loss": 2.0375, "num_input_tokens_seen": 3156738048, "step": 8028 }, { "epoch": 0.910874671825342, "grad_norm": 0.51005619764328, "learning_rate": 4.690297171500179e-06, "loss": 2.0353, "num_input_tokens_seen": 3159097344, "step": 8034 }, { "epoch": 0.9115549367034789, "grad_norm": 0.49865275621414185, "learning_rate": 4.654493376297888e-06, "loss": 2.0177, "num_input_tokens_seen": 3161456640, "step": 8040 }, { "epoch": 0.9122352015816159, "grad_norm": 0.4954957962036133, "learning_rate": 4.618689581095597e-06, "loss": 2.0235, "num_input_tokens_seen": 3163815936, "step": 8046 }, { "epoch": 0.9129154664597527, "grad_norm": 0.48068705201148987, "learning_rate": 4.582885785893305e-06, "loss": 2.0415, "num_input_tokens_seen": 3166175232, "step": 8052 }, { "epoch": 0.9135957313378897, "grad_norm": 0.5091089606285095, "learning_rate": 4.547081990691014e-06, "loss": 2.0128, "num_input_tokens_seen": 3168534528, "step": 8058 }, { "epoch": 0.9142759962160266, "grad_norm": 0.48053333163261414, "learning_rate": 4.511278195488722e-06, "loss": 2.1089, "num_input_tokens_seen": 3170893824, "step": 8064 }, { "epoch": 0.9149562610941635, "grad_norm": 0.506682276725769, "learning_rate": 4.475474400286431e-06, "loss": 1.9918, "num_input_tokens_seen": 3173253120, "step": 8070 }, { "epoch": 0.9156365259723005, "grad_norm": 0.47464847564697266, "learning_rate": 4.439670605084139e-06, "loss": 2.0194, "num_input_tokens_seen": 3175612416, "step": 8076 }, { "epoch": 0.9163167908504374, "grad_norm": 0.4886178970336914, "learning_rate": 4.403866809881848e-06, "loss": 2.0147, "num_input_tokens_seen": 3177971712, "step": 8082 }, { "epoch": 0.9169970557285743, "grad_norm": 0.5083957314491272, "learning_rate": 4.368063014679556e-06, "loss": 1.9979, "num_input_tokens_seen": 3180331008, "step": 8088 }, { "epoch": 0.9176773206067113, "grad_norm": 0.5344352126121521, "learning_rate": 4.3322592194772645e-06, "loss": 2.0719, "num_input_tokens_seen": 3182690304, "step": 8094 }, { "epoch": 0.9183575854848481, "grad_norm": 0.4968712627887726, "learning_rate": 4.2964554242749735e-06, "loss": 1.987, "num_input_tokens_seen": 3185049600, "step": 8100 }, { "epoch": 0.9190378503629851, "grad_norm": 0.5581889748573303, "learning_rate": 4.260651629072682e-06, "loss": 2.0171, "num_input_tokens_seen": 3187408896, "step": 8106 }, { "epoch": 0.919718115241122, "grad_norm": 0.5023228526115417, "learning_rate": 4.224847833870391e-06, "loss": 1.9605, "num_input_tokens_seen": 3189768192, "step": 8112 }, { "epoch": 0.9203983801192589, "grad_norm": 0.5072777271270752, "learning_rate": 4.189044038668099e-06, "loss": 2.0555, "num_input_tokens_seen": 3192127488, "step": 8118 }, { "epoch": 0.9210786449973959, "grad_norm": 0.501773476600647, "learning_rate": 4.153240243465808e-06, "loss": 2.0703, "num_input_tokens_seen": 3194486784, "step": 8124 }, { "epoch": 0.9217589098755328, "grad_norm": 0.5245522856712341, "learning_rate": 4.117436448263516e-06, "loss": 2.0284, "num_input_tokens_seen": 3196846080, "step": 8130 }, { "epoch": 0.9224391747536697, "grad_norm": 0.5125513672828674, "learning_rate": 4.081632653061224e-06, "loss": 1.9894, "num_input_tokens_seen": 3199205376, "step": 8136 }, { "epoch": 0.9231194396318066, "grad_norm": 0.5277597904205322, "learning_rate": 4.045828857858933e-06, "loss": 1.9765, "num_input_tokens_seen": 3201564672, "step": 8142 }, { "epoch": 0.9237997045099435, "grad_norm": 0.48302364349365234, "learning_rate": 4.010025062656641e-06, "loss": 1.9909, "num_input_tokens_seen": 3203923968, "step": 8148 }, { "epoch": 0.9244799693880805, "grad_norm": 0.4815656244754791, "learning_rate": 3.97422126745435e-06, "loss": 2.069, "num_input_tokens_seen": 3206283264, "step": 8154 }, { "epoch": 0.9251602342662174, "grad_norm": 0.4820353090763092, "learning_rate": 3.9384174722520585e-06, "loss": 1.9816, "num_input_tokens_seen": 3208642560, "step": 8160 }, { "epoch": 0.9258404991443543, "grad_norm": 0.4983651041984558, "learning_rate": 3.9026136770497675e-06, "loss": 2.0272, "num_input_tokens_seen": 3211001856, "step": 8166 }, { "epoch": 0.9265207640224913, "grad_norm": 0.4833485186100006, "learning_rate": 3.866809881847476e-06, "loss": 2.0169, "num_input_tokens_seen": 3213361152, "step": 8172 }, { "epoch": 0.9272010289006282, "grad_norm": 0.5084331631660461, "learning_rate": 3.831006086645185e-06, "loss": 1.9514, "num_input_tokens_seen": 3215720448, "step": 8178 }, { "epoch": 0.9278812937787652, "grad_norm": 0.5215640068054199, "learning_rate": 3.7952022914428932e-06, "loss": 2.0084, "num_input_tokens_seen": 3218079744, "step": 8184 }, { "epoch": 0.928561558656902, "grad_norm": 0.48457303643226624, "learning_rate": 3.7593984962406014e-06, "loss": 1.99, "num_input_tokens_seen": 3220439040, "step": 8190 }, { "epoch": 0.9292418235350389, "grad_norm": 0.4908931851387024, "learning_rate": 3.7235947010383104e-06, "loss": 2.031, "num_input_tokens_seen": 3222798336, "step": 8196 }, { "epoch": 0.9299220884131759, "grad_norm": 0.486664354801178, "learning_rate": 3.6877909058360186e-06, "loss": 2.0522, "num_input_tokens_seen": 3225157632, "step": 8202 }, { "epoch": 0.9306023532913128, "grad_norm": 0.5011295676231384, "learning_rate": 3.6519871106337276e-06, "loss": 1.9818, "num_input_tokens_seen": 3227516928, "step": 8208 }, { "epoch": 0.9312826181694497, "grad_norm": 0.5124307870864868, "learning_rate": 3.6161833154314357e-06, "loss": 2.0587, "num_input_tokens_seen": 3229876224, "step": 8214 }, { "epoch": 0.9319628830475867, "grad_norm": 0.5010582804679871, "learning_rate": 3.5803795202291443e-06, "loss": 2.0345, "num_input_tokens_seen": 3232235520, "step": 8220 }, { "epoch": 0.9326431479257236, "grad_norm": 0.5208165645599365, "learning_rate": 3.5445757250268533e-06, "loss": 1.9941, "num_input_tokens_seen": 3234594816, "step": 8226 }, { "epoch": 0.9333234128038606, "grad_norm": 0.5112361311912537, "learning_rate": 3.5087719298245615e-06, "loss": 1.9566, "num_input_tokens_seen": 3236954112, "step": 8232 }, { "epoch": 0.9340036776819974, "grad_norm": 0.48557788133621216, "learning_rate": 3.4729681346222705e-06, "loss": 1.9702, "num_input_tokens_seen": 3239313408, "step": 8238 }, { "epoch": 0.9346839425601343, "grad_norm": 0.5145829319953918, "learning_rate": 3.4371643394199786e-06, "loss": 2.0183, "num_input_tokens_seen": 3241672704, "step": 8244 }, { "epoch": 0.9353642074382713, "grad_norm": 0.44660866260528564, "learning_rate": 3.4013605442176877e-06, "loss": 2.0032, "num_input_tokens_seen": 3244032000, "step": 8250 }, { "epoch": 0.9360444723164082, "grad_norm": 0.5104228258132935, "learning_rate": 3.365556749015396e-06, "loss": 2.0313, "num_input_tokens_seen": 3246391296, "step": 8256 }, { "epoch": 0.9367247371945451, "grad_norm": 0.5160300731658936, "learning_rate": 3.329752953813104e-06, "loss": 1.9679, "num_input_tokens_seen": 3248750592, "step": 8262 }, { "epoch": 0.9374050020726821, "grad_norm": 0.4720374643802643, "learning_rate": 3.293949158610813e-06, "loss": 1.9988, "num_input_tokens_seen": 3251109888, "step": 8268 }, { "epoch": 0.938085266950819, "grad_norm": 0.4732125699520111, "learning_rate": 3.258145363408521e-06, "loss": 2.0359, "num_input_tokens_seen": 3253469184, "step": 8274 }, { "epoch": 0.938765531828956, "grad_norm": 0.4820111095905304, "learning_rate": 3.22234156820623e-06, "loss": 1.9772, "num_input_tokens_seen": 3255828480, "step": 8280 }, { "epoch": 0.9394457967070928, "grad_norm": 0.48187270760536194, "learning_rate": 3.1865377730039383e-06, "loss": 1.9989, "num_input_tokens_seen": 3258187776, "step": 8286 }, { "epoch": 0.9401260615852297, "grad_norm": 0.47333669662475586, "learning_rate": 3.1507339778016473e-06, "loss": 1.9811, "num_input_tokens_seen": 3260547072, "step": 8292 }, { "epoch": 0.9408063264633667, "grad_norm": 0.5094246864318848, "learning_rate": 3.114930182599356e-06, "loss": 1.9739, "num_input_tokens_seen": 3262906368, "step": 8298 }, { "epoch": 0.9414865913415036, "grad_norm": 0.5325969457626343, "learning_rate": 3.0791263873970645e-06, "loss": 2.015, "num_input_tokens_seen": 3265265664, "step": 8304 }, { "epoch": 0.9421668562196405, "grad_norm": 0.4827982485294342, "learning_rate": 3.043322592194773e-06, "loss": 1.9949, "num_input_tokens_seen": 3267624960, "step": 8310 }, { "epoch": 0.9428471210977775, "grad_norm": 0.4823977053165436, "learning_rate": 3.007518796992481e-06, "loss": 1.9807, "num_input_tokens_seen": 3269984256, "step": 8316 }, { "epoch": 0.9435273859759143, "grad_norm": 0.4721021354198456, "learning_rate": 2.97171500179019e-06, "loss": 1.9597, "num_input_tokens_seen": 3272343552, "step": 8322 }, { "epoch": 0.9442076508540513, "grad_norm": 0.4703858494758606, "learning_rate": 2.9359112065878984e-06, "loss": 1.9861, "num_input_tokens_seen": 3274702848, "step": 8328 }, { "epoch": 0.9448879157321882, "grad_norm": 0.5197435021400452, "learning_rate": 2.900107411385607e-06, "loss": 2.0625, "num_input_tokens_seen": 3277062144, "step": 8334 }, { "epoch": 0.9455681806103251, "grad_norm": 0.47608399391174316, "learning_rate": 2.8643036161833155e-06, "loss": 2.0277, "num_input_tokens_seen": 3279421440, "step": 8340 }, { "epoch": 0.9462484454884621, "grad_norm": 0.5438135266304016, "learning_rate": 2.828499820981024e-06, "loss": 1.9903, "num_input_tokens_seen": 3281780736, "step": 8346 }, { "epoch": 0.946928710366599, "grad_norm": 0.48217347264289856, "learning_rate": 2.7926960257787327e-06, "loss": 2.0009, "num_input_tokens_seen": 3284140032, "step": 8352 }, { "epoch": 0.947608975244736, "grad_norm": 0.47104737162590027, "learning_rate": 2.7568922305764413e-06, "loss": 2.0084, "num_input_tokens_seen": 3286499328, "step": 8358 }, { "epoch": 0.9482892401228729, "grad_norm": 0.5058236718177795, "learning_rate": 2.72108843537415e-06, "loss": 1.9667, "num_input_tokens_seen": 3288858624, "step": 8364 }, { "epoch": 0.9489695050010097, "grad_norm": 0.4855674207210541, "learning_rate": 2.6852846401718585e-06, "loss": 1.9953, "num_input_tokens_seen": 3291217920, "step": 8370 }, { "epoch": 0.9496497698791467, "grad_norm": 0.49368613958358765, "learning_rate": 2.649480844969567e-06, "loss": 2.0019, "num_input_tokens_seen": 3293577216, "step": 8376 }, { "epoch": 0.9503300347572836, "grad_norm": 0.4895451068878174, "learning_rate": 2.6136770497672756e-06, "loss": 2.0552, "num_input_tokens_seen": 3295936512, "step": 8382 }, { "epoch": 0.9510102996354205, "grad_norm": 0.4846164882183075, "learning_rate": 2.577873254564984e-06, "loss": 2.0175, "num_input_tokens_seen": 3298295808, "step": 8388 }, { "epoch": 0.9516905645135575, "grad_norm": 0.4728488028049469, "learning_rate": 2.5420694593626928e-06, "loss": 2.0314, "num_input_tokens_seen": 3300655104, "step": 8394 }, { "epoch": 0.9523708293916944, "grad_norm": 0.47742366790771484, "learning_rate": 2.506265664160401e-06, "loss": 2.0056, "num_input_tokens_seen": 3303014400, "step": 8400 }, { "epoch": 0.9523708293916944, "eval_accuracy": 0.5882161172161172, "eval_loss": 1.9941824674606323, "eval_runtime": 129.6127, "eval_samples_per_second": 3.086, "eval_steps_per_second": 1.034, "num_input_tokens_seen": 3303014400, "step": 8400 }, { "epoch": 0.9530510942698314, "grad_norm": 0.49103352427482605, "learning_rate": 2.4704618689581095e-06, "loss": 2.0074, "num_input_tokens_seen": 3305373696, "step": 8406 }, { "epoch": 0.9537313591479682, "grad_norm": 0.47667092084884644, "learning_rate": 2.434658073755818e-06, "loss": 2.0055, "num_input_tokens_seen": 3307732992, "step": 8412 }, { "epoch": 0.9544116240261051, "grad_norm": 0.5088315606117249, "learning_rate": 2.3988542785535267e-06, "loss": 2.0492, "num_input_tokens_seen": 3310092288, "step": 8418 }, { "epoch": 0.9550918889042421, "grad_norm": 0.5331157445907593, "learning_rate": 2.3630504833512353e-06, "loss": 1.991, "num_input_tokens_seen": 3312451584, "step": 8424 }, { "epoch": 0.955772153782379, "grad_norm": 0.4914342164993286, "learning_rate": 2.327246688148944e-06, "loss": 2.05, "num_input_tokens_seen": 3314810880, "step": 8430 }, { "epoch": 0.9564524186605159, "grad_norm": 0.5580516457557678, "learning_rate": 2.2914428929466524e-06, "loss": 2.0284, "num_input_tokens_seen": 3317170176, "step": 8436 }, { "epoch": 0.9571326835386529, "grad_norm": 0.5167604088783264, "learning_rate": 2.255639097744361e-06, "loss": 1.9835, "num_input_tokens_seen": 3319529472, "step": 8442 }, { "epoch": 0.9578129484167898, "grad_norm": 0.46328479051589966, "learning_rate": 2.2198353025420696e-06, "loss": 2.0001, "num_input_tokens_seen": 3321888768, "step": 8448 }, { "epoch": 0.9584932132949268, "grad_norm": 0.489848256111145, "learning_rate": 2.184031507339778e-06, "loss": 1.9874, "num_input_tokens_seen": 3324248064, "step": 8454 }, { "epoch": 0.9591734781730636, "grad_norm": 0.4731234312057495, "learning_rate": 2.1482277121374868e-06, "loss": 2.0577, "num_input_tokens_seen": 3326607360, "step": 8460 }, { "epoch": 0.9598537430512005, "grad_norm": 0.46996498107910156, "learning_rate": 2.1124239169351953e-06, "loss": 1.958, "num_input_tokens_seen": 3328966656, "step": 8466 }, { "epoch": 0.9605340079293375, "grad_norm": 0.4455466866493225, "learning_rate": 2.076620121732904e-06, "loss": 2.0721, "num_input_tokens_seen": 3331325952, "step": 8472 }, { "epoch": 0.9612142728074744, "grad_norm": 0.483164519071579, "learning_rate": 2.040816326530612e-06, "loss": 1.997, "num_input_tokens_seen": 3333685248, "step": 8478 }, { "epoch": 0.9618945376856113, "grad_norm": 0.46224308013916016, "learning_rate": 2.0050125313283207e-06, "loss": 1.9534, "num_input_tokens_seen": 3336044544, "step": 8484 }, { "epoch": 0.9625748025637483, "grad_norm": 0.5407201647758484, "learning_rate": 1.9692087361260292e-06, "loss": 2.0635, "num_input_tokens_seen": 3338403840, "step": 8490 }, { "epoch": 0.9632550674418852, "grad_norm": 0.49724259972572327, "learning_rate": 1.933404940923738e-06, "loss": 2.0165, "num_input_tokens_seen": 3340763136, "step": 8496 }, { "epoch": 0.9639353323200222, "grad_norm": 0.4704829156398773, "learning_rate": 1.8976011457214466e-06, "loss": 1.995, "num_input_tokens_seen": 3343122432, "step": 8502 }, { "epoch": 0.964615597198159, "grad_norm": 0.4864175319671631, "learning_rate": 1.8617973505191552e-06, "loss": 2.0195, "num_input_tokens_seen": 3345481728, "step": 8508 }, { "epoch": 0.9652958620762959, "grad_norm": 0.5042557120323181, "learning_rate": 1.8259935553168638e-06, "loss": 1.9928, "num_input_tokens_seen": 3347841024, "step": 8514 }, { "epoch": 0.9659761269544329, "grad_norm": 0.5622674822807312, "learning_rate": 1.7901897601145722e-06, "loss": 2.0166, "num_input_tokens_seen": 3350200320, "step": 8520 }, { "epoch": 0.9666563918325698, "grad_norm": 0.4886009395122528, "learning_rate": 1.7543859649122807e-06, "loss": 2.0512, "num_input_tokens_seen": 3352559616, "step": 8526 }, { "epoch": 0.9673366567107067, "grad_norm": 0.48748981952667236, "learning_rate": 1.7185821697099893e-06, "loss": 2.0336, "num_input_tokens_seen": 3354918912, "step": 8532 }, { "epoch": 0.9680169215888437, "grad_norm": 0.4899289608001709, "learning_rate": 1.682778374507698e-06, "loss": 2.0008, "num_input_tokens_seen": 3357278208, "step": 8538 }, { "epoch": 0.9686971864669806, "grad_norm": 0.465916246175766, "learning_rate": 1.6469745793054065e-06, "loss": 1.9239, "num_input_tokens_seen": 3359637504, "step": 8544 }, { "epoch": 0.9693774513451175, "grad_norm": 0.5022467374801636, "learning_rate": 1.611170784103115e-06, "loss": 2.0017, "num_input_tokens_seen": 3361996800, "step": 8550 }, { "epoch": 0.9700577162232544, "grad_norm": 0.47663870453834534, "learning_rate": 1.5753669889008237e-06, "loss": 1.9944, "num_input_tokens_seen": 3364356096, "step": 8556 }, { "epoch": 0.9707379811013913, "grad_norm": 0.48725882172584534, "learning_rate": 1.5395631936985322e-06, "loss": 1.9945, "num_input_tokens_seen": 3366715392, "step": 8562 }, { "epoch": 0.9714182459795283, "grad_norm": 0.47763851284980774, "learning_rate": 1.5037593984962406e-06, "loss": 2.0526, "num_input_tokens_seen": 3369074688, "step": 8568 }, { "epoch": 0.9720985108576652, "grad_norm": 0.4931076467037201, "learning_rate": 1.4679556032939492e-06, "loss": 1.9821, "num_input_tokens_seen": 3371433984, "step": 8574 }, { "epoch": 0.9727787757358022, "grad_norm": 0.4717552363872528, "learning_rate": 1.4321518080916578e-06, "loss": 1.9392, "num_input_tokens_seen": 3373793280, "step": 8580 }, { "epoch": 0.9734590406139391, "grad_norm": 0.46910974383354187, "learning_rate": 1.3963480128893664e-06, "loss": 2.061, "num_input_tokens_seen": 3376152576, "step": 8586 }, { "epoch": 0.974139305492076, "grad_norm": 0.4669153690338135, "learning_rate": 1.360544217687075e-06, "loss": 2.0235, "num_input_tokens_seen": 3378511872, "step": 8592 }, { "epoch": 0.9748195703702129, "grad_norm": 0.49458202719688416, "learning_rate": 1.3247404224847835e-06, "loss": 1.9982, "num_input_tokens_seen": 3380871168, "step": 8598 }, { "epoch": 0.9754998352483498, "grad_norm": 0.4920654892921448, "learning_rate": 1.288936627282492e-06, "loss": 2.0098, "num_input_tokens_seen": 3383230464, "step": 8604 }, { "epoch": 0.9761801001264867, "grad_norm": 0.46870675683021545, "learning_rate": 1.2531328320802005e-06, "loss": 2.0151, "num_input_tokens_seen": 3385589760, "step": 8610 }, { "epoch": 0.9768603650046237, "grad_norm": 0.4873650372028351, "learning_rate": 1.217329036877909e-06, "loss": 2.0151, "num_input_tokens_seen": 3387949056, "step": 8616 }, { "epoch": 0.9775406298827606, "grad_norm": 0.4861888587474823, "learning_rate": 1.1815252416756176e-06, "loss": 2.0329, "num_input_tokens_seen": 3390308352, "step": 8622 }, { "epoch": 0.9782208947608976, "grad_norm": 0.48227134346961975, "learning_rate": 1.1457214464733262e-06, "loss": 2.049, "num_input_tokens_seen": 3392667648, "step": 8628 }, { "epoch": 0.9789011596390345, "grad_norm": 0.5111281871795654, "learning_rate": 1.1099176512710348e-06, "loss": 2.0235, "num_input_tokens_seen": 3395026944, "step": 8634 }, { "epoch": 0.9795814245171713, "grad_norm": 0.4849562644958496, "learning_rate": 1.0741138560687434e-06, "loss": 2.043, "num_input_tokens_seen": 3397386240, "step": 8640 }, { "epoch": 0.9802616893953083, "grad_norm": 0.461967408657074, "learning_rate": 1.038310060866452e-06, "loss": 2.0346, "num_input_tokens_seen": 3399745536, "step": 8646 }, { "epoch": 0.9809419542734452, "grad_norm": 0.5269701480865479, "learning_rate": 1.0025062656641603e-06, "loss": 1.9499, "num_input_tokens_seen": 3402104832, "step": 8652 }, { "epoch": 0.9816222191515821, "grad_norm": 0.4462730586528778, "learning_rate": 9.66702470461869e-07, "loss": 1.9948, "num_input_tokens_seen": 3404464128, "step": 8658 }, { "epoch": 0.9823024840297191, "grad_norm": 0.459370493888855, "learning_rate": 9.308986752595776e-07, "loss": 2.0511, "num_input_tokens_seen": 3406823424, "step": 8664 }, { "epoch": 0.982982748907856, "grad_norm": 0.49241387844085693, "learning_rate": 8.950948800572861e-07, "loss": 1.994, "num_input_tokens_seen": 3409182720, "step": 8670 }, { "epoch": 0.983663013785993, "grad_norm": 0.4557535648345947, "learning_rate": 8.592910848549947e-07, "loss": 2.0506, "num_input_tokens_seen": 3411542016, "step": 8676 }, { "epoch": 0.9843432786641299, "grad_norm": 0.48074275255203247, "learning_rate": 8.234872896527032e-07, "loss": 1.946, "num_input_tokens_seen": 3413901312, "step": 8682 }, { "epoch": 0.9850235435422667, "grad_norm": 0.4807458221912384, "learning_rate": 7.876834944504118e-07, "loss": 1.9977, "num_input_tokens_seen": 3416260608, "step": 8688 }, { "epoch": 0.9857038084204037, "grad_norm": 0.4791664183139801, "learning_rate": 7.518796992481203e-07, "loss": 1.9519, "num_input_tokens_seen": 3418619904, "step": 8694 }, { "epoch": 0.9863840732985406, "grad_norm": 0.45667803287506104, "learning_rate": 7.160759040458289e-07, "loss": 2.0047, "num_input_tokens_seen": 3420979200, "step": 8700 }, { "epoch": 0.9870643381766775, "grad_norm": 0.456066370010376, "learning_rate": 6.802721088435375e-07, "loss": 2.023, "num_input_tokens_seen": 3423338496, "step": 8706 }, { "epoch": 0.9877446030548145, "grad_norm": 0.49028295278549194, "learning_rate": 6.44468313641246e-07, "loss": 2.0506, "num_input_tokens_seen": 3425697792, "step": 8712 }, { "epoch": 0.9884248679329514, "grad_norm": 0.4853602647781372, "learning_rate": 6.086645184389545e-07, "loss": 2.0127, "num_input_tokens_seen": 3428057088, "step": 8718 }, { "epoch": 0.9891051328110884, "grad_norm": 0.4771934449672699, "learning_rate": 5.728607232366631e-07, "loss": 2.0125, "num_input_tokens_seen": 3430416384, "step": 8724 }, { "epoch": 0.9897853976892252, "grad_norm": 0.47227999567985535, "learning_rate": 5.370569280343717e-07, "loss": 1.9608, "num_input_tokens_seen": 3432775680, "step": 8730 }, { "epoch": 0.9904656625673621, "grad_norm": 0.5044968128204346, "learning_rate": 5.012531328320802e-07, "loss": 2.0495, "num_input_tokens_seen": 3435134976, "step": 8736 }, { "epoch": 0.9911459274454991, "grad_norm": 0.47824859619140625, "learning_rate": 4.654493376297888e-07, "loss": 2.0122, "num_input_tokens_seen": 3437494272, "step": 8742 }, { "epoch": 0.991826192323636, "grad_norm": 0.4707111120223999, "learning_rate": 4.2964554242749733e-07, "loss": 2.0036, "num_input_tokens_seen": 3439853568, "step": 8748 }, { "epoch": 0.992506457201773, "grad_norm": 0.47407931089401245, "learning_rate": 3.938417472252059e-07, "loss": 2.0537, "num_input_tokens_seen": 3442212864, "step": 8754 }, { "epoch": 0.9931867220799099, "grad_norm": 0.46975448727607727, "learning_rate": 3.5803795202291444e-07, "loss": 1.9721, "num_input_tokens_seen": 3444572160, "step": 8760 }, { "epoch": 0.9938669869580468, "grad_norm": 0.46711423993110657, "learning_rate": 3.22234156820623e-07, "loss": 2.0284, "num_input_tokens_seen": 3446931456, "step": 8766 }, { "epoch": 0.9945472518361838, "grad_norm": 0.4574568271636963, "learning_rate": 2.8643036161833155e-07, "loss": 1.9764, "num_input_tokens_seen": 3449290752, "step": 8772 }, { "epoch": 0.9952275167143206, "grad_norm": 0.4745030105113983, "learning_rate": 2.506265664160401e-07, "loss": 2.0551, "num_input_tokens_seen": 3451650048, "step": 8778 }, { "epoch": 0.9959077815924575, "grad_norm": 0.48132795095443726, "learning_rate": 2.1482277121374867e-07, "loss": 1.9993, "num_input_tokens_seen": 3454009344, "step": 8784 }, { "epoch": 0.9965880464705945, "grad_norm": 0.4754565954208374, "learning_rate": 1.7901897601145722e-07, "loss": 2.0276, "num_input_tokens_seen": 3456368640, "step": 8790 }, { "epoch": 0.9972683113487314, "grad_norm": 0.4709925949573517, "learning_rate": 1.4321518080916578e-07, "loss": 1.9961, "num_input_tokens_seen": 3458727936, "step": 8796 }, { "epoch": 0.9977218212674893, "eval_accuracy": 0.5884273504273504, "eval_loss": 1.992612361907959, "eval_runtime": 129.0731, "eval_samples_per_second": 3.099, "eval_steps_per_second": 1.038, "num_input_tokens_seen": 3460300800, "step": 8800 }, { "epoch": 0.9979485762268684, "grad_norm": 0.4582703709602356, "learning_rate": 1.0741138560687433e-07, "loss": 2.0105, "num_input_tokens_seen": 3461087232, "step": 8802 }, { "epoch": 0.9986288411050053, "grad_norm": 0.4576333463191986, "learning_rate": 7.160759040458289e-08, "loss": 1.974, "num_input_tokens_seen": 3463446528, "step": 8808 }, { "epoch": 0.9993091059831422, "grad_norm": 0.46073076128959656, "learning_rate": 3.5803795202291444e-08, "loss": 2.0213, "num_input_tokens_seen": 3465805824, "step": 8814 }, { "epoch": 0.9999893708612791, "grad_norm": 0.48334258794784546, "learning_rate": 0.0, "loss": 1.9662, "num_input_tokens_seen": 3468165120, "step": 8820 }, { "epoch": 0.9999893708612791, "num_input_tokens_seen": 3468165120, "step": 8820, "total_flos": 4.540784328132526e+18, "train_loss": 2.065564124978859, "train_runtime": 98524.0638, "train_samples_per_second": 8.594, "train_steps_per_second": 0.09 } ], "logging_steps": 6, "max_steps": 8820, "num_input_tokens_seen": 3468165120, "num_train_epochs": 1, "save_steps": 200, "total_flos": 4.540784328132526e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }