{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 2000, "global_step": 20638, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00048454307587944567, "grad_norm": 0.4255552887916565, "learning_rate": 8.064516129032258e-07, "loss": 1.221, "num_input_tokens_seen": 392540, "step": 10 }, { "epoch": 0.0009690861517588913, "grad_norm": 0.4066425859928131, "learning_rate": 1.6129032258064516e-06, "loss": 1.2327, "num_input_tokens_seen": 788876, "step": 20 }, { "epoch": 0.0014536292276383371, "grad_norm": 0.37268057465553284, "learning_rate": 2.4193548387096776e-06, "loss": 1.2174, "num_input_tokens_seen": 1187392, "step": 30 }, { "epoch": 0.0019381723035177827, "grad_norm": 0.38770782947540283, "learning_rate": 3.225806451612903e-06, "loss": 1.1503, "num_input_tokens_seen": 1556468, "step": 40 }, { "epoch": 0.0024227153793972282, "grad_norm": 0.40057116746902466, "learning_rate": 4.032258064516129e-06, "loss": 1.2156, "num_input_tokens_seen": 1950296, "step": 50 }, { "epoch": 0.0029072584552766742, "grad_norm": 0.4051766097545624, "learning_rate": 4.838709677419355e-06, "loss": 1.2149, "num_input_tokens_seen": 2336152, "step": 60 }, { "epoch": 0.00339180153115612, "grad_norm": 0.41299352049827576, "learning_rate": 5.64516129032258e-06, "loss": 1.19, "num_input_tokens_seen": 2728108, "step": 70 }, { "epoch": 0.0038763446070355654, "grad_norm": 0.4118805527687073, "learning_rate": 6.451612903225806e-06, "loss": 1.1994, "num_input_tokens_seen": 3111872, "step": 80 }, { "epoch": 0.004360887682915011, "grad_norm": 0.42130813002586365, "learning_rate": 7.258064516129033e-06, "loss": 1.2706, "num_input_tokens_seen": 3506908, "step": 90 }, { "epoch": 0.0048454307587944565, "grad_norm": 0.3949407637119293, "learning_rate": 8.064516129032258e-06, "loss": 1.187, "num_input_tokens_seen": 3920340, "step": 100 }, { "epoch": 0.005329973834673903, "grad_norm": 0.42650577425956726, "learning_rate": 8.870967741935484e-06, "loss": 1.2195, "num_input_tokens_seen": 4303752, "step": 110 }, { "epoch": 0.0058145169105533485, "grad_norm": 0.43526706099510193, "learning_rate": 9.67741935483871e-06, "loss": 1.2141, "num_input_tokens_seen": 4684992, "step": 120 }, { "epoch": 0.006299059986432794, "grad_norm": 0.4030572474002838, "learning_rate": 1.0483870967741936e-05, "loss": 1.2873, "num_input_tokens_seen": 5052616, "step": 130 }, { "epoch": 0.00678360306231224, "grad_norm": 0.41993653774261475, "learning_rate": 1.129032258064516e-05, "loss": 1.2175, "num_input_tokens_seen": 5427828, "step": 140 }, { "epoch": 0.007268146138191685, "grad_norm": 0.4032376706600189, "learning_rate": 1.2096774193548388e-05, "loss": 1.2747, "num_input_tokens_seen": 5819148, "step": 150 }, { "epoch": 0.007752689214071131, "grad_norm": 0.4497855603694916, "learning_rate": 1.2903225806451613e-05, "loss": 1.2422, "num_input_tokens_seen": 6198028, "step": 160 }, { "epoch": 0.008237232289950576, "grad_norm": 0.4343348741531372, "learning_rate": 1.3709677419354839e-05, "loss": 1.2621, "num_input_tokens_seen": 6604716, "step": 170 }, { "epoch": 0.008721775365830022, "grad_norm": 0.4532233774662018, "learning_rate": 1.4516129032258066e-05, "loss": 1.2263, "num_input_tokens_seen": 6995768, "step": 180 }, { "epoch": 0.009206318441709467, "grad_norm": 0.4360114634037018, "learning_rate": 1.5322580645161292e-05, "loss": 1.2022, "num_input_tokens_seen": 7395348, "step": 190 }, { "epoch": 0.009690861517588913, "grad_norm": 0.4088006913661957, "learning_rate": 1.6129032258064517e-05, "loss": 1.2199, "num_input_tokens_seen": 7783256, "step": 200 }, { "epoch": 0.010175404593468359, "grad_norm": 0.4394921064376831, "learning_rate": 1.693548387096774e-05, "loss": 1.2227, "num_input_tokens_seen": 8183636, "step": 210 }, { "epoch": 0.010659947669347806, "grad_norm": 0.4413774311542511, "learning_rate": 1.774193548387097e-05, "loss": 1.2134, "num_input_tokens_seen": 8541160, "step": 220 }, { "epoch": 0.011144490745227251, "grad_norm": 0.4048616588115692, "learning_rate": 1.8548387096774193e-05, "loss": 1.1969, "num_input_tokens_seen": 8913056, "step": 230 }, { "epoch": 0.011629033821106697, "grad_norm": 0.4245803654193878, "learning_rate": 1.935483870967742e-05, "loss": 1.2161, "num_input_tokens_seen": 9304116, "step": 240 }, { "epoch": 0.012113576896986143, "grad_norm": 0.4485531151294708, "learning_rate": 2.0161290322580645e-05, "loss": 1.1966, "num_input_tokens_seen": 9690224, "step": 250 }, { "epoch": 0.012598119972865588, "grad_norm": 0.44514381885528564, "learning_rate": 2.0967741935483873e-05, "loss": 1.2736, "num_input_tokens_seen": 10093420, "step": 260 }, { "epoch": 0.013082663048745034, "grad_norm": 0.47046637535095215, "learning_rate": 2.1774193548387097e-05, "loss": 1.2217, "num_input_tokens_seen": 10480092, "step": 270 }, { "epoch": 0.01356720612462448, "grad_norm": 0.4320948123931885, "learning_rate": 2.258064516129032e-05, "loss": 1.2185, "num_input_tokens_seen": 10856260, "step": 280 }, { "epoch": 0.014051749200503925, "grad_norm": 0.418040931224823, "learning_rate": 2.338709677419355e-05, "loss": 1.2504, "num_input_tokens_seen": 11231836, "step": 290 }, { "epoch": 0.01453629227638337, "grad_norm": 0.4360697865486145, "learning_rate": 2.4193548387096777e-05, "loss": 1.2199, "num_input_tokens_seen": 11622080, "step": 300 }, { "epoch": 0.015020835352262816, "grad_norm": 0.4039739668369293, "learning_rate": 2.5e-05, "loss": 1.1947, "num_input_tokens_seen": 12009244, "step": 310 }, { "epoch": 0.015505378428142261, "grad_norm": 0.4114651381969452, "learning_rate": 2.5806451612903226e-05, "loss": 1.2345, "num_input_tokens_seen": 12404068, "step": 320 }, { "epoch": 0.015989921504021707, "grad_norm": 0.44177761673927307, "learning_rate": 2.661290322580645e-05, "loss": 1.2303, "num_input_tokens_seen": 12774432, "step": 330 }, { "epoch": 0.016474464579901153, "grad_norm": 0.4257271885871887, "learning_rate": 2.7419354838709678e-05, "loss": 1.2513, "num_input_tokens_seen": 13155544, "step": 340 }, { "epoch": 0.016959007655780598, "grad_norm": 0.4340864419937134, "learning_rate": 2.822580645161291e-05, "loss": 1.2486, "num_input_tokens_seen": 13554616, "step": 350 }, { "epoch": 0.017443550731660044, "grad_norm": 0.4148235023021698, "learning_rate": 2.9032258064516133e-05, "loss": 1.2193, "num_input_tokens_seen": 13937664, "step": 360 }, { "epoch": 0.01792809380753949, "grad_norm": 0.417041152715683, "learning_rate": 2.9838709677419357e-05, "loss": 1.2142, "num_input_tokens_seen": 14331280, "step": 370 }, { "epoch": 0.018412636883418935, "grad_norm": 0.39417609572410583, "learning_rate": 3.0645161290322585e-05, "loss": 1.2865, "num_input_tokens_seen": 14707672, "step": 380 }, { "epoch": 0.01889717995929838, "grad_norm": 0.4522784948348999, "learning_rate": 3.1451612903225806e-05, "loss": 1.2437, "num_input_tokens_seen": 15106840, "step": 390 }, { "epoch": 0.019381723035177826, "grad_norm": 0.43782681226730347, "learning_rate": 3.2258064516129034e-05, "loss": 1.2215, "num_input_tokens_seen": 15480172, "step": 400 }, { "epoch": 0.01986626611105727, "grad_norm": 0.4035712480545044, "learning_rate": 3.306451612903226e-05, "loss": 1.2133, "num_input_tokens_seen": 15879184, "step": 410 }, { "epoch": 0.020350809186936717, "grad_norm": 0.4507812261581421, "learning_rate": 3.387096774193548e-05, "loss": 1.2469, "num_input_tokens_seen": 16282872, "step": 420 }, { "epoch": 0.020835352262816166, "grad_norm": 0.41557714343070984, "learning_rate": 3.467741935483872e-05, "loss": 1.2694, "num_input_tokens_seen": 16649572, "step": 430 }, { "epoch": 0.02131989533869561, "grad_norm": 0.43545469641685486, "learning_rate": 3.548387096774194e-05, "loss": 1.2469, "num_input_tokens_seen": 17047428, "step": 440 }, { "epoch": 0.021804438414575057, "grad_norm": 0.46730995178222656, "learning_rate": 3.6290322580645165e-05, "loss": 1.2082, "num_input_tokens_seen": 17447240, "step": 450 }, { "epoch": 0.022288981490454503, "grad_norm": 0.45304545760154724, "learning_rate": 3.7096774193548386e-05, "loss": 1.1941, "num_input_tokens_seen": 17831524, "step": 460 }, { "epoch": 0.02277352456633395, "grad_norm": 0.44148966670036316, "learning_rate": 3.7903225806451614e-05, "loss": 1.2507, "num_input_tokens_seen": 18226524, "step": 470 }, { "epoch": 0.023258067642213394, "grad_norm": 0.46780791878700256, "learning_rate": 3.870967741935484e-05, "loss": 1.2136, "num_input_tokens_seen": 18621280, "step": 480 }, { "epoch": 0.02374261071809284, "grad_norm": 0.40209197998046875, "learning_rate": 3.951612903225806e-05, "loss": 1.3024, "num_input_tokens_seen": 19023172, "step": 490 }, { "epoch": 0.024227153793972285, "grad_norm": 0.45693570375442505, "learning_rate": 4.032258064516129e-05, "loss": 1.2317, "num_input_tokens_seen": 19384076, "step": 500 }, { "epoch": 0.02471169686985173, "grad_norm": 0.4480474293231964, "learning_rate": 4.112903225806452e-05, "loss": 1.244, "num_input_tokens_seen": 19772672, "step": 510 }, { "epoch": 0.025196239945731176, "grad_norm": 0.3898276686668396, "learning_rate": 4.1935483870967746e-05, "loss": 1.2248, "num_input_tokens_seen": 20147580, "step": 520 }, { "epoch": 0.02568078302161062, "grad_norm": 0.45244449377059937, "learning_rate": 4.2741935483870973e-05, "loss": 1.2479, "num_input_tokens_seen": 20538480, "step": 530 }, { "epoch": 0.026165326097490067, "grad_norm": 0.4444933235645294, "learning_rate": 4.3548387096774194e-05, "loss": 1.2349, "num_input_tokens_seen": 20938472, "step": 540 }, { "epoch": 0.026649869173369513, "grad_norm": 0.4245850741863251, "learning_rate": 4.435483870967742e-05, "loss": 1.2061, "num_input_tokens_seen": 21323560, "step": 550 }, { "epoch": 0.02713441224924896, "grad_norm": 0.4357333481311798, "learning_rate": 4.516129032258064e-05, "loss": 1.2532, "num_input_tokens_seen": 21725356, "step": 560 }, { "epoch": 0.027618955325128404, "grad_norm": 0.4161827564239502, "learning_rate": 4.596774193548387e-05, "loss": 1.241, "num_input_tokens_seen": 22107816, "step": 570 }, { "epoch": 0.02810349840100785, "grad_norm": 0.44058066606521606, "learning_rate": 4.67741935483871e-05, "loss": 1.284, "num_input_tokens_seen": 22490484, "step": 580 }, { "epoch": 0.028588041476887295, "grad_norm": 0.4422559142112732, "learning_rate": 4.7580645161290326e-05, "loss": 1.2633, "num_input_tokens_seen": 22890776, "step": 590 }, { "epoch": 0.02907258455276674, "grad_norm": 0.4280403256416321, "learning_rate": 4.8387096774193554e-05, "loss": 1.2649, "num_input_tokens_seen": 23274324, "step": 600 }, { "epoch": 0.029557127628646186, "grad_norm": 0.4213055968284607, "learning_rate": 4.9193548387096775e-05, "loss": 1.2544, "num_input_tokens_seen": 23668904, "step": 610 }, { "epoch": 0.030041670704525632, "grad_norm": 0.4733617901802063, "learning_rate": 5e-05, "loss": 1.2238, "num_input_tokens_seen": 24062116, "step": 620 }, { "epoch": 0.030526213780405077, "grad_norm": 0.4112738370895386, "learning_rate": 4.999996921293424e-05, "loss": 1.2544, "num_input_tokens_seen": 24430924, "step": 630 }, { "epoch": 0.031010756856284523, "grad_norm": 0.3989699184894562, "learning_rate": 4.999987685181276e-05, "loss": 1.2679, "num_input_tokens_seen": 24803116, "step": 640 }, { "epoch": 0.03149529993216397, "grad_norm": 0.46107640862464905, "learning_rate": 4.9999722916863064e-05, "loss": 1.2498, "num_input_tokens_seen": 25174648, "step": 650 }, { "epoch": 0.031979843008043414, "grad_norm": 0.4159105122089386, "learning_rate": 4.999950740846427e-05, "loss": 1.2148, "num_input_tokens_seen": 25572492, "step": 660 }, { "epoch": 0.03246438608392286, "grad_norm": 0.41995078325271606, "learning_rate": 4.9999230327147187e-05, "loss": 1.2921, "num_input_tokens_seen": 25970816, "step": 670 }, { "epoch": 0.032948929159802305, "grad_norm": 0.4316340386867523, "learning_rate": 4.999889167359425e-05, "loss": 1.2401, "num_input_tokens_seen": 26346612, "step": 680 }, { "epoch": 0.03343347223568175, "grad_norm": 0.4364665448665619, "learning_rate": 4.999849144863954e-05, "loss": 1.228, "num_input_tokens_seen": 26750808, "step": 690 }, { "epoch": 0.033918015311561196, "grad_norm": 0.418192595243454, "learning_rate": 4.9998029653268805e-05, "loss": 1.2565, "num_input_tokens_seen": 27144672, "step": 700 }, { "epoch": 0.03440255838744064, "grad_norm": 0.42366668581962585, "learning_rate": 4.9997506288619436e-05, "loss": 1.3009, "num_input_tokens_seen": 27537568, "step": 710 }, { "epoch": 0.03488710146332009, "grad_norm": 0.40567639470100403, "learning_rate": 4.9996921355980456e-05, "loss": 1.1824, "num_input_tokens_seen": 27934864, "step": 720 }, { "epoch": 0.03537164453919953, "grad_norm": 0.4429372549057007, "learning_rate": 4.999627485679254e-05, "loss": 1.2941, "num_input_tokens_seen": 28315316, "step": 730 }, { "epoch": 0.03585618761507898, "grad_norm": 0.4326581358909607, "learning_rate": 4.999556679264798e-05, "loss": 1.2178, "num_input_tokens_seen": 28695628, "step": 740 }, { "epoch": 0.036340730690958424, "grad_norm": 0.39658111333847046, "learning_rate": 4.9994797165290724e-05, "loss": 1.2194, "num_input_tokens_seen": 29075004, "step": 750 }, { "epoch": 0.03682527376683787, "grad_norm": 0.42921292781829834, "learning_rate": 4.999396597661634e-05, "loss": 1.262, "num_input_tokens_seen": 29452648, "step": 760 }, { "epoch": 0.037309816842717315, "grad_norm": 0.42564624547958374, "learning_rate": 4.999307322867201e-05, "loss": 1.2686, "num_input_tokens_seen": 29806040, "step": 770 }, { "epoch": 0.03779435991859676, "grad_norm": 0.42206162214279175, "learning_rate": 4.9992118923656525e-05, "loss": 1.2644, "num_input_tokens_seen": 30187964, "step": 780 }, { "epoch": 0.038278902994476206, "grad_norm": 0.40607237815856934, "learning_rate": 4.999110306392034e-05, "loss": 1.2095, "num_input_tokens_seen": 30595388, "step": 790 }, { "epoch": 0.03876344607035565, "grad_norm": 0.4239601492881775, "learning_rate": 4.999002565196546e-05, "loss": 1.2785, "num_input_tokens_seen": 30969404, "step": 800 }, { "epoch": 0.0392479891462351, "grad_norm": 0.37221038341522217, "learning_rate": 4.9988886690445524e-05, "loss": 1.2645, "num_input_tokens_seen": 31352172, "step": 810 }, { "epoch": 0.03973253222211454, "grad_norm": 0.42992502450942993, "learning_rate": 4.998768618216575e-05, "loss": 1.2646, "num_input_tokens_seen": 31707436, "step": 820 }, { "epoch": 0.04021707529799399, "grad_norm": 0.40228039026260376, "learning_rate": 4.998642413008294e-05, "loss": 1.2796, "num_input_tokens_seen": 32113760, "step": 830 }, { "epoch": 0.040701618373873434, "grad_norm": 0.4015443027019501, "learning_rate": 4.9985100537305494e-05, "loss": 1.271, "num_input_tokens_seen": 32495944, "step": 840 }, { "epoch": 0.04118616144975288, "grad_norm": 0.4232306480407715, "learning_rate": 4.998371540709338e-05, "loss": 1.2403, "num_input_tokens_seen": 32878288, "step": 850 }, { "epoch": 0.04167070452563233, "grad_norm": 0.4329904317855835, "learning_rate": 4.998226874285811e-05, "loss": 1.2433, "num_input_tokens_seen": 33283124, "step": 860 }, { "epoch": 0.04215524760151178, "grad_norm": 0.3939256966114044, "learning_rate": 4.998076054816278e-05, "loss": 1.2513, "num_input_tokens_seen": 33675788, "step": 870 }, { "epoch": 0.04263979067739122, "grad_norm": 0.40896937251091003, "learning_rate": 4.997919082672201e-05, "loss": 1.2058, "num_input_tokens_seen": 34078876, "step": 880 }, { "epoch": 0.04312433375327067, "grad_norm": 0.41988757252693176, "learning_rate": 4.997755958240198e-05, "loss": 1.2704, "num_input_tokens_seen": 34476892, "step": 890 }, { "epoch": 0.043608876829150114, "grad_norm": 0.3970952332019806, "learning_rate": 4.997586681922039e-05, "loss": 1.2592, "num_input_tokens_seen": 34879628, "step": 900 }, { "epoch": 0.04409341990502956, "grad_norm": 0.3762570917606354, "learning_rate": 4.997411254134645e-05, "loss": 1.2744, "num_input_tokens_seen": 35263368, "step": 910 }, { "epoch": 0.044577962980909006, "grad_norm": 0.44425544142723083, "learning_rate": 4.9972296753100875e-05, "loss": 1.27, "num_input_tokens_seen": 35669980, "step": 920 }, { "epoch": 0.04506250605678845, "grad_norm": 0.3981640636920929, "learning_rate": 4.9970419458955916e-05, "loss": 1.2627, "num_input_tokens_seen": 36075236, "step": 930 }, { "epoch": 0.0455470491326679, "grad_norm": 0.42503446340560913, "learning_rate": 4.996848066353526e-05, "loss": 1.2742, "num_input_tokens_seen": 36460584, "step": 940 }, { "epoch": 0.04603159220854734, "grad_norm": 0.44177576899528503, "learning_rate": 4.99664803716141e-05, "loss": 1.2338, "num_input_tokens_seen": 36844140, "step": 950 }, { "epoch": 0.04651613528442679, "grad_norm": 0.4258316159248352, "learning_rate": 4.996441858811909e-05, "loss": 1.251, "num_input_tokens_seen": 37245224, "step": 960 }, { "epoch": 0.04700067836030623, "grad_norm": 0.39270251989364624, "learning_rate": 4.996229531812833e-05, "loss": 1.2682, "num_input_tokens_seen": 37623120, "step": 970 }, { "epoch": 0.04748522143618568, "grad_norm": 0.40389484167099, "learning_rate": 4.996011056687135e-05, "loss": 1.2599, "num_input_tokens_seen": 37992076, "step": 980 }, { "epoch": 0.047969764512065124, "grad_norm": 0.4183795154094696, "learning_rate": 4.9957864339729126e-05, "loss": 1.2647, "num_input_tokens_seen": 38392008, "step": 990 }, { "epoch": 0.04845430758794457, "grad_norm": 0.3972471356391907, "learning_rate": 4.9955556642234034e-05, "loss": 1.2387, "num_input_tokens_seen": 38802860, "step": 1000 }, { "epoch": 0.048938850663824016, "grad_norm": 0.3918885886669159, "learning_rate": 4.9953187480069854e-05, "loss": 1.2611, "num_input_tokens_seen": 39175600, "step": 1010 }, { "epoch": 0.04942339373970346, "grad_norm": 0.4439409673213959, "learning_rate": 4.9950756859071755e-05, "loss": 1.1817, "num_input_tokens_seen": 39586740, "step": 1020 }, { "epoch": 0.04990793681558291, "grad_norm": 0.43080589175224304, "learning_rate": 4.994826478522626e-05, "loss": 1.2453, "num_input_tokens_seen": 39993292, "step": 1030 }, { "epoch": 0.05039247989146235, "grad_norm": 0.41559743881225586, "learning_rate": 4.9945711264671276e-05, "loss": 1.2635, "num_input_tokens_seen": 40412108, "step": 1040 }, { "epoch": 0.0508770229673418, "grad_norm": 0.4444926381111145, "learning_rate": 4.994309630369602e-05, "loss": 1.2742, "num_input_tokens_seen": 40815268, "step": 1050 }, { "epoch": 0.05136156604322124, "grad_norm": 0.43432703614234924, "learning_rate": 4.9940419908741065e-05, "loss": 1.2782, "num_input_tokens_seen": 41228008, "step": 1060 }, { "epoch": 0.05184610911910069, "grad_norm": 0.46701622009277344, "learning_rate": 4.993768208639826e-05, "loss": 1.1921, "num_input_tokens_seen": 41641820, "step": 1070 }, { "epoch": 0.052330652194980135, "grad_norm": 0.42707207798957825, "learning_rate": 4.993488284341078e-05, "loss": 1.221, "num_input_tokens_seen": 42035360, "step": 1080 }, { "epoch": 0.05281519527085958, "grad_norm": 0.4334350824356079, "learning_rate": 4.993202218667307e-05, "loss": 1.2273, "num_input_tokens_seen": 42437020, "step": 1090 }, { "epoch": 0.053299738346739026, "grad_norm": 0.3986518681049347, "learning_rate": 4.99291001232308e-05, "loss": 1.2333, "num_input_tokens_seen": 42804100, "step": 1100 }, { "epoch": 0.05378428142261847, "grad_norm": 0.4267319142818451, "learning_rate": 4.992611666028094e-05, "loss": 1.2582, "num_input_tokens_seen": 43211524, "step": 1110 }, { "epoch": 0.05426882449849792, "grad_norm": 0.4689897894859314, "learning_rate": 4.992307180517165e-05, "loss": 1.2354, "num_input_tokens_seen": 43626756, "step": 1120 }, { "epoch": 0.05475336757437736, "grad_norm": 0.45038825273513794, "learning_rate": 4.991996556540229e-05, "loss": 1.2588, "num_input_tokens_seen": 44045992, "step": 1130 }, { "epoch": 0.05523791065025681, "grad_norm": 0.4214176535606384, "learning_rate": 4.991679794862343e-05, "loss": 1.1964, "num_input_tokens_seen": 44421180, "step": 1140 }, { "epoch": 0.055722453726136253, "grad_norm": 0.41900694370269775, "learning_rate": 4.99135689626368e-05, "loss": 1.2565, "num_input_tokens_seen": 44831096, "step": 1150 }, { "epoch": 0.0562069968020157, "grad_norm": 0.39536434412002563, "learning_rate": 4.9910278615395276e-05, "loss": 1.2181, "num_input_tokens_seen": 45272232, "step": 1160 }, { "epoch": 0.056691539877895145, "grad_norm": 0.4110620319843292, "learning_rate": 4.9906926915002875e-05, "loss": 1.2794, "num_input_tokens_seen": 45672992, "step": 1170 }, { "epoch": 0.05717608295377459, "grad_norm": 0.4034215211868286, "learning_rate": 4.9903513869714704e-05, "loss": 1.2561, "num_input_tokens_seen": 46085848, "step": 1180 }, { "epoch": 0.057660626029654036, "grad_norm": 0.456302672624588, "learning_rate": 4.990003948793699e-05, "loss": 1.258, "num_input_tokens_seen": 46486968, "step": 1190 }, { "epoch": 0.05814516910553348, "grad_norm": 0.4716334044933319, "learning_rate": 4.989650377822702e-05, "loss": 1.2495, "num_input_tokens_seen": 46892488, "step": 1200 }, { "epoch": 0.05862971218141293, "grad_norm": 0.4383469820022583, "learning_rate": 4.98929067492931e-05, "loss": 1.2278, "num_input_tokens_seen": 47271524, "step": 1210 }, { "epoch": 0.05911425525729237, "grad_norm": 0.4059995710849762, "learning_rate": 4.988924840999462e-05, "loss": 1.2907, "num_input_tokens_seen": 47666308, "step": 1220 }, { "epoch": 0.05959879833317182, "grad_norm": 0.42955759167671204, "learning_rate": 4.9885528769341905e-05, "loss": 1.2455, "num_input_tokens_seen": 48036456, "step": 1230 }, { "epoch": 0.060083341409051264, "grad_norm": 0.43056222796440125, "learning_rate": 4.988174783649633e-05, "loss": 1.2205, "num_input_tokens_seen": 48426548, "step": 1240 }, { "epoch": 0.06056788448493071, "grad_norm": 0.408191442489624, "learning_rate": 4.987790562077019e-05, "loss": 1.2517, "num_input_tokens_seen": 48803056, "step": 1250 }, { "epoch": 0.061052427560810155, "grad_norm": 0.3784200847148895, "learning_rate": 4.987400213162673e-05, "loss": 1.2009, "num_input_tokens_seen": 49204892, "step": 1260 }, { "epoch": 0.0615369706366896, "grad_norm": 0.4229642450809479, "learning_rate": 4.987003737868011e-05, "loss": 1.2979, "num_input_tokens_seen": 49597520, "step": 1270 }, { "epoch": 0.062021513712569046, "grad_norm": 0.4075496792793274, "learning_rate": 4.9866011371695374e-05, "loss": 1.2041, "num_input_tokens_seen": 49973816, "step": 1280 }, { "epoch": 0.06250605678844849, "grad_norm": 0.4234859049320221, "learning_rate": 4.9861924120588445e-05, "loss": 1.2489, "num_input_tokens_seen": 50382180, "step": 1290 }, { "epoch": 0.06299059986432794, "grad_norm": 0.4083632528781891, "learning_rate": 4.985777563542607e-05, "loss": 1.2459, "num_input_tokens_seen": 50746876, "step": 1300 }, { "epoch": 0.06347514294020738, "grad_norm": 0.4187311828136444, "learning_rate": 4.985356592642584e-05, "loss": 1.2366, "num_input_tokens_seen": 51150280, "step": 1310 }, { "epoch": 0.06395968601608683, "grad_norm": 0.4542880952358246, "learning_rate": 4.984929500395611e-05, "loss": 1.2511, "num_input_tokens_seen": 51527856, "step": 1320 }, { "epoch": 0.06444422909196627, "grad_norm": 0.40190425515174866, "learning_rate": 4.9844962878536004e-05, "loss": 1.2314, "num_input_tokens_seen": 51885820, "step": 1330 }, { "epoch": 0.06492877216784572, "grad_norm": 0.3928161859512329, "learning_rate": 4.9840569560835416e-05, "loss": 1.2156, "num_input_tokens_seen": 52276412, "step": 1340 }, { "epoch": 0.06541331524372516, "grad_norm": 0.407649964094162, "learning_rate": 4.9836115061674925e-05, "loss": 1.2379, "num_input_tokens_seen": 52647892, "step": 1350 }, { "epoch": 0.06589785831960461, "grad_norm": 0.4136994183063507, "learning_rate": 4.983159939202582e-05, "loss": 1.2633, "num_input_tokens_seen": 53053832, "step": 1360 }, { "epoch": 0.06638240139548406, "grad_norm": 0.4096525311470032, "learning_rate": 4.9827022563010016e-05, "loss": 1.2265, "num_input_tokens_seen": 53431196, "step": 1370 }, { "epoch": 0.0668669444713635, "grad_norm": 0.4357371926307678, "learning_rate": 4.982238458590009e-05, "loss": 1.2617, "num_input_tokens_seen": 53833560, "step": 1380 }, { "epoch": 0.06735148754724295, "grad_norm": 0.4323122799396515, "learning_rate": 4.9817685472119246e-05, "loss": 1.3117, "num_input_tokens_seen": 54217320, "step": 1390 }, { "epoch": 0.06783603062312239, "grad_norm": 0.4947691261768341, "learning_rate": 4.98129252332412e-05, "loss": 1.2197, "num_input_tokens_seen": 54609344, "step": 1400 }, { "epoch": 0.06832057369900184, "grad_norm": 0.43488138914108276, "learning_rate": 4.980810388099028e-05, "loss": 1.256, "num_input_tokens_seen": 55028116, "step": 1410 }, { "epoch": 0.06880511677488128, "grad_norm": 0.4741380512714386, "learning_rate": 4.980322142724129e-05, "loss": 1.2525, "num_input_tokens_seen": 55416832, "step": 1420 }, { "epoch": 0.06928965985076073, "grad_norm": 0.4468291103839874, "learning_rate": 4.979827788401956e-05, "loss": 1.2414, "num_input_tokens_seen": 55798836, "step": 1430 }, { "epoch": 0.06977420292664017, "grad_norm": 0.42211663722991943, "learning_rate": 4.979327326350086e-05, "loss": 1.2589, "num_input_tokens_seen": 56187416, "step": 1440 }, { "epoch": 0.07025874600251962, "grad_norm": 0.4140244722366333, "learning_rate": 4.9788207578011405e-05, "loss": 1.239, "num_input_tokens_seen": 56577512, "step": 1450 }, { "epoch": 0.07074328907839907, "grad_norm": 0.39189738035202026, "learning_rate": 4.978308084002779e-05, "loss": 1.1905, "num_input_tokens_seen": 56978600, "step": 1460 }, { "epoch": 0.07122783215427851, "grad_norm": 0.3976607024669647, "learning_rate": 4.9777893062176986e-05, "loss": 1.252, "num_input_tokens_seen": 57367068, "step": 1470 }, { "epoch": 0.07171237523015796, "grad_norm": 0.3938475251197815, "learning_rate": 4.977264425723632e-05, "loss": 1.2644, "num_input_tokens_seen": 57756716, "step": 1480 }, { "epoch": 0.0721969183060374, "grad_norm": 0.3958350718021393, "learning_rate": 4.976733443813343e-05, "loss": 1.2834, "num_input_tokens_seen": 58127588, "step": 1490 }, { "epoch": 0.07268146138191685, "grad_norm": 0.41049420833587646, "learning_rate": 4.976196361794619e-05, "loss": 1.2359, "num_input_tokens_seen": 58531680, "step": 1500 }, { "epoch": 0.0731660044577963, "grad_norm": 0.40144941210746765, "learning_rate": 4.9756531809902765e-05, "loss": 1.2358, "num_input_tokens_seen": 58917960, "step": 1510 }, { "epoch": 0.07365054753367574, "grad_norm": 0.4596763551235199, "learning_rate": 4.975103902738149e-05, "loss": 1.2676, "num_input_tokens_seen": 59324348, "step": 1520 }, { "epoch": 0.07413509060955518, "grad_norm": 0.41802385449409485, "learning_rate": 4.974548528391091e-05, "loss": 1.2669, "num_input_tokens_seen": 59734700, "step": 1530 }, { "epoch": 0.07461963368543463, "grad_norm": 0.409807026386261, "learning_rate": 4.9739870593169705e-05, "loss": 1.2579, "num_input_tokens_seen": 60135636, "step": 1540 }, { "epoch": 0.07510417676131408, "grad_norm": 0.41342952847480774, "learning_rate": 4.9734194968986656e-05, "loss": 1.2319, "num_input_tokens_seen": 60516912, "step": 1550 }, { "epoch": 0.07558871983719352, "grad_norm": 0.4141559898853302, "learning_rate": 4.972845842534063e-05, "loss": 1.2452, "num_input_tokens_seen": 60899940, "step": 1560 }, { "epoch": 0.07607326291307297, "grad_norm": 0.4386405646800995, "learning_rate": 4.9722660976360534e-05, "loss": 1.2262, "num_input_tokens_seen": 61295608, "step": 1570 }, { "epoch": 0.07655780598895241, "grad_norm": 0.40798524022102356, "learning_rate": 4.9716802636325286e-05, "loss": 1.2671, "num_input_tokens_seen": 61693040, "step": 1580 }, { "epoch": 0.07704234906483186, "grad_norm": 0.37625306844711304, "learning_rate": 4.9710883419663774e-05, "loss": 1.2962, "num_input_tokens_seen": 62072768, "step": 1590 }, { "epoch": 0.0775268921407113, "grad_norm": 0.4342903792858124, "learning_rate": 4.970490334095482e-05, "loss": 1.257, "num_input_tokens_seen": 62446884, "step": 1600 }, { "epoch": 0.07801143521659075, "grad_norm": 0.41230422258377075, "learning_rate": 4.969886241492715e-05, "loss": 1.27, "num_input_tokens_seen": 62836772, "step": 1610 }, { "epoch": 0.0784959782924702, "grad_norm": 0.4433565139770508, "learning_rate": 4.969276065645936e-05, "loss": 1.2592, "num_input_tokens_seen": 63200816, "step": 1620 }, { "epoch": 0.07898052136834964, "grad_norm": 0.47754809260368347, "learning_rate": 4.968659808057986e-05, "loss": 1.2752, "num_input_tokens_seen": 63605328, "step": 1630 }, { "epoch": 0.07946506444422909, "grad_norm": 0.3927769958972931, "learning_rate": 4.968037470246687e-05, "loss": 1.2468, "num_input_tokens_seen": 63972724, "step": 1640 }, { "epoch": 0.07994960752010853, "grad_norm": 0.39971116185188293, "learning_rate": 4.9674090537448346e-05, "loss": 1.1927, "num_input_tokens_seen": 64354840, "step": 1650 }, { "epoch": 0.08043415059598798, "grad_norm": 0.41240766644477844, "learning_rate": 4.966774560100198e-05, "loss": 1.2495, "num_input_tokens_seen": 64781948, "step": 1660 }, { "epoch": 0.08091869367186742, "grad_norm": 0.38843321800231934, "learning_rate": 4.966133990875512e-05, "loss": 1.2167, "num_input_tokens_seen": 65145908, "step": 1670 }, { "epoch": 0.08140323674774687, "grad_norm": 0.42320936918258667, "learning_rate": 4.965487347648476e-05, "loss": 1.2934, "num_input_tokens_seen": 65551472, "step": 1680 }, { "epoch": 0.08188777982362631, "grad_norm": 0.39081892371177673, "learning_rate": 4.964834632011751e-05, "loss": 1.2347, "num_input_tokens_seen": 65941084, "step": 1690 }, { "epoch": 0.08237232289950576, "grad_norm": 0.4150112271308899, "learning_rate": 4.964175845572952e-05, "loss": 1.2232, "num_input_tokens_seen": 66336864, "step": 1700 }, { "epoch": 0.0828568659753852, "grad_norm": 0.37995874881744385, "learning_rate": 4.9635109899546476e-05, "loss": 1.263, "num_input_tokens_seen": 66718060, "step": 1710 }, { "epoch": 0.08334140905126466, "grad_norm": 0.39722344279289246, "learning_rate": 4.962840066794354e-05, "loss": 1.276, "num_input_tokens_seen": 67124980, "step": 1720 }, { "epoch": 0.08382595212714411, "grad_norm": 0.4169159233570099, "learning_rate": 4.9621630777445316e-05, "loss": 1.2465, "num_input_tokens_seen": 67528936, "step": 1730 }, { "epoch": 0.08431049520302356, "grad_norm": 0.426110178232193, "learning_rate": 4.96148002447258e-05, "loss": 1.2534, "num_input_tokens_seen": 67929316, "step": 1740 }, { "epoch": 0.084795038278903, "grad_norm": 0.44997116923332214, "learning_rate": 4.960790908660838e-05, "loss": 1.2193, "num_input_tokens_seen": 68318772, "step": 1750 }, { "epoch": 0.08527958135478245, "grad_norm": 0.39495953917503357, "learning_rate": 4.9600957320065715e-05, "loss": 1.2464, "num_input_tokens_seen": 68728948, "step": 1760 }, { "epoch": 0.08576412443066189, "grad_norm": 0.39029765129089355, "learning_rate": 4.959394496221977e-05, "loss": 1.251, "num_input_tokens_seen": 69116412, "step": 1770 }, { "epoch": 0.08624866750654134, "grad_norm": 0.3958011865615845, "learning_rate": 4.958687203034176e-05, "loss": 1.2569, "num_input_tokens_seen": 69515196, "step": 1780 }, { "epoch": 0.08673321058242078, "grad_norm": 0.41105154156684875, "learning_rate": 4.957973854185204e-05, "loss": 1.2641, "num_input_tokens_seen": 69919164, "step": 1790 }, { "epoch": 0.08721775365830023, "grad_norm": 0.4101089835166931, "learning_rate": 4.957254451432016e-05, "loss": 1.2562, "num_input_tokens_seen": 70312276, "step": 1800 }, { "epoch": 0.08770229673417967, "grad_norm": 0.4429624080657959, "learning_rate": 4.956528996546476e-05, "loss": 1.2673, "num_input_tokens_seen": 70700052, "step": 1810 }, { "epoch": 0.08818683981005912, "grad_norm": 0.38688287138938904, "learning_rate": 4.9557974913153536e-05, "loss": 1.2294, "num_input_tokens_seen": 71078868, "step": 1820 }, { "epoch": 0.08867138288593857, "grad_norm": 0.4344911277294159, "learning_rate": 4.955059937540322e-05, "loss": 1.2436, "num_input_tokens_seen": 71470396, "step": 1830 }, { "epoch": 0.08915592596181801, "grad_norm": 0.4057205021381378, "learning_rate": 4.9543163370379484e-05, "loss": 1.2561, "num_input_tokens_seen": 71856368, "step": 1840 }, { "epoch": 0.08964046903769746, "grad_norm": 0.4123441278934479, "learning_rate": 4.953566691639697e-05, "loss": 1.2738, "num_input_tokens_seen": 72253720, "step": 1850 }, { "epoch": 0.0901250121135769, "grad_norm": 0.42719489336013794, "learning_rate": 4.9528110031919175e-05, "loss": 1.2306, "num_input_tokens_seen": 72625716, "step": 1860 }, { "epoch": 0.09060955518945635, "grad_norm": 0.41707441210746765, "learning_rate": 4.9520492735558444e-05, "loss": 1.2504, "num_input_tokens_seen": 73013404, "step": 1870 }, { "epoch": 0.0910940982653358, "grad_norm": 0.4267444312572479, "learning_rate": 4.951281504607591e-05, "loss": 1.2637, "num_input_tokens_seen": 73406604, "step": 1880 }, { "epoch": 0.09157864134121524, "grad_norm": 0.42108404636383057, "learning_rate": 4.950507698238147e-05, "loss": 1.2222, "num_input_tokens_seen": 73783332, "step": 1890 }, { "epoch": 0.09206318441709468, "grad_norm": 0.4063856303691864, "learning_rate": 4.949727856353369e-05, "loss": 1.2441, "num_input_tokens_seen": 74175440, "step": 1900 }, { "epoch": 0.09254772749297413, "grad_norm": 0.37594878673553467, "learning_rate": 4.94894198087398e-05, "loss": 1.2485, "num_input_tokens_seen": 74581572, "step": 1910 }, { "epoch": 0.09303227056885358, "grad_norm": 0.3911046087741852, "learning_rate": 4.9481500737355656e-05, "loss": 1.2702, "num_input_tokens_seen": 74943092, "step": 1920 }, { "epoch": 0.09351681364473302, "grad_norm": 0.4102419912815094, "learning_rate": 4.947352136888566e-05, "loss": 1.2468, "num_input_tokens_seen": 75331720, "step": 1930 }, { "epoch": 0.09400135672061247, "grad_norm": 0.39770299196243286, "learning_rate": 4.94654817229827e-05, "loss": 1.2201, "num_input_tokens_seen": 75736304, "step": 1940 }, { "epoch": 0.09448589979649191, "grad_norm": 0.43506377935409546, "learning_rate": 4.945738181944815e-05, "loss": 1.2235, "num_input_tokens_seen": 76130884, "step": 1950 }, { "epoch": 0.09497044287237136, "grad_norm": 0.3792125880718231, "learning_rate": 4.9449221678231795e-05, "loss": 1.2698, "num_input_tokens_seen": 76512888, "step": 1960 }, { "epoch": 0.0954549859482508, "grad_norm": 0.39077404141426086, "learning_rate": 4.9441001319431784e-05, "loss": 1.2458, "num_input_tokens_seen": 76922052, "step": 1970 }, { "epoch": 0.09593952902413025, "grad_norm": 0.38054585456848145, "learning_rate": 4.943272076329457e-05, "loss": 1.2281, "num_input_tokens_seen": 77306336, "step": 1980 }, { "epoch": 0.0964240721000097, "grad_norm": 0.3908326029777527, "learning_rate": 4.942438003021487e-05, "loss": 1.2028, "num_input_tokens_seen": 77689480, "step": 1990 }, { "epoch": 0.09690861517588914, "grad_norm": 0.43404319882392883, "learning_rate": 4.941597914073563e-05, "loss": 1.234, "num_input_tokens_seen": 78067836, "step": 2000 }, { "epoch": 0.09690861517588914, "eval_loss": 1.2438511848449707, "eval_runtime": 5.3239, "eval_samples_per_second": 28.175, "eval_steps_per_second": 3.569, "num_input_tokens_seen": 78067836, "step": 2000 }, { "epoch": 0.09739315825176859, "grad_norm": 0.4713827967643738, "learning_rate": 4.9407518115547945e-05, "loss": 1.2405, "num_input_tokens_seen": 78461236, "step": 2010 }, { "epoch": 0.09787770132764803, "grad_norm": 0.4206187427043915, "learning_rate": 4.939899697549102e-05, "loss": 1.2046, "num_input_tokens_seen": 78840632, "step": 2020 }, { "epoch": 0.09836224440352748, "grad_norm": 0.3922071158885956, "learning_rate": 4.939041574155213e-05, "loss": 1.2593, "num_input_tokens_seen": 79253124, "step": 2030 }, { "epoch": 0.09884678747940692, "grad_norm": 0.4534291625022888, "learning_rate": 4.938177443486657e-05, "loss": 1.2696, "num_input_tokens_seen": 79629736, "step": 2040 }, { "epoch": 0.09933133055528637, "grad_norm": 0.424139142036438, "learning_rate": 4.937307307671756e-05, "loss": 1.2686, "num_input_tokens_seen": 80023928, "step": 2050 }, { "epoch": 0.09981587363116581, "grad_norm": 0.45865899324417114, "learning_rate": 4.9364311688536245e-05, "loss": 1.2834, "num_input_tokens_seen": 80420160, "step": 2060 }, { "epoch": 0.10030041670704526, "grad_norm": 0.42138755321502686, "learning_rate": 4.935549029190163e-05, "loss": 1.2079, "num_input_tokens_seen": 80810044, "step": 2070 }, { "epoch": 0.1007849597829247, "grad_norm": 0.3841094672679901, "learning_rate": 4.934660890854049e-05, "loss": 1.2496, "num_input_tokens_seen": 81210716, "step": 2080 }, { "epoch": 0.10126950285880415, "grad_norm": 0.38037049770355225, "learning_rate": 4.9337667560327396e-05, "loss": 1.2724, "num_input_tokens_seen": 81585980, "step": 2090 }, { "epoch": 0.1017540459346836, "grad_norm": 0.4014841318130493, "learning_rate": 4.932866626928454e-05, "loss": 1.227, "num_input_tokens_seen": 81980652, "step": 2100 }, { "epoch": 0.10223858901056304, "grad_norm": 0.41437438130378723, "learning_rate": 4.9319605057581816e-05, "loss": 1.2472, "num_input_tokens_seen": 82387492, "step": 2110 }, { "epoch": 0.10272313208644249, "grad_norm": 0.4091525971889496, "learning_rate": 4.931048394753666e-05, "loss": 1.188, "num_input_tokens_seen": 82759272, "step": 2120 }, { "epoch": 0.10320767516232193, "grad_norm": 0.43405330181121826, "learning_rate": 4.930130296161406e-05, "loss": 1.2372, "num_input_tokens_seen": 83158564, "step": 2130 }, { "epoch": 0.10369221823820138, "grad_norm": 0.43749940395355225, "learning_rate": 4.929206212242646e-05, "loss": 1.2085, "num_input_tokens_seen": 83536672, "step": 2140 }, { "epoch": 0.10417676131408082, "grad_norm": 0.43483173847198486, "learning_rate": 4.928276145273372e-05, "loss": 1.2199, "num_input_tokens_seen": 83943180, "step": 2150 }, { "epoch": 0.10466130438996027, "grad_norm": 0.42413613200187683, "learning_rate": 4.9273400975443076e-05, "loss": 1.2012, "num_input_tokens_seen": 84356164, "step": 2160 }, { "epoch": 0.10514584746583971, "grad_norm": 0.445049911737442, "learning_rate": 4.926398071360905e-05, "loss": 1.2046, "num_input_tokens_seen": 84761956, "step": 2170 }, { "epoch": 0.10563039054171916, "grad_norm": 0.4001222848892212, "learning_rate": 4.925450069043342e-05, "loss": 1.246, "num_input_tokens_seen": 85138340, "step": 2180 }, { "epoch": 0.1061149336175986, "grad_norm": 0.401049941778183, "learning_rate": 4.924496092926517e-05, "loss": 1.2164, "num_input_tokens_seen": 85513616, "step": 2190 }, { "epoch": 0.10659947669347805, "grad_norm": 0.4612962603569031, "learning_rate": 4.923536145360038e-05, "loss": 1.2358, "num_input_tokens_seen": 85931448, "step": 2200 }, { "epoch": 0.1070840197693575, "grad_norm": 0.41920316219329834, "learning_rate": 4.922570228708223e-05, "loss": 1.3026, "num_input_tokens_seen": 86315108, "step": 2210 }, { "epoch": 0.10756856284523694, "grad_norm": 0.39290520548820496, "learning_rate": 4.921598345350092e-05, "loss": 1.2535, "num_input_tokens_seen": 86709944, "step": 2220 }, { "epoch": 0.10805310592111639, "grad_norm": 0.4282558262348175, "learning_rate": 4.92062049767936e-05, "loss": 1.2274, "num_input_tokens_seen": 87113940, "step": 2230 }, { "epoch": 0.10853764899699583, "grad_norm": 0.4753318428993225, "learning_rate": 4.91963668810443e-05, "loss": 1.2483, "num_input_tokens_seen": 87501616, "step": 2240 }, { "epoch": 0.10902219207287528, "grad_norm": 0.423225075006485, "learning_rate": 4.918646919048393e-05, "loss": 1.2327, "num_input_tokens_seen": 87894360, "step": 2250 }, { "epoch": 0.10950673514875472, "grad_norm": 0.39466777443885803, "learning_rate": 4.9176511929490144e-05, "loss": 1.2604, "num_input_tokens_seen": 88317576, "step": 2260 }, { "epoch": 0.10999127822463417, "grad_norm": 0.41255709528923035, "learning_rate": 4.916649512258733e-05, "loss": 1.2341, "num_input_tokens_seen": 88690104, "step": 2270 }, { "epoch": 0.11047582130051362, "grad_norm": 0.42974668741226196, "learning_rate": 4.9156418794446545e-05, "loss": 1.2159, "num_input_tokens_seen": 89090732, "step": 2280 }, { "epoch": 0.11096036437639306, "grad_norm": 0.44297677278518677, "learning_rate": 4.914628296988543e-05, "loss": 1.2344, "num_input_tokens_seen": 89486096, "step": 2290 }, { "epoch": 0.11144490745227251, "grad_norm": 0.4597613513469696, "learning_rate": 4.913608767386817e-05, "loss": 1.2482, "num_input_tokens_seen": 89880732, "step": 2300 }, { "epoch": 0.11192945052815195, "grad_norm": 0.4195135235786438, "learning_rate": 4.912583293150542e-05, "loss": 1.2158, "num_input_tokens_seen": 90328148, "step": 2310 }, { "epoch": 0.1124139936040314, "grad_norm": 0.44454923272132874, "learning_rate": 4.9115518768054264e-05, "loss": 1.2103, "num_input_tokens_seen": 90703168, "step": 2320 }, { "epoch": 0.11289853667991084, "grad_norm": 0.3934001624584198, "learning_rate": 4.910514520891812e-05, "loss": 1.2893, "num_input_tokens_seen": 91052408, "step": 2330 }, { "epoch": 0.11338307975579029, "grad_norm": 0.38367339968681335, "learning_rate": 4.90947122796467e-05, "loss": 1.2125, "num_input_tokens_seen": 91430836, "step": 2340 }, { "epoch": 0.11386762283166973, "grad_norm": 0.40738987922668457, "learning_rate": 4.908422000593596e-05, "loss": 1.2289, "num_input_tokens_seen": 91848392, "step": 2350 }, { "epoch": 0.11435216590754918, "grad_norm": 0.4179443418979645, "learning_rate": 4.907366841362799e-05, "loss": 1.2494, "num_input_tokens_seen": 92240852, "step": 2360 }, { "epoch": 0.11483670898342863, "grad_norm": 0.38464492559432983, "learning_rate": 4.906305752871102e-05, "loss": 1.1879, "num_input_tokens_seen": 92619204, "step": 2370 }, { "epoch": 0.11532125205930807, "grad_norm": 0.42252519726753235, "learning_rate": 4.905238737731926e-05, "loss": 1.2608, "num_input_tokens_seen": 93002632, "step": 2380 }, { "epoch": 0.11580579513518752, "grad_norm": 0.4146505296230316, "learning_rate": 4.9041657985732936e-05, "loss": 1.2486, "num_input_tokens_seen": 93399964, "step": 2390 }, { "epoch": 0.11629033821106696, "grad_norm": 0.438689261674881, "learning_rate": 4.903086938037818e-05, "loss": 1.27, "num_input_tokens_seen": 93791652, "step": 2400 }, { "epoch": 0.11677488128694641, "grad_norm": 0.42062869668006897, "learning_rate": 4.9020021587826926e-05, "loss": 1.2591, "num_input_tokens_seen": 94191368, "step": 2410 }, { "epoch": 0.11725942436282585, "grad_norm": 0.4142090976238251, "learning_rate": 4.9009114634796926e-05, "loss": 1.2627, "num_input_tokens_seen": 94571872, "step": 2420 }, { "epoch": 0.1177439674387053, "grad_norm": 0.4038289487361908, "learning_rate": 4.8998148548151625e-05, "loss": 1.2551, "num_input_tokens_seen": 94979976, "step": 2430 }, { "epoch": 0.11822851051458474, "grad_norm": 0.38102611899375916, "learning_rate": 4.898712335490012e-05, "loss": 1.2559, "num_input_tokens_seen": 95358664, "step": 2440 }, { "epoch": 0.11871305359046419, "grad_norm": 0.3852735757827759, "learning_rate": 4.897603908219706e-05, "loss": 1.2448, "num_input_tokens_seen": 95763084, "step": 2450 }, { "epoch": 0.11919759666634364, "grad_norm": 0.3712899684906006, "learning_rate": 4.8964895757342643e-05, "loss": 1.2488, "num_input_tokens_seen": 96188292, "step": 2460 }, { "epoch": 0.11968213974222308, "grad_norm": 0.3657510578632355, "learning_rate": 4.8953693407782484e-05, "loss": 1.253, "num_input_tokens_seen": 96601596, "step": 2470 }, { "epoch": 0.12016668281810253, "grad_norm": 0.4605284631252289, "learning_rate": 4.894243206110758e-05, "loss": 1.236, "num_input_tokens_seen": 96973176, "step": 2480 }, { "epoch": 0.12065122589398197, "grad_norm": 0.41020140051841736, "learning_rate": 4.8931111745054226e-05, "loss": 1.2364, "num_input_tokens_seen": 97378752, "step": 2490 }, { "epoch": 0.12113576896986142, "grad_norm": 0.36746346950531006, "learning_rate": 4.891973248750399e-05, "loss": 1.2666, "num_input_tokens_seen": 97771668, "step": 2500 }, { "epoch": 0.12162031204574086, "grad_norm": 0.4322774112224579, "learning_rate": 4.890829431648357e-05, "loss": 1.1992, "num_input_tokens_seen": 98151028, "step": 2510 }, { "epoch": 0.12210485512162031, "grad_norm": 0.42371341586112976, "learning_rate": 4.8896797260164795e-05, "loss": 1.3052, "num_input_tokens_seen": 98531364, "step": 2520 }, { "epoch": 0.12258939819749975, "grad_norm": 0.43509674072265625, "learning_rate": 4.888524134686451e-05, "loss": 1.2146, "num_input_tokens_seen": 98929648, "step": 2530 }, { "epoch": 0.1230739412733792, "grad_norm": 0.38487133383750916, "learning_rate": 4.887362660504452e-05, "loss": 1.2251, "num_input_tokens_seen": 99326780, "step": 2540 }, { "epoch": 0.12355848434925865, "grad_norm": 0.47879770398139954, "learning_rate": 4.8861953063311544e-05, "loss": 1.2862, "num_input_tokens_seen": 99746768, "step": 2550 }, { "epoch": 0.12404302742513809, "grad_norm": 0.3904902935028076, "learning_rate": 4.8850220750417107e-05, "loss": 1.2565, "num_input_tokens_seen": 100114696, "step": 2560 }, { "epoch": 0.12452757050101754, "grad_norm": 0.3858630359172821, "learning_rate": 4.883842969525748e-05, "loss": 1.2145, "num_input_tokens_seen": 100542848, "step": 2570 }, { "epoch": 0.12501211357689698, "grad_norm": 0.42013123631477356, "learning_rate": 4.882657992687363e-05, "loss": 1.2271, "num_input_tokens_seen": 100976524, "step": 2580 }, { "epoch": 0.12549665665277643, "grad_norm": 0.3980698585510254, "learning_rate": 4.8814671474451126e-05, "loss": 1.2801, "num_input_tokens_seen": 101345460, "step": 2590 }, { "epoch": 0.12598119972865587, "grad_norm": 0.3923351764678955, "learning_rate": 4.880270436732006e-05, "loss": 1.2064, "num_input_tokens_seen": 101759284, "step": 2600 }, { "epoch": 0.12646574280453532, "grad_norm": 0.40927964448928833, "learning_rate": 4.879067863495502e-05, "loss": 1.2739, "num_input_tokens_seen": 102167588, "step": 2610 }, { "epoch": 0.12695028588041476, "grad_norm": 0.44859156012535095, "learning_rate": 4.8778594306974956e-05, "loss": 1.2309, "num_input_tokens_seen": 102539236, "step": 2620 }, { "epoch": 0.1274348289562942, "grad_norm": 0.39487892389297485, "learning_rate": 4.876645141314315e-05, "loss": 1.2866, "num_input_tokens_seen": 102925936, "step": 2630 }, { "epoch": 0.12791937203217366, "grad_norm": 0.3923860490322113, "learning_rate": 4.875424998336713e-05, "loss": 1.2757, "num_input_tokens_seen": 103325248, "step": 2640 }, { "epoch": 0.1284039151080531, "grad_norm": 0.4167238175868988, "learning_rate": 4.874199004769858e-05, "loss": 1.2338, "num_input_tokens_seen": 103712300, "step": 2650 }, { "epoch": 0.12888845818393255, "grad_norm": 0.38341742753982544, "learning_rate": 4.872967163633332e-05, "loss": 1.2383, "num_input_tokens_seen": 104124208, "step": 2660 }, { "epoch": 0.129373001259812, "grad_norm": 0.3938426375389099, "learning_rate": 4.871729477961115e-05, "loss": 1.2011, "num_input_tokens_seen": 104502028, "step": 2670 }, { "epoch": 0.12985754433569144, "grad_norm": 0.3831293284893036, "learning_rate": 4.8704859508015847e-05, "loss": 1.2634, "num_input_tokens_seen": 104876704, "step": 2680 }, { "epoch": 0.13034208741157088, "grad_norm": 0.4196309745311737, "learning_rate": 4.869236585217504e-05, "loss": 1.2585, "num_input_tokens_seen": 105302956, "step": 2690 }, { "epoch": 0.13082663048745033, "grad_norm": 0.4367752969264984, "learning_rate": 4.867981384286019e-05, "loss": 1.1848, "num_input_tokens_seen": 105695812, "step": 2700 }, { "epoch": 0.13131117356332977, "grad_norm": 0.39066222310066223, "learning_rate": 4.8667203510986444e-05, "loss": 1.2545, "num_input_tokens_seen": 106084708, "step": 2710 }, { "epoch": 0.13179571663920922, "grad_norm": 0.4448121190071106, "learning_rate": 4.865453488761262e-05, "loss": 1.275, "num_input_tokens_seen": 106499276, "step": 2720 }, { "epoch": 0.13228025971508867, "grad_norm": 0.4147442877292633, "learning_rate": 4.864180800394109e-05, "loss": 1.277, "num_input_tokens_seen": 106866988, "step": 2730 }, { "epoch": 0.1327648027909681, "grad_norm": 0.40027832984924316, "learning_rate": 4.862902289131773e-05, "loss": 1.2567, "num_input_tokens_seen": 107259856, "step": 2740 }, { "epoch": 0.13324934586684756, "grad_norm": 0.36126065254211426, "learning_rate": 4.861617958123183e-05, "loss": 1.2553, "num_input_tokens_seen": 107646308, "step": 2750 }, { "epoch": 0.133733888942727, "grad_norm": 0.41318050026893616, "learning_rate": 4.860327810531602e-05, "loss": 1.244, "num_input_tokens_seen": 108059036, "step": 2760 }, { "epoch": 0.13421843201860645, "grad_norm": 0.3924005925655365, "learning_rate": 4.859031849534618e-05, "loss": 1.223, "num_input_tokens_seen": 108463880, "step": 2770 }, { "epoch": 0.1347029750944859, "grad_norm": 0.41446420550346375, "learning_rate": 4.857730078324138e-05, "loss": 1.2424, "num_input_tokens_seen": 108859416, "step": 2780 }, { "epoch": 0.13518751817036534, "grad_norm": 0.4131425619125366, "learning_rate": 4.856422500106379e-05, "loss": 1.2368, "num_input_tokens_seen": 109284028, "step": 2790 }, { "epoch": 0.13567206124624479, "grad_norm": 0.40505287051200867, "learning_rate": 4.855109118101862e-05, "loss": 1.2731, "num_input_tokens_seen": 109666092, "step": 2800 }, { "epoch": 0.13615660432212423, "grad_norm": 0.4067140817642212, "learning_rate": 4.8537899355454e-05, "loss": 1.2945, "num_input_tokens_seen": 110023200, "step": 2810 }, { "epoch": 0.13664114739800368, "grad_norm": 0.42721500992774963, "learning_rate": 4.8524649556860934e-05, "loss": 1.2448, "num_input_tokens_seen": 110400364, "step": 2820 }, { "epoch": 0.13712569047388312, "grad_norm": 0.40935906767845154, "learning_rate": 4.851134181787323e-05, "loss": 1.2359, "num_input_tokens_seen": 110793504, "step": 2830 }, { "epoch": 0.13761023354976257, "grad_norm": 0.4633960425853729, "learning_rate": 4.849797617126738e-05, "loss": 1.2571, "num_input_tokens_seen": 111170796, "step": 2840 }, { "epoch": 0.138094776625642, "grad_norm": 0.43465399742126465, "learning_rate": 4.848455264996251e-05, "loss": 1.2171, "num_input_tokens_seen": 111576676, "step": 2850 }, { "epoch": 0.13857931970152146, "grad_norm": 0.3760763704776764, "learning_rate": 4.847107128702028e-05, "loss": 1.2611, "num_input_tokens_seen": 112001188, "step": 2860 }, { "epoch": 0.1390638627774009, "grad_norm": 0.4067819118499756, "learning_rate": 4.845753211564482e-05, "loss": 1.1932, "num_input_tokens_seen": 112357276, "step": 2870 }, { "epoch": 0.13954840585328035, "grad_norm": 0.44168785214424133, "learning_rate": 4.844393516918265e-05, "loss": 1.2458, "num_input_tokens_seen": 112745724, "step": 2880 }, { "epoch": 0.1400329489291598, "grad_norm": 0.42745277285575867, "learning_rate": 4.8430280481122556e-05, "loss": 1.1844, "num_input_tokens_seen": 113158204, "step": 2890 }, { "epoch": 0.14051749200503924, "grad_norm": 0.4196942150592804, "learning_rate": 4.8416568085095585e-05, "loss": 1.2464, "num_input_tokens_seen": 113534300, "step": 2900 }, { "epoch": 0.1410020350809187, "grad_norm": 0.3699077069759369, "learning_rate": 4.840279801487488e-05, "loss": 1.2435, "num_input_tokens_seen": 113941828, "step": 2910 }, { "epoch": 0.14148657815679813, "grad_norm": 0.4300510287284851, "learning_rate": 4.8388970304375636e-05, "loss": 1.2438, "num_input_tokens_seen": 114360276, "step": 2920 }, { "epoch": 0.14197112123267758, "grad_norm": 0.39781638979911804, "learning_rate": 4.837508498765504e-05, "loss": 1.2151, "num_input_tokens_seen": 114724572, "step": 2930 }, { "epoch": 0.14245566430855702, "grad_norm": 0.3915679156780243, "learning_rate": 4.836114209891214e-05, "loss": 1.2486, "num_input_tokens_seen": 115115652, "step": 2940 }, { "epoch": 0.14294020738443647, "grad_norm": 0.3921675682067871, "learning_rate": 4.834714167248778e-05, "loss": 1.2237, "num_input_tokens_seen": 115528916, "step": 2950 }, { "epoch": 0.14342475046031591, "grad_norm": 0.4208355247974396, "learning_rate": 4.8333083742864524e-05, "loss": 1.3102, "num_input_tokens_seen": 115883916, "step": 2960 }, { "epoch": 0.14390929353619536, "grad_norm": 0.4001530408859253, "learning_rate": 4.831896834466658e-05, "loss": 1.2382, "num_input_tokens_seen": 116270492, "step": 2970 }, { "epoch": 0.1443938366120748, "grad_norm": 0.4217351973056793, "learning_rate": 4.830479551265966e-05, "loss": 1.1906, "num_input_tokens_seen": 116680092, "step": 2980 }, { "epoch": 0.14487837968795425, "grad_norm": 0.37808090448379517, "learning_rate": 4.8290565281750974e-05, "loss": 1.2392, "num_input_tokens_seen": 117085684, "step": 2990 }, { "epoch": 0.1453629227638337, "grad_norm": 0.3951645493507385, "learning_rate": 4.827627768698909e-05, "loss": 1.2293, "num_input_tokens_seen": 117454120, "step": 3000 }, { "epoch": 0.14584746583971314, "grad_norm": 0.35314205288887024, "learning_rate": 4.8261932763563834e-05, "loss": 1.1356, "num_input_tokens_seen": 117866504, "step": 3010 }, { "epoch": 0.1463320089155926, "grad_norm": 0.4276081323623657, "learning_rate": 4.824753054680628e-05, "loss": 1.2778, "num_input_tokens_seen": 118250676, "step": 3020 }, { "epoch": 0.14681655199147203, "grad_norm": 0.40378737449645996, "learning_rate": 4.823307107218857e-05, "loss": 1.2868, "num_input_tokens_seen": 118607944, "step": 3030 }, { "epoch": 0.14730109506735148, "grad_norm": 0.41640186309814453, "learning_rate": 4.82185543753239e-05, "loss": 1.2456, "num_input_tokens_seen": 119009904, "step": 3040 }, { "epoch": 0.14778563814323092, "grad_norm": 0.4186214804649353, "learning_rate": 4.820398049196638e-05, "loss": 1.2287, "num_input_tokens_seen": 119411820, "step": 3050 }, { "epoch": 0.14827018121911037, "grad_norm": 0.42039960622787476, "learning_rate": 4.818934945801098e-05, "loss": 1.2247, "num_input_tokens_seen": 119798480, "step": 3060 }, { "epoch": 0.14875472429498982, "grad_norm": 0.42505961656570435, "learning_rate": 4.8174661309493436e-05, "loss": 1.2242, "num_input_tokens_seen": 120184372, "step": 3070 }, { "epoch": 0.14923926737086926, "grad_norm": 0.4036807417869568, "learning_rate": 4.815991608259014e-05, "loss": 1.2154, "num_input_tokens_seen": 120579184, "step": 3080 }, { "epoch": 0.1497238104467487, "grad_norm": 0.40509092807769775, "learning_rate": 4.8145113813618083e-05, "loss": 1.2086, "num_input_tokens_seen": 120941008, "step": 3090 }, { "epoch": 0.15020835352262815, "grad_norm": 0.4269082546234131, "learning_rate": 4.813025453903472e-05, "loss": 1.2496, "num_input_tokens_seen": 121322824, "step": 3100 }, { "epoch": 0.1506928965985076, "grad_norm": 0.4054916203022003, "learning_rate": 4.811533829543795e-05, "loss": 1.2079, "num_input_tokens_seen": 121738192, "step": 3110 }, { "epoch": 0.15117743967438704, "grad_norm": 0.413043349981308, "learning_rate": 4.8100365119565946e-05, "loss": 1.2609, "num_input_tokens_seen": 122108864, "step": 3120 }, { "epoch": 0.1516619827502665, "grad_norm": 0.41178593039512634, "learning_rate": 4.8085335048297135e-05, "loss": 1.2548, "num_input_tokens_seen": 122510356, "step": 3130 }, { "epoch": 0.15214652582614593, "grad_norm": 0.39382249116897583, "learning_rate": 4.8070248118650055e-05, "loss": 1.1809, "num_input_tokens_seen": 122865892, "step": 3140 }, { "epoch": 0.15263106890202538, "grad_norm": 0.46355170011520386, "learning_rate": 4.8055104367783275e-05, "loss": 1.2281, "num_input_tokens_seen": 123237536, "step": 3150 }, { "epoch": 0.15311561197790483, "grad_norm": 0.40591225028038025, "learning_rate": 4.803990383299535e-05, "loss": 1.2243, "num_input_tokens_seen": 123611572, "step": 3160 }, { "epoch": 0.15360015505378427, "grad_norm": 0.41489487886428833, "learning_rate": 4.802464655172466e-05, "loss": 1.239, "num_input_tokens_seen": 124010748, "step": 3170 }, { "epoch": 0.15408469812966372, "grad_norm": 0.3760206997394562, "learning_rate": 4.800933256154935e-05, "loss": 1.2013, "num_input_tokens_seen": 124392148, "step": 3180 }, { "epoch": 0.15456924120554316, "grad_norm": 0.43629947304725647, "learning_rate": 4.7993961900187255e-05, "loss": 1.2253, "num_input_tokens_seen": 124776000, "step": 3190 }, { "epoch": 0.1550537842814226, "grad_norm": 0.4242326021194458, "learning_rate": 4.7978534605495784e-05, "loss": 1.2461, "num_input_tokens_seen": 125158264, "step": 3200 }, { "epoch": 0.15553832735730205, "grad_norm": 0.40662136673927307, "learning_rate": 4.796305071547182e-05, "loss": 1.1913, "num_input_tokens_seen": 125544848, "step": 3210 }, { "epoch": 0.1560228704331815, "grad_norm": 0.4275949001312256, "learning_rate": 4.794751026825165e-05, "loss": 1.2397, "num_input_tokens_seen": 125940532, "step": 3220 }, { "epoch": 0.15650741350906094, "grad_norm": 0.41079017519950867, "learning_rate": 4.793191330211085e-05, "loss": 1.2478, "num_input_tokens_seen": 126357472, "step": 3230 }, { "epoch": 0.1569919565849404, "grad_norm": 0.4043791890144348, "learning_rate": 4.791625985546422e-05, "loss": 1.2263, "num_input_tokens_seen": 126745452, "step": 3240 }, { "epoch": 0.15747649966081984, "grad_norm": 0.4353187680244446, "learning_rate": 4.790054996686564e-05, "loss": 1.1988, "num_input_tokens_seen": 127138656, "step": 3250 }, { "epoch": 0.15796104273669928, "grad_norm": 0.4300646483898163, "learning_rate": 4.7884783675008016e-05, "loss": 1.183, "num_input_tokens_seen": 127522676, "step": 3260 }, { "epoch": 0.15844558581257873, "grad_norm": 0.3753831684589386, "learning_rate": 4.7868961018723194e-05, "loss": 1.2412, "num_input_tokens_seen": 127917336, "step": 3270 }, { "epoch": 0.15893012888845817, "grad_norm": 0.36102235317230225, "learning_rate": 4.785308203698182e-05, "loss": 1.2764, "num_input_tokens_seen": 128336744, "step": 3280 }, { "epoch": 0.15941467196433762, "grad_norm": 0.40448567271232605, "learning_rate": 4.783714676889327e-05, "loss": 1.2122, "num_input_tokens_seen": 128733104, "step": 3290 }, { "epoch": 0.15989921504021706, "grad_norm": 0.3721427023410797, "learning_rate": 4.7821155253705564e-05, "loss": 1.2684, "num_input_tokens_seen": 129154876, "step": 3300 }, { "epoch": 0.1603837581160965, "grad_norm": 0.4208812415599823, "learning_rate": 4.7805107530805244e-05, "loss": 1.2343, "num_input_tokens_seen": 129547976, "step": 3310 }, { "epoch": 0.16086830119197595, "grad_norm": 0.39913055300712585, "learning_rate": 4.778900363971729e-05, "loss": 1.2078, "num_input_tokens_seen": 129961780, "step": 3320 }, { "epoch": 0.1613528442678554, "grad_norm": 0.4164119362831116, "learning_rate": 4.777284362010504e-05, "loss": 1.2501, "num_input_tokens_seen": 130358684, "step": 3330 }, { "epoch": 0.16183738734373485, "grad_norm": 0.3951362073421478, "learning_rate": 4.775662751177003e-05, "loss": 1.206, "num_input_tokens_seen": 130762100, "step": 3340 }, { "epoch": 0.1623219304196143, "grad_norm": 0.4006355404853821, "learning_rate": 4.774035535465201e-05, "loss": 1.2516, "num_input_tokens_seen": 131146316, "step": 3350 }, { "epoch": 0.16280647349549374, "grad_norm": 0.41116824746131897, "learning_rate": 4.7724027188828716e-05, "loss": 1.1868, "num_input_tokens_seen": 131538384, "step": 3360 }, { "epoch": 0.16329101657137318, "grad_norm": 0.4331505596637726, "learning_rate": 4.7707643054515855e-05, "loss": 1.2212, "num_input_tokens_seen": 131926864, "step": 3370 }, { "epoch": 0.16377555964725263, "grad_norm": 0.3957429826259613, "learning_rate": 4.7691202992066984e-05, "loss": 1.1938, "num_input_tokens_seen": 132309648, "step": 3380 }, { "epoch": 0.16426010272313207, "grad_norm": 0.3914317786693573, "learning_rate": 4.767470704197341e-05, "loss": 1.2082, "num_input_tokens_seen": 132725476, "step": 3390 }, { "epoch": 0.16474464579901152, "grad_norm": 0.4648467004299164, "learning_rate": 4.765815524486407e-05, "loss": 1.2276, "num_input_tokens_seen": 133107108, "step": 3400 }, { "epoch": 0.16522918887489096, "grad_norm": 0.3745146691799164, "learning_rate": 4.764154764150548e-05, "loss": 1.223, "num_input_tokens_seen": 133483520, "step": 3410 }, { "epoch": 0.1657137319507704, "grad_norm": 0.38040289282798767, "learning_rate": 4.762488427280159e-05, "loss": 1.1943, "num_input_tokens_seen": 133871252, "step": 3420 }, { "epoch": 0.16619827502664986, "grad_norm": 0.41370365023612976, "learning_rate": 4.760816517979369e-05, "loss": 1.2548, "num_input_tokens_seen": 134284456, "step": 3430 }, { "epoch": 0.16668281810252933, "grad_norm": 0.4398741126060486, "learning_rate": 4.7591390403660326e-05, "loss": 1.2549, "num_input_tokens_seen": 134684752, "step": 3440 }, { "epoch": 0.16716736117840877, "grad_norm": 0.41080906987190247, "learning_rate": 4.7574559985717196e-05, "loss": 1.1814, "num_input_tokens_seen": 135115868, "step": 3450 }, { "epoch": 0.16765190425428822, "grad_norm": 0.4656144678592682, "learning_rate": 4.7557673967417024e-05, "loss": 1.1833, "num_input_tokens_seen": 135511836, "step": 3460 }, { "epoch": 0.16813644733016767, "grad_norm": 0.3965156376361847, "learning_rate": 4.7540732390349494e-05, "loss": 1.2127, "num_input_tokens_seen": 135920396, "step": 3470 }, { "epoch": 0.1686209904060471, "grad_norm": 0.400642991065979, "learning_rate": 4.752373529624113e-05, "loss": 1.2603, "num_input_tokens_seen": 136333964, "step": 3480 }, { "epoch": 0.16910553348192656, "grad_norm": 0.4004031717777252, "learning_rate": 4.7506682726955166e-05, "loss": 1.2232, "num_input_tokens_seen": 136718116, "step": 3490 }, { "epoch": 0.169590076557806, "grad_norm": 0.40622180700302124, "learning_rate": 4.74895747244915e-05, "loss": 1.1966, "num_input_tokens_seen": 137109584, "step": 3500 }, { "epoch": 0.17007461963368545, "grad_norm": 0.41456958651542664, "learning_rate": 4.747241133098655e-05, "loss": 1.2557, "num_input_tokens_seen": 137510040, "step": 3510 }, { "epoch": 0.1705591627095649, "grad_norm": 0.38758885860443115, "learning_rate": 4.745519258871314e-05, "loss": 1.2414, "num_input_tokens_seen": 137901036, "step": 3520 }, { "epoch": 0.17104370578544434, "grad_norm": 0.4364662766456604, "learning_rate": 4.743791854008045e-05, "loss": 1.2312, "num_input_tokens_seen": 138292348, "step": 3530 }, { "epoch": 0.17152824886132378, "grad_norm": 0.40788963437080383, "learning_rate": 4.742058922763386e-05, "loss": 1.2465, "num_input_tokens_seen": 138699080, "step": 3540 }, { "epoch": 0.17201279193720323, "grad_norm": 0.39772936701774597, "learning_rate": 4.740320469405487e-05, "loss": 1.2562, "num_input_tokens_seen": 139086780, "step": 3550 }, { "epoch": 0.17249733501308268, "grad_norm": 0.4041747748851776, "learning_rate": 4.738576498216097e-05, "loss": 1.193, "num_input_tokens_seen": 139486640, "step": 3560 }, { "epoch": 0.17298187808896212, "grad_norm": 0.4034312069416046, "learning_rate": 4.7368270134905565e-05, "loss": 1.1797, "num_input_tokens_seen": 139891212, "step": 3570 }, { "epoch": 0.17346642116484157, "grad_norm": 0.3874165713787079, "learning_rate": 4.735072019537786e-05, "loss": 1.2149, "num_input_tokens_seen": 140300916, "step": 3580 }, { "epoch": 0.173950964240721, "grad_norm": 0.3958899974822998, "learning_rate": 4.733311520680276e-05, "loss": 1.2303, "num_input_tokens_seen": 140685408, "step": 3590 }, { "epoch": 0.17443550731660046, "grad_norm": 0.42453455924987793, "learning_rate": 4.7315455212540714e-05, "loss": 1.2372, "num_input_tokens_seen": 141095616, "step": 3600 }, { "epoch": 0.1749200503924799, "grad_norm": 0.4058285057544708, "learning_rate": 4.7297740256087695e-05, "loss": 1.1825, "num_input_tokens_seen": 141494816, "step": 3610 }, { "epoch": 0.17540459346835935, "grad_norm": 0.3956435024738312, "learning_rate": 4.727997038107501e-05, "loss": 1.1843, "num_input_tokens_seen": 141885796, "step": 3620 }, { "epoch": 0.1758891365442388, "grad_norm": 0.40899357199668884, "learning_rate": 4.726214563126926e-05, "loss": 1.2292, "num_input_tokens_seen": 142285860, "step": 3630 }, { "epoch": 0.17637367962011824, "grad_norm": 0.41558772325515747, "learning_rate": 4.7244266050572175e-05, "loss": 1.2355, "num_input_tokens_seen": 142706648, "step": 3640 }, { "epoch": 0.17685822269599769, "grad_norm": 0.40557295083999634, "learning_rate": 4.722633168302054e-05, "loss": 1.2054, "num_input_tokens_seen": 143098084, "step": 3650 }, { "epoch": 0.17734276577187713, "grad_norm": 0.41156765818595886, "learning_rate": 4.720834257278609e-05, "loss": 1.1816, "num_input_tokens_seen": 143490440, "step": 3660 }, { "epoch": 0.17782730884775658, "grad_norm": 0.4566863179206848, "learning_rate": 4.719029876417537e-05, "loss": 1.2687, "num_input_tokens_seen": 143900176, "step": 3670 }, { "epoch": 0.17831185192363602, "grad_norm": 0.38572782278060913, "learning_rate": 4.717220030162964e-05, "loss": 1.2074, "num_input_tokens_seen": 144266036, "step": 3680 }, { "epoch": 0.17879639499951547, "grad_norm": 0.4045168459415436, "learning_rate": 4.715404722972481e-05, "loss": 1.2177, "num_input_tokens_seen": 144661236, "step": 3690 }, { "epoch": 0.1792809380753949, "grad_norm": 0.43136268854141235, "learning_rate": 4.713583959317125e-05, "loss": 1.231, "num_input_tokens_seen": 145039188, "step": 3700 }, { "epoch": 0.17976548115127436, "grad_norm": 0.37638697028160095, "learning_rate": 4.711757743681375e-05, "loss": 1.265, "num_input_tokens_seen": 145471096, "step": 3710 }, { "epoch": 0.1802500242271538, "grad_norm": 0.418754905462265, "learning_rate": 4.7099260805631354e-05, "loss": 1.2135, "num_input_tokens_seen": 145875596, "step": 3720 }, { "epoch": 0.18073456730303325, "grad_norm": 0.3730713725090027, "learning_rate": 4.7080889744737275e-05, "loss": 1.219, "num_input_tokens_seen": 146269480, "step": 3730 }, { "epoch": 0.1812191103789127, "grad_norm": 0.3674296438694, "learning_rate": 4.7062464299378835e-05, "loss": 1.2366, "num_input_tokens_seen": 146676848, "step": 3740 }, { "epoch": 0.18170365345479214, "grad_norm": 0.4344138205051422, "learning_rate": 4.7043984514937236e-05, "loss": 1.2243, "num_input_tokens_seen": 147058324, "step": 3750 }, { "epoch": 0.1821881965306716, "grad_norm": 0.3907421827316284, "learning_rate": 4.7025450436927555e-05, "loss": 1.2282, "num_input_tokens_seen": 147448108, "step": 3760 }, { "epoch": 0.18267273960655103, "grad_norm": 0.40709200501441956, "learning_rate": 4.700686211099859e-05, "loss": 1.2951, "num_input_tokens_seen": 147840480, "step": 3770 }, { "epoch": 0.18315728268243048, "grad_norm": 0.4162179231643677, "learning_rate": 4.698821958293273e-05, "loss": 1.2506, "num_input_tokens_seen": 148246316, "step": 3780 }, { "epoch": 0.18364182575830992, "grad_norm": 0.41495996713638306, "learning_rate": 4.696952289864587e-05, "loss": 1.2097, "num_input_tokens_seen": 148608520, "step": 3790 }, { "epoch": 0.18412636883418937, "grad_norm": 0.3995479643344879, "learning_rate": 4.6950772104187303e-05, "loss": 1.2055, "num_input_tokens_seen": 148986420, "step": 3800 }, { "epoch": 0.18461091191006881, "grad_norm": 0.3885250985622406, "learning_rate": 4.6931967245739586e-05, "loss": 1.1965, "num_input_tokens_seen": 149370912, "step": 3810 }, { "epoch": 0.18509545498594826, "grad_norm": 0.4217512905597687, "learning_rate": 4.691310836961843e-05, "loss": 1.2325, "num_input_tokens_seen": 149784152, "step": 3820 }, { "epoch": 0.1855799980618277, "grad_norm": 0.4255019724369049, "learning_rate": 4.689419552227259e-05, "loss": 1.2151, "num_input_tokens_seen": 150163748, "step": 3830 }, { "epoch": 0.18606454113770715, "grad_norm": 0.3889770209789276, "learning_rate": 4.687522875028376e-05, "loss": 1.1787, "num_input_tokens_seen": 150567952, "step": 3840 }, { "epoch": 0.1865490842135866, "grad_norm": 0.39190900325775146, "learning_rate": 4.685620810036642e-05, "loss": 1.2238, "num_input_tokens_seen": 150990236, "step": 3850 }, { "epoch": 0.18703362728946604, "grad_norm": 0.380062997341156, "learning_rate": 4.683713361936779e-05, "loss": 1.2109, "num_input_tokens_seen": 151403764, "step": 3860 }, { "epoch": 0.1875181703653455, "grad_norm": 0.38644662499427795, "learning_rate": 4.681800535426765e-05, "loss": 1.1836, "num_input_tokens_seen": 151781412, "step": 3870 }, { "epoch": 0.18800271344122493, "grad_norm": 0.41668441891670227, "learning_rate": 4.679882335217825e-05, "loss": 1.1741, "num_input_tokens_seen": 152190552, "step": 3880 }, { "epoch": 0.18848725651710438, "grad_norm": 0.43978846073150635, "learning_rate": 4.6779587660344195e-05, "loss": 1.2325, "num_input_tokens_seen": 152621828, "step": 3890 }, { "epoch": 0.18897179959298382, "grad_norm": 0.3876698911190033, "learning_rate": 4.676029832614231e-05, "loss": 1.2592, "num_input_tokens_seen": 153007936, "step": 3900 }, { "epoch": 0.18945634266886327, "grad_norm": 0.40177682042121887, "learning_rate": 4.6740955397081594e-05, "loss": 1.2129, "num_input_tokens_seen": 153391480, "step": 3910 }, { "epoch": 0.18994088574474272, "grad_norm": 0.44533804059028625, "learning_rate": 4.672155892080298e-05, "loss": 1.2284, "num_input_tokens_seen": 153772340, "step": 3920 }, { "epoch": 0.19042542882062216, "grad_norm": 0.4126012921333313, "learning_rate": 4.670210894507932e-05, "loss": 1.2157, "num_input_tokens_seen": 154169508, "step": 3930 }, { "epoch": 0.1909099718965016, "grad_norm": 0.4048556685447693, "learning_rate": 4.668260551781522e-05, "loss": 1.2128, "num_input_tokens_seen": 154556080, "step": 3940 }, { "epoch": 0.19139451497238105, "grad_norm": 0.41531628370285034, "learning_rate": 4.6663048687046965e-05, "loss": 1.2164, "num_input_tokens_seen": 154956220, "step": 3950 }, { "epoch": 0.1918790580482605, "grad_norm": 0.4042895436286926, "learning_rate": 4.6643438500942324e-05, "loss": 1.2069, "num_input_tokens_seen": 155328904, "step": 3960 }, { "epoch": 0.19236360112413994, "grad_norm": 0.38158828020095825, "learning_rate": 4.662377500780053e-05, "loss": 1.2578, "num_input_tokens_seen": 155742432, "step": 3970 }, { "epoch": 0.1928481442000194, "grad_norm": 0.3938775956630707, "learning_rate": 4.660405825605207e-05, "loss": 1.2278, "num_input_tokens_seen": 156106304, "step": 3980 }, { "epoch": 0.19333268727589883, "grad_norm": 0.4105328917503357, "learning_rate": 4.6584288294258623e-05, "loss": 1.2094, "num_input_tokens_seen": 156496196, "step": 3990 }, { "epoch": 0.19381723035177828, "grad_norm": 0.3988109827041626, "learning_rate": 4.6564465171112916e-05, "loss": 1.2248, "num_input_tokens_seen": 156868756, "step": 4000 }, { "epoch": 0.19381723035177828, "eval_loss": 1.2256019115447998, "eval_runtime": 5.1669, "eval_samples_per_second": 29.031, "eval_steps_per_second": 3.677, "num_input_tokens_seen": 156868756, "step": 4000 }, { "epoch": 0.19430177342765773, "grad_norm": 0.3892788887023926, "learning_rate": 4.654458893543861e-05, "loss": 1.1957, "num_input_tokens_seen": 157256772, "step": 4010 }, { "epoch": 0.19478631650353717, "grad_norm": 0.42166975140571594, "learning_rate": 4.6524659636190183e-05, "loss": 1.2207, "num_input_tokens_seen": 157653400, "step": 4020 }, { "epoch": 0.19527085957941662, "grad_norm": 0.4215097725391388, "learning_rate": 4.650467732245282e-05, "loss": 1.257, "num_input_tokens_seen": 158042052, "step": 4030 }, { "epoch": 0.19575540265529606, "grad_norm": 0.4513319432735443, "learning_rate": 4.648464204344224e-05, "loss": 1.2021, "num_input_tokens_seen": 158421912, "step": 4040 }, { "epoch": 0.1962399457311755, "grad_norm": 0.3815930485725403, "learning_rate": 4.646455384850466e-05, "loss": 1.2464, "num_input_tokens_seen": 158794420, "step": 4050 }, { "epoch": 0.19672448880705495, "grad_norm": 0.4318133294582367, "learning_rate": 4.64444127871166e-05, "loss": 1.1736, "num_input_tokens_seen": 159183072, "step": 4060 }, { "epoch": 0.1972090318829344, "grad_norm": 0.4135242700576782, "learning_rate": 4.6424218908884795e-05, "loss": 1.2094, "num_input_tokens_seen": 159567032, "step": 4070 }, { "epoch": 0.19769357495881384, "grad_norm": 0.410666823387146, "learning_rate": 4.640397226354607e-05, "loss": 1.2323, "num_input_tokens_seen": 159937052, "step": 4080 }, { "epoch": 0.1981781180346933, "grad_norm": 0.42202097177505493, "learning_rate": 4.63836729009672e-05, "loss": 1.209, "num_input_tokens_seen": 160346176, "step": 4090 }, { "epoch": 0.19866266111057274, "grad_norm": 0.42181599140167236, "learning_rate": 4.636332087114481e-05, "loss": 1.1995, "num_input_tokens_seen": 160734656, "step": 4100 }, { "epoch": 0.19914720418645218, "grad_norm": 0.42154404520988464, "learning_rate": 4.6342916224205254e-05, "loss": 1.2422, "num_input_tokens_seen": 161143420, "step": 4110 }, { "epoch": 0.19963174726233163, "grad_norm": 0.38917213678359985, "learning_rate": 4.632245901040446e-05, "loss": 1.2135, "num_input_tokens_seen": 161523856, "step": 4120 }, { "epoch": 0.20011629033821107, "grad_norm": 0.41019120812416077, "learning_rate": 4.6301949280127835e-05, "loss": 1.2309, "num_input_tokens_seen": 161906012, "step": 4130 }, { "epoch": 0.20060083341409052, "grad_norm": 0.3908578157424927, "learning_rate": 4.6281387083890134e-05, "loss": 1.2335, "num_input_tokens_seen": 162302056, "step": 4140 }, { "epoch": 0.20108537648996996, "grad_norm": 0.4184846878051758, "learning_rate": 4.626077247233533e-05, "loss": 1.207, "num_input_tokens_seen": 162674300, "step": 4150 }, { "epoch": 0.2015699195658494, "grad_norm": 0.41114839911460876, "learning_rate": 4.62401054962365e-05, "loss": 1.2287, "num_input_tokens_seen": 163082608, "step": 4160 }, { "epoch": 0.20205446264172885, "grad_norm": 0.4034859538078308, "learning_rate": 4.6219386206495684e-05, "loss": 1.2536, "num_input_tokens_seen": 163463012, "step": 4170 }, { "epoch": 0.2025390057176083, "grad_norm": 0.37627366185188293, "learning_rate": 4.6198614654143765e-05, "loss": 1.2428, "num_input_tokens_seen": 163855604, "step": 4180 }, { "epoch": 0.20302354879348775, "grad_norm": 0.3891449272632599, "learning_rate": 4.6177790890340376e-05, "loss": 1.2113, "num_input_tokens_seen": 164258760, "step": 4190 }, { "epoch": 0.2035080918693672, "grad_norm": 0.3885304033756256, "learning_rate": 4.615691496637371e-05, "loss": 1.2276, "num_input_tokens_seen": 164630872, "step": 4200 }, { "epoch": 0.20399263494524664, "grad_norm": 0.4263809323310852, "learning_rate": 4.613598693366044e-05, "loss": 1.1905, "num_input_tokens_seen": 165040256, "step": 4210 }, { "epoch": 0.20447717802112608, "grad_norm": 0.3751036524772644, "learning_rate": 4.611500684374559e-05, "loss": 1.2174, "num_input_tokens_seen": 165424264, "step": 4220 }, { "epoch": 0.20496172109700553, "grad_norm": 0.40289247035980225, "learning_rate": 4.6093974748302385e-05, "loss": 1.1899, "num_input_tokens_seen": 165812068, "step": 4230 }, { "epoch": 0.20544626417288497, "grad_norm": 0.4211735427379608, "learning_rate": 4.6072890699132155e-05, "loss": 1.1992, "num_input_tokens_seen": 166177560, "step": 4240 }, { "epoch": 0.20593080724876442, "grad_norm": 0.36460644006729126, "learning_rate": 4.605175474816418e-05, "loss": 1.2103, "num_input_tokens_seen": 166578756, "step": 4250 }, { "epoch": 0.20641535032464386, "grad_norm": 0.3921453356742859, "learning_rate": 4.603056694745556e-05, "loss": 1.2143, "num_input_tokens_seen": 166975748, "step": 4260 }, { "epoch": 0.2068998934005233, "grad_norm": 0.3987452983856201, "learning_rate": 4.600932734919113e-05, "loss": 1.2292, "num_input_tokens_seen": 167388080, "step": 4270 }, { "epoch": 0.20738443647640276, "grad_norm": 0.4142529368400574, "learning_rate": 4.5988036005683265e-05, "loss": 1.2048, "num_input_tokens_seen": 167784884, "step": 4280 }, { "epoch": 0.2078689795522822, "grad_norm": 0.39146703481674194, "learning_rate": 4.596669296937182e-05, "loss": 1.2422, "num_input_tokens_seen": 168184036, "step": 4290 }, { "epoch": 0.20835352262816165, "grad_norm": 0.3954419791698456, "learning_rate": 4.594529829282395e-05, "loss": 1.2297, "num_input_tokens_seen": 168575028, "step": 4300 }, { "epoch": 0.2088380657040411, "grad_norm": 0.4144536554813385, "learning_rate": 4.5923852028733985e-05, "loss": 1.1951, "num_input_tokens_seen": 168948296, "step": 4310 }, { "epoch": 0.20932260877992054, "grad_norm": 0.3958514630794525, "learning_rate": 4.590235422992335e-05, "loss": 1.1684, "num_input_tokens_seen": 169357988, "step": 4320 }, { "epoch": 0.20980715185579998, "grad_norm": 0.40073680877685547, "learning_rate": 4.588080494934036e-05, "loss": 1.2098, "num_input_tokens_seen": 169761752, "step": 4330 }, { "epoch": 0.21029169493167943, "grad_norm": 0.37582847476005554, "learning_rate": 4.585920424006015e-05, "loss": 1.2668, "num_input_tokens_seen": 170160840, "step": 4340 }, { "epoch": 0.21077623800755887, "grad_norm": 0.36600205302238464, "learning_rate": 4.5837552155284516e-05, "loss": 1.2413, "num_input_tokens_seen": 170562748, "step": 4350 }, { "epoch": 0.21126078108343832, "grad_norm": 0.41803476214408875, "learning_rate": 4.581584874834179e-05, "loss": 1.2522, "num_input_tokens_seen": 170956952, "step": 4360 }, { "epoch": 0.21174532415931777, "grad_norm": 0.43290525674819946, "learning_rate": 4.5794094072686716e-05, "loss": 1.254, "num_input_tokens_seen": 171333724, "step": 4370 }, { "epoch": 0.2122298672351972, "grad_norm": 0.4034164845943451, "learning_rate": 4.5772288181900294e-05, "loss": 1.2486, "num_input_tokens_seen": 171727380, "step": 4380 }, { "epoch": 0.21271441031107666, "grad_norm": 0.4040049910545349, "learning_rate": 4.575043112968969e-05, "loss": 1.24, "num_input_tokens_seen": 172114780, "step": 4390 }, { "epoch": 0.2131989533869561, "grad_norm": 0.39204099774360657, "learning_rate": 4.5728522969888044e-05, "loss": 1.2371, "num_input_tokens_seen": 172504760, "step": 4400 }, { "epoch": 0.21368349646283555, "grad_norm": 0.41550543904304504, "learning_rate": 4.5706563756454414e-05, "loss": 1.1737, "num_input_tokens_seen": 172866336, "step": 4410 }, { "epoch": 0.214168039538715, "grad_norm": 0.3796961009502411, "learning_rate": 4.5684553543473555e-05, "loss": 1.2245, "num_input_tokens_seen": 173274608, "step": 4420 }, { "epoch": 0.21465258261459444, "grad_norm": 0.39554864168167114, "learning_rate": 4.5662492385155886e-05, "loss": 1.2163, "num_input_tokens_seen": 173670996, "step": 4430 }, { "epoch": 0.21513712569047388, "grad_norm": 0.41738855838775635, "learning_rate": 4.564038033583725e-05, "loss": 1.1744, "num_input_tokens_seen": 174089336, "step": 4440 }, { "epoch": 0.21562166876635333, "grad_norm": 0.38417306542396545, "learning_rate": 4.561821744997887e-05, "loss": 1.2079, "num_input_tokens_seen": 174457260, "step": 4450 }, { "epoch": 0.21610621184223278, "grad_norm": 0.3893676698207855, "learning_rate": 4.5596003782167154e-05, "loss": 1.2236, "num_input_tokens_seen": 174845308, "step": 4460 }, { "epoch": 0.21659075491811222, "grad_norm": 0.5625600218772888, "learning_rate": 4.55737393871136e-05, "loss": 1.1457, "num_input_tokens_seen": 175217384, "step": 4470 }, { "epoch": 0.21707529799399167, "grad_norm": 0.4266904294490814, "learning_rate": 4.555142431965465e-05, "loss": 1.2847, "num_input_tokens_seen": 175597084, "step": 4480 }, { "epoch": 0.2175598410698711, "grad_norm": 0.40159592032432556, "learning_rate": 4.552905863475152e-05, "loss": 1.2064, "num_input_tokens_seen": 175985196, "step": 4490 }, { "epoch": 0.21804438414575056, "grad_norm": 0.3944200873374939, "learning_rate": 4.550664238749012e-05, "loss": 1.1969, "num_input_tokens_seen": 176390156, "step": 4500 }, { "epoch": 0.21852892722163, "grad_norm": 0.37802281975746155, "learning_rate": 4.54841756330809e-05, "loss": 1.1757, "num_input_tokens_seen": 176751292, "step": 4510 }, { "epoch": 0.21901347029750945, "grad_norm": 0.46125710010528564, "learning_rate": 4.546165842685869e-05, "loss": 1.1816, "num_input_tokens_seen": 177136472, "step": 4520 }, { "epoch": 0.2194980133733889, "grad_norm": 0.40457674860954285, "learning_rate": 4.543909082428257e-05, "loss": 1.2635, "num_input_tokens_seen": 177564052, "step": 4530 }, { "epoch": 0.21998255644926834, "grad_norm": 0.3985607326030731, "learning_rate": 4.541647288093579e-05, "loss": 1.2209, "num_input_tokens_seen": 177977356, "step": 4540 }, { "epoch": 0.22046709952514779, "grad_norm": 0.41993504762649536, "learning_rate": 4.539380465252554e-05, "loss": 1.1924, "num_input_tokens_seen": 178361500, "step": 4550 }, { "epoch": 0.22095164260102723, "grad_norm": 0.4131138324737549, "learning_rate": 4.5371086194882886e-05, "loss": 1.2114, "num_input_tokens_seen": 178786424, "step": 4560 }, { "epoch": 0.22143618567690668, "grad_norm": 0.3843264579772949, "learning_rate": 4.5348317563962594e-05, "loss": 1.2211, "num_input_tokens_seen": 179173444, "step": 4570 }, { "epoch": 0.22192072875278612, "grad_norm": 0.39719441533088684, "learning_rate": 4.532549881584301e-05, "loss": 1.2099, "num_input_tokens_seen": 179550012, "step": 4580 }, { "epoch": 0.22240527182866557, "grad_norm": 0.41848018765449524, "learning_rate": 4.5302630006725934e-05, "loss": 1.2015, "num_input_tokens_seen": 179934896, "step": 4590 }, { "epoch": 0.22288981490454501, "grad_norm": 0.37484750151634216, "learning_rate": 4.527971119293643e-05, "loss": 1.2331, "num_input_tokens_seen": 180319432, "step": 4600 }, { "epoch": 0.22337435798042446, "grad_norm": 0.40925487875938416, "learning_rate": 4.525674243092275e-05, "loss": 1.2214, "num_input_tokens_seen": 180713196, "step": 4610 }, { "epoch": 0.2238589010563039, "grad_norm": 0.381757915019989, "learning_rate": 4.523372377725615e-05, "loss": 1.1964, "num_input_tokens_seen": 181112116, "step": 4620 }, { "epoch": 0.22434344413218335, "grad_norm": 0.3882332742214203, "learning_rate": 4.5210655288630774e-05, "loss": 1.189, "num_input_tokens_seen": 181537372, "step": 4630 }, { "epoch": 0.2248279872080628, "grad_norm": 0.3924133777618408, "learning_rate": 4.518753702186352e-05, "loss": 1.2174, "num_input_tokens_seen": 181917440, "step": 4640 }, { "epoch": 0.22531253028394224, "grad_norm": 0.3810102045536041, "learning_rate": 4.5164369033893874e-05, "loss": 1.235, "num_input_tokens_seen": 182302576, "step": 4650 }, { "epoch": 0.2257970733598217, "grad_norm": 0.4031634032726288, "learning_rate": 4.514115138178378e-05, "loss": 1.2213, "num_input_tokens_seen": 182677192, "step": 4660 }, { "epoch": 0.22628161643570113, "grad_norm": 0.4006711542606354, "learning_rate": 4.511788412271749e-05, "loss": 1.1881, "num_input_tokens_seen": 183057516, "step": 4670 }, { "epoch": 0.22676615951158058, "grad_norm": 0.42727628350257874, "learning_rate": 4.509456731400149e-05, "loss": 1.1956, "num_input_tokens_seen": 183446504, "step": 4680 }, { "epoch": 0.22725070258746002, "grad_norm": 0.419729620218277, "learning_rate": 4.507120101306425e-05, "loss": 1.2287, "num_input_tokens_seen": 183853916, "step": 4690 }, { "epoch": 0.22773524566333947, "grad_norm": 0.3630729019641876, "learning_rate": 4.504778527745616e-05, "loss": 1.179, "num_input_tokens_seen": 184251140, "step": 4700 }, { "epoch": 0.22821978873921892, "grad_norm": 0.3897242248058319, "learning_rate": 4.5024320164849366e-05, "loss": 1.1704, "num_input_tokens_seen": 184655808, "step": 4710 }, { "epoch": 0.22870433181509836, "grad_norm": 0.3991836905479431, "learning_rate": 4.500080573303762e-05, "loss": 1.2208, "num_input_tokens_seen": 185054436, "step": 4720 }, { "epoch": 0.2291888748909778, "grad_norm": 0.39902886748313904, "learning_rate": 4.497724203993615e-05, "loss": 1.2088, "num_input_tokens_seen": 185439916, "step": 4730 }, { "epoch": 0.22967341796685725, "grad_norm": 0.39412978291511536, "learning_rate": 4.495362914358152e-05, "loss": 1.2317, "num_input_tokens_seen": 185844756, "step": 4740 }, { "epoch": 0.2301579610427367, "grad_norm": 0.37934547662734985, "learning_rate": 4.4929967102131473e-05, "loss": 1.2081, "num_input_tokens_seen": 186226308, "step": 4750 }, { "epoch": 0.23064250411861614, "grad_norm": 0.405785471200943, "learning_rate": 4.49062559738648e-05, "loss": 1.1877, "num_input_tokens_seen": 186592700, "step": 4760 }, { "epoch": 0.2311270471944956, "grad_norm": 0.36941656470298767, "learning_rate": 4.488249581718118e-05, "loss": 1.2322, "num_input_tokens_seen": 186963268, "step": 4770 }, { "epoch": 0.23161159027037503, "grad_norm": 0.3917679488658905, "learning_rate": 4.485868669060104e-05, "loss": 1.2006, "num_input_tokens_seen": 187364124, "step": 4780 }, { "epoch": 0.23209613334625448, "grad_norm": 0.38910412788391113, "learning_rate": 4.483482865276545e-05, "loss": 1.2244, "num_input_tokens_seen": 187761036, "step": 4790 }, { "epoch": 0.23258067642213393, "grad_norm": 0.42575618624687195, "learning_rate": 4.481092176243592e-05, "loss": 1.2119, "num_input_tokens_seen": 188156176, "step": 4800 }, { "epoch": 0.23306521949801337, "grad_norm": 0.3848058879375458, "learning_rate": 4.4786966078494296e-05, "loss": 1.2539, "num_input_tokens_seen": 188549760, "step": 4810 }, { "epoch": 0.23354976257389282, "grad_norm": 0.4218595623970032, "learning_rate": 4.47629616599426e-05, "loss": 1.2262, "num_input_tokens_seen": 188935732, "step": 4820 }, { "epoch": 0.23403430564977226, "grad_norm": 0.3957028388977051, "learning_rate": 4.473890856590287e-05, "loss": 1.2091, "num_input_tokens_seen": 189364744, "step": 4830 }, { "epoch": 0.2345188487256517, "grad_norm": 0.3964502811431885, "learning_rate": 4.471480685561704e-05, "loss": 1.1825, "num_input_tokens_seen": 189761404, "step": 4840 }, { "epoch": 0.23500339180153115, "grad_norm": 0.44072818756103516, "learning_rate": 4.469065658844679e-05, "loss": 1.1775, "num_input_tokens_seen": 190143308, "step": 4850 }, { "epoch": 0.2354879348774106, "grad_norm": 0.4088262617588043, "learning_rate": 4.466645782387339e-05, "loss": 1.1938, "num_input_tokens_seen": 190553260, "step": 4860 }, { "epoch": 0.23597247795329004, "grad_norm": 0.3793879449367523, "learning_rate": 4.464221062149756e-05, "loss": 1.184, "num_input_tokens_seen": 190944324, "step": 4870 }, { "epoch": 0.2364570210291695, "grad_norm": 0.3843640089035034, "learning_rate": 4.461791504103931e-05, "loss": 1.1906, "num_input_tokens_seen": 191314092, "step": 4880 }, { "epoch": 0.23694156410504894, "grad_norm": 0.42063283920288086, "learning_rate": 4.459357114233781e-05, "loss": 1.2077, "num_input_tokens_seen": 191680088, "step": 4890 }, { "epoch": 0.23742610718092838, "grad_norm": 0.3904326558113098, "learning_rate": 4.4569178985351246e-05, "loss": 1.1678, "num_input_tokens_seen": 192097904, "step": 4900 }, { "epoch": 0.23791065025680783, "grad_norm": 0.40638238191604614, "learning_rate": 4.4544738630156644e-05, "loss": 1.2103, "num_input_tokens_seen": 192484692, "step": 4910 }, { "epoch": 0.23839519333268727, "grad_norm": 0.4055958092212677, "learning_rate": 4.4520250136949755e-05, "loss": 1.2244, "num_input_tokens_seen": 192862996, "step": 4920 }, { "epoch": 0.23887973640856672, "grad_norm": 0.42126190662384033, "learning_rate": 4.449571356604488e-05, "loss": 1.1551, "num_input_tokens_seen": 193256264, "step": 4930 }, { "epoch": 0.23936427948444616, "grad_norm": 0.3869550824165344, "learning_rate": 4.4471128977874755e-05, "loss": 1.2033, "num_input_tokens_seen": 193631576, "step": 4940 }, { "epoch": 0.2398488225603256, "grad_norm": 0.4147154986858368, "learning_rate": 4.4446496432990345e-05, "loss": 1.1741, "num_input_tokens_seen": 194010208, "step": 4950 }, { "epoch": 0.24033336563620505, "grad_norm": 0.39161938428878784, "learning_rate": 4.442181599206078e-05, "loss": 1.2042, "num_input_tokens_seen": 194370712, "step": 4960 }, { "epoch": 0.2408179087120845, "grad_norm": 0.40375518798828125, "learning_rate": 4.439708771587311e-05, "loss": 1.2074, "num_input_tokens_seen": 194764676, "step": 4970 }, { "epoch": 0.24130245178796395, "grad_norm": 0.37387263774871826, "learning_rate": 4.437231166533222e-05, "loss": 1.23, "num_input_tokens_seen": 195173608, "step": 4980 }, { "epoch": 0.2417869948638434, "grad_norm": 0.45811185240745544, "learning_rate": 4.434748790146066e-05, "loss": 1.1996, "num_input_tokens_seen": 195550528, "step": 4990 }, { "epoch": 0.24227153793972284, "grad_norm": 0.40425029397010803, "learning_rate": 4.432261648539852e-05, "loss": 1.1837, "num_input_tokens_seen": 195956732, "step": 5000 }, { "epoch": 0.24275608101560228, "grad_norm": 0.40268510580062866, "learning_rate": 4.42976974784032e-05, "loss": 1.25, "num_input_tokens_seen": 196340952, "step": 5010 }, { "epoch": 0.24324062409148173, "grad_norm": 0.43269723653793335, "learning_rate": 4.427273094184938e-05, "loss": 1.174, "num_input_tokens_seen": 196711452, "step": 5020 }, { "epoch": 0.24372516716736117, "grad_norm": 0.3802777826786041, "learning_rate": 4.424771693722875e-05, "loss": 1.2558, "num_input_tokens_seen": 197112496, "step": 5030 }, { "epoch": 0.24420971024324062, "grad_norm": 0.39778804779052734, "learning_rate": 4.422265552614995e-05, "loss": 1.1993, "num_input_tokens_seen": 197504404, "step": 5040 }, { "epoch": 0.24469425331912006, "grad_norm": 0.4463038146495819, "learning_rate": 4.4197546770338346e-05, "loss": 1.196, "num_input_tokens_seen": 197904548, "step": 5050 }, { "epoch": 0.2451787963949995, "grad_norm": 0.37699079513549805, "learning_rate": 4.417239073163596e-05, "loss": 1.1525, "num_input_tokens_seen": 198319024, "step": 5060 }, { "epoch": 0.24566333947087896, "grad_norm": 0.3865511119365692, "learning_rate": 4.414718747200121e-05, "loss": 1.2275, "num_input_tokens_seen": 198714528, "step": 5070 }, { "epoch": 0.2461478825467584, "grad_norm": 0.3743216395378113, "learning_rate": 4.412193705350886e-05, "loss": 1.2307, "num_input_tokens_seen": 199085164, "step": 5080 }, { "epoch": 0.24663242562263785, "grad_norm": 0.41756245493888855, "learning_rate": 4.4096639538349835e-05, "loss": 1.2028, "num_input_tokens_seen": 199462308, "step": 5090 }, { "epoch": 0.2471169686985173, "grad_norm": 0.35077378153800964, "learning_rate": 4.407129498883101e-05, "loss": 1.1789, "num_input_tokens_seen": 199882788, "step": 5100 }, { "epoch": 0.24760151177439674, "grad_norm": 0.4078513979911804, "learning_rate": 4.404590346737514e-05, "loss": 1.2188, "num_input_tokens_seen": 200271336, "step": 5110 }, { "epoch": 0.24808605485027618, "grad_norm": 0.42663541436195374, "learning_rate": 4.402046503652065e-05, "loss": 1.2293, "num_input_tokens_seen": 200660788, "step": 5120 }, { "epoch": 0.24857059792615563, "grad_norm": 0.3861826956272125, "learning_rate": 4.399497975892153e-05, "loss": 1.2182, "num_input_tokens_seen": 201075832, "step": 5130 }, { "epoch": 0.24905514100203507, "grad_norm": 0.38945886492729187, "learning_rate": 4.3969447697347116e-05, "loss": 1.2464, "num_input_tokens_seen": 201447360, "step": 5140 }, { "epoch": 0.24953968407791452, "grad_norm": 0.3884553611278534, "learning_rate": 4.3943868914681996e-05, "loss": 1.2353, "num_input_tokens_seen": 201863948, "step": 5150 }, { "epoch": 0.25002422715379397, "grad_norm": 0.409061461687088, "learning_rate": 4.3918243473925823e-05, "loss": 1.2235, "num_input_tokens_seen": 202265196, "step": 5160 }, { "epoch": 0.25050877022967344, "grad_norm": 0.43314066529273987, "learning_rate": 4.389257143819318e-05, "loss": 1.2076, "num_input_tokens_seen": 202665352, "step": 5170 }, { "epoch": 0.25099331330555286, "grad_norm": 0.43011194467544556, "learning_rate": 4.386685287071337e-05, "loss": 1.1834, "num_input_tokens_seen": 203076796, "step": 5180 }, { "epoch": 0.25147785638143233, "grad_norm": 0.4096457064151764, "learning_rate": 4.3841087834830354e-05, "loss": 1.2336, "num_input_tokens_seen": 203488952, "step": 5190 }, { "epoch": 0.25196239945731175, "grad_norm": 0.4316791594028473, "learning_rate": 4.381527639400251e-05, "loss": 1.2205, "num_input_tokens_seen": 203855896, "step": 5200 }, { "epoch": 0.2524469425331912, "grad_norm": 0.4344671666622162, "learning_rate": 4.3789418611802533e-05, "loss": 1.1693, "num_input_tokens_seen": 204240628, "step": 5210 }, { "epoch": 0.25293148560907064, "grad_norm": 0.378844678401947, "learning_rate": 4.3763514551917236e-05, "loss": 1.1722, "num_input_tokens_seen": 204638860, "step": 5220 }, { "epoch": 0.2534160286849501, "grad_norm": 0.38779839873313904, "learning_rate": 4.3737564278147406e-05, "loss": 1.2179, "num_input_tokens_seen": 205028820, "step": 5230 }, { "epoch": 0.25390057176082953, "grad_norm": 0.4080614745616913, "learning_rate": 4.3711567854407683e-05, "loss": 1.1904, "num_input_tokens_seen": 205408752, "step": 5240 }, { "epoch": 0.254385114836709, "grad_norm": 0.40406113862991333, "learning_rate": 4.3685525344726345e-05, "loss": 1.2017, "num_input_tokens_seen": 205800048, "step": 5250 }, { "epoch": 0.2548696579125884, "grad_norm": 0.3979373872280121, "learning_rate": 4.365943681324519e-05, "loss": 1.2127, "num_input_tokens_seen": 206178400, "step": 5260 }, { "epoch": 0.2553542009884679, "grad_norm": 0.406194269657135, "learning_rate": 4.363330232421938e-05, "loss": 1.2299, "num_input_tokens_seen": 206550496, "step": 5270 }, { "epoch": 0.2558387440643473, "grad_norm": 0.43176546692848206, "learning_rate": 4.3607121942017225e-05, "loss": 1.1843, "num_input_tokens_seen": 206938216, "step": 5280 }, { "epoch": 0.2563232871402268, "grad_norm": 0.43860721588134766, "learning_rate": 4.358089573112012e-05, "loss": 1.1568, "num_input_tokens_seen": 207342028, "step": 5290 }, { "epoch": 0.2568078302161062, "grad_norm": 0.40631529688835144, "learning_rate": 4.355462375612231e-05, "loss": 1.2707, "num_input_tokens_seen": 207740984, "step": 5300 }, { "epoch": 0.2572923732919857, "grad_norm": 0.3890661895275116, "learning_rate": 4.352830608173074e-05, "loss": 1.1957, "num_input_tokens_seen": 208160252, "step": 5310 }, { "epoch": 0.2577769163678651, "grad_norm": 0.40882420539855957, "learning_rate": 4.350194277276495e-05, "loss": 1.1938, "num_input_tokens_seen": 208537272, "step": 5320 }, { "epoch": 0.25826145944374457, "grad_norm": 0.4055963456630707, "learning_rate": 4.347553389415684e-05, "loss": 1.1603, "num_input_tokens_seen": 208927108, "step": 5330 }, { "epoch": 0.258746002519624, "grad_norm": 0.3766281008720398, "learning_rate": 4.3449079510950555e-05, "loss": 1.2169, "num_input_tokens_seen": 209320208, "step": 5340 }, { "epoch": 0.25923054559550346, "grad_norm": 0.3955957591533661, "learning_rate": 4.3422579688302337e-05, "loss": 1.2096, "num_input_tokens_seen": 209704852, "step": 5350 }, { "epoch": 0.2597150886713829, "grad_norm": 0.416856050491333, "learning_rate": 4.339603449148031e-05, "loss": 1.2155, "num_input_tokens_seen": 210092272, "step": 5360 }, { "epoch": 0.26019963174726235, "grad_norm": 0.3843337893486023, "learning_rate": 4.33694439858644e-05, "loss": 1.2163, "num_input_tokens_seen": 210513236, "step": 5370 }, { "epoch": 0.26068417482314177, "grad_norm": 0.3668496310710907, "learning_rate": 4.334280823694607e-05, "loss": 1.2402, "num_input_tokens_seen": 210927300, "step": 5380 }, { "epoch": 0.26116871789902124, "grad_norm": 0.38131269812583923, "learning_rate": 4.331612731032826e-05, "loss": 1.1913, "num_input_tokens_seen": 211339824, "step": 5390 }, { "epoch": 0.26165326097490066, "grad_norm": 0.4179877042770386, "learning_rate": 4.328940127172516e-05, "loss": 1.197, "num_input_tokens_seen": 211752352, "step": 5400 }, { "epoch": 0.26213780405078013, "grad_norm": 0.42718562483787537, "learning_rate": 4.326263018696208e-05, "loss": 1.199, "num_input_tokens_seen": 212135324, "step": 5410 }, { "epoch": 0.26262234712665955, "grad_norm": 0.43352535367012024, "learning_rate": 4.3235814121975274e-05, "loss": 1.2094, "num_input_tokens_seen": 212524932, "step": 5420 }, { "epoch": 0.263106890202539, "grad_norm": 0.396100789308548, "learning_rate": 4.320895314281177e-05, "loss": 1.2254, "num_input_tokens_seen": 212938816, "step": 5430 }, { "epoch": 0.26359143327841844, "grad_norm": 0.3762286901473999, "learning_rate": 4.318204731562922e-05, "loss": 1.2321, "num_input_tokens_seen": 213336448, "step": 5440 }, { "epoch": 0.2640759763542979, "grad_norm": 0.35878944396972656, "learning_rate": 4.3155096706695755e-05, "loss": 1.2227, "num_input_tokens_seen": 213756868, "step": 5450 }, { "epoch": 0.26456051943017733, "grad_norm": 0.4023889899253845, "learning_rate": 4.312810138238979e-05, "loss": 1.2823, "num_input_tokens_seen": 214154532, "step": 5460 }, { "epoch": 0.2650450625060568, "grad_norm": 0.40626269578933716, "learning_rate": 4.310106140919986e-05, "loss": 1.2193, "num_input_tokens_seen": 214554500, "step": 5470 }, { "epoch": 0.2655296055819362, "grad_norm": 0.4005719721317291, "learning_rate": 4.307397685372448e-05, "loss": 1.1867, "num_input_tokens_seen": 214941668, "step": 5480 }, { "epoch": 0.2660141486578157, "grad_norm": 0.4291759431362152, "learning_rate": 4.304684778267199e-05, "loss": 1.2184, "num_input_tokens_seen": 215343812, "step": 5490 }, { "epoch": 0.2664986917336951, "grad_norm": 0.41158977150917053, "learning_rate": 4.3019674262860324e-05, "loss": 1.1839, "num_input_tokens_seen": 215688820, "step": 5500 }, { "epoch": 0.2669832348095746, "grad_norm": 0.4709673821926117, "learning_rate": 4.2992456361216934e-05, "loss": 1.2018, "num_input_tokens_seen": 216078056, "step": 5510 }, { "epoch": 0.267467777885454, "grad_norm": 0.47514909505844116, "learning_rate": 4.2965194144778554e-05, "loss": 1.2325, "num_input_tokens_seen": 216461372, "step": 5520 }, { "epoch": 0.2679523209613335, "grad_norm": 0.38686904311180115, "learning_rate": 4.293788768069108e-05, "loss": 1.2933, "num_input_tokens_seen": 216842388, "step": 5530 }, { "epoch": 0.2684368640372129, "grad_norm": 0.3987712860107422, "learning_rate": 4.291053703620939e-05, "loss": 1.2467, "num_input_tokens_seen": 217231240, "step": 5540 }, { "epoch": 0.26892140711309237, "grad_norm": 0.4047813415527344, "learning_rate": 4.288314227869716e-05, "loss": 1.2023, "num_input_tokens_seen": 217648192, "step": 5550 }, { "epoch": 0.2694059501889718, "grad_norm": 0.424683541059494, "learning_rate": 4.285570347562674e-05, "loss": 1.2134, "num_input_tokens_seen": 218017588, "step": 5560 }, { "epoch": 0.26989049326485126, "grad_norm": 0.4166494607925415, "learning_rate": 4.2828220694578935e-05, "loss": 1.1818, "num_input_tokens_seen": 218425896, "step": 5570 }, { "epoch": 0.2703750363407307, "grad_norm": 0.3912692964076996, "learning_rate": 4.28006940032429e-05, "loss": 1.1785, "num_input_tokens_seen": 218807588, "step": 5580 }, { "epoch": 0.27085957941661015, "grad_norm": 0.36660903692245483, "learning_rate": 4.2773123469415897e-05, "loss": 1.1937, "num_input_tokens_seen": 219185200, "step": 5590 }, { "epoch": 0.27134412249248957, "grad_norm": 0.39641043543815613, "learning_rate": 4.274550916100321e-05, "loss": 1.1904, "num_input_tokens_seen": 219546888, "step": 5600 }, { "epoch": 0.27182866556836904, "grad_norm": 0.371076375246048, "learning_rate": 4.271785114601791e-05, "loss": 1.2116, "num_input_tokens_seen": 219938908, "step": 5610 }, { "epoch": 0.27231320864424846, "grad_norm": 0.4308777153491974, "learning_rate": 4.269014949258072e-05, "loss": 1.1918, "num_input_tokens_seen": 220337796, "step": 5620 }, { "epoch": 0.27279775172012793, "grad_norm": 0.4023754894733429, "learning_rate": 4.266240426891987e-05, "loss": 1.1797, "num_input_tokens_seen": 220717632, "step": 5630 }, { "epoch": 0.27328229479600735, "grad_norm": 0.3996109366416931, "learning_rate": 4.2634615543370885e-05, "loss": 1.1916, "num_input_tokens_seen": 221115656, "step": 5640 }, { "epoch": 0.2737668378718868, "grad_norm": 0.43148279190063477, "learning_rate": 4.26067833843764e-05, "loss": 1.1981, "num_input_tokens_seen": 221450512, "step": 5650 }, { "epoch": 0.27425138094776624, "grad_norm": 0.4533494710922241, "learning_rate": 4.257890786048609e-05, "loss": 1.1904, "num_input_tokens_seen": 221855800, "step": 5660 }, { "epoch": 0.2747359240236457, "grad_norm": 0.38806283473968506, "learning_rate": 4.255098904035638e-05, "loss": 1.2631, "num_input_tokens_seen": 222267776, "step": 5670 }, { "epoch": 0.27522046709952513, "grad_norm": 0.3938799202442169, "learning_rate": 4.252302699275037e-05, "loss": 1.1635, "num_input_tokens_seen": 222682008, "step": 5680 }, { "epoch": 0.2757050101754046, "grad_norm": 0.4024967551231384, "learning_rate": 4.249502178653759e-05, "loss": 1.1915, "num_input_tokens_seen": 223053692, "step": 5690 }, { "epoch": 0.276189553251284, "grad_norm": 0.3679750859737396, "learning_rate": 4.246697349069391e-05, "loss": 1.1711, "num_input_tokens_seen": 223438432, "step": 5700 }, { "epoch": 0.2766740963271635, "grad_norm": 0.42832136154174805, "learning_rate": 4.243888217430129e-05, "loss": 1.1753, "num_input_tokens_seen": 223831660, "step": 5710 }, { "epoch": 0.2771586394030429, "grad_norm": 0.4067901074886322, "learning_rate": 4.241074790654769e-05, "loss": 1.2218, "num_input_tokens_seen": 224233376, "step": 5720 }, { "epoch": 0.2776431824789224, "grad_norm": 0.38691502809524536, "learning_rate": 4.2382570756726815e-05, "loss": 1.2041, "num_input_tokens_seen": 224640328, "step": 5730 }, { "epoch": 0.2781277255548018, "grad_norm": 0.40320757031440735, "learning_rate": 4.235435079423802e-05, "loss": 1.2297, "num_input_tokens_seen": 225039904, "step": 5740 }, { "epoch": 0.2786122686306813, "grad_norm": 0.43891477584838867, "learning_rate": 4.232608808858608e-05, "loss": 1.1493, "num_input_tokens_seen": 225441684, "step": 5750 }, { "epoch": 0.2790968117065607, "grad_norm": 0.4412339925765991, "learning_rate": 4.2297782709381065e-05, "loss": 1.2154, "num_input_tokens_seen": 225837976, "step": 5760 }, { "epoch": 0.2795813547824402, "grad_norm": 0.39056098461151123, "learning_rate": 4.226943472633813e-05, "loss": 1.1641, "num_input_tokens_seen": 226187316, "step": 5770 }, { "epoch": 0.2800658978583196, "grad_norm": 0.41356363892555237, "learning_rate": 4.2241044209277384e-05, "loss": 1.2058, "num_input_tokens_seen": 226579696, "step": 5780 }, { "epoch": 0.28055044093419906, "grad_norm": 0.40139320492744446, "learning_rate": 4.2212611228123686e-05, "loss": 1.2125, "num_input_tokens_seen": 226972904, "step": 5790 }, { "epoch": 0.2810349840100785, "grad_norm": 0.40096282958984375, "learning_rate": 4.218413585290647e-05, "loss": 1.2027, "num_input_tokens_seen": 227364980, "step": 5800 }, { "epoch": 0.28151952708595795, "grad_norm": 0.41258716583251953, "learning_rate": 4.21556181537596e-05, "loss": 1.2129, "num_input_tokens_seen": 227750424, "step": 5810 }, { "epoch": 0.2820040701618374, "grad_norm": 0.4123449921607971, "learning_rate": 4.2127058200921186e-05, "loss": 1.1705, "num_input_tokens_seen": 228150768, "step": 5820 }, { "epoch": 0.28248861323771685, "grad_norm": 0.3969346582889557, "learning_rate": 4.209845606473339e-05, "loss": 1.1913, "num_input_tokens_seen": 228525776, "step": 5830 }, { "epoch": 0.28297315631359626, "grad_norm": 0.378460168838501, "learning_rate": 4.206981181564229e-05, "loss": 1.1467, "num_input_tokens_seen": 228912604, "step": 5840 }, { "epoch": 0.28345769938947574, "grad_norm": 0.38569942116737366, "learning_rate": 4.2041125524197664e-05, "loss": 1.2075, "num_input_tokens_seen": 229298524, "step": 5850 }, { "epoch": 0.28394224246535515, "grad_norm": 0.3867596983909607, "learning_rate": 4.201239726105286e-05, "loss": 1.1713, "num_input_tokens_seen": 229686964, "step": 5860 }, { "epoch": 0.28442678554123463, "grad_norm": 0.373037725687027, "learning_rate": 4.198362709696458e-05, "loss": 1.2551, "num_input_tokens_seen": 230060948, "step": 5870 }, { "epoch": 0.28491132861711405, "grad_norm": 0.3632396161556244, "learning_rate": 4.195481510279276e-05, "loss": 1.2301, "num_input_tokens_seen": 230448628, "step": 5880 }, { "epoch": 0.2853958716929935, "grad_norm": 0.4397088289260864, "learning_rate": 4.192596134950033e-05, "loss": 1.2441, "num_input_tokens_seen": 230837484, "step": 5890 }, { "epoch": 0.28588041476887294, "grad_norm": 0.37855035066604614, "learning_rate": 4.189706590815307e-05, "loss": 1.2049, "num_input_tokens_seen": 231247448, "step": 5900 }, { "epoch": 0.2863649578447524, "grad_norm": 0.3964214324951172, "learning_rate": 4.186812884991946e-05, "loss": 1.1927, "num_input_tokens_seen": 231653776, "step": 5910 }, { "epoch": 0.28684950092063183, "grad_norm": 0.35895830392837524, "learning_rate": 4.183915024607048e-05, "loss": 1.2302, "num_input_tokens_seen": 232045136, "step": 5920 }, { "epoch": 0.2873340439965113, "grad_norm": 0.375405877828598, "learning_rate": 4.18101301679794e-05, "loss": 1.1242, "num_input_tokens_seen": 232402564, "step": 5930 }, { "epoch": 0.2878185870723907, "grad_norm": 0.37960904836654663, "learning_rate": 4.178106868712168e-05, "loss": 1.2209, "num_input_tokens_seen": 232790844, "step": 5940 }, { "epoch": 0.2883031301482702, "grad_norm": 0.4015379250049591, "learning_rate": 4.1751965875074746e-05, "loss": 1.1929, "num_input_tokens_seen": 233194892, "step": 5950 }, { "epoch": 0.2887876732241496, "grad_norm": 0.42236605286598206, "learning_rate": 4.172282180351779e-05, "loss": 1.2123, "num_input_tokens_seen": 233566416, "step": 5960 }, { "epoch": 0.2892722163000291, "grad_norm": 0.4050694406032562, "learning_rate": 4.169363654423166e-05, "loss": 1.1981, "num_input_tokens_seen": 233992752, "step": 5970 }, { "epoch": 0.2897567593759085, "grad_norm": 0.4495182931423187, "learning_rate": 4.166441016909864e-05, "loss": 1.2207, "num_input_tokens_seen": 234347140, "step": 5980 }, { "epoch": 0.290241302451788, "grad_norm": 0.3789389431476593, "learning_rate": 4.163514275010228e-05, "loss": 1.1707, "num_input_tokens_seen": 234751484, "step": 5990 }, { "epoch": 0.2907258455276674, "grad_norm": 0.3984537720680237, "learning_rate": 4.160583435932719e-05, "loss": 1.2024, "num_input_tokens_seen": 235148092, "step": 6000 }, { "epoch": 0.2907258455276674, "eval_loss": 1.200870156288147, "eval_runtime": 5.1147, "eval_samples_per_second": 29.327, "eval_steps_per_second": 3.715, "num_input_tokens_seen": 235148092, "step": 6000 }, { "epoch": 0.29121038860354687, "grad_norm": 0.409307062625885, "learning_rate": 4.157648506895895e-05, "loss": 1.2106, "num_input_tokens_seen": 235543492, "step": 6010 }, { "epoch": 0.2916949316794263, "grad_norm": 0.3580097258090973, "learning_rate": 4.154709495128383e-05, "loss": 1.1765, "num_input_tokens_seen": 235908876, "step": 6020 }, { "epoch": 0.29217947475530576, "grad_norm": 0.38826310634613037, "learning_rate": 4.151766407868866e-05, "loss": 1.1798, "num_input_tokens_seen": 236298644, "step": 6030 }, { "epoch": 0.2926640178311852, "grad_norm": 0.3992711901664734, "learning_rate": 4.1488192523660676e-05, "loss": 1.2077, "num_input_tokens_seen": 236681740, "step": 6040 }, { "epoch": 0.29314856090706465, "grad_norm": 0.4223003089427948, "learning_rate": 4.1458680358787275e-05, "loss": 1.1904, "num_input_tokens_seen": 237098732, "step": 6050 }, { "epoch": 0.29363310398294407, "grad_norm": 0.3620993494987488, "learning_rate": 4.14291276567559e-05, "loss": 1.2307, "num_input_tokens_seen": 237483400, "step": 6060 }, { "epoch": 0.29411764705882354, "grad_norm": 0.3823581039905548, "learning_rate": 4.139953449035383e-05, "loss": 1.1695, "num_input_tokens_seen": 237865200, "step": 6070 }, { "epoch": 0.29460219013470296, "grad_norm": 0.37260711193084717, "learning_rate": 4.136990093246802e-05, "loss": 1.2297, "num_input_tokens_seen": 238233548, "step": 6080 }, { "epoch": 0.29508673321058243, "grad_norm": 0.4020680785179138, "learning_rate": 4.134022705608487e-05, "loss": 1.2263, "num_input_tokens_seen": 238623180, "step": 6090 }, { "epoch": 0.29557127628646185, "grad_norm": 0.41028186678886414, "learning_rate": 4.1310512934290124e-05, "loss": 1.2341, "num_input_tokens_seen": 238985848, "step": 6100 }, { "epoch": 0.2960558193623413, "grad_norm": 0.4138743579387665, "learning_rate": 4.1280758640268625e-05, "loss": 1.2075, "num_input_tokens_seen": 239355272, "step": 6110 }, { "epoch": 0.29654036243822074, "grad_norm": 0.35869237780570984, "learning_rate": 4.125096424730417e-05, "loss": 1.2002, "num_input_tokens_seen": 239742404, "step": 6120 }, { "epoch": 0.2970249055141002, "grad_norm": 0.39054998755455017, "learning_rate": 4.12211298287793e-05, "loss": 1.2103, "num_input_tokens_seen": 240150336, "step": 6130 }, { "epoch": 0.29750944858997963, "grad_norm": 0.42105597257614136, "learning_rate": 4.119125545817517e-05, "loss": 1.2114, "num_input_tokens_seen": 240519488, "step": 6140 }, { "epoch": 0.2979939916658591, "grad_norm": 0.37524622678756714, "learning_rate": 4.11613412090713e-05, "loss": 1.2072, "num_input_tokens_seen": 240898472, "step": 6150 }, { "epoch": 0.2984785347417385, "grad_norm": 0.3830278217792511, "learning_rate": 4.113138715514546e-05, "loss": 1.2617, "num_input_tokens_seen": 241258828, "step": 6160 }, { "epoch": 0.298963077817618, "grad_norm": 0.3757288157939911, "learning_rate": 4.110139337017345e-05, "loss": 1.201, "num_input_tokens_seen": 241648504, "step": 6170 }, { "epoch": 0.2994476208934974, "grad_norm": 0.500215470790863, "learning_rate": 4.1071359928028896e-05, "loss": 1.218, "num_input_tokens_seen": 242026552, "step": 6180 }, { "epoch": 0.2999321639693769, "grad_norm": 0.3983130156993866, "learning_rate": 4.104128690268314e-05, "loss": 1.1784, "num_input_tokens_seen": 242415488, "step": 6190 }, { "epoch": 0.3004167070452563, "grad_norm": 0.4273653030395508, "learning_rate": 4.101117436820499e-05, "loss": 1.2151, "num_input_tokens_seen": 242796596, "step": 6200 }, { "epoch": 0.3009012501211358, "grad_norm": 0.4290236830711365, "learning_rate": 4.098102239876058e-05, "loss": 1.1989, "num_input_tokens_seen": 243188916, "step": 6210 }, { "epoch": 0.3013857931970152, "grad_norm": 0.4325619339942932, "learning_rate": 4.095083106861317e-05, "loss": 1.1811, "num_input_tokens_seen": 243612644, "step": 6220 }, { "epoch": 0.30187033627289467, "grad_norm": 0.38956135511398315, "learning_rate": 4.0920600452122934e-05, "loss": 1.2224, "num_input_tokens_seen": 244023064, "step": 6230 }, { "epoch": 0.3023548793487741, "grad_norm": 0.3935461938381195, "learning_rate": 4.0890330623746856e-05, "loss": 1.175, "num_input_tokens_seen": 244414972, "step": 6240 }, { "epoch": 0.30283942242465356, "grad_norm": 0.3803289830684662, "learning_rate": 4.086002165803845e-05, "loss": 1.1878, "num_input_tokens_seen": 244793504, "step": 6250 }, { "epoch": 0.303323965500533, "grad_norm": 0.3836755156517029, "learning_rate": 4.082967362964766e-05, "loss": 1.2004, "num_input_tokens_seen": 245195808, "step": 6260 }, { "epoch": 0.30380850857641245, "grad_norm": 0.36543598771095276, "learning_rate": 4.079928661332062e-05, "loss": 1.2333, "num_input_tokens_seen": 245600428, "step": 6270 }, { "epoch": 0.30429305165229187, "grad_norm": 0.46092289686203003, "learning_rate": 4.07688606838995e-05, "loss": 1.1549, "num_input_tokens_seen": 246013016, "step": 6280 }, { "epoch": 0.30477759472817134, "grad_norm": 0.364113986492157, "learning_rate": 4.0738395916322305e-05, "loss": 1.213, "num_input_tokens_seen": 246396572, "step": 6290 }, { "epoch": 0.30526213780405076, "grad_norm": 0.4042966067790985, "learning_rate": 4.0707892385622695e-05, "loss": 1.1444, "num_input_tokens_seen": 246823552, "step": 6300 }, { "epoch": 0.30574668087993023, "grad_norm": 0.3558143079280853, "learning_rate": 4.0677350166929805e-05, "loss": 1.2313, "num_input_tokens_seen": 247219524, "step": 6310 }, { "epoch": 0.30623122395580965, "grad_norm": 0.4053792655467987, "learning_rate": 4.064676933546807e-05, "loss": 1.2071, "num_input_tokens_seen": 247632420, "step": 6320 }, { "epoch": 0.3067157670316891, "grad_norm": 0.37768182158470154, "learning_rate": 4.061614996655701e-05, "loss": 1.2165, "num_input_tokens_seen": 248035440, "step": 6330 }, { "epoch": 0.30720031010756854, "grad_norm": 0.38902053236961365, "learning_rate": 4.0585492135611064e-05, "loss": 1.2016, "num_input_tokens_seen": 248418968, "step": 6340 }, { "epoch": 0.307684853183448, "grad_norm": 0.4327777028083801, "learning_rate": 4.055479591813941e-05, "loss": 1.248, "num_input_tokens_seen": 248804400, "step": 6350 }, { "epoch": 0.30816939625932743, "grad_norm": 0.38908207416534424, "learning_rate": 4.052406138974576e-05, "loss": 1.2374, "num_input_tokens_seen": 249179376, "step": 6360 }, { "epoch": 0.3086539393352069, "grad_norm": 0.45117059350013733, "learning_rate": 4.049328862612819e-05, "loss": 1.1853, "num_input_tokens_seen": 249591320, "step": 6370 }, { "epoch": 0.3091384824110863, "grad_norm": 0.3981532156467438, "learning_rate": 4.0462477703078944e-05, "loss": 1.1586, "num_input_tokens_seen": 249973744, "step": 6380 }, { "epoch": 0.3096230254869658, "grad_norm": 0.39917775988578796, "learning_rate": 4.043162869648427e-05, "loss": 1.2109, "num_input_tokens_seen": 250419824, "step": 6390 }, { "epoch": 0.3101075685628452, "grad_norm": 0.45799973607063293, "learning_rate": 4.0400741682324185e-05, "loss": 1.2142, "num_input_tokens_seen": 250837544, "step": 6400 }, { "epoch": 0.3105921116387247, "grad_norm": 0.4319687783718109, "learning_rate": 4.036981673667234e-05, "loss": 1.2561, "num_input_tokens_seen": 251224108, "step": 6410 }, { "epoch": 0.3110766547146041, "grad_norm": 0.3861522376537323, "learning_rate": 4.0338853935695795e-05, "loss": 1.1854, "num_input_tokens_seen": 251600636, "step": 6420 }, { "epoch": 0.3115611977904836, "grad_norm": 0.4382316768169403, "learning_rate": 4.0307853355654856e-05, "loss": 1.1935, "num_input_tokens_seen": 251988072, "step": 6430 }, { "epoch": 0.312045740866363, "grad_norm": 0.41492384672164917, "learning_rate": 4.027681507290288e-05, "loss": 1.2185, "num_input_tokens_seen": 252366656, "step": 6440 }, { "epoch": 0.31253028394224247, "grad_norm": 0.4099133014678955, "learning_rate": 4.0245739163886076e-05, "loss": 1.2092, "num_input_tokens_seen": 252784056, "step": 6450 }, { "epoch": 0.3130148270181219, "grad_norm": 0.41487085819244385, "learning_rate": 4.021462570514333e-05, "loss": 1.1743, "num_input_tokens_seen": 253186828, "step": 6460 }, { "epoch": 0.31349937009400136, "grad_norm": 0.370210736989975, "learning_rate": 4.0183474773305995e-05, "loss": 1.2137, "num_input_tokens_seen": 253574680, "step": 6470 }, { "epoch": 0.3139839131698808, "grad_norm": 0.3662930727005005, "learning_rate": 4.015228644509776e-05, "loss": 1.1403, "num_input_tokens_seen": 253967160, "step": 6480 }, { "epoch": 0.31446845624576025, "grad_norm": 0.41104060411453247, "learning_rate": 4.0121060797334375e-05, "loss": 1.2502, "num_input_tokens_seen": 254357996, "step": 6490 }, { "epoch": 0.31495299932163967, "grad_norm": 0.3852168023586273, "learning_rate": 4.008979790692353e-05, "loss": 1.1896, "num_input_tokens_seen": 254758512, "step": 6500 }, { "epoch": 0.31543754239751914, "grad_norm": 0.40157365798950195, "learning_rate": 4.005849785086464e-05, "loss": 1.2011, "num_input_tokens_seen": 255148092, "step": 6510 }, { "epoch": 0.31592208547339856, "grad_norm": 0.41574087738990784, "learning_rate": 4.002716070624866e-05, "loss": 1.2243, "num_input_tokens_seen": 255552832, "step": 6520 }, { "epoch": 0.31640662854927804, "grad_norm": 0.3938582241535187, "learning_rate": 3.999578655025787e-05, "loss": 1.1956, "num_input_tokens_seen": 255953292, "step": 6530 }, { "epoch": 0.31689117162515745, "grad_norm": 0.39169254899024963, "learning_rate": 3.996437546016575e-05, "loss": 1.2158, "num_input_tokens_seen": 256352132, "step": 6540 }, { "epoch": 0.3173757147010369, "grad_norm": 0.38570165634155273, "learning_rate": 3.993292751333671e-05, "loss": 1.2131, "num_input_tokens_seen": 256744884, "step": 6550 }, { "epoch": 0.31786025777691634, "grad_norm": 0.38711732625961304, "learning_rate": 3.9901442787225955e-05, "loss": 1.1811, "num_input_tokens_seen": 257140072, "step": 6560 }, { "epoch": 0.3183448008527958, "grad_norm": 0.4001631438732147, "learning_rate": 3.986992135937927e-05, "loss": 1.2469, "num_input_tokens_seen": 257526996, "step": 6570 }, { "epoch": 0.31882934392867524, "grad_norm": 0.4085213243961334, "learning_rate": 3.9838363307432844e-05, "loss": 1.173, "num_input_tokens_seen": 257934640, "step": 6580 }, { "epoch": 0.3193138870045547, "grad_norm": 0.3954554498195648, "learning_rate": 3.980676870911305e-05, "loss": 1.1913, "num_input_tokens_seen": 258291948, "step": 6590 }, { "epoch": 0.3197984300804341, "grad_norm": 0.3921205401420593, "learning_rate": 3.977513764223629e-05, "loss": 1.1919, "num_input_tokens_seen": 258703912, "step": 6600 }, { "epoch": 0.3202829731563136, "grad_norm": 0.39368802309036255, "learning_rate": 3.974347018470879e-05, "loss": 1.1346, "num_input_tokens_seen": 259074912, "step": 6610 }, { "epoch": 0.320767516232193, "grad_norm": 0.4187362790107727, "learning_rate": 3.9711766414526386e-05, "loss": 1.2207, "num_input_tokens_seen": 259450412, "step": 6620 }, { "epoch": 0.3212520593080725, "grad_norm": 0.4282675087451935, "learning_rate": 3.9680026409774385e-05, "loss": 1.1791, "num_input_tokens_seen": 259839588, "step": 6630 }, { "epoch": 0.3217366023839519, "grad_norm": 0.41149836778640747, "learning_rate": 3.9648250248627285e-05, "loss": 1.1941, "num_input_tokens_seen": 260210136, "step": 6640 }, { "epoch": 0.3222211454598314, "grad_norm": 0.40860268473625183, "learning_rate": 3.961643800934869e-05, "loss": 1.1643, "num_input_tokens_seen": 260596296, "step": 6650 }, { "epoch": 0.3227056885357108, "grad_norm": 0.40710315108299255, "learning_rate": 3.958458977029103e-05, "loss": 1.1714, "num_input_tokens_seen": 260991860, "step": 6660 }, { "epoch": 0.3231902316115903, "grad_norm": 0.36702919006347656, "learning_rate": 3.955270560989542e-05, "loss": 1.2253, "num_input_tokens_seen": 261370536, "step": 6670 }, { "epoch": 0.3236747746874697, "grad_norm": 0.36147916316986084, "learning_rate": 3.952078560669142e-05, "loss": 1.1437, "num_input_tokens_seen": 261745012, "step": 6680 }, { "epoch": 0.32415931776334916, "grad_norm": 0.38143739104270935, "learning_rate": 3.9488829839296914e-05, "loss": 1.2353, "num_input_tokens_seen": 262122240, "step": 6690 }, { "epoch": 0.3246438608392286, "grad_norm": 0.4425183832645416, "learning_rate": 3.9456838386417835e-05, "loss": 1.1905, "num_input_tokens_seen": 262497788, "step": 6700 }, { "epoch": 0.32512840391510806, "grad_norm": 0.4275406301021576, "learning_rate": 3.9424811326848024e-05, "loss": 1.2369, "num_input_tokens_seen": 262888292, "step": 6710 }, { "epoch": 0.3256129469909875, "grad_norm": 0.4195478558540344, "learning_rate": 3.939274873946901e-05, "loss": 1.2022, "num_input_tokens_seen": 263277288, "step": 6720 }, { "epoch": 0.32609749006686695, "grad_norm": 0.38973620533943176, "learning_rate": 3.936065070324984e-05, "loss": 1.1681, "num_input_tokens_seen": 263661148, "step": 6730 }, { "epoch": 0.32658203314274636, "grad_norm": 0.40185558795928955, "learning_rate": 3.932851729724685e-05, "loss": 1.2155, "num_input_tokens_seen": 264039708, "step": 6740 }, { "epoch": 0.32706657621862584, "grad_norm": 0.3969719409942627, "learning_rate": 3.929634860060351e-05, "loss": 1.2022, "num_input_tokens_seen": 264431972, "step": 6750 }, { "epoch": 0.32755111929450526, "grad_norm": 0.4183984696865082, "learning_rate": 3.92641446925502e-05, "loss": 1.2105, "num_input_tokens_seen": 264799512, "step": 6760 }, { "epoch": 0.32803566237038473, "grad_norm": 0.42662981152534485, "learning_rate": 3.9231905652404034e-05, "loss": 1.218, "num_input_tokens_seen": 265163968, "step": 6770 }, { "epoch": 0.32852020544626415, "grad_norm": 0.4058518707752228, "learning_rate": 3.919963155956864e-05, "loss": 1.1482, "num_input_tokens_seen": 265559800, "step": 6780 }, { "epoch": 0.3290047485221436, "grad_norm": 0.3763887584209442, "learning_rate": 3.916732249353399e-05, "loss": 1.1912, "num_input_tokens_seen": 265950740, "step": 6790 }, { "epoch": 0.32948929159802304, "grad_norm": 0.36359450221061707, "learning_rate": 3.91349785338762e-05, "loss": 1.178, "num_input_tokens_seen": 266339700, "step": 6800 }, { "epoch": 0.3299738346739025, "grad_norm": 0.3989261984825134, "learning_rate": 3.91025997602573e-05, "loss": 1.23, "num_input_tokens_seen": 266740236, "step": 6810 }, { "epoch": 0.33045837774978193, "grad_norm": 0.37006011605262756, "learning_rate": 3.90701862524251e-05, "loss": 1.1739, "num_input_tokens_seen": 267116000, "step": 6820 }, { "epoch": 0.3309429208256614, "grad_norm": 0.3907497525215149, "learning_rate": 3.903773809021294e-05, "loss": 1.1801, "num_input_tokens_seen": 267526384, "step": 6830 }, { "epoch": 0.3314274639015408, "grad_norm": 0.3850066363811493, "learning_rate": 3.900525535353952e-05, "loss": 1.2116, "num_input_tokens_seen": 267893528, "step": 6840 }, { "epoch": 0.3319120069774203, "grad_norm": 0.4400016665458679, "learning_rate": 3.897273812240868e-05, "loss": 1.1532, "num_input_tokens_seen": 268282380, "step": 6850 }, { "epoch": 0.3323965500532997, "grad_norm": 0.3924698233604431, "learning_rate": 3.894018647690924e-05, "loss": 1.2103, "num_input_tokens_seen": 268696688, "step": 6860 }, { "epoch": 0.3328810931291792, "grad_norm": 0.3710118234157562, "learning_rate": 3.890760049721477e-05, "loss": 1.1848, "num_input_tokens_seen": 269125764, "step": 6870 }, { "epoch": 0.33336563620505866, "grad_norm": 0.37334132194519043, "learning_rate": 3.887498026358341e-05, "loss": 1.2415, "num_input_tokens_seen": 269507904, "step": 6880 }, { "epoch": 0.3338501792809381, "grad_norm": 0.46161389350891113, "learning_rate": 3.8842325856357656e-05, "loss": 1.2383, "num_input_tokens_seen": 269890536, "step": 6890 }, { "epoch": 0.33433472235681755, "grad_norm": 0.43443042039871216, "learning_rate": 3.8809637355964176e-05, "loss": 1.1867, "num_input_tokens_seen": 270291048, "step": 6900 }, { "epoch": 0.33481926543269697, "grad_norm": 0.39960533380508423, "learning_rate": 3.8776914842913626e-05, "loss": 1.2211, "num_input_tokens_seen": 270695776, "step": 6910 }, { "epoch": 0.33530380850857644, "grad_norm": 0.4040696322917938, "learning_rate": 3.8744158397800404e-05, "loss": 1.2013, "num_input_tokens_seen": 271104472, "step": 6920 }, { "epoch": 0.33578835158445586, "grad_norm": 0.4039623737335205, "learning_rate": 3.87113681013025e-05, "loss": 1.1545, "num_input_tokens_seen": 271507036, "step": 6930 }, { "epoch": 0.33627289466033533, "grad_norm": 0.40450507402420044, "learning_rate": 3.867854403418128e-05, "loss": 1.1504, "num_input_tokens_seen": 271911112, "step": 6940 }, { "epoch": 0.33675743773621475, "grad_norm": 0.4363791346549988, "learning_rate": 3.864568627728128e-05, "loss": 1.1979, "num_input_tokens_seen": 272282304, "step": 6950 }, { "epoch": 0.3372419808120942, "grad_norm": 0.38292601704597473, "learning_rate": 3.861279491153e-05, "loss": 1.23, "num_input_tokens_seen": 272673128, "step": 6960 }, { "epoch": 0.33772652388797364, "grad_norm": 0.39683130383491516, "learning_rate": 3.857987001793775e-05, "loss": 1.1738, "num_input_tokens_seen": 273079120, "step": 6970 }, { "epoch": 0.3382110669638531, "grad_norm": 0.4041154682636261, "learning_rate": 3.8546911677597395e-05, "loss": 1.1948, "num_input_tokens_seen": 273482888, "step": 6980 }, { "epoch": 0.33869561003973253, "grad_norm": 0.3793328106403351, "learning_rate": 3.8513919971684175e-05, "loss": 1.1841, "num_input_tokens_seen": 273889924, "step": 6990 }, { "epoch": 0.339180153115612, "grad_norm": 0.3985520005226135, "learning_rate": 3.848089498145552e-05, "loss": 1.1821, "num_input_tokens_seen": 274289204, "step": 7000 }, { "epoch": 0.3396646961914914, "grad_norm": 0.42367762327194214, "learning_rate": 3.844783678825083e-05, "loss": 1.2186, "num_input_tokens_seen": 274669384, "step": 7010 }, { "epoch": 0.3401492392673709, "grad_norm": 0.38627681136131287, "learning_rate": 3.8414745473491295e-05, "loss": 1.1691, "num_input_tokens_seen": 275045420, "step": 7020 }, { "epoch": 0.3406337823432503, "grad_norm": 0.40813496708869934, "learning_rate": 3.838162111867967e-05, "loss": 1.1912, "num_input_tokens_seen": 275452320, "step": 7030 }, { "epoch": 0.3411183254191298, "grad_norm": 0.3678280711174011, "learning_rate": 3.834846380540009e-05, "loss": 1.2274, "num_input_tokens_seen": 275844788, "step": 7040 }, { "epoch": 0.3416028684950092, "grad_norm": 0.3878428637981415, "learning_rate": 3.831527361531786e-05, "loss": 1.2127, "num_input_tokens_seen": 276211512, "step": 7050 }, { "epoch": 0.3420874115708887, "grad_norm": 0.4239497482776642, "learning_rate": 3.828205063017927e-05, "loss": 1.2089, "num_input_tokens_seen": 276596564, "step": 7060 }, { "epoch": 0.3425719546467681, "grad_norm": 0.3910852074623108, "learning_rate": 3.824879493181138e-05, "loss": 1.153, "num_input_tokens_seen": 276989732, "step": 7070 }, { "epoch": 0.34305649772264757, "grad_norm": 0.5242650508880615, "learning_rate": 3.821550660212182e-05, "loss": 1.1671, "num_input_tokens_seen": 277396804, "step": 7080 }, { "epoch": 0.343541040798527, "grad_norm": 0.3888712525367737, "learning_rate": 3.8182185723098584e-05, "loss": 1.1751, "num_input_tokens_seen": 277776744, "step": 7090 }, { "epoch": 0.34402558387440646, "grad_norm": 0.40083321928977966, "learning_rate": 3.814883237680984e-05, "loss": 1.262, "num_input_tokens_seen": 278174104, "step": 7100 }, { "epoch": 0.3445101269502859, "grad_norm": 0.4279809594154358, "learning_rate": 3.811544664540373e-05, "loss": 1.1803, "num_input_tokens_seen": 278545544, "step": 7110 }, { "epoch": 0.34499467002616535, "grad_norm": 0.39681699872016907, "learning_rate": 3.8082028611108144e-05, "loss": 1.1754, "num_input_tokens_seen": 278922000, "step": 7120 }, { "epoch": 0.34547921310204477, "grad_norm": 0.4334243834018707, "learning_rate": 3.804857835623054e-05, "loss": 1.1708, "num_input_tokens_seen": 279321312, "step": 7130 }, { "epoch": 0.34596375617792424, "grad_norm": 0.3777294158935547, "learning_rate": 3.801509596315773e-05, "loss": 1.1567, "num_input_tokens_seen": 279699076, "step": 7140 }, { "epoch": 0.34644829925380366, "grad_norm": 0.4178134500980377, "learning_rate": 3.798158151435569e-05, "loss": 1.1974, "num_input_tokens_seen": 280103592, "step": 7150 }, { "epoch": 0.34693284232968313, "grad_norm": 0.3830433189868927, "learning_rate": 3.794803509236935e-05, "loss": 1.1979, "num_input_tokens_seen": 280490292, "step": 7160 }, { "epoch": 0.34741738540556255, "grad_norm": 0.4182297885417938, "learning_rate": 3.791445677982237e-05, "loss": 1.1493, "num_input_tokens_seen": 280873500, "step": 7170 }, { "epoch": 0.347901928481442, "grad_norm": 0.3943808674812317, "learning_rate": 3.788084665941697e-05, "loss": 1.2254, "num_input_tokens_seen": 281272964, "step": 7180 }, { "epoch": 0.34838647155732144, "grad_norm": 0.3648427128791809, "learning_rate": 3.7847204813933715e-05, "loss": 1.1815, "num_input_tokens_seen": 281658580, "step": 7190 }, { "epoch": 0.3488710146332009, "grad_norm": 0.4216112494468689, "learning_rate": 3.78135313262313e-05, "loss": 1.2728, "num_input_tokens_seen": 282047632, "step": 7200 }, { "epoch": 0.34935555770908033, "grad_norm": 0.4069361686706543, "learning_rate": 3.7779826279246344e-05, "loss": 1.2193, "num_input_tokens_seen": 282442860, "step": 7210 }, { "epoch": 0.3498401007849598, "grad_norm": 0.3994755148887634, "learning_rate": 3.774608975599323e-05, "loss": 1.1925, "num_input_tokens_seen": 282829096, "step": 7220 }, { "epoch": 0.3503246438608392, "grad_norm": 0.40187373757362366, "learning_rate": 3.7712321839563816e-05, "loss": 1.1644, "num_input_tokens_seen": 283214644, "step": 7230 }, { "epoch": 0.3508091869367187, "grad_norm": 0.40469735860824585, "learning_rate": 3.7678522613127315e-05, "loss": 1.1836, "num_input_tokens_seen": 283597500, "step": 7240 }, { "epoch": 0.3512937300125981, "grad_norm": 0.3870331943035126, "learning_rate": 3.764469215993006e-05, "loss": 1.1341, "num_input_tokens_seen": 283969576, "step": 7250 }, { "epoch": 0.3517782730884776, "grad_norm": 0.38946250081062317, "learning_rate": 3.761083056329527e-05, "loss": 1.2089, "num_input_tokens_seen": 284368172, "step": 7260 }, { "epoch": 0.352262816164357, "grad_norm": 0.41155683994293213, "learning_rate": 3.757693790662289e-05, "loss": 1.2169, "num_input_tokens_seen": 284753172, "step": 7270 }, { "epoch": 0.3527473592402365, "grad_norm": 0.3682643175125122, "learning_rate": 3.754301427338935e-05, "loss": 1.2065, "num_input_tokens_seen": 285154396, "step": 7280 }, { "epoch": 0.3532319023161159, "grad_norm": 0.384960800409317, "learning_rate": 3.750905974714739e-05, "loss": 1.194, "num_input_tokens_seen": 285556552, "step": 7290 }, { "epoch": 0.35371644539199537, "grad_norm": 0.39383670687675476, "learning_rate": 3.747507441152581e-05, "loss": 1.1667, "num_input_tokens_seen": 285937684, "step": 7300 }, { "epoch": 0.3542009884678748, "grad_norm": 0.4078581631183624, "learning_rate": 3.744105835022932e-05, "loss": 1.168, "num_input_tokens_seen": 286339704, "step": 7310 }, { "epoch": 0.35468553154375426, "grad_norm": 0.4079924523830414, "learning_rate": 3.740701164703831e-05, "loss": 1.2325, "num_input_tokens_seen": 286724008, "step": 7320 }, { "epoch": 0.3551700746196337, "grad_norm": 0.40963229537010193, "learning_rate": 3.737293438580861e-05, "loss": 1.217, "num_input_tokens_seen": 287109268, "step": 7330 }, { "epoch": 0.35565461769551315, "grad_norm": 0.40729522705078125, "learning_rate": 3.7338826650471335e-05, "loss": 1.1671, "num_input_tokens_seen": 287487492, "step": 7340 }, { "epoch": 0.35613916077139257, "grad_norm": 0.4351021647453308, "learning_rate": 3.730468852503265e-05, "loss": 1.1748, "num_input_tokens_seen": 287878812, "step": 7350 }, { "epoch": 0.35662370384727204, "grad_norm": 0.3914582431316376, "learning_rate": 3.727052009357358e-05, "loss": 1.1588, "num_input_tokens_seen": 288271328, "step": 7360 }, { "epoch": 0.35710824692315146, "grad_norm": 0.4153572618961334, "learning_rate": 3.723632144024979e-05, "loss": 1.2123, "num_input_tokens_seen": 288659084, "step": 7370 }, { "epoch": 0.35759278999903094, "grad_norm": 0.38702714443206787, "learning_rate": 3.7202092649291356e-05, "loss": 1.1788, "num_input_tokens_seen": 289060932, "step": 7380 }, { "epoch": 0.35807733307491035, "grad_norm": 0.40125608444213867, "learning_rate": 3.7167833805002614e-05, "loss": 1.1521, "num_input_tokens_seen": 289437452, "step": 7390 }, { "epoch": 0.3585618761507898, "grad_norm": 0.43882331252098083, "learning_rate": 3.7133544991761896e-05, "loss": 1.1843, "num_input_tokens_seen": 289845972, "step": 7400 }, { "epoch": 0.35904641922666924, "grad_norm": 0.37727952003479004, "learning_rate": 3.7099226294021375e-05, "loss": 1.2051, "num_input_tokens_seen": 290209868, "step": 7410 }, { "epoch": 0.3595309623025487, "grad_norm": 0.3910406231880188, "learning_rate": 3.706487779630679e-05, "loss": 1.1358, "num_input_tokens_seen": 290641272, "step": 7420 }, { "epoch": 0.36001550537842814, "grad_norm": 0.3660062551498413, "learning_rate": 3.703049958321733e-05, "loss": 1.1969, "num_input_tokens_seen": 291057612, "step": 7430 }, { "epoch": 0.3605000484543076, "grad_norm": 0.3746223449707031, "learning_rate": 3.69960917394253e-05, "loss": 1.1809, "num_input_tokens_seen": 291459584, "step": 7440 }, { "epoch": 0.360984591530187, "grad_norm": 0.3975430428981781, "learning_rate": 3.696165434967605e-05, "loss": 1.2374, "num_input_tokens_seen": 291868768, "step": 7450 }, { "epoch": 0.3614691346060665, "grad_norm": 0.40742921829223633, "learning_rate": 3.692718749878767e-05, "loss": 1.2081, "num_input_tokens_seen": 292249240, "step": 7460 }, { "epoch": 0.3619536776819459, "grad_norm": 0.44806450605392456, "learning_rate": 3.6892691271650814e-05, "loss": 1.2251, "num_input_tokens_seen": 292658816, "step": 7470 }, { "epoch": 0.3624382207578254, "grad_norm": 0.37986406683921814, "learning_rate": 3.68581657532285e-05, "loss": 1.1863, "num_input_tokens_seen": 293031712, "step": 7480 }, { "epoch": 0.3629227638337048, "grad_norm": 0.4018678069114685, "learning_rate": 3.682361102855586e-05, "loss": 1.1884, "num_input_tokens_seen": 293444948, "step": 7490 }, { "epoch": 0.3634073069095843, "grad_norm": 0.3642045855522156, "learning_rate": 3.6789027182739996e-05, "loss": 1.1818, "num_input_tokens_seen": 293852108, "step": 7500 }, { "epoch": 0.3638918499854637, "grad_norm": 0.3775769770145416, "learning_rate": 3.675441430095972e-05, "loss": 1.2457, "num_input_tokens_seen": 294219892, "step": 7510 }, { "epoch": 0.3643763930613432, "grad_norm": 0.37454187870025635, "learning_rate": 3.6719772468465345e-05, "loss": 1.1447, "num_input_tokens_seen": 294604332, "step": 7520 }, { "epoch": 0.3648609361372226, "grad_norm": 0.38920724391937256, "learning_rate": 3.6685101770578515e-05, "loss": 1.2058, "num_input_tokens_seen": 295015584, "step": 7530 }, { "epoch": 0.36534547921310206, "grad_norm": 0.4151008725166321, "learning_rate": 3.6650402292691946e-05, "loss": 1.1891, "num_input_tokens_seen": 295420200, "step": 7540 }, { "epoch": 0.3658300222889815, "grad_norm": 0.4080232083797455, "learning_rate": 3.661567412026925e-05, "loss": 1.2027, "num_input_tokens_seen": 295806112, "step": 7550 }, { "epoch": 0.36631456536486096, "grad_norm": 0.43082767724990845, "learning_rate": 3.658091733884471e-05, "loss": 1.1592, "num_input_tokens_seen": 296215744, "step": 7560 }, { "epoch": 0.3667991084407404, "grad_norm": 0.3868250250816345, "learning_rate": 3.654613203402307e-05, "loss": 1.1657, "num_input_tokens_seen": 296611788, "step": 7570 }, { "epoch": 0.36728365151661985, "grad_norm": 0.3819040358066559, "learning_rate": 3.6511318291479324e-05, "loss": 1.1796, "num_input_tokens_seen": 297006960, "step": 7580 }, { "epoch": 0.36776819459249926, "grad_norm": 0.4080538749694824, "learning_rate": 3.6476476196958514e-05, "loss": 1.2078, "num_input_tokens_seen": 297411416, "step": 7590 }, { "epoch": 0.36825273766837874, "grad_norm": 0.40407416224479675, "learning_rate": 3.644160583627551e-05, "loss": 1.1925, "num_input_tokens_seen": 297793232, "step": 7600 }, { "epoch": 0.36873728074425816, "grad_norm": 0.37448978424072266, "learning_rate": 3.6406707295314795e-05, "loss": 1.1922, "num_input_tokens_seen": 298204896, "step": 7610 }, { "epoch": 0.36922182382013763, "grad_norm": 0.37111660838127136, "learning_rate": 3.6371780660030266e-05, "loss": 1.148, "num_input_tokens_seen": 298561544, "step": 7620 }, { "epoch": 0.36970636689601705, "grad_norm": 0.3740837574005127, "learning_rate": 3.633682601644501e-05, "loss": 1.2045, "num_input_tokens_seen": 298944532, "step": 7630 }, { "epoch": 0.3701909099718965, "grad_norm": 0.39125657081604004, "learning_rate": 3.6301843450651096e-05, "loss": 1.1708, "num_input_tokens_seen": 299346324, "step": 7640 }, { "epoch": 0.37067545304777594, "grad_norm": 0.3972494304180145, "learning_rate": 3.6266833048809386e-05, "loss": 1.1542, "num_input_tokens_seen": 299741248, "step": 7650 }, { "epoch": 0.3711599961236554, "grad_norm": 0.3871600329875946, "learning_rate": 3.623179489714926e-05, "loss": 1.1652, "num_input_tokens_seen": 300143696, "step": 7660 }, { "epoch": 0.37164453919953483, "grad_norm": 0.3876330852508545, "learning_rate": 3.619672908196849e-05, "loss": 1.1383, "num_input_tokens_seen": 300512992, "step": 7670 }, { "epoch": 0.3721290822754143, "grad_norm": 0.4013025760650635, "learning_rate": 3.616163568963295e-05, "loss": 1.1484, "num_input_tokens_seen": 300904468, "step": 7680 }, { "epoch": 0.3726136253512937, "grad_norm": 0.36160480976104736, "learning_rate": 3.6126514806576437e-05, "loss": 1.205, "num_input_tokens_seen": 301325360, "step": 7690 }, { "epoch": 0.3730981684271732, "grad_norm": 0.35836082696914673, "learning_rate": 3.6091366519300476e-05, "loss": 1.1436, "num_input_tokens_seen": 301732708, "step": 7700 }, { "epoch": 0.3735827115030526, "grad_norm": 0.43142881989479065, "learning_rate": 3.605619091437408e-05, "loss": 1.193, "num_input_tokens_seen": 302120864, "step": 7710 }, { "epoch": 0.3740672545789321, "grad_norm": 0.41082125902175903, "learning_rate": 3.6020988078433537e-05, "loss": 1.1767, "num_input_tokens_seen": 302492456, "step": 7720 }, { "epoch": 0.3745517976548115, "grad_norm": 0.393816739320755, "learning_rate": 3.598575809818221e-05, "loss": 1.2283, "num_input_tokens_seen": 302846348, "step": 7730 }, { "epoch": 0.375036340730691, "grad_norm": 0.4002731144428253, "learning_rate": 3.5950501060390315e-05, "loss": 1.1765, "num_input_tokens_seen": 303200188, "step": 7740 }, { "epoch": 0.3755208838065704, "grad_norm": 0.41621705889701843, "learning_rate": 3.5915217051894725e-05, "loss": 1.1259, "num_input_tokens_seen": 303616612, "step": 7750 }, { "epoch": 0.37600542688244987, "grad_norm": 0.352398544549942, "learning_rate": 3.587990615959871e-05, "loss": 1.1968, "num_input_tokens_seen": 304005224, "step": 7760 }, { "epoch": 0.3764899699583293, "grad_norm": 0.36988088488578796, "learning_rate": 3.584456847047177e-05, "loss": 1.1962, "num_input_tokens_seen": 304405536, "step": 7770 }, { "epoch": 0.37697451303420876, "grad_norm": 0.377210795879364, "learning_rate": 3.5809204071549415e-05, "loss": 1.2039, "num_input_tokens_seen": 304791388, "step": 7780 }, { "epoch": 0.3774590561100882, "grad_norm": 0.35832905769348145, "learning_rate": 3.577381304993294e-05, "loss": 1.1936, "num_input_tokens_seen": 305184000, "step": 7790 }, { "epoch": 0.37794359918596765, "grad_norm": 0.3834468424320221, "learning_rate": 3.5738395492789176e-05, "loss": 1.2239, "num_input_tokens_seen": 305552272, "step": 7800 }, { "epoch": 0.37842814226184707, "grad_norm": 0.3998955488204956, "learning_rate": 3.570295148735036e-05, "loss": 1.1768, "num_input_tokens_seen": 305942616, "step": 7810 }, { "epoch": 0.37891268533772654, "grad_norm": 0.382485955953598, "learning_rate": 3.5667481120913836e-05, "loss": 1.1988, "num_input_tokens_seen": 306364336, "step": 7820 }, { "epoch": 0.37939722841360596, "grad_norm": 0.43074601888656616, "learning_rate": 3.5631984480841885e-05, "loss": 1.2357, "num_input_tokens_seen": 306775904, "step": 7830 }, { "epoch": 0.37988177148948543, "grad_norm": 0.3977073132991791, "learning_rate": 3.55964616545615e-05, "loss": 1.1833, "num_input_tokens_seen": 307121492, "step": 7840 }, { "epoch": 0.38036631456536485, "grad_norm": 0.38828161358833313, "learning_rate": 3.5560912729564155e-05, "loss": 1.1875, "num_input_tokens_seen": 307493488, "step": 7850 }, { "epoch": 0.3808508576412443, "grad_norm": 0.404365211725235, "learning_rate": 3.5525337793405636e-05, "loss": 1.1878, "num_input_tokens_seen": 307890584, "step": 7860 }, { "epoch": 0.38133540071712374, "grad_norm": 0.42278143763542175, "learning_rate": 3.548973693370576e-05, "loss": 1.1444, "num_input_tokens_seen": 308295900, "step": 7870 }, { "epoch": 0.3818199437930032, "grad_norm": 0.38250425457954407, "learning_rate": 3.545411023814823e-05, "loss": 1.1498, "num_input_tokens_seen": 308723400, "step": 7880 }, { "epoch": 0.38230448686888263, "grad_norm": 0.4000505805015564, "learning_rate": 3.541845779448034e-05, "loss": 1.1917, "num_input_tokens_seen": 309139432, "step": 7890 }, { "epoch": 0.3827890299447621, "grad_norm": 0.4143424928188324, "learning_rate": 3.5382779690512824e-05, "loss": 1.2196, "num_input_tokens_seen": 309543828, "step": 7900 }, { "epoch": 0.3832735730206415, "grad_norm": 0.4070720672607422, "learning_rate": 3.5347076014119606e-05, "loss": 1.1941, "num_input_tokens_seen": 309945560, "step": 7910 }, { "epoch": 0.383758116096521, "grad_norm": 0.4233192801475525, "learning_rate": 3.5311346853237614e-05, "loss": 1.2011, "num_input_tokens_seen": 310335412, "step": 7920 }, { "epoch": 0.3842426591724004, "grad_norm": 0.4479762017726898, "learning_rate": 3.527559229586653e-05, "loss": 1.2178, "num_input_tokens_seen": 310706360, "step": 7930 }, { "epoch": 0.3847272022482799, "grad_norm": 0.41147053241729736, "learning_rate": 3.523981243006857e-05, "loss": 1.2376, "num_input_tokens_seen": 311088520, "step": 7940 }, { "epoch": 0.3852117453241593, "grad_norm": 0.38983359932899475, "learning_rate": 3.5204007343968326e-05, "loss": 1.2019, "num_input_tokens_seen": 311467552, "step": 7950 }, { "epoch": 0.3856962884000388, "grad_norm": 0.3905565142631531, "learning_rate": 3.516817712575246e-05, "loss": 1.1972, "num_input_tokens_seen": 311857180, "step": 7960 }, { "epoch": 0.3861808314759182, "grad_norm": 0.390117883682251, "learning_rate": 3.513232186366956e-05, "loss": 1.1901, "num_input_tokens_seen": 312250400, "step": 7970 }, { "epoch": 0.38666537455179767, "grad_norm": 0.3978836238384247, "learning_rate": 3.50964416460299e-05, "loss": 1.1729, "num_input_tokens_seen": 312652712, "step": 7980 }, { "epoch": 0.3871499176276771, "grad_norm": 0.4010826349258423, "learning_rate": 3.5060536561205195e-05, "loss": 1.1981, "num_input_tokens_seen": 313069828, "step": 7990 }, { "epoch": 0.38763446070355656, "grad_norm": 0.42469522356987, "learning_rate": 3.5024606697628424e-05, "loss": 1.2074, "num_input_tokens_seen": 313452856, "step": 8000 }, { "epoch": 0.38763446070355656, "eval_loss": 1.1777091026306152, "eval_runtime": 5.3785, "eval_samples_per_second": 27.889, "eval_steps_per_second": 3.533, "num_input_tokens_seen": 313452856, "step": 8000 }, { "epoch": 0.388119003779436, "grad_norm": 0.3903387486934662, "learning_rate": 3.49886521437936e-05, "loss": 1.153, "num_input_tokens_seen": 313830208, "step": 8010 }, { "epoch": 0.38860354685531545, "grad_norm": 0.398037314414978, "learning_rate": 3.495267298825555e-05, "loss": 1.1707, "num_input_tokens_seen": 314242840, "step": 8020 }, { "epoch": 0.38908808993119487, "grad_norm": 0.3712460398674011, "learning_rate": 3.4916669319629664e-05, "loss": 1.161, "num_input_tokens_seen": 314602680, "step": 8030 }, { "epoch": 0.38957263300707434, "grad_norm": 0.4191719591617584, "learning_rate": 3.4880641226591733e-05, "loss": 1.1705, "num_input_tokens_seen": 315002656, "step": 8040 }, { "epoch": 0.39005717608295376, "grad_norm": 0.4063269793987274, "learning_rate": 3.48445887978777e-05, "loss": 1.1794, "num_input_tokens_seen": 315420940, "step": 8050 }, { "epoch": 0.39054171915883323, "grad_norm": 0.40375033020973206, "learning_rate": 3.480851212228345e-05, "loss": 1.1876, "num_input_tokens_seen": 315794464, "step": 8060 }, { "epoch": 0.39102626223471265, "grad_norm": 0.43218469619750977, "learning_rate": 3.4772411288664576e-05, "loss": 1.2107, "num_input_tokens_seen": 316178848, "step": 8070 }, { "epoch": 0.3915108053105921, "grad_norm": 0.3920303285121918, "learning_rate": 3.473628638593618e-05, "loss": 1.1755, "num_input_tokens_seen": 316541328, "step": 8080 }, { "epoch": 0.39199534838647154, "grad_norm": 0.405202716588974, "learning_rate": 3.470013750307263e-05, "loss": 1.1929, "num_input_tokens_seen": 316916528, "step": 8090 }, { "epoch": 0.392479891462351, "grad_norm": 0.39330440759658813, "learning_rate": 3.466396472910739e-05, "loss": 1.1855, "num_input_tokens_seen": 317357952, "step": 8100 }, { "epoch": 0.39296443453823043, "grad_norm": 0.3834272623062134, "learning_rate": 3.462776815313274e-05, "loss": 1.1863, "num_input_tokens_seen": 317785332, "step": 8110 }, { "epoch": 0.3934489776141099, "grad_norm": 0.40610063076019287, "learning_rate": 3.4591547864299576e-05, "loss": 1.1825, "num_input_tokens_seen": 318172140, "step": 8120 }, { "epoch": 0.3939335206899893, "grad_norm": 0.41190090775489807, "learning_rate": 3.455530395181722e-05, "loss": 1.1949, "num_input_tokens_seen": 318556536, "step": 8130 }, { "epoch": 0.3944180637658688, "grad_norm": 0.3720203936100006, "learning_rate": 3.451903650495317e-05, "loss": 1.1492, "num_input_tokens_seen": 318939604, "step": 8140 }, { "epoch": 0.3949026068417482, "grad_norm": 0.39604607224464417, "learning_rate": 3.448274561303288e-05, "loss": 1.2164, "num_input_tokens_seen": 319344624, "step": 8150 }, { "epoch": 0.3953871499176277, "grad_norm": 0.40863263607025146, "learning_rate": 3.444643136543957e-05, "loss": 1.1766, "num_input_tokens_seen": 319738832, "step": 8160 }, { "epoch": 0.3958716929935071, "grad_norm": 0.4493870437145233, "learning_rate": 3.4410093851613964e-05, "loss": 1.2016, "num_input_tokens_seen": 320145484, "step": 8170 }, { "epoch": 0.3963562360693866, "grad_norm": 0.3747680187225342, "learning_rate": 3.4373733161054095e-05, "loss": 1.1803, "num_input_tokens_seen": 320561880, "step": 8180 }, { "epoch": 0.396840779145266, "grad_norm": 0.39447423815727234, "learning_rate": 3.433734938331508e-05, "loss": 1.1853, "num_input_tokens_seen": 320944848, "step": 8190 }, { "epoch": 0.39732532222114547, "grad_norm": 0.4037897288799286, "learning_rate": 3.43009426080089e-05, "loss": 1.1759, "num_input_tokens_seen": 321319020, "step": 8200 }, { "epoch": 0.3978098652970249, "grad_norm": 0.3832351565361023, "learning_rate": 3.4264512924804175e-05, "loss": 1.1788, "num_input_tokens_seen": 321694824, "step": 8210 }, { "epoch": 0.39829440837290436, "grad_norm": 0.42781588435173035, "learning_rate": 3.422806042342596e-05, "loss": 1.1784, "num_input_tokens_seen": 322077948, "step": 8220 }, { "epoch": 0.3987789514487838, "grad_norm": 0.3955729305744171, "learning_rate": 3.419158519365548e-05, "loss": 1.234, "num_input_tokens_seen": 322505044, "step": 8230 }, { "epoch": 0.39926349452466325, "grad_norm": 0.419911652803421, "learning_rate": 3.4155087325329985e-05, "loss": 1.1763, "num_input_tokens_seen": 322911104, "step": 8240 }, { "epoch": 0.39974803760054267, "grad_norm": 0.4269978702068329, "learning_rate": 3.411856690834243e-05, "loss": 1.17, "num_input_tokens_seen": 323320996, "step": 8250 }, { "epoch": 0.40023258067642214, "grad_norm": 0.36580172181129456, "learning_rate": 3.408202403264135e-05, "loss": 1.1565, "num_input_tokens_seen": 323709072, "step": 8260 }, { "epoch": 0.40071712375230156, "grad_norm": 0.3848220705986023, "learning_rate": 3.404545878823058e-05, "loss": 1.1959, "num_input_tokens_seen": 324080944, "step": 8270 }, { "epoch": 0.40120166682818104, "grad_norm": 0.4162658452987671, "learning_rate": 3.400887126516904e-05, "loss": 1.1497, "num_input_tokens_seen": 324485356, "step": 8280 }, { "epoch": 0.40168620990406045, "grad_norm": 0.40066495537757874, "learning_rate": 3.3972261553570536e-05, "loss": 1.1657, "num_input_tokens_seen": 324871712, "step": 8290 }, { "epoch": 0.4021707529799399, "grad_norm": 0.3958793878555298, "learning_rate": 3.3935629743603496e-05, "loss": 1.1941, "num_input_tokens_seen": 325257684, "step": 8300 }, { "epoch": 0.40265529605581935, "grad_norm": 0.408408522605896, "learning_rate": 3.389897592549082e-05, "loss": 1.1954, "num_input_tokens_seen": 325638416, "step": 8310 }, { "epoch": 0.4031398391316988, "grad_norm": 0.36070606112480164, "learning_rate": 3.3862300189509564e-05, "loss": 1.2009, "num_input_tokens_seen": 326018508, "step": 8320 }, { "epoch": 0.40362438220757824, "grad_norm": 0.3974900543689728, "learning_rate": 3.382560262599082e-05, "loss": 1.1711, "num_input_tokens_seen": 326405592, "step": 8330 }, { "epoch": 0.4041089252834577, "grad_norm": 0.3950560390949249, "learning_rate": 3.378888332531939e-05, "loss": 1.1634, "num_input_tokens_seen": 326784284, "step": 8340 }, { "epoch": 0.4045934683593371, "grad_norm": 0.36520737409591675, "learning_rate": 3.3752142377933646e-05, "loss": 1.1633, "num_input_tokens_seen": 327186212, "step": 8350 }, { "epoch": 0.4050780114352166, "grad_norm": 0.4279801547527313, "learning_rate": 3.371537987432526e-05, "loss": 1.1751, "num_input_tokens_seen": 327580260, "step": 8360 }, { "epoch": 0.405562554511096, "grad_norm": 0.4253396987915039, "learning_rate": 3.367859590503901e-05, "loss": 1.1813, "num_input_tokens_seen": 327974240, "step": 8370 }, { "epoch": 0.4060470975869755, "grad_norm": 0.46800392866134644, "learning_rate": 3.364179056067253e-05, "loss": 1.2001, "num_input_tokens_seen": 328378912, "step": 8380 }, { "epoch": 0.4065316406628549, "grad_norm": 0.3665182292461395, "learning_rate": 3.360496393187609e-05, "loss": 1.2047, "num_input_tokens_seen": 328787716, "step": 8390 }, { "epoch": 0.4070161837387344, "grad_norm": 0.38969382643699646, "learning_rate": 3.356811610935241e-05, "loss": 1.1611, "num_input_tokens_seen": 329162816, "step": 8400 }, { "epoch": 0.4075007268146138, "grad_norm": 0.3802407383918762, "learning_rate": 3.353124718385641e-05, "loss": 1.2203, "num_input_tokens_seen": 329545976, "step": 8410 }, { "epoch": 0.4079852698904933, "grad_norm": 0.4015510082244873, "learning_rate": 3.3494357246194956e-05, "loss": 1.1915, "num_input_tokens_seen": 329952104, "step": 8420 }, { "epoch": 0.4084698129663727, "grad_norm": 0.3899809420108795, "learning_rate": 3.345744638722669e-05, "loss": 1.1899, "num_input_tokens_seen": 330329116, "step": 8430 }, { "epoch": 0.40895435604225217, "grad_norm": 0.3643754720687866, "learning_rate": 3.3420514697861766e-05, "loss": 1.2037, "num_input_tokens_seen": 330723032, "step": 8440 }, { "epoch": 0.4094388991181316, "grad_norm": 0.3885078728199005, "learning_rate": 3.338356226906166e-05, "loss": 1.2045, "num_input_tokens_seen": 331128052, "step": 8450 }, { "epoch": 0.40992344219401106, "grad_norm": 0.4163413643836975, "learning_rate": 3.3346589191838915e-05, "loss": 1.1901, "num_input_tokens_seen": 331532708, "step": 8460 }, { "epoch": 0.4104079852698905, "grad_norm": 0.36400115489959717, "learning_rate": 3.330959555725695e-05, "loss": 1.1846, "num_input_tokens_seen": 331914620, "step": 8470 }, { "epoch": 0.41089252834576995, "grad_norm": 0.36604899168014526, "learning_rate": 3.327258145642979e-05, "loss": 1.1852, "num_input_tokens_seen": 332313264, "step": 8480 }, { "epoch": 0.41137707142164937, "grad_norm": 0.4256085455417633, "learning_rate": 3.323554698052187e-05, "loss": 1.1764, "num_input_tokens_seen": 332721544, "step": 8490 }, { "epoch": 0.41186161449752884, "grad_norm": 0.39886176586151123, "learning_rate": 3.3198492220747834e-05, "loss": 1.1835, "num_input_tokens_seen": 333090772, "step": 8500 }, { "epoch": 0.41234615757340826, "grad_norm": 0.3774135410785675, "learning_rate": 3.316141726837226e-05, "loss": 1.1219, "num_input_tokens_seen": 333473080, "step": 8510 }, { "epoch": 0.41283070064928773, "grad_norm": 0.4200323522090912, "learning_rate": 3.312432221470947e-05, "loss": 1.187, "num_input_tokens_seen": 333860928, "step": 8520 }, { "epoch": 0.41331524372516715, "grad_norm": 0.433433473110199, "learning_rate": 3.30872071511233e-05, "loss": 1.1961, "num_input_tokens_seen": 334240240, "step": 8530 }, { "epoch": 0.4137997868010466, "grad_norm": 0.38179147243499756, "learning_rate": 3.3050072169026855e-05, "loss": 1.2174, "num_input_tokens_seen": 334633964, "step": 8540 }, { "epoch": 0.41428432987692604, "grad_norm": 0.3912273049354553, "learning_rate": 3.30129173598823e-05, "loss": 1.1801, "num_input_tokens_seen": 335042032, "step": 8550 }, { "epoch": 0.4147688729528055, "grad_norm": 0.3975712060928345, "learning_rate": 3.297574281520065e-05, "loss": 1.1647, "num_input_tokens_seen": 335436124, "step": 8560 }, { "epoch": 0.41525341602868493, "grad_norm": 0.45118552446365356, "learning_rate": 3.293854862654151e-05, "loss": 1.1916, "num_input_tokens_seen": 335836836, "step": 8570 }, { "epoch": 0.4157379591045644, "grad_norm": 0.3697469234466553, "learning_rate": 3.290133488551288e-05, "loss": 1.1983, "num_input_tokens_seen": 336228900, "step": 8580 }, { "epoch": 0.4162225021804438, "grad_norm": 0.37667909264564514, "learning_rate": 3.286410168377091e-05, "loss": 1.1968, "num_input_tokens_seen": 336622764, "step": 8590 }, { "epoch": 0.4167070452563233, "grad_norm": 0.36621060967445374, "learning_rate": 3.282684911301968e-05, "loss": 1.1642, "num_input_tokens_seen": 337027652, "step": 8600 }, { "epoch": 0.4171915883322027, "grad_norm": 0.39898422360420227, "learning_rate": 3.2789577265010974e-05, "loss": 1.2309, "num_input_tokens_seen": 337410656, "step": 8610 }, { "epoch": 0.4176761314080822, "grad_norm": 0.40708452463150024, "learning_rate": 3.275228623154406e-05, "loss": 1.1755, "num_input_tokens_seen": 337815548, "step": 8620 }, { "epoch": 0.4181606744839616, "grad_norm": 0.39701101183891296, "learning_rate": 3.271497610446547e-05, "loss": 1.2235, "num_input_tokens_seen": 338219464, "step": 8630 }, { "epoch": 0.4186452175598411, "grad_norm": 0.3921484351158142, "learning_rate": 3.267764697566874e-05, "loss": 1.1719, "num_input_tokens_seen": 338635512, "step": 8640 }, { "epoch": 0.4191297606357205, "grad_norm": 0.37697601318359375, "learning_rate": 3.264029893709422e-05, "loss": 1.1836, "num_input_tokens_seen": 339021496, "step": 8650 }, { "epoch": 0.41961430371159997, "grad_norm": 0.3878484070301056, "learning_rate": 3.260293208072883e-05, "loss": 1.1847, "num_input_tokens_seen": 339403824, "step": 8660 }, { "epoch": 0.4200988467874794, "grad_norm": 0.42603814601898193, "learning_rate": 3.2565546498605834e-05, "loss": 1.1747, "num_input_tokens_seen": 339808580, "step": 8670 }, { "epoch": 0.42058338986335886, "grad_norm": 0.3939875662326813, "learning_rate": 3.252814228280464e-05, "loss": 1.2169, "num_input_tokens_seen": 340199464, "step": 8680 }, { "epoch": 0.4210679329392383, "grad_norm": 0.3872847855091095, "learning_rate": 3.249071952545052e-05, "loss": 1.1669, "num_input_tokens_seen": 340608396, "step": 8690 }, { "epoch": 0.42155247601511775, "grad_norm": 0.4044415056705475, "learning_rate": 3.245327831871442e-05, "loss": 1.2167, "num_input_tokens_seen": 340997568, "step": 8700 }, { "epoch": 0.42203701909099717, "grad_norm": 0.4061294198036194, "learning_rate": 3.2415818754812735e-05, "loss": 1.179, "num_input_tokens_seen": 341399468, "step": 8710 }, { "epoch": 0.42252156216687664, "grad_norm": 0.3923657238483429, "learning_rate": 3.237834092600709e-05, "loss": 1.1482, "num_input_tokens_seen": 341803448, "step": 8720 }, { "epoch": 0.42300610524275606, "grad_norm": 0.3841986060142517, "learning_rate": 3.234084492460404e-05, "loss": 1.1648, "num_input_tokens_seen": 342189656, "step": 8730 }, { "epoch": 0.42349064831863553, "grad_norm": 0.4120123088359833, "learning_rate": 3.230333084295496e-05, "loss": 1.1752, "num_input_tokens_seen": 342569416, "step": 8740 }, { "epoch": 0.42397519139451495, "grad_norm": 0.44047102332115173, "learning_rate": 3.226579877345572e-05, "loss": 1.2093, "num_input_tokens_seen": 342959072, "step": 8750 }, { "epoch": 0.4244597344703944, "grad_norm": 0.38502237200737, "learning_rate": 3.22282488085465e-05, "loss": 1.1982, "num_input_tokens_seen": 343375272, "step": 8760 }, { "epoch": 0.42494427754627384, "grad_norm": 0.37675246596336365, "learning_rate": 3.2190681040711566e-05, "loss": 1.168, "num_input_tokens_seen": 343750180, "step": 8770 }, { "epoch": 0.4254288206221533, "grad_norm": 0.36886003613471985, "learning_rate": 3.215309556247903e-05, "loss": 1.2242, "num_input_tokens_seen": 344158160, "step": 8780 }, { "epoch": 0.42591336369803273, "grad_norm": 0.40360134840011597, "learning_rate": 3.21154924664206e-05, "loss": 1.2046, "num_input_tokens_seen": 344541580, "step": 8790 }, { "epoch": 0.4263979067739122, "grad_norm": 0.39642900228500366, "learning_rate": 3.207787184515142e-05, "loss": 1.2041, "num_input_tokens_seen": 344912764, "step": 8800 }, { "epoch": 0.4268824498497916, "grad_norm": 0.40765756368637085, "learning_rate": 3.204023379132975e-05, "loss": 1.1939, "num_input_tokens_seen": 345319956, "step": 8810 }, { "epoch": 0.4273669929256711, "grad_norm": 0.4131956100463867, "learning_rate": 3.2002578397656826e-05, "loss": 1.1394, "num_input_tokens_seen": 345697944, "step": 8820 }, { "epoch": 0.4278515360015505, "grad_norm": 0.4072543978691101, "learning_rate": 3.196490575687657e-05, "loss": 1.1678, "num_input_tokens_seen": 346082640, "step": 8830 }, { "epoch": 0.42833607907743, "grad_norm": 0.41665875911712646, "learning_rate": 3.192721596177538e-05, "loss": 1.1764, "num_input_tokens_seen": 346505640, "step": 8840 }, { "epoch": 0.4288206221533094, "grad_norm": 0.4276581406593323, "learning_rate": 3.188950910518193e-05, "loss": 1.1871, "num_input_tokens_seen": 346926848, "step": 8850 }, { "epoch": 0.4293051652291889, "grad_norm": 0.40989550948143005, "learning_rate": 3.185178527996687e-05, "loss": 1.2048, "num_input_tokens_seen": 347308076, "step": 8860 }, { "epoch": 0.4297897083050683, "grad_norm": 0.3860456347465515, "learning_rate": 3.18140445790427e-05, "loss": 1.1526, "num_input_tokens_seen": 347719608, "step": 8870 }, { "epoch": 0.43027425138094777, "grad_norm": 0.360655277967453, "learning_rate": 3.1776287095363435e-05, "loss": 1.1543, "num_input_tokens_seen": 348104112, "step": 8880 }, { "epoch": 0.4307587944568272, "grad_norm": 0.42288416624069214, "learning_rate": 3.173851292192446e-05, "loss": 1.1498, "num_input_tokens_seen": 348488872, "step": 8890 }, { "epoch": 0.43124333753270666, "grad_norm": 0.39610555768013, "learning_rate": 3.170072215176224e-05, "loss": 1.1501, "num_input_tokens_seen": 348882096, "step": 8900 }, { "epoch": 0.4317278806085861, "grad_norm": 0.40388280153274536, "learning_rate": 3.166291487795413e-05, "loss": 1.214, "num_input_tokens_seen": 349271852, "step": 8910 }, { "epoch": 0.43221242368446555, "grad_norm": 0.41157808899879456, "learning_rate": 3.1625091193618144e-05, "loss": 1.1712, "num_input_tokens_seen": 349662936, "step": 8920 }, { "epoch": 0.43269696676034497, "grad_norm": 0.41196179389953613, "learning_rate": 3.158725119191269e-05, "loss": 1.1742, "num_input_tokens_seen": 350080888, "step": 8930 }, { "epoch": 0.43318150983622444, "grad_norm": 0.3659604489803314, "learning_rate": 3.1549394966036384e-05, "loss": 1.1984, "num_input_tokens_seen": 350479392, "step": 8940 }, { "epoch": 0.43366605291210386, "grad_norm": 0.3877184987068176, "learning_rate": 3.1511522609227795e-05, "loss": 1.1329, "num_input_tokens_seen": 350885976, "step": 8950 }, { "epoch": 0.43415059598798333, "grad_norm": 0.4030652642250061, "learning_rate": 3.147363421476522e-05, "loss": 1.2162, "num_input_tokens_seen": 351282332, "step": 8960 }, { "epoch": 0.43463513906386275, "grad_norm": 0.38181596994400024, "learning_rate": 3.1435729875966455e-05, "loss": 1.1721, "num_input_tokens_seen": 351676412, "step": 8970 }, { "epoch": 0.4351196821397422, "grad_norm": 0.4003159701824188, "learning_rate": 3.139780968618858e-05, "loss": 1.1416, "num_input_tokens_seen": 352076632, "step": 8980 }, { "epoch": 0.43560422521562164, "grad_norm": 0.3967534303665161, "learning_rate": 3.13598737388277e-05, "loss": 1.1532, "num_input_tokens_seen": 352466764, "step": 8990 }, { "epoch": 0.4360887682915011, "grad_norm": 0.3866839110851288, "learning_rate": 3.1321922127318724e-05, "loss": 1.1999, "num_input_tokens_seen": 352848332, "step": 9000 }, { "epoch": 0.43657331136738053, "grad_norm": 0.3901658356189728, "learning_rate": 3.128395494513517e-05, "loss": 1.2005, "num_input_tokens_seen": 353205300, "step": 9010 }, { "epoch": 0.43705785444326, "grad_norm": 0.3866805136203766, "learning_rate": 3.1245972285788885e-05, "loss": 1.1484, "num_input_tokens_seen": 353606764, "step": 9020 }, { "epoch": 0.4375423975191394, "grad_norm": 0.3832225799560547, "learning_rate": 3.120797424282984e-05, "loss": 1.1939, "num_input_tokens_seen": 353998028, "step": 9030 }, { "epoch": 0.4380269405950189, "grad_norm": 0.39194390177726746, "learning_rate": 3.116996090984588e-05, "loss": 1.1963, "num_input_tokens_seen": 354399420, "step": 9040 }, { "epoch": 0.4385114836708983, "grad_norm": 0.3819257915019989, "learning_rate": 3.113193238046255e-05, "loss": 1.1531, "num_input_tokens_seen": 354797548, "step": 9050 }, { "epoch": 0.4389960267467778, "grad_norm": 0.3843849301338196, "learning_rate": 3.1093888748342765e-05, "loss": 1.1642, "num_input_tokens_seen": 355200164, "step": 9060 }, { "epoch": 0.4394805698226572, "grad_norm": 0.404051274061203, "learning_rate": 3.10558301071867e-05, "loss": 1.1794, "num_input_tokens_seen": 355583100, "step": 9070 }, { "epoch": 0.4399651128985367, "grad_norm": 0.3765638768672943, "learning_rate": 3.1017756550731437e-05, "loss": 1.2015, "num_input_tokens_seen": 355999884, "step": 9080 }, { "epoch": 0.4404496559744161, "grad_norm": 0.4142244756221771, "learning_rate": 3.097966817275085e-05, "loss": 1.1928, "num_input_tokens_seen": 356363412, "step": 9090 }, { "epoch": 0.44093419905029557, "grad_norm": 0.4150756597518921, "learning_rate": 3.0941565067055275e-05, "loss": 1.1665, "num_input_tokens_seen": 356759816, "step": 9100 }, { "epoch": 0.441418742126175, "grad_norm": 0.4147997498512268, "learning_rate": 3.090344732749134e-05, "loss": 1.1488, "num_input_tokens_seen": 357163868, "step": 9110 }, { "epoch": 0.44190328520205446, "grad_norm": 0.3696651756763458, "learning_rate": 3.086531504794172e-05, "loss": 1.1948, "num_input_tokens_seen": 357578232, "step": 9120 }, { "epoch": 0.4423878282779339, "grad_norm": 0.38870376348495483, "learning_rate": 3.0827168322324884e-05, "loss": 1.141, "num_input_tokens_seen": 357996944, "step": 9130 }, { "epoch": 0.44287237135381335, "grad_norm": 0.4145285487174988, "learning_rate": 3.0789007244594904e-05, "loss": 1.1882, "num_input_tokens_seen": 358391992, "step": 9140 }, { "epoch": 0.44335691442969277, "grad_norm": 0.4269642233848572, "learning_rate": 3.0750831908741176e-05, "loss": 1.1631, "num_input_tokens_seen": 358795456, "step": 9150 }, { "epoch": 0.44384145750557225, "grad_norm": 0.4123391807079315, "learning_rate": 3.071264240878824e-05, "loss": 1.223, "num_input_tokens_seen": 359217840, "step": 9160 }, { "epoch": 0.44432600058145166, "grad_norm": 0.3737592399120331, "learning_rate": 3.06744388387955e-05, "loss": 1.1953, "num_input_tokens_seen": 359596700, "step": 9170 }, { "epoch": 0.44481054365733114, "grad_norm": 0.35300344228744507, "learning_rate": 3.0636221292857014e-05, "loss": 1.1526, "num_input_tokens_seen": 359971336, "step": 9180 }, { "epoch": 0.4452950867332106, "grad_norm": 0.42388081550598145, "learning_rate": 3.059798986510129e-05, "loss": 1.2093, "num_input_tokens_seen": 360365460, "step": 9190 }, { "epoch": 0.44577962980909003, "grad_norm": 0.4228881895542145, "learning_rate": 3.055974464969099e-05, "loss": 1.183, "num_input_tokens_seen": 360783972, "step": 9200 }, { "epoch": 0.4462641728849695, "grad_norm": 0.47396156191825867, "learning_rate": 3.0521485740822756e-05, "loss": 1.1713, "num_input_tokens_seen": 361168724, "step": 9210 }, { "epoch": 0.4467487159608489, "grad_norm": 0.3625274896621704, "learning_rate": 3.0483213232726943e-05, "loss": 1.1959, "num_input_tokens_seen": 361586496, "step": 9220 }, { "epoch": 0.4472332590367284, "grad_norm": 0.3853507936000824, "learning_rate": 3.044492721966743e-05, "loss": 1.1483, "num_input_tokens_seen": 361979848, "step": 9230 }, { "epoch": 0.4477178021126078, "grad_norm": 0.3805350959300995, "learning_rate": 3.040662779594131e-05, "loss": 1.145, "num_input_tokens_seen": 362373352, "step": 9240 }, { "epoch": 0.4482023451884873, "grad_norm": 0.3566093444824219, "learning_rate": 3.036831505587876e-05, "loss": 1.1617, "num_input_tokens_seen": 362795508, "step": 9250 }, { "epoch": 0.4486868882643667, "grad_norm": 0.4072115123271942, "learning_rate": 3.0329989093842708e-05, "loss": 1.1401, "num_input_tokens_seen": 363198416, "step": 9260 }, { "epoch": 0.4491714313402462, "grad_norm": 0.3717459440231323, "learning_rate": 3.0291650004228676e-05, "loss": 1.1726, "num_input_tokens_seen": 363578608, "step": 9270 }, { "epoch": 0.4496559744161256, "grad_norm": 0.3549041450023651, "learning_rate": 3.025329788146451e-05, "loss": 1.1873, "num_input_tokens_seen": 363973344, "step": 9280 }, { "epoch": 0.45014051749200507, "grad_norm": 0.40732115507125854, "learning_rate": 3.0214932820010155e-05, "loss": 1.1968, "num_input_tokens_seen": 364371260, "step": 9290 }, { "epoch": 0.4506250605678845, "grad_norm": 0.39712613821029663, "learning_rate": 3.017655491435742e-05, "loss": 1.1529, "num_input_tokens_seen": 364781760, "step": 9300 }, { "epoch": 0.45110960364376396, "grad_norm": 0.4143085181713104, "learning_rate": 3.0138164259029757e-05, "loss": 1.2141, "num_input_tokens_seen": 365169144, "step": 9310 }, { "epoch": 0.4515941467196434, "grad_norm": 0.41558942198753357, "learning_rate": 3.0099760948582023e-05, "loss": 1.1541, "num_input_tokens_seen": 365542904, "step": 9320 }, { "epoch": 0.45207868979552285, "grad_norm": 0.38127419352531433, "learning_rate": 3.0061345077600228e-05, "loss": 1.1399, "num_input_tokens_seen": 365944000, "step": 9330 }, { "epoch": 0.45256323287140227, "grad_norm": 0.3887827396392822, "learning_rate": 3.0022916740701334e-05, "loss": 1.1597, "num_input_tokens_seen": 366332944, "step": 9340 }, { "epoch": 0.45304777594728174, "grad_norm": 0.38723742961883545, "learning_rate": 2.9984476032532992e-05, "loss": 1.1531, "num_input_tokens_seen": 366697388, "step": 9350 }, { "epoch": 0.45353231902316116, "grad_norm": 0.39358997344970703, "learning_rate": 2.9946023047773336e-05, "loss": 1.1241, "num_input_tokens_seen": 367099924, "step": 9360 }, { "epoch": 0.45401686209904063, "grad_norm": 0.3835636079311371, "learning_rate": 2.9907557881130737e-05, "loss": 1.2058, "num_input_tokens_seen": 367476372, "step": 9370 }, { "epoch": 0.45450140517492005, "grad_norm": 0.40065279603004456, "learning_rate": 2.9869080627343555e-05, "loss": 1.1679, "num_input_tokens_seen": 367872588, "step": 9380 }, { "epoch": 0.4549859482507995, "grad_norm": 0.38955920934677124, "learning_rate": 2.9830591381179928e-05, "loss": 1.1893, "num_input_tokens_seen": 368260348, "step": 9390 }, { "epoch": 0.45547049132667894, "grad_norm": 0.4499861001968384, "learning_rate": 2.9792090237437543e-05, "loss": 1.1168, "num_input_tokens_seen": 368644964, "step": 9400 }, { "epoch": 0.4559550344025584, "grad_norm": 0.386730432510376, "learning_rate": 2.9753577290943376e-05, "loss": 1.2014, "num_input_tokens_seen": 369055912, "step": 9410 }, { "epoch": 0.45643957747843783, "grad_norm": 0.39342692494392395, "learning_rate": 2.971505263655347e-05, "loss": 1.1853, "num_input_tokens_seen": 369472516, "step": 9420 }, { "epoch": 0.4569241205543173, "grad_norm": 0.4010606110095978, "learning_rate": 2.9676516369152713e-05, "loss": 1.0836, "num_input_tokens_seen": 369855504, "step": 9430 }, { "epoch": 0.4574086636301967, "grad_norm": 0.41329410672187805, "learning_rate": 2.9637968583654597e-05, "loss": 1.1578, "num_input_tokens_seen": 370249352, "step": 9440 }, { "epoch": 0.4578932067060762, "grad_norm": 0.3728090226650238, "learning_rate": 2.9599409375000975e-05, "loss": 1.1782, "num_input_tokens_seen": 370653324, "step": 9450 }, { "epoch": 0.4583777497819556, "grad_norm": 0.37402981519699097, "learning_rate": 2.9560838838161843e-05, "loss": 1.1704, "num_input_tokens_seen": 371025900, "step": 9460 }, { "epoch": 0.4588622928578351, "grad_norm": 0.39774417877197266, "learning_rate": 2.9522257068135086e-05, "loss": 1.1619, "num_input_tokens_seen": 371424328, "step": 9470 }, { "epoch": 0.4593468359337145, "grad_norm": 0.383232444524765, "learning_rate": 2.9483664159946273e-05, "loss": 1.1858, "num_input_tokens_seen": 371838760, "step": 9480 }, { "epoch": 0.459831379009594, "grad_norm": 0.37605851888656616, "learning_rate": 2.9445060208648383e-05, "loss": 1.1487, "num_input_tokens_seen": 372233964, "step": 9490 }, { "epoch": 0.4603159220854734, "grad_norm": 0.376461386680603, "learning_rate": 2.9406445309321623e-05, "loss": 1.2029, "num_input_tokens_seen": 372599424, "step": 9500 }, { "epoch": 0.46080046516135287, "grad_norm": 0.413618266582489, "learning_rate": 2.9367819557073134e-05, "loss": 1.1964, "num_input_tokens_seen": 372979896, "step": 9510 }, { "epoch": 0.4612850082372323, "grad_norm": 0.40922775864601135, "learning_rate": 2.9329183047036807e-05, "loss": 1.1989, "num_input_tokens_seen": 373358408, "step": 9520 }, { "epoch": 0.46176955131311176, "grad_norm": 0.37975606322288513, "learning_rate": 2.9290535874373022e-05, "loss": 1.2165, "num_input_tokens_seen": 373734508, "step": 9530 }, { "epoch": 0.4622540943889912, "grad_norm": 0.35514163970947266, "learning_rate": 2.925187813426843e-05, "loss": 1.1367, "num_input_tokens_seen": 374105356, "step": 9540 }, { "epoch": 0.46273863746487065, "grad_norm": 0.3892764151096344, "learning_rate": 2.9213209921935703e-05, "loss": 1.194, "num_input_tokens_seen": 374514008, "step": 9550 }, { "epoch": 0.46322318054075007, "grad_norm": 0.35142743587493896, "learning_rate": 2.9174531332613292e-05, "loss": 1.1837, "num_input_tokens_seen": 374918944, "step": 9560 }, { "epoch": 0.46370772361662954, "grad_norm": 0.42724543809890747, "learning_rate": 2.9135842461565225e-05, "loss": 1.1408, "num_input_tokens_seen": 375331548, "step": 9570 }, { "epoch": 0.46419226669250896, "grad_norm": 0.38824784755706787, "learning_rate": 2.9097143404080853e-05, "loss": 1.1919, "num_input_tokens_seen": 375707484, "step": 9580 }, { "epoch": 0.46467680976838843, "grad_norm": 0.3674311637878418, "learning_rate": 2.905843425547461e-05, "loss": 1.1473, "num_input_tokens_seen": 376112184, "step": 9590 }, { "epoch": 0.46516135284426785, "grad_norm": 0.3765534460544586, "learning_rate": 2.9019715111085773e-05, "loss": 1.1647, "num_input_tokens_seen": 376494180, "step": 9600 }, { "epoch": 0.4656458959201473, "grad_norm": 0.38776230812072754, "learning_rate": 2.8980986066278255e-05, "loss": 1.2236, "num_input_tokens_seen": 376885136, "step": 9610 }, { "epoch": 0.46613043899602674, "grad_norm": 0.37993282079696655, "learning_rate": 2.8942247216440354e-05, "loss": 1.2307, "num_input_tokens_seen": 377277268, "step": 9620 }, { "epoch": 0.4666149820719062, "grad_norm": 0.4081166088581085, "learning_rate": 2.8903498656984514e-05, "loss": 1.1428, "num_input_tokens_seen": 377661420, "step": 9630 }, { "epoch": 0.46709952514778563, "grad_norm": 0.3825088441371918, "learning_rate": 2.8864740483347074e-05, "loss": 1.1167, "num_input_tokens_seen": 378079588, "step": 9640 }, { "epoch": 0.4675840682236651, "grad_norm": 0.39228224754333496, "learning_rate": 2.8825972790988082e-05, "loss": 1.2089, "num_input_tokens_seen": 378473248, "step": 9650 }, { "epoch": 0.4680686112995445, "grad_norm": 0.40170297026634216, "learning_rate": 2.8787195675391015e-05, "loss": 1.1505, "num_input_tokens_seen": 378849372, "step": 9660 }, { "epoch": 0.468553154375424, "grad_norm": 0.39289790391921997, "learning_rate": 2.874840923206256e-05, "loss": 1.1567, "num_input_tokens_seen": 379273348, "step": 9670 }, { "epoch": 0.4690376974513034, "grad_norm": 0.3959096372127533, "learning_rate": 2.870961355653239e-05, "loss": 1.1711, "num_input_tokens_seen": 379660408, "step": 9680 }, { "epoch": 0.4695222405271829, "grad_norm": 0.43645361065864563, "learning_rate": 2.8670808744352884e-05, "loss": 1.1985, "num_input_tokens_seen": 380054412, "step": 9690 }, { "epoch": 0.4700067836030623, "grad_norm": 0.39692050218582153, "learning_rate": 2.863199489109897e-05, "loss": 1.182, "num_input_tokens_seen": 380485684, "step": 9700 }, { "epoch": 0.4704913266789418, "grad_norm": 0.42202410101890564, "learning_rate": 2.8593172092367797e-05, "loss": 1.1693, "num_input_tokens_seen": 380889292, "step": 9710 }, { "epoch": 0.4709758697548212, "grad_norm": 0.403148889541626, "learning_rate": 2.8554340443778594e-05, "loss": 1.1838, "num_input_tokens_seen": 381290904, "step": 9720 }, { "epoch": 0.47146041283070067, "grad_norm": 0.3972223699092865, "learning_rate": 2.8515500040972347e-05, "loss": 1.2143, "num_input_tokens_seen": 381694440, "step": 9730 }, { "epoch": 0.4719449559065801, "grad_norm": 0.3653228282928467, "learning_rate": 2.8476650979611623e-05, "loss": 1.1622, "num_input_tokens_seen": 382095168, "step": 9740 }, { "epoch": 0.47242949898245956, "grad_norm": 0.3957580626010895, "learning_rate": 2.843779335538031e-05, "loss": 1.1705, "num_input_tokens_seen": 382467792, "step": 9750 }, { "epoch": 0.472914042058339, "grad_norm": 0.4030523896217346, "learning_rate": 2.8398927263983382e-05, "loss": 1.1782, "num_input_tokens_seen": 382880656, "step": 9760 }, { "epoch": 0.47339858513421845, "grad_norm": 0.4061778485774994, "learning_rate": 2.8360052801146687e-05, "loss": 1.1652, "num_input_tokens_seen": 383259940, "step": 9770 }, { "epoch": 0.47388312821009787, "grad_norm": 0.38041362166404724, "learning_rate": 2.8321170062616664e-05, "loss": 1.1677, "num_input_tokens_seen": 383648640, "step": 9780 }, { "epoch": 0.47436767128597734, "grad_norm": 0.39732810854911804, "learning_rate": 2.828227914416015e-05, "loss": 1.1045, "num_input_tokens_seen": 384047384, "step": 9790 }, { "epoch": 0.47485221436185676, "grad_norm": 0.4323562681674957, "learning_rate": 2.824338014156412e-05, "loss": 1.1875, "num_input_tokens_seen": 384447200, "step": 9800 }, { "epoch": 0.47533675743773623, "grad_norm": 0.3938267230987549, "learning_rate": 2.8204473150635476e-05, "loss": 1.1853, "num_input_tokens_seen": 384839616, "step": 9810 }, { "epoch": 0.47582130051361565, "grad_norm": 0.3984321355819702, "learning_rate": 2.816555826720078e-05, "loss": 1.1566, "num_input_tokens_seen": 385227704, "step": 9820 }, { "epoch": 0.4763058435894951, "grad_norm": 0.37600892782211304, "learning_rate": 2.8126635587106036e-05, "loss": 1.1883, "num_input_tokens_seen": 385642608, "step": 9830 }, { "epoch": 0.47679038666537454, "grad_norm": 0.43121588230133057, "learning_rate": 2.808770520621646e-05, "loss": 1.1532, "num_input_tokens_seen": 386042136, "step": 9840 }, { "epoch": 0.477274929741254, "grad_norm": 0.4222434163093567, "learning_rate": 2.8048767220416223e-05, "loss": 1.175, "num_input_tokens_seen": 386420708, "step": 9850 }, { "epoch": 0.47775947281713343, "grad_norm": 0.423554927110672, "learning_rate": 2.800982172560823e-05, "loss": 1.1398, "num_input_tokens_seen": 386777908, "step": 9860 }, { "epoch": 0.4782440158930129, "grad_norm": 0.3951357305049896, "learning_rate": 2.7970868817713887e-05, "loss": 1.1527, "num_input_tokens_seen": 387176340, "step": 9870 }, { "epoch": 0.4787285589688923, "grad_norm": 0.3626028001308441, "learning_rate": 2.7931908592672844e-05, "loss": 1.1641, "num_input_tokens_seen": 387547504, "step": 9880 }, { "epoch": 0.4792131020447718, "grad_norm": 0.4093586504459381, "learning_rate": 2.78929411464428e-05, "loss": 1.1164, "num_input_tokens_seen": 387947832, "step": 9890 }, { "epoch": 0.4796976451206512, "grad_norm": 0.3790941536426544, "learning_rate": 2.7853966574999197e-05, "loss": 1.1421, "num_input_tokens_seen": 388371140, "step": 9900 }, { "epoch": 0.4801821881965307, "grad_norm": 0.44463813304901123, "learning_rate": 2.7814984974335067e-05, "loss": 1.178, "num_input_tokens_seen": 388758584, "step": 9910 }, { "epoch": 0.4806667312724101, "grad_norm": 0.40607205033302307, "learning_rate": 2.7775996440460733e-05, "loss": 1.2158, "num_input_tokens_seen": 389136284, "step": 9920 }, { "epoch": 0.4811512743482896, "grad_norm": 0.41725072264671326, "learning_rate": 2.7737001069403608e-05, "loss": 1.1731, "num_input_tokens_seen": 389528592, "step": 9930 }, { "epoch": 0.481635817424169, "grad_norm": 0.43715402483940125, "learning_rate": 2.7697998957207915e-05, "loss": 1.1359, "num_input_tokens_seen": 389924908, "step": 9940 }, { "epoch": 0.4821203605000485, "grad_norm": 0.4026664197444916, "learning_rate": 2.765899019993453e-05, "loss": 1.1334, "num_input_tokens_seen": 390327408, "step": 9950 }, { "epoch": 0.4826049035759279, "grad_norm": 0.38899385929107666, "learning_rate": 2.7619974893660643e-05, "loss": 1.1628, "num_input_tokens_seen": 390716640, "step": 9960 }, { "epoch": 0.48308944665180736, "grad_norm": 0.3987119197845459, "learning_rate": 2.758095313447961e-05, "loss": 1.1804, "num_input_tokens_seen": 391147420, "step": 9970 }, { "epoch": 0.4835739897276868, "grad_norm": 0.4419252872467041, "learning_rate": 2.754192501850066e-05, "loss": 1.1857, "num_input_tokens_seen": 391523552, "step": 9980 }, { "epoch": 0.48405853280356625, "grad_norm": 0.40142005681991577, "learning_rate": 2.7502890641848696e-05, "loss": 1.2214, "num_input_tokens_seen": 391906876, "step": 9990 }, { "epoch": 0.4845430758794457, "grad_norm": 0.36947381496429443, "learning_rate": 2.7463850100664028e-05, "loss": 1.1617, "num_input_tokens_seen": 392316428, "step": 10000 }, { "epoch": 0.4845430758794457, "eval_loss": 1.1596810817718506, "eval_runtime": 5.1226, "eval_samples_per_second": 29.282, "eval_steps_per_second": 3.709, "num_input_tokens_seen": 392316428, "step": 10000 }, { "epoch": 0.48502761895532515, "grad_norm": 0.38736334443092346, "learning_rate": 2.7424803491102158e-05, "loss": 1.1817, "num_input_tokens_seen": 392732788, "step": 10010 }, { "epoch": 0.48551216203120456, "grad_norm": 0.40144097805023193, "learning_rate": 2.738575090933352e-05, "loss": 1.1628, "num_input_tokens_seen": 393114108, "step": 10020 }, { "epoch": 0.48599670510708404, "grad_norm": 0.42609161138534546, "learning_rate": 2.7346692451543264e-05, "loss": 1.1679, "num_input_tokens_seen": 393509264, "step": 10030 }, { "epoch": 0.48648124818296345, "grad_norm": 0.4124153256416321, "learning_rate": 2.730762821393103e-05, "loss": 1.1491, "num_input_tokens_seen": 393915040, "step": 10040 }, { "epoch": 0.48696579125884293, "grad_norm": 0.42232540249824524, "learning_rate": 2.726855829271066e-05, "loss": 1.1626, "num_input_tokens_seen": 394289168, "step": 10050 }, { "epoch": 0.48745033433472235, "grad_norm": 0.4197705388069153, "learning_rate": 2.722948278411003e-05, "loss": 1.1708, "num_input_tokens_seen": 394680292, "step": 10060 }, { "epoch": 0.4879348774106018, "grad_norm": 0.3949076533317566, "learning_rate": 2.7190401784370743e-05, "loss": 1.1591, "num_input_tokens_seen": 395070520, "step": 10070 }, { "epoch": 0.48841942048648124, "grad_norm": 0.42010289430618286, "learning_rate": 2.7151315389747967e-05, "loss": 1.1333, "num_input_tokens_seen": 395500444, "step": 10080 }, { "epoch": 0.4889039635623607, "grad_norm": 0.36050793528556824, "learning_rate": 2.7112223696510108e-05, "loss": 1.1303, "num_input_tokens_seen": 395889484, "step": 10090 }, { "epoch": 0.48938850663824013, "grad_norm": 0.40366074442863464, "learning_rate": 2.7073126800938666e-05, "loss": 1.1765, "num_input_tokens_seen": 396277572, "step": 10100 }, { "epoch": 0.4898730497141196, "grad_norm": 0.410087913274765, "learning_rate": 2.7034024799327928e-05, "loss": 1.1247, "num_input_tokens_seen": 396678980, "step": 10110 }, { "epoch": 0.490357592789999, "grad_norm": 0.38809484243392944, "learning_rate": 2.6994917787984764e-05, "loss": 1.1675, "num_input_tokens_seen": 397061400, "step": 10120 }, { "epoch": 0.4908421358658785, "grad_norm": 0.4384767711162567, "learning_rate": 2.69558058632284e-05, "loss": 1.1633, "num_input_tokens_seen": 397457472, "step": 10130 }, { "epoch": 0.4913266789417579, "grad_norm": 0.4038788974285126, "learning_rate": 2.6916689121390127e-05, "loss": 1.1532, "num_input_tokens_seen": 397885272, "step": 10140 }, { "epoch": 0.4918112220176374, "grad_norm": 0.4135740399360657, "learning_rate": 2.6877567658813134e-05, "loss": 1.1828, "num_input_tokens_seen": 398292228, "step": 10150 }, { "epoch": 0.4922957650935168, "grad_norm": 0.3874277174472809, "learning_rate": 2.6838441571852223e-05, "loss": 1.1119, "num_input_tokens_seen": 398678720, "step": 10160 }, { "epoch": 0.4927803081693963, "grad_norm": 0.4087202548980713, "learning_rate": 2.679931095687358e-05, "loss": 1.1294, "num_input_tokens_seen": 399066144, "step": 10170 }, { "epoch": 0.4932648512452757, "grad_norm": 0.39371687173843384, "learning_rate": 2.6760175910254565e-05, "loss": 1.1805, "num_input_tokens_seen": 399480648, "step": 10180 }, { "epoch": 0.49374939432115517, "grad_norm": 0.39417147636413574, "learning_rate": 2.6721036528383424e-05, "loss": 1.1356, "num_input_tokens_seen": 399859596, "step": 10190 }, { "epoch": 0.4942339373970346, "grad_norm": 0.394105464220047, "learning_rate": 2.66818929076591e-05, "loss": 1.1561, "num_input_tokens_seen": 400214040, "step": 10200 }, { "epoch": 0.49471848047291406, "grad_norm": 0.3794916868209839, "learning_rate": 2.664274514449097e-05, "loss": 1.1644, "num_input_tokens_seen": 400607764, "step": 10210 }, { "epoch": 0.4952030235487935, "grad_norm": 0.43505197763442993, "learning_rate": 2.660359333529862e-05, "loss": 1.1151, "num_input_tokens_seen": 400977376, "step": 10220 }, { "epoch": 0.49568756662467295, "grad_norm": 0.373087614774704, "learning_rate": 2.6564437576511587e-05, "loss": 1.1589, "num_input_tokens_seen": 401363624, "step": 10230 }, { "epoch": 0.49617210970055237, "grad_norm": 0.41389840841293335, "learning_rate": 2.6525277964569155e-05, "loss": 1.1478, "num_input_tokens_seen": 401733716, "step": 10240 }, { "epoch": 0.49665665277643184, "grad_norm": 0.41126182675361633, "learning_rate": 2.648611459592008e-05, "loss": 1.1653, "num_input_tokens_seen": 402141292, "step": 10250 }, { "epoch": 0.49714119585231126, "grad_norm": 0.3853723108768463, "learning_rate": 2.644694756702238e-05, "loss": 1.1522, "num_input_tokens_seen": 402544236, "step": 10260 }, { "epoch": 0.49762573892819073, "grad_norm": 0.39461591839790344, "learning_rate": 2.640777697434309e-05, "loss": 1.175, "num_input_tokens_seen": 402961036, "step": 10270 }, { "epoch": 0.49811028200407015, "grad_norm": 0.4081992506980896, "learning_rate": 2.6368602914358008e-05, "loss": 1.16, "num_input_tokens_seen": 403377968, "step": 10280 }, { "epoch": 0.4985948250799496, "grad_norm": 0.4176690876483917, "learning_rate": 2.63294254835515e-05, "loss": 1.1912, "num_input_tokens_seen": 403777800, "step": 10290 }, { "epoch": 0.49907936815582904, "grad_norm": 0.38343364000320435, "learning_rate": 2.6290244778416202e-05, "loss": 1.1198, "num_input_tokens_seen": 404136320, "step": 10300 }, { "epoch": 0.4995639112317085, "grad_norm": 0.4266665577888489, "learning_rate": 2.6251060895452844e-05, "loss": 1.2289, "num_input_tokens_seen": 404515816, "step": 10310 }, { "epoch": 0.5000484543075879, "grad_norm": 0.36221781373023987, "learning_rate": 2.6211873931169955e-05, "loss": 1.1902, "num_input_tokens_seen": 404931996, "step": 10320 }, { "epoch": 0.5005329973834673, "grad_norm": 0.4041613042354584, "learning_rate": 2.6172683982083675e-05, "loss": 1.1757, "num_input_tokens_seen": 405288152, "step": 10330 }, { "epoch": 0.5010175404593469, "grad_norm": 0.38727837800979614, "learning_rate": 2.6133491144717475e-05, "loss": 1.1697, "num_input_tokens_seen": 405676608, "step": 10340 }, { "epoch": 0.5015020835352263, "grad_norm": 0.41572102904319763, "learning_rate": 2.6094295515601967e-05, "loss": 1.1641, "num_input_tokens_seen": 406068632, "step": 10350 }, { "epoch": 0.5019866266111057, "grad_norm": 0.42120370268821716, "learning_rate": 2.605509719127463e-05, "loss": 1.1326, "num_input_tokens_seen": 406465000, "step": 10360 }, { "epoch": 0.5024711696869851, "grad_norm": 0.4009106755256653, "learning_rate": 2.6015896268279555e-05, "loss": 1.1493, "num_input_tokens_seen": 406845928, "step": 10370 }, { "epoch": 0.5029557127628647, "grad_norm": 0.3664901554584503, "learning_rate": 2.5976692843167255e-05, "loss": 1.1917, "num_input_tokens_seen": 407225352, "step": 10380 }, { "epoch": 0.5034402558387441, "grad_norm": 0.4137326180934906, "learning_rate": 2.5937487012494422e-05, "loss": 1.1673, "num_input_tokens_seen": 407601740, "step": 10390 }, { "epoch": 0.5039247989146235, "grad_norm": 0.3960126042366028, "learning_rate": 2.5898278872823646e-05, "loss": 1.1413, "num_input_tokens_seen": 407997996, "step": 10400 }, { "epoch": 0.5044093419905029, "grad_norm": 0.42062610387802124, "learning_rate": 2.5859068520723205e-05, "loss": 1.1753, "num_input_tokens_seen": 408377408, "step": 10410 }, { "epoch": 0.5048938850663824, "grad_norm": 0.35909387469291687, "learning_rate": 2.581985605276684e-05, "loss": 1.1511, "num_input_tokens_seen": 408721880, "step": 10420 }, { "epoch": 0.5053784281422619, "grad_norm": 0.42842748761177063, "learning_rate": 2.578064156553349e-05, "loss": 1.1312, "num_input_tokens_seen": 409143260, "step": 10430 }, { "epoch": 0.5058629712181413, "grad_norm": 0.3839662969112396, "learning_rate": 2.5741425155607097e-05, "loss": 1.1704, "num_input_tokens_seen": 409535852, "step": 10440 }, { "epoch": 0.5063475142940207, "grad_norm": 0.41793426871299744, "learning_rate": 2.57022069195763e-05, "loss": 1.1989, "num_input_tokens_seen": 409937740, "step": 10450 }, { "epoch": 0.5068320573699002, "grad_norm": 0.4150349199771881, "learning_rate": 2.5662986954034245e-05, "loss": 1.1293, "num_input_tokens_seen": 410319088, "step": 10460 }, { "epoch": 0.5073166004457796, "grad_norm": 0.37542667984962463, "learning_rate": 2.5623765355578354e-05, "loss": 1.2247, "num_input_tokens_seen": 410678500, "step": 10470 }, { "epoch": 0.5078011435216591, "grad_norm": 0.4027327001094818, "learning_rate": 2.5584542220810065e-05, "loss": 1.1796, "num_input_tokens_seen": 411079744, "step": 10480 }, { "epoch": 0.5082856865975385, "grad_norm": 0.39059481024742126, "learning_rate": 2.55453176463346e-05, "loss": 1.1849, "num_input_tokens_seen": 411488444, "step": 10490 }, { "epoch": 0.508770229673418, "grad_norm": 0.40780818462371826, "learning_rate": 2.5506091728760702e-05, "loss": 1.1187, "num_input_tokens_seen": 411879348, "step": 10500 }, { "epoch": 0.5092547727492974, "grad_norm": 0.3999616503715515, "learning_rate": 2.546686456470046e-05, "loss": 1.1775, "num_input_tokens_seen": 412294412, "step": 10510 }, { "epoch": 0.5097393158251768, "grad_norm": 0.4117770195007324, "learning_rate": 2.5427636250769016e-05, "loss": 1.1678, "num_input_tokens_seen": 412688988, "step": 10520 }, { "epoch": 0.5102238589010563, "grad_norm": 0.3824278712272644, "learning_rate": 2.538840688358435e-05, "loss": 1.1374, "num_input_tokens_seen": 413069468, "step": 10530 }, { "epoch": 0.5107084019769358, "grad_norm": 0.3502415418624878, "learning_rate": 2.5349176559767007e-05, "loss": 1.2013, "num_input_tokens_seen": 413440188, "step": 10540 }, { "epoch": 0.5111929450528152, "grad_norm": 0.3400461971759796, "learning_rate": 2.530994537593994e-05, "loss": 1.1755, "num_input_tokens_seen": 413826580, "step": 10550 }, { "epoch": 0.5116774881286946, "grad_norm": 0.4381474554538727, "learning_rate": 2.527071342872817e-05, "loss": 1.1644, "num_input_tokens_seen": 414235248, "step": 10560 }, { "epoch": 0.512162031204574, "grad_norm": 0.3964703381061554, "learning_rate": 2.5231480814758633e-05, "loss": 1.1745, "num_input_tokens_seen": 414635544, "step": 10570 }, { "epoch": 0.5126465742804536, "grad_norm": 0.3687261939048767, "learning_rate": 2.51922476306599e-05, "loss": 1.166, "num_input_tokens_seen": 415043552, "step": 10580 }, { "epoch": 0.513131117356333, "grad_norm": 0.3944050669670105, "learning_rate": 2.5153013973061916e-05, "loss": 1.2209, "num_input_tokens_seen": 415413920, "step": 10590 }, { "epoch": 0.5136156604322124, "grad_norm": 0.4199015498161316, "learning_rate": 2.511377993859584e-05, "loss": 1.1452, "num_input_tokens_seen": 415791632, "step": 10600 }, { "epoch": 0.5141002035080918, "grad_norm": 0.4487917721271515, "learning_rate": 2.507454562389372e-05, "loss": 1.1334, "num_input_tokens_seen": 416204172, "step": 10610 }, { "epoch": 0.5145847465839714, "grad_norm": 0.4062841832637787, "learning_rate": 2.5035311125588322e-05, "loss": 1.1398, "num_input_tokens_seen": 416591364, "step": 10620 }, { "epoch": 0.5150692896598508, "grad_norm": 0.41555893421173096, "learning_rate": 2.4996076540312854e-05, "loss": 1.1504, "num_input_tokens_seen": 416957332, "step": 10630 }, { "epoch": 0.5155538327357302, "grad_norm": 0.35754913091659546, "learning_rate": 2.4956841964700718e-05, "loss": 1.1095, "num_input_tokens_seen": 417326068, "step": 10640 }, { "epoch": 0.5160383758116096, "grad_norm": 0.381197065114975, "learning_rate": 2.4917607495385338e-05, "loss": 1.1859, "num_input_tokens_seen": 417756524, "step": 10650 }, { "epoch": 0.5165229188874891, "grad_norm": 0.41809821128845215, "learning_rate": 2.4878373228999835e-05, "loss": 1.1568, "num_input_tokens_seen": 418159804, "step": 10660 }, { "epoch": 0.5170074619633686, "grad_norm": 0.39222896099090576, "learning_rate": 2.4839139262176837e-05, "loss": 1.1618, "num_input_tokens_seen": 418565656, "step": 10670 }, { "epoch": 0.517492005039248, "grad_norm": 0.38486379384994507, "learning_rate": 2.4799905691548245e-05, "loss": 1.1738, "num_input_tokens_seen": 418952312, "step": 10680 }, { "epoch": 0.5179765481151274, "grad_norm": 0.3719366788864136, "learning_rate": 2.476067261374499e-05, "loss": 1.1973, "num_input_tokens_seen": 419369500, "step": 10690 }, { "epoch": 0.5184610911910069, "grad_norm": 0.3997963070869446, "learning_rate": 2.4721440125396773e-05, "loss": 1.1287, "num_input_tokens_seen": 419730868, "step": 10700 }, { "epoch": 0.5189456342668863, "grad_norm": 0.38283371925354004, "learning_rate": 2.4682208323131852e-05, "loss": 1.1742, "num_input_tokens_seen": 420139208, "step": 10710 }, { "epoch": 0.5194301773427658, "grad_norm": 0.3874126970767975, "learning_rate": 2.464297730357678e-05, "loss": 1.1564, "num_input_tokens_seen": 420521152, "step": 10720 }, { "epoch": 0.5199147204186452, "grad_norm": 0.3984072804450989, "learning_rate": 2.460374716335622e-05, "loss": 1.1473, "num_input_tokens_seen": 420929760, "step": 10730 }, { "epoch": 0.5203992634945247, "grad_norm": 0.3916292190551758, "learning_rate": 2.456451799909263e-05, "loss": 1.1685, "num_input_tokens_seen": 421298444, "step": 10740 }, { "epoch": 0.5208838065704041, "grad_norm": 0.4141142964363098, "learning_rate": 2.4525289907406068e-05, "loss": 1.139, "num_input_tokens_seen": 421670376, "step": 10750 }, { "epoch": 0.5213683496462835, "grad_norm": 0.4091172218322754, "learning_rate": 2.448606298491399e-05, "loss": 1.1174, "num_input_tokens_seen": 422062488, "step": 10760 }, { "epoch": 0.521852892722163, "grad_norm": 0.413764089345932, "learning_rate": 2.4446837328230907e-05, "loss": 1.157, "num_input_tokens_seen": 422469080, "step": 10770 }, { "epoch": 0.5223374357980425, "grad_norm": 0.35379403829574585, "learning_rate": 2.4407613033968287e-05, "loss": 1.1344, "num_input_tokens_seen": 422867012, "step": 10780 }, { "epoch": 0.5228219788739219, "grad_norm": 0.3597969114780426, "learning_rate": 2.436839019873418e-05, "loss": 1.1457, "num_input_tokens_seen": 423225644, "step": 10790 }, { "epoch": 0.5233065219498013, "grad_norm": 0.3982402980327606, "learning_rate": 2.4329168919133062e-05, "loss": 1.1819, "num_input_tokens_seen": 423605548, "step": 10800 }, { "epoch": 0.5237910650256807, "grad_norm": 0.452061265707016, "learning_rate": 2.4289949291765608e-05, "loss": 1.1438, "num_input_tokens_seen": 423993544, "step": 10810 }, { "epoch": 0.5242756081015603, "grad_norm": 0.4269421100616455, "learning_rate": 2.4250731413228374e-05, "loss": 1.1333, "num_input_tokens_seen": 424365256, "step": 10820 }, { "epoch": 0.5247601511774397, "grad_norm": 0.3826684057712555, "learning_rate": 2.421151538011364e-05, "loss": 1.1673, "num_input_tokens_seen": 424768824, "step": 10830 }, { "epoch": 0.5252446942533191, "grad_norm": 0.3775523602962494, "learning_rate": 2.4172301289009137e-05, "loss": 1.1264, "num_input_tokens_seen": 425175704, "step": 10840 }, { "epoch": 0.5257292373291985, "grad_norm": 0.4121640920639038, "learning_rate": 2.4133089236497804e-05, "loss": 1.1525, "num_input_tokens_seen": 425568528, "step": 10850 }, { "epoch": 0.526213780405078, "grad_norm": 0.4088398218154907, "learning_rate": 2.4093879319157572e-05, "loss": 1.1616, "num_input_tokens_seen": 425944588, "step": 10860 }, { "epoch": 0.5266983234809575, "grad_norm": 0.4086717665195465, "learning_rate": 2.4054671633561094e-05, "loss": 1.1615, "num_input_tokens_seen": 426354932, "step": 10870 }, { "epoch": 0.5271828665568369, "grad_norm": 0.40191206336021423, "learning_rate": 2.401546627627554e-05, "loss": 1.1574, "num_input_tokens_seen": 426750536, "step": 10880 }, { "epoch": 0.5276674096327163, "grad_norm": 0.4215683043003082, "learning_rate": 2.3976263343862357e-05, "loss": 1.1275, "num_input_tokens_seen": 427145692, "step": 10890 }, { "epoch": 0.5281519527085958, "grad_norm": 0.428448885679245, "learning_rate": 2.393706293287698e-05, "loss": 1.1513, "num_input_tokens_seen": 427517468, "step": 10900 }, { "epoch": 0.5286364957844752, "grad_norm": 0.41464531421661377, "learning_rate": 2.3897865139868685e-05, "loss": 1.2119, "num_input_tokens_seen": 427940524, "step": 10910 }, { "epoch": 0.5291210388603547, "grad_norm": 0.40263885259628296, "learning_rate": 2.3858670061380267e-05, "loss": 1.1092, "num_input_tokens_seen": 428306984, "step": 10920 }, { "epoch": 0.5296055819362341, "grad_norm": 0.4086105227470398, "learning_rate": 2.3819477793947825e-05, "loss": 1.147, "num_input_tokens_seen": 428680608, "step": 10930 }, { "epoch": 0.5300901250121136, "grad_norm": 0.40625572204589844, "learning_rate": 2.378028843410058e-05, "loss": 1.1671, "num_input_tokens_seen": 429089700, "step": 10940 }, { "epoch": 0.530574668087993, "grad_norm": 0.4443352520465851, "learning_rate": 2.374110207836054e-05, "loss": 1.1634, "num_input_tokens_seen": 429511972, "step": 10950 }, { "epoch": 0.5310592111638724, "grad_norm": 0.3834231495857239, "learning_rate": 2.3701918823242357e-05, "loss": 1.1406, "num_input_tokens_seen": 429933992, "step": 10960 }, { "epoch": 0.5315437542397519, "grad_norm": 0.4218290150165558, "learning_rate": 2.366273876525302e-05, "loss": 1.1097, "num_input_tokens_seen": 430335652, "step": 10970 }, { "epoch": 0.5320282973156314, "grad_norm": 0.38524654507637024, "learning_rate": 2.3623562000891646e-05, "loss": 1.176, "num_input_tokens_seen": 430737500, "step": 10980 }, { "epoch": 0.5325128403915108, "grad_norm": 0.4242812395095825, "learning_rate": 2.3584388626649246e-05, "loss": 1.1395, "num_input_tokens_seen": 431134616, "step": 10990 }, { "epoch": 0.5329973834673902, "grad_norm": 0.42048659920692444, "learning_rate": 2.3545218739008483e-05, "loss": 1.1276, "num_input_tokens_seen": 431534440, "step": 11000 }, { "epoch": 0.5334819265432696, "grad_norm": 0.43740230798721313, "learning_rate": 2.3506052434443436e-05, "loss": 1.1547, "num_input_tokens_seen": 431956964, "step": 11010 }, { "epoch": 0.5339664696191492, "grad_norm": 0.40744540095329285, "learning_rate": 2.3466889809419342e-05, "loss": 1.1674, "num_input_tokens_seen": 432337932, "step": 11020 }, { "epoch": 0.5344510126950286, "grad_norm": 0.4600279927253723, "learning_rate": 2.342773096039238e-05, "loss": 1.1226, "num_input_tokens_seen": 432737924, "step": 11030 }, { "epoch": 0.534935555770908, "grad_norm": 0.4031646251678467, "learning_rate": 2.3388575983809456e-05, "loss": 1.1667, "num_input_tokens_seen": 433161476, "step": 11040 }, { "epoch": 0.5354200988467874, "grad_norm": 0.3792802691459656, "learning_rate": 2.3349424976107903e-05, "loss": 1.2043, "num_input_tokens_seen": 433547952, "step": 11050 }, { "epoch": 0.535904641922667, "grad_norm": 0.4132523536682129, "learning_rate": 2.3310278033715285e-05, "loss": 1.1405, "num_input_tokens_seen": 433943164, "step": 11060 }, { "epoch": 0.5363891849985464, "grad_norm": 0.38873347640037537, "learning_rate": 2.3271135253049183e-05, "loss": 1.1494, "num_input_tokens_seen": 434332620, "step": 11070 }, { "epoch": 0.5368737280744258, "grad_norm": 0.3987562358379364, "learning_rate": 2.3231996730516884e-05, "loss": 1.1497, "num_input_tokens_seen": 434702280, "step": 11080 }, { "epoch": 0.5373582711503052, "grad_norm": 0.41692429780960083, "learning_rate": 2.3192862562515226e-05, "loss": 1.1397, "num_input_tokens_seen": 435102792, "step": 11090 }, { "epoch": 0.5378428142261847, "grad_norm": 0.39025530219078064, "learning_rate": 2.3153732845430302e-05, "loss": 1.122, "num_input_tokens_seen": 435481228, "step": 11100 }, { "epoch": 0.5383273573020642, "grad_norm": 0.3755995035171509, "learning_rate": 2.3114607675637233e-05, "loss": 1.1926, "num_input_tokens_seen": 435894796, "step": 11110 }, { "epoch": 0.5388119003779436, "grad_norm": 0.4043687582015991, "learning_rate": 2.3075487149499974e-05, "loss": 1.1358, "num_input_tokens_seen": 436289408, "step": 11120 }, { "epoch": 0.539296443453823, "grad_norm": 0.3815999925136566, "learning_rate": 2.3036371363371008e-05, "loss": 1.1443, "num_input_tokens_seen": 436692436, "step": 11130 }, { "epoch": 0.5397809865297025, "grad_norm": 0.38345128297805786, "learning_rate": 2.2997260413591156e-05, "loss": 1.1892, "num_input_tokens_seen": 437100692, "step": 11140 }, { "epoch": 0.5402655296055819, "grad_norm": 0.4016791880130768, "learning_rate": 2.295815439648934e-05, "loss": 1.1396, "num_input_tokens_seen": 437475216, "step": 11150 }, { "epoch": 0.5407500726814614, "grad_norm": 0.387517511844635, "learning_rate": 2.2919053408382306e-05, "loss": 1.1204, "num_input_tokens_seen": 437882256, "step": 11160 }, { "epoch": 0.5412346157573408, "grad_norm": 0.3928276598453522, "learning_rate": 2.287995754557445e-05, "loss": 1.1519, "num_input_tokens_seen": 438262044, "step": 11170 }, { "epoch": 0.5417191588332203, "grad_norm": 0.4152311682701111, "learning_rate": 2.2840866904357495e-05, "loss": 1.1304, "num_input_tokens_seen": 438668324, "step": 11180 }, { "epoch": 0.5422037019090997, "grad_norm": 0.4021238684654236, "learning_rate": 2.2801781581010362e-05, "loss": 1.1578, "num_input_tokens_seen": 439061528, "step": 11190 }, { "epoch": 0.5426882449849791, "grad_norm": 0.3985769748687744, "learning_rate": 2.2762701671798833e-05, "loss": 1.1735, "num_input_tokens_seen": 439445292, "step": 11200 }, { "epoch": 0.5431727880608586, "grad_norm": 0.4368831515312195, "learning_rate": 2.2723627272975352e-05, "loss": 1.155, "num_input_tokens_seen": 439862688, "step": 11210 }, { "epoch": 0.5436573311367381, "grad_norm": 0.4021163880825043, "learning_rate": 2.2684558480778833e-05, "loss": 1.1758, "num_input_tokens_seen": 440251824, "step": 11220 }, { "epoch": 0.5441418742126175, "grad_norm": 0.41846776008605957, "learning_rate": 2.264549539143434e-05, "loss": 1.1374, "num_input_tokens_seen": 440647480, "step": 11230 }, { "epoch": 0.5446264172884969, "grad_norm": 0.38505181670188904, "learning_rate": 2.2606438101152893e-05, "loss": 1.1702, "num_input_tokens_seen": 441032116, "step": 11240 }, { "epoch": 0.5451109603643763, "grad_norm": 0.3722958564758301, "learning_rate": 2.2567386706131268e-05, "loss": 1.1441, "num_input_tokens_seen": 441435016, "step": 11250 }, { "epoch": 0.5455955034402559, "grad_norm": 0.38270628452301025, "learning_rate": 2.2528341302551666e-05, "loss": 1.1893, "num_input_tokens_seen": 441836580, "step": 11260 }, { "epoch": 0.5460800465161353, "grad_norm": 0.40590032935142517, "learning_rate": 2.2489301986581586e-05, "loss": 1.116, "num_input_tokens_seen": 442232080, "step": 11270 }, { "epoch": 0.5465645895920147, "grad_norm": 0.36881017684936523, "learning_rate": 2.2450268854373497e-05, "loss": 1.1247, "num_input_tokens_seen": 442637844, "step": 11280 }, { "epoch": 0.5470491326678941, "grad_norm": 0.38571688532829285, "learning_rate": 2.2411242002064637e-05, "loss": 1.1632, "num_input_tokens_seen": 443047888, "step": 11290 }, { "epoch": 0.5475336757437737, "grad_norm": 0.402986079454422, "learning_rate": 2.23722215257768e-05, "loss": 1.1353, "num_input_tokens_seen": 443450848, "step": 11300 }, { "epoch": 0.5480182188196531, "grad_norm": 0.43902143836021423, "learning_rate": 2.2333207521616056e-05, "loss": 1.1409, "num_input_tokens_seen": 443825900, "step": 11310 }, { "epoch": 0.5485027618955325, "grad_norm": 0.4374961853027344, "learning_rate": 2.2294200085672552e-05, "loss": 1.1557, "num_input_tokens_seen": 444255800, "step": 11320 }, { "epoch": 0.5489873049714119, "grad_norm": 0.38471776247024536, "learning_rate": 2.225519931402024e-05, "loss": 1.1903, "num_input_tokens_seen": 444647372, "step": 11330 }, { "epoch": 0.5494718480472914, "grad_norm": 0.3870150148868561, "learning_rate": 2.2216205302716656e-05, "loss": 1.165, "num_input_tokens_seen": 445028328, "step": 11340 }, { "epoch": 0.5499563911231709, "grad_norm": 0.3954487144947052, "learning_rate": 2.217721814780272e-05, "loss": 1.1201, "num_input_tokens_seen": 445435648, "step": 11350 }, { "epoch": 0.5504409341990503, "grad_norm": 0.4321858286857605, "learning_rate": 2.2138237945302412e-05, "loss": 1.0994, "num_input_tokens_seen": 445822760, "step": 11360 }, { "epoch": 0.5509254772749297, "grad_norm": 0.3976816236972809, "learning_rate": 2.2099264791222643e-05, "loss": 1.1399, "num_input_tokens_seen": 446231820, "step": 11370 }, { "epoch": 0.5514100203508092, "grad_norm": 0.3802758455276489, "learning_rate": 2.2060298781552927e-05, "loss": 1.1505, "num_input_tokens_seen": 446644088, "step": 11380 }, { "epoch": 0.5518945634266886, "grad_norm": 0.39267027378082275, "learning_rate": 2.2021340012265177e-05, "loss": 1.1583, "num_input_tokens_seen": 447026360, "step": 11390 }, { "epoch": 0.552379106502568, "grad_norm": 0.41610804200172424, "learning_rate": 2.198238857931352e-05, "loss": 1.1108, "num_input_tokens_seen": 447431376, "step": 11400 }, { "epoch": 0.5528636495784475, "grad_norm": 0.40714430809020996, "learning_rate": 2.1943444578633957e-05, "loss": 1.1341, "num_input_tokens_seen": 447810844, "step": 11410 }, { "epoch": 0.553348192654327, "grad_norm": 0.3614059090614319, "learning_rate": 2.1904508106144208e-05, "loss": 1.165, "num_input_tokens_seen": 448200712, "step": 11420 }, { "epoch": 0.5538327357302064, "grad_norm": 0.3718690574169159, "learning_rate": 2.1865579257743475e-05, "loss": 1.0994, "num_input_tokens_seen": 448585024, "step": 11430 }, { "epoch": 0.5543172788060858, "grad_norm": 0.4221544861793518, "learning_rate": 2.1826658129312133e-05, "loss": 1.1858, "num_input_tokens_seen": 448991036, "step": 11440 }, { "epoch": 0.5548018218819653, "grad_norm": 0.40332892537117004, "learning_rate": 2.178774481671159e-05, "loss": 1.1761, "num_input_tokens_seen": 449381496, "step": 11450 }, { "epoch": 0.5552863649578448, "grad_norm": 0.4298264682292938, "learning_rate": 2.174883941578397e-05, "loss": 1.1593, "num_input_tokens_seen": 449808272, "step": 11460 }, { "epoch": 0.5557709080337242, "grad_norm": 0.4092737138271332, "learning_rate": 2.1709942022351924e-05, "loss": 1.1434, "num_input_tokens_seen": 450187828, "step": 11470 }, { "epoch": 0.5562554511096036, "grad_norm": 0.4071583151817322, "learning_rate": 2.1671052732218392e-05, "loss": 1.1578, "num_input_tokens_seen": 450578040, "step": 11480 }, { "epoch": 0.5567399941854831, "grad_norm": 0.4042060673236847, "learning_rate": 2.1632171641166326e-05, "loss": 1.114, "num_input_tokens_seen": 450979104, "step": 11490 }, { "epoch": 0.5572245372613626, "grad_norm": 0.41152459383010864, "learning_rate": 2.1593298844958526e-05, "loss": 1.1724, "num_input_tokens_seen": 451369344, "step": 11500 }, { "epoch": 0.557709080337242, "grad_norm": 0.3623616099357605, "learning_rate": 2.1554434439337326e-05, "loss": 1.1765, "num_input_tokens_seen": 451771544, "step": 11510 }, { "epoch": 0.5581936234131214, "grad_norm": 0.41300123929977417, "learning_rate": 2.15155785200244e-05, "loss": 1.1843, "num_input_tokens_seen": 452152352, "step": 11520 }, { "epoch": 0.5586781664890009, "grad_norm": 0.41391250491142273, "learning_rate": 2.147673118272054e-05, "loss": 1.141, "num_input_tokens_seen": 452586876, "step": 11530 }, { "epoch": 0.5591627095648803, "grad_norm": 0.37428995966911316, "learning_rate": 2.1437892523105378e-05, "loss": 1.1809, "num_input_tokens_seen": 452992560, "step": 11540 }, { "epoch": 0.5596472526407598, "grad_norm": 0.42412886023521423, "learning_rate": 2.1399062636837197e-05, "loss": 1.1461, "num_input_tokens_seen": 453342104, "step": 11550 }, { "epoch": 0.5601317957166392, "grad_norm": 0.3886395990848541, "learning_rate": 2.1360241619552652e-05, "loss": 1.1342, "num_input_tokens_seen": 453754860, "step": 11560 }, { "epoch": 0.5606163387925187, "grad_norm": 0.3729265332221985, "learning_rate": 2.1321429566866542e-05, "loss": 1.1745, "num_input_tokens_seen": 454158212, "step": 11570 }, { "epoch": 0.5611008818683981, "grad_norm": 0.3990989625453949, "learning_rate": 2.1282626574371635e-05, "loss": 1.1643, "num_input_tokens_seen": 454533420, "step": 11580 }, { "epoch": 0.5615854249442775, "grad_norm": 0.39453041553497314, "learning_rate": 2.124383273763834e-05, "loss": 1.1723, "num_input_tokens_seen": 454918724, "step": 11590 }, { "epoch": 0.562069968020157, "grad_norm": 0.3955458700656891, "learning_rate": 2.120504815221452e-05, "loss": 1.1748, "num_input_tokens_seen": 455320488, "step": 11600 }, { "epoch": 0.5625545110960365, "grad_norm": 0.3768325448036194, "learning_rate": 2.1166272913625273e-05, "loss": 1.1634, "num_input_tokens_seen": 455716484, "step": 11610 }, { "epoch": 0.5630390541719159, "grad_norm": 0.3783731758594513, "learning_rate": 2.1127507117372657e-05, "loss": 1.1953, "num_input_tokens_seen": 456099764, "step": 11620 }, { "epoch": 0.5635235972477953, "grad_norm": 0.41152095794677734, "learning_rate": 2.1088750858935495e-05, "loss": 1.1831, "num_input_tokens_seen": 456506480, "step": 11630 }, { "epoch": 0.5640081403236747, "grad_norm": 0.414093017578125, "learning_rate": 2.1050004233769094e-05, "loss": 1.1475, "num_input_tokens_seen": 456916444, "step": 11640 }, { "epoch": 0.5644926833995543, "grad_norm": 0.41902777552604675, "learning_rate": 2.1011267337305036e-05, "loss": 1.1574, "num_input_tokens_seen": 457314824, "step": 11650 }, { "epoch": 0.5649772264754337, "grad_norm": 0.40867382287979126, "learning_rate": 2.0972540264950976e-05, "loss": 1.0926, "num_input_tokens_seen": 457711980, "step": 11660 }, { "epoch": 0.5654617695513131, "grad_norm": 0.3919362425804138, "learning_rate": 2.0933823112090322e-05, "loss": 1.2073, "num_input_tokens_seen": 458128016, "step": 11670 }, { "epoch": 0.5659463126271925, "grad_norm": 0.3909580707550049, "learning_rate": 2.0895115974082106e-05, "loss": 1.1525, "num_input_tokens_seen": 458509664, "step": 11680 }, { "epoch": 0.5664308557030721, "grad_norm": 0.38012978434562683, "learning_rate": 2.0856418946260643e-05, "loss": 1.146, "num_input_tokens_seen": 458918900, "step": 11690 }, { "epoch": 0.5669153987789515, "grad_norm": 0.378833144903183, "learning_rate": 2.0817732123935364e-05, "loss": 1.1811, "num_input_tokens_seen": 459322936, "step": 11700 }, { "epoch": 0.5673999418548309, "grad_norm": 0.37148529291152954, "learning_rate": 2.0779055602390583e-05, "loss": 1.1755, "num_input_tokens_seen": 459735016, "step": 11710 }, { "epoch": 0.5678844849307103, "grad_norm": 0.3815120458602905, "learning_rate": 2.0740389476885223e-05, "loss": 1.1825, "num_input_tokens_seen": 460131788, "step": 11720 }, { "epoch": 0.5683690280065898, "grad_norm": 0.3902805745601654, "learning_rate": 2.070173384265261e-05, "loss": 1.1369, "num_input_tokens_seen": 460515608, "step": 11730 }, { "epoch": 0.5688535710824693, "grad_norm": 0.40333831310272217, "learning_rate": 2.066308879490023e-05, "loss": 1.1596, "num_input_tokens_seen": 460897436, "step": 11740 }, { "epoch": 0.5693381141583487, "grad_norm": 0.39738982915878296, "learning_rate": 2.0624454428809484e-05, "loss": 1.182, "num_input_tokens_seen": 461288352, "step": 11750 }, { "epoch": 0.5698226572342281, "grad_norm": 0.38377317786216736, "learning_rate": 2.0585830839535487e-05, "loss": 1.1371, "num_input_tokens_seen": 461665964, "step": 11760 }, { "epoch": 0.5703072003101076, "grad_norm": 0.40340209007263184, "learning_rate": 2.054721812220678e-05, "loss": 1.1388, "num_input_tokens_seen": 462099048, "step": 11770 }, { "epoch": 0.570791743385987, "grad_norm": 0.39503028988838196, "learning_rate": 2.0508616371925156e-05, "loss": 1.1972, "num_input_tokens_seen": 462477648, "step": 11780 }, { "epoch": 0.5712762864618665, "grad_norm": 0.3878142237663269, "learning_rate": 2.0470025683765392e-05, "loss": 1.1394, "num_input_tokens_seen": 462857744, "step": 11790 }, { "epoch": 0.5717608295377459, "grad_norm": 0.430828720331192, "learning_rate": 2.043144615277499e-05, "loss": 1.1204, "num_input_tokens_seen": 463231476, "step": 11800 }, { "epoch": 0.5722453726136254, "grad_norm": 0.38779589533805847, "learning_rate": 2.039287787397402e-05, "loss": 1.1468, "num_input_tokens_seen": 463608072, "step": 11810 }, { "epoch": 0.5727299156895048, "grad_norm": 0.40600141882896423, "learning_rate": 2.0354320942354804e-05, "loss": 1.1389, "num_input_tokens_seen": 464007424, "step": 11820 }, { "epoch": 0.5732144587653842, "grad_norm": 0.3781258463859558, "learning_rate": 2.0315775452881708e-05, "loss": 1.172, "num_input_tokens_seen": 464436116, "step": 11830 }, { "epoch": 0.5736990018412637, "grad_norm": 0.41185423731803894, "learning_rate": 2.027724150049096e-05, "loss": 1.1674, "num_input_tokens_seen": 464834636, "step": 11840 }, { "epoch": 0.5741835449171432, "grad_norm": 0.39568862318992615, "learning_rate": 2.0238719180090323e-05, "loss": 1.1769, "num_input_tokens_seen": 465219480, "step": 11850 }, { "epoch": 0.5746680879930226, "grad_norm": 0.3740871846675873, "learning_rate": 2.0200208586558954e-05, "loss": 1.1428, "num_input_tokens_seen": 465608864, "step": 11860 }, { "epoch": 0.575152631068902, "grad_norm": 0.39056453108787537, "learning_rate": 2.0161709814747102e-05, "loss": 1.1356, "num_input_tokens_seen": 466002328, "step": 11870 }, { "epoch": 0.5756371741447814, "grad_norm": 0.46836405992507935, "learning_rate": 2.012322295947589e-05, "loss": 1.1183, "num_input_tokens_seen": 466418136, "step": 11880 }, { "epoch": 0.576121717220661, "grad_norm": 0.41011425852775574, "learning_rate": 2.0084748115537126e-05, "loss": 1.1868, "num_input_tokens_seen": 466816508, "step": 11890 }, { "epoch": 0.5766062602965404, "grad_norm": 0.4022364914417267, "learning_rate": 2.0046285377692998e-05, "loss": 1.1622, "num_input_tokens_seen": 467208132, "step": 11900 }, { "epoch": 0.5770908033724198, "grad_norm": 0.4446437656879425, "learning_rate": 2.0007834840675905e-05, "loss": 1.2062, "num_input_tokens_seen": 467607752, "step": 11910 }, { "epoch": 0.5775753464482992, "grad_norm": 0.4171381890773773, "learning_rate": 1.9969396599188177e-05, "loss": 1.1675, "num_input_tokens_seen": 467989092, "step": 11920 }, { "epoch": 0.5780598895241787, "grad_norm": 0.44026637077331543, "learning_rate": 1.993097074790186e-05, "loss": 1.1417, "num_input_tokens_seen": 468356440, "step": 11930 }, { "epoch": 0.5785444326000582, "grad_norm": 0.3875192105770111, "learning_rate": 1.989255738145851e-05, "loss": 1.1676, "num_input_tokens_seen": 468736536, "step": 11940 }, { "epoch": 0.5790289756759376, "grad_norm": 0.37714606523513794, "learning_rate": 1.9854156594468905e-05, "loss": 1.1685, "num_input_tokens_seen": 469122032, "step": 11950 }, { "epoch": 0.579513518751817, "grad_norm": 0.3938728868961334, "learning_rate": 1.9815768481512837e-05, "loss": 1.1166, "num_input_tokens_seen": 469516476, "step": 11960 }, { "epoch": 0.5799980618276965, "grad_norm": 0.4008789360523224, "learning_rate": 1.9777393137138916e-05, "loss": 1.1795, "num_input_tokens_seen": 469938440, "step": 11970 }, { "epoch": 0.580482604903576, "grad_norm": 0.3709718585014343, "learning_rate": 1.9739030655864263e-05, "loss": 1.1366, "num_input_tokens_seen": 470313748, "step": 11980 }, { "epoch": 0.5809671479794554, "grad_norm": 0.4143054485321045, "learning_rate": 1.9700681132174356e-05, "loss": 1.1339, "num_input_tokens_seen": 470678456, "step": 11990 }, { "epoch": 0.5814516910553348, "grad_norm": 0.3943856656551361, "learning_rate": 1.9662344660522726e-05, "loss": 1.1755, "num_input_tokens_seen": 471101508, "step": 12000 }, { "epoch": 0.5814516910553348, "eval_loss": 1.1437398195266724, "eval_runtime": 7.1807, "eval_samples_per_second": 20.889, "eval_steps_per_second": 2.646, "num_input_tokens_seen": 471101508, "step": 12000 }, { "epoch": 0.5819362341312143, "grad_norm": 0.40113458037376404, "learning_rate": 1.9624021335330767e-05, "loss": 1.1483, "num_input_tokens_seen": 471482556, "step": 12010 }, { "epoch": 0.5824207772070937, "grad_norm": 0.3723471760749817, "learning_rate": 1.9585711250987515e-05, "loss": 1.0867, "num_input_tokens_seen": 471888220, "step": 12020 }, { "epoch": 0.5829053202829731, "grad_norm": 0.3816375732421875, "learning_rate": 1.9547414501849363e-05, "loss": 1.1351, "num_input_tokens_seen": 472248060, "step": 12030 }, { "epoch": 0.5833898633588526, "grad_norm": 0.41664794087409973, "learning_rate": 1.9509131182239875e-05, "loss": 1.1705, "num_input_tokens_seen": 472639804, "step": 12040 }, { "epoch": 0.5838744064347321, "grad_norm": 0.3829415440559387, "learning_rate": 1.9470861386449546e-05, "loss": 1.1315, "num_input_tokens_seen": 473018176, "step": 12050 }, { "epoch": 0.5843589495106115, "grad_norm": 0.408346027135849, "learning_rate": 1.9432605208735543e-05, "loss": 1.1286, "num_input_tokens_seen": 473403200, "step": 12060 }, { "epoch": 0.5848434925864909, "grad_norm": 0.3734830915927887, "learning_rate": 1.9394362743321516e-05, "loss": 1.1347, "num_input_tokens_seen": 473796676, "step": 12070 }, { "epoch": 0.5853280356623703, "grad_norm": 0.3995228111743927, "learning_rate": 1.9356134084397305e-05, "loss": 1.159, "num_input_tokens_seen": 474192924, "step": 12080 }, { "epoch": 0.5858125787382499, "grad_norm": 0.38744309544563293, "learning_rate": 1.9317919326118793e-05, "loss": 1.1302, "num_input_tokens_seen": 474587284, "step": 12090 }, { "epoch": 0.5862971218141293, "grad_norm": 0.40833401679992676, "learning_rate": 1.9279718562607595e-05, "loss": 1.1293, "num_input_tokens_seen": 474984376, "step": 12100 }, { "epoch": 0.5867816648900087, "grad_norm": 0.40676310658454895, "learning_rate": 1.9241531887950853e-05, "loss": 1.1125, "num_input_tokens_seen": 475382208, "step": 12110 }, { "epoch": 0.5872662079658881, "grad_norm": 0.4175487756729126, "learning_rate": 1.9203359396201038e-05, "loss": 1.1302, "num_input_tokens_seen": 475803188, "step": 12120 }, { "epoch": 0.5877507510417677, "grad_norm": 0.39633074402809143, "learning_rate": 1.9165201181375663e-05, "loss": 1.1573, "num_input_tokens_seen": 476200608, "step": 12130 }, { "epoch": 0.5882352941176471, "grad_norm": 0.38707002997398376, "learning_rate": 1.9127057337457077e-05, "loss": 1.1641, "num_input_tokens_seen": 476617248, "step": 12140 }, { "epoch": 0.5887198371935265, "grad_norm": 0.38113346695899963, "learning_rate": 1.908892795839226e-05, "loss": 1.1381, "num_input_tokens_seen": 476978804, "step": 12150 }, { "epoch": 0.5892043802694059, "grad_norm": 0.40942734479904175, "learning_rate": 1.905081313809253e-05, "loss": 1.1546, "num_input_tokens_seen": 477371880, "step": 12160 }, { "epoch": 0.5896889233452854, "grad_norm": 0.3737110495567322, "learning_rate": 1.9012712970433384e-05, "loss": 1.1463, "num_input_tokens_seen": 477751464, "step": 12170 }, { "epoch": 0.5901734664211649, "grad_norm": 0.37110334634780884, "learning_rate": 1.8974627549254205e-05, "loss": 1.1607, "num_input_tokens_seen": 478117284, "step": 12180 }, { "epoch": 0.5906580094970443, "grad_norm": 0.38362741470336914, "learning_rate": 1.8936556968358047e-05, "loss": 1.1784, "num_input_tokens_seen": 478503476, "step": 12190 }, { "epoch": 0.5911425525729237, "grad_norm": 0.4000930190086365, "learning_rate": 1.8898501321511445e-05, "loss": 1.0922, "num_input_tokens_seen": 478897364, "step": 12200 }, { "epoch": 0.5916270956488032, "grad_norm": 0.39831140637397766, "learning_rate": 1.8860460702444126e-05, "loss": 1.1765, "num_input_tokens_seen": 479272824, "step": 12210 }, { "epoch": 0.5921116387246826, "grad_norm": 0.37732118368148804, "learning_rate": 1.8822435204848827e-05, "loss": 1.1729, "num_input_tokens_seen": 479653192, "step": 12220 }, { "epoch": 0.5925961818005621, "grad_norm": 0.3948591649532318, "learning_rate": 1.8784424922381015e-05, "loss": 1.136, "num_input_tokens_seen": 480072752, "step": 12230 }, { "epoch": 0.5930807248764415, "grad_norm": 0.3917677700519562, "learning_rate": 1.8746429948658693e-05, "loss": 1.1598, "num_input_tokens_seen": 480459552, "step": 12240 }, { "epoch": 0.593565267952321, "grad_norm": 0.4181142747402191, "learning_rate": 1.8708450377262178e-05, "loss": 1.109, "num_input_tokens_seen": 480856544, "step": 12250 }, { "epoch": 0.5940498110282004, "grad_norm": 0.4221828281879425, "learning_rate": 1.867048630173381e-05, "loss": 1.1264, "num_input_tokens_seen": 481254676, "step": 12260 }, { "epoch": 0.5945343541040798, "grad_norm": 0.3877246379852295, "learning_rate": 1.8632537815577812e-05, "loss": 1.1524, "num_input_tokens_seen": 481664936, "step": 12270 }, { "epoch": 0.5950188971799593, "grad_norm": 0.41641271114349365, "learning_rate": 1.859460501225998e-05, "loss": 1.1492, "num_input_tokens_seen": 482076064, "step": 12280 }, { "epoch": 0.5955034402558388, "grad_norm": 0.38876596093177795, "learning_rate": 1.8556687985207473e-05, "loss": 1.1841, "num_input_tokens_seen": 482481220, "step": 12290 }, { "epoch": 0.5959879833317182, "grad_norm": 0.372016966342926, "learning_rate": 1.851878682780864e-05, "loss": 1.1443, "num_input_tokens_seen": 482858776, "step": 12300 }, { "epoch": 0.5964725264075976, "grad_norm": 0.4116594195365906, "learning_rate": 1.84809016334127e-05, "loss": 1.168, "num_input_tokens_seen": 483260116, "step": 12310 }, { "epoch": 0.596957069483477, "grad_norm": 0.42143404483795166, "learning_rate": 1.8443032495329564e-05, "loss": 1.1707, "num_input_tokens_seen": 483647520, "step": 12320 }, { "epoch": 0.5974416125593566, "grad_norm": 0.41614919900894165, "learning_rate": 1.8405179506829622e-05, "loss": 1.1575, "num_input_tokens_seen": 484057976, "step": 12330 }, { "epoch": 0.597926155635236, "grad_norm": 0.39331093430519104, "learning_rate": 1.836734276114346e-05, "loss": 1.1582, "num_input_tokens_seen": 484441132, "step": 12340 }, { "epoch": 0.5984106987111154, "grad_norm": 0.41087356209754944, "learning_rate": 1.832952235146166e-05, "loss": 1.1851, "num_input_tokens_seen": 484824128, "step": 12350 }, { "epoch": 0.5988952417869948, "grad_norm": 0.3742915093898773, "learning_rate": 1.829171837093459e-05, "loss": 1.1018, "num_input_tokens_seen": 485241940, "step": 12360 }, { "epoch": 0.5993797848628744, "grad_norm": 0.3956594169139862, "learning_rate": 1.8253930912672136e-05, "loss": 1.1769, "num_input_tokens_seen": 485653000, "step": 12370 }, { "epoch": 0.5998643279387538, "grad_norm": 0.4172208905220032, "learning_rate": 1.8216160069743498e-05, "loss": 1.1215, "num_input_tokens_seen": 486031148, "step": 12380 }, { "epoch": 0.6003488710146332, "grad_norm": 0.3961176872253418, "learning_rate": 1.8178405935176933e-05, "loss": 1.1929, "num_input_tokens_seen": 486411448, "step": 12390 }, { "epoch": 0.6008334140905126, "grad_norm": 0.3706035315990448, "learning_rate": 1.8140668601959593e-05, "loss": 1.0883, "num_input_tokens_seen": 486792408, "step": 12400 }, { "epoch": 0.6013179571663921, "grad_norm": 0.3640589416027069, "learning_rate": 1.81029481630372e-05, "loss": 1.1478, "num_input_tokens_seen": 487182316, "step": 12410 }, { "epoch": 0.6018025002422716, "grad_norm": 0.4026361107826233, "learning_rate": 1.806524471131388e-05, "loss": 1.1486, "num_input_tokens_seen": 487604672, "step": 12420 }, { "epoch": 0.602287043318151, "grad_norm": 0.41077134013175964, "learning_rate": 1.8027558339651936e-05, "loss": 1.1469, "num_input_tokens_seen": 487986272, "step": 12430 }, { "epoch": 0.6027715863940304, "grad_norm": 0.394106924533844, "learning_rate": 1.7989889140871583e-05, "loss": 1.0973, "num_input_tokens_seen": 488373412, "step": 12440 }, { "epoch": 0.6032561294699099, "grad_norm": 0.3956287205219269, "learning_rate": 1.795223720775076e-05, "loss": 1.118, "num_input_tokens_seen": 488785752, "step": 12450 }, { "epoch": 0.6037406725457893, "grad_norm": 0.3825465142726898, "learning_rate": 1.791460263302487e-05, "loss": 1.1358, "num_input_tokens_seen": 489205628, "step": 12460 }, { "epoch": 0.6042252156216688, "grad_norm": 0.3835325539112091, "learning_rate": 1.7876985509386547e-05, "loss": 1.1225, "num_input_tokens_seen": 489595928, "step": 12470 }, { "epoch": 0.6047097586975482, "grad_norm": 0.3921869397163391, "learning_rate": 1.7839385929485482e-05, "loss": 1.1232, "num_input_tokens_seen": 490009524, "step": 12480 }, { "epoch": 0.6051943017734277, "grad_norm": 0.4052605628967285, "learning_rate": 1.7801803985928117e-05, "loss": 1.1823, "num_input_tokens_seen": 490436288, "step": 12490 }, { "epoch": 0.6056788448493071, "grad_norm": 0.37396615743637085, "learning_rate": 1.7764239771277477e-05, "loss": 1.1319, "num_input_tokens_seen": 490820096, "step": 12500 }, { "epoch": 0.6061633879251865, "grad_norm": 0.3906390368938446, "learning_rate": 1.772669337805292e-05, "loss": 1.1324, "num_input_tokens_seen": 491197804, "step": 12510 }, { "epoch": 0.606647931001066, "grad_norm": 0.3880655765533447, "learning_rate": 1.768916489872991e-05, "loss": 1.1389, "num_input_tokens_seen": 491590844, "step": 12520 }, { "epoch": 0.6071324740769455, "grad_norm": 0.42076969146728516, "learning_rate": 1.765165442573979e-05, "loss": 1.1312, "num_input_tokens_seen": 491987104, "step": 12530 }, { "epoch": 0.6076170171528249, "grad_norm": 0.4096836745738983, "learning_rate": 1.7614162051469545e-05, "loss": 1.1339, "num_input_tokens_seen": 492387748, "step": 12540 }, { "epoch": 0.6081015602287043, "grad_norm": 0.3795408606529236, "learning_rate": 1.7576687868261587e-05, "loss": 1.1246, "num_input_tokens_seen": 492784488, "step": 12550 }, { "epoch": 0.6085861033045837, "grad_norm": 0.38204097747802734, "learning_rate": 1.7539231968413546e-05, "loss": 1.1139, "num_input_tokens_seen": 493156420, "step": 12560 }, { "epoch": 0.6090706463804633, "grad_norm": 0.4025273621082306, "learning_rate": 1.7501794444177975e-05, "loss": 1.1592, "num_input_tokens_seen": 493529324, "step": 12570 }, { "epoch": 0.6095551894563427, "grad_norm": 0.39617040753364563, "learning_rate": 1.746437538776222e-05, "loss": 1.1537, "num_input_tokens_seen": 493929468, "step": 12580 }, { "epoch": 0.6100397325322221, "grad_norm": 0.39346054196357727, "learning_rate": 1.742697489132811e-05, "loss": 1.1642, "num_input_tokens_seen": 494356540, "step": 12590 }, { "epoch": 0.6105242756081015, "grad_norm": 0.4220488369464874, "learning_rate": 1.738959304699176e-05, "loss": 1.1657, "num_input_tokens_seen": 494737364, "step": 12600 }, { "epoch": 0.611008818683981, "grad_norm": 0.38811561465263367, "learning_rate": 1.735222994682336e-05, "loss": 1.131, "num_input_tokens_seen": 495145924, "step": 12610 }, { "epoch": 0.6114933617598605, "grad_norm": 0.3839203417301178, "learning_rate": 1.7314885682846925e-05, "loss": 1.1502, "num_input_tokens_seen": 495543252, "step": 12620 }, { "epoch": 0.6119779048357399, "grad_norm": 0.4147859811782837, "learning_rate": 1.7277560347040094e-05, "loss": 1.219, "num_input_tokens_seen": 495924020, "step": 12630 }, { "epoch": 0.6124624479116193, "grad_norm": 0.3909815549850464, "learning_rate": 1.7240254031333862e-05, "loss": 1.1642, "num_input_tokens_seen": 496317616, "step": 12640 }, { "epoch": 0.6129469909874988, "grad_norm": 0.3842679560184479, "learning_rate": 1.720296682761238e-05, "loss": 1.142, "num_input_tokens_seen": 496696256, "step": 12650 }, { "epoch": 0.6134315340633782, "grad_norm": 0.4022013545036316, "learning_rate": 1.716569882771274e-05, "loss": 1.1284, "num_input_tokens_seen": 497103872, "step": 12660 }, { "epoch": 0.6139160771392577, "grad_norm": 0.39619848132133484, "learning_rate": 1.7128450123424746e-05, "loss": 1.1532, "num_input_tokens_seen": 497505888, "step": 12670 }, { "epoch": 0.6144006202151371, "grad_norm": 0.3860088586807251, "learning_rate": 1.709122080649064e-05, "loss": 1.1758, "num_input_tokens_seen": 497903692, "step": 12680 }, { "epoch": 0.6148851632910166, "grad_norm": 0.377780020236969, "learning_rate": 1.705401096860496e-05, "loss": 1.1119, "num_input_tokens_seen": 498287500, "step": 12690 }, { "epoch": 0.615369706366896, "grad_norm": 0.3980233371257782, "learning_rate": 1.7016820701414215e-05, "loss": 1.1614, "num_input_tokens_seen": 498685408, "step": 12700 }, { "epoch": 0.6158542494427754, "grad_norm": 0.399446964263916, "learning_rate": 1.697965009651677e-05, "loss": 1.1788, "num_input_tokens_seen": 499066376, "step": 12710 }, { "epoch": 0.6163387925186549, "grad_norm": 0.3826245069503784, "learning_rate": 1.6942499245462525e-05, "loss": 1.1421, "num_input_tokens_seen": 499446672, "step": 12720 }, { "epoch": 0.6168233355945344, "grad_norm": 0.39101335406303406, "learning_rate": 1.6905368239752718e-05, "loss": 1.1129, "num_input_tokens_seen": 499832256, "step": 12730 }, { "epoch": 0.6173078786704138, "grad_norm": 0.41338834166526794, "learning_rate": 1.686825717083975e-05, "loss": 1.1999, "num_input_tokens_seen": 500220888, "step": 12740 }, { "epoch": 0.6177924217462932, "grad_norm": 0.4286590814590454, "learning_rate": 1.6831166130126872e-05, "loss": 1.1657, "num_input_tokens_seen": 500641332, "step": 12750 }, { "epoch": 0.6182769648221726, "grad_norm": 0.4031652510166168, "learning_rate": 1.6794095208968058e-05, "loss": 1.1097, "num_input_tokens_seen": 501041940, "step": 12760 }, { "epoch": 0.6187615078980522, "grad_norm": 0.3872615098953247, "learning_rate": 1.675704449866768e-05, "loss": 1.1681, "num_input_tokens_seen": 501421856, "step": 12770 }, { "epoch": 0.6192460509739316, "grad_norm": 0.4028913676738739, "learning_rate": 1.672001409048034e-05, "loss": 1.1452, "num_input_tokens_seen": 501803240, "step": 12780 }, { "epoch": 0.619730594049811, "grad_norm": 0.4178786277770996, "learning_rate": 1.668300407561067e-05, "loss": 1.1493, "num_input_tokens_seen": 502206632, "step": 12790 }, { "epoch": 0.6202151371256904, "grad_norm": 0.4024272859096527, "learning_rate": 1.6646014545213042e-05, "loss": 1.1246, "num_input_tokens_seen": 502597668, "step": 12800 }, { "epoch": 0.62069968020157, "grad_norm": 0.38439643383026123, "learning_rate": 1.6609045590391383e-05, "loss": 1.1421, "num_input_tokens_seen": 502992140, "step": 12810 }, { "epoch": 0.6211842232774494, "grad_norm": 0.44478264451026917, "learning_rate": 1.6572097302198935e-05, "loss": 1.116, "num_input_tokens_seen": 503374384, "step": 12820 }, { "epoch": 0.6216687663533288, "grad_norm": 0.44358116388320923, "learning_rate": 1.6535169771638066e-05, "loss": 1.127, "num_input_tokens_seen": 503789460, "step": 12830 }, { "epoch": 0.6221533094292082, "grad_norm": 0.3874736428260803, "learning_rate": 1.6498263089659992e-05, "loss": 1.1076, "num_input_tokens_seen": 504149520, "step": 12840 }, { "epoch": 0.6226378525050877, "grad_norm": 0.3996816873550415, "learning_rate": 1.6461377347164593e-05, "loss": 1.1729, "num_input_tokens_seen": 504529724, "step": 12850 }, { "epoch": 0.6231223955809672, "grad_norm": 0.39820224046707153, "learning_rate": 1.6424512635000158e-05, "loss": 1.1639, "num_input_tokens_seen": 504913244, "step": 12860 }, { "epoch": 0.6236069386568466, "grad_norm": 0.4100916385650635, "learning_rate": 1.638766904396321e-05, "loss": 1.1485, "num_input_tokens_seen": 505324308, "step": 12870 }, { "epoch": 0.624091481732726, "grad_norm": 0.38851940631866455, "learning_rate": 1.635084666479822e-05, "loss": 1.1161, "num_input_tokens_seen": 505696332, "step": 12880 }, { "epoch": 0.6245760248086055, "grad_norm": 0.4355764091014862, "learning_rate": 1.6314045588197442e-05, "loss": 1.143, "num_input_tokens_seen": 506083568, "step": 12890 }, { "epoch": 0.6250605678844849, "grad_norm": 0.38407522439956665, "learning_rate": 1.6277265904800643e-05, "loss": 1.1396, "num_input_tokens_seen": 506497556, "step": 12900 }, { "epoch": 0.6255451109603644, "grad_norm": 0.38687968254089355, "learning_rate": 1.6240507705194896e-05, "loss": 1.1215, "num_input_tokens_seen": 506881424, "step": 12910 }, { "epoch": 0.6260296540362438, "grad_norm": 0.36193808913230896, "learning_rate": 1.6203771079914387e-05, "loss": 1.1487, "num_input_tokens_seen": 507256568, "step": 12920 }, { "epoch": 0.6265141971121233, "grad_norm": 0.3949004113674164, "learning_rate": 1.6167056119440125e-05, "loss": 1.1569, "num_input_tokens_seen": 507669200, "step": 12930 }, { "epoch": 0.6269987401880027, "grad_norm": 0.39322030544281006, "learning_rate": 1.6130362914199814e-05, "loss": 1.0929, "num_input_tokens_seen": 508034880, "step": 12940 }, { "epoch": 0.6274832832638821, "grad_norm": 0.40019360184669495, "learning_rate": 1.6093691554567524e-05, "loss": 1.1317, "num_input_tokens_seen": 508462956, "step": 12950 }, { "epoch": 0.6279678263397616, "grad_norm": 0.40196293592453003, "learning_rate": 1.6057042130863538e-05, "loss": 1.0989, "num_input_tokens_seen": 508848948, "step": 12960 }, { "epoch": 0.6284523694156411, "grad_norm": 0.4627667963504791, "learning_rate": 1.6020414733354117e-05, "loss": 1.1436, "num_input_tokens_seen": 509221768, "step": 12970 }, { "epoch": 0.6289369124915205, "grad_norm": 0.42447420954704285, "learning_rate": 1.598380945225127e-05, "loss": 1.1207, "num_input_tokens_seen": 509643908, "step": 12980 }, { "epoch": 0.6294214555673999, "grad_norm": 0.3753610849380493, "learning_rate": 1.594722637771253e-05, "loss": 1.1599, "num_input_tokens_seen": 510024072, "step": 12990 }, { "epoch": 0.6299059986432793, "grad_norm": 0.39578545093536377, "learning_rate": 1.5910665599840745e-05, "loss": 1.1634, "num_input_tokens_seen": 510426748, "step": 13000 }, { "epoch": 0.6303905417191589, "grad_norm": 0.4332728087902069, "learning_rate": 1.5874127208683824e-05, "loss": 1.0877, "num_input_tokens_seen": 510818184, "step": 13010 }, { "epoch": 0.6308750847950383, "grad_norm": 0.3980608284473419, "learning_rate": 1.5837611294234583e-05, "loss": 1.102, "num_input_tokens_seen": 511196012, "step": 13020 }, { "epoch": 0.6313596278709177, "grad_norm": 0.3887230455875397, "learning_rate": 1.5801117946430434e-05, "loss": 1.1257, "num_input_tokens_seen": 511590376, "step": 13030 }, { "epoch": 0.6318441709467971, "grad_norm": 0.4023469388484955, "learning_rate": 1.576464725515322e-05, "loss": 1.13, "num_input_tokens_seen": 511985844, "step": 13040 }, { "epoch": 0.6323287140226767, "grad_norm": 0.3778144121170044, "learning_rate": 1.5728199310229e-05, "loss": 1.0993, "num_input_tokens_seen": 512401748, "step": 13050 }, { "epoch": 0.6328132570985561, "grad_norm": 0.3801812529563904, "learning_rate": 1.5691774201427772e-05, "loss": 1.0846, "num_input_tokens_seen": 512818064, "step": 13060 }, { "epoch": 0.6332978001744355, "grad_norm": 0.3784249722957611, "learning_rate": 1.565537201846335e-05, "loss": 1.1541, "num_input_tokens_seen": 513220236, "step": 13070 }, { "epoch": 0.6337823432503149, "grad_norm": 0.4744737148284912, "learning_rate": 1.5618992850993025e-05, "loss": 1.1496, "num_input_tokens_seen": 513594808, "step": 13080 }, { "epoch": 0.6342668863261944, "grad_norm": 0.3975614607334137, "learning_rate": 1.5582636788617412e-05, "loss": 1.176, "num_input_tokens_seen": 513981028, "step": 13090 }, { "epoch": 0.6347514294020739, "grad_norm": 0.3835643231868744, "learning_rate": 1.5546303920880256e-05, "loss": 1.1276, "num_input_tokens_seen": 514381624, "step": 13100 }, { "epoch": 0.6352359724779533, "grad_norm": 0.3918743431568146, "learning_rate": 1.5509994337268126e-05, "loss": 1.1648, "num_input_tokens_seen": 514799552, "step": 13110 }, { "epoch": 0.6357205155538327, "grad_norm": 0.3977094888687134, "learning_rate": 1.547370812721028e-05, "loss": 1.0923, "num_input_tokens_seen": 515216184, "step": 13120 }, { "epoch": 0.6362050586297122, "grad_norm": 0.41210541129112244, "learning_rate": 1.5437445380078383e-05, "loss": 1.1419, "num_input_tokens_seen": 515617056, "step": 13130 }, { "epoch": 0.6366896017055916, "grad_norm": 0.3935719132423401, "learning_rate": 1.5401206185186323e-05, "loss": 1.1472, "num_input_tokens_seen": 516021628, "step": 13140 }, { "epoch": 0.637174144781471, "grad_norm": 0.4016217291355133, "learning_rate": 1.536499063178999e-05, "loss": 1.1494, "num_input_tokens_seen": 516424464, "step": 13150 }, { "epoch": 0.6376586878573505, "grad_norm": 0.41335245966911316, "learning_rate": 1.5328798809087012e-05, "loss": 1.1859, "num_input_tokens_seen": 516827724, "step": 13160 }, { "epoch": 0.63814323093323, "grad_norm": 0.38563454151153564, "learning_rate": 1.5292630806216603e-05, "loss": 1.1049, "num_input_tokens_seen": 517224524, "step": 13170 }, { "epoch": 0.6386277740091094, "grad_norm": 0.4143526256084442, "learning_rate": 1.5256486712259304e-05, "loss": 1.1405, "num_input_tokens_seen": 517641892, "step": 13180 }, { "epoch": 0.6391123170849888, "grad_norm": 0.3733633756637573, "learning_rate": 1.5220366616236748e-05, "loss": 1.1688, "num_input_tokens_seen": 518022120, "step": 13190 }, { "epoch": 0.6395968601608683, "grad_norm": 0.412308931350708, "learning_rate": 1.5184270607111494e-05, "loss": 1.1234, "num_input_tokens_seen": 518438460, "step": 13200 }, { "epoch": 0.6400814032367478, "grad_norm": 0.3862762749195099, "learning_rate": 1.5148198773786754e-05, "loss": 1.14, "num_input_tokens_seen": 518813036, "step": 13210 }, { "epoch": 0.6405659463126272, "grad_norm": 0.4147486984729767, "learning_rate": 1.5112151205106182e-05, "loss": 1.1246, "num_input_tokens_seen": 519216952, "step": 13220 }, { "epoch": 0.6410504893885066, "grad_norm": 0.40653976798057556, "learning_rate": 1.507612798985371e-05, "loss": 1.115, "num_input_tokens_seen": 519604276, "step": 13230 }, { "epoch": 0.641535032464386, "grad_norm": 0.4051119089126587, "learning_rate": 1.5040129216753257e-05, "loss": 1.179, "num_input_tokens_seen": 519988992, "step": 13240 }, { "epoch": 0.6420195755402656, "grad_norm": 0.39531031250953674, "learning_rate": 1.5004154974468548e-05, "loss": 1.1244, "num_input_tokens_seen": 520363784, "step": 13250 }, { "epoch": 0.642504118616145, "grad_norm": 0.38629281520843506, "learning_rate": 1.4968205351602899e-05, "loss": 1.115, "num_input_tokens_seen": 520766492, "step": 13260 }, { "epoch": 0.6429886616920244, "grad_norm": 0.3945416808128357, "learning_rate": 1.4932280436698976e-05, "loss": 1.1265, "num_input_tokens_seen": 521157024, "step": 13270 }, { "epoch": 0.6434732047679038, "grad_norm": 0.3776805102825165, "learning_rate": 1.48963803182386e-05, "loss": 1.1165, "num_input_tokens_seen": 521579912, "step": 13280 }, { "epoch": 0.6439577478437833, "grad_norm": 0.4211878776550293, "learning_rate": 1.4860505084642506e-05, "loss": 1.1249, "num_input_tokens_seen": 521991792, "step": 13290 }, { "epoch": 0.6444422909196628, "grad_norm": 0.4258326292037964, "learning_rate": 1.4824654824270168e-05, "loss": 1.1046, "num_input_tokens_seen": 522392304, "step": 13300 }, { "epoch": 0.6449268339955422, "grad_norm": 0.40192142128944397, "learning_rate": 1.478882962541952e-05, "loss": 1.1577, "num_input_tokens_seen": 522783384, "step": 13310 }, { "epoch": 0.6454113770714216, "grad_norm": 0.3894496262073517, "learning_rate": 1.475302957632677e-05, "loss": 1.1205, "num_input_tokens_seen": 523150196, "step": 13320 }, { "epoch": 0.6458959201473011, "grad_norm": 0.4126818776130676, "learning_rate": 1.471725476516622e-05, "loss": 1.1076, "num_input_tokens_seen": 523551404, "step": 13330 }, { "epoch": 0.6463804632231805, "grad_norm": 0.37312477827072144, "learning_rate": 1.4681505280049974e-05, "loss": 1.1852, "num_input_tokens_seen": 523955836, "step": 13340 }, { "epoch": 0.64686500629906, "grad_norm": 0.40087607502937317, "learning_rate": 1.4645781209027764e-05, "loss": 1.1758, "num_input_tokens_seen": 524356324, "step": 13350 }, { "epoch": 0.6473495493749394, "grad_norm": 0.38394778966903687, "learning_rate": 1.4610082640086754e-05, "loss": 1.108, "num_input_tokens_seen": 524759624, "step": 13360 }, { "epoch": 0.6478340924508189, "grad_norm": 0.4227401912212372, "learning_rate": 1.4574409661151264e-05, "loss": 1.1216, "num_input_tokens_seen": 525129624, "step": 13370 }, { "epoch": 0.6483186355266983, "grad_norm": 0.3866701126098633, "learning_rate": 1.4538762360082608e-05, "loss": 1.1328, "num_input_tokens_seen": 525533224, "step": 13380 }, { "epoch": 0.6488031786025777, "grad_norm": 0.39452120661735535, "learning_rate": 1.4503140824678863e-05, "loss": 1.124, "num_input_tokens_seen": 525921192, "step": 13390 }, { "epoch": 0.6492877216784572, "grad_norm": 0.3937327563762665, "learning_rate": 1.4467545142674599e-05, "loss": 1.132, "num_input_tokens_seen": 526287908, "step": 13400 }, { "epoch": 0.6497722647543367, "grad_norm": 0.427117258310318, "learning_rate": 1.4431975401740783e-05, "loss": 1.1116, "num_input_tokens_seen": 526650256, "step": 13410 }, { "epoch": 0.6502568078302161, "grad_norm": 0.36921098828315735, "learning_rate": 1.4396431689484416e-05, "loss": 1.0976, "num_input_tokens_seen": 527032076, "step": 13420 }, { "epoch": 0.6507413509060955, "grad_norm": 0.3914394676685333, "learning_rate": 1.4360914093448463e-05, "loss": 1.2131, "num_input_tokens_seen": 527466664, "step": 13430 }, { "epoch": 0.651225893981975, "grad_norm": 0.4258521497249603, "learning_rate": 1.4325422701111502e-05, "loss": 1.1817, "num_input_tokens_seen": 527850952, "step": 13440 }, { "epoch": 0.6517104370578545, "grad_norm": 0.401259183883667, "learning_rate": 1.428995759988761e-05, "loss": 1.158, "num_input_tokens_seen": 528263332, "step": 13450 }, { "epoch": 0.6521949801337339, "grad_norm": 0.3809134066104889, "learning_rate": 1.4254518877126095e-05, "loss": 1.1234, "num_input_tokens_seen": 528686588, "step": 13460 }, { "epoch": 0.6526795232096133, "grad_norm": 0.406120240688324, "learning_rate": 1.42191066201113e-05, "loss": 1.1635, "num_input_tokens_seen": 529070212, "step": 13470 }, { "epoch": 0.6531640662854927, "grad_norm": 0.43414533138275146, "learning_rate": 1.418372091606239e-05, "loss": 1.1747, "num_input_tokens_seen": 529477424, "step": 13480 }, { "epoch": 0.6536486093613723, "grad_norm": 0.4020240306854248, "learning_rate": 1.4148361852133129e-05, "loss": 1.1394, "num_input_tokens_seen": 529879856, "step": 13490 }, { "epoch": 0.6541331524372517, "grad_norm": 0.39953121542930603, "learning_rate": 1.4113029515411647e-05, "loss": 1.1363, "num_input_tokens_seen": 530288184, "step": 13500 }, { "epoch": 0.6546176955131311, "grad_norm": 0.39526060223579407, "learning_rate": 1.407772399292027e-05, "loss": 1.1483, "num_input_tokens_seen": 530697004, "step": 13510 }, { "epoch": 0.6551022385890105, "grad_norm": 0.4059954583644867, "learning_rate": 1.4042445371615271e-05, "loss": 1.125, "num_input_tokens_seen": 531092792, "step": 13520 }, { "epoch": 0.65558678166489, "grad_norm": 0.40413933992385864, "learning_rate": 1.4007193738386675e-05, "loss": 1.1345, "num_input_tokens_seen": 531481940, "step": 13530 }, { "epoch": 0.6560713247407695, "grad_norm": 0.3758378326892853, "learning_rate": 1.3971969180058032e-05, "loss": 1.139, "num_input_tokens_seen": 531883224, "step": 13540 }, { "epoch": 0.6565558678166489, "grad_norm": 0.3990527093410492, "learning_rate": 1.3936771783386183e-05, "loss": 1.1284, "num_input_tokens_seen": 532298904, "step": 13550 }, { "epoch": 0.6570404108925283, "grad_norm": 0.403590589761734, "learning_rate": 1.390160163506113e-05, "loss": 1.1493, "num_input_tokens_seen": 532676364, "step": 13560 }, { "epoch": 0.6575249539684078, "grad_norm": 0.41753387451171875, "learning_rate": 1.3866458821705697e-05, "loss": 1.1165, "num_input_tokens_seen": 533090008, "step": 13570 }, { "epoch": 0.6580094970442872, "grad_norm": 0.40354228019714355, "learning_rate": 1.3831343429875421e-05, "loss": 1.1542, "num_input_tokens_seen": 533475684, "step": 13580 }, { "epoch": 0.6584940401201667, "grad_norm": 0.40463098883628845, "learning_rate": 1.3796255546058293e-05, "loss": 1.1574, "num_input_tokens_seen": 533869320, "step": 13590 }, { "epoch": 0.6589785831960461, "grad_norm": 0.40255841612815857, "learning_rate": 1.3761195256674554e-05, "loss": 1.1158, "num_input_tokens_seen": 534226148, "step": 13600 }, { "epoch": 0.6594631262719256, "grad_norm": 0.4016367197036743, "learning_rate": 1.3726162648076474e-05, "loss": 1.1712, "num_input_tokens_seen": 534636196, "step": 13610 }, { "epoch": 0.659947669347805, "grad_norm": 0.4226371645927429, "learning_rate": 1.3691157806548167e-05, "loss": 1.1584, "num_input_tokens_seen": 535027492, "step": 13620 }, { "epoch": 0.6604322124236844, "grad_norm": 0.4005202651023865, "learning_rate": 1.365618081830532e-05, "loss": 1.1075, "num_input_tokens_seen": 535420836, "step": 13630 }, { "epoch": 0.6609167554995639, "grad_norm": 0.38361644744873047, "learning_rate": 1.3621231769495047e-05, "loss": 1.1318, "num_input_tokens_seen": 535798348, "step": 13640 }, { "epoch": 0.6614012985754434, "grad_norm": 0.36114785075187683, "learning_rate": 1.3586310746195641e-05, "loss": 1.1685, "num_input_tokens_seen": 536163432, "step": 13650 }, { "epoch": 0.6618858416513228, "grad_norm": 0.452426940202713, "learning_rate": 1.3551417834416375e-05, "loss": 1.1241, "num_input_tokens_seen": 536559076, "step": 13660 }, { "epoch": 0.6623703847272022, "grad_norm": 0.43133866786956787, "learning_rate": 1.3516553120097281e-05, "loss": 1.1362, "num_input_tokens_seen": 536949788, "step": 13670 }, { "epoch": 0.6628549278030816, "grad_norm": 0.3915277421474457, "learning_rate": 1.3481716689108915e-05, "loss": 1.1215, "num_input_tokens_seen": 537337428, "step": 13680 }, { "epoch": 0.6633394708789612, "grad_norm": 0.38239341974258423, "learning_rate": 1.3446908627252236e-05, "loss": 1.1594, "num_input_tokens_seen": 537726764, "step": 13690 }, { "epoch": 0.6638240139548406, "grad_norm": 0.4042087495326996, "learning_rate": 1.3412129020258257e-05, "loss": 1.1515, "num_input_tokens_seen": 538088164, "step": 13700 }, { "epoch": 0.66430855703072, "grad_norm": 0.39968568086624146, "learning_rate": 1.3377377953787956e-05, "loss": 1.116, "num_input_tokens_seen": 538462580, "step": 13710 }, { "epoch": 0.6647931001065994, "grad_norm": 0.3845781683921814, "learning_rate": 1.3342655513432001e-05, "loss": 1.1221, "num_input_tokens_seen": 538858660, "step": 13720 }, { "epoch": 0.665277643182479, "grad_norm": 0.37981632351875305, "learning_rate": 1.3307961784710554e-05, "loss": 1.1724, "num_input_tokens_seen": 539260832, "step": 13730 }, { "epoch": 0.6657621862583584, "grad_norm": 0.4112668037414551, "learning_rate": 1.327329685307307e-05, "loss": 1.1521, "num_input_tokens_seen": 539654112, "step": 13740 }, { "epoch": 0.6662467293342378, "grad_norm": 0.3671356439590454, "learning_rate": 1.3238660803898074e-05, "loss": 1.1909, "num_input_tokens_seen": 540039560, "step": 13750 }, { "epoch": 0.6667312724101173, "grad_norm": 0.3972877562046051, "learning_rate": 1.3204053722492927e-05, "loss": 1.1792, "num_input_tokens_seen": 540430488, "step": 13760 }, { "epoch": 0.6672158154859967, "grad_norm": 0.39763057231903076, "learning_rate": 1.3169475694093703e-05, "loss": 1.1322, "num_input_tokens_seen": 540829364, "step": 13770 }, { "epoch": 0.6677003585618762, "grad_norm": 0.37179917097091675, "learning_rate": 1.3134926803864861e-05, "loss": 1.1592, "num_input_tokens_seen": 541255364, "step": 13780 }, { "epoch": 0.6681849016377556, "grad_norm": 0.3995148241519928, "learning_rate": 1.3100407136899123e-05, "loss": 1.103, "num_input_tokens_seen": 541631036, "step": 13790 }, { "epoch": 0.6686694447136351, "grad_norm": 0.4086041748523712, "learning_rate": 1.3065916778217235e-05, "loss": 1.1492, "num_input_tokens_seen": 542006572, "step": 13800 }, { "epoch": 0.6691539877895145, "grad_norm": 0.3766721785068512, "learning_rate": 1.3031455812767746e-05, "loss": 1.1115, "num_input_tokens_seen": 542379296, "step": 13810 }, { "epoch": 0.6696385308653939, "grad_norm": 0.4227388799190521, "learning_rate": 1.299702432542683e-05, "loss": 1.0872, "num_input_tokens_seen": 542769344, "step": 13820 }, { "epoch": 0.6701230739412734, "grad_norm": 0.4330199658870697, "learning_rate": 1.2962622400998012e-05, "loss": 1.0849, "num_input_tokens_seen": 543147408, "step": 13830 }, { "epoch": 0.6706076170171529, "grad_norm": 0.37317484617233276, "learning_rate": 1.292825012421208e-05, "loss": 1.1378, "num_input_tokens_seen": 543533556, "step": 13840 }, { "epoch": 0.6710921600930323, "grad_norm": 0.39739182591438293, "learning_rate": 1.2893907579726728e-05, "loss": 1.1708, "num_input_tokens_seen": 543906020, "step": 13850 }, { "epoch": 0.6715767031689117, "grad_norm": 0.4071529507637024, "learning_rate": 1.2859594852126456e-05, "loss": 1.1109, "num_input_tokens_seen": 544321144, "step": 13860 }, { "epoch": 0.6720612462447911, "grad_norm": 0.398400217294693, "learning_rate": 1.282531202592232e-05, "loss": 1.0842, "num_input_tokens_seen": 544708388, "step": 13870 }, { "epoch": 0.6725457893206707, "grad_norm": 0.37718960642814636, "learning_rate": 1.2791059185551744e-05, "loss": 1.1508, "num_input_tokens_seen": 545104336, "step": 13880 }, { "epoch": 0.6730303323965501, "grad_norm": 0.41166213154792786, "learning_rate": 1.2756836415378254e-05, "loss": 1.1302, "num_input_tokens_seen": 545501896, "step": 13890 }, { "epoch": 0.6735148754724295, "grad_norm": 0.4132916033267975, "learning_rate": 1.2722643799691378e-05, "loss": 1.1038, "num_input_tokens_seen": 545881328, "step": 13900 }, { "epoch": 0.6739994185483089, "grad_norm": 0.40238797664642334, "learning_rate": 1.2688481422706309e-05, "loss": 1.146, "num_input_tokens_seen": 546279016, "step": 13910 }, { "epoch": 0.6744839616241884, "grad_norm": 0.4064416289329529, "learning_rate": 1.2654349368563828e-05, "loss": 1.1636, "num_input_tokens_seen": 546677216, "step": 13920 }, { "epoch": 0.6749685047000679, "grad_norm": 0.37984856963157654, "learning_rate": 1.2620247721329973e-05, "loss": 1.1299, "num_input_tokens_seen": 547080472, "step": 13930 }, { "epoch": 0.6754530477759473, "grad_norm": 0.4335133135318756, "learning_rate": 1.2586176564995922e-05, "loss": 1.1437, "num_input_tokens_seen": 547472432, "step": 13940 }, { "epoch": 0.6759375908518267, "grad_norm": 0.3851531445980072, "learning_rate": 1.2552135983477756e-05, "loss": 1.1503, "num_input_tokens_seen": 547869104, "step": 13950 }, { "epoch": 0.6764221339277062, "grad_norm": 0.37984663248062134, "learning_rate": 1.251812606061624e-05, "loss": 1.1761, "num_input_tokens_seen": 548278152, "step": 13960 }, { "epoch": 0.6769066770035856, "grad_norm": 0.40836653113365173, "learning_rate": 1.2484146880176642e-05, "loss": 1.066, "num_input_tokens_seen": 548692648, "step": 13970 }, { "epoch": 0.6773912200794651, "grad_norm": 0.38975247740745544, "learning_rate": 1.2450198525848487e-05, "loss": 1.0798, "num_input_tokens_seen": 549066912, "step": 13980 }, { "epoch": 0.6778757631553445, "grad_norm": 0.3853222727775574, "learning_rate": 1.2416281081245398e-05, "loss": 1.1685, "num_input_tokens_seen": 549440944, "step": 13990 }, { "epoch": 0.678360306231224, "grad_norm": 0.45054277777671814, "learning_rate": 1.2382394629904864e-05, "loss": 1.1473, "num_input_tokens_seen": 549831184, "step": 14000 }, { "epoch": 0.678360306231224, "eval_loss": 1.1321347951889038, "eval_runtime": 5.0521, "eval_samples_per_second": 29.69, "eval_steps_per_second": 3.761, "num_input_tokens_seen": 549831184, "step": 14000 }, { "epoch": 0.6788448493071034, "grad_norm": 0.3913303315639496, "learning_rate": 1.2348539255288038e-05, "loss": 1.1504, "num_input_tokens_seen": 550197984, "step": 14010 }, { "epoch": 0.6793293923829828, "grad_norm": 0.38616275787353516, "learning_rate": 1.2314715040779534e-05, "loss": 1.1412, "num_input_tokens_seen": 550565288, "step": 14020 }, { "epoch": 0.6798139354588623, "grad_norm": 0.376402884721756, "learning_rate": 1.2280922069687225e-05, "loss": 1.1712, "num_input_tokens_seen": 550966008, "step": 14030 }, { "epoch": 0.6802984785347418, "grad_norm": 0.4074733853340149, "learning_rate": 1.2247160425241996e-05, "loss": 1.0974, "num_input_tokens_seen": 551358528, "step": 14040 }, { "epoch": 0.6807830216106212, "grad_norm": 0.41833576560020447, "learning_rate": 1.221343019059764e-05, "loss": 1.1279, "num_input_tokens_seen": 551759700, "step": 14050 }, { "epoch": 0.6812675646865006, "grad_norm": 0.36511725187301636, "learning_rate": 1.217973144883053e-05, "loss": 1.1514, "num_input_tokens_seen": 552164368, "step": 14060 }, { "epoch": 0.68175210776238, "grad_norm": 0.3876798152923584, "learning_rate": 1.2146064282939501e-05, "loss": 1.169, "num_input_tokens_seen": 552571428, "step": 14070 }, { "epoch": 0.6822366508382596, "grad_norm": 0.3926098942756653, "learning_rate": 1.2112428775845616e-05, "loss": 1.1665, "num_input_tokens_seen": 552952628, "step": 14080 }, { "epoch": 0.682721193914139, "grad_norm": 0.3909384608268738, "learning_rate": 1.2078825010391958e-05, "loss": 1.133, "num_input_tokens_seen": 553359608, "step": 14090 }, { "epoch": 0.6832057369900184, "grad_norm": 0.38221150636672974, "learning_rate": 1.2045253069343448e-05, "loss": 1.1773, "num_input_tokens_seen": 553757744, "step": 14100 }, { "epoch": 0.6836902800658978, "grad_norm": 0.42475947737693787, "learning_rate": 1.2011713035386588e-05, "loss": 1.1495, "num_input_tokens_seen": 554165892, "step": 14110 }, { "epoch": 0.6841748231417774, "grad_norm": 0.41192540526390076, "learning_rate": 1.1978204991129324e-05, "loss": 1.1358, "num_input_tokens_seen": 554549428, "step": 14120 }, { "epoch": 0.6846593662176568, "grad_norm": 0.390726238489151, "learning_rate": 1.1944729019100808e-05, "loss": 1.1225, "num_input_tokens_seen": 554948408, "step": 14130 }, { "epoch": 0.6851439092935362, "grad_norm": 0.40107494592666626, "learning_rate": 1.19112852017512e-05, "loss": 1.1305, "num_input_tokens_seen": 555381500, "step": 14140 }, { "epoch": 0.6856284523694156, "grad_norm": 0.41240257024765015, "learning_rate": 1.1877873621451453e-05, "loss": 1.1169, "num_input_tokens_seen": 555772300, "step": 14150 }, { "epoch": 0.6861129954452951, "grad_norm": 0.42891547083854675, "learning_rate": 1.1844494360493141e-05, "loss": 1.095, "num_input_tokens_seen": 556200948, "step": 14160 }, { "epoch": 0.6865975385211746, "grad_norm": 0.4002200663089752, "learning_rate": 1.1811147501088196e-05, "loss": 1.188, "num_input_tokens_seen": 556610528, "step": 14170 }, { "epoch": 0.687082081597054, "grad_norm": 0.4111470878124237, "learning_rate": 1.1777833125368812e-05, "loss": 1.1927, "num_input_tokens_seen": 557009708, "step": 14180 }, { "epoch": 0.6875666246729334, "grad_norm": 0.43058955669403076, "learning_rate": 1.17445513153871e-05, "loss": 1.1684, "num_input_tokens_seen": 557438504, "step": 14190 }, { "epoch": 0.6880511677488129, "grad_norm": 0.36157649755477905, "learning_rate": 1.1711302153115045e-05, "loss": 1.1805, "num_input_tokens_seen": 557855252, "step": 14200 }, { "epoch": 0.6885357108246923, "grad_norm": 0.3946475088596344, "learning_rate": 1.1678085720444142e-05, "loss": 1.0994, "num_input_tokens_seen": 558247492, "step": 14210 }, { "epoch": 0.6890202539005718, "grad_norm": 0.38171178102493286, "learning_rate": 1.1644902099185328e-05, "loss": 1.1378, "num_input_tokens_seen": 558657816, "step": 14220 }, { "epoch": 0.6895047969764512, "grad_norm": 0.38918769359588623, "learning_rate": 1.1611751371068706e-05, "loss": 1.1926, "num_input_tokens_seen": 559061308, "step": 14230 }, { "epoch": 0.6899893400523307, "grad_norm": 0.4294280707836151, "learning_rate": 1.1578633617743373e-05, "loss": 1.1388, "num_input_tokens_seen": 559449040, "step": 14240 }, { "epoch": 0.6904738831282101, "grad_norm": 0.4078969359397888, "learning_rate": 1.1545548920777194e-05, "loss": 1.1432, "num_input_tokens_seen": 559819924, "step": 14250 }, { "epoch": 0.6909584262040895, "grad_norm": 0.3768039643764496, "learning_rate": 1.1512497361656633e-05, "loss": 1.1597, "num_input_tokens_seen": 560234120, "step": 14260 }, { "epoch": 0.691442969279969, "grad_norm": 0.3907431364059448, "learning_rate": 1.1479479021786533e-05, "loss": 1.1175, "num_input_tokens_seen": 560648180, "step": 14270 }, { "epoch": 0.6919275123558485, "grad_norm": 0.415067195892334, "learning_rate": 1.1446493982489916e-05, "loss": 1.1103, "num_input_tokens_seen": 561025820, "step": 14280 }, { "epoch": 0.6924120554317279, "grad_norm": 0.3955281674861908, "learning_rate": 1.1413542325007804e-05, "loss": 1.1182, "num_input_tokens_seen": 561453128, "step": 14290 }, { "epoch": 0.6928965985076073, "grad_norm": 0.3895666003227234, "learning_rate": 1.1380624130498946e-05, "loss": 1.165, "num_input_tokens_seen": 561854100, "step": 14300 }, { "epoch": 0.6933811415834867, "grad_norm": 0.40961936116218567, "learning_rate": 1.134773948003976e-05, "loss": 1.1452, "num_input_tokens_seen": 562259444, "step": 14310 }, { "epoch": 0.6938656846593663, "grad_norm": 0.4259204864501953, "learning_rate": 1.1314888454623951e-05, "loss": 1.1066, "num_input_tokens_seen": 562672632, "step": 14320 }, { "epoch": 0.6943502277352457, "grad_norm": 0.44359973073005676, "learning_rate": 1.1282071135162498e-05, "loss": 1.1223, "num_input_tokens_seen": 563095012, "step": 14330 }, { "epoch": 0.6948347708111251, "grad_norm": 0.4470522701740265, "learning_rate": 1.1249287602483285e-05, "loss": 1.1879, "num_input_tokens_seen": 563479840, "step": 14340 }, { "epoch": 0.6953193138870045, "grad_norm": 0.39908698201179504, "learning_rate": 1.1216537937331028e-05, "loss": 1.1204, "num_input_tokens_seen": 563889984, "step": 14350 }, { "epoch": 0.695803856962884, "grad_norm": 0.4189169704914093, "learning_rate": 1.1183822220367014e-05, "loss": 1.1517, "num_input_tokens_seen": 564268180, "step": 14360 }, { "epoch": 0.6962884000387635, "grad_norm": 0.3651569187641144, "learning_rate": 1.1151140532168917e-05, "loss": 1.1388, "num_input_tokens_seen": 564649240, "step": 14370 }, { "epoch": 0.6967729431146429, "grad_norm": 0.4237881898880005, "learning_rate": 1.1118492953230594e-05, "loss": 1.1211, "num_input_tokens_seen": 565027528, "step": 14380 }, { "epoch": 0.6972574861905223, "grad_norm": 0.41956138610839844, "learning_rate": 1.1085879563961915e-05, "loss": 1.1297, "num_input_tokens_seen": 565436040, "step": 14390 }, { "epoch": 0.6977420292664018, "grad_norm": 0.4476149082183838, "learning_rate": 1.1053300444688502e-05, "loss": 1.108, "num_input_tokens_seen": 565816684, "step": 14400 }, { "epoch": 0.6982265723422812, "grad_norm": 0.40152573585510254, "learning_rate": 1.1020755675651605e-05, "loss": 1.1131, "num_input_tokens_seen": 566192928, "step": 14410 }, { "epoch": 0.6987111154181607, "grad_norm": 0.4253696799278259, "learning_rate": 1.0988245337007863e-05, "loss": 1.1218, "num_input_tokens_seen": 566589328, "step": 14420 }, { "epoch": 0.6991956584940401, "grad_norm": 0.39764001965522766, "learning_rate": 1.0955769508829103e-05, "loss": 1.1141, "num_input_tokens_seen": 566991652, "step": 14430 }, { "epoch": 0.6996802015699196, "grad_norm": 0.3731628656387329, "learning_rate": 1.0923328271102174e-05, "loss": 1.1397, "num_input_tokens_seen": 567388660, "step": 14440 }, { "epoch": 0.700164744645799, "grad_norm": 0.4285131096839905, "learning_rate": 1.0890921703728693e-05, "loss": 1.1473, "num_input_tokens_seen": 567783500, "step": 14450 }, { "epoch": 0.7006492877216784, "grad_norm": 0.3825373649597168, "learning_rate": 1.0858549886524944e-05, "loss": 1.1373, "num_input_tokens_seen": 568160888, "step": 14460 }, { "epoch": 0.7011338307975579, "grad_norm": 0.38323846459388733, "learning_rate": 1.0826212899221559e-05, "loss": 1.1175, "num_input_tokens_seen": 568542356, "step": 14470 }, { "epoch": 0.7016183738734374, "grad_norm": 0.4034413695335388, "learning_rate": 1.0793910821463424e-05, "loss": 1.1396, "num_input_tokens_seen": 568922996, "step": 14480 }, { "epoch": 0.7021029169493168, "grad_norm": 0.4333634078502655, "learning_rate": 1.076164373280944e-05, "loss": 1.117, "num_input_tokens_seen": 569333292, "step": 14490 }, { "epoch": 0.7025874600251962, "grad_norm": 0.39302992820739746, "learning_rate": 1.0729411712732319e-05, "loss": 1.1303, "num_input_tokens_seen": 569719056, "step": 14500 }, { "epoch": 0.7030720031010756, "grad_norm": 0.39154550433158875, "learning_rate": 1.0697214840618409e-05, "loss": 1.2008, "num_input_tokens_seen": 570111844, "step": 14510 }, { "epoch": 0.7035565461769552, "grad_norm": 0.406896710395813, "learning_rate": 1.0665053195767493e-05, "loss": 1.1581, "num_input_tokens_seen": 570534852, "step": 14520 }, { "epoch": 0.7040410892528346, "grad_norm": 0.4043082892894745, "learning_rate": 1.0632926857392567e-05, "loss": 1.1445, "num_input_tokens_seen": 570909832, "step": 14530 }, { "epoch": 0.704525632328714, "grad_norm": 0.392946720123291, "learning_rate": 1.0600835904619713e-05, "loss": 1.1079, "num_input_tokens_seen": 571297336, "step": 14540 }, { "epoch": 0.7050101754045934, "grad_norm": 0.4595877528190613, "learning_rate": 1.0568780416487811e-05, "loss": 1.1688, "num_input_tokens_seen": 571674808, "step": 14550 }, { "epoch": 0.705494718480473, "grad_norm": 0.39945316314697266, "learning_rate": 1.0536760471948423e-05, "loss": 1.0807, "num_input_tokens_seen": 572049484, "step": 14560 }, { "epoch": 0.7059792615563524, "grad_norm": 0.4135000705718994, "learning_rate": 1.0504776149865559e-05, "loss": 1.0744, "num_input_tokens_seen": 572426520, "step": 14570 }, { "epoch": 0.7064638046322318, "grad_norm": 0.40249672532081604, "learning_rate": 1.0472827529015494e-05, "loss": 1.1261, "num_input_tokens_seen": 572824836, "step": 14580 }, { "epoch": 0.7069483477081112, "grad_norm": 0.4083980321884155, "learning_rate": 1.0440914688086581e-05, "loss": 1.1333, "num_input_tokens_seen": 573191304, "step": 14590 }, { "epoch": 0.7074328907839907, "grad_norm": 0.43228352069854736, "learning_rate": 1.0409037705679018e-05, "loss": 1.1248, "num_input_tokens_seen": 573607132, "step": 14600 }, { "epoch": 0.7079174338598702, "grad_norm": 0.3942347764968872, "learning_rate": 1.0377196660304717e-05, "loss": 1.1319, "num_input_tokens_seen": 573989044, "step": 14610 }, { "epoch": 0.7084019769357496, "grad_norm": 0.3767456114292145, "learning_rate": 1.0345391630387064e-05, "loss": 1.1459, "num_input_tokens_seen": 574401880, "step": 14620 }, { "epoch": 0.708886520011629, "grad_norm": 0.41538006067276, "learning_rate": 1.0313622694260747e-05, "loss": 1.136, "num_input_tokens_seen": 574809836, "step": 14630 }, { "epoch": 0.7093710630875085, "grad_norm": 0.3873327970504761, "learning_rate": 1.0281889930171546e-05, "loss": 1.2085, "num_input_tokens_seen": 575187448, "step": 14640 }, { "epoch": 0.7098556061633879, "grad_norm": 0.3875696063041687, "learning_rate": 1.0250193416276171e-05, "loss": 1.131, "num_input_tokens_seen": 575619396, "step": 14650 }, { "epoch": 0.7103401492392674, "grad_norm": 0.40218356251716614, "learning_rate": 1.0218533230642005e-05, "loss": 1.1638, "num_input_tokens_seen": 576039756, "step": 14660 }, { "epoch": 0.7108246923151468, "grad_norm": 0.360257089138031, "learning_rate": 1.0186909451247023e-05, "loss": 1.1354, "num_input_tokens_seen": 576409768, "step": 14670 }, { "epoch": 0.7113092353910263, "grad_norm": 0.3771324157714844, "learning_rate": 1.0155322155979468e-05, "loss": 1.1132, "num_input_tokens_seen": 576806004, "step": 14680 }, { "epoch": 0.7117937784669057, "grad_norm": 0.41122815012931824, "learning_rate": 1.0123771422637757e-05, "loss": 1.178, "num_input_tokens_seen": 577197312, "step": 14690 }, { "epoch": 0.7122783215427851, "grad_norm": 0.3926630914211273, "learning_rate": 1.0092257328930255e-05, "loss": 1.0902, "num_input_tokens_seen": 577597560, "step": 14700 }, { "epoch": 0.7127628646186646, "grad_norm": 0.4445978105068207, "learning_rate": 1.0060779952475074e-05, "loss": 1.1167, "num_input_tokens_seen": 577999220, "step": 14710 }, { "epoch": 0.7132474076945441, "grad_norm": 0.42254573106765747, "learning_rate": 1.0029339370799912e-05, "loss": 1.129, "num_input_tokens_seen": 578383384, "step": 14720 }, { "epoch": 0.7137319507704235, "grad_norm": 0.4229062795639038, "learning_rate": 9.997935661341804e-06, "loss": 1.1274, "num_input_tokens_seen": 578758652, "step": 14730 }, { "epoch": 0.7142164938463029, "grad_norm": 0.39834490418434143, "learning_rate": 9.966568901447026e-06, "loss": 1.1414, "num_input_tokens_seen": 579152316, "step": 14740 }, { "epoch": 0.7147010369221823, "grad_norm": 0.3739629089832306, "learning_rate": 9.935239168370795e-06, "loss": 1.1303, "num_input_tokens_seen": 579581096, "step": 14750 }, { "epoch": 0.7151855799980619, "grad_norm": 0.412434458732605, "learning_rate": 9.903946539277162e-06, "loss": 1.0987, "num_input_tokens_seen": 579998920, "step": 14760 }, { "epoch": 0.7156701230739413, "grad_norm": 0.401985764503479, "learning_rate": 9.872691091238789e-06, "loss": 1.0892, "num_input_tokens_seen": 580392288, "step": 14770 }, { "epoch": 0.7161546661498207, "grad_norm": 0.42549392580986023, "learning_rate": 9.841472901236765e-06, "loss": 1.1417, "num_input_tokens_seen": 580778920, "step": 14780 }, { "epoch": 0.7166392092257001, "grad_norm": 0.42221537232398987, "learning_rate": 9.81029204616038e-06, "loss": 1.1389, "num_input_tokens_seen": 581160372, "step": 14790 }, { "epoch": 0.7171237523015797, "grad_norm": 0.4084275960922241, "learning_rate": 9.779148602807032e-06, "loss": 1.1119, "num_input_tokens_seen": 581518392, "step": 14800 }, { "epoch": 0.7176082953774591, "grad_norm": 0.3924039900302887, "learning_rate": 9.748042647881909e-06, "loss": 1.1196, "num_input_tokens_seen": 581916744, "step": 14810 }, { "epoch": 0.7180928384533385, "grad_norm": 0.4364875555038452, "learning_rate": 9.716974257997927e-06, "loss": 1.1251, "num_input_tokens_seen": 582306144, "step": 14820 }, { "epoch": 0.7185773815292179, "grad_norm": 0.4177420735359192, "learning_rate": 9.68594350967543e-06, "loss": 1.1217, "num_input_tokens_seen": 582701516, "step": 14830 }, { "epoch": 0.7190619246050974, "grad_norm": 0.3962678015232086, "learning_rate": 9.654950479342079e-06, "loss": 1.1467, "num_input_tokens_seen": 583110044, "step": 14840 }, { "epoch": 0.7195464676809769, "grad_norm": 0.3973855972290039, "learning_rate": 9.62399524333263e-06, "loss": 1.0922, "num_input_tokens_seen": 583517076, "step": 14850 }, { "epoch": 0.7200310107568563, "grad_norm": 0.3978596329689026, "learning_rate": 9.593077877888757e-06, "loss": 1.1018, "num_input_tokens_seen": 583892052, "step": 14860 }, { "epoch": 0.7205155538327357, "grad_norm": 0.3927263617515564, "learning_rate": 9.56219845915886e-06, "loss": 1.1682, "num_input_tokens_seen": 584282980, "step": 14870 }, { "epoch": 0.7210000969086152, "grad_norm": 0.4256937801837921, "learning_rate": 9.531357063197867e-06, "loss": 1.1401, "num_input_tokens_seen": 584680944, "step": 14880 }, { "epoch": 0.7214846399844946, "grad_norm": 0.36345407366752625, "learning_rate": 9.500553765967066e-06, "loss": 1.1402, "num_input_tokens_seen": 585085904, "step": 14890 }, { "epoch": 0.721969183060374, "grad_norm": 0.38229337334632874, "learning_rate": 9.46978864333391e-06, "loss": 1.1655, "num_input_tokens_seen": 585477160, "step": 14900 }, { "epoch": 0.7224537261362535, "grad_norm": 0.41841405630111694, "learning_rate": 9.439061771071824e-06, "loss": 1.1179, "num_input_tokens_seen": 585880592, "step": 14910 }, { "epoch": 0.722938269212133, "grad_norm": 0.4083956480026245, "learning_rate": 9.408373224860035e-06, "loss": 1.1698, "num_input_tokens_seen": 586284256, "step": 14920 }, { "epoch": 0.7234228122880124, "grad_norm": 0.4051735997200012, "learning_rate": 9.377723080283368e-06, "loss": 1.1011, "num_input_tokens_seen": 586669452, "step": 14930 }, { "epoch": 0.7239073553638918, "grad_norm": 0.40003690123558044, "learning_rate": 9.347111412832041e-06, "loss": 1.0989, "num_input_tokens_seen": 587054752, "step": 14940 }, { "epoch": 0.7243918984397713, "grad_norm": 0.41723689436912537, "learning_rate": 9.31653829790156e-06, "loss": 1.1647, "num_input_tokens_seen": 587436984, "step": 14950 }, { "epoch": 0.7248764415156508, "grad_norm": 0.3850226402282715, "learning_rate": 9.286003810792423e-06, "loss": 1.1224, "num_input_tokens_seen": 587849120, "step": 14960 }, { "epoch": 0.7253609845915302, "grad_norm": 0.38576218485832214, "learning_rate": 9.255508026710017e-06, "loss": 1.1768, "num_input_tokens_seen": 588230752, "step": 14970 }, { "epoch": 0.7258455276674096, "grad_norm": 0.44978147745132446, "learning_rate": 9.225051020764396e-06, "loss": 1.1512, "num_input_tokens_seen": 588620412, "step": 14980 }, { "epoch": 0.726330070743289, "grad_norm": 0.4420928359031677, "learning_rate": 9.194632867970115e-06, "loss": 1.1517, "num_input_tokens_seen": 588987072, "step": 14990 }, { "epoch": 0.7268146138191686, "grad_norm": 0.40865370631217957, "learning_rate": 9.16425364324602e-06, "loss": 1.1299, "num_input_tokens_seen": 589381344, "step": 15000 }, { "epoch": 0.727299156895048, "grad_norm": 0.4068050682544708, "learning_rate": 9.133913421415103e-06, "loss": 1.1, "num_input_tokens_seen": 589790080, "step": 15010 }, { "epoch": 0.7277836999709274, "grad_norm": 0.37048131227493286, "learning_rate": 9.10361227720425e-06, "loss": 1.1035, "num_input_tokens_seen": 590204672, "step": 15020 }, { "epoch": 0.7282682430468068, "grad_norm": 0.3541117012500763, "learning_rate": 9.073350285244142e-06, "loss": 1.0962, "num_input_tokens_seen": 590633676, "step": 15030 }, { "epoch": 0.7287527861226863, "grad_norm": 0.4002024531364441, "learning_rate": 9.04312752006901e-06, "loss": 1.1322, "num_input_tokens_seen": 590999504, "step": 15040 }, { "epoch": 0.7292373291985658, "grad_norm": 0.4075698256492615, "learning_rate": 9.012944056116477e-06, "loss": 1.1658, "num_input_tokens_seen": 591409344, "step": 15050 }, { "epoch": 0.7297218722744452, "grad_norm": 0.41059961915016174, "learning_rate": 8.982799967727374e-06, "loss": 1.1439, "num_input_tokens_seen": 591798840, "step": 15060 }, { "epoch": 0.7302064153503246, "grad_norm": 0.3906279504299164, "learning_rate": 8.952695329145517e-06, "loss": 1.1835, "num_input_tokens_seen": 592185512, "step": 15070 }, { "epoch": 0.7306909584262041, "grad_norm": 0.41761744022369385, "learning_rate": 8.922630214517618e-06, "loss": 1.1068, "num_input_tokens_seen": 592622168, "step": 15080 }, { "epoch": 0.7311755015020835, "grad_norm": 0.408543199300766, "learning_rate": 8.892604697892981e-06, "loss": 1.1234, "num_input_tokens_seen": 592987956, "step": 15090 }, { "epoch": 0.731660044577963, "grad_norm": 0.4270235002040863, "learning_rate": 8.862618853223442e-06, "loss": 1.115, "num_input_tokens_seen": 593378144, "step": 15100 }, { "epoch": 0.7321445876538424, "grad_norm": 0.39898785948753357, "learning_rate": 8.832672754363066e-06, "loss": 1.1088, "num_input_tokens_seen": 593782856, "step": 15110 }, { "epoch": 0.7326291307297219, "grad_norm": 0.3919895887374878, "learning_rate": 8.802766475068066e-06, "loss": 1.0846, "num_input_tokens_seen": 594160824, "step": 15120 }, { "epoch": 0.7331136738056013, "grad_norm": 0.4317933917045593, "learning_rate": 8.77290008899657e-06, "loss": 1.0992, "num_input_tokens_seen": 594564940, "step": 15130 }, { "epoch": 0.7335982168814807, "grad_norm": 0.4144594073295593, "learning_rate": 8.743073669708454e-06, "loss": 1.1512, "num_input_tokens_seen": 594969124, "step": 15140 }, { "epoch": 0.7340827599573602, "grad_norm": 0.41548287868499756, "learning_rate": 8.713287290665139e-06, "loss": 1.109, "num_input_tokens_seen": 595337996, "step": 15150 }, { "epoch": 0.7345673030332397, "grad_norm": 0.396049827337265, "learning_rate": 8.68354102522945e-06, "loss": 1.1377, "num_input_tokens_seen": 595759896, "step": 15160 }, { "epoch": 0.7350518461091191, "grad_norm": 0.3859698176383972, "learning_rate": 8.6538349466654e-06, "loss": 1.1388, "num_input_tokens_seen": 596196968, "step": 15170 }, { "epoch": 0.7355363891849985, "grad_norm": 0.4152047634124756, "learning_rate": 8.624169128138038e-06, "loss": 1.1374, "num_input_tokens_seen": 596608088, "step": 15180 }, { "epoch": 0.736020932260878, "grad_norm": 0.3938763439655304, "learning_rate": 8.594543642713245e-06, "loss": 1.1061, "num_input_tokens_seen": 596999532, "step": 15190 }, { "epoch": 0.7365054753367575, "grad_norm": 0.3672252595424652, "learning_rate": 8.564958563357543e-06, "loss": 1.1148, "num_input_tokens_seen": 597383556, "step": 15200 }, { "epoch": 0.7369900184126369, "grad_norm": 0.4167373478412628, "learning_rate": 8.535413962937983e-06, "loss": 1.1183, "num_input_tokens_seen": 597766768, "step": 15210 }, { "epoch": 0.7374745614885163, "grad_norm": 0.395749568939209, "learning_rate": 8.50590991422186e-06, "loss": 1.1175, "num_input_tokens_seen": 598156584, "step": 15220 }, { "epoch": 0.7379591045643957, "grad_norm": 0.4062468409538269, "learning_rate": 8.476446489876651e-06, "loss": 1.1551, "num_input_tokens_seen": 598562200, "step": 15230 }, { "epoch": 0.7384436476402753, "grad_norm": 0.3954980969429016, "learning_rate": 8.447023762469725e-06, "loss": 1.1301, "num_input_tokens_seen": 598961604, "step": 15240 }, { "epoch": 0.7389281907161547, "grad_norm": 0.39155393838882446, "learning_rate": 8.417641804468243e-06, "loss": 1.1221, "num_input_tokens_seen": 599354072, "step": 15250 }, { "epoch": 0.7394127337920341, "grad_norm": 0.39971819519996643, "learning_rate": 8.388300688238951e-06, "loss": 1.127, "num_input_tokens_seen": 599752608, "step": 15260 }, { "epoch": 0.7398972768679135, "grad_norm": 0.4002969563007355, "learning_rate": 8.359000486047994e-06, "loss": 1.1052, "num_input_tokens_seen": 600141532, "step": 15270 }, { "epoch": 0.740381819943793, "grad_norm": 0.4160708487033844, "learning_rate": 8.329741270060754e-06, "loss": 1.0865, "num_input_tokens_seen": 600526872, "step": 15280 }, { "epoch": 0.7408663630196725, "grad_norm": 0.41618549823760986, "learning_rate": 8.300523112341674e-06, "loss": 1.1429, "num_input_tokens_seen": 600899788, "step": 15290 }, { "epoch": 0.7413509060955519, "grad_norm": 0.4318315386772156, "learning_rate": 8.271346084854042e-06, "loss": 1.0822, "num_input_tokens_seen": 601307316, "step": 15300 }, { "epoch": 0.7418354491714313, "grad_norm": 0.38331708312034607, "learning_rate": 8.242210259459872e-06, "loss": 1.1082, "num_input_tokens_seen": 601712016, "step": 15310 }, { "epoch": 0.7423199922473108, "grad_norm": 0.3820512294769287, "learning_rate": 8.213115707919692e-06, "loss": 1.1066, "num_input_tokens_seen": 602117976, "step": 15320 }, { "epoch": 0.7428045353231902, "grad_norm": 0.385928750038147, "learning_rate": 8.18406250189237e-06, "loss": 1.1274, "num_input_tokens_seen": 602497772, "step": 15330 }, { "epoch": 0.7432890783990697, "grad_norm": 0.40160906314849854, "learning_rate": 8.15505071293495e-06, "loss": 1.1348, "num_input_tokens_seen": 602904540, "step": 15340 }, { "epoch": 0.7437736214749491, "grad_norm": 0.3810666501522064, "learning_rate": 8.126080412502437e-06, "loss": 1.1229, "num_input_tokens_seen": 603301296, "step": 15350 }, { "epoch": 0.7442581645508286, "grad_norm": 0.37731239199638367, "learning_rate": 8.097151671947709e-06, "loss": 1.1554, "num_input_tokens_seen": 603671376, "step": 15360 }, { "epoch": 0.744742707626708, "grad_norm": 0.3665080666542053, "learning_rate": 8.068264562521221e-06, "loss": 1.1406, "num_input_tokens_seen": 604069780, "step": 15370 }, { "epoch": 0.7452272507025874, "grad_norm": 0.37638604640960693, "learning_rate": 8.039419155370933e-06, "loss": 1.2152, "num_input_tokens_seen": 604460932, "step": 15380 }, { "epoch": 0.7457117937784669, "grad_norm": 0.4055761396884918, "learning_rate": 8.010615521542077e-06, "loss": 1.075, "num_input_tokens_seen": 604849832, "step": 15390 }, { "epoch": 0.7461963368543464, "grad_norm": 0.41014060378074646, "learning_rate": 7.981853731977005e-06, "loss": 1.0986, "num_input_tokens_seen": 605250684, "step": 15400 }, { "epoch": 0.7466808799302258, "grad_norm": 0.44504401087760925, "learning_rate": 7.953133857514999e-06, "loss": 1.1245, "num_input_tokens_seen": 605657060, "step": 15410 }, { "epoch": 0.7471654230061052, "grad_norm": 0.3899877071380615, "learning_rate": 7.92445596889213e-06, "loss": 1.096, "num_input_tokens_seen": 606046780, "step": 15420 }, { "epoch": 0.7476499660819846, "grad_norm": 0.40705353021621704, "learning_rate": 7.89582013674101e-06, "loss": 1.1188, "num_input_tokens_seen": 606446820, "step": 15430 }, { "epoch": 0.7481345091578642, "grad_norm": 0.3865094482898712, "learning_rate": 7.867226431590735e-06, "loss": 1.1454, "num_input_tokens_seen": 606828080, "step": 15440 }, { "epoch": 0.7486190522337436, "grad_norm": 0.42546987533569336, "learning_rate": 7.838674923866585e-06, "loss": 1.0962, "num_input_tokens_seen": 607213936, "step": 15450 }, { "epoch": 0.749103595309623, "grad_norm": 0.39779505133628845, "learning_rate": 7.81016568388993e-06, "loss": 1.1534, "num_input_tokens_seen": 607602128, "step": 15460 }, { "epoch": 0.7495881383855024, "grad_norm": 0.43804842233657837, "learning_rate": 7.78169878187805e-06, "loss": 1.1426, "num_input_tokens_seen": 607980840, "step": 15470 }, { "epoch": 0.750072681461382, "grad_norm": 0.42345863580703735, "learning_rate": 7.753274287943927e-06, "loss": 1.1451, "num_input_tokens_seen": 608389256, "step": 15480 }, { "epoch": 0.7505572245372614, "grad_norm": 0.4104250371456146, "learning_rate": 7.724892272096115e-06, "loss": 1.1078, "num_input_tokens_seen": 608763772, "step": 15490 }, { "epoch": 0.7510417676131408, "grad_norm": 0.37295570969581604, "learning_rate": 7.696552804238514e-06, "loss": 1.1495, "num_input_tokens_seen": 609154616, "step": 15500 }, { "epoch": 0.7515263106890202, "grad_norm": 0.42971867322921753, "learning_rate": 7.668255954170258e-06, "loss": 1.1204, "num_input_tokens_seen": 609563980, "step": 15510 }, { "epoch": 0.7520108537648997, "grad_norm": 0.3849759101867676, "learning_rate": 7.640001791585507e-06, "loss": 1.1601, "num_input_tokens_seen": 609984756, "step": 15520 }, { "epoch": 0.7524953968407792, "grad_norm": 0.41971349716186523, "learning_rate": 7.611790386073281e-06, "loss": 1.1537, "num_input_tokens_seen": 610384836, "step": 15530 }, { "epoch": 0.7529799399166586, "grad_norm": 0.3679080903530121, "learning_rate": 7.583621807117288e-06, "loss": 1.1229, "num_input_tokens_seen": 610787156, "step": 15540 }, { "epoch": 0.753464482992538, "grad_norm": 0.3937299847602844, "learning_rate": 7.555496124095776e-06, "loss": 1.1227, "num_input_tokens_seen": 611161580, "step": 15550 }, { "epoch": 0.7539490260684175, "grad_norm": 0.4039948284626007, "learning_rate": 7.527413406281294e-06, "loss": 1.163, "num_input_tokens_seen": 611556308, "step": 15560 }, { "epoch": 0.7544335691442969, "grad_norm": 0.38725948333740234, "learning_rate": 7.499373722840636e-06, "loss": 1.1422, "num_input_tokens_seen": 611967848, "step": 15570 }, { "epoch": 0.7549181122201764, "grad_norm": 0.3972112536430359, "learning_rate": 7.471377142834532e-06, "loss": 1.102, "num_input_tokens_seen": 612364408, "step": 15580 }, { "epoch": 0.7554026552960558, "grad_norm": 0.39917463064193726, "learning_rate": 7.443423735217622e-06, "loss": 1.136, "num_input_tokens_seen": 612775300, "step": 15590 }, { "epoch": 0.7558871983719353, "grad_norm": 0.39816567301750183, "learning_rate": 7.415513568838153e-06, "loss": 1.1112, "num_input_tokens_seen": 613151088, "step": 15600 }, { "epoch": 0.7563717414478147, "grad_norm": 0.4336942136287689, "learning_rate": 7.387646712437904e-06, "loss": 1.1323, "num_input_tokens_seen": 613546524, "step": 15610 }, { "epoch": 0.7568562845236941, "grad_norm": 0.4093795716762543, "learning_rate": 7.35982323465198e-06, "loss": 1.081, "num_input_tokens_seen": 613944468, "step": 15620 }, { "epoch": 0.7573408275995736, "grad_norm": 0.3967530131340027, "learning_rate": 7.332043204008621e-06, "loss": 1.2024, "num_input_tokens_seen": 614316784, "step": 15630 }, { "epoch": 0.7578253706754531, "grad_norm": 0.3960801362991333, "learning_rate": 7.304306688929102e-06, "loss": 1.1074, "num_input_tokens_seen": 614744716, "step": 15640 }, { "epoch": 0.7583099137513325, "grad_norm": 0.40353167057037354, "learning_rate": 7.2766137577274765e-06, "loss": 1.1534, "num_input_tokens_seen": 615162168, "step": 15650 }, { "epoch": 0.7587944568272119, "grad_norm": 0.4028729200363159, "learning_rate": 7.248964478610482e-06, "loss": 1.1388, "num_input_tokens_seen": 615552592, "step": 15660 }, { "epoch": 0.7592789999030913, "grad_norm": 0.4119277596473694, "learning_rate": 7.221358919677329e-06, "loss": 1.1401, "num_input_tokens_seen": 615985000, "step": 15670 }, { "epoch": 0.7597635429789709, "grad_norm": 0.4111025035381317, "learning_rate": 7.193797148919557e-06, "loss": 1.1402, "num_input_tokens_seen": 616372156, "step": 15680 }, { "epoch": 0.7602480860548503, "grad_norm": 0.3870091438293457, "learning_rate": 7.166279234220829e-06, "loss": 1.1222, "num_input_tokens_seen": 616794564, "step": 15690 }, { "epoch": 0.7607326291307297, "grad_norm": 0.4028170704841614, "learning_rate": 7.138805243356847e-06, "loss": 1.1556, "num_input_tokens_seen": 617166456, "step": 15700 }, { "epoch": 0.7612171722066091, "grad_norm": 0.4021094739437103, "learning_rate": 7.111375243995058e-06, "loss": 1.1548, "num_input_tokens_seen": 617553256, "step": 15710 }, { "epoch": 0.7617017152824886, "grad_norm": 0.4055640399456024, "learning_rate": 7.083989303694635e-06, "loss": 1.1371, "num_input_tokens_seen": 617951232, "step": 15720 }, { "epoch": 0.7621862583583681, "grad_norm": 0.3847959339618683, "learning_rate": 7.05664748990617e-06, "loss": 1.1114, "num_input_tokens_seen": 618343460, "step": 15730 }, { "epoch": 0.7626708014342475, "grad_norm": 0.38287290930747986, "learning_rate": 7.0293498699716105e-06, "loss": 1.1387, "num_input_tokens_seen": 618773472, "step": 15740 }, { "epoch": 0.7631553445101269, "grad_norm": 0.41064536571502686, "learning_rate": 7.0020965111240454e-06, "loss": 1.1396, "num_input_tokens_seen": 619175808, "step": 15750 }, { "epoch": 0.7636398875860064, "grad_norm": 0.42379313707351685, "learning_rate": 6.9748874804875516e-06, "loss": 1.1246, "num_input_tokens_seen": 619583744, "step": 15760 }, { "epoch": 0.7641244306618858, "grad_norm": 0.4114890396595001, "learning_rate": 6.947722845077032e-06, "loss": 1.1252, "num_input_tokens_seen": 619983760, "step": 15770 }, { "epoch": 0.7646089737377653, "grad_norm": 0.4021173119544983, "learning_rate": 6.9206026717980265e-06, "loss": 1.1981, "num_input_tokens_seen": 620365344, "step": 15780 }, { "epoch": 0.7650935168136447, "grad_norm": 0.42168235778808594, "learning_rate": 6.893527027446589e-06, "loss": 1.0703, "num_input_tokens_seen": 620746756, "step": 15790 }, { "epoch": 0.7655780598895242, "grad_norm": 0.38521257042884827, "learning_rate": 6.866495978709087e-06, "loss": 1.1277, "num_input_tokens_seen": 621139404, "step": 15800 }, { "epoch": 0.7660626029654036, "grad_norm": 0.4103443920612335, "learning_rate": 6.839509592162055e-06, "loss": 1.1429, "num_input_tokens_seen": 621564096, "step": 15810 }, { "epoch": 0.766547146041283, "grad_norm": 0.36137986183166504, "learning_rate": 6.8125679342720294e-06, "loss": 1.1534, "num_input_tokens_seen": 621947492, "step": 15820 }, { "epoch": 0.7670316891171625, "grad_norm": 0.4212779104709625, "learning_rate": 6.7856710713953805e-06, "loss": 1.1169, "num_input_tokens_seen": 622366700, "step": 15830 }, { "epoch": 0.767516232193042, "grad_norm": 0.4411621689796448, "learning_rate": 6.75881906977813e-06, "loss": 1.1259, "num_input_tokens_seen": 622768796, "step": 15840 }, { "epoch": 0.7680007752689214, "grad_norm": 0.39934778213500977, "learning_rate": 6.732011995555851e-06, "loss": 1.1114, "num_input_tokens_seen": 623147136, "step": 15850 }, { "epoch": 0.7684853183448008, "grad_norm": 0.43150418996810913, "learning_rate": 6.705249914753414e-06, "loss": 1.1407, "num_input_tokens_seen": 623483104, "step": 15860 }, { "epoch": 0.7689698614206802, "grad_norm": 0.3736652731895447, "learning_rate": 6.6785328932849e-06, "loss": 1.1541, "num_input_tokens_seen": 623845764, "step": 15870 }, { "epoch": 0.7694544044965598, "grad_norm": 0.436292439699173, "learning_rate": 6.65186099695341e-06, "loss": 1.1381, "num_input_tokens_seen": 624231876, "step": 15880 }, { "epoch": 0.7699389475724392, "grad_norm": 0.44637537002563477, "learning_rate": 6.6252342914508965e-06, "loss": 1.1727, "num_input_tokens_seen": 624626492, "step": 15890 }, { "epoch": 0.7704234906483186, "grad_norm": 0.4382629990577698, "learning_rate": 6.598652842358008e-06, "loss": 1.0927, "num_input_tokens_seen": 625027572, "step": 15900 }, { "epoch": 0.770908033724198, "grad_norm": 0.37075766921043396, "learning_rate": 6.572116715143939e-06, "loss": 1.1196, "num_input_tokens_seen": 625436036, "step": 15910 }, { "epoch": 0.7713925768000776, "grad_norm": 0.3835522532463074, "learning_rate": 6.545625975166231e-06, "loss": 1.1169, "num_input_tokens_seen": 625817956, "step": 15920 }, { "epoch": 0.771877119875957, "grad_norm": 0.4104726016521454, "learning_rate": 6.51918068767067e-06, "loss": 1.1782, "num_input_tokens_seen": 626184948, "step": 15930 }, { "epoch": 0.7723616629518364, "grad_norm": 0.4460158944129944, "learning_rate": 6.492780917791075e-06, "loss": 1.1137, "num_input_tokens_seen": 626596420, "step": 15940 }, { "epoch": 0.7728462060277158, "grad_norm": 0.4014306664466858, "learning_rate": 6.466426730549166e-06, "loss": 1.154, "num_input_tokens_seen": 627007456, "step": 15950 }, { "epoch": 0.7733307491035953, "grad_norm": 0.3667517304420471, "learning_rate": 6.440118190854394e-06, "loss": 1.1304, "num_input_tokens_seen": 627390576, "step": 15960 }, { "epoch": 0.7738152921794748, "grad_norm": 0.39722877740859985, "learning_rate": 6.41385536350376e-06, "loss": 1.1548, "num_input_tokens_seen": 627755000, "step": 15970 }, { "epoch": 0.7742998352553542, "grad_norm": 0.41055047512054443, "learning_rate": 6.387638313181721e-06, "loss": 1.1473, "num_input_tokens_seen": 628145832, "step": 15980 }, { "epoch": 0.7747843783312336, "grad_norm": 0.4256828725337982, "learning_rate": 6.3614671044599364e-06, "loss": 1.1001, "num_input_tokens_seen": 628544212, "step": 15990 }, { "epoch": 0.7752689214071131, "grad_norm": 0.4094095230102539, "learning_rate": 6.335341801797209e-06, "loss": 1.1743, "num_input_tokens_seen": 628937800, "step": 16000 }, { "epoch": 0.7752689214071131, "eval_loss": 1.1244345903396606, "eval_runtime": 5.2554, "eval_samples_per_second": 28.542, "eval_steps_per_second": 3.615, "num_input_tokens_seen": 628937800, "step": 16000 }, { "epoch": 0.7757534644829925, "grad_norm": 0.44040119647979736, "learning_rate": 6.309262469539235e-06, "loss": 1.0965, "num_input_tokens_seen": 629321000, "step": 16010 }, { "epoch": 0.776238007558872, "grad_norm": 0.3980225920677185, "learning_rate": 6.283229171918506e-06, "loss": 1.1363, "num_input_tokens_seen": 629740836, "step": 16020 }, { "epoch": 0.7767225506347514, "grad_norm": 0.4174353778362274, "learning_rate": 6.257241973054132e-06, "loss": 1.1164, "num_input_tokens_seen": 630136212, "step": 16030 }, { "epoch": 0.7772070937106309, "grad_norm": 0.44722503423690796, "learning_rate": 6.231300936951686e-06, "loss": 1.1585, "num_input_tokens_seen": 630539300, "step": 16040 }, { "epoch": 0.7776916367865103, "grad_norm": 0.41615843772888184, "learning_rate": 6.205406127503021e-06, "loss": 1.1508, "num_input_tokens_seen": 630936060, "step": 16050 }, { "epoch": 0.7781761798623897, "grad_norm": 0.39735278487205505, "learning_rate": 6.1795576084861804e-06, "loss": 1.0761, "num_input_tokens_seen": 631334484, "step": 16060 }, { "epoch": 0.7786607229382693, "grad_norm": 0.4064568281173706, "learning_rate": 6.153755443565146e-06, "loss": 1.1212, "num_input_tokens_seen": 631740948, "step": 16070 }, { "epoch": 0.7791452660141487, "grad_norm": 0.4038830101490021, "learning_rate": 6.12799969628976e-06, "loss": 1.1316, "num_input_tokens_seen": 632141196, "step": 16080 }, { "epoch": 0.7796298090900281, "grad_norm": 0.41059982776641846, "learning_rate": 6.102290430095536e-06, "loss": 1.1087, "num_input_tokens_seen": 632532608, "step": 16090 }, { "epoch": 0.7801143521659075, "grad_norm": 0.40716275572776794, "learning_rate": 6.0766277083035035e-06, "loss": 1.1119, "num_input_tokens_seen": 632916136, "step": 16100 }, { "epoch": 0.780598895241787, "grad_norm": 0.39503198862075806, "learning_rate": 6.05101159412006e-06, "loss": 1.1406, "num_input_tokens_seen": 633327552, "step": 16110 }, { "epoch": 0.7810834383176665, "grad_norm": 0.3686826229095459, "learning_rate": 6.025442150636781e-06, "loss": 1.0964, "num_input_tokens_seen": 633698396, "step": 16120 }, { "epoch": 0.7815679813935459, "grad_norm": 0.4384057819843292, "learning_rate": 5.999919440830354e-06, "loss": 1.1353, "num_input_tokens_seen": 634084992, "step": 16130 }, { "epoch": 0.7820525244694253, "grad_norm": 0.4173773527145386, "learning_rate": 5.974443527562296e-06, "loss": 1.0915, "num_input_tokens_seen": 634487672, "step": 16140 }, { "epoch": 0.7825370675453048, "grad_norm": 0.4255557358264923, "learning_rate": 5.949014473578909e-06, "loss": 1.133, "num_input_tokens_seen": 634851792, "step": 16150 }, { "epoch": 0.7830216106211842, "grad_norm": 0.40748724341392517, "learning_rate": 5.923632341511071e-06, "loss": 1.156, "num_input_tokens_seen": 635237128, "step": 16160 }, { "epoch": 0.7835061536970637, "grad_norm": 0.38821229338645935, "learning_rate": 5.898297193874086e-06, "loss": 1.1218, "num_input_tokens_seen": 635617944, "step": 16170 }, { "epoch": 0.7839906967729431, "grad_norm": 0.39625513553619385, "learning_rate": 5.873009093067547e-06, "loss": 1.1036, "num_input_tokens_seen": 636028208, "step": 16180 }, { "epoch": 0.7844752398488226, "grad_norm": 0.4132024943828583, "learning_rate": 5.8477681013751724e-06, "loss": 1.1117, "num_input_tokens_seen": 636400024, "step": 16190 }, { "epoch": 0.784959782924702, "grad_norm": 0.4017508029937744, "learning_rate": 5.822574280964637e-06, "loss": 1.1399, "num_input_tokens_seen": 636803444, "step": 16200 }, { "epoch": 0.7854443260005814, "grad_norm": 0.3757273852825165, "learning_rate": 5.79742769388745e-06, "loss": 1.1278, "num_input_tokens_seen": 637208612, "step": 16210 }, { "epoch": 0.7859288690764609, "grad_norm": 0.4408009648323059, "learning_rate": 5.77232840207878e-06, "loss": 1.1813, "num_input_tokens_seen": 637569988, "step": 16220 }, { "epoch": 0.7864134121523404, "grad_norm": 0.4416961371898651, "learning_rate": 5.747276467357313e-06, "loss": 1.0945, "num_input_tokens_seen": 637969128, "step": 16230 }, { "epoch": 0.7868979552282198, "grad_norm": 0.3773948848247528, "learning_rate": 5.722271951425101e-06, "loss": 1.097, "num_input_tokens_seen": 638377296, "step": 16240 }, { "epoch": 0.7873824983040992, "grad_norm": 0.4033990502357483, "learning_rate": 5.6973149158673775e-06, "loss": 1.1328, "num_input_tokens_seen": 638763704, "step": 16250 }, { "epoch": 0.7878670413799786, "grad_norm": 0.40168893337249756, "learning_rate": 5.6724054221524825e-06, "loss": 1.1358, "num_input_tokens_seen": 639184012, "step": 16260 }, { "epoch": 0.7883515844558582, "grad_norm": 0.4118176996707916, "learning_rate": 5.647543531631611e-06, "loss": 1.152, "num_input_tokens_seen": 639586888, "step": 16270 }, { "epoch": 0.7888361275317376, "grad_norm": 0.4122507572174072, "learning_rate": 5.62272930553874e-06, "loss": 1.1098, "num_input_tokens_seen": 640002384, "step": 16280 }, { "epoch": 0.789320670607617, "grad_norm": 0.42295822501182556, "learning_rate": 5.597962804990453e-06, "loss": 1.1196, "num_input_tokens_seen": 640390420, "step": 16290 }, { "epoch": 0.7898052136834964, "grad_norm": 0.37593498826026917, "learning_rate": 5.573244090985777e-06, "loss": 1.0817, "num_input_tokens_seen": 640735112, "step": 16300 }, { "epoch": 0.790289756759376, "grad_norm": 0.40528619289398193, "learning_rate": 5.548573224406045e-06, "loss": 1.163, "num_input_tokens_seen": 641116036, "step": 16310 }, { "epoch": 0.7907742998352554, "grad_norm": 0.4023245573043823, "learning_rate": 5.523950266014754e-06, "loss": 1.1551, "num_input_tokens_seen": 641501348, "step": 16320 }, { "epoch": 0.7912588429111348, "grad_norm": 0.3965412676334381, "learning_rate": 5.499375276457372e-06, "loss": 1.0989, "num_input_tokens_seen": 641919160, "step": 16330 }, { "epoch": 0.7917433859870142, "grad_norm": 0.4392828643321991, "learning_rate": 5.4748483162612716e-06, "loss": 1.1252, "num_input_tokens_seen": 642318860, "step": 16340 }, { "epoch": 0.7922279290628937, "grad_norm": 0.38923805952072144, "learning_rate": 5.450369445835485e-06, "loss": 1.1219, "num_input_tokens_seen": 642734704, "step": 16350 }, { "epoch": 0.7927124721387732, "grad_norm": 0.42891883850097656, "learning_rate": 5.425938725470628e-06, "loss": 1.1488, "num_input_tokens_seen": 643108016, "step": 16360 }, { "epoch": 0.7931970152146526, "grad_norm": 0.42537081241607666, "learning_rate": 5.4015562153387126e-06, "loss": 1.1634, "num_input_tokens_seen": 643481944, "step": 16370 }, { "epoch": 0.793681558290532, "grad_norm": 0.39477813243865967, "learning_rate": 5.377221975493016e-06, "loss": 1.091, "num_input_tokens_seen": 643855924, "step": 16380 }, { "epoch": 0.7941661013664115, "grad_norm": 0.4207918345928192, "learning_rate": 5.352936065867931e-06, "loss": 1.0861, "num_input_tokens_seen": 644257816, "step": 16390 }, { "epoch": 0.7946506444422909, "grad_norm": 0.44012096524238586, "learning_rate": 5.328698546278798e-06, "loss": 1.1237, "num_input_tokens_seen": 644645828, "step": 16400 }, { "epoch": 0.7951351875181704, "grad_norm": 0.4076230525970459, "learning_rate": 5.30450947642179e-06, "loss": 1.1243, "num_input_tokens_seen": 645029716, "step": 16410 }, { "epoch": 0.7956197305940498, "grad_norm": 0.3962908983230591, "learning_rate": 5.280368915873749e-06, "loss": 1.149, "num_input_tokens_seen": 645399720, "step": 16420 }, { "epoch": 0.7961042736699293, "grad_norm": 0.4037371575832367, "learning_rate": 5.256276924092035e-06, "loss": 1.1561, "num_input_tokens_seen": 645804128, "step": 16430 }, { "epoch": 0.7965888167458087, "grad_norm": 0.4168999493122101, "learning_rate": 5.232233560414387e-06, "loss": 1.1377, "num_input_tokens_seen": 646212012, "step": 16440 }, { "epoch": 0.7970733598216881, "grad_norm": 0.40669918060302734, "learning_rate": 5.208238884058783e-06, "loss": 1.1135, "num_input_tokens_seen": 646620356, "step": 16450 }, { "epoch": 0.7975579028975676, "grad_norm": 0.3902994692325592, "learning_rate": 5.1842929541232544e-06, "loss": 1.1916, "num_input_tokens_seen": 646994340, "step": 16460 }, { "epoch": 0.7980424459734471, "grad_norm": 0.451654314994812, "learning_rate": 5.1603958295858205e-06, "loss": 1.1252, "num_input_tokens_seen": 647370752, "step": 16470 }, { "epoch": 0.7985269890493265, "grad_norm": 0.39699918031692505, "learning_rate": 5.136547569304246e-06, "loss": 1.1475, "num_input_tokens_seen": 647750888, "step": 16480 }, { "epoch": 0.7990115321252059, "grad_norm": 0.37806710600852966, "learning_rate": 5.112748232015993e-06, "loss": 1.1349, "num_input_tokens_seen": 648103956, "step": 16490 }, { "epoch": 0.7994960752010853, "grad_norm": 0.38627612590789795, "learning_rate": 5.088997876337981e-06, "loss": 1.1531, "num_input_tokens_seen": 648508460, "step": 16500 }, { "epoch": 0.7999806182769649, "grad_norm": 0.414298951625824, "learning_rate": 5.065296560766522e-06, "loss": 1.1369, "num_input_tokens_seen": 648868944, "step": 16510 }, { "epoch": 0.8004651613528443, "grad_norm": 0.38814184069633484, "learning_rate": 5.041644343677126e-06, "loss": 1.1623, "num_input_tokens_seen": 649301060, "step": 16520 }, { "epoch": 0.8009497044287237, "grad_norm": 0.380260169506073, "learning_rate": 5.018041283324387e-06, "loss": 1.142, "num_input_tokens_seen": 649700576, "step": 16530 }, { "epoch": 0.8014342475046031, "grad_norm": 0.406630277633667, "learning_rate": 4.994487437841827e-06, "loss": 1.1287, "num_input_tokens_seen": 650094908, "step": 16540 }, { "epoch": 0.8019187905804827, "grad_norm": 0.4198836088180542, "learning_rate": 4.9709828652417385e-06, "loss": 1.1155, "num_input_tokens_seen": 650509396, "step": 16550 }, { "epoch": 0.8024033336563621, "grad_norm": 0.3846551775932312, "learning_rate": 4.947527623415071e-06, "loss": 1.084, "num_input_tokens_seen": 650893528, "step": 16560 }, { "epoch": 0.8028878767322415, "grad_norm": 0.36685115098953247, "learning_rate": 4.924121770131274e-06, "loss": 1.1465, "num_input_tokens_seen": 651297116, "step": 16570 }, { "epoch": 0.8033724198081209, "grad_norm": 0.4003463387489319, "learning_rate": 4.900765363038151e-06, "loss": 1.1613, "num_input_tokens_seen": 651677048, "step": 16580 }, { "epoch": 0.8038569628840004, "grad_norm": 0.4008733928203583, "learning_rate": 4.87745845966171e-06, "loss": 1.1023, "num_input_tokens_seen": 652077724, "step": 16590 }, { "epoch": 0.8043415059598799, "grad_norm": 0.40616297721862793, "learning_rate": 4.854201117406066e-06, "loss": 1.1272, "num_input_tokens_seen": 652466264, "step": 16600 }, { "epoch": 0.8048260490357593, "grad_norm": 0.41137826442718506, "learning_rate": 4.830993393553215e-06, "loss": 1.1, "num_input_tokens_seen": 652835720, "step": 16610 }, { "epoch": 0.8053105921116387, "grad_norm": 0.44767943024635315, "learning_rate": 4.807835345263009e-06, "loss": 1.1096, "num_input_tokens_seen": 653218356, "step": 16620 }, { "epoch": 0.8057951351875182, "grad_norm": 0.38211873173713684, "learning_rate": 4.784727029572894e-06, "loss": 1.1586, "num_input_tokens_seen": 653630732, "step": 16630 }, { "epoch": 0.8062796782633976, "grad_norm": 0.4150942862033844, "learning_rate": 4.761668503397851e-06, "loss": 1.1344, "num_input_tokens_seen": 654032908, "step": 16640 }, { "epoch": 0.806764221339277, "grad_norm": 0.3996165096759796, "learning_rate": 4.738659823530236e-06, "loss": 1.1124, "num_input_tokens_seen": 654438804, "step": 16650 }, { "epoch": 0.8072487644151565, "grad_norm": 0.3866037428379059, "learning_rate": 4.715701046639623e-06, "loss": 1.1109, "num_input_tokens_seen": 654823000, "step": 16660 }, { "epoch": 0.807733307491036, "grad_norm": 0.4089403450489044, "learning_rate": 4.6927922292726886e-06, "loss": 1.1196, "num_input_tokens_seen": 655212888, "step": 16670 }, { "epoch": 0.8082178505669154, "grad_norm": 0.43001115322113037, "learning_rate": 4.669933427853043e-06, "loss": 1.0934, "num_input_tokens_seen": 655622528, "step": 16680 }, { "epoch": 0.8087023936427948, "grad_norm": 0.40845033526420593, "learning_rate": 4.647124698681127e-06, "loss": 1.1133, "num_input_tokens_seen": 656007064, "step": 16690 }, { "epoch": 0.8091869367186743, "grad_norm": 0.38767552375793457, "learning_rate": 4.624366097934046e-06, "loss": 1.1088, "num_input_tokens_seen": 656399380, "step": 16700 }, { "epoch": 0.8096714797945538, "grad_norm": 0.39583420753479004, "learning_rate": 4.60165768166545e-06, "loss": 1.1914, "num_input_tokens_seen": 656798972, "step": 16710 }, { "epoch": 0.8101560228704332, "grad_norm": 0.4240066707134247, "learning_rate": 4.578999505805362e-06, "loss": 1.1416, "num_input_tokens_seen": 657198652, "step": 16720 }, { "epoch": 0.8106405659463126, "grad_norm": 0.3843323290348053, "learning_rate": 4.556391626160109e-06, "loss": 1.1275, "num_input_tokens_seen": 657637540, "step": 16730 }, { "epoch": 0.811125109022192, "grad_norm": 0.41490888595581055, "learning_rate": 4.533834098412082e-06, "loss": 1.154, "num_input_tokens_seen": 658012976, "step": 16740 }, { "epoch": 0.8116096520980716, "grad_norm": 0.4508754014968872, "learning_rate": 4.511326978119718e-06, "loss": 1.1442, "num_input_tokens_seen": 658403516, "step": 16750 }, { "epoch": 0.812094195173951, "grad_norm": 0.3963550925254822, "learning_rate": 4.488870320717251e-06, "loss": 1.1307, "num_input_tokens_seen": 658799828, "step": 16760 }, { "epoch": 0.8125787382498304, "grad_norm": 0.4240289032459259, "learning_rate": 4.466464181514657e-06, "loss": 1.1333, "num_input_tokens_seen": 659188772, "step": 16770 }, { "epoch": 0.8130632813257098, "grad_norm": 0.4394795596599579, "learning_rate": 4.444108615697476e-06, "loss": 1.1301, "num_input_tokens_seen": 659605584, "step": 16780 }, { "epoch": 0.8135478244015893, "grad_norm": 0.391284704208374, "learning_rate": 4.421803678326691e-06, "loss": 1.0913, "num_input_tokens_seen": 660023780, "step": 16790 }, { "epoch": 0.8140323674774688, "grad_norm": 0.37516680359840393, "learning_rate": 4.399549424338589e-06, "loss": 1.0831, "num_input_tokens_seen": 660418884, "step": 16800 }, { "epoch": 0.8145169105533482, "grad_norm": 0.415872186422348, "learning_rate": 4.3773459085446275e-06, "loss": 1.1123, "num_input_tokens_seen": 660814748, "step": 16810 }, { "epoch": 0.8150014536292276, "grad_norm": 0.4023378789424896, "learning_rate": 4.355193185631287e-06, "loss": 1.087, "num_input_tokens_seen": 661192768, "step": 16820 }, { "epoch": 0.8154859967051071, "grad_norm": 0.3767937421798706, "learning_rate": 4.333091310159956e-06, "loss": 1.1545, "num_input_tokens_seen": 661606668, "step": 16830 }, { "epoch": 0.8159705397809865, "grad_norm": 0.5043548941612244, "learning_rate": 4.311040336566791e-06, "loss": 1.1445, "num_input_tokens_seen": 661982448, "step": 16840 }, { "epoch": 0.816455082856866, "grad_norm": 0.3812048137187958, "learning_rate": 4.289040319162569e-06, "loss": 1.136, "num_input_tokens_seen": 662373168, "step": 16850 }, { "epoch": 0.8169396259327454, "grad_norm": 0.40610408782958984, "learning_rate": 4.267091312132576e-06, "loss": 1.1151, "num_input_tokens_seen": 662758704, "step": 16860 }, { "epoch": 0.8174241690086249, "grad_norm": 0.4081581234931946, "learning_rate": 4.245193369536437e-06, "loss": 1.1532, "num_input_tokens_seen": 663150068, "step": 16870 }, { "epoch": 0.8179087120845043, "grad_norm": 0.41858088970184326, "learning_rate": 4.2233465453080486e-06, "loss": 1.1054, "num_input_tokens_seen": 663562480, "step": 16880 }, { "epoch": 0.8183932551603837, "grad_norm": 0.45081600546836853, "learning_rate": 4.20155089325536e-06, "loss": 1.128, "num_input_tokens_seen": 663941916, "step": 16890 }, { "epoch": 0.8188777982362632, "grad_norm": 0.41167008876800537, "learning_rate": 4.17980646706031e-06, "loss": 1.1354, "num_input_tokens_seen": 664345336, "step": 16900 }, { "epoch": 0.8193623413121427, "grad_norm": 0.39505717158317566, "learning_rate": 4.158113320278667e-06, "loss": 1.084, "num_input_tokens_seen": 664762920, "step": 16910 }, { "epoch": 0.8198468843880221, "grad_norm": 0.3788187503814697, "learning_rate": 4.136471506339901e-06, "loss": 1.1064, "num_input_tokens_seen": 665156684, "step": 16920 }, { "epoch": 0.8203314274639015, "grad_norm": 0.43902385234832764, "learning_rate": 4.114881078547042e-06, "loss": 1.1317, "num_input_tokens_seen": 665557436, "step": 16930 }, { "epoch": 0.820815970539781, "grad_norm": 0.3830069303512573, "learning_rate": 4.093342090076571e-06, "loss": 1.127, "num_input_tokens_seen": 665945288, "step": 16940 }, { "epoch": 0.8213005136156605, "grad_norm": 0.4254535734653473, "learning_rate": 4.071854593978253e-06, "loss": 1.1208, "num_input_tokens_seen": 666344140, "step": 16950 }, { "epoch": 0.8217850566915399, "grad_norm": 0.38880154490470886, "learning_rate": 4.050418643175067e-06, "loss": 1.1566, "num_input_tokens_seen": 666728028, "step": 16960 }, { "epoch": 0.8222695997674193, "grad_norm": 0.4109187126159668, "learning_rate": 4.029034290462996e-06, "loss": 1.1369, "num_input_tokens_seen": 667095588, "step": 16970 }, { "epoch": 0.8227541428432987, "grad_norm": 0.36832395195961, "learning_rate": 4.0077015885109676e-06, "loss": 1.1458, "num_input_tokens_seen": 667490212, "step": 16980 }, { "epoch": 0.8232386859191783, "grad_norm": 0.38334590196609497, "learning_rate": 3.986420589860682e-06, "loss": 1.1087, "num_input_tokens_seen": 667892264, "step": 16990 }, { "epoch": 0.8237232289950577, "grad_norm": 0.41069987416267395, "learning_rate": 3.9651913469265e-06, "loss": 1.0849, "num_input_tokens_seen": 668289636, "step": 17000 }, { "epoch": 0.8242077720709371, "grad_norm": 0.3793525695800781, "learning_rate": 3.944013911995317e-06, "loss": 1.1524, "num_input_tokens_seen": 668676344, "step": 17010 }, { "epoch": 0.8246923151468165, "grad_norm": 0.41535207629203796, "learning_rate": 3.922888337226399e-06, "loss": 1.1035, "num_input_tokens_seen": 669042236, "step": 17020 }, { "epoch": 0.825176858222696, "grad_norm": 0.3818122148513794, "learning_rate": 3.90181467465133e-06, "loss": 1.103, "num_input_tokens_seen": 669438068, "step": 17030 }, { "epoch": 0.8256614012985755, "grad_norm": 0.4072776436805725, "learning_rate": 3.880792976173788e-06, "loss": 1.1103, "num_input_tokens_seen": 669833020, "step": 17040 }, { "epoch": 0.8261459443744549, "grad_norm": 0.3956544101238251, "learning_rate": 3.859823293569495e-06, "loss": 1.1063, "num_input_tokens_seen": 670234476, "step": 17050 }, { "epoch": 0.8266304874503343, "grad_norm": 0.4025129973888397, "learning_rate": 3.838905678486049e-06, "loss": 1.0968, "num_input_tokens_seen": 670625508, "step": 17060 }, { "epoch": 0.8271150305262138, "grad_norm": 0.48538658022880554, "learning_rate": 3.818040182442814e-06, "loss": 1.1978, "num_input_tokens_seen": 671013016, "step": 17070 }, { "epoch": 0.8275995736020932, "grad_norm": 0.39329686760902405, "learning_rate": 3.7972268568307685e-06, "loss": 1.1016, "num_input_tokens_seen": 671418600, "step": 17080 }, { "epoch": 0.8280841166779727, "grad_norm": 0.39455854892730713, "learning_rate": 3.776465752912428e-06, "loss": 1.16, "num_input_tokens_seen": 671802384, "step": 17090 }, { "epoch": 0.8285686597538521, "grad_norm": 0.5405839681625366, "learning_rate": 3.7557569218216516e-06, "loss": 1.164, "num_input_tokens_seen": 672200552, "step": 17100 }, { "epoch": 0.8290532028297316, "grad_norm": 0.369508296251297, "learning_rate": 3.735100414563594e-06, "loss": 1.1282, "num_input_tokens_seen": 672618664, "step": 17110 }, { "epoch": 0.829537745905611, "grad_norm": 0.41533350944519043, "learning_rate": 3.7144962820144928e-06, "loss": 1.1445, "num_input_tokens_seen": 673004068, "step": 17120 }, { "epoch": 0.8300222889814904, "grad_norm": 0.38036444783210754, "learning_rate": 3.6939445749216235e-06, "loss": 1.1596, "num_input_tokens_seen": 673389044, "step": 17130 }, { "epoch": 0.8305068320573699, "grad_norm": 0.37696197628974915, "learning_rate": 3.6734453439031257e-06, "loss": 1.1471, "num_input_tokens_seen": 673724844, "step": 17140 }, { "epoch": 0.8309913751332494, "grad_norm": 0.40430694818496704, "learning_rate": 3.65299863944788e-06, "loss": 1.095, "num_input_tokens_seen": 674106168, "step": 17150 }, { "epoch": 0.8314759182091288, "grad_norm": 0.468234658241272, "learning_rate": 3.6326045119154327e-06, "loss": 1.108, "num_input_tokens_seen": 674469340, "step": 17160 }, { "epoch": 0.8319604612850082, "grad_norm": 0.40100011229515076, "learning_rate": 3.612263011535791e-06, "loss": 1.1609, "num_input_tokens_seen": 674828188, "step": 17170 }, { "epoch": 0.8324450043608876, "grad_norm": 0.42651060223579407, "learning_rate": 3.5919741884093723e-06, "loss": 1.1289, "num_input_tokens_seen": 675221792, "step": 17180 }, { "epoch": 0.8329295474367672, "grad_norm": 0.3936256468296051, "learning_rate": 3.5717380925068405e-06, "loss": 1.1308, "num_input_tokens_seen": 675644508, "step": 17190 }, { "epoch": 0.8334140905126466, "grad_norm": 0.41212528944015503, "learning_rate": 3.5515547736689995e-06, "loss": 1.0939, "num_input_tokens_seen": 676038672, "step": 17200 }, { "epoch": 0.833898633588526, "grad_norm": 0.4407350420951843, "learning_rate": 3.531424281606663e-06, "loss": 1.1637, "num_input_tokens_seen": 676431380, "step": 17210 }, { "epoch": 0.8343831766644054, "grad_norm": 0.37873637676239014, "learning_rate": 3.511346665900536e-06, "loss": 1.1442, "num_input_tokens_seen": 676821844, "step": 17220 }, { "epoch": 0.834867719740285, "grad_norm": 0.39286068081855774, "learning_rate": 3.491321976001072e-06, "loss": 1.1215, "num_input_tokens_seen": 677199996, "step": 17230 }, { "epoch": 0.8353522628161644, "grad_norm": 0.42986762523651123, "learning_rate": 3.471350261228412e-06, "loss": 1.0996, "num_input_tokens_seen": 677592120, "step": 17240 }, { "epoch": 0.8358368058920438, "grad_norm": 0.4484102725982666, "learning_rate": 3.451431570772179e-06, "loss": 1.1268, "num_input_tokens_seen": 677999524, "step": 17250 }, { "epoch": 0.8363213489679232, "grad_norm": 0.407656729221344, "learning_rate": 3.431565953691418e-06, "loss": 1.1505, "num_input_tokens_seen": 678368908, "step": 17260 }, { "epoch": 0.8368058920438027, "grad_norm": 0.398433119058609, "learning_rate": 3.4117534589144547e-06, "loss": 1.1351, "num_input_tokens_seen": 678739932, "step": 17270 }, { "epoch": 0.8372904351196822, "grad_norm": 0.47509506344795227, "learning_rate": 3.3919941352387768e-06, "loss": 1.0815, "num_input_tokens_seen": 679131488, "step": 17280 }, { "epoch": 0.8377749781955616, "grad_norm": 0.3671634793281555, "learning_rate": 3.372288031330917e-06, "loss": 1.1059, "num_input_tokens_seen": 679542760, "step": 17290 }, { "epoch": 0.838259521271441, "grad_norm": 0.4009820818901062, "learning_rate": 3.3526351957263115e-06, "loss": 1.1132, "num_input_tokens_seen": 679961904, "step": 17300 }, { "epoch": 0.8387440643473205, "grad_norm": 0.3820185363292694, "learning_rate": 3.3330356768292215e-06, "loss": 1.0903, "num_input_tokens_seen": 680347620, "step": 17310 }, { "epoch": 0.8392286074231999, "grad_norm": 0.45867863297462463, "learning_rate": 3.3134895229125772e-06, "loss": 1.1581, "num_input_tokens_seen": 680716452, "step": 17320 }, { "epoch": 0.8397131504990794, "grad_norm": 0.4050085246562958, "learning_rate": 3.293996782117881e-06, "loss": 1.0828, "num_input_tokens_seen": 681101364, "step": 17330 }, { "epoch": 0.8401976935749588, "grad_norm": 0.39128220081329346, "learning_rate": 3.2745575024550695e-06, "loss": 1.0908, "num_input_tokens_seen": 681520008, "step": 17340 }, { "epoch": 0.8406822366508383, "grad_norm": 0.47006210684776306, "learning_rate": 3.2551717318024255e-06, "loss": 1.1409, "num_input_tokens_seen": 681899268, "step": 17350 }, { "epoch": 0.8411667797267177, "grad_norm": 0.41448378562927246, "learning_rate": 3.235839517906411e-06, "loss": 1.1815, "num_input_tokens_seen": 682286376, "step": 17360 }, { "epoch": 0.8416513228025971, "grad_norm": 0.3959016501903534, "learning_rate": 3.216560908381616e-06, "loss": 1.1594, "num_input_tokens_seen": 682681272, "step": 17370 }, { "epoch": 0.8421358658784766, "grad_norm": 0.39144328236579895, "learning_rate": 3.1973359507105645e-06, "loss": 1.0814, "num_input_tokens_seen": 683085540, "step": 17380 }, { "epoch": 0.8426204089543561, "grad_norm": 0.5121538043022156, "learning_rate": 3.1781646922436848e-06, "loss": 1.1284, "num_input_tokens_seen": 683457100, "step": 17390 }, { "epoch": 0.8431049520302355, "grad_norm": 0.4063737094402313, "learning_rate": 3.1590471801991012e-06, "loss": 1.1375, "num_input_tokens_seen": 683831048, "step": 17400 }, { "epoch": 0.8435894951061149, "grad_norm": 0.3836385905742645, "learning_rate": 3.1399834616625907e-06, "loss": 1.1183, "num_input_tokens_seen": 684224324, "step": 17410 }, { "epoch": 0.8440740381819943, "grad_norm": 0.39683645963668823, "learning_rate": 3.120973583587425e-06, "loss": 1.1197, "num_input_tokens_seen": 684624492, "step": 17420 }, { "epoch": 0.8445585812578739, "grad_norm": 0.3896249234676361, "learning_rate": 3.102017592794279e-06, "loss": 1.0956, "num_input_tokens_seen": 685029460, "step": 17430 }, { "epoch": 0.8450431243337533, "grad_norm": 0.39281994104385376, "learning_rate": 3.0831155359710927e-06, "loss": 1.158, "num_input_tokens_seen": 685406276, "step": 17440 }, { "epoch": 0.8455276674096327, "grad_norm": 0.4067038893699646, "learning_rate": 3.0642674596729785e-06, "loss": 1.1395, "num_input_tokens_seen": 685780380, "step": 17450 }, { "epoch": 0.8460122104855121, "grad_norm": 0.3911641836166382, "learning_rate": 3.0454734103220942e-06, "loss": 1.1158, "num_input_tokens_seen": 686162356, "step": 17460 }, { "epoch": 0.8464967535613916, "grad_norm": 0.3925408720970154, "learning_rate": 3.026733434207532e-06, "loss": 1.1467, "num_input_tokens_seen": 686553108, "step": 17470 }, { "epoch": 0.8469812966372711, "grad_norm": 0.42923226952552795, "learning_rate": 3.008047577485204e-06, "loss": 1.1584, "num_input_tokens_seen": 686914652, "step": 17480 }, { "epoch": 0.8474658397131505, "grad_norm": 0.4102732837200165, "learning_rate": 2.98941588617771e-06, "loss": 1.1051, "num_input_tokens_seen": 687275992, "step": 17490 }, { "epoch": 0.8479503827890299, "grad_norm": 0.41153576970100403, "learning_rate": 2.970838406174284e-06, "loss": 1.1446, "num_input_tokens_seen": 687689920, "step": 17500 }, { "epoch": 0.8484349258649094, "grad_norm": 0.4032898545265198, "learning_rate": 2.952315183230589e-06, "loss": 1.1283, "num_input_tokens_seen": 688080568, "step": 17510 }, { "epoch": 0.8489194689407888, "grad_norm": 0.4179829955101013, "learning_rate": 2.9338462629687034e-06, "loss": 1.1031, "num_input_tokens_seen": 688453284, "step": 17520 }, { "epoch": 0.8494040120166683, "grad_norm": 0.4297243058681488, "learning_rate": 2.915431690876916e-06, "loss": 1.1442, "num_input_tokens_seen": 688838776, "step": 17530 }, { "epoch": 0.8498885550925477, "grad_norm": 0.4191533029079437, "learning_rate": 2.8970715123096877e-06, "loss": 1.1041, "num_input_tokens_seen": 689225084, "step": 17540 }, { "epoch": 0.8503730981684272, "grad_norm": 0.4235047996044159, "learning_rate": 2.8787657724875016e-06, "loss": 1.0696, "num_input_tokens_seen": 689607636, "step": 17550 }, { "epoch": 0.8508576412443066, "grad_norm": 0.409962922334671, "learning_rate": 2.860514516496754e-06, "loss": 1.1092, "num_input_tokens_seen": 689951392, "step": 17560 }, { "epoch": 0.851342184320186, "grad_norm": 0.4196973443031311, "learning_rate": 2.8423177892896585e-06, "loss": 1.0901, "num_input_tokens_seen": 690348064, "step": 17570 }, { "epoch": 0.8518267273960655, "grad_norm": 0.3807368278503418, "learning_rate": 2.8241756356841233e-06, "loss": 1.1281, "num_input_tokens_seen": 690749348, "step": 17580 }, { "epoch": 0.852311270471945, "grad_norm": 0.40411800146102905, "learning_rate": 2.806088100363635e-06, "loss": 1.0834, "num_input_tokens_seen": 691138376, "step": 17590 }, { "epoch": 0.8527958135478244, "grad_norm": 0.39498665928840637, "learning_rate": 2.7880552278771703e-06, "loss": 1.0952, "num_input_tokens_seen": 691567648, "step": 17600 }, { "epoch": 0.8532803566237038, "grad_norm": 0.434725284576416, "learning_rate": 2.7700770626390677e-06, "loss": 1.1152, "num_input_tokens_seen": 691920228, "step": 17610 }, { "epoch": 0.8537648996995832, "grad_norm": 0.3900477886199951, "learning_rate": 2.7521536489289233e-06, "loss": 1.1517, "num_input_tokens_seen": 692294800, "step": 17620 }, { "epoch": 0.8542494427754628, "grad_norm": 0.42599180340766907, "learning_rate": 2.7342850308914843e-06, "loss": 1.1066, "num_input_tokens_seen": 692670100, "step": 17630 }, { "epoch": 0.8547339858513422, "grad_norm": 0.416204571723938, "learning_rate": 2.716471252536526e-06, "loss": 1.0739, "num_input_tokens_seen": 693061084, "step": 17640 }, { "epoch": 0.8552185289272216, "grad_norm": 0.40145227313041687, "learning_rate": 2.6987123577387833e-06, "loss": 1.1394, "num_input_tokens_seen": 693458880, "step": 17650 }, { "epoch": 0.855703072003101, "grad_norm": 0.38669687509536743, "learning_rate": 2.6810083902377825e-06, "loss": 1.1175, "num_input_tokens_seen": 693867900, "step": 17660 }, { "epoch": 0.8561876150789806, "grad_norm": 0.42774924635887146, "learning_rate": 2.663359393637785e-06, "loss": 1.1, "num_input_tokens_seen": 694285092, "step": 17670 }, { "epoch": 0.85667215815486, "grad_norm": 0.39861494302749634, "learning_rate": 2.645765411407655e-06, "loss": 1.1243, "num_input_tokens_seen": 694666360, "step": 17680 }, { "epoch": 0.8571567012307394, "grad_norm": 0.3979116678237915, "learning_rate": 2.6282264868807637e-06, "loss": 1.1226, "num_input_tokens_seen": 695052444, "step": 17690 }, { "epoch": 0.8576412443066188, "grad_norm": 0.4196152985095978, "learning_rate": 2.61074266325487e-06, "loss": 1.1283, "num_input_tokens_seen": 695441168, "step": 17700 }, { "epoch": 0.8581257873824983, "grad_norm": 0.41752830147743225, "learning_rate": 2.59331398359203e-06, "loss": 1.0917, "num_input_tokens_seen": 695864664, "step": 17710 }, { "epoch": 0.8586103304583778, "grad_norm": 0.41962143778800964, "learning_rate": 2.5759404908184654e-06, "loss": 1.1223, "num_input_tokens_seen": 696267288, "step": 17720 }, { "epoch": 0.8590948735342572, "grad_norm": 0.4000556170940399, "learning_rate": 2.5586222277244887e-06, "loss": 1.1531, "num_input_tokens_seen": 696650996, "step": 17730 }, { "epoch": 0.8595794166101366, "grad_norm": 0.4215715229511261, "learning_rate": 2.541359236964386e-06, "loss": 1.0838, "num_input_tokens_seen": 697023960, "step": 17740 }, { "epoch": 0.8600639596860161, "grad_norm": 0.3923567533493042, "learning_rate": 2.5241515610562983e-06, "loss": 1.1086, "num_input_tokens_seen": 697411632, "step": 17750 }, { "epoch": 0.8605485027618955, "grad_norm": 0.39392712712287903, "learning_rate": 2.506999242382141e-06, "loss": 1.1011, "num_input_tokens_seen": 697800768, "step": 17760 }, { "epoch": 0.861033045837775, "grad_norm": 0.39785829186439514, "learning_rate": 2.489902323187465e-06, "loss": 1.0928, "num_input_tokens_seen": 698174508, "step": 17770 }, { "epoch": 0.8615175889136544, "grad_norm": 0.4119921028614044, "learning_rate": 2.472860845581404e-06, "loss": 1.1081, "num_input_tokens_seen": 698560436, "step": 17780 }, { "epoch": 0.8620021319895339, "grad_norm": 0.4029790461063385, "learning_rate": 2.455874851536516e-06, "loss": 1.1645, "num_input_tokens_seen": 698924028, "step": 17790 }, { "epoch": 0.8624866750654133, "grad_norm": 0.4168151915073395, "learning_rate": 2.4389443828887166e-06, "loss": 1.1385, "num_input_tokens_seen": 699315512, "step": 17800 }, { "epoch": 0.8629712181412927, "grad_norm": 0.44244882464408875, "learning_rate": 2.422069481337161e-06, "loss": 1.1355, "num_input_tokens_seen": 699695032, "step": 17810 }, { "epoch": 0.8634557612171722, "grad_norm": 0.3886070251464844, "learning_rate": 2.405250188444147e-06, "loss": 1.1019, "num_input_tokens_seen": 700102396, "step": 17820 }, { "epoch": 0.8639403042930517, "grad_norm": 0.4062063992023468, "learning_rate": 2.3884865456350103e-06, "loss": 1.1314, "num_input_tokens_seen": 700467740, "step": 17830 }, { "epoch": 0.8644248473689311, "grad_norm": 0.4632991552352905, "learning_rate": 2.371778594198021e-06, "loss": 1.1259, "num_input_tokens_seen": 700860544, "step": 17840 }, { "epoch": 0.8649093904448105, "grad_norm": 0.3729296624660492, "learning_rate": 2.355126375284272e-06, "loss": 1.1139, "num_input_tokens_seen": 701260456, "step": 17850 }, { "epoch": 0.8653939335206899, "grad_norm": 0.3984206020832062, "learning_rate": 2.338529929907618e-06, "loss": 1.1629, "num_input_tokens_seen": 701665356, "step": 17860 }, { "epoch": 0.8658784765965695, "grad_norm": 0.39754825830459595, "learning_rate": 2.321989298944513e-06, "loss": 1.1423, "num_input_tokens_seen": 702049100, "step": 17870 }, { "epoch": 0.8663630196724489, "grad_norm": 0.4021519720554352, "learning_rate": 2.305504523133964e-06, "loss": 1.1645, "num_input_tokens_seen": 702418804, "step": 17880 }, { "epoch": 0.8668475627483283, "grad_norm": 0.41587236523628235, "learning_rate": 2.2890756430773956e-06, "loss": 1.1121, "num_input_tokens_seen": 702806248, "step": 17890 }, { "epoch": 0.8673321058242077, "grad_norm": 0.38616082072257996, "learning_rate": 2.272702699238574e-06, "loss": 1.1448, "num_input_tokens_seen": 703198880, "step": 17900 }, { "epoch": 0.8678166489000873, "grad_norm": 0.4032549560070038, "learning_rate": 2.2563857319434945e-06, "loss": 1.1484, "num_input_tokens_seen": 703603228, "step": 17910 }, { "epoch": 0.8683011919759667, "grad_norm": 0.4154454171657562, "learning_rate": 2.2401247813802652e-06, "loss": 1.0953, "num_input_tokens_seen": 703977824, "step": 17920 }, { "epoch": 0.8687857350518461, "grad_norm": 0.3659140169620514, "learning_rate": 2.223919887599063e-06, "loss": 1.136, "num_input_tokens_seen": 704375196, "step": 17930 }, { "epoch": 0.8692702781277255, "grad_norm": 0.4203336834907532, "learning_rate": 2.2077710905119618e-06, "loss": 1.1147, "num_input_tokens_seen": 704771996, "step": 17940 }, { "epoch": 0.869754821203605, "grad_norm": 0.4060530364513397, "learning_rate": 2.191678429892893e-06, "loss": 1.1429, "num_input_tokens_seen": 705149948, "step": 17950 }, { "epoch": 0.8702393642794845, "grad_norm": 0.4052405059337616, "learning_rate": 2.175641945377524e-06, "loss": 1.1261, "num_input_tokens_seen": 705545528, "step": 17960 }, { "epoch": 0.8707239073553639, "grad_norm": 0.39759910106658936, "learning_rate": 2.1596616764631595e-06, "loss": 1.1454, "num_input_tokens_seen": 705922212, "step": 17970 }, { "epoch": 0.8712084504312433, "grad_norm": 0.3922360837459564, "learning_rate": 2.143737662508635e-06, "loss": 1.0771, "num_input_tokens_seen": 706339472, "step": 17980 }, { "epoch": 0.8716929935071228, "grad_norm": 0.39637112617492676, "learning_rate": 2.127869942734262e-06, "loss": 1.0886, "num_input_tokens_seen": 706709976, "step": 17990 }, { "epoch": 0.8721775365830022, "grad_norm": 0.36679452657699585, "learning_rate": 2.112058556221663e-06, "loss": 1.137, "num_input_tokens_seen": 707117360, "step": 18000 }, { "epoch": 0.8721775365830022, "eval_loss": 1.1178553104400635, "eval_runtime": 5.0781, "eval_samples_per_second": 29.539, "eval_steps_per_second": 3.742, "num_input_tokens_seen": 707117360, "step": 18000 }, { "epoch": 0.8726620796588817, "grad_norm": 0.3815447986125946, "learning_rate": 2.0963035419137577e-06, "loss": 1.1119, "num_input_tokens_seen": 707514656, "step": 18010 }, { "epoch": 0.8731466227347611, "grad_norm": 0.39872461557388306, "learning_rate": 2.0806049386145774e-06, "loss": 1.1278, "num_input_tokens_seen": 707917760, "step": 18020 }, { "epoch": 0.8736311658106406, "grad_norm": 0.3805377781391144, "learning_rate": 2.0649627849892466e-06, "loss": 1.1064, "num_input_tokens_seen": 708298628, "step": 18030 }, { "epoch": 0.87411570888652, "grad_norm": 0.3966962695121765, "learning_rate": 2.0493771195638443e-06, "loss": 1.1155, "num_input_tokens_seen": 708713480, "step": 18040 }, { "epoch": 0.8746002519623994, "grad_norm": 0.42251577973365784, "learning_rate": 2.033847980725323e-06, "loss": 1.1489, "num_input_tokens_seen": 709099180, "step": 18050 }, { "epoch": 0.8750847950382789, "grad_norm": 0.4105037450790405, "learning_rate": 2.018375406721415e-06, "loss": 1.1305, "num_input_tokens_seen": 709499996, "step": 18060 }, { "epoch": 0.8755693381141584, "grad_norm": 0.4050074517726898, "learning_rate": 2.0029594356605286e-06, "loss": 1.1278, "num_input_tokens_seen": 709902500, "step": 18070 }, { "epoch": 0.8760538811900378, "grad_norm": 0.3788783550262451, "learning_rate": 1.9876001055116664e-06, "loss": 1.0709, "num_input_tokens_seen": 710294012, "step": 18080 }, { "epoch": 0.8765384242659172, "grad_norm": 0.4039725959300995, "learning_rate": 1.9722974541043244e-06, "loss": 1.131, "num_input_tokens_seen": 710669864, "step": 18090 }, { "epoch": 0.8770229673417966, "grad_norm": 0.3935058116912842, "learning_rate": 1.957051519128403e-06, "loss": 1.092, "num_input_tokens_seen": 711071604, "step": 18100 }, { "epoch": 0.8775075104176762, "grad_norm": 0.42256423830986023, "learning_rate": 1.9418623381341094e-06, "loss": 1.1371, "num_input_tokens_seen": 711476612, "step": 18110 }, { "epoch": 0.8779920534935556, "grad_norm": 0.3739723265171051, "learning_rate": 1.926729948531872e-06, "loss": 1.14, "num_input_tokens_seen": 711872752, "step": 18120 }, { "epoch": 0.878476596569435, "grad_norm": 0.3799099624156952, "learning_rate": 1.9116543875922294e-06, "loss": 1.1055, "num_input_tokens_seen": 712244912, "step": 18130 }, { "epoch": 0.8789611396453144, "grad_norm": 0.3699980080127716, "learning_rate": 1.8966356924457833e-06, "loss": 1.1191, "num_input_tokens_seen": 712647364, "step": 18140 }, { "epoch": 0.8794456827211939, "grad_norm": 0.3966718316078186, "learning_rate": 1.881673900083042e-06, "loss": 1.1611, "num_input_tokens_seen": 713028800, "step": 18150 }, { "epoch": 0.8799302257970734, "grad_norm": 0.3989785313606262, "learning_rate": 1.8667690473543858e-06, "loss": 1.1063, "num_input_tokens_seen": 713412044, "step": 18160 }, { "epoch": 0.8804147688729528, "grad_norm": 0.3920852541923523, "learning_rate": 1.8519211709699475e-06, "loss": 1.1163, "num_input_tokens_seen": 713796584, "step": 18170 }, { "epoch": 0.8808993119488322, "grad_norm": 0.4311419129371643, "learning_rate": 1.837130307499535e-06, "loss": 1.0843, "num_input_tokens_seen": 714199172, "step": 18180 }, { "epoch": 0.8813838550247117, "grad_norm": 0.3938891887664795, "learning_rate": 1.822396493372533e-06, "loss": 1.0989, "num_input_tokens_seen": 714590532, "step": 18190 }, { "epoch": 0.8818683981005911, "grad_norm": 0.42306891083717346, "learning_rate": 1.807719764877805e-06, "loss": 1.0779, "num_input_tokens_seen": 715001368, "step": 18200 }, { "epoch": 0.8823529411764706, "grad_norm": 0.39101406931877136, "learning_rate": 1.793100158163627e-06, "loss": 1.1353, "num_input_tokens_seen": 715383004, "step": 18210 }, { "epoch": 0.88283748425235, "grad_norm": 0.3895913362503052, "learning_rate": 1.7785377092375848e-06, "loss": 1.1042, "num_input_tokens_seen": 715769884, "step": 18220 }, { "epoch": 0.8833220273282295, "grad_norm": 0.40741434693336487, "learning_rate": 1.7640324539664827e-06, "loss": 1.1079, "num_input_tokens_seen": 716157804, "step": 18230 }, { "epoch": 0.8838065704041089, "grad_norm": 0.40758630633354187, "learning_rate": 1.749584428076262e-06, "loss": 1.1415, "num_input_tokens_seen": 716535668, "step": 18240 }, { "epoch": 0.8842911134799883, "grad_norm": 0.39152994751930237, "learning_rate": 1.7351936671519104e-06, "loss": 1.1472, "num_input_tokens_seen": 716917736, "step": 18250 }, { "epoch": 0.8847756565558678, "grad_norm": 0.3884604275226593, "learning_rate": 1.7208602066373592e-06, "loss": 1.1329, "num_input_tokens_seen": 717307888, "step": 18260 }, { "epoch": 0.8852601996317473, "grad_norm": 0.4423676133155823, "learning_rate": 1.706584081835444e-06, "loss": 1.154, "num_input_tokens_seen": 717697032, "step": 18270 }, { "epoch": 0.8857447427076267, "grad_norm": 0.39519426226615906, "learning_rate": 1.6923653279077468e-06, "loss": 1.106, "num_input_tokens_seen": 718093096, "step": 18280 }, { "epoch": 0.8862292857835061, "grad_norm": 0.4418719708919525, "learning_rate": 1.6782039798745791e-06, "loss": 1.1005, "num_input_tokens_seen": 718458596, "step": 18290 }, { "epoch": 0.8867138288593855, "grad_norm": 0.39423951506614685, "learning_rate": 1.6641000726148353e-06, "loss": 1.109, "num_input_tokens_seen": 718874968, "step": 18300 }, { "epoch": 0.8871983719352651, "grad_norm": 0.40183213353157043, "learning_rate": 1.650053640865959e-06, "loss": 1.1351, "num_input_tokens_seen": 719282808, "step": 18310 }, { "epoch": 0.8876829150111445, "grad_norm": 0.3977072238922119, "learning_rate": 1.6360647192238176e-06, "loss": 1.1096, "num_input_tokens_seen": 719670048, "step": 18320 }, { "epoch": 0.8881674580870239, "grad_norm": 0.4405307173728943, "learning_rate": 1.6221333421426483e-06, "loss": 1.1682, "num_input_tokens_seen": 720046844, "step": 18330 }, { "epoch": 0.8886520011629033, "grad_norm": 0.38534072041511536, "learning_rate": 1.6082595439349368e-06, "loss": 1.1598, "num_input_tokens_seen": 720440272, "step": 18340 }, { "epoch": 0.8891365442387829, "grad_norm": 0.3926476836204529, "learning_rate": 1.5944433587713693e-06, "loss": 1.1201, "num_input_tokens_seen": 720832848, "step": 18350 }, { "epoch": 0.8896210873146623, "grad_norm": 0.38580575585365295, "learning_rate": 1.5806848206807362e-06, "loss": 1.1723, "num_input_tokens_seen": 721201392, "step": 18360 }, { "epoch": 0.8901056303905417, "grad_norm": 0.41746583580970764, "learning_rate": 1.566983963549834e-06, "loss": 1.0699, "num_input_tokens_seen": 721622692, "step": 18370 }, { "epoch": 0.8905901734664212, "grad_norm": 0.3924805223941803, "learning_rate": 1.5533408211234002e-06, "loss": 1.1053, "num_input_tokens_seen": 722027036, "step": 18380 }, { "epoch": 0.8910747165423006, "grad_norm": 0.37664535641670227, "learning_rate": 1.5397554270040137e-06, "loss": 1.1089, "num_input_tokens_seen": 722405744, "step": 18390 }, { "epoch": 0.8915592596181801, "grad_norm": 0.4099685847759247, "learning_rate": 1.5262278146520426e-06, "loss": 1.1115, "num_input_tokens_seen": 722805360, "step": 18400 }, { "epoch": 0.8920438026940595, "grad_norm": 0.3878704607486725, "learning_rate": 1.5127580173855071e-06, "loss": 1.1438, "num_input_tokens_seen": 723211584, "step": 18410 }, { "epoch": 0.892528345769939, "grad_norm": 0.438963919878006, "learning_rate": 1.4993460683800698e-06, "loss": 1.1211, "num_input_tokens_seen": 723640776, "step": 18420 }, { "epoch": 0.8930128888458184, "grad_norm": 0.3852384090423584, "learning_rate": 1.4859920006688789e-06, "loss": 1.1244, "num_input_tokens_seen": 724035952, "step": 18430 }, { "epoch": 0.8934974319216978, "grad_norm": 0.36654970049858093, "learning_rate": 1.4726958471425495e-06, "loss": 1.125, "num_input_tokens_seen": 724439364, "step": 18440 }, { "epoch": 0.8939819749975773, "grad_norm": 0.39647069573402405, "learning_rate": 1.4594576405490417e-06, "loss": 1.083, "num_input_tokens_seen": 724842880, "step": 18450 }, { "epoch": 0.8944665180734568, "grad_norm": 0.35190171003341675, "learning_rate": 1.4462774134935963e-06, "loss": 1.1442, "num_input_tokens_seen": 725280940, "step": 18460 }, { "epoch": 0.8949510611493362, "grad_norm": 0.40093109011650085, "learning_rate": 1.43315519843866e-06, "loss": 1.1372, "num_input_tokens_seen": 725666144, "step": 18470 }, { "epoch": 0.8954356042252156, "grad_norm": 0.4389098286628723, "learning_rate": 1.4200910277037932e-06, "loss": 1.1656, "num_input_tokens_seen": 726077688, "step": 18480 }, { "epoch": 0.895920147301095, "grad_norm": 0.401603102684021, "learning_rate": 1.4070849334655883e-06, "loss": 1.1504, "num_input_tokens_seen": 726457876, "step": 18490 }, { "epoch": 0.8964046903769746, "grad_norm": 0.4019956588745117, "learning_rate": 1.3941369477576072e-06, "loss": 1.1493, "num_input_tokens_seen": 726837760, "step": 18500 }, { "epoch": 0.896889233452854, "grad_norm": 0.3990192413330078, "learning_rate": 1.3812471024702873e-06, "loss": 1.0712, "num_input_tokens_seen": 727203844, "step": 18510 }, { "epoch": 0.8973737765287334, "grad_norm": 0.396411269903183, "learning_rate": 1.3684154293508722e-06, "loss": 1.0972, "num_input_tokens_seen": 727595240, "step": 18520 }, { "epoch": 0.8978583196046128, "grad_norm": 0.3998691737651825, "learning_rate": 1.3556419600033288e-06, "loss": 1.0947, "num_input_tokens_seen": 727979392, "step": 18530 }, { "epoch": 0.8983428626804923, "grad_norm": 0.4026288390159607, "learning_rate": 1.3429267258882578e-06, "loss": 1.1136, "num_input_tokens_seen": 728342232, "step": 18540 }, { "epoch": 0.8988274057563718, "grad_norm": 0.3861730396747589, "learning_rate": 1.3302697583228523e-06, "loss": 1.0925, "num_input_tokens_seen": 728749172, "step": 18550 }, { "epoch": 0.8993119488322512, "grad_norm": 0.38084274530410767, "learning_rate": 1.3176710884807791e-06, "loss": 1.115, "num_input_tokens_seen": 729125920, "step": 18560 }, { "epoch": 0.8997964919081306, "grad_norm": 0.43145671486854553, "learning_rate": 1.3051307473921193e-06, "loss": 1.1619, "num_input_tokens_seen": 729532004, "step": 18570 }, { "epoch": 0.9002810349840101, "grad_norm": 0.41663506627082825, "learning_rate": 1.2926487659433024e-06, "loss": 1.1116, "num_input_tokens_seen": 729903788, "step": 18580 }, { "epoch": 0.9007655780598895, "grad_norm": 0.4152432680130005, "learning_rate": 1.2802251748770144e-06, "loss": 1.1174, "num_input_tokens_seen": 730309520, "step": 18590 }, { "epoch": 0.901250121135769, "grad_norm": 0.3903380334377289, "learning_rate": 1.267860004792129e-06, "loss": 1.0801, "num_input_tokens_seen": 730705760, "step": 18600 }, { "epoch": 0.9017346642116484, "grad_norm": 0.4450056552886963, "learning_rate": 1.255553286143632e-06, "loss": 1.1158, "num_input_tokens_seen": 731091508, "step": 18610 }, { "epoch": 0.9022192072875279, "grad_norm": 0.39127764105796814, "learning_rate": 1.2433050492425352e-06, "loss": 1.1237, "num_input_tokens_seen": 731486328, "step": 18620 }, { "epoch": 0.9027037503634073, "grad_norm": 0.3897622227668762, "learning_rate": 1.2311153242558354e-06, "loss": 1.0822, "num_input_tokens_seen": 731875972, "step": 18630 }, { "epoch": 0.9031882934392867, "grad_norm": 0.4100290536880493, "learning_rate": 1.218984141206389e-06, "loss": 1.0725, "num_input_tokens_seen": 732293268, "step": 18640 }, { "epoch": 0.9036728365151662, "grad_norm": 0.3897705674171448, "learning_rate": 1.2069115299728845e-06, "loss": 1.1306, "num_input_tokens_seen": 732690944, "step": 18650 }, { "epoch": 0.9041573795910457, "grad_norm": 0.4191875755786896, "learning_rate": 1.1948975202897423e-06, "loss": 1.1358, "num_input_tokens_seen": 733075324, "step": 18660 }, { "epoch": 0.9046419226669251, "grad_norm": 0.3774699568748474, "learning_rate": 1.1829421417470481e-06, "loss": 1.1105, "num_input_tokens_seen": 733466324, "step": 18670 }, { "epoch": 0.9051264657428045, "grad_norm": 0.39872831106185913, "learning_rate": 1.17104542379049e-06, "loss": 1.1047, "num_input_tokens_seen": 733853980, "step": 18680 }, { "epoch": 0.905611008818684, "grad_norm": 0.38337400555610657, "learning_rate": 1.159207395721268e-06, "loss": 1.0915, "num_input_tokens_seen": 734284336, "step": 18690 }, { "epoch": 0.9060955518945635, "grad_norm": 0.4249227046966553, "learning_rate": 1.1474280866960314e-06, "loss": 1.1328, "num_input_tokens_seen": 734691688, "step": 18700 }, { "epoch": 0.9065800949704429, "grad_norm": 0.40089496970176697, "learning_rate": 1.1357075257268147e-06, "loss": 1.1384, "num_input_tokens_seen": 735099992, "step": 18710 }, { "epoch": 0.9070646380463223, "grad_norm": 0.42720267176628113, "learning_rate": 1.1240457416809458e-06, "loss": 1.1603, "num_input_tokens_seen": 735517812, "step": 18720 }, { "epoch": 0.9075491811222017, "grad_norm": 0.41853755712509155, "learning_rate": 1.112442763280999e-06, "loss": 1.0719, "num_input_tokens_seen": 735875148, "step": 18730 }, { "epoch": 0.9080337241980813, "grad_norm": 0.4119112491607666, "learning_rate": 1.1008986191047095e-06, "loss": 1.12, "num_input_tokens_seen": 736279692, "step": 18740 }, { "epoch": 0.9085182672739607, "grad_norm": 0.40979915857315063, "learning_rate": 1.0894133375848907e-06, "loss": 1.0894, "num_input_tokens_seen": 736678652, "step": 18750 }, { "epoch": 0.9090028103498401, "grad_norm": 0.3658255338668823, "learning_rate": 1.0779869470094072e-06, "loss": 1.1409, "num_input_tokens_seen": 737076332, "step": 18760 }, { "epoch": 0.9094873534257195, "grad_norm": 0.3819989264011383, "learning_rate": 1.0666194755210524e-06, "loss": 1.0877, "num_input_tokens_seen": 737463700, "step": 18770 }, { "epoch": 0.909971896501599, "grad_norm": 0.3775273859500885, "learning_rate": 1.0553109511175192e-06, "loss": 1.0883, "num_input_tokens_seen": 737833988, "step": 18780 }, { "epoch": 0.9104564395774785, "grad_norm": 0.4142516553401947, "learning_rate": 1.0440614016513056e-06, "loss": 1.0953, "num_input_tokens_seen": 738251768, "step": 18790 }, { "epoch": 0.9109409826533579, "grad_norm": 0.4076782166957855, "learning_rate": 1.0328708548296622e-06, "loss": 1.0818, "num_input_tokens_seen": 738646584, "step": 18800 }, { "epoch": 0.9114255257292373, "grad_norm": 0.3837725520133972, "learning_rate": 1.0217393382145224e-06, "loss": 1.1542, "num_input_tokens_seen": 739027612, "step": 18810 }, { "epoch": 0.9119100688051168, "grad_norm": 0.4405462145805359, "learning_rate": 1.0106668792224134e-06, "loss": 1.1116, "num_input_tokens_seen": 739421692, "step": 18820 }, { "epoch": 0.9123946118809962, "grad_norm": 0.4041633605957031, "learning_rate": 9.996535051244316e-07, "loss": 1.062, "num_input_tokens_seen": 739822296, "step": 18830 }, { "epoch": 0.9128791549568757, "grad_norm": 0.4356255829334259, "learning_rate": 9.88699243046126e-07, "loss": 1.0675, "num_input_tokens_seen": 740216416, "step": 18840 }, { "epoch": 0.9133636980327551, "grad_norm": 0.365655779838562, "learning_rate": 9.778041199674626e-07, "loss": 1.13, "num_input_tokens_seen": 740584032, "step": 18850 }, { "epoch": 0.9138482411086346, "grad_norm": 0.38094544410705566, "learning_rate": 9.669681627227562e-07, "loss": 1.1065, "num_input_tokens_seen": 740973480, "step": 18860 }, { "epoch": 0.914332784184514, "grad_norm": 0.41554176807403564, "learning_rate": 9.561913980005916e-07, "loss": 1.148, "num_input_tokens_seen": 741350984, "step": 18870 }, { "epoch": 0.9148173272603934, "grad_norm": 0.40062519907951355, "learning_rate": 9.45473852343759e-07, "loss": 1.1589, "num_input_tokens_seen": 741724776, "step": 18880 }, { "epoch": 0.9153018703362729, "grad_norm": 0.3932797610759735, "learning_rate": 9.348155521492125e-07, "loss": 1.1375, "num_input_tokens_seen": 742129168, "step": 18890 }, { "epoch": 0.9157864134121524, "grad_norm": 0.4039793610572815, "learning_rate": 9.242165236679646e-07, "loss": 1.1235, "num_input_tokens_seen": 742522860, "step": 18900 }, { "epoch": 0.9162709564880318, "grad_norm": 0.4156935513019562, "learning_rate": 9.136767930050666e-07, "loss": 1.1931, "num_input_tokens_seen": 742904860, "step": 18910 }, { "epoch": 0.9167554995639112, "grad_norm": 0.419506698846817, "learning_rate": 9.031963861194953e-07, "loss": 1.0917, "num_input_tokens_seen": 743285040, "step": 18920 }, { "epoch": 0.9172400426397906, "grad_norm": 0.417361855506897, "learning_rate": 8.927753288241386e-07, "loss": 1.1108, "num_input_tokens_seen": 743675480, "step": 18930 }, { "epoch": 0.9177245857156702, "grad_norm": 0.38126012682914734, "learning_rate": 8.82413646785693e-07, "loss": 1.1074, "num_input_tokens_seen": 744082016, "step": 18940 }, { "epoch": 0.9182091287915496, "grad_norm": 0.401082843542099, "learning_rate": 8.721113655246222e-07, "loss": 1.092, "num_input_tokens_seen": 744477944, "step": 18950 }, { "epoch": 0.918693671867429, "grad_norm": 0.41044408082962036, "learning_rate": 8.618685104150925e-07, "loss": 1.1382, "num_input_tokens_seen": 744870004, "step": 18960 }, { "epoch": 0.9191782149433084, "grad_norm": 0.40474820137023926, "learning_rate": 8.516851066848936e-07, "loss": 1.0912, "num_input_tokens_seen": 745260580, "step": 18970 }, { "epoch": 0.919662758019188, "grad_norm": 0.405483603477478, "learning_rate": 8.415611794153982e-07, "loss": 1.1106, "num_input_tokens_seen": 745671092, "step": 18980 }, { "epoch": 0.9201473010950674, "grad_norm": 0.43090713024139404, "learning_rate": 8.314967535414858e-07, "loss": 1.076, "num_input_tokens_seen": 746080284, "step": 18990 }, { "epoch": 0.9206318441709468, "grad_norm": 0.4164203107357025, "learning_rate": 8.214918538514915e-07, "loss": 1.1442, "num_input_tokens_seen": 746446148, "step": 19000 }, { "epoch": 0.9211163872468262, "grad_norm": 0.4038824737071991, "learning_rate": 8.11546504987129e-07, "loss": 1.1099, "num_input_tokens_seen": 746836508, "step": 19010 }, { "epoch": 0.9216009303227057, "grad_norm": 0.39919909834861755, "learning_rate": 8.016607314434571e-07, "loss": 1.0966, "num_input_tokens_seen": 747221668, "step": 19020 }, { "epoch": 0.9220854733985852, "grad_norm": 0.4156663417816162, "learning_rate": 7.918345575687796e-07, "loss": 1.1268, "num_input_tokens_seen": 747595608, "step": 19030 }, { "epoch": 0.9225700164744646, "grad_norm": 0.40613898634910583, "learning_rate": 7.820680075646319e-07, "loss": 1.101, "num_input_tokens_seen": 747986884, "step": 19040 }, { "epoch": 0.923054559550344, "grad_norm": 0.3905068039894104, "learning_rate": 7.723611054856833e-07, "loss": 1.1111, "num_input_tokens_seen": 748374856, "step": 19050 }, { "epoch": 0.9235391026262235, "grad_norm": 0.3992663621902466, "learning_rate": 7.627138752396984e-07, "loss": 1.0937, "num_input_tokens_seen": 748752876, "step": 19060 }, { "epoch": 0.9240236457021029, "grad_norm": 0.4008621871471405, "learning_rate": 7.531263405874678e-07, "loss": 1.0564, "num_input_tokens_seen": 749125480, "step": 19070 }, { "epoch": 0.9245081887779824, "grad_norm": 0.4372722804546356, "learning_rate": 7.435985251427552e-07, "loss": 1.1177, "num_input_tokens_seen": 749506916, "step": 19080 }, { "epoch": 0.9249927318538618, "grad_norm": 0.41504257917404175, "learning_rate": 7.34130452372242e-07, "loss": 1.0958, "num_input_tokens_seen": 749895376, "step": 19090 }, { "epoch": 0.9254772749297413, "grad_norm": 0.41004976630210876, "learning_rate": 7.247221455954662e-07, "loss": 1.109, "num_input_tokens_seen": 750282460, "step": 19100 }, { "epoch": 0.9259618180056207, "grad_norm": 0.40631359815597534, "learning_rate": 7.1537362798475e-07, "loss": 1.1214, "num_input_tokens_seen": 750694156, "step": 19110 }, { "epoch": 0.9264463610815001, "grad_norm": 0.3669877350330353, "learning_rate": 7.060849225651756e-07, "loss": 1.0916, "num_input_tokens_seen": 751090800, "step": 19120 }, { "epoch": 0.9269309041573796, "grad_norm": 0.39109060168266296, "learning_rate": 6.968560522145007e-07, "loss": 1.1302, "num_input_tokens_seen": 751504408, "step": 19130 }, { "epoch": 0.9274154472332591, "grad_norm": 0.4094466269016266, "learning_rate": 6.876870396631097e-07, "loss": 1.1174, "num_input_tokens_seen": 751899136, "step": 19140 }, { "epoch": 0.9278999903091385, "grad_norm": 0.39219051599502563, "learning_rate": 6.785779074939657e-07, "loss": 1.1205, "num_input_tokens_seen": 752306356, "step": 19150 }, { "epoch": 0.9283845333850179, "grad_norm": 0.39333903789520264, "learning_rate": 6.695286781425392e-07, "loss": 1.0771, "num_input_tokens_seen": 752706400, "step": 19160 }, { "epoch": 0.9288690764608973, "grad_norm": 0.4072161614894867, "learning_rate": 6.605393738967763e-07, "loss": 1.1592, "num_input_tokens_seen": 753110792, "step": 19170 }, { "epoch": 0.9293536195367769, "grad_norm": 0.39852002263069153, "learning_rate": 6.516100168970113e-07, "loss": 1.1284, "num_input_tokens_seen": 753499472, "step": 19180 }, { "epoch": 0.9298381626126563, "grad_norm": 0.4045473039150238, "learning_rate": 6.427406291359489e-07, "loss": 1.1153, "num_input_tokens_seen": 753919864, "step": 19190 }, { "epoch": 0.9303227056885357, "grad_norm": 0.37104934453964233, "learning_rate": 6.339312324585761e-07, "loss": 1.1329, "num_input_tokens_seen": 754331304, "step": 19200 }, { "epoch": 0.9308072487644151, "grad_norm": 0.40764960646629333, "learning_rate": 6.251818485621341e-07, "loss": 1.0779, "num_input_tokens_seen": 754724460, "step": 19210 }, { "epoch": 0.9312917918402946, "grad_norm": 0.3814258277416229, "learning_rate": 6.164924989960519e-07, "loss": 1.0927, "num_input_tokens_seen": 755092192, "step": 19220 }, { "epoch": 0.9317763349161741, "grad_norm": 0.4385196566581726, "learning_rate": 6.078632051618988e-07, "loss": 1.086, "num_input_tokens_seen": 755461096, "step": 19230 }, { "epoch": 0.9322608779920535, "grad_norm": 0.42675715684890747, "learning_rate": 5.99293988313318e-07, "loss": 1.1314, "num_input_tokens_seen": 755853476, "step": 19240 }, { "epoch": 0.9327454210679329, "grad_norm": 0.4081270396709442, "learning_rate": 5.907848695559964e-07, "loss": 1.0827, "num_input_tokens_seen": 756266064, "step": 19250 }, { "epoch": 0.9332299641438124, "grad_norm": 0.40428853034973145, "learning_rate": 5.823358698476e-07, "loss": 1.1371, "num_input_tokens_seen": 756658988, "step": 19260 }, { "epoch": 0.9337145072196918, "grad_norm": 0.3700270652770996, "learning_rate": 5.73947009997719e-07, "loss": 1.1292, "num_input_tokens_seen": 757041432, "step": 19270 }, { "epoch": 0.9341990502955713, "grad_norm": 0.4112299978733063, "learning_rate": 5.656183106678287e-07, "loss": 1.0989, "num_input_tokens_seen": 757439484, "step": 19280 }, { "epoch": 0.9346835933714507, "grad_norm": 0.3882913291454315, "learning_rate": 5.573497923712173e-07, "loss": 1.1112, "num_input_tokens_seen": 757858200, "step": 19290 }, { "epoch": 0.9351681364473302, "grad_norm": 0.3786594271659851, "learning_rate": 5.491414754729667e-07, "loss": 1.0912, "num_input_tokens_seen": 758250852, "step": 19300 }, { "epoch": 0.9356526795232096, "grad_norm": 0.4327850043773651, "learning_rate": 5.409933801898692e-07, "loss": 1.1164, "num_input_tokens_seen": 758642612, "step": 19310 }, { "epoch": 0.936137222599089, "grad_norm": 0.38797247409820557, "learning_rate": 5.329055265904076e-07, "loss": 1.0734, "num_input_tokens_seen": 759069940, "step": 19320 }, { "epoch": 0.9366217656749685, "grad_norm": 0.40884557366371155, "learning_rate": 5.248779345946808e-07, "loss": 1.0806, "num_input_tokens_seen": 759446920, "step": 19330 }, { "epoch": 0.937106308750848, "grad_norm": 0.4048992097377777, "learning_rate": 5.169106239743648e-07, "loss": 1.1333, "num_input_tokens_seen": 759815092, "step": 19340 }, { "epoch": 0.9375908518267274, "grad_norm": 0.39969974756240845, "learning_rate": 5.090036143526767e-07, "loss": 1.1397, "num_input_tokens_seen": 760232476, "step": 19350 }, { "epoch": 0.9380753949026068, "grad_norm": 0.41090163588523865, "learning_rate": 5.011569252043019e-07, "loss": 1.0991, "num_input_tokens_seen": 760610356, "step": 19360 }, { "epoch": 0.9385599379784862, "grad_norm": 0.42378801107406616, "learning_rate": 4.933705758553619e-07, "loss": 1.1243, "num_input_tokens_seen": 761002188, "step": 19370 }, { "epoch": 0.9390444810543658, "grad_norm": 0.40979644656181335, "learning_rate": 4.856445854833719e-07, "loss": 1.1623, "num_input_tokens_seen": 761397232, "step": 19380 }, { "epoch": 0.9395290241302452, "grad_norm": 0.3961712718009949, "learning_rate": 4.779789731171713e-07, "loss": 1.1282, "num_input_tokens_seen": 761789268, "step": 19390 }, { "epoch": 0.9400135672061246, "grad_norm": 0.3944045603275299, "learning_rate": 4.7037375763689664e-07, "loss": 1.1045, "num_input_tokens_seen": 762178272, "step": 19400 }, { "epoch": 0.940498110282004, "grad_norm": 0.40869519114494324, "learning_rate": 4.628289577739309e-07, "loss": 1.1048, "num_input_tokens_seen": 762586012, "step": 19410 }, { "epoch": 0.9409826533578836, "grad_norm": 0.46123749017715454, "learning_rate": 4.55344592110854e-07, "loss": 1.1706, "num_input_tokens_seen": 763010200, "step": 19420 }, { "epoch": 0.941467196433763, "grad_norm": 0.40070682764053345, "learning_rate": 4.4792067908140387e-07, "loss": 1.1299, "num_input_tokens_seen": 763393780, "step": 19430 }, { "epoch": 0.9419517395096424, "grad_norm": 0.381905734539032, "learning_rate": 4.4055723697040976e-07, "loss": 1.1155, "num_input_tokens_seen": 763809116, "step": 19440 }, { "epoch": 0.9424362825855218, "grad_norm": 0.41140425205230713, "learning_rate": 4.3325428391378377e-07, "loss": 1.1394, "num_input_tokens_seen": 764205148, "step": 19450 }, { "epoch": 0.9429208256614013, "grad_norm": 0.43021172285079956, "learning_rate": 4.260118378984407e-07, "loss": 1.1463, "num_input_tokens_seen": 764595480, "step": 19460 }, { "epoch": 0.9434053687372808, "grad_norm": 0.43823298811912537, "learning_rate": 4.188299167622728e-07, "loss": 1.1693, "num_input_tokens_seen": 764964992, "step": 19470 }, { "epoch": 0.9438899118131602, "grad_norm": 0.3995303213596344, "learning_rate": 4.117085381941055e-07, "loss": 1.1092, "num_input_tokens_seen": 765363980, "step": 19480 }, { "epoch": 0.9443744548890396, "grad_norm": 0.36656254529953003, "learning_rate": 4.0464771973364456e-07, "loss": 1.128, "num_input_tokens_seen": 765753044, "step": 19490 }, { "epoch": 0.9448589979649191, "grad_norm": 0.34931907057762146, "learning_rate": 3.9764747877144015e-07, "loss": 1.1266, "num_input_tokens_seen": 766153976, "step": 19500 }, { "epoch": 0.9453435410407985, "grad_norm": 0.38153091073036194, "learning_rate": 3.90707832548845e-07, "loss": 1.0923, "num_input_tokens_seen": 766544700, "step": 19510 }, { "epoch": 0.945828084116678, "grad_norm": 0.3994804620742798, "learning_rate": 3.838287981579619e-07, "loss": 1.1353, "num_input_tokens_seen": 766951124, "step": 19520 }, { "epoch": 0.9463126271925574, "grad_norm": 0.4302452504634857, "learning_rate": 3.7701039254162405e-07, "loss": 1.1205, "num_input_tokens_seen": 767361300, "step": 19530 }, { "epoch": 0.9467971702684369, "grad_norm": 0.4123125970363617, "learning_rate": 3.702526324933148e-07, "loss": 1.1401, "num_input_tokens_seen": 767727640, "step": 19540 }, { "epoch": 0.9472817133443163, "grad_norm": 0.3907496929168701, "learning_rate": 3.635555346571701e-07, "loss": 1.1068, "num_input_tokens_seen": 768096588, "step": 19550 }, { "epoch": 0.9477662564201957, "grad_norm": 0.47248342633247375, "learning_rate": 3.569191155279067e-07, "loss": 1.0979, "num_input_tokens_seen": 768467260, "step": 19560 }, { "epoch": 0.9482507994960752, "grad_norm": 0.38330796360969543, "learning_rate": 3.503433914507942e-07, "loss": 1.0569, "num_input_tokens_seen": 768874516, "step": 19570 }, { "epoch": 0.9487353425719547, "grad_norm": 0.40380969643592834, "learning_rate": 3.438283786216134e-07, "loss": 1.1573, "num_input_tokens_seen": 769280172, "step": 19580 }, { "epoch": 0.9492198856478341, "grad_norm": 0.37804293632507324, "learning_rate": 3.373740930866176e-07, "loss": 1.1426, "num_input_tokens_seen": 769668856, "step": 19590 }, { "epoch": 0.9497044287237135, "grad_norm": 0.39080822467803955, "learning_rate": 3.309805507424796e-07, "loss": 1.0912, "num_input_tokens_seen": 770088040, "step": 19600 }, { "epoch": 0.9501889717995929, "grad_norm": 0.4255511462688446, "learning_rate": 3.2464776733628075e-07, "loss": 1.1104, "num_input_tokens_seen": 770463620, "step": 19610 }, { "epoch": 0.9506735148754725, "grad_norm": 0.3901178240776062, "learning_rate": 3.183757584654418e-07, "loss": 1.1788, "num_input_tokens_seen": 770856772, "step": 19620 }, { "epoch": 0.9511580579513519, "grad_norm": 0.4149034321308136, "learning_rate": 3.1216453957770565e-07, "loss": 1.118, "num_input_tokens_seen": 771274300, "step": 19630 }, { "epoch": 0.9516426010272313, "grad_norm": 0.43007585406303406, "learning_rate": 3.0601412597108527e-07, "loss": 1.0905, "num_input_tokens_seen": 771659776, "step": 19640 }, { "epoch": 0.9521271441031107, "grad_norm": 0.41749218106269836, "learning_rate": 2.9992453279383825e-07, "loss": 1.1274, "num_input_tokens_seen": 772049360, "step": 19650 }, { "epoch": 0.9526116871789903, "grad_norm": 0.39733201265335083, "learning_rate": 2.938957750444199e-07, "loss": 1.0949, "num_input_tokens_seen": 772424652, "step": 19660 }, { "epoch": 0.9530962302548697, "grad_norm": 0.3866783380508423, "learning_rate": 2.879278675714497e-07, "loss": 1.1138, "num_input_tokens_seen": 772839880, "step": 19670 }, { "epoch": 0.9535807733307491, "grad_norm": 0.40627822279930115, "learning_rate": 2.820208250736839e-07, "loss": 1.1247, "num_input_tokens_seen": 773238608, "step": 19680 }, { "epoch": 0.9540653164066285, "grad_norm": 0.3711201548576355, "learning_rate": 2.7617466209995115e-07, "loss": 1.0661, "num_input_tokens_seen": 773603544, "step": 19690 }, { "epoch": 0.954549859482508, "grad_norm": 0.4101531505584717, "learning_rate": 2.703893930491558e-07, "loss": 1.1215, "num_input_tokens_seen": 773975992, "step": 19700 }, { "epoch": 0.9550344025583875, "grad_norm": 0.39880767464637756, "learning_rate": 2.6466503217021654e-07, "loss": 1.1503, "num_input_tokens_seen": 774381780, "step": 19710 }, { "epoch": 0.9555189456342669, "grad_norm": 0.3977833688259125, "learning_rate": 2.5900159356202493e-07, "loss": 1.1096, "num_input_tokens_seen": 774779660, "step": 19720 }, { "epoch": 0.9560034887101463, "grad_norm": 0.3905205726623535, "learning_rate": 2.5339909117344515e-07, "loss": 1.1316, "num_input_tokens_seen": 775185260, "step": 19730 }, { "epoch": 0.9564880317860258, "grad_norm": 0.3854650557041168, "learning_rate": 2.4785753880323945e-07, "loss": 1.0807, "num_input_tokens_seen": 775588504, "step": 19740 }, { "epoch": 0.9569725748619052, "grad_norm": 0.3917495608329773, "learning_rate": 2.4237695010005945e-07, "loss": 1.1197, "num_input_tokens_seen": 775995628, "step": 19750 }, { "epoch": 0.9574571179377847, "grad_norm": 0.401319295167923, "learning_rate": 2.3695733856240466e-07, "loss": 1.1239, "num_input_tokens_seen": 776399704, "step": 19760 }, { "epoch": 0.9579416610136641, "grad_norm": 0.3838230073451996, "learning_rate": 2.3159871753859475e-07, "loss": 1.1737, "num_input_tokens_seen": 776806144, "step": 19770 }, { "epoch": 0.9584262040895436, "grad_norm": 0.39468008279800415, "learning_rate": 2.263011002267168e-07, "loss": 1.1331, "num_input_tokens_seen": 777150100, "step": 19780 }, { "epoch": 0.958910747165423, "grad_norm": 0.390552282333374, "learning_rate": 2.2106449967463084e-07, "loss": 1.0929, "num_input_tokens_seen": 777529280, "step": 19790 }, { "epoch": 0.9593952902413024, "grad_norm": 0.38140344619750977, "learning_rate": 2.158889287798921e-07, "loss": 1.102, "num_input_tokens_seen": 777934560, "step": 19800 }, { "epoch": 0.9598798333171819, "grad_norm": 0.3968132734298706, "learning_rate": 2.1077440028975936e-07, "loss": 1.096, "num_input_tokens_seen": 778325752, "step": 19810 }, { "epoch": 0.9603643763930614, "grad_norm": 0.39363691210746765, "learning_rate": 2.057209268011312e-07, "loss": 1.1293, "num_input_tokens_seen": 778719844, "step": 19820 }, { "epoch": 0.9608489194689408, "grad_norm": 0.4126024544239044, "learning_rate": 2.0072852076054305e-07, "loss": 1.0956, "num_input_tokens_seen": 779089772, "step": 19830 }, { "epoch": 0.9613334625448202, "grad_norm": 0.4260122776031494, "learning_rate": 1.957971944641146e-07, "loss": 1.1148, "num_input_tokens_seen": 779469980, "step": 19840 }, { "epoch": 0.9618180056206996, "grad_norm": 0.43452244997024536, "learning_rate": 1.9092696005753309e-07, "loss": 1.1507, "num_input_tokens_seen": 779871528, "step": 19850 }, { "epoch": 0.9623025486965792, "grad_norm": 0.3910650312900543, "learning_rate": 1.861178295360172e-07, "loss": 1.1368, "num_input_tokens_seen": 780275872, "step": 19860 }, { "epoch": 0.9627870917724586, "grad_norm": 0.38760048151016235, "learning_rate": 1.813698147442866e-07, "loss": 1.1478, "num_input_tokens_seen": 780644748, "step": 19870 }, { "epoch": 0.963271634848338, "grad_norm": 0.4468711316585541, "learning_rate": 1.7668292737653692e-07, "loss": 1.0858, "num_input_tokens_seen": 781036480, "step": 19880 }, { "epoch": 0.9637561779242174, "grad_norm": 0.4267124533653259, "learning_rate": 1.7205717897640638e-07, "loss": 1.1375, "num_input_tokens_seen": 781423288, "step": 19890 }, { "epoch": 0.964240721000097, "grad_norm": 0.3964008092880249, "learning_rate": 1.674925809369593e-07, "loss": 1.1471, "num_input_tokens_seen": 781830760, "step": 19900 }, { "epoch": 0.9647252640759764, "grad_norm": 0.39716637134552, "learning_rate": 1.6298914450063596e-07, "loss": 1.1797, "num_input_tokens_seen": 782212080, "step": 19910 }, { "epoch": 0.9652098071518558, "grad_norm": 0.372944712638855, "learning_rate": 1.5854688075924718e-07, "loss": 1.1394, "num_input_tokens_seen": 782640040, "step": 19920 }, { "epoch": 0.9656943502277352, "grad_norm": 0.3637344539165497, "learning_rate": 1.5416580065392984e-07, "loss": 1.1311, "num_input_tokens_seen": 783024516, "step": 19930 }, { "epoch": 0.9661788933036147, "grad_norm": 0.42081308364868164, "learning_rate": 1.4984591497513856e-07, "loss": 1.1429, "num_input_tokens_seen": 783413016, "step": 19940 }, { "epoch": 0.9666634363794941, "grad_norm": 0.4066031873226166, "learning_rate": 1.4558723436259857e-07, "loss": 1.0993, "num_input_tokens_seen": 783799984, "step": 19950 }, { "epoch": 0.9671479794553736, "grad_norm": 0.39138638973236084, "learning_rate": 1.413897693052918e-07, "loss": 1.1108, "num_input_tokens_seen": 784171868, "step": 19960 }, { "epoch": 0.967632522531253, "grad_norm": 0.39998748898506165, "learning_rate": 1.3725353014142627e-07, "loss": 1.1042, "num_input_tokens_seen": 784567228, "step": 19970 }, { "epoch": 0.9681170656071325, "grad_norm": 0.42037269473075867, "learning_rate": 1.3317852705842239e-07, "loss": 1.1562, "num_input_tokens_seen": 784983108, "step": 19980 }, { "epoch": 0.9686016086830119, "grad_norm": 0.4196394979953766, "learning_rate": 1.2916477009286553e-07, "loss": 1.1435, "num_input_tokens_seen": 785363296, "step": 19990 }, { "epoch": 0.9690861517588913, "grad_norm": 0.405160129070282, "learning_rate": 1.2521226913050077e-07, "loss": 1.0713, "num_input_tokens_seen": 785755388, "step": 20000 }, { "epoch": 0.9690861517588913, "eval_loss": 1.1160393953323364, "eval_runtime": 4.9905, "eval_samples_per_second": 30.057, "eval_steps_per_second": 3.807, "num_input_tokens_seen": 785755388, "step": 20000 }, { "epoch": 0.9695706948347708, "grad_norm": 0.38724902272224426, "learning_rate": 1.2132103390620208e-07, "loss": 1.1259, "num_input_tokens_seen": 786168264, "step": 20010 }, { "epoch": 0.9700552379106503, "grad_norm": 0.4315202236175537, "learning_rate": 1.1749107400394477e-07, "loss": 1.1549, "num_input_tokens_seen": 786578560, "step": 20020 }, { "epoch": 0.9705397809865297, "grad_norm": 0.4061446785926819, "learning_rate": 1.1372239885678871e-07, "loss": 1.1396, "num_input_tokens_seen": 786961092, "step": 20030 }, { "epoch": 0.9710243240624091, "grad_norm": 0.38450926542282104, "learning_rate": 1.1001501774684785e-07, "loss": 1.119, "num_input_tokens_seen": 787377820, "step": 20040 }, { "epoch": 0.9715088671382885, "grad_norm": 0.3797142803668976, "learning_rate": 1.063689398052764e-07, "loss": 1.1298, "num_input_tokens_seen": 787789388, "step": 20050 }, { "epoch": 0.9719934102141681, "grad_norm": 0.38401252031326294, "learning_rate": 1.0278417401223539e-07, "loss": 1.1608, "num_input_tokens_seen": 788175444, "step": 20060 }, { "epoch": 0.9724779532900475, "grad_norm": 0.39400792121887207, "learning_rate": 9.92607291968789e-08, "loss": 1.1329, "num_input_tokens_seen": 788565228, "step": 20070 }, { "epoch": 0.9729624963659269, "grad_norm": 0.36722561717033386, "learning_rate": 9.579861403732627e-08, "loss": 1.1117, "num_input_tokens_seen": 788948208, "step": 20080 }, { "epoch": 0.9734470394418063, "grad_norm": 0.39072954654693604, "learning_rate": 9.239783706065375e-08, "loss": 1.1896, "num_input_tokens_seen": 789332972, "step": 20090 }, { "epoch": 0.9739315825176859, "grad_norm": 0.4184485077857971, "learning_rate": 8.905840664284736e-08, "loss": 1.114, "num_input_tokens_seen": 789721976, "step": 20100 }, { "epoch": 0.9744161255935653, "grad_norm": 0.3963470160961151, "learning_rate": 8.578033100881677e-08, "loss": 1.1298, "num_input_tokens_seen": 790100792, "step": 20110 }, { "epoch": 0.9749006686694447, "grad_norm": 0.39767026901245117, "learning_rate": 8.256361823234527e-08, "loss": 1.1244, "num_input_tokens_seen": 790492440, "step": 20120 }, { "epoch": 0.9753852117453241, "grad_norm": 0.4353410005569458, "learning_rate": 7.940827623608427e-08, "loss": 1.1209, "num_input_tokens_seen": 790900392, "step": 20130 }, { "epoch": 0.9758697548212036, "grad_norm": 0.4203384518623352, "learning_rate": 7.631431279153111e-08, "loss": 1.1472, "num_input_tokens_seen": 791285628, "step": 20140 }, { "epoch": 0.9763542978970831, "grad_norm": 0.3865092098712921, "learning_rate": 7.328173551901241e-08, "loss": 1.1548, "num_input_tokens_seen": 791701900, "step": 20150 }, { "epoch": 0.9768388409729625, "grad_norm": 0.4074331521987915, "learning_rate": 7.031055188765622e-08, "loss": 1.0969, "num_input_tokens_seen": 792103252, "step": 20160 }, { "epoch": 0.9773233840488419, "grad_norm": 0.4258996546268463, "learning_rate": 6.74007692153894e-08, "loss": 1.1043, "num_input_tokens_seen": 792516300, "step": 20170 }, { "epoch": 0.9778079271247214, "grad_norm": 0.42485523223876953, "learning_rate": 6.455239466890418e-08, "loss": 1.1026, "num_input_tokens_seen": 792886424, "step": 20180 }, { "epoch": 0.9782924702006008, "grad_norm": 0.4095394015312195, "learning_rate": 6.176543526364709e-08, "loss": 1.1432, "num_input_tokens_seen": 793294236, "step": 20190 }, { "epoch": 0.9787770132764803, "grad_norm": 0.38347864151000977, "learning_rate": 5.903989786380515e-08, "loss": 1.126, "num_input_tokens_seen": 793704988, "step": 20200 }, { "epoch": 0.9792615563523597, "grad_norm": 0.39956969022750854, "learning_rate": 5.637578918227526e-08, "loss": 1.1295, "num_input_tokens_seen": 794085224, "step": 20210 }, { "epoch": 0.9797460994282392, "grad_norm": 0.4015878736972809, "learning_rate": 5.377311578067257e-08, "loss": 1.1144, "num_input_tokens_seen": 794453220, "step": 20220 }, { "epoch": 0.9802306425041186, "grad_norm": 0.4096335172653198, "learning_rate": 5.1231884069288805e-08, "loss": 1.0987, "num_input_tokens_seen": 794843816, "step": 20230 }, { "epoch": 0.980715185579998, "grad_norm": 0.3710688650608063, "learning_rate": 4.875210030708677e-08, "loss": 1.1324, "num_input_tokens_seen": 795210116, "step": 20240 }, { "epoch": 0.9811997286558775, "grad_norm": 0.39916378259658813, "learning_rate": 4.6333770601689195e-08, "loss": 1.1467, "num_input_tokens_seen": 795613888, "step": 20250 }, { "epoch": 0.981684271731757, "grad_norm": 0.4291175603866577, "learning_rate": 4.397690090935935e-08, "loss": 1.1484, "num_input_tokens_seen": 796019220, "step": 20260 }, { "epoch": 0.9821688148076364, "grad_norm": 0.37853673100471497, "learning_rate": 4.1681497034984364e-08, "loss": 1.0957, "num_input_tokens_seen": 796422340, "step": 20270 }, { "epoch": 0.9826533578835158, "grad_norm": 0.3903977572917938, "learning_rate": 3.9447564632066894e-08, "loss": 1.1658, "num_input_tokens_seen": 796813048, "step": 20280 }, { "epoch": 0.9831379009593952, "grad_norm": 0.39838188886642456, "learning_rate": 3.7275109202700165e-08, "loss": 1.0973, "num_input_tokens_seen": 797185304, "step": 20290 }, { "epoch": 0.9836224440352748, "grad_norm": 0.3897262513637543, "learning_rate": 3.5164136097567965e-08, "loss": 1.1272, "num_input_tokens_seen": 797583336, "step": 20300 }, { "epoch": 0.9841069871111542, "grad_norm": 0.400979220867157, "learning_rate": 3.311465051592522e-08, "loss": 1.144, "num_input_tokens_seen": 797981308, "step": 20310 }, { "epoch": 0.9845915301870336, "grad_norm": 0.4422793686389923, "learning_rate": 3.112665750558408e-08, "loss": 1.07, "num_input_tokens_seen": 798370388, "step": 20320 }, { "epoch": 0.985076073262913, "grad_norm": 0.37973761558532715, "learning_rate": 2.9200161962900098e-08, "loss": 1.0582, "num_input_tokens_seen": 798765140, "step": 20330 }, { "epoch": 0.9855606163387926, "grad_norm": 0.42133259773254395, "learning_rate": 2.7335168632763863e-08, "loss": 1.1353, "num_input_tokens_seen": 799159564, "step": 20340 }, { "epoch": 0.986045159414672, "grad_norm": 0.5169322490692139, "learning_rate": 2.5531682108595468e-08, "loss": 1.1157, "num_input_tokens_seen": 799545036, "step": 20350 }, { "epoch": 0.9865297024905514, "grad_norm": 0.4151138365268707, "learning_rate": 2.3789706832311186e-08, "loss": 1.0991, "num_input_tokens_seen": 799911908, "step": 20360 }, { "epoch": 0.9870142455664308, "grad_norm": 0.3700861930847168, "learning_rate": 2.2109247094342922e-08, "loss": 1.0973, "num_input_tokens_seen": 800292720, "step": 20370 }, { "epoch": 0.9874987886423103, "grad_norm": 0.3943372070789337, "learning_rate": 2.049030703359933e-08, "loss": 1.0944, "num_input_tokens_seen": 800674764, "step": 20380 }, { "epoch": 0.9879833317181898, "grad_norm": 0.40062424540519714, "learning_rate": 1.893289063747694e-08, "loss": 1.1316, "num_input_tokens_seen": 801063864, "step": 20390 }, { "epoch": 0.9884678747940692, "grad_norm": 0.40976980328559875, "learning_rate": 1.7437001741835157e-08, "loss": 1.1555, "num_input_tokens_seen": 801463696, "step": 20400 }, { "epoch": 0.9889524178699486, "grad_norm": 0.36616411805152893, "learning_rate": 1.6002644031001823e-08, "loss": 1.1345, "num_input_tokens_seen": 801848428, "step": 20410 }, { "epoch": 0.9894369609458281, "grad_norm": 0.42311692237854004, "learning_rate": 1.4629821037742686e-08, "loss": 1.1274, "num_input_tokens_seen": 802239180, "step": 20420 }, { "epoch": 0.9899215040217075, "grad_norm": 0.39517006278038025, "learning_rate": 1.331853614327805e-08, "loss": 1.1543, "num_input_tokens_seen": 802651824, "step": 20430 }, { "epoch": 0.990406047097587, "grad_norm": 0.4204373359680176, "learning_rate": 1.2068792577255017e-08, "loss": 1.1141, "num_input_tokens_seen": 803056060, "step": 20440 }, { "epoch": 0.9908905901734664, "grad_norm": 0.42741021513938904, "learning_rate": 1.0880593417753049e-08, "loss": 1.107, "num_input_tokens_seen": 803449480, "step": 20450 }, { "epoch": 0.9913751332493459, "grad_norm": 0.3925841450691223, "learning_rate": 9.753941591258974e-09, "loss": 1.0623, "num_input_tokens_seen": 803845032, "step": 20460 }, { "epoch": 0.9918596763252253, "grad_norm": 0.38680553436279297, "learning_rate": 8.6888398726781e-09, "loss": 1.0997, "num_input_tokens_seen": 804253704, "step": 20470 }, { "epoch": 0.9923442194011047, "grad_norm": 0.3853330612182617, "learning_rate": 7.685290885323104e-09, "loss": 1.1338, "num_input_tokens_seen": 804638340, "step": 20480 }, { "epoch": 0.9928287624769842, "grad_norm": 0.378024160861969, "learning_rate": 6.743297100897383e-09, "loss": 1.1218, "num_input_tokens_seen": 805045548, "step": 20490 }, { "epoch": 0.9933133055528637, "grad_norm": 0.4238669276237488, "learning_rate": 5.862860839500606e-09, "loss": 1.1136, "num_input_tokens_seen": 805444148, "step": 20500 }, { "epoch": 0.9937978486287431, "grad_norm": 0.40654098987579346, "learning_rate": 5.0439842696148324e-09, "loss": 1.1313, "num_input_tokens_seen": 805854612, "step": 20510 }, { "epoch": 0.9942823917046225, "grad_norm": 0.38661834597587585, "learning_rate": 4.286669408104516e-09, "loss": 1.0887, "num_input_tokens_seen": 806230656, "step": 20520 }, { "epoch": 0.9947669347805019, "grad_norm": 0.42557960748672485, "learning_rate": 3.5909181202137267e-09, "loss": 1.1222, "num_input_tokens_seen": 806626356, "step": 20530 }, { "epoch": 0.9952514778563815, "grad_norm": 0.40263572335243225, "learning_rate": 2.9567321195467236e-09, "loss": 1.0986, "num_input_tokens_seen": 807005260, "step": 20540 }, { "epoch": 0.9957360209322609, "grad_norm": 0.39904552698135376, "learning_rate": 2.384112968087382e-09, "loss": 1.0754, "num_input_tokens_seen": 807374212, "step": 20550 }, { "epoch": 0.9962205640081403, "grad_norm": 0.41701799631118774, "learning_rate": 1.8730620761742147e-09, "loss": 1.1052, "num_input_tokens_seen": 807812204, "step": 20560 }, { "epoch": 0.9967051070840197, "grad_norm": 0.3973786532878876, "learning_rate": 1.4235807025114733e-09, "loss": 1.1094, "num_input_tokens_seen": 808181824, "step": 20570 }, { "epoch": 0.9971896501598992, "grad_norm": 0.3856273889541626, "learning_rate": 1.0356699541497205e-09, "loss": 1.1272, "num_input_tokens_seen": 808554948, "step": 20580 }, { "epoch": 0.9976741932357787, "grad_norm": 0.4207695424556732, "learning_rate": 7.093307865052578e-10, "loss": 1.1468, "num_input_tokens_seen": 808980516, "step": 20590 }, { "epoch": 0.9981587363116581, "grad_norm": 0.4193539619445801, "learning_rate": 4.4456400333514616e-10, "loss": 1.1278, "num_input_tokens_seen": 809358268, "step": 20600 }, { "epoch": 0.9986432793875375, "grad_norm": 0.4102073609828949, "learning_rate": 2.4137025675663447e-10, "loss": 1.0943, "num_input_tokens_seen": 809733876, "step": 20610 }, { "epoch": 0.999127822463417, "grad_norm": 0.41482511162757874, "learning_rate": 9.975004722495573e-11, "loss": 1.1314, "num_input_tokens_seen": 810140020, "step": 20620 }, { "epoch": 0.9996123655392964, "grad_norm": 0.3799668252468109, "learning_rate": 1.9703723547204178e-11, "loss": 1.1301, "num_input_tokens_seen": 810535920, "step": 20630 }, { "epoch": 1.0, "num_input_tokens_seen": 810839096, "step": 20638, "total_flos": 4.0772707408255386e+18, "train_loss": 1.171214375919653, "train_runtime": 248208.7734, "train_samples_per_second": 10.643, "train_steps_per_second": 0.083 } ], "logging_steps": 10, "max_steps": 20638, "num_input_tokens_seen": 810839096, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.0772707408255386e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }