diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16637 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 2000, + "global_step": 20638, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00048454307587944567, + "grad_norm": 0.4255552887916565, + "learning_rate": 8.064516129032258e-07, + "loss": 1.221, + "num_input_tokens_seen": 392540, + "step": 10 + }, + { + "epoch": 0.0009690861517588913, + "grad_norm": 0.4066425859928131, + "learning_rate": 1.6129032258064516e-06, + "loss": 1.2327, + "num_input_tokens_seen": 788876, + "step": 20 + }, + { + "epoch": 0.0014536292276383371, + "grad_norm": 0.37268057465553284, + "learning_rate": 2.4193548387096776e-06, + "loss": 1.2174, + "num_input_tokens_seen": 1187392, + "step": 30 + }, + { + "epoch": 0.0019381723035177827, + "grad_norm": 0.38770782947540283, + "learning_rate": 3.225806451612903e-06, + "loss": 1.1503, + "num_input_tokens_seen": 1556468, + "step": 40 + }, + { + "epoch": 0.0024227153793972282, + "grad_norm": 0.40057116746902466, + "learning_rate": 4.032258064516129e-06, + "loss": 1.2156, + "num_input_tokens_seen": 1950296, + "step": 50 + }, + { + "epoch": 0.0029072584552766742, + "grad_norm": 0.4051766097545624, + "learning_rate": 4.838709677419355e-06, + "loss": 1.2149, + "num_input_tokens_seen": 2336152, + "step": 60 + }, + { + "epoch": 0.00339180153115612, + "grad_norm": 0.41299352049827576, + "learning_rate": 5.64516129032258e-06, + "loss": 1.19, + "num_input_tokens_seen": 2728108, + "step": 70 + }, + { + "epoch": 0.0038763446070355654, + "grad_norm": 0.4118805527687073, + "learning_rate": 6.451612903225806e-06, + "loss": 1.1994, + "num_input_tokens_seen": 3111872, + "step": 80 + }, + { + "epoch": 0.004360887682915011, + "grad_norm": 0.42130813002586365, + "learning_rate": 7.258064516129033e-06, + "loss": 1.2706, + "num_input_tokens_seen": 3506908, + "step": 90 + }, + { + "epoch": 0.0048454307587944565, + "grad_norm": 0.3949407637119293, + "learning_rate": 8.064516129032258e-06, + "loss": 1.187, + "num_input_tokens_seen": 3920340, + "step": 100 + }, + { + "epoch": 0.005329973834673903, + "grad_norm": 0.42650577425956726, + "learning_rate": 8.870967741935484e-06, + "loss": 1.2195, + "num_input_tokens_seen": 4303752, + "step": 110 + }, + { + "epoch": 0.0058145169105533485, + "grad_norm": 0.43526706099510193, + "learning_rate": 9.67741935483871e-06, + "loss": 1.2141, + "num_input_tokens_seen": 4684992, + "step": 120 + }, + { + "epoch": 0.006299059986432794, + "grad_norm": 0.4030572474002838, + "learning_rate": 1.0483870967741936e-05, + "loss": 1.2873, + "num_input_tokens_seen": 5052616, + "step": 130 + }, + { + "epoch": 0.00678360306231224, + "grad_norm": 0.41993653774261475, + "learning_rate": 1.129032258064516e-05, + "loss": 1.2175, + "num_input_tokens_seen": 5427828, + "step": 140 + }, + { + "epoch": 0.007268146138191685, + "grad_norm": 0.4032376706600189, + "learning_rate": 1.2096774193548388e-05, + "loss": 1.2747, + "num_input_tokens_seen": 5819148, + "step": 150 + }, + { + "epoch": 0.007752689214071131, + "grad_norm": 0.4497855603694916, + "learning_rate": 1.2903225806451613e-05, + "loss": 1.2422, + "num_input_tokens_seen": 6198028, + "step": 160 + }, + { + "epoch": 0.008237232289950576, + "grad_norm": 0.4343348741531372, + "learning_rate": 1.3709677419354839e-05, + "loss": 1.2621, + "num_input_tokens_seen": 6604716, + "step": 170 + }, + { + "epoch": 0.008721775365830022, + "grad_norm": 0.4532233774662018, + "learning_rate": 1.4516129032258066e-05, + "loss": 1.2263, + "num_input_tokens_seen": 6995768, + "step": 180 + }, + { + "epoch": 0.009206318441709467, + "grad_norm": 0.4360114634037018, + "learning_rate": 1.5322580645161292e-05, + "loss": 1.2022, + "num_input_tokens_seen": 7395348, + "step": 190 + }, + { + "epoch": 0.009690861517588913, + "grad_norm": 0.4088006913661957, + "learning_rate": 1.6129032258064517e-05, + "loss": 1.2199, + "num_input_tokens_seen": 7783256, + "step": 200 + }, + { + "epoch": 0.010175404593468359, + "grad_norm": 0.4394921064376831, + "learning_rate": 1.693548387096774e-05, + "loss": 1.2227, + "num_input_tokens_seen": 8183636, + "step": 210 + }, + { + "epoch": 0.010659947669347806, + "grad_norm": 0.4413774311542511, + "learning_rate": 1.774193548387097e-05, + "loss": 1.2134, + "num_input_tokens_seen": 8541160, + "step": 220 + }, + { + "epoch": 0.011144490745227251, + "grad_norm": 0.4048616588115692, + "learning_rate": 1.8548387096774193e-05, + "loss": 1.1969, + "num_input_tokens_seen": 8913056, + "step": 230 + }, + { + "epoch": 0.011629033821106697, + "grad_norm": 0.4245803654193878, + "learning_rate": 1.935483870967742e-05, + "loss": 1.2161, + "num_input_tokens_seen": 9304116, + "step": 240 + }, + { + "epoch": 0.012113576896986143, + "grad_norm": 0.4485531151294708, + "learning_rate": 2.0161290322580645e-05, + "loss": 1.1966, + "num_input_tokens_seen": 9690224, + "step": 250 + }, + { + "epoch": 0.012598119972865588, + "grad_norm": 0.44514381885528564, + "learning_rate": 2.0967741935483873e-05, + "loss": 1.2736, + "num_input_tokens_seen": 10093420, + "step": 260 + }, + { + "epoch": 0.013082663048745034, + "grad_norm": 0.47046637535095215, + "learning_rate": 2.1774193548387097e-05, + "loss": 1.2217, + "num_input_tokens_seen": 10480092, + "step": 270 + }, + { + "epoch": 0.01356720612462448, + "grad_norm": 0.4320948123931885, + "learning_rate": 2.258064516129032e-05, + "loss": 1.2185, + "num_input_tokens_seen": 10856260, + "step": 280 + }, + { + "epoch": 0.014051749200503925, + "grad_norm": 0.418040931224823, + "learning_rate": 2.338709677419355e-05, + "loss": 1.2504, + "num_input_tokens_seen": 11231836, + "step": 290 + }, + { + "epoch": 0.01453629227638337, + "grad_norm": 0.4360697865486145, + "learning_rate": 2.4193548387096777e-05, + "loss": 1.2199, + "num_input_tokens_seen": 11622080, + "step": 300 + }, + { + "epoch": 0.015020835352262816, + "grad_norm": 0.4039739668369293, + "learning_rate": 2.5e-05, + "loss": 1.1947, + "num_input_tokens_seen": 12009244, + "step": 310 + }, + { + "epoch": 0.015505378428142261, + "grad_norm": 0.4114651381969452, + "learning_rate": 2.5806451612903226e-05, + "loss": 1.2345, + "num_input_tokens_seen": 12404068, + "step": 320 + }, + { + "epoch": 0.015989921504021707, + "grad_norm": 0.44177761673927307, + "learning_rate": 2.661290322580645e-05, + "loss": 1.2303, + "num_input_tokens_seen": 12774432, + "step": 330 + }, + { + "epoch": 0.016474464579901153, + "grad_norm": 0.4257271885871887, + "learning_rate": 2.7419354838709678e-05, + "loss": 1.2513, + "num_input_tokens_seen": 13155544, + "step": 340 + }, + { + "epoch": 0.016959007655780598, + "grad_norm": 0.4340864419937134, + "learning_rate": 2.822580645161291e-05, + "loss": 1.2486, + "num_input_tokens_seen": 13554616, + "step": 350 + }, + { + "epoch": 0.017443550731660044, + "grad_norm": 0.4148235023021698, + "learning_rate": 2.9032258064516133e-05, + "loss": 1.2193, + "num_input_tokens_seen": 13937664, + "step": 360 + }, + { + "epoch": 0.01792809380753949, + "grad_norm": 0.417041152715683, + "learning_rate": 2.9838709677419357e-05, + "loss": 1.2142, + "num_input_tokens_seen": 14331280, + "step": 370 + }, + { + "epoch": 0.018412636883418935, + "grad_norm": 0.39417609572410583, + "learning_rate": 3.0645161290322585e-05, + "loss": 1.2865, + "num_input_tokens_seen": 14707672, + "step": 380 + }, + { + "epoch": 0.01889717995929838, + "grad_norm": 0.4522784948348999, + "learning_rate": 3.1451612903225806e-05, + "loss": 1.2437, + "num_input_tokens_seen": 15106840, + "step": 390 + }, + { + "epoch": 0.019381723035177826, + "grad_norm": 0.43782681226730347, + "learning_rate": 3.2258064516129034e-05, + "loss": 1.2215, + "num_input_tokens_seen": 15480172, + "step": 400 + }, + { + "epoch": 0.01986626611105727, + "grad_norm": 0.4035712480545044, + "learning_rate": 3.306451612903226e-05, + "loss": 1.2133, + "num_input_tokens_seen": 15879184, + "step": 410 + }, + { + "epoch": 0.020350809186936717, + "grad_norm": 0.4507812261581421, + "learning_rate": 3.387096774193548e-05, + "loss": 1.2469, + "num_input_tokens_seen": 16282872, + "step": 420 + }, + { + "epoch": 0.020835352262816166, + "grad_norm": 0.41557714343070984, + "learning_rate": 3.467741935483872e-05, + "loss": 1.2694, + "num_input_tokens_seen": 16649572, + "step": 430 + }, + { + "epoch": 0.02131989533869561, + "grad_norm": 0.43545469641685486, + "learning_rate": 3.548387096774194e-05, + "loss": 1.2469, + "num_input_tokens_seen": 17047428, + "step": 440 + }, + { + "epoch": 0.021804438414575057, + "grad_norm": 0.46730995178222656, + "learning_rate": 3.6290322580645165e-05, + "loss": 1.2082, + "num_input_tokens_seen": 17447240, + "step": 450 + }, + { + "epoch": 0.022288981490454503, + "grad_norm": 0.45304545760154724, + "learning_rate": 3.7096774193548386e-05, + "loss": 1.1941, + "num_input_tokens_seen": 17831524, + "step": 460 + }, + { + "epoch": 0.02277352456633395, + "grad_norm": 0.44148966670036316, + "learning_rate": 3.7903225806451614e-05, + "loss": 1.2507, + "num_input_tokens_seen": 18226524, + "step": 470 + }, + { + "epoch": 0.023258067642213394, + "grad_norm": 0.46780791878700256, + "learning_rate": 3.870967741935484e-05, + "loss": 1.2136, + "num_input_tokens_seen": 18621280, + "step": 480 + }, + { + "epoch": 0.02374261071809284, + "grad_norm": 0.40209197998046875, + "learning_rate": 3.951612903225806e-05, + "loss": 1.3024, + "num_input_tokens_seen": 19023172, + "step": 490 + }, + { + "epoch": 0.024227153793972285, + "grad_norm": 0.45693570375442505, + "learning_rate": 4.032258064516129e-05, + "loss": 1.2317, + "num_input_tokens_seen": 19384076, + "step": 500 + }, + { + "epoch": 0.02471169686985173, + "grad_norm": 0.4480474293231964, + "learning_rate": 4.112903225806452e-05, + "loss": 1.244, + "num_input_tokens_seen": 19772672, + "step": 510 + }, + { + "epoch": 0.025196239945731176, + "grad_norm": 0.3898276686668396, + "learning_rate": 4.1935483870967746e-05, + "loss": 1.2248, + "num_input_tokens_seen": 20147580, + "step": 520 + }, + { + "epoch": 0.02568078302161062, + "grad_norm": 0.45244449377059937, + "learning_rate": 4.2741935483870973e-05, + "loss": 1.2479, + "num_input_tokens_seen": 20538480, + "step": 530 + }, + { + "epoch": 0.026165326097490067, + "grad_norm": 0.4444933235645294, + "learning_rate": 4.3548387096774194e-05, + "loss": 1.2349, + "num_input_tokens_seen": 20938472, + "step": 540 + }, + { + "epoch": 0.026649869173369513, + "grad_norm": 0.4245850741863251, + "learning_rate": 4.435483870967742e-05, + "loss": 1.2061, + "num_input_tokens_seen": 21323560, + "step": 550 + }, + { + "epoch": 0.02713441224924896, + "grad_norm": 0.4357333481311798, + "learning_rate": 4.516129032258064e-05, + "loss": 1.2532, + "num_input_tokens_seen": 21725356, + "step": 560 + }, + { + "epoch": 0.027618955325128404, + "grad_norm": 0.4161827564239502, + "learning_rate": 4.596774193548387e-05, + "loss": 1.241, + "num_input_tokens_seen": 22107816, + "step": 570 + }, + { + "epoch": 0.02810349840100785, + "grad_norm": 0.44058066606521606, + "learning_rate": 4.67741935483871e-05, + "loss": 1.284, + "num_input_tokens_seen": 22490484, + "step": 580 + }, + { + "epoch": 0.028588041476887295, + "grad_norm": 0.4422559142112732, + "learning_rate": 4.7580645161290326e-05, + "loss": 1.2633, + "num_input_tokens_seen": 22890776, + "step": 590 + }, + { + "epoch": 0.02907258455276674, + "grad_norm": 0.4280403256416321, + "learning_rate": 4.8387096774193554e-05, + "loss": 1.2649, + "num_input_tokens_seen": 23274324, + "step": 600 + }, + { + "epoch": 0.029557127628646186, + "grad_norm": 0.4213055968284607, + "learning_rate": 4.9193548387096775e-05, + "loss": 1.2544, + "num_input_tokens_seen": 23668904, + "step": 610 + }, + { + "epoch": 0.030041670704525632, + "grad_norm": 0.4733617901802063, + "learning_rate": 5e-05, + "loss": 1.2238, + "num_input_tokens_seen": 24062116, + "step": 620 + }, + { + "epoch": 0.030526213780405077, + "grad_norm": 0.4112738370895386, + "learning_rate": 4.999996921293424e-05, + "loss": 1.2544, + "num_input_tokens_seen": 24430924, + "step": 630 + }, + { + "epoch": 0.031010756856284523, + "grad_norm": 0.3989699184894562, + "learning_rate": 4.999987685181276e-05, + "loss": 1.2679, + "num_input_tokens_seen": 24803116, + "step": 640 + }, + { + "epoch": 0.03149529993216397, + "grad_norm": 0.46107640862464905, + "learning_rate": 4.9999722916863064e-05, + "loss": 1.2498, + "num_input_tokens_seen": 25174648, + "step": 650 + }, + { + "epoch": 0.031979843008043414, + "grad_norm": 0.4159105122089386, + "learning_rate": 4.999950740846427e-05, + "loss": 1.2148, + "num_input_tokens_seen": 25572492, + "step": 660 + }, + { + "epoch": 0.03246438608392286, + "grad_norm": 0.41995078325271606, + "learning_rate": 4.9999230327147187e-05, + "loss": 1.2921, + "num_input_tokens_seen": 25970816, + "step": 670 + }, + { + "epoch": 0.032948929159802305, + "grad_norm": 0.4316340386867523, + "learning_rate": 4.999889167359425e-05, + "loss": 1.2401, + "num_input_tokens_seen": 26346612, + "step": 680 + }, + { + "epoch": 0.03343347223568175, + "grad_norm": 0.4364665448665619, + "learning_rate": 4.999849144863954e-05, + "loss": 1.228, + "num_input_tokens_seen": 26750808, + "step": 690 + }, + { + "epoch": 0.033918015311561196, + "grad_norm": 0.418192595243454, + "learning_rate": 4.9998029653268805e-05, + "loss": 1.2565, + "num_input_tokens_seen": 27144672, + "step": 700 + }, + { + "epoch": 0.03440255838744064, + "grad_norm": 0.42366668581962585, + "learning_rate": 4.9997506288619436e-05, + "loss": 1.3009, + "num_input_tokens_seen": 27537568, + "step": 710 + }, + { + "epoch": 0.03488710146332009, + "grad_norm": 0.40567639470100403, + "learning_rate": 4.9996921355980456e-05, + "loss": 1.1824, + "num_input_tokens_seen": 27934864, + "step": 720 + }, + { + "epoch": 0.03537164453919953, + "grad_norm": 0.4429372549057007, + "learning_rate": 4.999627485679254e-05, + "loss": 1.2941, + "num_input_tokens_seen": 28315316, + "step": 730 + }, + { + "epoch": 0.03585618761507898, + "grad_norm": 0.4326581358909607, + "learning_rate": 4.999556679264798e-05, + "loss": 1.2178, + "num_input_tokens_seen": 28695628, + "step": 740 + }, + { + "epoch": 0.036340730690958424, + "grad_norm": 0.39658111333847046, + "learning_rate": 4.9994797165290724e-05, + "loss": 1.2194, + "num_input_tokens_seen": 29075004, + "step": 750 + }, + { + "epoch": 0.03682527376683787, + "grad_norm": 0.42921292781829834, + "learning_rate": 4.999396597661634e-05, + "loss": 1.262, + "num_input_tokens_seen": 29452648, + "step": 760 + }, + { + "epoch": 0.037309816842717315, + "grad_norm": 0.42564624547958374, + "learning_rate": 4.999307322867201e-05, + "loss": 1.2686, + "num_input_tokens_seen": 29806040, + "step": 770 + }, + { + "epoch": 0.03779435991859676, + "grad_norm": 0.42206162214279175, + "learning_rate": 4.9992118923656525e-05, + "loss": 1.2644, + "num_input_tokens_seen": 30187964, + "step": 780 + }, + { + "epoch": 0.038278902994476206, + "grad_norm": 0.40607237815856934, + "learning_rate": 4.999110306392034e-05, + "loss": 1.2095, + "num_input_tokens_seen": 30595388, + "step": 790 + }, + { + "epoch": 0.03876344607035565, + "grad_norm": 0.4239601492881775, + "learning_rate": 4.999002565196546e-05, + "loss": 1.2785, + "num_input_tokens_seen": 30969404, + "step": 800 + }, + { + "epoch": 0.0392479891462351, + "grad_norm": 0.37221038341522217, + "learning_rate": 4.9988886690445524e-05, + "loss": 1.2645, + "num_input_tokens_seen": 31352172, + "step": 810 + }, + { + "epoch": 0.03973253222211454, + "grad_norm": 0.42992502450942993, + "learning_rate": 4.998768618216575e-05, + "loss": 1.2646, + "num_input_tokens_seen": 31707436, + "step": 820 + }, + { + "epoch": 0.04021707529799399, + "grad_norm": 0.40228039026260376, + "learning_rate": 4.998642413008294e-05, + "loss": 1.2796, + "num_input_tokens_seen": 32113760, + "step": 830 + }, + { + "epoch": 0.040701618373873434, + "grad_norm": 0.4015443027019501, + "learning_rate": 4.9985100537305494e-05, + "loss": 1.271, + "num_input_tokens_seen": 32495944, + "step": 840 + }, + { + "epoch": 0.04118616144975288, + "grad_norm": 0.4232306480407715, + "learning_rate": 4.998371540709338e-05, + "loss": 1.2403, + "num_input_tokens_seen": 32878288, + "step": 850 + }, + { + "epoch": 0.04167070452563233, + "grad_norm": 0.4329904317855835, + "learning_rate": 4.998226874285811e-05, + "loss": 1.2433, + "num_input_tokens_seen": 33283124, + "step": 860 + }, + { + "epoch": 0.04215524760151178, + "grad_norm": 0.3939256966114044, + "learning_rate": 4.998076054816278e-05, + "loss": 1.2513, + "num_input_tokens_seen": 33675788, + "step": 870 + }, + { + "epoch": 0.04263979067739122, + "grad_norm": 0.40896937251091003, + "learning_rate": 4.997919082672201e-05, + "loss": 1.2058, + "num_input_tokens_seen": 34078876, + "step": 880 + }, + { + "epoch": 0.04312433375327067, + "grad_norm": 0.41988757252693176, + "learning_rate": 4.997755958240198e-05, + "loss": 1.2704, + "num_input_tokens_seen": 34476892, + "step": 890 + }, + { + "epoch": 0.043608876829150114, + "grad_norm": 0.3970952332019806, + "learning_rate": 4.997586681922039e-05, + "loss": 1.2592, + "num_input_tokens_seen": 34879628, + "step": 900 + }, + { + "epoch": 0.04409341990502956, + "grad_norm": 0.3762570917606354, + "learning_rate": 4.997411254134645e-05, + "loss": 1.2744, + "num_input_tokens_seen": 35263368, + "step": 910 + }, + { + "epoch": 0.044577962980909006, + "grad_norm": 0.44425544142723083, + "learning_rate": 4.9972296753100875e-05, + "loss": 1.27, + "num_input_tokens_seen": 35669980, + "step": 920 + }, + { + "epoch": 0.04506250605678845, + "grad_norm": 0.3981640636920929, + "learning_rate": 4.9970419458955916e-05, + "loss": 1.2627, + "num_input_tokens_seen": 36075236, + "step": 930 + }, + { + "epoch": 0.0455470491326679, + "grad_norm": 0.42503446340560913, + "learning_rate": 4.996848066353526e-05, + "loss": 1.2742, + "num_input_tokens_seen": 36460584, + "step": 940 + }, + { + "epoch": 0.04603159220854734, + "grad_norm": 0.44177576899528503, + "learning_rate": 4.99664803716141e-05, + "loss": 1.2338, + "num_input_tokens_seen": 36844140, + "step": 950 + }, + { + "epoch": 0.04651613528442679, + "grad_norm": 0.4258316159248352, + "learning_rate": 4.996441858811909e-05, + "loss": 1.251, + "num_input_tokens_seen": 37245224, + "step": 960 + }, + { + "epoch": 0.04700067836030623, + "grad_norm": 0.39270251989364624, + "learning_rate": 4.996229531812833e-05, + "loss": 1.2682, + "num_input_tokens_seen": 37623120, + "step": 970 + }, + { + "epoch": 0.04748522143618568, + "grad_norm": 0.40389484167099, + "learning_rate": 4.996011056687135e-05, + "loss": 1.2599, + "num_input_tokens_seen": 37992076, + "step": 980 + }, + { + "epoch": 0.047969764512065124, + "grad_norm": 0.4183795154094696, + "learning_rate": 4.9957864339729126e-05, + "loss": 1.2647, + "num_input_tokens_seen": 38392008, + "step": 990 + }, + { + "epoch": 0.04845430758794457, + "grad_norm": 0.3972471356391907, + "learning_rate": 4.9955556642234034e-05, + "loss": 1.2387, + "num_input_tokens_seen": 38802860, + "step": 1000 + }, + { + "epoch": 0.048938850663824016, + "grad_norm": 0.3918885886669159, + "learning_rate": 4.9953187480069854e-05, + "loss": 1.2611, + "num_input_tokens_seen": 39175600, + "step": 1010 + }, + { + "epoch": 0.04942339373970346, + "grad_norm": 0.4439409673213959, + "learning_rate": 4.9950756859071755e-05, + "loss": 1.1817, + "num_input_tokens_seen": 39586740, + "step": 1020 + }, + { + "epoch": 0.04990793681558291, + "grad_norm": 0.43080589175224304, + "learning_rate": 4.994826478522626e-05, + "loss": 1.2453, + "num_input_tokens_seen": 39993292, + "step": 1030 + }, + { + "epoch": 0.05039247989146235, + "grad_norm": 0.41559743881225586, + "learning_rate": 4.9945711264671276e-05, + "loss": 1.2635, + "num_input_tokens_seen": 40412108, + "step": 1040 + }, + { + "epoch": 0.0508770229673418, + "grad_norm": 0.4444926381111145, + "learning_rate": 4.994309630369602e-05, + "loss": 1.2742, + "num_input_tokens_seen": 40815268, + "step": 1050 + }, + { + "epoch": 0.05136156604322124, + "grad_norm": 0.43432703614234924, + "learning_rate": 4.9940419908741065e-05, + "loss": 1.2782, + "num_input_tokens_seen": 41228008, + "step": 1060 + }, + { + "epoch": 0.05184610911910069, + "grad_norm": 0.46701622009277344, + "learning_rate": 4.993768208639826e-05, + "loss": 1.1921, + "num_input_tokens_seen": 41641820, + "step": 1070 + }, + { + "epoch": 0.052330652194980135, + "grad_norm": 0.42707207798957825, + "learning_rate": 4.993488284341078e-05, + "loss": 1.221, + "num_input_tokens_seen": 42035360, + "step": 1080 + }, + { + "epoch": 0.05281519527085958, + "grad_norm": 0.4334350824356079, + "learning_rate": 4.993202218667307e-05, + "loss": 1.2273, + "num_input_tokens_seen": 42437020, + "step": 1090 + }, + { + "epoch": 0.053299738346739026, + "grad_norm": 0.3986518681049347, + "learning_rate": 4.99291001232308e-05, + "loss": 1.2333, + "num_input_tokens_seen": 42804100, + "step": 1100 + }, + { + "epoch": 0.05378428142261847, + "grad_norm": 0.4267319142818451, + "learning_rate": 4.992611666028094e-05, + "loss": 1.2582, + "num_input_tokens_seen": 43211524, + "step": 1110 + }, + { + "epoch": 0.05426882449849792, + "grad_norm": 0.4689897894859314, + "learning_rate": 4.992307180517165e-05, + "loss": 1.2354, + "num_input_tokens_seen": 43626756, + "step": 1120 + }, + { + "epoch": 0.05475336757437736, + "grad_norm": 0.45038825273513794, + "learning_rate": 4.991996556540229e-05, + "loss": 1.2588, + "num_input_tokens_seen": 44045992, + "step": 1130 + }, + { + "epoch": 0.05523791065025681, + "grad_norm": 0.4214176535606384, + "learning_rate": 4.991679794862343e-05, + "loss": 1.1964, + "num_input_tokens_seen": 44421180, + "step": 1140 + }, + { + "epoch": 0.055722453726136253, + "grad_norm": 0.41900694370269775, + "learning_rate": 4.99135689626368e-05, + "loss": 1.2565, + "num_input_tokens_seen": 44831096, + "step": 1150 + }, + { + "epoch": 0.0562069968020157, + "grad_norm": 0.39536434412002563, + "learning_rate": 4.9910278615395276e-05, + "loss": 1.2181, + "num_input_tokens_seen": 45272232, + "step": 1160 + }, + { + "epoch": 0.056691539877895145, + "grad_norm": 0.4110620319843292, + "learning_rate": 4.9906926915002875e-05, + "loss": 1.2794, + "num_input_tokens_seen": 45672992, + "step": 1170 + }, + { + "epoch": 0.05717608295377459, + "grad_norm": 0.4034215211868286, + "learning_rate": 4.9903513869714704e-05, + "loss": 1.2561, + "num_input_tokens_seen": 46085848, + "step": 1180 + }, + { + "epoch": 0.057660626029654036, + "grad_norm": 0.456302672624588, + "learning_rate": 4.990003948793699e-05, + "loss": 1.258, + "num_input_tokens_seen": 46486968, + "step": 1190 + }, + { + "epoch": 0.05814516910553348, + "grad_norm": 0.4716334044933319, + "learning_rate": 4.989650377822702e-05, + "loss": 1.2495, + "num_input_tokens_seen": 46892488, + "step": 1200 + }, + { + "epoch": 0.05862971218141293, + "grad_norm": 0.4383469820022583, + "learning_rate": 4.98929067492931e-05, + "loss": 1.2278, + "num_input_tokens_seen": 47271524, + "step": 1210 + }, + { + "epoch": 0.05911425525729237, + "grad_norm": 0.4059995710849762, + "learning_rate": 4.988924840999462e-05, + "loss": 1.2907, + "num_input_tokens_seen": 47666308, + "step": 1220 + }, + { + "epoch": 0.05959879833317182, + "grad_norm": 0.42955759167671204, + "learning_rate": 4.9885528769341905e-05, + "loss": 1.2455, + "num_input_tokens_seen": 48036456, + "step": 1230 + }, + { + "epoch": 0.060083341409051264, + "grad_norm": 0.43056222796440125, + "learning_rate": 4.988174783649633e-05, + "loss": 1.2205, + "num_input_tokens_seen": 48426548, + "step": 1240 + }, + { + "epoch": 0.06056788448493071, + "grad_norm": 0.408191442489624, + "learning_rate": 4.987790562077019e-05, + "loss": 1.2517, + "num_input_tokens_seen": 48803056, + "step": 1250 + }, + { + "epoch": 0.061052427560810155, + "grad_norm": 0.3784200847148895, + "learning_rate": 4.987400213162673e-05, + "loss": 1.2009, + "num_input_tokens_seen": 49204892, + "step": 1260 + }, + { + "epoch": 0.0615369706366896, + "grad_norm": 0.4229642450809479, + "learning_rate": 4.987003737868011e-05, + "loss": 1.2979, + "num_input_tokens_seen": 49597520, + "step": 1270 + }, + { + "epoch": 0.062021513712569046, + "grad_norm": 0.4075496792793274, + "learning_rate": 4.9866011371695374e-05, + "loss": 1.2041, + "num_input_tokens_seen": 49973816, + "step": 1280 + }, + { + "epoch": 0.06250605678844849, + "grad_norm": 0.4234859049320221, + "learning_rate": 4.9861924120588445e-05, + "loss": 1.2489, + "num_input_tokens_seen": 50382180, + "step": 1290 + }, + { + "epoch": 0.06299059986432794, + "grad_norm": 0.4083632528781891, + "learning_rate": 4.985777563542607e-05, + "loss": 1.2459, + "num_input_tokens_seen": 50746876, + "step": 1300 + }, + { + "epoch": 0.06347514294020738, + "grad_norm": 0.4187311828136444, + "learning_rate": 4.985356592642584e-05, + "loss": 1.2366, + "num_input_tokens_seen": 51150280, + "step": 1310 + }, + { + "epoch": 0.06395968601608683, + "grad_norm": 0.4542880952358246, + "learning_rate": 4.984929500395611e-05, + "loss": 1.2511, + "num_input_tokens_seen": 51527856, + "step": 1320 + }, + { + "epoch": 0.06444422909196627, + "grad_norm": 0.40190425515174866, + "learning_rate": 4.9844962878536004e-05, + "loss": 1.2314, + "num_input_tokens_seen": 51885820, + "step": 1330 + }, + { + "epoch": 0.06492877216784572, + "grad_norm": 0.3928161859512329, + "learning_rate": 4.9840569560835416e-05, + "loss": 1.2156, + "num_input_tokens_seen": 52276412, + "step": 1340 + }, + { + "epoch": 0.06541331524372516, + "grad_norm": 0.407649964094162, + "learning_rate": 4.9836115061674925e-05, + "loss": 1.2379, + "num_input_tokens_seen": 52647892, + "step": 1350 + }, + { + "epoch": 0.06589785831960461, + "grad_norm": 0.4136994183063507, + "learning_rate": 4.983159939202582e-05, + "loss": 1.2633, + "num_input_tokens_seen": 53053832, + "step": 1360 + }, + { + "epoch": 0.06638240139548406, + "grad_norm": 0.4096525311470032, + "learning_rate": 4.9827022563010016e-05, + "loss": 1.2265, + "num_input_tokens_seen": 53431196, + "step": 1370 + }, + { + "epoch": 0.0668669444713635, + "grad_norm": 0.4357371926307678, + "learning_rate": 4.982238458590009e-05, + "loss": 1.2617, + "num_input_tokens_seen": 53833560, + "step": 1380 + }, + { + "epoch": 0.06735148754724295, + "grad_norm": 0.4323122799396515, + "learning_rate": 4.9817685472119246e-05, + "loss": 1.3117, + "num_input_tokens_seen": 54217320, + "step": 1390 + }, + { + "epoch": 0.06783603062312239, + "grad_norm": 0.4947691261768341, + "learning_rate": 4.98129252332412e-05, + "loss": 1.2197, + "num_input_tokens_seen": 54609344, + "step": 1400 + }, + { + "epoch": 0.06832057369900184, + "grad_norm": 0.43488138914108276, + "learning_rate": 4.980810388099028e-05, + "loss": 1.256, + "num_input_tokens_seen": 55028116, + "step": 1410 + }, + { + "epoch": 0.06880511677488128, + "grad_norm": 0.4741380512714386, + "learning_rate": 4.980322142724129e-05, + "loss": 1.2525, + "num_input_tokens_seen": 55416832, + "step": 1420 + }, + { + "epoch": 0.06928965985076073, + "grad_norm": 0.4468291103839874, + "learning_rate": 4.979827788401956e-05, + "loss": 1.2414, + "num_input_tokens_seen": 55798836, + "step": 1430 + }, + { + "epoch": 0.06977420292664017, + "grad_norm": 0.42211663722991943, + "learning_rate": 4.979327326350086e-05, + "loss": 1.2589, + "num_input_tokens_seen": 56187416, + "step": 1440 + }, + { + "epoch": 0.07025874600251962, + "grad_norm": 0.4140244722366333, + "learning_rate": 4.9788207578011405e-05, + "loss": 1.239, + "num_input_tokens_seen": 56577512, + "step": 1450 + }, + { + "epoch": 0.07074328907839907, + "grad_norm": 0.39189738035202026, + "learning_rate": 4.978308084002779e-05, + "loss": 1.1905, + "num_input_tokens_seen": 56978600, + "step": 1460 + }, + { + "epoch": 0.07122783215427851, + "grad_norm": 0.3976607024669647, + "learning_rate": 4.9777893062176986e-05, + "loss": 1.252, + "num_input_tokens_seen": 57367068, + "step": 1470 + }, + { + "epoch": 0.07171237523015796, + "grad_norm": 0.3938475251197815, + "learning_rate": 4.977264425723632e-05, + "loss": 1.2644, + "num_input_tokens_seen": 57756716, + "step": 1480 + }, + { + "epoch": 0.0721969183060374, + "grad_norm": 0.3958350718021393, + "learning_rate": 4.976733443813343e-05, + "loss": 1.2834, + "num_input_tokens_seen": 58127588, + "step": 1490 + }, + { + "epoch": 0.07268146138191685, + "grad_norm": 0.41049420833587646, + "learning_rate": 4.976196361794619e-05, + "loss": 1.2359, + "num_input_tokens_seen": 58531680, + "step": 1500 + }, + { + "epoch": 0.0731660044577963, + "grad_norm": 0.40144941210746765, + "learning_rate": 4.9756531809902765e-05, + "loss": 1.2358, + "num_input_tokens_seen": 58917960, + "step": 1510 + }, + { + "epoch": 0.07365054753367574, + "grad_norm": 0.4596763551235199, + "learning_rate": 4.975103902738149e-05, + "loss": 1.2676, + "num_input_tokens_seen": 59324348, + "step": 1520 + }, + { + "epoch": 0.07413509060955518, + "grad_norm": 0.41802385449409485, + "learning_rate": 4.974548528391091e-05, + "loss": 1.2669, + "num_input_tokens_seen": 59734700, + "step": 1530 + }, + { + "epoch": 0.07461963368543463, + "grad_norm": 0.409807026386261, + "learning_rate": 4.9739870593169705e-05, + "loss": 1.2579, + "num_input_tokens_seen": 60135636, + "step": 1540 + }, + { + "epoch": 0.07510417676131408, + "grad_norm": 0.41342952847480774, + "learning_rate": 4.9734194968986656e-05, + "loss": 1.2319, + "num_input_tokens_seen": 60516912, + "step": 1550 + }, + { + "epoch": 0.07558871983719352, + "grad_norm": 0.4141559898853302, + "learning_rate": 4.972845842534063e-05, + "loss": 1.2452, + "num_input_tokens_seen": 60899940, + "step": 1560 + }, + { + "epoch": 0.07607326291307297, + "grad_norm": 0.4386405646800995, + "learning_rate": 4.9722660976360534e-05, + "loss": 1.2262, + "num_input_tokens_seen": 61295608, + "step": 1570 + }, + { + "epoch": 0.07655780598895241, + "grad_norm": 0.40798524022102356, + "learning_rate": 4.9716802636325286e-05, + "loss": 1.2671, + "num_input_tokens_seen": 61693040, + "step": 1580 + }, + { + "epoch": 0.07704234906483186, + "grad_norm": 0.37625306844711304, + "learning_rate": 4.9710883419663774e-05, + "loss": 1.2962, + "num_input_tokens_seen": 62072768, + "step": 1590 + }, + { + "epoch": 0.0775268921407113, + "grad_norm": 0.4342903792858124, + "learning_rate": 4.970490334095482e-05, + "loss": 1.257, + "num_input_tokens_seen": 62446884, + "step": 1600 + }, + { + "epoch": 0.07801143521659075, + "grad_norm": 0.41230422258377075, + "learning_rate": 4.969886241492715e-05, + "loss": 1.27, + "num_input_tokens_seen": 62836772, + "step": 1610 + }, + { + "epoch": 0.0784959782924702, + "grad_norm": 0.4433565139770508, + "learning_rate": 4.969276065645936e-05, + "loss": 1.2592, + "num_input_tokens_seen": 63200816, + "step": 1620 + }, + { + "epoch": 0.07898052136834964, + "grad_norm": 0.47754809260368347, + "learning_rate": 4.968659808057986e-05, + "loss": 1.2752, + "num_input_tokens_seen": 63605328, + "step": 1630 + }, + { + "epoch": 0.07946506444422909, + "grad_norm": 0.3927769958972931, + "learning_rate": 4.968037470246687e-05, + "loss": 1.2468, + "num_input_tokens_seen": 63972724, + "step": 1640 + }, + { + "epoch": 0.07994960752010853, + "grad_norm": 0.39971116185188293, + "learning_rate": 4.9674090537448346e-05, + "loss": 1.1927, + "num_input_tokens_seen": 64354840, + "step": 1650 + }, + { + "epoch": 0.08043415059598798, + "grad_norm": 0.41240766644477844, + "learning_rate": 4.966774560100198e-05, + "loss": 1.2495, + "num_input_tokens_seen": 64781948, + "step": 1660 + }, + { + "epoch": 0.08091869367186742, + "grad_norm": 0.38843321800231934, + "learning_rate": 4.966133990875512e-05, + "loss": 1.2167, + "num_input_tokens_seen": 65145908, + "step": 1670 + }, + { + "epoch": 0.08140323674774687, + "grad_norm": 0.42320936918258667, + "learning_rate": 4.965487347648476e-05, + "loss": 1.2934, + "num_input_tokens_seen": 65551472, + "step": 1680 + }, + { + "epoch": 0.08188777982362631, + "grad_norm": 0.39081892371177673, + "learning_rate": 4.964834632011751e-05, + "loss": 1.2347, + "num_input_tokens_seen": 65941084, + "step": 1690 + }, + { + "epoch": 0.08237232289950576, + "grad_norm": 0.4150112271308899, + "learning_rate": 4.964175845572952e-05, + "loss": 1.2232, + "num_input_tokens_seen": 66336864, + "step": 1700 + }, + { + "epoch": 0.0828568659753852, + "grad_norm": 0.37995874881744385, + "learning_rate": 4.9635109899546476e-05, + "loss": 1.263, + "num_input_tokens_seen": 66718060, + "step": 1710 + }, + { + "epoch": 0.08334140905126466, + "grad_norm": 0.39722344279289246, + "learning_rate": 4.962840066794354e-05, + "loss": 1.276, + "num_input_tokens_seen": 67124980, + "step": 1720 + }, + { + "epoch": 0.08382595212714411, + "grad_norm": 0.4169159233570099, + "learning_rate": 4.9621630777445316e-05, + "loss": 1.2465, + "num_input_tokens_seen": 67528936, + "step": 1730 + }, + { + "epoch": 0.08431049520302356, + "grad_norm": 0.426110178232193, + "learning_rate": 4.96148002447258e-05, + "loss": 1.2534, + "num_input_tokens_seen": 67929316, + "step": 1740 + }, + { + "epoch": 0.084795038278903, + "grad_norm": 0.44997116923332214, + "learning_rate": 4.960790908660838e-05, + "loss": 1.2193, + "num_input_tokens_seen": 68318772, + "step": 1750 + }, + { + "epoch": 0.08527958135478245, + "grad_norm": 0.39495953917503357, + "learning_rate": 4.9600957320065715e-05, + "loss": 1.2464, + "num_input_tokens_seen": 68728948, + "step": 1760 + }, + { + "epoch": 0.08576412443066189, + "grad_norm": 0.39029765129089355, + "learning_rate": 4.959394496221977e-05, + "loss": 1.251, + "num_input_tokens_seen": 69116412, + "step": 1770 + }, + { + "epoch": 0.08624866750654134, + "grad_norm": 0.3958011865615845, + "learning_rate": 4.958687203034176e-05, + "loss": 1.2569, + "num_input_tokens_seen": 69515196, + "step": 1780 + }, + { + "epoch": 0.08673321058242078, + "grad_norm": 0.41105154156684875, + "learning_rate": 4.957973854185204e-05, + "loss": 1.2641, + "num_input_tokens_seen": 69919164, + "step": 1790 + }, + { + "epoch": 0.08721775365830023, + "grad_norm": 0.4101089835166931, + "learning_rate": 4.957254451432016e-05, + "loss": 1.2562, + "num_input_tokens_seen": 70312276, + "step": 1800 + }, + { + "epoch": 0.08770229673417967, + "grad_norm": 0.4429624080657959, + "learning_rate": 4.956528996546476e-05, + "loss": 1.2673, + "num_input_tokens_seen": 70700052, + "step": 1810 + }, + { + "epoch": 0.08818683981005912, + "grad_norm": 0.38688287138938904, + "learning_rate": 4.9557974913153536e-05, + "loss": 1.2294, + "num_input_tokens_seen": 71078868, + "step": 1820 + }, + { + "epoch": 0.08867138288593857, + "grad_norm": 0.4344911277294159, + "learning_rate": 4.955059937540322e-05, + "loss": 1.2436, + "num_input_tokens_seen": 71470396, + "step": 1830 + }, + { + "epoch": 0.08915592596181801, + "grad_norm": 0.4057205021381378, + "learning_rate": 4.9543163370379484e-05, + "loss": 1.2561, + "num_input_tokens_seen": 71856368, + "step": 1840 + }, + { + "epoch": 0.08964046903769746, + "grad_norm": 0.4123441278934479, + "learning_rate": 4.953566691639697e-05, + "loss": 1.2738, + "num_input_tokens_seen": 72253720, + "step": 1850 + }, + { + "epoch": 0.0901250121135769, + "grad_norm": 0.42719489336013794, + "learning_rate": 4.9528110031919175e-05, + "loss": 1.2306, + "num_input_tokens_seen": 72625716, + "step": 1860 + }, + { + "epoch": 0.09060955518945635, + "grad_norm": 0.41707441210746765, + "learning_rate": 4.9520492735558444e-05, + "loss": 1.2504, + "num_input_tokens_seen": 73013404, + "step": 1870 + }, + { + "epoch": 0.0910940982653358, + "grad_norm": 0.4267444312572479, + "learning_rate": 4.951281504607591e-05, + "loss": 1.2637, + "num_input_tokens_seen": 73406604, + "step": 1880 + }, + { + "epoch": 0.09157864134121524, + "grad_norm": 0.42108404636383057, + "learning_rate": 4.950507698238147e-05, + "loss": 1.2222, + "num_input_tokens_seen": 73783332, + "step": 1890 + }, + { + "epoch": 0.09206318441709468, + "grad_norm": 0.4063856303691864, + "learning_rate": 4.949727856353369e-05, + "loss": 1.2441, + "num_input_tokens_seen": 74175440, + "step": 1900 + }, + { + "epoch": 0.09254772749297413, + "grad_norm": 0.37594878673553467, + "learning_rate": 4.94894198087398e-05, + "loss": 1.2485, + "num_input_tokens_seen": 74581572, + "step": 1910 + }, + { + "epoch": 0.09303227056885358, + "grad_norm": 0.3911046087741852, + "learning_rate": 4.9481500737355656e-05, + "loss": 1.2702, + "num_input_tokens_seen": 74943092, + "step": 1920 + }, + { + "epoch": 0.09351681364473302, + "grad_norm": 0.4102419912815094, + "learning_rate": 4.947352136888566e-05, + "loss": 1.2468, + "num_input_tokens_seen": 75331720, + "step": 1930 + }, + { + "epoch": 0.09400135672061247, + "grad_norm": 0.39770299196243286, + "learning_rate": 4.94654817229827e-05, + "loss": 1.2201, + "num_input_tokens_seen": 75736304, + "step": 1940 + }, + { + "epoch": 0.09448589979649191, + "grad_norm": 0.43506377935409546, + "learning_rate": 4.945738181944815e-05, + "loss": 1.2235, + "num_input_tokens_seen": 76130884, + "step": 1950 + }, + { + "epoch": 0.09497044287237136, + "grad_norm": 0.3792125880718231, + "learning_rate": 4.9449221678231795e-05, + "loss": 1.2698, + "num_input_tokens_seen": 76512888, + "step": 1960 + }, + { + "epoch": 0.0954549859482508, + "grad_norm": 0.39077404141426086, + "learning_rate": 4.9441001319431784e-05, + "loss": 1.2458, + "num_input_tokens_seen": 76922052, + "step": 1970 + }, + { + "epoch": 0.09593952902413025, + "grad_norm": 0.38054585456848145, + "learning_rate": 4.943272076329457e-05, + "loss": 1.2281, + "num_input_tokens_seen": 77306336, + "step": 1980 + }, + { + "epoch": 0.0964240721000097, + "grad_norm": 0.3908326029777527, + "learning_rate": 4.942438003021487e-05, + "loss": 1.2028, + "num_input_tokens_seen": 77689480, + "step": 1990 + }, + { + "epoch": 0.09690861517588914, + "grad_norm": 0.43404319882392883, + "learning_rate": 4.941597914073563e-05, + "loss": 1.234, + "num_input_tokens_seen": 78067836, + "step": 2000 + }, + { + "epoch": 0.09690861517588914, + "eval_loss": 1.2438511848449707, + "eval_runtime": 5.3239, + "eval_samples_per_second": 28.175, + "eval_steps_per_second": 3.569, + "num_input_tokens_seen": 78067836, + "step": 2000 + }, + { + "epoch": 0.09739315825176859, + "grad_norm": 0.4713827967643738, + "learning_rate": 4.9407518115547945e-05, + "loss": 1.2405, + "num_input_tokens_seen": 78461236, + "step": 2010 + }, + { + "epoch": 0.09787770132764803, + "grad_norm": 0.4206187427043915, + "learning_rate": 4.939899697549102e-05, + "loss": 1.2046, + "num_input_tokens_seen": 78840632, + "step": 2020 + }, + { + "epoch": 0.09836224440352748, + "grad_norm": 0.3922071158885956, + "learning_rate": 4.939041574155213e-05, + "loss": 1.2593, + "num_input_tokens_seen": 79253124, + "step": 2030 + }, + { + "epoch": 0.09884678747940692, + "grad_norm": 0.4534291625022888, + "learning_rate": 4.938177443486657e-05, + "loss": 1.2696, + "num_input_tokens_seen": 79629736, + "step": 2040 + }, + { + "epoch": 0.09933133055528637, + "grad_norm": 0.424139142036438, + "learning_rate": 4.937307307671756e-05, + "loss": 1.2686, + "num_input_tokens_seen": 80023928, + "step": 2050 + }, + { + "epoch": 0.09981587363116581, + "grad_norm": 0.45865899324417114, + "learning_rate": 4.9364311688536245e-05, + "loss": 1.2834, + "num_input_tokens_seen": 80420160, + "step": 2060 + }, + { + "epoch": 0.10030041670704526, + "grad_norm": 0.42138755321502686, + "learning_rate": 4.935549029190163e-05, + "loss": 1.2079, + "num_input_tokens_seen": 80810044, + "step": 2070 + }, + { + "epoch": 0.1007849597829247, + "grad_norm": 0.3841094672679901, + "learning_rate": 4.934660890854049e-05, + "loss": 1.2496, + "num_input_tokens_seen": 81210716, + "step": 2080 + }, + { + "epoch": 0.10126950285880415, + "grad_norm": 0.38037049770355225, + "learning_rate": 4.9337667560327396e-05, + "loss": 1.2724, + "num_input_tokens_seen": 81585980, + "step": 2090 + }, + { + "epoch": 0.1017540459346836, + "grad_norm": 0.4014841318130493, + "learning_rate": 4.932866626928454e-05, + "loss": 1.227, + "num_input_tokens_seen": 81980652, + "step": 2100 + }, + { + "epoch": 0.10223858901056304, + "grad_norm": 0.41437438130378723, + "learning_rate": 4.9319605057581816e-05, + "loss": 1.2472, + "num_input_tokens_seen": 82387492, + "step": 2110 + }, + { + "epoch": 0.10272313208644249, + "grad_norm": 0.4091525971889496, + "learning_rate": 4.931048394753666e-05, + "loss": 1.188, + "num_input_tokens_seen": 82759272, + "step": 2120 + }, + { + "epoch": 0.10320767516232193, + "grad_norm": 0.43405330181121826, + "learning_rate": 4.930130296161406e-05, + "loss": 1.2372, + "num_input_tokens_seen": 83158564, + "step": 2130 + }, + { + "epoch": 0.10369221823820138, + "grad_norm": 0.43749940395355225, + "learning_rate": 4.929206212242646e-05, + "loss": 1.2085, + "num_input_tokens_seen": 83536672, + "step": 2140 + }, + { + "epoch": 0.10417676131408082, + "grad_norm": 0.43483173847198486, + "learning_rate": 4.928276145273372e-05, + "loss": 1.2199, + "num_input_tokens_seen": 83943180, + "step": 2150 + }, + { + "epoch": 0.10466130438996027, + "grad_norm": 0.42413613200187683, + "learning_rate": 4.9273400975443076e-05, + "loss": 1.2012, + "num_input_tokens_seen": 84356164, + "step": 2160 + }, + { + "epoch": 0.10514584746583971, + "grad_norm": 0.445049911737442, + "learning_rate": 4.926398071360905e-05, + "loss": 1.2046, + "num_input_tokens_seen": 84761956, + "step": 2170 + }, + { + "epoch": 0.10563039054171916, + "grad_norm": 0.4001222848892212, + "learning_rate": 4.925450069043342e-05, + "loss": 1.246, + "num_input_tokens_seen": 85138340, + "step": 2180 + }, + { + "epoch": 0.1061149336175986, + "grad_norm": 0.401049941778183, + "learning_rate": 4.924496092926517e-05, + "loss": 1.2164, + "num_input_tokens_seen": 85513616, + "step": 2190 + }, + { + "epoch": 0.10659947669347805, + "grad_norm": 0.4612962603569031, + "learning_rate": 4.923536145360038e-05, + "loss": 1.2358, + "num_input_tokens_seen": 85931448, + "step": 2200 + }, + { + "epoch": 0.1070840197693575, + "grad_norm": 0.41920316219329834, + "learning_rate": 4.922570228708223e-05, + "loss": 1.3026, + "num_input_tokens_seen": 86315108, + "step": 2210 + }, + { + "epoch": 0.10756856284523694, + "grad_norm": 0.39290520548820496, + "learning_rate": 4.921598345350092e-05, + "loss": 1.2535, + "num_input_tokens_seen": 86709944, + "step": 2220 + }, + { + "epoch": 0.10805310592111639, + "grad_norm": 0.4282558262348175, + "learning_rate": 4.92062049767936e-05, + "loss": 1.2274, + "num_input_tokens_seen": 87113940, + "step": 2230 + }, + { + "epoch": 0.10853764899699583, + "grad_norm": 0.4753318428993225, + "learning_rate": 4.91963668810443e-05, + "loss": 1.2483, + "num_input_tokens_seen": 87501616, + "step": 2240 + }, + { + "epoch": 0.10902219207287528, + "grad_norm": 0.423225075006485, + "learning_rate": 4.918646919048393e-05, + "loss": 1.2327, + "num_input_tokens_seen": 87894360, + "step": 2250 + }, + { + "epoch": 0.10950673514875472, + "grad_norm": 0.39466777443885803, + "learning_rate": 4.9176511929490144e-05, + "loss": 1.2604, + "num_input_tokens_seen": 88317576, + "step": 2260 + }, + { + "epoch": 0.10999127822463417, + "grad_norm": 0.41255709528923035, + "learning_rate": 4.916649512258733e-05, + "loss": 1.2341, + "num_input_tokens_seen": 88690104, + "step": 2270 + }, + { + "epoch": 0.11047582130051362, + "grad_norm": 0.42974668741226196, + "learning_rate": 4.9156418794446545e-05, + "loss": 1.2159, + "num_input_tokens_seen": 89090732, + "step": 2280 + }, + { + "epoch": 0.11096036437639306, + "grad_norm": 0.44297677278518677, + "learning_rate": 4.914628296988543e-05, + "loss": 1.2344, + "num_input_tokens_seen": 89486096, + "step": 2290 + }, + { + "epoch": 0.11144490745227251, + "grad_norm": 0.4597613513469696, + "learning_rate": 4.913608767386817e-05, + "loss": 1.2482, + "num_input_tokens_seen": 89880732, + "step": 2300 + }, + { + "epoch": 0.11192945052815195, + "grad_norm": 0.4195135235786438, + "learning_rate": 4.912583293150542e-05, + "loss": 1.2158, + "num_input_tokens_seen": 90328148, + "step": 2310 + }, + { + "epoch": 0.1124139936040314, + "grad_norm": 0.44454923272132874, + "learning_rate": 4.9115518768054264e-05, + "loss": 1.2103, + "num_input_tokens_seen": 90703168, + "step": 2320 + }, + { + "epoch": 0.11289853667991084, + "grad_norm": 0.3934001624584198, + "learning_rate": 4.910514520891812e-05, + "loss": 1.2893, + "num_input_tokens_seen": 91052408, + "step": 2330 + }, + { + "epoch": 0.11338307975579029, + "grad_norm": 0.38367339968681335, + "learning_rate": 4.90947122796467e-05, + "loss": 1.2125, + "num_input_tokens_seen": 91430836, + "step": 2340 + }, + { + "epoch": 0.11386762283166973, + "grad_norm": 0.40738987922668457, + "learning_rate": 4.908422000593596e-05, + "loss": 1.2289, + "num_input_tokens_seen": 91848392, + "step": 2350 + }, + { + "epoch": 0.11435216590754918, + "grad_norm": 0.4179443418979645, + "learning_rate": 4.907366841362799e-05, + "loss": 1.2494, + "num_input_tokens_seen": 92240852, + "step": 2360 + }, + { + "epoch": 0.11483670898342863, + "grad_norm": 0.38464492559432983, + "learning_rate": 4.906305752871102e-05, + "loss": 1.1879, + "num_input_tokens_seen": 92619204, + "step": 2370 + }, + { + "epoch": 0.11532125205930807, + "grad_norm": 0.42252519726753235, + "learning_rate": 4.905238737731926e-05, + "loss": 1.2608, + "num_input_tokens_seen": 93002632, + "step": 2380 + }, + { + "epoch": 0.11580579513518752, + "grad_norm": 0.4146505296230316, + "learning_rate": 4.9041657985732936e-05, + "loss": 1.2486, + "num_input_tokens_seen": 93399964, + "step": 2390 + }, + { + "epoch": 0.11629033821106696, + "grad_norm": 0.438689261674881, + "learning_rate": 4.903086938037818e-05, + "loss": 1.27, + "num_input_tokens_seen": 93791652, + "step": 2400 + }, + { + "epoch": 0.11677488128694641, + "grad_norm": 0.42062869668006897, + "learning_rate": 4.9020021587826926e-05, + "loss": 1.2591, + "num_input_tokens_seen": 94191368, + "step": 2410 + }, + { + "epoch": 0.11725942436282585, + "grad_norm": 0.4142090976238251, + "learning_rate": 4.9009114634796926e-05, + "loss": 1.2627, + "num_input_tokens_seen": 94571872, + "step": 2420 + }, + { + "epoch": 0.1177439674387053, + "grad_norm": 0.4038289487361908, + "learning_rate": 4.8998148548151625e-05, + "loss": 1.2551, + "num_input_tokens_seen": 94979976, + "step": 2430 + }, + { + "epoch": 0.11822851051458474, + "grad_norm": 0.38102611899375916, + "learning_rate": 4.898712335490012e-05, + "loss": 1.2559, + "num_input_tokens_seen": 95358664, + "step": 2440 + }, + { + "epoch": 0.11871305359046419, + "grad_norm": 0.3852735757827759, + "learning_rate": 4.897603908219706e-05, + "loss": 1.2448, + "num_input_tokens_seen": 95763084, + "step": 2450 + }, + { + "epoch": 0.11919759666634364, + "grad_norm": 0.3712899684906006, + "learning_rate": 4.8964895757342643e-05, + "loss": 1.2488, + "num_input_tokens_seen": 96188292, + "step": 2460 + }, + { + "epoch": 0.11968213974222308, + "grad_norm": 0.3657510578632355, + "learning_rate": 4.8953693407782484e-05, + "loss": 1.253, + "num_input_tokens_seen": 96601596, + "step": 2470 + }, + { + "epoch": 0.12016668281810253, + "grad_norm": 0.4605284631252289, + "learning_rate": 4.894243206110758e-05, + "loss": 1.236, + "num_input_tokens_seen": 96973176, + "step": 2480 + }, + { + "epoch": 0.12065122589398197, + "grad_norm": 0.41020140051841736, + "learning_rate": 4.8931111745054226e-05, + "loss": 1.2364, + "num_input_tokens_seen": 97378752, + "step": 2490 + }, + { + "epoch": 0.12113576896986142, + "grad_norm": 0.36746346950531006, + "learning_rate": 4.891973248750399e-05, + "loss": 1.2666, + "num_input_tokens_seen": 97771668, + "step": 2500 + }, + { + "epoch": 0.12162031204574086, + "grad_norm": 0.4322774112224579, + "learning_rate": 4.890829431648357e-05, + "loss": 1.1992, + "num_input_tokens_seen": 98151028, + "step": 2510 + }, + { + "epoch": 0.12210485512162031, + "grad_norm": 0.42371341586112976, + "learning_rate": 4.8896797260164795e-05, + "loss": 1.3052, + "num_input_tokens_seen": 98531364, + "step": 2520 + }, + { + "epoch": 0.12258939819749975, + "grad_norm": 0.43509674072265625, + "learning_rate": 4.888524134686451e-05, + "loss": 1.2146, + "num_input_tokens_seen": 98929648, + "step": 2530 + }, + { + "epoch": 0.1230739412733792, + "grad_norm": 0.38487133383750916, + "learning_rate": 4.887362660504452e-05, + "loss": 1.2251, + "num_input_tokens_seen": 99326780, + "step": 2540 + }, + { + "epoch": 0.12355848434925865, + "grad_norm": 0.47879770398139954, + "learning_rate": 4.8861953063311544e-05, + "loss": 1.2862, + "num_input_tokens_seen": 99746768, + "step": 2550 + }, + { + "epoch": 0.12404302742513809, + "grad_norm": 0.3904902935028076, + "learning_rate": 4.8850220750417107e-05, + "loss": 1.2565, + "num_input_tokens_seen": 100114696, + "step": 2560 + }, + { + "epoch": 0.12452757050101754, + "grad_norm": 0.3858630359172821, + "learning_rate": 4.883842969525748e-05, + "loss": 1.2145, + "num_input_tokens_seen": 100542848, + "step": 2570 + }, + { + "epoch": 0.12501211357689698, + "grad_norm": 0.42013123631477356, + "learning_rate": 4.882657992687363e-05, + "loss": 1.2271, + "num_input_tokens_seen": 100976524, + "step": 2580 + }, + { + "epoch": 0.12549665665277643, + "grad_norm": 0.3980698585510254, + "learning_rate": 4.8814671474451126e-05, + "loss": 1.2801, + "num_input_tokens_seen": 101345460, + "step": 2590 + }, + { + "epoch": 0.12598119972865587, + "grad_norm": 0.3923351764678955, + "learning_rate": 4.880270436732006e-05, + "loss": 1.2064, + "num_input_tokens_seen": 101759284, + "step": 2600 + }, + { + "epoch": 0.12646574280453532, + "grad_norm": 0.40927964448928833, + "learning_rate": 4.879067863495502e-05, + "loss": 1.2739, + "num_input_tokens_seen": 102167588, + "step": 2610 + }, + { + "epoch": 0.12695028588041476, + "grad_norm": 0.44859156012535095, + "learning_rate": 4.8778594306974956e-05, + "loss": 1.2309, + "num_input_tokens_seen": 102539236, + "step": 2620 + }, + { + "epoch": 0.1274348289562942, + "grad_norm": 0.39487892389297485, + "learning_rate": 4.876645141314315e-05, + "loss": 1.2866, + "num_input_tokens_seen": 102925936, + "step": 2630 + }, + { + "epoch": 0.12791937203217366, + "grad_norm": 0.3923860490322113, + "learning_rate": 4.875424998336713e-05, + "loss": 1.2757, + "num_input_tokens_seen": 103325248, + "step": 2640 + }, + { + "epoch": 0.1284039151080531, + "grad_norm": 0.4167238175868988, + "learning_rate": 4.874199004769858e-05, + "loss": 1.2338, + "num_input_tokens_seen": 103712300, + "step": 2650 + }, + { + "epoch": 0.12888845818393255, + "grad_norm": 0.38341742753982544, + "learning_rate": 4.872967163633332e-05, + "loss": 1.2383, + "num_input_tokens_seen": 104124208, + "step": 2660 + }, + { + "epoch": 0.129373001259812, + "grad_norm": 0.3938426375389099, + "learning_rate": 4.871729477961115e-05, + "loss": 1.2011, + "num_input_tokens_seen": 104502028, + "step": 2670 + }, + { + "epoch": 0.12985754433569144, + "grad_norm": 0.3831293284893036, + "learning_rate": 4.8704859508015847e-05, + "loss": 1.2634, + "num_input_tokens_seen": 104876704, + "step": 2680 + }, + { + "epoch": 0.13034208741157088, + "grad_norm": 0.4196309745311737, + "learning_rate": 4.869236585217504e-05, + "loss": 1.2585, + "num_input_tokens_seen": 105302956, + "step": 2690 + }, + { + "epoch": 0.13082663048745033, + "grad_norm": 0.4367752969264984, + "learning_rate": 4.867981384286019e-05, + "loss": 1.1848, + "num_input_tokens_seen": 105695812, + "step": 2700 + }, + { + "epoch": 0.13131117356332977, + "grad_norm": 0.39066222310066223, + "learning_rate": 4.8667203510986444e-05, + "loss": 1.2545, + "num_input_tokens_seen": 106084708, + "step": 2710 + }, + { + "epoch": 0.13179571663920922, + "grad_norm": 0.4448121190071106, + "learning_rate": 4.865453488761262e-05, + "loss": 1.275, + "num_input_tokens_seen": 106499276, + "step": 2720 + }, + { + "epoch": 0.13228025971508867, + "grad_norm": 0.4147442877292633, + "learning_rate": 4.864180800394109e-05, + "loss": 1.277, + "num_input_tokens_seen": 106866988, + "step": 2730 + }, + { + "epoch": 0.1327648027909681, + "grad_norm": 0.40027832984924316, + "learning_rate": 4.862902289131773e-05, + "loss": 1.2567, + "num_input_tokens_seen": 107259856, + "step": 2740 + }, + { + "epoch": 0.13324934586684756, + "grad_norm": 0.36126065254211426, + "learning_rate": 4.861617958123183e-05, + "loss": 1.2553, + "num_input_tokens_seen": 107646308, + "step": 2750 + }, + { + "epoch": 0.133733888942727, + "grad_norm": 0.41318050026893616, + "learning_rate": 4.860327810531602e-05, + "loss": 1.244, + "num_input_tokens_seen": 108059036, + "step": 2760 + }, + { + "epoch": 0.13421843201860645, + "grad_norm": 0.3924005925655365, + "learning_rate": 4.859031849534618e-05, + "loss": 1.223, + "num_input_tokens_seen": 108463880, + "step": 2770 + }, + { + "epoch": 0.1347029750944859, + "grad_norm": 0.41446420550346375, + "learning_rate": 4.857730078324138e-05, + "loss": 1.2424, + "num_input_tokens_seen": 108859416, + "step": 2780 + }, + { + "epoch": 0.13518751817036534, + "grad_norm": 0.4131425619125366, + "learning_rate": 4.856422500106379e-05, + "loss": 1.2368, + "num_input_tokens_seen": 109284028, + "step": 2790 + }, + { + "epoch": 0.13567206124624479, + "grad_norm": 0.40505287051200867, + "learning_rate": 4.855109118101862e-05, + "loss": 1.2731, + "num_input_tokens_seen": 109666092, + "step": 2800 + }, + { + "epoch": 0.13615660432212423, + "grad_norm": 0.4067140817642212, + "learning_rate": 4.8537899355454e-05, + "loss": 1.2945, + "num_input_tokens_seen": 110023200, + "step": 2810 + }, + { + "epoch": 0.13664114739800368, + "grad_norm": 0.42721500992774963, + "learning_rate": 4.8524649556860934e-05, + "loss": 1.2448, + "num_input_tokens_seen": 110400364, + "step": 2820 + }, + { + "epoch": 0.13712569047388312, + "grad_norm": 0.40935906767845154, + "learning_rate": 4.851134181787323e-05, + "loss": 1.2359, + "num_input_tokens_seen": 110793504, + "step": 2830 + }, + { + "epoch": 0.13761023354976257, + "grad_norm": 0.4633960425853729, + "learning_rate": 4.849797617126738e-05, + "loss": 1.2571, + "num_input_tokens_seen": 111170796, + "step": 2840 + }, + { + "epoch": 0.138094776625642, + "grad_norm": 0.43465399742126465, + "learning_rate": 4.848455264996251e-05, + "loss": 1.2171, + "num_input_tokens_seen": 111576676, + "step": 2850 + }, + { + "epoch": 0.13857931970152146, + "grad_norm": 0.3760763704776764, + "learning_rate": 4.847107128702028e-05, + "loss": 1.2611, + "num_input_tokens_seen": 112001188, + "step": 2860 + }, + { + "epoch": 0.1390638627774009, + "grad_norm": 0.4067819118499756, + "learning_rate": 4.845753211564482e-05, + "loss": 1.1932, + "num_input_tokens_seen": 112357276, + "step": 2870 + }, + { + "epoch": 0.13954840585328035, + "grad_norm": 0.44168785214424133, + "learning_rate": 4.844393516918265e-05, + "loss": 1.2458, + "num_input_tokens_seen": 112745724, + "step": 2880 + }, + { + "epoch": 0.1400329489291598, + "grad_norm": 0.42745277285575867, + "learning_rate": 4.8430280481122556e-05, + "loss": 1.1844, + "num_input_tokens_seen": 113158204, + "step": 2890 + }, + { + "epoch": 0.14051749200503924, + "grad_norm": 0.4196942150592804, + "learning_rate": 4.8416568085095585e-05, + "loss": 1.2464, + "num_input_tokens_seen": 113534300, + "step": 2900 + }, + { + "epoch": 0.1410020350809187, + "grad_norm": 0.3699077069759369, + "learning_rate": 4.840279801487488e-05, + "loss": 1.2435, + "num_input_tokens_seen": 113941828, + "step": 2910 + }, + { + "epoch": 0.14148657815679813, + "grad_norm": 0.4300510287284851, + "learning_rate": 4.8388970304375636e-05, + "loss": 1.2438, + "num_input_tokens_seen": 114360276, + "step": 2920 + }, + { + "epoch": 0.14197112123267758, + "grad_norm": 0.39781638979911804, + "learning_rate": 4.837508498765504e-05, + "loss": 1.2151, + "num_input_tokens_seen": 114724572, + "step": 2930 + }, + { + "epoch": 0.14245566430855702, + "grad_norm": 0.3915679156780243, + "learning_rate": 4.836114209891214e-05, + "loss": 1.2486, + "num_input_tokens_seen": 115115652, + "step": 2940 + }, + { + "epoch": 0.14294020738443647, + "grad_norm": 0.3921675682067871, + "learning_rate": 4.834714167248778e-05, + "loss": 1.2237, + "num_input_tokens_seen": 115528916, + "step": 2950 + }, + { + "epoch": 0.14342475046031591, + "grad_norm": 0.4208355247974396, + "learning_rate": 4.8333083742864524e-05, + "loss": 1.3102, + "num_input_tokens_seen": 115883916, + "step": 2960 + }, + { + "epoch": 0.14390929353619536, + "grad_norm": 0.4001530408859253, + "learning_rate": 4.831896834466658e-05, + "loss": 1.2382, + "num_input_tokens_seen": 116270492, + "step": 2970 + }, + { + "epoch": 0.1443938366120748, + "grad_norm": 0.4217351973056793, + "learning_rate": 4.830479551265966e-05, + "loss": 1.1906, + "num_input_tokens_seen": 116680092, + "step": 2980 + }, + { + "epoch": 0.14487837968795425, + "grad_norm": 0.37808090448379517, + "learning_rate": 4.8290565281750974e-05, + "loss": 1.2392, + "num_input_tokens_seen": 117085684, + "step": 2990 + }, + { + "epoch": 0.1453629227638337, + "grad_norm": 0.3951645493507385, + "learning_rate": 4.827627768698909e-05, + "loss": 1.2293, + "num_input_tokens_seen": 117454120, + "step": 3000 + }, + { + "epoch": 0.14584746583971314, + "grad_norm": 0.35314205288887024, + "learning_rate": 4.8261932763563834e-05, + "loss": 1.1356, + "num_input_tokens_seen": 117866504, + "step": 3010 + }, + { + "epoch": 0.1463320089155926, + "grad_norm": 0.4276081323623657, + "learning_rate": 4.824753054680628e-05, + "loss": 1.2778, + "num_input_tokens_seen": 118250676, + "step": 3020 + }, + { + "epoch": 0.14681655199147203, + "grad_norm": 0.40378737449645996, + "learning_rate": 4.823307107218857e-05, + "loss": 1.2868, + "num_input_tokens_seen": 118607944, + "step": 3030 + }, + { + "epoch": 0.14730109506735148, + "grad_norm": 0.41640186309814453, + "learning_rate": 4.82185543753239e-05, + "loss": 1.2456, + "num_input_tokens_seen": 119009904, + "step": 3040 + }, + { + "epoch": 0.14778563814323092, + "grad_norm": 0.4186214804649353, + "learning_rate": 4.820398049196638e-05, + "loss": 1.2287, + "num_input_tokens_seen": 119411820, + "step": 3050 + }, + { + "epoch": 0.14827018121911037, + "grad_norm": 0.42039960622787476, + "learning_rate": 4.818934945801098e-05, + "loss": 1.2247, + "num_input_tokens_seen": 119798480, + "step": 3060 + }, + { + "epoch": 0.14875472429498982, + "grad_norm": 0.42505961656570435, + "learning_rate": 4.8174661309493436e-05, + "loss": 1.2242, + "num_input_tokens_seen": 120184372, + "step": 3070 + }, + { + "epoch": 0.14923926737086926, + "grad_norm": 0.4036807417869568, + "learning_rate": 4.815991608259014e-05, + "loss": 1.2154, + "num_input_tokens_seen": 120579184, + "step": 3080 + }, + { + "epoch": 0.1497238104467487, + "grad_norm": 0.40509092807769775, + "learning_rate": 4.8145113813618083e-05, + "loss": 1.2086, + "num_input_tokens_seen": 120941008, + "step": 3090 + }, + { + "epoch": 0.15020835352262815, + "grad_norm": 0.4269082546234131, + "learning_rate": 4.813025453903472e-05, + "loss": 1.2496, + "num_input_tokens_seen": 121322824, + "step": 3100 + }, + { + "epoch": 0.1506928965985076, + "grad_norm": 0.4054916203022003, + "learning_rate": 4.811533829543795e-05, + "loss": 1.2079, + "num_input_tokens_seen": 121738192, + "step": 3110 + }, + { + "epoch": 0.15117743967438704, + "grad_norm": 0.413043349981308, + "learning_rate": 4.8100365119565946e-05, + "loss": 1.2609, + "num_input_tokens_seen": 122108864, + "step": 3120 + }, + { + "epoch": 0.1516619827502665, + "grad_norm": 0.41178593039512634, + "learning_rate": 4.8085335048297135e-05, + "loss": 1.2548, + "num_input_tokens_seen": 122510356, + "step": 3130 + }, + { + "epoch": 0.15214652582614593, + "grad_norm": 0.39382249116897583, + "learning_rate": 4.8070248118650055e-05, + "loss": 1.1809, + "num_input_tokens_seen": 122865892, + "step": 3140 + }, + { + "epoch": 0.15263106890202538, + "grad_norm": 0.46355170011520386, + "learning_rate": 4.8055104367783275e-05, + "loss": 1.2281, + "num_input_tokens_seen": 123237536, + "step": 3150 + }, + { + "epoch": 0.15311561197790483, + "grad_norm": 0.40591225028038025, + "learning_rate": 4.803990383299535e-05, + "loss": 1.2243, + "num_input_tokens_seen": 123611572, + "step": 3160 + }, + { + "epoch": 0.15360015505378427, + "grad_norm": 0.41489487886428833, + "learning_rate": 4.802464655172466e-05, + "loss": 1.239, + "num_input_tokens_seen": 124010748, + "step": 3170 + }, + { + "epoch": 0.15408469812966372, + "grad_norm": 0.3760206997394562, + "learning_rate": 4.800933256154935e-05, + "loss": 1.2013, + "num_input_tokens_seen": 124392148, + "step": 3180 + }, + { + "epoch": 0.15456924120554316, + "grad_norm": 0.43629947304725647, + "learning_rate": 4.7993961900187255e-05, + "loss": 1.2253, + "num_input_tokens_seen": 124776000, + "step": 3190 + }, + { + "epoch": 0.1550537842814226, + "grad_norm": 0.4242326021194458, + "learning_rate": 4.7978534605495784e-05, + "loss": 1.2461, + "num_input_tokens_seen": 125158264, + "step": 3200 + }, + { + "epoch": 0.15553832735730205, + "grad_norm": 0.40662136673927307, + "learning_rate": 4.796305071547182e-05, + "loss": 1.1913, + "num_input_tokens_seen": 125544848, + "step": 3210 + }, + { + "epoch": 0.1560228704331815, + "grad_norm": 0.4275949001312256, + "learning_rate": 4.794751026825165e-05, + "loss": 1.2397, + "num_input_tokens_seen": 125940532, + "step": 3220 + }, + { + "epoch": 0.15650741350906094, + "grad_norm": 0.41079017519950867, + "learning_rate": 4.793191330211085e-05, + "loss": 1.2478, + "num_input_tokens_seen": 126357472, + "step": 3230 + }, + { + "epoch": 0.1569919565849404, + "grad_norm": 0.4043791890144348, + "learning_rate": 4.791625985546422e-05, + "loss": 1.2263, + "num_input_tokens_seen": 126745452, + "step": 3240 + }, + { + "epoch": 0.15747649966081984, + "grad_norm": 0.4353187680244446, + "learning_rate": 4.790054996686564e-05, + "loss": 1.1988, + "num_input_tokens_seen": 127138656, + "step": 3250 + }, + { + "epoch": 0.15796104273669928, + "grad_norm": 0.4300646483898163, + "learning_rate": 4.7884783675008016e-05, + "loss": 1.183, + "num_input_tokens_seen": 127522676, + "step": 3260 + }, + { + "epoch": 0.15844558581257873, + "grad_norm": 0.3753831684589386, + "learning_rate": 4.7868961018723194e-05, + "loss": 1.2412, + "num_input_tokens_seen": 127917336, + "step": 3270 + }, + { + "epoch": 0.15893012888845817, + "grad_norm": 0.36102235317230225, + "learning_rate": 4.785308203698182e-05, + "loss": 1.2764, + "num_input_tokens_seen": 128336744, + "step": 3280 + }, + { + "epoch": 0.15941467196433762, + "grad_norm": 0.40448567271232605, + "learning_rate": 4.783714676889327e-05, + "loss": 1.2122, + "num_input_tokens_seen": 128733104, + "step": 3290 + }, + { + "epoch": 0.15989921504021706, + "grad_norm": 0.3721427023410797, + "learning_rate": 4.7821155253705564e-05, + "loss": 1.2684, + "num_input_tokens_seen": 129154876, + "step": 3300 + }, + { + "epoch": 0.1603837581160965, + "grad_norm": 0.4208812415599823, + "learning_rate": 4.7805107530805244e-05, + "loss": 1.2343, + "num_input_tokens_seen": 129547976, + "step": 3310 + }, + { + "epoch": 0.16086830119197595, + "grad_norm": 0.39913055300712585, + "learning_rate": 4.778900363971729e-05, + "loss": 1.2078, + "num_input_tokens_seen": 129961780, + "step": 3320 + }, + { + "epoch": 0.1613528442678554, + "grad_norm": 0.4164119362831116, + "learning_rate": 4.777284362010504e-05, + "loss": 1.2501, + "num_input_tokens_seen": 130358684, + "step": 3330 + }, + { + "epoch": 0.16183738734373485, + "grad_norm": 0.3951362073421478, + "learning_rate": 4.775662751177003e-05, + "loss": 1.206, + "num_input_tokens_seen": 130762100, + "step": 3340 + }, + { + "epoch": 0.1623219304196143, + "grad_norm": 0.4006355404853821, + "learning_rate": 4.774035535465201e-05, + "loss": 1.2516, + "num_input_tokens_seen": 131146316, + "step": 3350 + }, + { + "epoch": 0.16280647349549374, + "grad_norm": 0.41116824746131897, + "learning_rate": 4.7724027188828716e-05, + "loss": 1.1868, + "num_input_tokens_seen": 131538384, + "step": 3360 + }, + { + "epoch": 0.16329101657137318, + "grad_norm": 0.4331505596637726, + "learning_rate": 4.7707643054515855e-05, + "loss": 1.2212, + "num_input_tokens_seen": 131926864, + "step": 3370 + }, + { + "epoch": 0.16377555964725263, + "grad_norm": 0.3957429826259613, + "learning_rate": 4.7691202992066984e-05, + "loss": 1.1938, + "num_input_tokens_seen": 132309648, + "step": 3380 + }, + { + "epoch": 0.16426010272313207, + "grad_norm": 0.3914317786693573, + "learning_rate": 4.767470704197341e-05, + "loss": 1.2082, + "num_input_tokens_seen": 132725476, + "step": 3390 + }, + { + "epoch": 0.16474464579901152, + "grad_norm": 0.4648467004299164, + "learning_rate": 4.765815524486407e-05, + "loss": 1.2276, + "num_input_tokens_seen": 133107108, + "step": 3400 + }, + { + "epoch": 0.16522918887489096, + "grad_norm": 0.3745146691799164, + "learning_rate": 4.764154764150548e-05, + "loss": 1.223, + "num_input_tokens_seen": 133483520, + "step": 3410 + }, + { + "epoch": 0.1657137319507704, + "grad_norm": 0.38040289282798767, + "learning_rate": 4.762488427280159e-05, + "loss": 1.1943, + "num_input_tokens_seen": 133871252, + "step": 3420 + }, + { + "epoch": 0.16619827502664986, + "grad_norm": 0.41370365023612976, + "learning_rate": 4.760816517979369e-05, + "loss": 1.2548, + "num_input_tokens_seen": 134284456, + "step": 3430 + }, + { + "epoch": 0.16668281810252933, + "grad_norm": 0.4398741126060486, + "learning_rate": 4.7591390403660326e-05, + "loss": 1.2549, + "num_input_tokens_seen": 134684752, + "step": 3440 + }, + { + "epoch": 0.16716736117840877, + "grad_norm": 0.41080906987190247, + "learning_rate": 4.7574559985717196e-05, + "loss": 1.1814, + "num_input_tokens_seen": 135115868, + "step": 3450 + }, + { + "epoch": 0.16765190425428822, + "grad_norm": 0.4656144678592682, + "learning_rate": 4.7557673967417024e-05, + "loss": 1.1833, + "num_input_tokens_seen": 135511836, + "step": 3460 + }, + { + "epoch": 0.16813644733016767, + "grad_norm": 0.3965156376361847, + "learning_rate": 4.7540732390349494e-05, + "loss": 1.2127, + "num_input_tokens_seen": 135920396, + "step": 3470 + }, + { + "epoch": 0.1686209904060471, + "grad_norm": 0.400642991065979, + "learning_rate": 4.752373529624113e-05, + "loss": 1.2603, + "num_input_tokens_seen": 136333964, + "step": 3480 + }, + { + "epoch": 0.16910553348192656, + "grad_norm": 0.4004031717777252, + "learning_rate": 4.7506682726955166e-05, + "loss": 1.2232, + "num_input_tokens_seen": 136718116, + "step": 3490 + }, + { + "epoch": 0.169590076557806, + "grad_norm": 0.40622180700302124, + "learning_rate": 4.74895747244915e-05, + "loss": 1.1966, + "num_input_tokens_seen": 137109584, + "step": 3500 + }, + { + "epoch": 0.17007461963368545, + "grad_norm": 0.41456958651542664, + "learning_rate": 4.747241133098655e-05, + "loss": 1.2557, + "num_input_tokens_seen": 137510040, + "step": 3510 + }, + { + "epoch": 0.1705591627095649, + "grad_norm": 0.38758885860443115, + "learning_rate": 4.745519258871314e-05, + "loss": 1.2414, + "num_input_tokens_seen": 137901036, + "step": 3520 + }, + { + "epoch": 0.17104370578544434, + "grad_norm": 0.4364662766456604, + "learning_rate": 4.743791854008045e-05, + "loss": 1.2312, + "num_input_tokens_seen": 138292348, + "step": 3530 + }, + { + "epoch": 0.17152824886132378, + "grad_norm": 0.40788963437080383, + "learning_rate": 4.742058922763386e-05, + "loss": 1.2465, + "num_input_tokens_seen": 138699080, + "step": 3540 + }, + { + "epoch": 0.17201279193720323, + "grad_norm": 0.39772936701774597, + "learning_rate": 4.740320469405487e-05, + "loss": 1.2562, + "num_input_tokens_seen": 139086780, + "step": 3550 + }, + { + "epoch": 0.17249733501308268, + "grad_norm": 0.4041747748851776, + "learning_rate": 4.738576498216097e-05, + "loss": 1.193, + "num_input_tokens_seen": 139486640, + "step": 3560 + }, + { + "epoch": 0.17298187808896212, + "grad_norm": 0.4034312069416046, + "learning_rate": 4.7368270134905565e-05, + "loss": 1.1797, + "num_input_tokens_seen": 139891212, + "step": 3570 + }, + { + "epoch": 0.17346642116484157, + "grad_norm": 0.3874165713787079, + "learning_rate": 4.735072019537786e-05, + "loss": 1.2149, + "num_input_tokens_seen": 140300916, + "step": 3580 + }, + { + "epoch": 0.173950964240721, + "grad_norm": 0.3958899974822998, + "learning_rate": 4.733311520680276e-05, + "loss": 1.2303, + "num_input_tokens_seen": 140685408, + "step": 3590 + }, + { + "epoch": 0.17443550731660046, + "grad_norm": 0.42453455924987793, + "learning_rate": 4.7315455212540714e-05, + "loss": 1.2372, + "num_input_tokens_seen": 141095616, + "step": 3600 + }, + { + "epoch": 0.1749200503924799, + "grad_norm": 0.4058285057544708, + "learning_rate": 4.7297740256087695e-05, + "loss": 1.1825, + "num_input_tokens_seen": 141494816, + "step": 3610 + }, + { + "epoch": 0.17540459346835935, + "grad_norm": 0.3956435024738312, + "learning_rate": 4.727997038107501e-05, + "loss": 1.1843, + "num_input_tokens_seen": 141885796, + "step": 3620 + }, + { + "epoch": 0.1758891365442388, + "grad_norm": 0.40899357199668884, + "learning_rate": 4.726214563126926e-05, + "loss": 1.2292, + "num_input_tokens_seen": 142285860, + "step": 3630 + }, + { + "epoch": 0.17637367962011824, + "grad_norm": 0.41558772325515747, + "learning_rate": 4.7244266050572175e-05, + "loss": 1.2355, + "num_input_tokens_seen": 142706648, + "step": 3640 + }, + { + "epoch": 0.17685822269599769, + "grad_norm": 0.40557295083999634, + "learning_rate": 4.722633168302054e-05, + "loss": 1.2054, + "num_input_tokens_seen": 143098084, + "step": 3650 + }, + { + "epoch": 0.17734276577187713, + "grad_norm": 0.41156765818595886, + "learning_rate": 4.720834257278609e-05, + "loss": 1.1816, + "num_input_tokens_seen": 143490440, + "step": 3660 + }, + { + "epoch": 0.17782730884775658, + "grad_norm": 0.4566863179206848, + "learning_rate": 4.719029876417537e-05, + "loss": 1.2687, + "num_input_tokens_seen": 143900176, + "step": 3670 + }, + { + "epoch": 0.17831185192363602, + "grad_norm": 0.38572782278060913, + "learning_rate": 4.717220030162964e-05, + "loss": 1.2074, + "num_input_tokens_seen": 144266036, + "step": 3680 + }, + { + "epoch": 0.17879639499951547, + "grad_norm": 0.4045168459415436, + "learning_rate": 4.715404722972481e-05, + "loss": 1.2177, + "num_input_tokens_seen": 144661236, + "step": 3690 + }, + { + "epoch": 0.1792809380753949, + "grad_norm": 0.43136268854141235, + "learning_rate": 4.713583959317125e-05, + "loss": 1.231, + "num_input_tokens_seen": 145039188, + "step": 3700 + }, + { + "epoch": 0.17976548115127436, + "grad_norm": 0.37638697028160095, + "learning_rate": 4.711757743681375e-05, + "loss": 1.265, + "num_input_tokens_seen": 145471096, + "step": 3710 + }, + { + "epoch": 0.1802500242271538, + "grad_norm": 0.418754905462265, + "learning_rate": 4.7099260805631354e-05, + "loss": 1.2135, + "num_input_tokens_seen": 145875596, + "step": 3720 + }, + { + "epoch": 0.18073456730303325, + "grad_norm": 0.3730713725090027, + "learning_rate": 4.7080889744737275e-05, + "loss": 1.219, + "num_input_tokens_seen": 146269480, + "step": 3730 + }, + { + "epoch": 0.1812191103789127, + "grad_norm": 0.3674296438694, + "learning_rate": 4.7062464299378835e-05, + "loss": 1.2366, + "num_input_tokens_seen": 146676848, + "step": 3740 + }, + { + "epoch": 0.18170365345479214, + "grad_norm": 0.4344138205051422, + "learning_rate": 4.7043984514937236e-05, + "loss": 1.2243, + "num_input_tokens_seen": 147058324, + "step": 3750 + }, + { + "epoch": 0.1821881965306716, + "grad_norm": 0.3907421827316284, + "learning_rate": 4.7025450436927555e-05, + "loss": 1.2282, + "num_input_tokens_seen": 147448108, + "step": 3760 + }, + { + "epoch": 0.18267273960655103, + "grad_norm": 0.40709200501441956, + "learning_rate": 4.700686211099859e-05, + "loss": 1.2951, + "num_input_tokens_seen": 147840480, + "step": 3770 + }, + { + "epoch": 0.18315728268243048, + "grad_norm": 0.4162179231643677, + "learning_rate": 4.698821958293273e-05, + "loss": 1.2506, + "num_input_tokens_seen": 148246316, + "step": 3780 + }, + { + "epoch": 0.18364182575830992, + "grad_norm": 0.41495996713638306, + "learning_rate": 4.696952289864587e-05, + "loss": 1.2097, + "num_input_tokens_seen": 148608520, + "step": 3790 + }, + { + "epoch": 0.18412636883418937, + "grad_norm": 0.3995479643344879, + "learning_rate": 4.6950772104187303e-05, + "loss": 1.2055, + "num_input_tokens_seen": 148986420, + "step": 3800 + }, + { + "epoch": 0.18461091191006881, + "grad_norm": 0.3885250985622406, + "learning_rate": 4.6931967245739586e-05, + "loss": 1.1965, + "num_input_tokens_seen": 149370912, + "step": 3810 + }, + { + "epoch": 0.18509545498594826, + "grad_norm": 0.4217512905597687, + "learning_rate": 4.691310836961843e-05, + "loss": 1.2325, + "num_input_tokens_seen": 149784152, + "step": 3820 + }, + { + "epoch": 0.1855799980618277, + "grad_norm": 0.4255019724369049, + "learning_rate": 4.689419552227259e-05, + "loss": 1.2151, + "num_input_tokens_seen": 150163748, + "step": 3830 + }, + { + "epoch": 0.18606454113770715, + "grad_norm": 0.3889770209789276, + "learning_rate": 4.687522875028376e-05, + "loss": 1.1787, + "num_input_tokens_seen": 150567952, + "step": 3840 + }, + { + "epoch": 0.1865490842135866, + "grad_norm": 0.39190900325775146, + "learning_rate": 4.685620810036642e-05, + "loss": 1.2238, + "num_input_tokens_seen": 150990236, + "step": 3850 + }, + { + "epoch": 0.18703362728946604, + "grad_norm": 0.380062997341156, + "learning_rate": 4.683713361936779e-05, + "loss": 1.2109, + "num_input_tokens_seen": 151403764, + "step": 3860 + }, + { + "epoch": 0.1875181703653455, + "grad_norm": 0.38644662499427795, + "learning_rate": 4.681800535426765e-05, + "loss": 1.1836, + "num_input_tokens_seen": 151781412, + "step": 3870 + }, + { + "epoch": 0.18800271344122493, + "grad_norm": 0.41668441891670227, + "learning_rate": 4.679882335217825e-05, + "loss": 1.1741, + "num_input_tokens_seen": 152190552, + "step": 3880 + }, + { + "epoch": 0.18848725651710438, + "grad_norm": 0.43978846073150635, + "learning_rate": 4.6779587660344195e-05, + "loss": 1.2325, + "num_input_tokens_seen": 152621828, + "step": 3890 + }, + { + "epoch": 0.18897179959298382, + "grad_norm": 0.3876698911190033, + "learning_rate": 4.676029832614231e-05, + "loss": 1.2592, + "num_input_tokens_seen": 153007936, + "step": 3900 + }, + { + "epoch": 0.18945634266886327, + "grad_norm": 0.40177682042121887, + "learning_rate": 4.6740955397081594e-05, + "loss": 1.2129, + "num_input_tokens_seen": 153391480, + "step": 3910 + }, + { + "epoch": 0.18994088574474272, + "grad_norm": 0.44533804059028625, + "learning_rate": 4.672155892080298e-05, + "loss": 1.2284, + "num_input_tokens_seen": 153772340, + "step": 3920 + }, + { + "epoch": 0.19042542882062216, + "grad_norm": 0.4126012921333313, + "learning_rate": 4.670210894507932e-05, + "loss": 1.2157, + "num_input_tokens_seen": 154169508, + "step": 3930 + }, + { + "epoch": 0.1909099718965016, + "grad_norm": 0.4048556685447693, + "learning_rate": 4.668260551781522e-05, + "loss": 1.2128, + "num_input_tokens_seen": 154556080, + "step": 3940 + }, + { + "epoch": 0.19139451497238105, + "grad_norm": 0.41531628370285034, + "learning_rate": 4.6663048687046965e-05, + "loss": 1.2164, + "num_input_tokens_seen": 154956220, + "step": 3950 + }, + { + "epoch": 0.1918790580482605, + "grad_norm": 0.4042895436286926, + "learning_rate": 4.6643438500942324e-05, + "loss": 1.2069, + "num_input_tokens_seen": 155328904, + "step": 3960 + }, + { + "epoch": 0.19236360112413994, + "grad_norm": 0.38158828020095825, + "learning_rate": 4.662377500780053e-05, + "loss": 1.2578, + "num_input_tokens_seen": 155742432, + "step": 3970 + }, + { + "epoch": 0.1928481442000194, + "grad_norm": 0.3938775956630707, + "learning_rate": 4.660405825605207e-05, + "loss": 1.2278, + "num_input_tokens_seen": 156106304, + "step": 3980 + }, + { + "epoch": 0.19333268727589883, + "grad_norm": 0.4105328917503357, + "learning_rate": 4.6584288294258623e-05, + "loss": 1.2094, + "num_input_tokens_seen": 156496196, + "step": 3990 + }, + { + "epoch": 0.19381723035177828, + "grad_norm": 0.3988109827041626, + "learning_rate": 4.6564465171112916e-05, + "loss": 1.2248, + "num_input_tokens_seen": 156868756, + "step": 4000 + }, + { + "epoch": 0.19381723035177828, + "eval_loss": 1.2256019115447998, + "eval_runtime": 5.1669, + "eval_samples_per_second": 29.031, + "eval_steps_per_second": 3.677, + "num_input_tokens_seen": 156868756, + "step": 4000 + }, + { + "epoch": 0.19430177342765773, + "grad_norm": 0.3892788887023926, + "learning_rate": 4.654458893543861e-05, + "loss": 1.1957, + "num_input_tokens_seen": 157256772, + "step": 4010 + }, + { + "epoch": 0.19478631650353717, + "grad_norm": 0.42166975140571594, + "learning_rate": 4.6524659636190183e-05, + "loss": 1.2207, + "num_input_tokens_seen": 157653400, + "step": 4020 + }, + { + "epoch": 0.19527085957941662, + "grad_norm": 0.4215097725391388, + "learning_rate": 4.650467732245282e-05, + "loss": 1.257, + "num_input_tokens_seen": 158042052, + "step": 4030 + }, + { + "epoch": 0.19575540265529606, + "grad_norm": 0.4513319432735443, + "learning_rate": 4.648464204344224e-05, + "loss": 1.2021, + "num_input_tokens_seen": 158421912, + "step": 4040 + }, + { + "epoch": 0.1962399457311755, + "grad_norm": 0.3815930485725403, + "learning_rate": 4.646455384850466e-05, + "loss": 1.2464, + "num_input_tokens_seen": 158794420, + "step": 4050 + }, + { + "epoch": 0.19672448880705495, + "grad_norm": 0.4318133294582367, + "learning_rate": 4.64444127871166e-05, + "loss": 1.1736, + "num_input_tokens_seen": 159183072, + "step": 4060 + }, + { + "epoch": 0.1972090318829344, + "grad_norm": 0.4135242700576782, + "learning_rate": 4.6424218908884795e-05, + "loss": 1.2094, + "num_input_tokens_seen": 159567032, + "step": 4070 + }, + { + "epoch": 0.19769357495881384, + "grad_norm": 0.410666823387146, + "learning_rate": 4.640397226354607e-05, + "loss": 1.2323, + "num_input_tokens_seen": 159937052, + "step": 4080 + }, + { + "epoch": 0.1981781180346933, + "grad_norm": 0.42202097177505493, + "learning_rate": 4.63836729009672e-05, + "loss": 1.209, + "num_input_tokens_seen": 160346176, + "step": 4090 + }, + { + "epoch": 0.19866266111057274, + "grad_norm": 0.42181599140167236, + "learning_rate": 4.636332087114481e-05, + "loss": 1.1995, + "num_input_tokens_seen": 160734656, + "step": 4100 + }, + { + "epoch": 0.19914720418645218, + "grad_norm": 0.42154404520988464, + "learning_rate": 4.6342916224205254e-05, + "loss": 1.2422, + "num_input_tokens_seen": 161143420, + "step": 4110 + }, + { + "epoch": 0.19963174726233163, + "grad_norm": 0.38917213678359985, + "learning_rate": 4.632245901040446e-05, + "loss": 1.2135, + "num_input_tokens_seen": 161523856, + "step": 4120 + }, + { + "epoch": 0.20011629033821107, + "grad_norm": 0.41019120812416077, + "learning_rate": 4.6301949280127835e-05, + "loss": 1.2309, + "num_input_tokens_seen": 161906012, + "step": 4130 + }, + { + "epoch": 0.20060083341409052, + "grad_norm": 0.3908578157424927, + "learning_rate": 4.6281387083890134e-05, + "loss": 1.2335, + "num_input_tokens_seen": 162302056, + "step": 4140 + }, + { + "epoch": 0.20108537648996996, + "grad_norm": 0.4184846878051758, + "learning_rate": 4.626077247233533e-05, + "loss": 1.207, + "num_input_tokens_seen": 162674300, + "step": 4150 + }, + { + "epoch": 0.2015699195658494, + "grad_norm": 0.41114839911460876, + "learning_rate": 4.62401054962365e-05, + "loss": 1.2287, + "num_input_tokens_seen": 163082608, + "step": 4160 + }, + { + "epoch": 0.20205446264172885, + "grad_norm": 0.4034859538078308, + "learning_rate": 4.6219386206495684e-05, + "loss": 1.2536, + "num_input_tokens_seen": 163463012, + "step": 4170 + }, + { + "epoch": 0.2025390057176083, + "grad_norm": 0.37627366185188293, + "learning_rate": 4.6198614654143765e-05, + "loss": 1.2428, + "num_input_tokens_seen": 163855604, + "step": 4180 + }, + { + "epoch": 0.20302354879348775, + "grad_norm": 0.3891449272632599, + "learning_rate": 4.6177790890340376e-05, + "loss": 1.2113, + "num_input_tokens_seen": 164258760, + "step": 4190 + }, + { + "epoch": 0.2035080918693672, + "grad_norm": 0.3885304033756256, + "learning_rate": 4.615691496637371e-05, + "loss": 1.2276, + "num_input_tokens_seen": 164630872, + "step": 4200 + }, + { + "epoch": 0.20399263494524664, + "grad_norm": 0.4263809323310852, + "learning_rate": 4.613598693366044e-05, + "loss": 1.1905, + "num_input_tokens_seen": 165040256, + "step": 4210 + }, + { + "epoch": 0.20447717802112608, + "grad_norm": 0.3751036524772644, + "learning_rate": 4.611500684374559e-05, + "loss": 1.2174, + "num_input_tokens_seen": 165424264, + "step": 4220 + }, + { + "epoch": 0.20496172109700553, + "grad_norm": 0.40289247035980225, + "learning_rate": 4.6093974748302385e-05, + "loss": 1.1899, + "num_input_tokens_seen": 165812068, + "step": 4230 + }, + { + "epoch": 0.20544626417288497, + "grad_norm": 0.4211735427379608, + "learning_rate": 4.6072890699132155e-05, + "loss": 1.1992, + "num_input_tokens_seen": 166177560, + "step": 4240 + }, + { + "epoch": 0.20593080724876442, + "grad_norm": 0.36460644006729126, + "learning_rate": 4.605175474816418e-05, + "loss": 1.2103, + "num_input_tokens_seen": 166578756, + "step": 4250 + }, + { + "epoch": 0.20641535032464386, + "grad_norm": 0.3921453356742859, + "learning_rate": 4.603056694745556e-05, + "loss": 1.2143, + "num_input_tokens_seen": 166975748, + "step": 4260 + }, + { + "epoch": 0.2068998934005233, + "grad_norm": 0.3987452983856201, + "learning_rate": 4.600932734919113e-05, + "loss": 1.2292, + "num_input_tokens_seen": 167388080, + "step": 4270 + }, + { + "epoch": 0.20738443647640276, + "grad_norm": 0.4142529368400574, + "learning_rate": 4.5988036005683265e-05, + "loss": 1.2048, + "num_input_tokens_seen": 167784884, + "step": 4280 + }, + { + "epoch": 0.2078689795522822, + "grad_norm": 0.39146703481674194, + "learning_rate": 4.596669296937182e-05, + "loss": 1.2422, + "num_input_tokens_seen": 168184036, + "step": 4290 + }, + { + "epoch": 0.20835352262816165, + "grad_norm": 0.3954419791698456, + "learning_rate": 4.594529829282395e-05, + "loss": 1.2297, + "num_input_tokens_seen": 168575028, + "step": 4300 + }, + { + "epoch": 0.2088380657040411, + "grad_norm": 0.4144536554813385, + "learning_rate": 4.5923852028733985e-05, + "loss": 1.1951, + "num_input_tokens_seen": 168948296, + "step": 4310 + }, + { + "epoch": 0.20932260877992054, + "grad_norm": 0.3958514630794525, + "learning_rate": 4.590235422992335e-05, + "loss": 1.1684, + "num_input_tokens_seen": 169357988, + "step": 4320 + }, + { + "epoch": 0.20980715185579998, + "grad_norm": 0.40073680877685547, + "learning_rate": 4.588080494934036e-05, + "loss": 1.2098, + "num_input_tokens_seen": 169761752, + "step": 4330 + }, + { + "epoch": 0.21029169493167943, + "grad_norm": 0.37582847476005554, + "learning_rate": 4.585920424006015e-05, + "loss": 1.2668, + "num_input_tokens_seen": 170160840, + "step": 4340 + }, + { + "epoch": 0.21077623800755887, + "grad_norm": 0.36600205302238464, + "learning_rate": 4.5837552155284516e-05, + "loss": 1.2413, + "num_input_tokens_seen": 170562748, + "step": 4350 + }, + { + "epoch": 0.21126078108343832, + "grad_norm": 0.41803476214408875, + "learning_rate": 4.581584874834179e-05, + "loss": 1.2522, + "num_input_tokens_seen": 170956952, + "step": 4360 + }, + { + "epoch": 0.21174532415931777, + "grad_norm": 0.43290525674819946, + "learning_rate": 4.5794094072686716e-05, + "loss": 1.254, + "num_input_tokens_seen": 171333724, + "step": 4370 + }, + { + "epoch": 0.2122298672351972, + "grad_norm": 0.4034164845943451, + "learning_rate": 4.5772288181900294e-05, + "loss": 1.2486, + "num_input_tokens_seen": 171727380, + "step": 4380 + }, + { + "epoch": 0.21271441031107666, + "grad_norm": 0.4040049910545349, + "learning_rate": 4.575043112968969e-05, + "loss": 1.24, + "num_input_tokens_seen": 172114780, + "step": 4390 + }, + { + "epoch": 0.2131989533869561, + "grad_norm": 0.39204099774360657, + "learning_rate": 4.5728522969888044e-05, + "loss": 1.2371, + "num_input_tokens_seen": 172504760, + "step": 4400 + }, + { + "epoch": 0.21368349646283555, + "grad_norm": 0.41550543904304504, + "learning_rate": 4.5706563756454414e-05, + "loss": 1.1737, + "num_input_tokens_seen": 172866336, + "step": 4410 + }, + { + "epoch": 0.214168039538715, + "grad_norm": 0.3796961009502411, + "learning_rate": 4.5684553543473555e-05, + "loss": 1.2245, + "num_input_tokens_seen": 173274608, + "step": 4420 + }, + { + "epoch": 0.21465258261459444, + "grad_norm": 0.39554864168167114, + "learning_rate": 4.5662492385155886e-05, + "loss": 1.2163, + "num_input_tokens_seen": 173670996, + "step": 4430 + }, + { + "epoch": 0.21513712569047388, + "grad_norm": 0.41738855838775635, + "learning_rate": 4.564038033583725e-05, + "loss": 1.1744, + "num_input_tokens_seen": 174089336, + "step": 4440 + }, + { + "epoch": 0.21562166876635333, + "grad_norm": 0.38417306542396545, + "learning_rate": 4.561821744997887e-05, + "loss": 1.2079, + "num_input_tokens_seen": 174457260, + "step": 4450 + }, + { + "epoch": 0.21610621184223278, + "grad_norm": 0.3893676698207855, + "learning_rate": 4.5596003782167154e-05, + "loss": 1.2236, + "num_input_tokens_seen": 174845308, + "step": 4460 + }, + { + "epoch": 0.21659075491811222, + "grad_norm": 0.5625600218772888, + "learning_rate": 4.55737393871136e-05, + "loss": 1.1457, + "num_input_tokens_seen": 175217384, + "step": 4470 + }, + { + "epoch": 0.21707529799399167, + "grad_norm": 0.4266904294490814, + "learning_rate": 4.555142431965465e-05, + "loss": 1.2847, + "num_input_tokens_seen": 175597084, + "step": 4480 + }, + { + "epoch": 0.2175598410698711, + "grad_norm": 0.40159592032432556, + "learning_rate": 4.552905863475152e-05, + "loss": 1.2064, + "num_input_tokens_seen": 175985196, + "step": 4490 + }, + { + "epoch": 0.21804438414575056, + "grad_norm": 0.3944200873374939, + "learning_rate": 4.550664238749012e-05, + "loss": 1.1969, + "num_input_tokens_seen": 176390156, + "step": 4500 + }, + { + "epoch": 0.21852892722163, + "grad_norm": 0.37802281975746155, + "learning_rate": 4.54841756330809e-05, + "loss": 1.1757, + "num_input_tokens_seen": 176751292, + "step": 4510 + }, + { + "epoch": 0.21901347029750945, + "grad_norm": 0.46125710010528564, + "learning_rate": 4.546165842685869e-05, + "loss": 1.1816, + "num_input_tokens_seen": 177136472, + "step": 4520 + }, + { + "epoch": 0.2194980133733889, + "grad_norm": 0.40457674860954285, + "learning_rate": 4.543909082428257e-05, + "loss": 1.2635, + "num_input_tokens_seen": 177564052, + "step": 4530 + }, + { + "epoch": 0.21998255644926834, + "grad_norm": 0.3985607326030731, + "learning_rate": 4.541647288093579e-05, + "loss": 1.2209, + "num_input_tokens_seen": 177977356, + "step": 4540 + }, + { + "epoch": 0.22046709952514779, + "grad_norm": 0.41993504762649536, + "learning_rate": 4.539380465252554e-05, + "loss": 1.1924, + "num_input_tokens_seen": 178361500, + "step": 4550 + }, + { + "epoch": 0.22095164260102723, + "grad_norm": 0.4131138324737549, + "learning_rate": 4.5371086194882886e-05, + "loss": 1.2114, + "num_input_tokens_seen": 178786424, + "step": 4560 + }, + { + "epoch": 0.22143618567690668, + "grad_norm": 0.3843264579772949, + "learning_rate": 4.5348317563962594e-05, + "loss": 1.2211, + "num_input_tokens_seen": 179173444, + "step": 4570 + }, + { + "epoch": 0.22192072875278612, + "grad_norm": 0.39719441533088684, + "learning_rate": 4.532549881584301e-05, + "loss": 1.2099, + "num_input_tokens_seen": 179550012, + "step": 4580 + }, + { + "epoch": 0.22240527182866557, + "grad_norm": 0.41848018765449524, + "learning_rate": 4.5302630006725934e-05, + "loss": 1.2015, + "num_input_tokens_seen": 179934896, + "step": 4590 + }, + { + "epoch": 0.22288981490454501, + "grad_norm": 0.37484750151634216, + "learning_rate": 4.527971119293643e-05, + "loss": 1.2331, + "num_input_tokens_seen": 180319432, + "step": 4600 + }, + { + "epoch": 0.22337435798042446, + "grad_norm": 0.40925487875938416, + "learning_rate": 4.525674243092275e-05, + "loss": 1.2214, + "num_input_tokens_seen": 180713196, + "step": 4610 + }, + { + "epoch": 0.2238589010563039, + "grad_norm": 0.381757915019989, + "learning_rate": 4.523372377725615e-05, + "loss": 1.1964, + "num_input_tokens_seen": 181112116, + "step": 4620 + }, + { + "epoch": 0.22434344413218335, + "grad_norm": 0.3882332742214203, + "learning_rate": 4.5210655288630774e-05, + "loss": 1.189, + "num_input_tokens_seen": 181537372, + "step": 4630 + }, + { + "epoch": 0.2248279872080628, + "grad_norm": 0.3924133777618408, + "learning_rate": 4.518753702186352e-05, + "loss": 1.2174, + "num_input_tokens_seen": 181917440, + "step": 4640 + }, + { + "epoch": 0.22531253028394224, + "grad_norm": 0.3810102045536041, + "learning_rate": 4.5164369033893874e-05, + "loss": 1.235, + "num_input_tokens_seen": 182302576, + "step": 4650 + }, + { + "epoch": 0.2257970733598217, + "grad_norm": 0.4031634032726288, + "learning_rate": 4.514115138178378e-05, + "loss": 1.2213, + "num_input_tokens_seen": 182677192, + "step": 4660 + }, + { + "epoch": 0.22628161643570113, + "grad_norm": 0.4006711542606354, + "learning_rate": 4.511788412271749e-05, + "loss": 1.1881, + "num_input_tokens_seen": 183057516, + "step": 4670 + }, + { + "epoch": 0.22676615951158058, + "grad_norm": 0.42727628350257874, + "learning_rate": 4.509456731400149e-05, + "loss": 1.1956, + "num_input_tokens_seen": 183446504, + "step": 4680 + }, + { + "epoch": 0.22725070258746002, + "grad_norm": 0.419729620218277, + "learning_rate": 4.507120101306425e-05, + "loss": 1.2287, + "num_input_tokens_seen": 183853916, + "step": 4690 + }, + { + "epoch": 0.22773524566333947, + "grad_norm": 0.3630729019641876, + "learning_rate": 4.504778527745616e-05, + "loss": 1.179, + "num_input_tokens_seen": 184251140, + "step": 4700 + }, + { + "epoch": 0.22821978873921892, + "grad_norm": 0.3897242248058319, + "learning_rate": 4.5024320164849366e-05, + "loss": 1.1704, + "num_input_tokens_seen": 184655808, + "step": 4710 + }, + { + "epoch": 0.22870433181509836, + "grad_norm": 0.3991836905479431, + "learning_rate": 4.500080573303762e-05, + "loss": 1.2208, + "num_input_tokens_seen": 185054436, + "step": 4720 + }, + { + "epoch": 0.2291888748909778, + "grad_norm": 0.39902886748313904, + "learning_rate": 4.497724203993615e-05, + "loss": 1.2088, + "num_input_tokens_seen": 185439916, + "step": 4730 + }, + { + "epoch": 0.22967341796685725, + "grad_norm": 0.39412978291511536, + "learning_rate": 4.495362914358152e-05, + "loss": 1.2317, + "num_input_tokens_seen": 185844756, + "step": 4740 + }, + { + "epoch": 0.2301579610427367, + "grad_norm": 0.37934547662734985, + "learning_rate": 4.4929967102131473e-05, + "loss": 1.2081, + "num_input_tokens_seen": 186226308, + "step": 4750 + }, + { + "epoch": 0.23064250411861614, + "grad_norm": 0.405785471200943, + "learning_rate": 4.49062559738648e-05, + "loss": 1.1877, + "num_input_tokens_seen": 186592700, + "step": 4760 + }, + { + "epoch": 0.2311270471944956, + "grad_norm": 0.36941656470298767, + "learning_rate": 4.488249581718118e-05, + "loss": 1.2322, + "num_input_tokens_seen": 186963268, + "step": 4770 + }, + { + "epoch": 0.23161159027037503, + "grad_norm": 0.3917679488658905, + "learning_rate": 4.485868669060104e-05, + "loss": 1.2006, + "num_input_tokens_seen": 187364124, + "step": 4780 + }, + { + "epoch": 0.23209613334625448, + "grad_norm": 0.38910412788391113, + "learning_rate": 4.483482865276545e-05, + "loss": 1.2244, + "num_input_tokens_seen": 187761036, + "step": 4790 + }, + { + "epoch": 0.23258067642213393, + "grad_norm": 0.42575618624687195, + "learning_rate": 4.481092176243592e-05, + "loss": 1.2119, + "num_input_tokens_seen": 188156176, + "step": 4800 + }, + { + "epoch": 0.23306521949801337, + "grad_norm": 0.3848058879375458, + "learning_rate": 4.4786966078494296e-05, + "loss": 1.2539, + "num_input_tokens_seen": 188549760, + "step": 4810 + }, + { + "epoch": 0.23354976257389282, + "grad_norm": 0.4218595623970032, + "learning_rate": 4.47629616599426e-05, + "loss": 1.2262, + "num_input_tokens_seen": 188935732, + "step": 4820 + }, + { + "epoch": 0.23403430564977226, + "grad_norm": 0.3957028388977051, + "learning_rate": 4.473890856590287e-05, + "loss": 1.2091, + "num_input_tokens_seen": 189364744, + "step": 4830 + }, + { + "epoch": 0.2345188487256517, + "grad_norm": 0.3964502811431885, + "learning_rate": 4.471480685561704e-05, + "loss": 1.1825, + "num_input_tokens_seen": 189761404, + "step": 4840 + }, + { + "epoch": 0.23500339180153115, + "grad_norm": 0.44072818756103516, + "learning_rate": 4.469065658844679e-05, + "loss": 1.1775, + "num_input_tokens_seen": 190143308, + "step": 4850 + }, + { + "epoch": 0.2354879348774106, + "grad_norm": 0.4088262617588043, + "learning_rate": 4.466645782387339e-05, + "loss": 1.1938, + "num_input_tokens_seen": 190553260, + "step": 4860 + }, + { + "epoch": 0.23597247795329004, + "grad_norm": 0.3793879449367523, + "learning_rate": 4.464221062149756e-05, + "loss": 1.184, + "num_input_tokens_seen": 190944324, + "step": 4870 + }, + { + "epoch": 0.2364570210291695, + "grad_norm": 0.3843640089035034, + "learning_rate": 4.461791504103931e-05, + "loss": 1.1906, + "num_input_tokens_seen": 191314092, + "step": 4880 + }, + { + "epoch": 0.23694156410504894, + "grad_norm": 0.42063283920288086, + "learning_rate": 4.459357114233781e-05, + "loss": 1.2077, + "num_input_tokens_seen": 191680088, + "step": 4890 + }, + { + "epoch": 0.23742610718092838, + "grad_norm": 0.3904326558113098, + "learning_rate": 4.4569178985351246e-05, + "loss": 1.1678, + "num_input_tokens_seen": 192097904, + "step": 4900 + }, + { + "epoch": 0.23791065025680783, + "grad_norm": 0.40638238191604614, + "learning_rate": 4.4544738630156644e-05, + "loss": 1.2103, + "num_input_tokens_seen": 192484692, + "step": 4910 + }, + { + "epoch": 0.23839519333268727, + "grad_norm": 0.4055958092212677, + "learning_rate": 4.4520250136949755e-05, + "loss": 1.2244, + "num_input_tokens_seen": 192862996, + "step": 4920 + }, + { + "epoch": 0.23887973640856672, + "grad_norm": 0.42126190662384033, + "learning_rate": 4.449571356604488e-05, + "loss": 1.1551, + "num_input_tokens_seen": 193256264, + "step": 4930 + }, + { + "epoch": 0.23936427948444616, + "grad_norm": 0.3869550824165344, + "learning_rate": 4.4471128977874755e-05, + "loss": 1.2033, + "num_input_tokens_seen": 193631576, + "step": 4940 + }, + { + "epoch": 0.2398488225603256, + "grad_norm": 0.4147154986858368, + "learning_rate": 4.4446496432990345e-05, + "loss": 1.1741, + "num_input_tokens_seen": 194010208, + "step": 4950 + }, + { + "epoch": 0.24033336563620505, + "grad_norm": 0.39161938428878784, + "learning_rate": 4.442181599206078e-05, + "loss": 1.2042, + "num_input_tokens_seen": 194370712, + "step": 4960 + }, + { + "epoch": 0.2408179087120845, + "grad_norm": 0.40375518798828125, + "learning_rate": 4.439708771587311e-05, + "loss": 1.2074, + "num_input_tokens_seen": 194764676, + "step": 4970 + }, + { + "epoch": 0.24130245178796395, + "grad_norm": 0.37387263774871826, + "learning_rate": 4.437231166533222e-05, + "loss": 1.23, + "num_input_tokens_seen": 195173608, + "step": 4980 + }, + { + "epoch": 0.2417869948638434, + "grad_norm": 0.45811185240745544, + "learning_rate": 4.434748790146066e-05, + "loss": 1.1996, + "num_input_tokens_seen": 195550528, + "step": 4990 + }, + { + "epoch": 0.24227153793972284, + "grad_norm": 0.40425029397010803, + "learning_rate": 4.432261648539852e-05, + "loss": 1.1837, + "num_input_tokens_seen": 195956732, + "step": 5000 + }, + { + "epoch": 0.24275608101560228, + "grad_norm": 0.40268510580062866, + "learning_rate": 4.42976974784032e-05, + "loss": 1.25, + "num_input_tokens_seen": 196340952, + "step": 5010 + }, + { + "epoch": 0.24324062409148173, + "grad_norm": 0.43269723653793335, + "learning_rate": 4.427273094184938e-05, + "loss": 1.174, + "num_input_tokens_seen": 196711452, + "step": 5020 + }, + { + "epoch": 0.24372516716736117, + "grad_norm": 0.3802777826786041, + "learning_rate": 4.424771693722875e-05, + "loss": 1.2558, + "num_input_tokens_seen": 197112496, + "step": 5030 + }, + { + "epoch": 0.24420971024324062, + "grad_norm": 0.39778804779052734, + "learning_rate": 4.422265552614995e-05, + "loss": 1.1993, + "num_input_tokens_seen": 197504404, + "step": 5040 + }, + { + "epoch": 0.24469425331912006, + "grad_norm": 0.4463038146495819, + "learning_rate": 4.4197546770338346e-05, + "loss": 1.196, + "num_input_tokens_seen": 197904548, + "step": 5050 + }, + { + "epoch": 0.2451787963949995, + "grad_norm": 0.37699079513549805, + "learning_rate": 4.417239073163596e-05, + "loss": 1.1525, + "num_input_tokens_seen": 198319024, + "step": 5060 + }, + { + "epoch": 0.24566333947087896, + "grad_norm": 0.3865511119365692, + "learning_rate": 4.414718747200121e-05, + "loss": 1.2275, + "num_input_tokens_seen": 198714528, + "step": 5070 + }, + { + "epoch": 0.2461478825467584, + "grad_norm": 0.3743216395378113, + "learning_rate": 4.412193705350886e-05, + "loss": 1.2307, + "num_input_tokens_seen": 199085164, + "step": 5080 + }, + { + "epoch": 0.24663242562263785, + "grad_norm": 0.41756245493888855, + "learning_rate": 4.4096639538349835e-05, + "loss": 1.2028, + "num_input_tokens_seen": 199462308, + "step": 5090 + }, + { + "epoch": 0.2471169686985173, + "grad_norm": 0.35077378153800964, + "learning_rate": 4.407129498883101e-05, + "loss": 1.1789, + "num_input_tokens_seen": 199882788, + "step": 5100 + }, + { + "epoch": 0.24760151177439674, + "grad_norm": 0.4078513979911804, + "learning_rate": 4.404590346737514e-05, + "loss": 1.2188, + "num_input_tokens_seen": 200271336, + "step": 5110 + }, + { + "epoch": 0.24808605485027618, + "grad_norm": 0.42663541436195374, + "learning_rate": 4.402046503652065e-05, + "loss": 1.2293, + "num_input_tokens_seen": 200660788, + "step": 5120 + }, + { + "epoch": 0.24857059792615563, + "grad_norm": 0.3861826956272125, + "learning_rate": 4.399497975892153e-05, + "loss": 1.2182, + "num_input_tokens_seen": 201075832, + "step": 5130 + }, + { + "epoch": 0.24905514100203507, + "grad_norm": 0.38945886492729187, + "learning_rate": 4.3969447697347116e-05, + "loss": 1.2464, + "num_input_tokens_seen": 201447360, + "step": 5140 + }, + { + "epoch": 0.24953968407791452, + "grad_norm": 0.3884553611278534, + "learning_rate": 4.3943868914681996e-05, + "loss": 1.2353, + "num_input_tokens_seen": 201863948, + "step": 5150 + }, + { + "epoch": 0.25002422715379397, + "grad_norm": 0.409061461687088, + "learning_rate": 4.3918243473925823e-05, + "loss": 1.2235, + "num_input_tokens_seen": 202265196, + "step": 5160 + }, + { + "epoch": 0.25050877022967344, + "grad_norm": 0.43314066529273987, + "learning_rate": 4.389257143819318e-05, + "loss": 1.2076, + "num_input_tokens_seen": 202665352, + "step": 5170 + }, + { + "epoch": 0.25099331330555286, + "grad_norm": 0.43011194467544556, + "learning_rate": 4.386685287071337e-05, + "loss": 1.1834, + "num_input_tokens_seen": 203076796, + "step": 5180 + }, + { + "epoch": 0.25147785638143233, + "grad_norm": 0.4096457064151764, + "learning_rate": 4.3841087834830354e-05, + "loss": 1.2336, + "num_input_tokens_seen": 203488952, + "step": 5190 + }, + { + "epoch": 0.25196239945731175, + "grad_norm": 0.4316791594028473, + "learning_rate": 4.381527639400251e-05, + "loss": 1.2205, + "num_input_tokens_seen": 203855896, + "step": 5200 + }, + { + "epoch": 0.2524469425331912, + "grad_norm": 0.4344671666622162, + "learning_rate": 4.3789418611802533e-05, + "loss": 1.1693, + "num_input_tokens_seen": 204240628, + "step": 5210 + }, + { + "epoch": 0.25293148560907064, + "grad_norm": 0.378844678401947, + "learning_rate": 4.3763514551917236e-05, + "loss": 1.1722, + "num_input_tokens_seen": 204638860, + "step": 5220 + }, + { + "epoch": 0.2534160286849501, + "grad_norm": 0.38779839873313904, + "learning_rate": 4.3737564278147406e-05, + "loss": 1.2179, + "num_input_tokens_seen": 205028820, + "step": 5230 + }, + { + "epoch": 0.25390057176082953, + "grad_norm": 0.4080614745616913, + "learning_rate": 4.3711567854407683e-05, + "loss": 1.1904, + "num_input_tokens_seen": 205408752, + "step": 5240 + }, + { + "epoch": 0.254385114836709, + "grad_norm": 0.40406113862991333, + "learning_rate": 4.3685525344726345e-05, + "loss": 1.2017, + "num_input_tokens_seen": 205800048, + "step": 5250 + }, + { + "epoch": 0.2548696579125884, + "grad_norm": 0.3979373872280121, + "learning_rate": 4.365943681324519e-05, + "loss": 1.2127, + "num_input_tokens_seen": 206178400, + "step": 5260 + }, + { + "epoch": 0.2553542009884679, + "grad_norm": 0.406194269657135, + "learning_rate": 4.363330232421938e-05, + "loss": 1.2299, + "num_input_tokens_seen": 206550496, + "step": 5270 + }, + { + "epoch": 0.2558387440643473, + "grad_norm": 0.43176546692848206, + "learning_rate": 4.3607121942017225e-05, + "loss": 1.1843, + "num_input_tokens_seen": 206938216, + "step": 5280 + }, + { + "epoch": 0.2563232871402268, + "grad_norm": 0.43860721588134766, + "learning_rate": 4.358089573112012e-05, + "loss": 1.1568, + "num_input_tokens_seen": 207342028, + "step": 5290 + }, + { + "epoch": 0.2568078302161062, + "grad_norm": 0.40631529688835144, + "learning_rate": 4.355462375612231e-05, + "loss": 1.2707, + "num_input_tokens_seen": 207740984, + "step": 5300 + }, + { + "epoch": 0.2572923732919857, + "grad_norm": 0.3890661895275116, + "learning_rate": 4.352830608173074e-05, + "loss": 1.1957, + "num_input_tokens_seen": 208160252, + "step": 5310 + }, + { + "epoch": 0.2577769163678651, + "grad_norm": 0.40882420539855957, + "learning_rate": 4.350194277276495e-05, + "loss": 1.1938, + "num_input_tokens_seen": 208537272, + "step": 5320 + }, + { + "epoch": 0.25826145944374457, + "grad_norm": 0.4055963456630707, + "learning_rate": 4.347553389415684e-05, + "loss": 1.1603, + "num_input_tokens_seen": 208927108, + "step": 5330 + }, + { + "epoch": 0.258746002519624, + "grad_norm": 0.3766281008720398, + "learning_rate": 4.3449079510950555e-05, + "loss": 1.2169, + "num_input_tokens_seen": 209320208, + "step": 5340 + }, + { + "epoch": 0.25923054559550346, + "grad_norm": 0.3955957591533661, + "learning_rate": 4.3422579688302337e-05, + "loss": 1.2096, + "num_input_tokens_seen": 209704852, + "step": 5350 + }, + { + "epoch": 0.2597150886713829, + "grad_norm": 0.416856050491333, + "learning_rate": 4.339603449148031e-05, + "loss": 1.2155, + "num_input_tokens_seen": 210092272, + "step": 5360 + }, + { + "epoch": 0.26019963174726235, + "grad_norm": 0.3843337893486023, + "learning_rate": 4.33694439858644e-05, + "loss": 1.2163, + "num_input_tokens_seen": 210513236, + "step": 5370 + }, + { + "epoch": 0.26068417482314177, + "grad_norm": 0.3668496310710907, + "learning_rate": 4.334280823694607e-05, + "loss": 1.2402, + "num_input_tokens_seen": 210927300, + "step": 5380 + }, + { + "epoch": 0.26116871789902124, + "grad_norm": 0.38131269812583923, + "learning_rate": 4.331612731032826e-05, + "loss": 1.1913, + "num_input_tokens_seen": 211339824, + "step": 5390 + }, + { + "epoch": 0.26165326097490066, + "grad_norm": 0.4179877042770386, + "learning_rate": 4.328940127172516e-05, + "loss": 1.197, + "num_input_tokens_seen": 211752352, + "step": 5400 + }, + { + "epoch": 0.26213780405078013, + "grad_norm": 0.42718562483787537, + "learning_rate": 4.326263018696208e-05, + "loss": 1.199, + "num_input_tokens_seen": 212135324, + "step": 5410 + }, + { + "epoch": 0.26262234712665955, + "grad_norm": 0.43352535367012024, + "learning_rate": 4.3235814121975274e-05, + "loss": 1.2094, + "num_input_tokens_seen": 212524932, + "step": 5420 + }, + { + "epoch": 0.263106890202539, + "grad_norm": 0.396100789308548, + "learning_rate": 4.320895314281177e-05, + "loss": 1.2254, + "num_input_tokens_seen": 212938816, + "step": 5430 + }, + { + "epoch": 0.26359143327841844, + "grad_norm": 0.3762286901473999, + "learning_rate": 4.318204731562922e-05, + "loss": 1.2321, + "num_input_tokens_seen": 213336448, + "step": 5440 + }, + { + "epoch": 0.2640759763542979, + "grad_norm": 0.35878944396972656, + "learning_rate": 4.3155096706695755e-05, + "loss": 1.2227, + "num_input_tokens_seen": 213756868, + "step": 5450 + }, + { + "epoch": 0.26456051943017733, + "grad_norm": 0.4023889899253845, + "learning_rate": 4.312810138238979e-05, + "loss": 1.2823, + "num_input_tokens_seen": 214154532, + "step": 5460 + }, + { + "epoch": 0.2650450625060568, + "grad_norm": 0.40626269578933716, + "learning_rate": 4.310106140919986e-05, + "loss": 1.2193, + "num_input_tokens_seen": 214554500, + "step": 5470 + }, + { + "epoch": 0.2655296055819362, + "grad_norm": 0.4005719721317291, + "learning_rate": 4.307397685372448e-05, + "loss": 1.1867, + "num_input_tokens_seen": 214941668, + "step": 5480 + }, + { + "epoch": 0.2660141486578157, + "grad_norm": 0.4291759431362152, + "learning_rate": 4.304684778267199e-05, + "loss": 1.2184, + "num_input_tokens_seen": 215343812, + "step": 5490 + }, + { + "epoch": 0.2664986917336951, + "grad_norm": 0.41158977150917053, + "learning_rate": 4.3019674262860324e-05, + "loss": 1.1839, + "num_input_tokens_seen": 215688820, + "step": 5500 + }, + { + "epoch": 0.2669832348095746, + "grad_norm": 0.4709673821926117, + "learning_rate": 4.2992456361216934e-05, + "loss": 1.2018, + "num_input_tokens_seen": 216078056, + "step": 5510 + }, + { + "epoch": 0.267467777885454, + "grad_norm": 0.47514909505844116, + "learning_rate": 4.2965194144778554e-05, + "loss": 1.2325, + "num_input_tokens_seen": 216461372, + "step": 5520 + }, + { + "epoch": 0.2679523209613335, + "grad_norm": 0.38686904311180115, + "learning_rate": 4.293788768069108e-05, + "loss": 1.2933, + "num_input_tokens_seen": 216842388, + "step": 5530 + }, + { + "epoch": 0.2684368640372129, + "grad_norm": 0.3987712860107422, + "learning_rate": 4.291053703620939e-05, + "loss": 1.2467, + "num_input_tokens_seen": 217231240, + "step": 5540 + }, + { + "epoch": 0.26892140711309237, + "grad_norm": 0.4047813415527344, + "learning_rate": 4.288314227869716e-05, + "loss": 1.2023, + "num_input_tokens_seen": 217648192, + "step": 5550 + }, + { + "epoch": 0.2694059501889718, + "grad_norm": 0.424683541059494, + "learning_rate": 4.285570347562674e-05, + "loss": 1.2134, + "num_input_tokens_seen": 218017588, + "step": 5560 + }, + { + "epoch": 0.26989049326485126, + "grad_norm": 0.4166494607925415, + "learning_rate": 4.2828220694578935e-05, + "loss": 1.1818, + "num_input_tokens_seen": 218425896, + "step": 5570 + }, + { + "epoch": 0.2703750363407307, + "grad_norm": 0.3912692964076996, + "learning_rate": 4.28006940032429e-05, + "loss": 1.1785, + "num_input_tokens_seen": 218807588, + "step": 5580 + }, + { + "epoch": 0.27085957941661015, + "grad_norm": 0.36660903692245483, + "learning_rate": 4.2773123469415897e-05, + "loss": 1.1937, + "num_input_tokens_seen": 219185200, + "step": 5590 + }, + { + "epoch": 0.27134412249248957, + "grad_norm": 0.39641043543815613, + "learning_rate": 4.274550916100321e-05, + "loss": 1.1904, + "num_input_tokens_seen": 219546888, + "step": 5600 + }, + { + "epoch": 0.27182866556836904, + "grad_norm": 0.371076375246048, + "learning_rate": 4.271785114601791e-05, + "loss": 1.2116, + "num_input_tokens_seen": 219938908, + "step": 5610 + }, + { + "epoch": 0.27231320864424846, + "grad_norm": 0.4308777153491974, + "learning_rate": 4.269014949258072e-05, + "loss": 1.1918, + "num_input_tokens_seen": 220337796, + "step": 5620 + }, + { + "epoch": 0.27279775172012793, + "grad_norm": 0.4023754894733429, + "learning_rate": 4.266240426891987e-05, + "loss": 1.1797, + "num_input_tokens_seen": 220717632, + "step": 5630 + }, + { + "epoch": 0.27328229479600735, + "grad_norm": 0.3996109366416931, + "learning_rate": 4.2634615543370885e-05, + "loss": 1.1916, + "num_input_tokens_seen": 221115656, + "step": 5640 + }, + { + "epoch": 0.2737668378718868, + "grad_norm": 0.43148279190063477, + "learning_rate": 4.26067833843764e-05, + "loss": 1.1981, + "num_input_tokens_seen": 221450512, + "step": 5650 + }, + { + "epoch": 0.27425138094776624, + "grad_norm": 0.4533494710922241, + "learning_rate": 4.257890786048609e-05, + "loss": 1.1904, + "num_input_tokens_seen": 221855800, + "step": 5660 + }, + { + "epoch": 0.2747359240236457, + "grad_norm": 0.38806283473968506, + "learning_rate": 4.255098904035638e-05, + "loss": 1.2631, + "num_input_tokens_seen": 222267776, + "step": 5670 + }, + { + "epoch": 0.27522046709952513, + "grad_norm": 0.3938799202442169, + "learning_rate": 4.252302699275037e-05, + "loss": 1.1635, + "num_input_tokens_seen": 222682008, + "step": 5680 + }, + { + "epoch": 0.2757050101754046, + "grad_norm": 0.4024967551231384, + "learning_rate": 4.249502178653759e-05, + "loss": 1.1915, + "num_input_tokens_seen": 223053692, + "step": 5690 + }, + { + "epoch": 0.276189553251284, + "grad_norm": 0.3679750859737396, + "learning_rate": 4.246697349069391e-05, + "loss": 1.1711, + "num_input_tokens_seen": 223438432, + "step": 5700 + }, + { + "epoch": 0.2766740963271635, + "grad_norm": 0.42832136154174805, + "learning_rate": 4.243888217430129e-05, + "loss": 1.1753, + "num_input_tokens_seen": 223831660, + "step": 5710 + }, + { + "epoch": 0.2771586394030429, + "grad_norm": 0.4067901074886322, + "learning_rate": 4.241074790654769e-05, + "loss": 1.2218, + "num_input_tokens_seen": 224233376, + "step": 5720 + }, + { + "epoch": 0.2776431824789224, + "grad_norm": 0.38691502809524536, + "learning_rate": 4.2382570756726815e-05, + "loss": 1.2041, + "num_input_tokens_seen": 224640328, + "step": 5730 + }, + { + "epoch": 0.2781277255548018, + "grad_norm": 0.40320757031440735, + "learning_rate": 4.235435079423802e-05, + "loss": 1.2297, + "num_input_tokens_seen": 225039904, + "step": 5740 + }, + { + "epoch": 0.2786122686306813, + "grad_norm": 0.43891477584838867, + "learning_rate": 4.232608808858608e-05, + "loss": 1.1493, + "num_input_tokens_seen": 225441684, + "step": 5750 + }, + { + "epoch": 0.2790968117065607, + "grad_norm": 0.4412339925765991, + "learning_rate": 4.2297782709381065e-05, + "loss": 1.2154, + "num_input_tokens_seen": 225837976, + "step": 5760 + }, + { + "epoch": 0.2795813547824402, + "grad_norm": 0.39056098461151123, + "learning_rate": 4.226943472633813e-05, + "loss": 1.1641, + "num_input_tokens_seen": 226187316, + "step": 5770 + }, + { + "epoch": 0.2800658978583196, + "grad_norm": 0.41356363892555237, + "learning_rate": 4.2241044209277384e-05, + "loss": 1.2058, + "num_input_tokens_seen": 226579696, + "step": 5780 + }, + { + "epoch": 0.28055044093419906, + "grad_norm": 0.40139320492744446, + "learning_rate": 4.2212611228123686e-05, + "loss": 1.2125, + "num_input_tokens_seen": 226972904, + "step": 5790 + }, + { + "epoch": 0.2810349840100785, + "grad_norm": 0.40096282958984375, + "learning_rate": 4.218413585290647e-05, + "loss": 1.2027, + "num_input_tokens_seen": 227364980, + "step": 5800 + }, + { + "epoch": 0.28151952708595795, + "grad_norm": 0.41258716583251953, + "learning_rate": 4.21556181537596e-05, + "loss": 1.2129, + "num_input_tokens_seen": 227750424, + "step": 5810 + }, + { + "epoch": 0.2820040701618374, + "grad_norm": 0.4123449921607971, + "learning_rate": 4.2127058200921186e-05, + "loss": 1.1705, + "num_input_tokens_seen": 228150768, + "step": 5820 + }, + { + "epoch": 0.28248861323771685, + "grad_norm": 0.3969346582889557, + "learning_rate": 4.209845606473339e-05, + "loss": 1.1913, + "num_input_tokens_seen": 228525776, + "step": 5830 + }, + { + "epoch": 0.28297315631359626, + "grad_norm": 0.378460168838501, + "learning_rate": 4.206981181564229e-05, + "loss": 1.1467, + "num_input_tokens_seen": 228912604, + "step": 5840 + }, + { + "epoch": 0.28345769938947574, + "grad_norm": 0.38569942116737366, + "learning_rate": 4.2041125524197664e-05, + "loss": 1.2075, + "num_input_tokens_seen": 229298524, + "step": 5850 + }, + { + "epoch": 0.28394224246535515, + "grad_norm": 0.3867596983909607, + "learning_rate": 4.201239726105286e-05, + "loss": 1.1713, + "num_input_tokens_seen": 229686964, + "step": 5860 + }, + { + "epoch": 0.28442678554123463, + "grad_norm": 0.373037725687027, + "learning_rate": 4.198362709696458e-05, + "loss": 1.2551, + "num_input_tokens_seen": 230060948, + "step": 5870 + }, + { + "epoch": 0.28491132861711405, + "grad_norm": 0.3632396161556244, + "learning_rate": 4.195481510279276e-05, + "loss": 1.2301, + "num_input_tokens_seen": 230448628, + "step": 5880 + }, + { + "epoch": 0.2853958716929935, + "grad_norm": 0.4397088289260864, + "learning_rate": 4.192596134950033e-05, + "loss": 1.2441, + "num_input_tokens_seen": 230837484, + "step": 5890 + }, + { + "epoch": 0.28588041476887294, + "grad_norm": 0.37855035066604614, + "learning_rate": 4.189706590815307e-05, + "loss": 1.2049, + "num_input_tokens_seen": 231247448, + "step": 5900 + }, + { + "epoch": 0.2863649578447524, + "grad_norm": 0.3964214324951172, + "learning_rate": 4.186812884991946e-05, + "loss": 1.1927, + "num_input_tokens_seen": 231653776, + "step": 5910 + }, + { + "epoch": 0.28684950092063183, + "grad_norm": 0.35895830392837524, + "learning_rate": 4.183915024607048e-05, + "loss": 1.2302, + "num_input_tokens_seen": 232045136, + "step": 5920 + }, + { + "epoch": 0.2873340439965113, + "grad_norm": 0.375405877828598, + "learning_rate": 4.18101301679794e-05, + "loss": 1.1242, + "num_input_tokens_seen": 232402564, + "step": 5930 + }, + { + "epoch": 0.2878185870723907, + "grad_norm": 0.37960904836654663, + "learning_rate": 4.178106868712168e-05, + "loss": 1.2209, + "num_input_tokens_seen": 232790844, + "step": 5940 + }, + { + "epoch": 0.2883031301482702, + "grad_norm": 0.4015379250049591, + "learning_rate": 4.1751965875074746e-05, + "loss": 1.1929, + "num_input_tokens_seen": 233194892, + "step": 5950 + }, + { + "epoch": 0.2887876732241496, + "grad_norm": 0.42236605286598206, + "learning_rate": 4.172282180351779e-05, + "loss": 1.2123, + "num_input_tokens_seen": 233566416, + "step": 5960 + }, + { + "epoch": 0.2892722163000291, + "grad_norm": 0.4050694406032562, + "learning_rate": 4.169363654423166e-05, + "loss": 1.1981, + "num_input_tokens_seen": 233992752, + "step": 5970 + }, + { + "epoch": 0.2897567593759085, + "grad_norm": 0.4495182931423187, + "learning_rate": 4.166441016909864e-05, + "loss": 1.2207, + "num_input_tokens_seen": 234347140, + "step": 5980 + }, + { + "epoch": 0.290241302451788, + "grad_norm": 0.3789389431476593, + "learning_rate": 4.163514275010228e-05, + "loss": 1.1707, + "num_input_tokens_seen": 234751484, + "step": 5990 + }, + { + "epoch": 0.2907258455276674, + "grad_norm": 0.3984537720680237, + "learning_rate": 4.160583435932719e-05, + "loss": 1.2024, + "num_input_tokens_seen": 235148092, + "step": 6000 + }, + { + "epoch": 0.2907258455276674, + "eval_loss": 1.200870156288147, + "eval_runtime": 5.1147, + "eval_samples_per_second": 29.327, + "eval_steps_per_second": 3.715, + "num_input_tokens_seen": 235148092, + "step": 6000 + }, + { + "epoch": 0.29121038860354687, + "grad_norm": 0.409307062625885, + "learning_rate": 4.157648506895895e-05, + "loss": 1.2106, + "num_input_tokens_seen": 235543492, + "step": 6010 + }, + { + "epoch": 0.2916949316794263, + "grad_norm": 0.3580097258090973, + "learning_rate": 4.154709495128383e-05, + "loss": 1.1765, + "num_input_tokens_seen": 235908876, + "step": 6020 + }, + { + "epoch": 0.29217947475530576, + "grad_norm": 0.38826310634613037, + "learning_rate": 4.151766407868866e-05, + "loss": 1.1798, + "num_input_tokens_seen": 236298644, + "step": 6030 + }, + { + "epoch": 0.2926640178311852, + "grad_norm": 0.3992711901664734, + "learning_rate": 4.1488192523660676e-05, + "loss": 1.2077, + "num_input_tokens_seen": 236681740, + "step": 6040 + }, + { + "epoch": 0.29314856090706465, + "grad_norm": 0.4223003089427948, + "learning_rate": 4.1458680358787275e-05, + "loss": 1.1904, + "num_input_tokens_seen": 237098732, + "step": 6050 + }, + { + "epoch": 0.29363310398294407, + "grad_norm": 0.3620993494987488, + "learning_rate": 4.14291276567559e-05, + "loss": 1.2307, + "num_input_tokens_seen": 237483400, + "step": 6060 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.3823581039905548, + "learning_rate": 4.139953449035383e-05, + "loss": 1.1695, + "num_input_tokens_seen": 237865200, + "step": 6070 + }, + { + "epoch": 0.29460219013470296, + "grad_norm": 0.37260711193084717, + "learning_rate": 4.136990093246802e-05, + "loss": 1.2297, + "num_input_tokens_seen": 238233548, + "step": 6080 + }, + { + "epoch": 0.29508673321058243, + "grad_norm": 0.4020680785179138, + "learning_rate": 4.134022705608487e-05, + "loss": 1.2263, + "num_input_tokens_seen": 238623180, + "step": 6090 + }, + { + "epoch": 0.29557127628646185, + "grad_norm": 0.41028186678886414, + "learning_rate": 4.1310512934290124e-05, + "loss": 1.2341, + "num_input_tokens_seen": 238985848, + "step": 6100 + }, + { + "epoch": 0.2960558193623413, + "grad_norm": 0.4138743579387665, + "learning_rate": 4.1280758640268625e-05, + "loss": 1.2075, + "num_input_tokens_seen": 239355272, + "step": 6110 + }, + { + "epoch": 0.29654036243822074, + "grad_norm": 0.35869237780570984, + "learning_rate": 4.125096424730417e-05, + "loss": 1.2002, + "num_input_tokens_seen": 239742404, + "step": 6120 + }, + { + "epoch": 0.2970249055141002, + "grad_norm": 0.39054998755455017, + "learning_rate": 4.12211298287793e-05, + "loss": 1.2103, + "num_input_tokens_seen": 240150336, + "step": 6130 + }, + { + "epoch": 0.29750944858997963, + "grad_norm": 0.42105597257614136, + "learning_rate": 4.119125545817517e-05, + "loss": 1.2114, + "num_input_tokens_seen": 240519488, + "step": 6140 + }, + { + "epoch": 0.2979939916658591, + "grad_norm": 0.37524622678756714, + "learning_rate": 4.11613412090713e-05, + "loss": 1.2072, + "num_input_tokens_seen": 240898472, + "step": 6150 + }, + { + "epoch": 0.2984785347417385, + "grad_norm": 0.3830278217792511, + "learning_rate": 4.113138715514546e-05, + "loss": 1.2617, + "num_input_tokens_seen": 241258828, + "step": 6160 + }, + { + "epoch": 0.298963077817618, + "grad_norm": 0.3757288157939911, + "learning_rate": 4.110139337017345e-05, + "loss": 1.201, + "num_input_tokens_seen": 241648504, + "step": 6170 + }, + { + "epoch": 0.2994476208934974, + "grad_norm": 0.500215470790863, + "learning_rate": 4.1071359928028896e-05, + "loss": 1.218, + "num_input_tokens_seen": 242026552, + "step": 6180 + }, + { + "epoch": 0.2999321639693769, + "grad_norm": 0.3983130156993866, + "learning_rate": 4.104128690268314e-05, + "loss": 1.1784, + "num_input_tokens_seen": 242415488, + "step": 6190 + }, + { + "epoch": 0.3004167070452563, + "grad_norm": 0.4273653030395508, + "learning_rate": 4.101117436820499e-05, + "loss": 1.2151, + "num_input_tokens_seen": 242796596, + "step": 6200 + }, + { + "epoch": 0.3009012501211358, + "grad_norm": 0.4290236830711365, + "learning_rate": 4.098102239876058e-05, + "loss": 1.1989, + "num_input_tokens_seen": 243188916, + "step": 6210 + }, + { + "epoch": 0.3013857931970152, + "grad_norm": 0.4325619339942932, + "learning_rate": 4.095083106861317e-05, + "loss": 1.1811, + "num_input_tokens_seen": 243612644, + "step": 6220 + }, + { + "epoch": 0.30187033627289467, + "grad_norm": 0.38956135511398315, + "learning_rate": 4.0920600452122934e-05, + "loss": 1.2224, + "num_input_tokens_seen": 244023064, + "step": 6230 + }, + { + "epoch": 0.3023548793487741, + "grad_norm": 0.3935461938381195, + "learning_rate": 4.0890330623746856e-05, + "loss": 1.175, + "num_input_tokens_seen": 244414972, + "step": 6240 + }, + { + "epoch": 0.30283942242465356, + "grad_norm": 0.3803289830684662, + "learning_rate": 4.086002165803845e-05, + "loss": 1.1878, + "num_input_tokens_seen": 244793504, + "step": 6250 + }, + { + "epoch": 0.303323965500533, + "grad_norm": 0.3836755156517029, + "learning_rate": 4.082967362964766e-05, + "loss": 1.2004, + "num_input_tokens_seen": 245195808, + "step": 6260 + }, + { + "epoch": 0.30380850857641245, + "grad_norm": 0.36543598771095276, + "learning_rate": 4.079928661332062e-05, + "loss": 1.2333, + "num_input_tokens_seen": 245600428, + "step": 6270 + }, + { + "epoch": 0.30429305165229187, + "grad_norm": 0.46092289686203003, + "learning_rate": 4.07688606838995e-05, + "loss": 1.1549, + "num_input_tokens_seen": 246013016, + "step": 6280 + }, + { + "epoch": 0.30477759472817134, + "grad_norm": 0.364113986492157, + "learning_rate": 4.0738395916322305e-05, + "loss": 1.213, + "num_input_tokens_seen": 246396572, + "step": 6290 + }, + { + "epoch": 0.30526213780405076, + "grad_norm": 0.4042966067790985, + "learning_rate": 4.0707892385622695e-05, + "loss": 1.1444, + "num_input_tokens_seen": 246823552, + "step": 6300 + }, + { + "epoch": 0.30574668087993023, + "grad_norm": 0.3558143079280853, + "learning_rate": 4.0677350166929805e-05, + "loss": 1.2313, + "num_input_tokens_seen": 247219524, + "step": 6310 + }, + { + "epoch": 0.30623122395580965, + "grad_norm": 0.4053792655467987, + "learning_rate": 4.064676933546807e-05, + "loss": 1.2071, + "num_input_tokens_seen": 247632420, + "step": 6320 + }, + { + "epoch": 0.3067157670316891, + "grad_norm": 0.37768182158470154, + "learning_rate": 4.061614996655701e-05, + "loss": 1.2165, + "num_input_tokens_seen": 248035440, + "step": 6330 + }, + { + "epoch": 0.30720031010756854, + "grad_norm": 0.38902053236961365, + "learning_rate": 4.0585492135611064e-05, + "loss": 1.2016, + "num_input_tokens_seen": 248418968, + "step": 6340 + }, + { + "epoch": 0.307684853183448, + "grad_norm": 0.4327777028083801, + "learning_rate": 4.055479591813941e-05, + "loss": 1.248, + "num_input_tokens_seen": 248804400, + "step": 6350 + }, + { + "epoch": 0.30816939625932743, + "grad_norm": 0.38908207416534424, + "learning_rate": 4.052406138974576e-05, + "loss": 1.2374, + "num_input_tokens_seen": 249179376, + "step": 6360 + }, + { + "epoch": 0.3086539393352069, + "grad_norm": 0.45117059350013733, + "learning_rate": 4.049328862612819e-05, + "loss": 1.1853, + "num_input_tokens_seen": 249591320, + "step": 6370 + }, + { + "epoch": 0.3091384824110863, + "grad_norm": 0.3981532156467438, + "learning_rate": 4.0462477703078944e-05, + "loss": 1.1586, + "num_input_tokens_seen": 249973744, + "step": 6380 + }, + { + "epoch": 0.3096230254869658, + "grad_norm": 0.39917775988578796, + "learning_rate": 4.043162869648427e-05, + "loss": 1.2109, + "num_input_tokens_seen": 250419824, + "step": 6390 + }, + { + "epoch": 0.3101075685628452, + "grad_norm": 0.45799973607063293, + "learning_rate": 4.0400741682324185e-05, + "loss": 1.2142, + "num_input_tokens_seen": 250837544, + "step": 6400 + }, + { + "epoch": 0.3105921116387247, + "grad_norm": 0.4319687783718109, + "learning_rate": 4.036981673667234e-05, + "loss": 1.2561, + "num_input_tokens_seen": 251224108, + "step": 6410 + }, + { + "epoch": 0.3110766547146041, + "grad_norm": 0.3861522376537323, + "learning_rate": 4.0338853935695795e-05, + "loss": 1.1854, + "num_input_tokens_seen": 251600636, + "step": 6420 + }, + { + "epoch": 0.3115611977904836, + "grad_norm": 0.4382316768169403, + "learning_rate": 4.0307853355654856e-05, + "loss": 1.1935, + "num_input_tokens_seen": 251988072, + "step": 6430 + }, + { + "epoch": 0.312045740866363, + "grad_norm": 0.41492384672164917, + "learning_rate": 4.027681507290288e-05, + "loss": 1.2185, + "num_input_tokens_seen": 252366656, + "step": 6440 + }, + { + "epoch": 0.31253028394224247, + "grad_norm": 0.4099133014678955, + "learning_rate": 4.0245739163886076e-05, + "loss": 1.2092, + "num_input_tokens_seen": 252784056, + "step": 6450 + }, + { + "epoch": 0.3130148270181219, + "grad_norm": 0.41487085819244385, + "learning_rate": 4.021462570514333e-05, + "loss": 1.1743, + "num_input_tokens_seen": 253186828, + "step": 6460 + }, + { + "epoch": 0.31349937009400136, + "grad_norm": 0.370210736989975, + "learning_rate": 4.0183474773305995e-05, + "loss": 1.2137, + "num_input_tokens_seen": 253574680, + "step": 6470 + }, + { + "epoch": 0.3139839131698808, + "grad_norm": 0.3662930727005005, + "learning_rate": 4.015228644509776e-05, + "loss": 1.1403, + "num_input_tokens_seen": 253967160, + "step": 6480 + }, + { + "epoch": 0.31446845624576025, + "grad_norm": 0.41104060411453247, + "learning_rate": 4.0121060797334375e-05, + "loss": 1.2502, + "num_input_tokens_seen": 254357996, + "step": 6490 + }, + { + "epoch": 0.31495299932163967, + "grad_norm": 0.3852168023586273, + "learning_rate": 4.008979790692353e-05, + "loss": 1.1896, + "num_input_tokens_seen": 254758512, + "step": 6500 + }, + { + "epoch": 0.31543754239751914, + "grad_norm": 0.40157365798950195, + "learning_rate": 4.005849785086464e-05, + "loss": 1.2011, + "num_input_tokens_seen": 255148092, + "step": 6510 + }, + { + "epoch": 0.31592208547339856, + "grad_norm": 0.41574087738990784, + "learning_rate": 4.002716070624866e-05, + "loss": 1.2243, + "num_input_tokens_seen": 255552832, + "step": 6520 + }, + { + "epoch": 0.31640662854927804, + "grad_norm": 0.3938582241535187, + "learning_rate": 3.999578655025787e-05, + "loss": 1.1956, + "num_input_tokens_seen": 255953292, + "step": 6530 + }, + { + "epoch": 0.31689117162515745, + "grad_norm": 0.39169254899024963, + "learning_rate": 3.996437546016575e-05, + "loss": 1.2158, + "num_input_tokens_seen": 256352132, + "step": 6540 + }, + { + "epoch": 0.3173757147010369, + "grad_norm": 0.38570165634155273, + "learning_rate": 3.993292751333671e-05, + "loss": 1.2131, + "num_input_tokens_seen": 256744884, + "step": 6550 + }, + { + "epoch": 0.31786025777691634, + "grad_norm": 0.38711732625961304, + "learning_rate": 3.9901442787225955e-05, + "loss": 1.1811, + "num_input_tokens_seen": 257140072, + "step": 6560 + }, + { + "epoch": 0.3183448008527958, + "grad_norm": 0.4001631438732147, + "learning_rate": 3.986992135937927e-05, + "loss": 1.2469, + "num_input_tokens_seen": 257526996, + "step": 6570 + }, + { + "epoch": 0.31882934392867524, + "grad_norm": 0.4085213243961334, + "learning_rate": 3.9838363307432844e-05, + "loss": 1.173, + "num_input_tokens_seen": 257934640, + "step": 6580 + }, + { + "epoch": 0.3193138870045547, + "grad_norm": 0.3954554498195648, + "learning_rate": 3.980676870911305e-05, + "loss": 1.1913, + "num_input_tokens_seen": 258291948, + "step": 6590 + }, + { + "epoch": 0.3197984300804341, + "grad_norm": 0.3921205401420593, + "learning_rate": 3.977513764223629e-05, + "loss": 1.1919, + "num_input_tokens_seen": 258703912, + "step": 6600 + }, + { + "epoch": 0.3202829731563136, + "grad_norm": 0.39368802309036255, + "learning_rate": 3.974347018470879e-05, + "loss": 1.1346, + "num_input_tokens_seen": 259074912, + "step": 6610 + }, + { + "epoch": 0.320767516232193, + "grad_norm": 0.4187362790107727, + "learning_rate": 3.9711766414526386e-05, + "loss": 1.2207, + "num_input_tokens_seen": 259450412, + "step": 6620 + }, + { + "epoch": 0.3212520593080725, + "grad_norm": 0.4282675087451935, + "learning_rate": 3.9680026409774385e-05, + "loss": 1.1791, + "num_input_tokens_seen": 259839588, + "step": 6630 + }, + { + "epoch": 0.3217366023839519, + "grad_norm": 0.41149836778640747, + "learning_rate": 3.9648250248627285e-05, + "loss": 1.1941, + "num_input_tokens_seen": 260210136, + "step": 6640 + }, + { + "epoch": 0.3222211454598314, + "grad_norm": 0.40860268473625183, + "learning_rate": 3.961643800934869e-05, + "loss": 1.1643, + "num_input_tokens_seen": 260596296, + "step": 6650 + }, + { + "epoch": 0.3227056885357108, + "grad_norm": 0.40710315108299255, + "learning_rate": 3.958458977029103e-05, + "loss": 1.1714, + "num_input_tokens_seen": 260991860, + "step": 6660 + }, + { + "epoch": 0.3231902316115903, + "grad_norm": 0.36702919006347656, + "learning_rate": 3.955270560989542e-05, + "loss": 1.2253, + "num_input_tokens_seen": 261370536, + "step": 6670 + }, + { + "epoch": 0.3236747746874697, + "grad_norm": 0.36147916316986084, + "learning_rate": 3.952078560669142e-05, + "loss": 1.1437, + "num_input_tokens_seen": 261745012, + "step": 6680 + }, + { + "epoch": 0.32415931776334916, + "grad_norm": 0.38143739104270935, + "learning_rate": 3.9488829839296914e-05, + "loss": 1.2353, + "num_input_tokens_seen": 262122240, + "step": 6690 + }, + { + "epoch": 0.3246438608392286, + "grad_norm": 0.4425183832645416, + "learning_rate": 3.9456838386417835e-05, + "loss": 1.1905, + "num_input_tokens_seen": 262497788, + "step": 6700 + }, + { + "epoch": 0.32512840391510806, + "grad_norm": 0.4275406301021576, + "learning_rate": 3.9424811326848024e-05, + "loss": 1.2369, + "num_input_tokens_seen": 262888292, + "step": 6710 + }, + { + "epoch": 0.3256129469909875, + "grad_norm": 0.4195478558540344, + "learning_rate": 3.939274873946901e-05, + "loss": 1.2022, + "num_input_tokens_seen": 263277288, + "step": 6720 + }, + { + "epoch": 0.32609749006686695, + "grad_norm": 0.38973620533943176, + "learning_rate": 3.936065070324984e-05, + "loss": 1.1681, + "num_input_tokens_seen": 263661148, + "step": 6730 + }, + { + "epoch": 0.32658203314274636, + "grad_norm": 0.40185558795928955, + "learning_rate": 3.932851729724685e-05, + "loss": 1.2155, + "num_input_tokens_seen": 264039708, + "step": 6740 + }, + { + "epoch": 0.32706657621862584, + "grad_norm": 0.3969719409942627, + "learning_rate": 3.929634860060351e-05, + "loss": 1.2022, + "num_input_tokens_seen": 264431972, + "step": 6750 + }, + { + "epoch": 0.32755111929450526, + "grad_norm": 0.4183984696865082, + "learning_rate": 3.92641446925502e-05, + "loss": 1.2105, + "num_input_tokens_seen": 264799512, + "step": 6760 + }, + { + "epoch": 0.32803566237038473, + "grad_norm": 0.42662981152534485, + "learning_rate": 3.9231905652404034e-05, + "loss": 1.218, + "num_input_tokens_seen": 265163968, + "step": 6770 + }, + { + "epoch": 0.32852020544626415, + "grad_norm": 0.4058518707752228, + "learning_rate": 3.919963155956864e-05, + "loss": 1.1482, + "num_input_tokens_seen": 265559800, + "step": 6780 + }, + { + "epoch": 0.3290047485221436, + "grad_norm": 0.3763887584209442, + "learning_rate": 3.916732249353399e-05, + "loss": 1.1912, + "num_input_tokens_seen": 265950740, + "step": 6790 + }, + { + "epoch": 0.32948929159802304, + "grad_norm": 0.36359450221061707, + "learning_rate": 3.91349785338762e-05, + "loss": 1.178, + "num_input_tokens_seen": 266339700, + "step": 6800 + }, + { + "epoch": 0.3299738346739025, + "grad_norm": 0.3989261984825134, + "learning_rate": 3.91025997602573e-05, + "loss": 1.23, + "num_input_tokens_seen": 266740236, + "step": 6810 + }, + { + "epoch": 0.33045837774978193, + "grad_norm": 0.37006011605262756, + "learning_rate": 3.90701862524251e-05, + "loss": 1.1739, + "num_input_tokens_seen": 267116000, + "step": 6820 + }, + { + "epoch": 0.3309429208256614, + "grad_norm": 0.3907497525215149, + "learning_rate": 3.903773809021294e-05, + "loss": 1.1801, + "num_input_tokens_seen": 267526384, + "step": 6830 + }, + { + "epoch": 0.3314274639015408, + "grad_norm": 0.3850066363811493, + "learning_rate": 3.900525535353952e-05, + "loss": 1.2116, + "num_input_tokens_seen": 267893528, + "step": 6840 + }, + { + "epoch": 0.3319120069774203, + "grad_norm": 0.4400016665458679, + "learning_rate": 3.897273812240868e-05, + "loss": 1.1532, + "num_input_tokens_seen": 268282380, + "step": 6850 + }, + { + "epoch": 0.3323965500532997, + "grad_norm": 0.3924698233604431, + "learning_rate": 3.894018647690924e-05, + "loss": 1.2103, + "num_input_tokens_seen": 268696688, + "step": 6860 + }, + { + "epoch": 0.3328810931291792, + "grad_norm": 0.3710118234157562, + "learning_rate": 3.890760049721477e-05, + "loss": 1.1848, + "num_input_tokens_seen": 269125764, + "step": 6870 + }, + { + "epoch": 0.33336563620505866, + "grad_norm": 0.37334132194519043, + "learning_rate": 3.887498026358341e-05, + "loss": 1.2415, + "num_input_tokens_seen": 269507904, + "step": 6880 + }, + { + "epoch": 0.3338501792809381, + "grad_norm": 0.46161389350891113, + "learning_rate": 3.8842325856357656e-05, + "loss": 1.2383, + "num_input_tokens_seen": 269890536, + "step": 6890 + }, + { + "epoch": 0.33433472235681755, + "grad_norm": 0.43443042039871216, + "learning_rate": 3.8809637355964176e-05, + "loss": 1.1867, + "num_input_tokens_seen": 270291048, + "step": 6900 + }, + { + "epoch": 0.33481926543269697, + "grad_norm": 0.39960533380508423, + "learning_rate": 3.8776914842913626e-05, + "loss": 1.2211, + "num_input_tokens_seen": 270695776, + "step": 6910 + }, + { + "epoch": 0.33530380850857644, + "grad_norm": 0.4040696322917938, + "learning_rate": 3.8744158397800404e-05, + "loss": 1.2013, + "num_input_tokens_seen": 271104472, + "step": 6920 + }, + { + "epoch": 0.33578835158445586, + "grad_norm": 0.4039623737335205, + "learning_rate": 3.87113681013025e-05, + "loss": 1.1545, + "num_input_tokens_seen": 271507036, + "step": 6930 + }, + { + "epoch": 0.33627289466033533, + "grad_norm": 0.40450507402420044, + "learning_rate": 3.867854403418128e-05, + "loss": 1.1504, + "num_input_tokens_seen": 271911112, + "step": 6940 + }, + { + "epoch": 0.33675743773621475, + "grad_norm": 0.4363791346549988, + "learning_rate": 3.864568627728128e-05, + "loss": 1.1979, + "num_input_tokens_seen": 272282304, + "step": 6950 + }, + { + "epoch": 0.3372419808120942, + "grad_norm": 0.38292601704597473, + "learning_rate": 3.861279491153e-05, + "loss": 1.23, + "num_input_tokens_seen": 272673128, + "step": 6960 + }, + { + "epoch": 0.33772652388797364, + "grad_norm": 0.39683130383491516, + "learning_rate": 3.857987001793775e-05, + "loss": 1.1738, + "num_input_tokens_seen": 273079120, + "step": 6970 + }, + { + "epoch": 0.3382110669638531, + "grad_norm": 0.4041154682636261, + "learning_rate": 3.8546911677597395e-05, + "loss": 1.1948, + "num_input_tokens_seen": 273482888, + "step": 6980 + }, + { + "epoch": 0.33869561003973253, + "grad_norm": 0.3793328106403351, + "learning_rate": 3.8513919971684175e-05, + "loss": 1.1841, + "num_input_tokens_seen": 273889924, + "step": 6990 + }, + { + "epoch": 0.339180153115612, + "grad_norm": 0.3985520005226135, + "learning_rate": 3.848089498145552e-05, + "loss": 1.1821, + "num_input_tokens_seen": 274289204, + "step": 7000 + }, + { + "epoch": 0.3396646961914914, + "grad_norm": 0.42367762327194214, + "learning_rate": 3.844783678825083e-05, + "loss": 1.2186, + "num_input_tokens_seen": 274669384, + "step": 7010 + }, + { + "epoch": 0.3401492392673709, + "grad_norm": 0.38627681136131287, + "learning_rate": 3.8414745473491295e-05, + "loss": 1.1691, + "num_input_tokens_seen": 275045420, + "step": 7020 + }, + { + "epoch": 0.3406337823432503, + "grad_norm": 0.40813496708869934, + "learning_rate": 3.838162111867967e-05, + "loss": 1.1912, + "num_input_tokens_seen": 275452320, + "step": 7030 + }, + { + "epoch": 0.3411183254191298, + "grad_norm": 0.3678280711174011, + "learning_rate": 3.834846380540009e-05, + "loss": 1.2274, + "num_input_tokens_seen": 275844788, + "step": 7040 + }, + { + "epoch": 0.3416028684950092, + "grad_norm": 0.3878428637981415, + "learning_rate": 3.831527361531786e-05, + "loss": 1.2127, + "num_input_tokens_seen": 276211512, + "step": 7050 + }, + { + "epoch": 0.3420874115708887, + "grad_norm": 0.4239497482776642, + "learning_rate": 3.828205063017927e-05, + "loss": 1.2089, + "num_input_tokens_seen": 276596564, + "step": 7060 + }, + { + "epoch": 0.3425719546467681, + "grad_norm": 0.3910852074623108, + "learning_rate": 3.824879493181138e-05, + "loss": 1.153, + "num_input_tokens_seen": 276989732, + "step": 7070 + }, + { + "epoch": 0.34305649772264757, + "grad_norm": 0.5242650508880615, + "learning_rate": 3.821550660212182e-05, + "loss": 1.1671, + "num_input_tokens_seen": 277396804, + "step": 7080 + }, + { + "epoch": 0.343541040798527, + "grad_norm": 0.3888712525367737, + "learning_rate": 3.8182185723098584e-05, + "loss": 1.1751, + "num_input_tokens_seen": 277776744, + "step": 7090 + }, + { + "epoch": 0.34402558387440646, + "grad_norm": 0.40083321928977966, + "learning_rate": 3.814883237680984e-05, + "loss": 1.262, + "num_input_tokens_seen": 278174104, + "step": 7100 + }, + { + "epoch": 0.3445101269502859, + "grad_norm": 0.4279809594154358, + "learning_rate": 3.811544664540373e-05, + "loss": 1.1803, + "num_input_tokens_seen": 278545544, + "step": 7110 + }, + { + "epoch": 0.34499467002616535, + "grad_norm": 0.39681699872016907, + "learning_rate": 3.8082028611108144e-05, + "loss": 1.1754, + "num_input_tokens_seen": 278922000, + "step": 7120 + }, + { + "epoch": 0.34547921310204477, + "grad_norm": 0.4334243834018707, + "learning_rate": 3.804857835623054e-05, + "loss": 1.1708, + "num_input_tokens_seen": 279321312, + "step": 7130 + }, + { + "epoch": 0.34596375617792424, + "grad_norm": 0.3777294158935547, + "learning_rate": 3.801509596315773e-05, + "loss": 1.1567, + "num_input_tokens_seen": 279699076, + "step": 7140 + }, + { + "epoch": 0.34644829925380366, + "grad_norm": 0.4178134500980377, + "learning_rate": 3.798158151435569e-05, + "loss": 1.1974, + "num_input_tokens_seen": 280103592, + "step": 7150 + }, + { + "epoch": 0.34693284232968313, + "grad_norm": 0.3830433189868927, + "learning_rate": 3.794803509236935e-05, + "loss": 1.1979, + "num_input_tokens_seen": 280490292, + "step": 7160 + }, + { + "epoch": 0.34741738540556255, + "grad_norm": 0.4182297885417938, + "learning_rate": 3.791445677982237e-05, + "loss": 1.1493, + "num_input_tokens_seen": 280873500, + "step": 7170 + }, + { + "epoch": 0.347901928481442, + "grad_norm": 0.3943808674812317, + "learning_rate": 3.788084665941697e-05, + "loss": 1.2254, + "num_input_tokens_seen": 281272964, + "step": 7180 + }, + { + "epoch": 0.34838647155732144, + "grad_norm": 0.3648427128791809, + "learning_rate": 3.7847204813933715e-05, + "loss": 1.1815, + "num_input_tokens_seen": 281658580, + "step": 7190 + }, + { + "epoch": 0.3488710146332009, + "grad_norm": 0.4216112494468689, + "learning_rate": 3.78135313262313e-05, + "loss": 1.2728, + "num_input_tokens_seen": 282047632, + "step": 7200 + }, + { + "epoch": 0.34935555770908033, + "grad_norm": 0.4069361686706543, + "learning_rate": 3.7779826279246344e-05, + "loss": 1.2193, + "num_input_tokens_seen": 282442860, + "step": 7210 + }, + { + "epoch": 0.3498401007849598, + "grad_norm": 0.3994755148887634, + "learning_rate": 3.774608975599323e-05, + "loss": 1.1925, + "num_input_tokens_seen": 282829096, + "step": 7220 + }, + { + "epoch": 0.3503246438608392, + "grad_norm": 0.40187373757362366, + "learning_rate": 3.7712321839563816e-05, + "loss": 1.1644, + "num_input_tokens_seen": 283214644, + "step": 7230 + }, + { + "epoch": 0.3508091869367187, + "grad_norm": 0.40469735860824585, + "learning_rate": 3.7678522613127315e-05, + "loss": 1.1836, + "num_input_tokens_seen": 283597500, + "step": 7240 + }, + { + "epoch": 0.3512937300125981, + "grad_norm": 0.3870331943035126, + "learning_rate": 3.764469215993006e-05, + "loss": 1.1341, + "num_input_tokens_seen": 283969576, + "step": 7250 + }, + { + "epoch": 0.3517782730884776, + "grad_norm": 0.38946250081062317, + "learning_rate": 3.761083056329527e-05, + "loss": 1.2089, + "num_input_tokens_seen": 284368172, + "step": 7260 + }, + { + "epoch": 0.352262816164357, + "grad_norm": 0.41155683994293213, + "learning_rate": 3.757693790662289e-05, + "loss": 1.2169, + "num_input_tokens_seen": 284753172, + "step": 7270 + }, + { + "epoch": 0.3527473592402365, + "grad_norm": 0.3682643175125122, + "learning_rate": 3.754301427338935e-05, + "loss": 1.2065, + "num_input_tokens_seen": 285154396, + "step": 7280 + }, + { + "epoch": 0.3532319023161159, + "grad_norm": 0.384960800409317, + "learning_rate": 3.750905974714739e-05, + "loss": 1.194, + "num_input_tokens_seen": 285556552, + "step": 7290 + }, + { + "epoch": 0.35371644539199537, + "grad_norm": 0.39383670687675476, + "learning_rate": 3.747507441152581e-05, + "loss": 1.1667, + "num_input_tokens_seen": 285937684, + "step": 7300 + }, + { + "epoch": 0.3542009884678748, + "grad_norm": 0.4078581631183624, + "learning_rate": 3.744105835022932e-05, + "loss": 1.168, + "num_input_tokens_seen": 286339704, + "step": 7310 + }, + { + "epoch": 0.35468553154375426, + "grad_norm": 0.4079924523830414, + "learning_rate": 3.740701164703831e-05, + "loss": 1.2325, + "num_input_tokens_seen": 286724008, + "step": 7320 + }, + { + "epoch": 0.3551700746196337, + "grad_norm": 0.40963229537010193, + "learning_rate": 3.737293438580861e-05, + "loss": 1.217, + "num_input_tokens_seen": 287109268, + "step": 7330 + }, + { + "epoch": 0.35565461769551315, + "grad_norm": 0.40729522705078125, + "learning_rate": 3.7338826650471335e-05, + "loss": 1.1671, + "num_input_tokens_seen": 287487492, + "step": 7340 + }, + { + "epoch": 0.35613916077139257, + "grad_norm": 0.4351021647453308, + "learning_rate": 3.730468852503265e-05, + "loss": 1.1748, + "num_input_tokens_seen": 287878812, + "step": 7350 + }, + { + "epoch": 0.35662370384727204, + "grad_norm": 0.3914582431316376, + "learning_rate": 3.727052009357358e-05, + "loss": 1.1588, + "num_input_tokens_seen": 288271328, + "step": 7360 + }, + { + "epoch": 0.35710824692315146, + "grad_norm": 0.4153572618961334, + "learning_rate": 3.723632144024979e-05, + "loss": 1.2123, + "num_input_tokens_seen": 288659084, + "step": 7370 + }, + { + "epoch": 0.35759278999903094, + "grad_norm": 0.38702714443206787, + "learning_rate": 3.7202092649291356e-05, + "loss": 1.1788, + "num_input_tokens_seen": 289060932, + "step": 7380 + }, + { + "epoch": 0.35807733307491035, + "grad_norm": 0.40125608444213867, + "learning_rate": 3.7167833805002614e-05, + "loss": 1.1521, + "num_input_tokens_seen": 289437452, + "step": 7390 + }, + { + "epoch": 0.3585618761507898, + "grad_norm": 0.43882331252098083, + "learning_rate": 3.7133544991761896e-05, + "loss": 1.1843, + "num_input_tokens_seen": 289845972, + "step": 7400 + }, + { + "epoch": 0.35904641922666924, + "grad_norm": 0.37727952003479004, + "learning_rate": 3.7099226294021375e-05, + "loss": 1.2051, + "num_input_tokens_seen": 290209868, + "step": 7410 + }, + { + "epoch": 0.3595309623025487, + "grad_norm": 0.3910406231880188, + "learning_rate": 3.706487779630679e-05, + "loss": 1.1358, + "num_input_tokens_seen": 290641272, + "step": 7420 + }, + { + "epoch": 0.36001550537842814, + "grad_norm": 0.3660062551498413, + "learning_rate": 3.703049958321733e-05, + "loss": 1.1969, + "num_input_tokens_seen": 291057612, + "step": 7430 + }, + { + "epoch": 0.3605000484543076, + "grad_norm": 0.3746223449707031, + "learning_rate": 3.69960917394253e-05, + "loss": 1.1809, + "num_input_tokens_seen": 291459584, + "step": 7440 + }, + { + "epoch": 0.360984591530187, + "grad_norm": 0.3975430428981781, + "learning_rate": 3.696165434967605e-05, + "loss": 1.2374, + "num_input_tokens_seen": 291868768, + "step": 7450 + }, + { + "epoch": 0.3614691346060665, + "grad_norm": 0.40742921829223633, + "learning_rate": 3.692718749878767e-05, + "loss": 1.2081, + "num_input_tokens_seen": 292249240, + "step": 7460 + }, + { + "epoch": 0.3619536776819459, + "grad_norm": 0.44806450605392456, + "learning_rate": 3.6892691271650814e-05, + "loss": 1.2251, + "num_input_tokens_seen": 292658816, + "step": 7470 + }, + { + "epoch": 0.3624382207578254, + "grad_norm": 0.37986406683921814, + "learning_rate": 3.68581657532285e-05, + "loss": 1.1863, + "num_input_tokens_seen": 293031712, + "step": 7480 + }, + { + "epoch": 0.3629227638337048, + "grad_norm": 0.4018678069114685, + "learning_rate": 3.682361102855586e-05, + "loss": 1.1884, + "num_input_tokens_seen": 293444948, + "step": 7490 + }, + { + "epoch": 0.3634073069095843, + "grad_norm": 0.3642045855522156, + "learning_rate": 3.6789027182739996e-05, + "loss": 1.1818, + "num_input_tokens_seen": 293852108, + "step": 7500 + }, + { + "epoch": 0.3638918499854637, + "grad_norm": 0.3775769770145416, + "learning_rate": 3.675441430095972e-05, + "loss": 1.2457, + "num_input_tokens_seen": 294219892, + "step": 7510 + }, + { + "epoch": 0.3643763930613432, + "grad_norm": 0.37454187870025635, + "learning_rate": 3.6719772468465345e-05, + "loss": 1.1447, + "num_input_tokens_seen": 294604332, + "step": 7520 + }, + { + "epoch": 0.3648609361372226, + "grad_norm": 0.38920724391937256, + "learning_rate": 3.6685101770578515e-05, + "loss": 1.2058, + "num_input_tokens_seen": 295015584, + "step": 7530 + }, + { + "epoch": 0.36534547921310206, + "grad_norm": 0.4151008725166321, + "learning_rate": 3.6650402292691946e-05, + "loss": 1.1891, + "num_input_tokens_seen": 295420200, + "step": 7540 + }, + { + "epoch": 0.3658300222889815, + "grad_norm": 0.4080232083797455, + "learning_rate": 3.661567412026925e-05, + "loss": 1.2027, + "num_input_tokens_seen": 295806112, + "step": 7550 + }, + { + "epoch": 0.36631456536486096, + "grad_norm": 0.43082767724990845, + "learning_rate": 3.658091733884471e-05, + "loss": 1.1592, + "num_input_tokens_seen": 296215744, + "step": 7560 + }, + { + "epoch": 0.3667991084407404, + "grad_norm": 0.3868250250816345, + "learning_rate": 3.654613203402307e-05, + "loss": 1.1657, + "num_input_tokens_seen": 296611788, + "step": 7570 + }, + { + "epoch": 0.36728365151661985, + "grad_norm": 0.3819040358066559, + "learning_rate": 3.6511318291479324e-05, + "loss": 1.1796, + "num_input_tokens_seen": 297006960, + "step": 7580 + }, + { + "epoch": 0.36776819459249926, + "grad_norm": 0.4080538749694824, + "learning_rate": 3.6476476196958514e-05, + "loss": 1.2078, + "num_input_tokens_seen": 297411416, + "step": 7590 + }, + { + "epoch": 0.36825273766837874, + "grad_norm": 0.40407416224479675, + "learning_rate": 3.644160583627551e-05, + "loss": 1.1925, + "num_input_tokens_seen": 297793232, + "step": 7600 + }, + { + "epoch": 0.36873728074425816, + "grad_norm": 0.37448978424072266, + "learning_rate": 3.6406707295314795e-05, + "loss": 1.1922, + "num_input_tokens_seen": 298204896, + "step": 7610 + }, + { + "epoch": 0.36922182382013763, + "grad_norm": 0.37111660838127136, + "learning_rate": 3.6371780660030266e-05, + "loss": 1.148, + "num_input_tokens_seen": 298561544, + "step": 7620 + }, + { + "epoch": 0.36970636689601705, + "grad_norm": 0.3740837574005127, + "learning_rate": 3.633682601644501e-05, + "loss": 1.2045, + "num_input_tokens_seen": 298944532, + "step": 7630 + }, + { + "epoch": 0.3701909099718965, + "grad_norm": 0.39125657081604004, + "learning_rate": 3.6301843450651096e-05, + "loss": 1.1708, + "num_input_tokens_seen": 299346324, + "step": 7640 + }, + { + "epoch": 0.37067545304777594, + "grad_norm": 0.3972494304180145, + "learning_rate": 3.6266833048809386e-05, + "loss": 1.1542, + "num_input_tokens_seen": 299741248, + "step": 7650 + }, + { + "epoch": 0.3711599961236554, + "grad_norm": 0.3871600329875946, + "learning_rate": 3.623179489714926e-05, + "loss": 1.1652, + "num_input_tokens_seen": 300143696, + "step": 7660 + }, + { + "epoch": 0.37164453919953483, + "grad_norm": 0.3876330852508545, + "learning_rate": 3.619672908196849e-05, + "loss": 1.1383, + "num_input_tokens_seen": 300512992, + "step": 7670 + }, + { + "epoch": 0.3721290822754143, + "grad_norm": 0.4013025760650635, + "learning_rate": 3.616163568963295e-05, + "loss": 1.1484, + "num_input_tokens_seen": 300904468, + "step": 7680 + }, + { + "epoch": 0.3726136253512937, + "grad_norm": 0.36160480976104736, + "learning_rate": 3.6126514806576437e-05, + "loss": 1.205, + "num_input_tokens_seen": 301325360, + "step": 7690 + }, + { + "epoch": 0.3730981684271732, + "grad_norm": 0.35836082696914673, + "learning_rate": 3.6091366519300476e-05, + "loss": 1.1436, + "num_input_tokens_seen": 301732708, + "step": 7700 + }, + { + "epoch": 0.3735827115030526, + "grad_norm": 0.43142881989479065, + "learning_rate": 3.605619091437408e-05, + "loss": 1.193, + "num_input_tokens_seen": 302120864, + "step": 7710 + }, + { + "epoch": 0.3740672545789321, + "grad_norm": 0.41082125902175903, + "learning_rate": 3.6020988078433537e-05, + "loss": 1.1767, + "num_input_tokens_seen": 302492456, + "step": 7720 + }, + { + "epoch": 0.3745517976548115, + "grad_norm": 0.393816739320755, + "learning_rate": 3.598575809818221e-05, + "loss": 1.2283, + "num_input_tokens_seen": 302846348, + "step": 7730 + }, + { + "epoch": 0.375036340730691, + "grad_norm": 0.4002731144428253, + "learning_rate": 3.5950501060390315e-05, + "loss": 1.1765, + "num_input_tokens_seen": 303200188, + "step": 7740 + }, + { + "epoch": 0.3755208838065704, + "grad_norm": 0.41621705889701843, + "learning_rate": 3.5915217051894725e-05, + "loss": 1.1259, + "num_input_tokens_seen": 303616612, + "step": 7750 + }, + { + "epoch": 0.37600542688244987, + "grad_norm": 0.352398544549942, + "learning_rate": 3.587990615959871e-05, + "loss": 1.1968, + "num_input_tokens_seen": 304005224, + "step": 7760 + }, + { + "epoch": 0.3764899699583293, + "grad_norm": 0.36988088488578796, + "learning_rate": 3.584456847047177e-05, + "loss": 1.1962, + "num_input_tokens_seen": 304405536, + "step": 7770 + }, + { + "epoch": 0.37697451303420876, + "grad_norm": 0.377210795879364, + "learning_rate": 3.5809204071549415e-05, + "loss": 1.2039, + "num_input_tokens_seen": 304791388, + "step": 7780 + }, + { + "epoch": 0.3774590561100882, + "grad_norm": 0.35832905769348145, + "learning_rate": 3.577381304993294e-05, + "loss": 1.1936, + "num_input_tokens_seen": 305184000, + "step": 7790 + }, + { + "epoch": 0.37794359918596765, + "grad_norm": 0.3834468424320221, + "learning_rate": 3.5738395492789176e-05, + "loss": 1.2239, + "num_input_tokens_seen": 305552272, + "step": 7800 + }, + { + "epoch": 0.37842814226184707, + "grad_norm": 0.3998955488204956, + "learning_rate": 3.570295148735036e-05, + "loss": 1.1768, + "num_input_tokens_seen": 305942616, + "step": 7810 + }, + { + "epoch": 0.37891268533772654, + "grad_norm": 0.382485955953598, + "learning_rate": 3.5667481120913836e-05, + "loss": 1.1988, + "num_input_tokens_seen": 306364336, + "step": 7820 + }, + { + "epoch": 0.37939722841360596, + "grad_norm": 0.43074601888656616, + "learning_rate": 3.5631984480841885e-05, + "loss": 1.2357, + "num_input_tokens_seen": 306775904, + "step": 7830 + }, + { + "epoch": 0.37988177148948543, + "grad_norm": 0.3977073132991791, + "learning_rate": 3.55964616545615e-05, + "loss": 1.1833, + "num_input_tokens_seen": 307121492, + "step": 7840 + }, + { + "epoch": 0.38036631456536485, + "grad_norm": 0.38828161358833313, + "learning_rate": 3.5560912729564155e-05, + "loss": 1.1875, + "num_input_tokens_seen": 307493488, + "step": 7850 + }, + { + "epoch": 0.3808508576412443, + "grad_norm": 0.404365211725235, + "learning_rate": 3.5525337793405636e-05, + "loss": 1.1878, + "num_input_tokens_seen": 307890584, + "step": 7860 + }, + { + "epoch": 0.38133540071712374, + "grad_norm": 0.42278143763542175, + "learning_rate": 3.548973693370576e-05, + "loss": 1.1444, + "num_input_tokens_seen": 308295900, + "step": 7870 + }, + { + "epoch": 0.3818199437930032, + "grad_norm": 0.38250425457954407, + "learning_rate": 3.545411023814823e-05, + "loss": 1.1498, + "num_input_tokens_seen": 308723400, + "step": 7880 + }, + { + "epoch": 0.38230448686888263, + "grad_norm": 0.4000505805015564, + "learning_rate": 3.541845779448034e-05, + "loss": 1.1917, + "num_input_tokens_seen": 309139432, + "step": 7890 + }, + { + "epoch": 0.3827890299447621, + "grad_norm": 0.4143424928188324, + "learning_rate": 3.5382779690512824e-05, + "loss": 1.2196, + "num_input_tokens_seen": 309543828, + "step": 7900 + }, + { + "epoch": 0.3832735730206415, + "grad_norm": 0.4070720672607422, + "learning_rate": 3.5347076014119606e-05, + "loss": 1.1941, + "num_input_tokens_seen": 309945560, + "step": 7910 + }, + { + "epoch": 0.383758116096521, + "grad_norm": 0.4233192801475525, + "learning_rate": 3.5311346853237614e-05, + "loss": 1.2011, + "num_input_tokens_seen": 310335412, + "step": 7920 + }, + { + "epoch": 0.3842426591724004, + "grad_norm": 0.4479762017726898, + "learning_rate": 3.527559229586653e-05, + "loss": 1.2178, + "num_input_tokens_seen": 310706360, + "step": 7930 + }, + { + "epoch": 0.3847272022482799, + "grad_norm": 0.41147053241729736, + "learning_rate": 3.523981243006857e-05, + "loss": 1.2376, + "num_input_tokens_seen": 311088520, + "step": 7940 + }, + { + "epoch": 0.3852117453241593, + "grad_norm": 0.38983359932899475, + "learning_rate": 3.5204007343968326e-05, + "loss": 1.2019, + "num_input_tokens_seen": 311467552, + "step": 7950 + }, + { + "epoch": 0.3856962884000388, + "grad_norm": 0.3905565142631531, + "learning_rate": 3.516817712575246e-05, + "loss": 1.1972, + "num_input_tokens_seen": 311857180, + "step": 7960 + }, + { + "epoch": 0.3861808314759182, + "grad_norm": 0.390117883682251, + "learning_rate": 3.513232186366956e-05, + "loss": 1.1901, + "num_input_tokens_seen": 312250400, + "step": 7970 + }, + { + "epoch": 0.38666537455179767, + "grad_norm": 0.3978836238384247, + "learning_rate": 3.50964416460299e-05, + "loss": 1.1729, + "num_input_tokens_seen": 312652712, + "step": 7980 + }, + { + "epoch": 0.3871499176276771, + "grad_norm": 0.4010826349258423, + "learning_rate": 3.5060536561205195e-05, + "loss": 1.1981, + "num_input_tokens_seen": 313069828, + "step": 7990 + }, + { + "epoch": 0.38763446070355656, + "grad_norm": 0.42469522356987, + "learning_rate": 3.5024606697628424e-05, + "loss": 1.2074, + "num_input_tokens_seen": 313452856, + "step": 8000 + }, + { + "epoch": 0.38763446070355656, + "eval_loss": 1.1777091026306152, + "eval_runtime": 5.3785, + "eval_samples_per_second": 27.889, + "eval_steps_per_second": 3.533, + "num_input_tokens_seen": 313452856, + "step": 8000 + }, + { + "epoch": 0.388119003779436, + "grad_norm": 0.3903387486934662, + "learning_rate": 3.49886521437936e-05, + "loss": 1.153, + "num_input_tokens_seen": 313830208, + "step": 8010 + }, + { + "epoch": 0.38860354685531545, + "grad_norm": 0.398037314414978, + "learning_rate": 3.495267298825555e-05, + "loss": 1.1707, + "num_input_tokens_seen": 314242840, + "step": 8020 + }, + { + "epoch": 0.38908808993119487, + "grad_norm": 0.3712460398674011, + "learning_rate": 3.4916669319629664e-05, + "loss": 1.161, + "num_input_tokens_seen": 314602680, + "step": 8030 + }, + { + "epoch": 0.38957263300707434, + "grad_norm": 0.4191719591617584, + "learning_rate": 3.4880641226591733e-05, + "loss": 1.1705, + "num_input_tokens_seen": 315002656, + "step": 8040 + }, + { + "epoch": 0.39005717608295376, + "grad_norm": 0.4063269793987274, + "learning_rate": 3.48445887978777e-05, + "loss": 1.1794, + "num_input_tokens_seen": 315420940, + "step": 8050 + }, + { + "epoch": 0.39054171915883323, + "grad_norm": 0.40375033020973206, + "learning_rate": 3.480851212228345e-05, + "loss": 1.1876, + "num_input_tokens_seen": 315794464, + "step": 8060 + }, + { + "epoch": 0.39102626223471265, + "grad_norm": 0.43218469619750977, + "learning_rate": 3.4772411288664576e-05, + "loss": 1.2107, + "num_input_tokens_seen": 316178848, + "step": 8070 + }, + { + "epoch": 0.3915108053105921, + "grad_norm": 0.3920303285121918, + "learning_rate": 3.473628638593618e-05, + "loss": 1.1755, + "num_input_tokens_seen": 316541328, + "step": 8080 + }, + { + "epoch": 0.39199534838647154, + "grad_norm": 0.405202716588974, + "learning_rate": 3.470013750307263e-05, + "loss": 1.1929, + "num_input_tokens_seen": 316916528, + "step": 8090 + }, + { + "epoch": 0.392479891462351, + "grad_norm": 0.39330440759658813, + "learning_rate": 3.466396472910739e-05, + "loss": 1.1855, + "num_input_tokens_seen": 317357952, + "step": 8100 + }, + { + "epoch": 0.39296443453823043, + "grad_norm": 0.3834272623062134, + "learning_rate": 3.462776815313274e-05, + "loss": 1.1863, + "num_input_tokens_seen": 317785332, + "step": 8110 + }, + { + "epoch": 0.3934489776141099, + "grad_norm": 0.40610063076019287, + "learning_rate": 3.4591547864299576e-05, + "loss": 1.1825, + "num_input_tokens_seen": 318172140, + "step": 8120 + }, + { + "epoch": 0.3939335206899893, + "grad_norm": 0.41190090775489807, + "learning_rate": 3.455530395181722e-05, + "loss": 1.1949, + "num_input_tokens_seen": 318556536, + "step": 8130 + }, + { + "epoch": 0.3944180637658688, + "grad_norm": 0.3720203936100006, + "learning_rate": 3.451903650495317e-05, + "loss": 1.1492, + "num_input_tokens_seen": 318939604, + "step": 8140 + }, + { + "epoch": 0.3949026068417482, + "grad_norm": 0.39604607224464417, + "learning_rate": 3.448274561303288e-05, + "loss": 1.2164, + "num_input_tokens_seen": 319344624, + "step": 8150 + }, + { + "epoch": 0.3953871499176277, + "grad_norm": 0.40863263607025146, + "learning_rate": 3.444643136543957e-05, + "loss": 1.1766, + "num_input_tokens_seen": 319738832, + "step": 8160 + }, + { + "epoch": 0.3958716929935071, + "grad_norm": 0.4493870437145233, + "learning_rate": 3.4410093851613964e-05, + "loss": 1.2016, + "num_input_tokens_seen": 320145484, + "step": 8170 + }, + { + "epoch": 0.3963562360693866, + "grad_norm": 0.3747680187225342, + "learning_rate": 3.4373733161054095e-05, + "loss": 1.1803, + "num_input_tokens_seen": 320561880, + "step": 8180 + }, + { + "epoch": 0.396840779145266, + "grad_norm": 0.39447423815727234, + "learning_rate": 3.433734938331508e-05, + "loss": 1.1853, + "num_input_tokens_seen": 320944848, + "step": 8190 + }, + { + "epoch": 0.39732532222114547, + "grad_norm": 0.4037897288799286, + "learning_rate": 3.43009426080089e-05, + "loss": 1.1759, + "num_input_tokens_seen": 321319020, + "step": 8200 + }, + { + "epoch": 0.3978098652970249, + "grad_norm": 0.3832351565361023, + "learning_rate": 3.4264512924804175e-05, + "loss": 1.1788, + "num_input_tokens_seen": 321694824, + "step": 8210 + }, + { + "epoch": 0.39829440837290436, + "grad_norm": 0.42781588435173035, + "learning_rate": 3.422806042342596e-05, + "loss": 1.1784, + "num_input_tokens_seen": 322077948, + "step": 8220 + }, + { + "epoch": 0.3987789514487838, + "grad_norm": 0.3955729305744171, + "learning_rate": 3.419158519365548e-05, + "loss": 1.234, + "num_input_tokens_seen": 322505044, + "step": 8230 + }, + { + "epoch": 0.39926349452466325, + "grad_norm": 0.419911652803421, + "learning_rate": 3.4155087325329985e-05, + "loss": 1.1763, + "num_input_tokens_seen": 322911104, + "step": 8240 + }, + { + "epoch": 0.39974803760054267, + "grad_norm": 0.4269978702068329, + "learning_rate": 3.411856690834243e-05, + "loss": 1.17, + "num_input_tokens_seen": 323320996, + "step": 8250 + }, + { + "epoch": 0.40023258067642214, + "grad_norm": 0.36580172181129456, + "learning_rate": 3.408202403264135e-05, + "loss": 1.1565, + "num_input_tokens_seen": 323709072, + "step": 8260 + }, + { + "epoch": 0.40071712375230156, + "grad_norm": 0.3848220705986023, + "learning_rate": 3.404545878823058e-05, + "loss": 1.1959, + "num_input_tokens_seen": 324080944, + "step": 8270 + }, + { + "epoch": 0.40120166682818104, + "grad_norm": 0.4162658452987671, + "learning_rate": 3.400887126516904e-05, + "loss": 1.1497, + "num_input_tokens_seen": 324485356, + "step": 8280 + }, + { + "epoch": 0.40168620990406045, + "grad_norm": 0.40066495537757874, + "learning_rate": 3.3972261553570536e-05, + "loss": 1.1657, + "num_input_tokens_seen": 324871712, + "step": 8290 + }, + { + "epoch": 0.4021707529799399, + "grad_norm": 0.3958793878555298, + "learning_rate": 3.3935629743603496e-05, + "loss": 1.1941, + "num_input_tokens_seen": 325257684, + "step": 8300 + }, + { + "epoch": 0.40265529605581935, + "grad_norm": 0.408408522605896, + "learning_rate": 3.389897592549082e-05, + "loss": 1.1954, + "num_input_tokens_seen": 325638416, + "step": 8310 + }, + { + "epoch": 0.4031398391316988, + "grad_norm": 0.36070606112480164, + "learning_rate": 3.3862300189509564e-05, + "loss": 1.2009, + "num_input_tokens_seen": 326018508, + "step": 8320 + }, + { + "epoch": 0.40362438220757824, + "grad_norm": 0.3974900543689728, + "learning_rate": 3.382560262599082e-05, + "loss": 1.1711, + "num_input_tokens_seen": 326405592, + "step": 8330 + }, + { + "epoch": 0.4041089252834577, + "grad_norm": 0.3950560390949249, + "learning_rate": 3.378888332531939e-05, + "loss": 1.1634, + "num_input_tokens_seen": 326784284, + "step": 8340 + }, + { + "epoch": 0.4045934683593371, + "grad_norm": 0.36520737409591675, + "learning_rate": 3.3752142377933646e-05, + "loss": 1.1633, + "num_input_tokens_seen": 327186212, + "step": 8350 + }, + { + "epoch": 0.4050780114352166, + "grad_norm": 0.4279801547527313, + "learning_rate": 3.371537987432526e-05, + "loss": 1.1751, + "num_input_tokens_seen": 327580260, + "step": 8360 + }, + { + "epoch": 0.405562554511096, + "grad_norm": 0.4253396987915039, + "learning_rate": 3.367859590503901e-05, + "loss": 1.1813, + "num_input_tokens_seen": 327974240, + "step": 8370 + }, + { + "epoch": 0.4060470975869755, + "grad_norm": 0.46800392866134644, + "learning_rate": 3.364179056067253e-05, + "loss": 1.2001, + "num_input_tokens_seen": 328378912, + "step": 8380 + }, + { + "epoch": 0.4065316406628549, + "grad_norm": 0.3665182292461395, + "learning_rate": 3.360496393187609e-05, + "loss": 1.2047, + "num_input_tokens_seen": 328787716, + "step": 8390 + }, + { + "epoch": 0.4070161837387344, + "grad_norm": 0.38969382643699646, + "learning_rate": 3.356811610935241e-05, + "loss": 1.1611, + "num_input_tokens_seen": 329162816, + "step": 8400 + }, + { + "epoch": 0.4075007268146138, + "grad_norm": 0.3802407383918762, + "learning_rate": 3.353124718385641e-05, + "loss": 1.2203, + "num_input_tokens_seen": 329545976, + "step": 8410 + }, + { + "epoch": 0.4079852698904933, + "grad_norm": 0.4015510082244873, + "learning_rate": 3.3494357246194956e-05, + "loss": 1.1915, + "num_input_tokens_seen": 329952104, + "step": 8420 + }, + { + "epoch": 0.4084698129663727, + "grad_norm": 0.3899809420108795, + "learning_rate": 3.345744638722669e-05, + "loss": 1.1899, + "num_input_tokens_seen": 330329116, + "step": 8430 + }, + { + "epoch": 0.40895435604225217, + "grad_norm": 0.3643754720687866, + "learning_rate": 3.3420514697861766e-05, + "loss": 1.2037, + "num_input_tokens_seen": 330723032, + "step": 8440 + }, + { + "epoch": 0.4094388991181316, + "grad_norm": 0.3885078728199005, + "learning_rate": 3.338356226906166e-05, + "loss": 1.2045, + "num_input_tokens_seen": 331128052, + "step": 8450 + }, + { + "epoch": 0.40992344219401106, + "grad_norm": 0.4163413643836975, + "learning_rate": 3.3346589191838915e-05, + "loss": 1.1901, + "num_input_tokens_seen": 331532708, + "step": 8460 + }, + { + "epoch": 0.4104079852698905, + "grad_norm": 0.36400115489959717, + "learning_rate": 3.330959555725695e-05, + "loss": 1.1846, + "num_input_tokens_seen": 331914620, + "step": 8470 + }, + { + "epoch": 0.41089252834576995, + "grad_norm": 0.36604899168014526, + "learning_rate": 3.327258145642979e-05, + "loss": 1.1852, + "num_input_tokens_seen": 332313264, + "step": 8480 + }, + { + "epoch": 0.41137707142164937, + "grad_norm": 0.4256085455417633, + "learning_rate": 3.323554698052187e-05, + "loss": 1.1764, + "num_input_tokens_seen": 332721544, + "step": 8490 + }, + { + "epoch": 0.41186161449752884, + "grad_norm": 0.39886176586151123, + "learning_rate": 3.3198492220747834e-05, + "loss": 1.1835, + "num_input_tokens_seen": 333090772, + "step": 8500 + }, + { + "epoch": 0.41234615757340826, + "grad_norm": 0.3774135410785675, + "learning_rate": 3.316141726837226e-05, + "loss": 1.1219, + "num_input_tokens_seen": 333473080, + "step": 8510 + }, + { + "epoch": 0.41283070064928773, + "grad_norm": 0.4200323522090912, + "learning_rate": 3.312432221470947e-05, + "loss": 1.187, + "num_input_tokens_seen": 333860928, + "step": 8520 + }, + { + "epoch": 0.41331524372516715, + "grad_norm": 0.433433473110199, + "learning_rate": 3.30872071511233e-05, + "loss": 1.1961, + "num_input_tokens_seen": 334240240, + "step": 8530 + }, + { + "epoch": 0.4137997868010466, + "grad_norm": 0.38179147243499756, + "learning_rate": 3.3050072169026855e-05, + "loss": 1.2174, + "num_input_tokens_seen": 334633964, + "step": 8540 + }, + { + "epoch": 0.41428432987692604, + "grad_norm": 0.3912273049354553, + "learning_rate": 3.30129173598823e-05, + "loss": 1.1801, + "num_input_tokens_seen": 335042032, + "step": 8550 + }, + { + "epoch": 0.4147688729528055, + "grad_norm": 0.3975712060928345, + "learning_rate": 3.297574281520065e-05, + "loss": 1.1647, + "num_input_tokens_seen": 335436124, + "step": 8560 + }, + { + "epoch": 0.41525341602868493, + "grad_norm": 0.45118552446365356, + "learning_rate": 3.293854862654151e-05, + "loss": 1.1916, + "num_input_tokens_seen": 335836836, + "step": 8570 + }, + { + "epoch": 0.4157379591045644, + "grad_norm": 0.3697469234466553, + "learning_rate": 3.290133488551288e-05, + "loss": 1.1983, + "num_input_tokens_seen": 336228900, + "step": 8580 + }, + { + "epoch": 0.4162225021804438, + "grad_norm": 0.37667909264564514, + "learning_rate": 3.286410168377091e-05, + "loss": 1.1968, + "num_input_tokens_seen": 336622764, + "step": 8590 + }, + { + "epoch": 0.4167070452563233, + "grad_norm": 0.36621060967445374, + "learning_rate": 3.282684911301968e-05, + "loss": 1.1642, + "num_input_tokens_seen": 337027652, + "step": 8600 + }, + { + "epoch": 0.4171915883322027, + "grad_norm": 0.39898422360420227, + "learning_rate": 3.2789577265010974e-05, + "loss": 1.2309, + "num_input_tokens_seen": 337410656, + "step": 8610 + }, + { + "epoch": 0.4176761314080822, + "grad_norm": 0.40708452463150024, + "learning_rate": 3.275228623154406e-05, + "loss": 1.1755, + "num_input_tokens_seen": 337815548, + "step": 8620 + }, + { + "epoch": 0.4181606744839616, + "grad_norm": 0.39701101183891296, + "learning_rate": 3.271497610446547e-05, + "loss": 1.2235, + "num_input_tokens_seen": 338219464, + "step": 8630 + }, + { + "epoch": 0.4186452175598411, + "grad_norm": 0.3921484351158142, + "learning_rate": 3.267764697566874e-05, + "loss": 1.1719, + "num_input_tokens_seen": 338635512, + "step": 8640 + }, + { + "epoch": 0.4191297606357205, + "grad_norm": 0.37697601318359375, + "learning_rate": 3.264029893709422e-05, + "loss": 1.1836, + "num_input_tokens_seen": 339021496, + "step": 8650 + }, + { + "epoch": 0.41961430371159997, + "grad_norm": 0.3878484070301056, + "learning_rate": 3.260293208072883e-05, + "loss": 1.1847, + "num_input_tokens_seen": 339403824, + "step": 8660 + }, + { + "epoch": 0.4200988467874794, + "grad_norm": 0.42603814601898193, + "learning_rate": 3.2565546498605834e-05, + "loss": 1.1747, + "num_input_tokens_seen": 339808580, + "step": 8670 + }, + { + "epoch": 0.42058338986335886, + "grad_norm": 0.3939875662326813, + "learning_rate": 3.252814228280464e-05, + "loss": 1.2169, + "num_input_tokens_seen": 340199464, + "step": 8680 + }, + { + "epoch": 0.4210679329392383, + "grad_norm": 0.3872847855091095, + "learning_rate": 3.249071952545052e-05, + "loss": 1.1669, + "num_input_tokens_seen": 340608396, + "step": 8690 + }, + { + "epoch": 0.42155247601511775, + "grad_norm": 0.4044415056705475, + "learning_rate": 3.245327831871442e-05, + "loss": 1.2167, + "num_input_tokens_seen": 340997568, + "step": 8700 + }, + { + "epoch": 0.42203701909099717, + "grad_norm": 0.4061294198036194, + "learning_rate": 3.2415818754812735e-05, + "loss": 1.179, + "num_input_tokens_seen": 341399468, + "step": 8710 + }, + { + "epoch": 0.42252156216687664, + "grad_norm": 0.3923657238483429, + "learning_rate": 3.237834092600709e-05, + "loss": 1.1482, + "num_input_tokens_seen": 341803448, + "step": 8720 + }, + { + "epoch": 0.42300610524275606, + "grad_norm": 0.3841986060142517, + "learning_rate": 3.234084492460404e-05, + "loss": 1.1648, + "num_input_tokens_seen": 342189656, + "step": 8730 + }, + { + "epoch": 0.42349064831863553, + "grad_norm": 0.4120123088359833, + "learning_rate": 3.230333084295496e-05, + "loss": 1.1752, + "num_input_tokens_seen": 342569416, + "step": 8740 + }, + { + "epoch": 0.42397519139451495, + "grad_norm": 0.44047102332115173, + "learning_rate": 3.226579877345572e-05, + "loss": 1.2093, + "num_input_tokens_seen": 342959072, + "step": 8750 + }, + { + "epoch": 0.4244597344703944, + "grad_norm": 0.38502237200737, + "learning_rate": 3.22282488085465e-05, + "loss": 1.1982, + "num_input_tokens_seen": 343375272, + "step": 8760 + }, + { + "epoch": 0.42494427754627384, + "grad_norm": 0.37675246596336365, + "learning_rate": 3.2190681040711566e-05, + "loss": 1.168, + "num_input_tokens_seen": 343750180, + "step": 8770 + }, + { + "epoch": 0.4254288206221533, + "grad_norm": 0.36886003613471985, + "learning_rate": 3.215309556247903e-05, + "loss": 1.2242, + "num_input_tokens_seen": 344158160, + "step": 8780 + }, + { + "epoch": 0.42591336369803273, + "grad_norm": 0.40360134840011597, + "learning_rate": 3.21154924664206e-05, + "loss": 1.2046, + "num_input_tokens_seen": 344541580, + "step": 8790 + }, + { + "epoch": 0.4263979067739122, + "grad_norm": 0.39642900228500366, + "learning_rate": 3.207787184515142e-05, + "loss": 1.2041, + "num_input_tokens_seen": 344912764, + "step": 8800 + }, + { + "epoch": 0.4268824498497916, + "grad_norm": 0.40765756368637085, + "learning_rate": 3.204023379132975e-05, + "loss": 1.1939, + "num_input_tokens_seen": 345319956, + "step": 8810 + }, + { + "epoch": 0.4273669929256711, + "grad_norm": 0.4131956100463867, + "learning_rate": 3.2002578397656826e-05, + "loss": 1.1394, + "num_input_tokens_seen": 345697944, + "step": 8820 + }, + { + "epoch": 0.4278515360015505, + "grad_norm": 0.4072543978691101, + "learning_rate": 3.196490575687657e-05, + "loss": 1.1678, + "num_input_tokens_seen": 346082640, + "step": 8830 + }, + { + "epoch": 0.42833607907743, + "grad_norm": 0.41665875911712646, + "learning_rate": 3.192721596177538e-05, + "loss": 1.1764, + "num_input_tokens_seen": 346505640, + "step": 8840 + }, + { + "epoch": 0.4288206221533094, + "grad_norm": 0.4276581406593323, + "learning_rate": 3.188950910518193e-05, + "loss": 1.1871, + "num_input_tokens_seen": 346926848, + "step": 8850 + }, + { + "epoch": 0.4293051652291889, + "grad_norm": 0.40989550948143005, + "learning_rate": 3.185178527996687e-05, + "loss": 1.2048, + "num_input_tokens_seen": 347308076, + "step": 8860 + }, + { + "epoch": 0.4297897083050683, + "grad_norm": 0.3860456347465515, + "learning_rate": 3.18140445790427e-05, + "loss": 1.1526, + "num_input_tokens_seen": 347719608, + "step": 8870 + }, + { + "epoch": 0.43027425138094777, + "grad_norm": 0.360655277967453, + "learning_rate": 3.1776287095363435e-05, + "loss": 1.1543, + "num_input_tokens_seen": 348104112, + "step": 8880 + }, + { + "epoch": 0.4307587944568272, + "grad_norm": 0.42288416624069214, + "learning_rate": 3.173851292192446e-05, + "loss": 1.1498, + "num_input_tokens_seen": 348488872, + "step": 8890 + }, + { + "epoch": 0.43124333753270666, + "grad_norm": 0.39610555768013, + "learning_rate": 3.170072215176224e-05, + "loss": 1.1501, + "num_input_tokens_seen": 348882096, + "step": 8900 + }, + { + "epoch": 0.4317278806085861, + "grad_norm": 0.40388280153274536, + "learning_rate": 3.166291487795413e-05, + "loss": 1.214, + "num_input_tokens_seen": 349271852, + "step": 8910 + }, + { + "epoch": 0.43221242368446555, + "grad_norm": 0.41157808899879456, + "learning_rate": 3.1625091193618144e-05, + "loss": 1.1712, + "num_input_tokens_seen": 349662936, + "step": 8920 + }, + { + "epoch": 0.43269696676034497, + "grad_norm": 0.41196179389953613, + "learning_rate": 3.158725119191269e-05, + "loss": 1.1742, + "num_input_tokens_seen": 350080888, + "step": 8930 + }, + { + "epoch": 0.43318150983622444, + "grad_norm": 0.3659604489803314, + "learning_rate": 3.1549394966036384e-05, + "loss": 1.1984, + "num_input_tokens_seen": 350479392, + "step": 8940 + }, + { + "epoch": 0.43366605291210386, + "grad_norm": 0.3877184987068176, + "learning_rate": 3.1511522609227795e-05, + "loss": 1.1329, + "num_input_tokens_seen": 350885976, + "step": 8950 + }, + { + "epoch": 0.43415059598798333, + "grad_norm": 0.4030652642250061, + "learning_rate": 3.147363421476522e-05, + "loss": 1.2162, + "num_input_tokens_seen": 351282332, + "step": 8960 + }, + { + "epoch": 0.43463513906386275, + "grad_norm": 0.38181596994400024, + "learning_rate": 3.1435729875966455e-05, + "loss": 1.1721, + "num_input_tokens_seen": 351676412, + "step": 8970 + }, + { + "epoch": 0.4351196821397422, + "grad_norm": 0.4003159701824188, + "learning_rate": 3.139780968618858e-05, + "loss": 1.1416, + "num_input_tokens_seen": 352076632, + "step": 8980 + }, + { + "epoch": 0.43560422521562164, + "grad_norm": 0.3967534303665161, + "learning_rate": 3.13598737388277e-05, + "loss": 1.1532, + "num_input_tokens_seen": 352466764, + "step": 8990 + }, + { + "epoch": 0.4360887682915011, + "grad_norm": 0.3866839110851288, + "learning_rate": 3.1321922127318724e-05, + "loss": 1.1999, + "num_input_tokens_seen": 352848332, + "step": 9000 + }, + { + "epoch": 0.43657331136738053, + "grad_norm": 0.3901658356189728, + "learning_rate": 3.128395494513517e-05, + "loss": 1.2005, + "num_input_tokens_seen": 353205300, + "step": 9010 + }, + { + "epoch": 0.43705785444326, + "grad_norm": 0.3866805136203766, + "learning_rate": 3.1245972285788885e-05, + "loss": 1.1484, + "num_input_tokens_seen": 353606764, + "step": 9020 + }, + { + "epoch": 0.4375423975191394, + "grad_norm": 0.3832225799560547, + "learning_rate": 3.120797424282984e-05, + "loss": 1.1939, + "num_input_tokens_seen": 353998028, + "step": 9030 + }, + { + "epoch": 0.4380269405950189, + "grad_norm": 0.39194390177726746, + "learning_rate": 3.116996090984588e-05, + "loss": 1.1963, + "num_input_tokens_seen": 354399420, + "step": 9040 + }, + { + "epoch": 0.4385114836708983, + "grad_norm": 0.3819257915019989, + "learning_rate": 3.113193238046255e-05, + "loss": 1.1531, + "num_input_tokens_seen": 354797548, + "step": 9050 + }, + { + "epoch": 0.4389960267467778, + "grad_norm": 0.3843849301338196, + "learning_rate": 3.1093888748342765e-05, + "loss": 1.1642, + "num_input_tokens_seen": 355200164, + "step": 9060 + }, + { + "epoch": 0.4394805698226572, + "grad_norm": 0.404051274061203, + "learning_rate": 3.10558301071867e-05, + "loss": 1.1794, + "num_input_tokens_seen": 355583100, + "step": 9070 + }, + { + "epoch": 0.4399651128985367, + "grad_norm": 0.3765638768672943, + "learning_rate": 3.1017756550731437e-05, + "loss": 1.2015, + "num_input_tokens_seen": 355999884, + "step": 9080 + }, + { + "epoch": 0.4404496559744161, + "grad_norm": 0.4142244756221771, + "learning_rate": 3.097966817275085e-05, + "loss": 1.1928, + "num_input_tokens_seen": 356363412, + "step": 9090 + }, + { + "epoch": 0.44093419905029557, + "grad_norm": 0.4150756597518921, + "learning_rate": 3.0941565067055275e-05, + "loss": 1.1665, + "num_input_tokens_seen": 356759816, + "step": 9100 + }, + { + "epoch": 0.441418742126175, + "grad_norm": 0.4147997498512268, + "learning_rate": 3.090344732749134e-05, + "loss": 1.1488, + "num_input_tokens_seen": 357163868, + "step": 9110 + }, + { + "epoch": 0.44190328520205446, + "grad_norm": 0.3696651756763458, + "learning_rate": 3.086531504794172e-05, + "loss": 1.1948, + "num_input_tokens_seen": 357578232, + "step": 9120 + }, + { + "epoch": 0.4423878282779339, + "grad_norm": 0.38870376348495483, + "learning_rate": 3.0827168322324884e-05, + "loss": 1.141, + "num_input_tokens_seen": 357996944, + "step": 9130 + }, + { + "epoch": 0.44287237135381335, + "grad_norm": 0.4145285487174988, + "learning_rate": 3.0789007244594904e-05, + "loss": 1.1882, + "num_input_tokens_seen": 358391992, + "step": 9140 + }, + { + "epoch": 0.44335691442969277, + "grad_norm": 0.4269642233848572, + "learning_rate": 3.0750831908741176e-05, + "loss": 1.1631, + "num_input_tokens_seen": 358795456, + "step": 9150 + }, + { + "epoch": 0.44384145750557225, + "grad_norm": 0.4123391807079315, + "learning_rate": 3.071264240878824e-05, + "loss": 1.223, + "num_input_tokens_seen": 359217840, + "step": 9160 + }, + { + "epoch": 0.44432600058145166, + "grad_norm": 0.3737592399120331, + "learning_rate": 3.06744388387955e-05, + "loss": 1.1953, + "num_input_tokens_seen": 359596700, + "step": 9170 + }, + { + "epoch": 0.44481054365733114, + "grad_norm": 0.35300344228744507, + "learning_rate": 3.0636221292857014e-05, + "loss": 1.1526, + "num_input_tokens_seen": 359971336, + "step": 9180 + }, + { + "epoch": 0.4452950867332106, + "grad_norm": 0.42388081550598145, + "learning_rate": 3.059798986510129e-05, + "loss": 1.2093, + "num_input_tokens_seen": 360365460, + "step": 9190 + }, + { + "epoch": 0.44577962980909003, + "grad_norm": 0.4228881895542145, + "learning_rate": 3.055974464969099e-05, + "loss": 1.183, + "num_input_tokens_seen": 360783972, + "step": 9200 + }, + { + "epoch": 0.4462641728849695, + "grad_norm": 0.47396156191825867, + "learning_rate": 3.0521485740822756e-05, + "loss": 1.1713, + "num_input_tokens_seen": 361168724, + "step": 9210 + }, + { + "epoch": 0.4467487159608489, + "grad_norm": 0.3625274896621704, + "learning_rate": 3.0483213232726943e-05, + "loss": 1.1959, + "num_input_tokens_seen": 361586496, + "step": 9220 + }, + { + "epoch": 0.4472332590367284, + "grad_norm": 0.3853507936000824, + "learning_rate": 3.044492721966743e-05, + "loss": 1.1483, + "num_input_tokens_seen": 361979848, + "step": 9230 + }, + { + "epoch": 0.4477178021126078, + "grad_norm": 0.3805350959300995, + "learning_rate": 3.040662779594131e-05, + "loss": 1.145, + "num_input_tokens_seen": 362373352, + "step": 9240 + }, + { + "epoch": 0.4482023451884873, + "grad_norm": 0.3566093444824219, + "learning_rate": 3.036831505587876e-05, + "loss": 1.1617, + "num_input_tokens_seen": 362795508, + "step": 9250 + }, + { + "epoch": 0.4486868882643667, + "grad_norm": 0.4072115123271942, + "learning_rate": 3.0329989093842708e-05, + "loss": 1.1401, + "num_input_tokens_seen": 363198416, + "step": 9260 + }, + { + "epoch": 0.4491714313402462, + "grad_norm": 0.3717459440231323, + "learning_rate": 3.0291650004228676e-05, + "loss": 1.1726, + "num_input_tokens_seen": 363578608, + "step": 9270 + }, + { + "epoch": 0.4496559744161256, + "grad_norm": 0.3549041450023651, + "learning_rate": 3.025329788146451e-05, + "loss": 1.1873, + "num_input_tokens_seen": 363973344, + "step": 9280 + }, + { + "epoch": 0.45014051749200507, + "grad_norm": 0.40732115507125854, + "learning_rate": 3.0214932820010155e-05, + "loss": 1.1968, + "num_input_tokens_seen": 364371260, + "step": 9290 + }, + { + "epoch": 0.4506250605678845, + "grad_norm": 0.39712613821029663, + "learning_rate": 3.017655491435742e-05, + "loss": 1.1529, + "num_input_tokens_seen": 364781760, + "step": 9300 + }, + { + "epoch": 0.45110960364376396, + "grad_norm": 0.4143085181713104, + "learning_rate": 3.0138164259029757e-05, + "loss": 1.2141, + "num_input_tokens_seen": 365169144, + "step": 9310 + }, + { + "epoch": 0.4515941467196434, + "grad_norm": 0.41558942198753357, + "learning_rate": 3.0099760948582023e-05, + "loss": 1.1541, + "num_input_tokens_seen": 365542904, + "step": 9320 + }, + { + "epoch": 0.45207868979552285, + "grad_norm": 0.38127419352531433, + "learning_rate": 3.0061345077600228e-05, + "loss": 1.1399, + "num_input_tokens_seen": 365944000, + "step": 9330 + }, + { + "epoch": 0.45256323287140227, + "grad_norm": 0.3887827396392822, + "learning_rate": 3.0022916740701334e-05, + "loss": 1.1597, + "num_input_tokens_seen": 366332944, + "step": 9340 + }, + { + "epoch": 0.45304777594728174, + "grad_norm": 0.38723742961883545, + "learning_rate": 2.9984476032532992e-05, + "loss": 1.1531, + "num_input_tokens_seen": 366697388, + "step": 9350 + }, + { + "epoch": 0.45353231902316116, + "grad_norm": 0.39358997344970703, + "learning_rate": 2.9946023047773336e-05, + "loss": 1.1241, + "num_input_tokens_seen": 367099924, + "step": 9360 + }, + { + "epoch": 0.45401686209904063, + "grad_norm": 0.3835636079311371, + "learning_rate": 2.9907557881130737e-05, + "loss": 1.2058, + "num_input_tokens_seen": 367476372, + "step": 9370 + }, + { + "epoch": 0.45450140517492005, + "grad_norm": 0.40065279603004456, + "learning_rate": 2.9869080627343555e-05, + "loss": 1.1679, + "num_input_tokens_seen": 367872588, + "step": 9380 + }, + { + "epoch": 0.4549859482507995, + "grad_norm": 0.38955920934677124, + "learning_rate": 2.9830591381179928e-05, + "loss": 1.1893, + "num_input_tokens_seen": 368260348, + "step": 9390 + }, + { + "epoch": 0.45547049132667894, + "grad_norm": 0.4499861001968384, + "learning_rate": 2.9792090237437543e-05, + "loss": 1.1168, + "num_input_tokens_seen": 368644964, + "step": 9400 + }, + { + "epoch": 0.4559550344025584, + "grad_norm": 0.386730432510376, + "learning_rate": 2.9753577290943376e-05, + "loss": 1.2014, + "num_input_tokens_seen": 369055912, + "step": 9410 + }, + { + "epoch": 0.45643957747843783, + "grad_norm": 0.39342692494392395, + "learning_rate": 2.971505263655347e-05, + "loss": 1.1853, + "num_input_tokens_seen": 369472516, + "step": 9420 + }, + { + "epoch": 0.4569241205543173, + "grad_norm": 0.4010606110095978, + "learning_rate": 2.9676516369152713e-05, + "loss": 1.0836, + "num_input_tokens_seen": 369855504, + "step": 9430 + }, + { + "epoch": 0.4574086636301967, + "grad_norm": 0.41329410672187805, + "learning_rate": 2.9637968583654597e-05, + "loss": 1.1578, + "num_input_tokens_seen": 370249352, + "step": 9440 + }, + { + "epoch": 0.4578932067060762, + "grad_norm": 0.3728090226650238, + "learning_rate": 2.9599409375000975e-05, + "loss": 1.1782, + "num_input_tokens_seen": 370653324, + "step": 9450 + }, + { + "epoch": 0.4583777497819556, + "grad_norm": 0.37402981519699097, + "learning_rate": 2.9560838838161843e-05, + "loss": 1.1704, + "num_input_tokens_seen": 371025900, + "step": 9460 + }, + { + "epoch": 0.4588622928578351, + "grad_norm": 0.39774417877197266, + "learning_rate": 2.9522257068135086e-05, + "loss": 1.1619, + "num_input_tokens_seen": 371424328, + "step": 9470 + }, + { + "epoch": 0.4593468359337145, + "grad_norm": 0.383232444524765, + "learning_rate": 2.9483664159946273e-05, + "loss": 1.1858, + "num_input_tokens_seen": 371838760, + "step": 9480 + }, + { + "epoch": 0.459831379009594, + "grad_norm": 0.37605851888656616, + "learning_rate": 2.9445060208648383e-05, + "loss": 1.1487, + "num_input_tokens_seen": 372233964, + "step": 9490 + }, + { + "epoch": 0.4603159220854734, + "grad_norm": 0.376461386680603, + "learning_rate": 2.9406445309321623e-05, + "loss": 1.2029, + "num_input_tokens_seen": 372599424, + "step": 9500 + }, + { + "epoch": 0.46080046516135287, + "grad_norm": 0.413618266582489, + "learning_rate": 2.9367819557073134e-05, + "loss": 1.1964, + "num_input_tokens_seen": 372979896, + "step": 9510 + }, + { + "epoch": 0.4612850082372323, + "grad_norm": 0.40922775864601135, + "learning_rate": 2.9329183047036807e-05, + "loss": 1.1989, + "num_input_tokens_seen": 373358408, + "step": 9520 + }, + { + "epoch": 0.46176955131311176, + "grad_norm": 0.37975606322288513, + "learning_rate": 2.9290535874373022e-05, + "loss": 1.2165, + "num_input_tokens_seen": 373734508, + "step": 9530 + }, + { + "epoch": 0.4622540943889912, + "grad_norm": 0.35514163970947266, + "learning_rate": 2.925187813426843e-05, + "loss": 1.1367, + "num_input_tokens_seen": 374105356, + "step": 9540 + }, + { + "epoch": 0.46273863746487065, + "grad_norm": 0.3892764151096344, + "learning_rate": 2.9213209921935703e-05, + "loss": 1.194, + "num_input_tokens_seen": 374514008, + "step": 9550 + }, + { + "epoch": 0.46322318054075007, + "grad_norm": 0.35142743587493896, + "learning_rate": 2.9174531332613292e-05, + "loss": 1.1837, + "num_input_tokens_seen": 374918944, + "step": 9560 + }, + { + "epoch": 0.46370772361662954, + "grad_norm": 0.42724543809890747, + "learning_rate": 2.9135842461565225e-05, + "loss": 1.1408, + "num_input_tokens_seen": 375331548, + "step": 9570 + }, + { + "epoch": 0.46419226669250896, + "grad_norm": 0.38824784755706787, + "learning_rate": 2.9097143404080853e-05, + "loss": 1.1919, + "num_input_tokens_seen": 375707484, + "step": 9580 + }, + { + "epoch": 0.46467680976838843, + "grad_norm": 0.3674311637878418, + "learning_rate": 2.905843425547461e-05, + "loss": 1.1473, + "num_input_tokens_seen": 376112184, + "step": 9590 + }, + { + "epoch": 0.46516135284426785, + "grad_norm": 0.3765534460544586, + "learning_rate": 2.9019715111085773e-05, + "loss": 1.1647, + "num_input_tokens_seen": 376494180, + "step": 9600 + }, + { + "epoch": 0.4656458959201473, + "grad_norm": 0.38776230812072754, + "learning_rate": 2.8980986066278255e-05, + "loss": 1.2236, + "num_input_tokens_seen": 376885136, + "step": 9610 + }, + { + "epoch": 0.46613043899602674, + "grad_norm": 0.37993282079696655, + "learning_rate": 2.8942247216440354e-05, + "loss": 1.2307, + "num_input_tokens_seen": 377277268, + "step": 9620 + }, + { + "epoch": 0.4666149820719062, + "grad_norm": 0.4081166088581085, + "learning_rate": 2.8903498656984514e-05, + "loss": 1.1428, + "num_input_tokens_seen": 377661420, + "step": 9630 + }, + { + "epoch": 0.46709952514778563, + "grad_norm": 0.3825088441371918, + "learning_rate": 2.8864740483347074e-05, + "loss": 1.1167, + "num_input_tokens_seen": 378079588, + "step": 9640 + }, + { + "epoch": 0.4675840682236651, + "grad_norm": 0.39228224754333496, + "learning_rate": 2.8825972790988082e-05, + "loss": 1.2089, + "num_input_tokens_seen": 378473248, + "step": 9650 + }, + { + "epoch": 0.4680686112995445, + "grad_norm": 0.40170297026634216, + "learning_rate": 2.8787195675391015e-05, + "loss": 1.1505, + "num_input_tokens_seen": 378849372, + "step": 9660 + }, + { + "epoch": 0.468553154375424, + "grad_norm": 0.39289790391921997, + "learning_rate": 2.874840923206256e-05, + "loss": 1.1567, + "num_input_tokens_seen": 379273348, + "step": 9670 + }, + { + "epoch": 0.4690376974513034, + "grad_norm": 0.3959096372127533, + "learning_rate": 2.870961355653239e-05, + "loss": 1.1711, + "num_input_tokens_seen": 379660408, + "step": 9680 + }, + { + "epoch": 0.4695222405271829, + "grad_norm": 0.43645361065864563, + "learning_rate": 2.8670808744352884e-05, + "loss": 1.1985, + "num_input_tokens_seen": 380054412, + "step": 9690 + }, + { + "epoch": 0.4700067836030623, + "grad_norm": 0.39692050218582153, + "learning_rate": 2.863199489109897e-05, + "loss": 1.182, + "num_input_tokens_seen": 380485684, + "step": 9700 + }, + { + "epoch": 0.4704913266789418, + "grad_norm": 0.42202410101890564, + "learning_rate": 2.8593172092367797e-05, + "loss": 1.1693, + "num_input_tokens_seen": 380889292, + "step": 9710 + }, + { + "epoch": 0.4709758697548212, + "grad_norm": 0.403148889541626, + "learning_rate": 2.8554340443778594e-05, + "loss": 1.1838, + "num_input_tokens_seen": 381290904, + "step": 9720 + }, + { + "epoch": 0.47146041283070067, + "grad_norm": 0.3972223699092865, + "learning_rate": 2.8515500040972347e-05, + "loss": 1.2143, + "num_input_tokens_seen": 381694440, + "step": 9730 + }, + { + "epoch": 0.4719449559065801, + "grad_norm": 0.3653228282928467, + "learning_rate": 2.8476650979611623e-05, + "loss": 1.1622, + "num_input_tokens_seen": 382095168, + "step": 9740 + }, + { + "epoch": 0.47242949898245956, + "grad_norm": 0.3957580626010895, + "learning_rate": 2.843779335538031e-05, + "loss": 1.1705, + "num_input_tokens_seen": 382467792, + "step": 9750 + }, + { + "epoch": 0.472914042058339, + "grad_norm": 0.4030523896217346, + "learning_rate": 2.8398927263983382e-05, + "loss": 1.1782, + "num_input_tokens_seen": 382880656, + "step": 9760 + }, + { + "epoch": 0.47339858513421845, + "grad_norm": 0.4061778485774994, + "learning_rate": 2.8360052801146687e-05, + "loss": 1.1652, + "num_input_tokens_seen": 383259940, + "step": 9770 + }, + { + "epoch": 0.47388312821009787, + "grad_norm": 0.38041362166404724, + "learning_rate": 2.8321170062616664e-05, + "loss": 1.1677, + "num_input_tokens_seen": 383648640, + "step": 9780 + }, + { + "epoch": 0.47436767128597734, + "grad_norm": 0.39732810854911804, + "learning_rate": 2.828227914416015e-05, + "loss": 1.1045, + "num_input_tokens_seen": 384047384, + "step": 9790 + }, + { + "epoch": 0.47485221436185676, + "grad_norm": 0.4323562681674957, + "learning_rate": 2.824338014156412e-05, + "loss": 1.1875, + "num_input_tokens_seen": 384447200, + "step": 9800 + }, + { + "epoch": 0.47533675743773623, + "grad_norm": 0.3938267230987549, + "learning_rate": 2.8204473150635476e-05, + "loss": 1.1853, + "num_input_tokens_seen": 384839616, + "step": 9810 + }, + { + "epoch": 0.47582130051361565, + "grad_norm": 0.3984321355819702, + "learning_rate": 2.816555826720078e-05, + "loss": 1.1566, + "num_input_tokens_seen": 385227704, + "step": 9820 + }, + { + "epoch": 0.4763058435894951, + "grad_norm": 0.37600892782211304, + "learning_rate": 2.8126635587106036e-05, + "loss": 1.1883, + "num_input_tokens_seen": 385642608, + "step": 9830 + }, + { + "epoch": 0.47679038666537454, + "grad_norm": 0.43121588230133057, + "learning_rate": 2.808770520621646e-05, + "loss": 1.1532, + "num_input_tokens_seen": 386042136, + "step": 9840 + }, + { + "epoch": 0.477274929741254, + "grad_norm": 0.4222434163093567, + "learning_rate": 2.8048767220416223e-05, + "loss": 1.175, + "num_input_tokens_seen": 386420708, + "step": 9850 + }, + { + "epoch": 0.47775947281713343, + "grad_norm": 0.423554927110672, + "learning_rate": 2.800982172560823e-05, + "loss": 1.1398, + "num_input_tokens_seen": 386777908, + "step": 9860 + }, + { + "epoch": 0.4782440158930129, + "grad_norm": 0.3951357305049896, + "learning_rate": 2.7970868817713887e-05, + "loss": 1.1527, + "num_input_tokens_seen": 387176340, + "step": 9870 + }, + { + "epoch": 0.4787285589688923, + "grad_norm": 0.3626028001308441, + "learning_rate": 2.7931908592672844e-05, + "loss": 1.1641, + "num_input_tokens_seen": 387547504, + "step": 9880 + }, + { + "epoch": 0.4792131020447718, + "grad_norm": 0.4093586504459381, + "learning_rate": 2.78929411464428e-05, + "loss": 1.1164, + "num_input_tokens_seen": 387947832, + "step": 9890 + }, + { + "epoch": 0.4796976451206512, + "grad_norm": 0.3790941536426544, + "learning_rate": 2.7853966574999197e-05, + "loss": 1.1421, + "num_input_tokens_seen": 388371140, + "step": 9900 + }, + { + "epoch": 0.4801821881965307, + "grad_norm": 0.44463813304901123, + "learning_rate": 2.7814984974335067e-05, + "loss": 1.178, + "num_input_tokens_seen": 388758584, + "step": 9910 + }, + { + "epoch": 0.4806667312724101, + "grad_norm": 0.40607205033302307, + "learning_rate": 2.7775996440460733e-05, + "loss": 1.2158, + "num_input_tokens_seen": 389136284, + "step": 9920 + }, + { + "epoch": 0.4811512743482896, + "grad_norm": 0.41725072264671326, + "learning_rate": 2.7737001069403608e-05, + "loss": 1.1731, + "num_input_tokens_seen": 389528592, + "step": 9930 + }, + { + "epoch": 0.481635817424169, + "grad_norm": 0.43715402483940125, + "learning_rate": 2.7697998957207915e-05, + "loss": 1.1359, + "num_input_tokens_seen": 389924908, + "step": 9940 + }, + { + "epoch": 0.4821203605000485, + "grad_norm": 0.4026664197444916, + "learning_rate": 2.765899019993453e-05, + "loss": 1.1334, + "num_input_tokens_seen": 390327408, + "step": 9950 + }, + { + "epoch": 0.4826049035759279, + "grad_norm": 0.38899385929107666, + "learning_rate": 2.7619974893660643e-05, + "loss": 1.1628, + "num_input_tokens_seen": 390716640, + "step": 9960 + }, + { + "epoch": 0.48308944665180736, + "grad_norm": 0.3987119197845459, + "learning_rate": 2.758095313447961e-05, + "loss": 1.1804, + "num_input_tokens_seen": 391147420, + "step": 9970 + }, + { + "epoch": 0.4835739897276868, + "grad_norm": 0.4419252872467041, + "learning_rate": 2.754192501850066e-05, + "loss": 1.1857, + "num_input_tokens_seen": 391523552, + "step": 9980 + }, + { + "epoch": 0.48405853280356625, + "grad_norm": 0.40142005681991577, + "learning_rate": 2.7502890641848696e-05, + "loss": 1.2214, + "num_input_tokens_seen": 391906876, + "step": 9990 + }, + { + "epoch": 0.4845430758794457, + "grad_norm": 0.36947381496429443, + "learning_rate": 2.7463850100664028e-05, + "loss": 1.1617, + "num_input_tokens_seen": 392316428, + "step": 10000 + }, + { + "epoch": 0.4845430758794457, + "eval_loss": 1.1596810817718506, + "eval_runtime": 5.1226, + "eval_samples_per_second": 29.282, + "eval_steps_per_second": 3.709, + "num_input_tokens_seen": 392316428, + "step": 10000 + }, + { + "epoch": 0.48502761895532515, + "grad_norm": 0.38736334443092346, + "learning_rate": 2.7424803491102158e-05, + "loss": 1.1817, + "num_input_tokens_seen": 392732788, + "step": 10010 + }, + { + "epoch": 0.48551216203120456, + "grad_norm": 0.40144097805023193, + "learning_rate": 2.738575090933352e-05, + "loss": 1.1628, + "num_input_tokens_seen": 393114108, + "step": 10020 + }, + { + "epoch": 0.48599670510708404, + "grad_norm": 0.42609161138534546, + "learning_rate": 2.7346692451543264e-05, + "loss": 1.1679, + "num_input_tokens_seen": 393509264, + "step": 10030 + }, + { + "epoch": 0.48648124818296345, + "grad_norm": 0.4124153256416321, + "learning_rate": 2.730762821393103e-05, + "loss": 1.1491, + "num_input_tokens_seen": 393915040, + "step": 10040 + }, + { + "epoch": 0.48696579125884293, + "grad_norm": 0.42232540249824524, + "learning_rate": 2.726855829271066e-05, + "loss": 1.1626, + "num_input_tokens_seen": 394289168, + "step": 10050 + }, + { + "epoch": 0.48745033433472235, + "grad_norm": 0.4197705388069153, + "learning_rate": 2.722948278411003e-05, + "loss": 1.1708, + "num_input_tokens_seen": 394680292, + "step": 10060 + }, + { + "epoch": 0.4879348774106018, + "grad_norm": 0.3949076533317566, + "learning_rate": 2.7190401784370743e-05, + "loss": 1.1591, + "num_input_tokens_seen": 395070520, + "step": 10070 + }, + { + "epoch": 0.48841942048648124, + "grad_norm": 0.42010289430618286, + "learning_rate": 2.7151315389747967e-05, + "loss": 1.1333, + "num_input_tokens_seen": 395500444, + "step": 10080 + }, + { + "epoch": 0.4889039635623607, + "grad_norm": 0.36050793528556824, + "learning_rate": 2.7112223696510108e-05, + "loss": 1.1303, + "num_input_tokens_seen": 395889484, + "step": 10090 + }, + { + "epoch": 0.48938850663824013, + "grad_norm": 0.40366074442863464, + "learning_rate": 2.7073126800938666e-05, + "loss": 1.1765, + "num_input_tokens_seen": 396277572, + "step": 10100 + }, + { + "epoch": 0.4898730497141196, + "grad_norm": 0.410087913274765, + "learning_rate": 2.7034024799327928e-05, + "loss": 1.1247, + "num_input_tokens_seen": 396678980, + "step": 10110 + }, + { + "epoch": 0.490357592789999, + "grad_norm": 0.38809484243392944, + "learning_rate": 2.6994917787984764e-05, + "loss": 1.1675, + "num_input_tokens_seen": 397061400, + "step": 10120 + }, + { + "epoch": 0.4908421358658785, + "grad_norm": 0.4384767711162567, + "learning_rate": 2.69558058632284e-05, + "loss": 1.1633, + "num_input_tokens_seen": 397457472, + "step": 10130 + }, + { + "epoch": 0.4913266789417579, + "grad_norm": 0.4038788974285126, + "learning_rate": 2.6916689121390127e-05, + "loss": 1.1532, + "num_input_tokens_seen": 397885272, + "step": 10140 + }, + { + "epoch": 0.4918112220176374, + "grad_norm": 0.4135740399360657, + "learning_rate": 2.6877567658813134e-05, + "loss": 1.1828, + "num_input_tokens_seen": 398292228, + "step": 10150 + }, + { + "epoch": 0.4922957650935168, + "grad_norm": 0.3874277174472809, + "learning_rate": 2.6838441571852223e-05, + "loss": 1.1119, + "num_input_tokens_seen": 398678720, + "step": 10160 + }, + { + "epoch": 0.4927803081693963, + "grad_norm": 0.4087202548980713, + "learning_rate": 2.679931095687358e-05, + "loss": 1.1294, + "num_input_tokens_seen": 399066144, + "step": 10170 + }, + { + "epoch": 0.4932648512452757, + "grad_norm": 0.39371687173843384, + "learning_rate": 2.6760175910254565e-05, + "loss": 1.1805, + "num_input_tokens_seen": 399480648, + "step": 10180 + }, + { + "epoch": 0.49374939432115517, + "grad_norm": 0.39417147636413574, + "learning_rate": 2.6721036528383424e-05, + "loss": 1.1356, + "num_input_tokens_seen": 399859596, + "step": 10190 + }, + { + "epoch": 0.4942339373970346, + "grad_norm": 0.394105464220047, + "learning_rate": 2.66818929076591e-05, + "loss": 1.1561, + "num_input_tokens_seen": 400214040, + "step": 10200 + }, + { + "epoch": 0.49471848047291406, + "grad_norm": 0.3794916868209839, + "learning_rate": 2.664274514449097e-05, + "loss": 1.1644, + "num_input_tokens_seen": 400607764, + "step": 10210 + }, + { + "epoch": 0.4952030235487935, + "grad_norm": 0.43505197763442993, + "learning_rate": 2.660359333529862e-05, + "loss": 1.1151, + "num_input_tokens_seen": 400977376, + "step": 10220 + }, + { + "epoch": 0.49568756662467295, + "grad_norm": 0.373087614774704, + "learning_rate": 2.6564437576511587e-05, + "loss": 1.1589, + "num_input_tokens_seen": 401363624, + "step": 10230 + }, + { + "epoch": 0.49617210970055237, + "grad_norm": 0.41389840841293335, + "learning_rate": 2.6525277964569155e-05, + "loss": 1.1478, + "num_input_tokens_seen": 401733716, + "step": 10240 + }, + { + "epoch": 0.49665665277643184, + "grad_norm": 0.41126182675361633, + "learning_rate": 2.648611459592008e-05, + "loss": 1.1653, + "num_input_tokens_seen": 402141292, + "step": 10250 + }, + { + "epoch": 0.49714119585231126, + "grad_norm": 0.3853723108768463, + "learning_rate": 2.644694756702238e-05, + "loss": 1.1522, + "num_input_tokens_seen": 402544236, + "step": 10260 + }, + { + "epoch": 0.49762573892819073, + "grad_norm": 0.39461591839790344, + "learning_rate": 2.640777697434309e-05, + "loss": 1.175, + "num_input_tokens_seen": 402961036, + "step": 10270 + }, + { + "epoch": 0.49811028200407015, + "grad_norm": 0.4081992506980896, + "learning_rate": 2.6368602914358008e-05, + "loss": 1.16, + "num_input_tokens_seen": 403377968, + "step": 10280 + }, + { + "epoch": 0.4985948250799496, + "grad_norm": 0.4176690876483917, + "learning_rate": 2.63294254835515e-05, + "loss": 1.1912, + "num_input_tokens_seen": 403777800, + "step": 10290 + }, + { + "epoch": 0.49907936815582904, + "grad_norm": 0.38343364000320435, + "learning_rate": 2.6290244778416202e-05, + "loss": 1.1198, + "num_input_tokens_seen": 404136320, + "step": 10300 + }, + { + "epoch": 0.4995639112317085, + "grad_norm": 0.4266665577888489, + "learning_rate": 2.6251060895452844e-05, + "loss": 1.2289, + "num_input_tokens_seen": 404515816, + "step": 10310 + }, + { + "epoch": 0.5000484543075879, + "grad_norm": 0.36221781373023987, + "learning_rate": 2.6211873931169955e-05, + "loss": 1.1902, + "num_input_tokens_seen": 404931996, + "step": 10320 + }, + { + "epoch": 0.5005329973834673, + "grad_norm": 0.4041613042354584, + "learning_rate": 2.6172683982083675e-05, + "loss": 1.1757, + "num_input_tokens_seen": 405288152, + "step": 10330 + }, + { + "epoch": 0.5010175404593469, + "grad_norm": 0.38727837800979614, + "learning_rate": 2.6133491144717475e-05, + "loss": 1.1697, + "num_input_tokens_seen": 405676608, + "step": 10340 + }, + { + "epoch": 0.5015020835352263, + "grad_norm": 0.41572102904319763, + "learning_rate": 2.6094295515601967e-05, + "loss": 1.1641, + "num_input_tokens_seen": 406068632, + "step": 10350 + }, + { + "epoch": 0.5019866266111057, + "grad_norm": 0.42120370268821716, + "learning_rate": 2.605509719127463e-05, + "loss": 1.1326, + "num_input_tokens_seen": 406465000, + "step": 10360 + }, + { + "epoch": 0.5024711696869851, + "grad_norm": 0.4009106755256653, + "learning_rate": 2.6015896268279555e-05, + "loss": 1.1493, + "num_input_tokens_seen": 406845928, + "step": 10370 + }, + { + "epoch": 0.5029557127628647, + "grad_norm": 0.3664901554584503, + "learning_rate": 2.5976692843167255e-05, + "loss": 1.1917, + "num_input_tokens_seen": 407225352, + "step": 10380 + }, + { + "epoch": 0.5034402558387441, + "grad_norm": 0.4137326180934906, + "learning_rate": 2.5937487012494422e-05, + "loss": 1.1673, + "num_input_tokens_seen": 407601740, + "step": 10390 + }, + { + "epoch": 0.5039247989146235, + "grad_norm": 0.3960126042366028, + "learning_rate": 2.5898278872823646e-05, + "loss": 1.1413, + "num_input_tokens_seen": 407997996, + "step": 10400 + }, + { + "epoch": 0.5044093419905029, + "grad_norm": 0.42062610387802124, + "learning_rate": 2.5859068520723205e-05, + "loss": 1.1753, + "num_input_tokens_seen": 408377408, + "step": 10410 + }, + { + "epoch": 0.5048938850663824, + "grad_norm": 0.35909387469291687, + "learning_rate": 2.581985605276684e-05, + "loss": 1.1511, + "num_input_tokens_seen": 408721880, + "step": 10420 + }, + { + "epoch": 0.5053784281422619, + "grad_norm": 0.42842748761177063, + "learning_rate": 2.578064156553349e-05, + "loss": 1.1312, + "num_input_tokens_seen": 409143260, + "step": 10430 + }, + { + "epoch": 0.5058629712181413, + "grad_norm": 0.3839662969112396, + "learning_rate": 2.5741425155607097e-05, + "loss": 1.1704, + "num_input_tokens_seen": 409535852, + "step": 10440 + }, + { + "epoch": 0.5063475142940207, + "grad_norm": 0.41793426871299744, + "learning_rate": 2.57022069195763e-05, + "loss": 1.1989, + "num_input_tokens_seen": 409937740, + "step": 10450 + }, + { + "epoch": 0.5068320573699002, + "grad_norm": 0.4150349199771881, + "learning_rate": 2.5662986954034245e-05, + "loss": 1.1293, + "num_input_tokens_seen": 410319088, + "step": 10460 + }, + { + "epoch": 0.5073166004457796, + "grad_norm": 0.37542667984962463, + "learning_rate": 2.5623765355578354e-05, + "loss": 1.2247, + "num_input_tokens_seen": 410678500, + "step": 10470 + }, + { + "epoch": 0.5078011435216591, + "grad_norm": 0.4027327001094818, + "learning_rate": 2.5584542220810065e-05, + "loss": 1.1796, + "num_input_tokens_seen": 411079744, + "step": 10480 + }, + { + "epoch": 0.5082856865975385, + "grad_norm": 0.39059481024742126, + "learning_rate": 2.55453176463346e-05, + "loss": 1.1849, + "num_input_tokens_seen": 411488444, + "step": 10490 + }, + { + "epoch": 0.508770229673418, + "grad_norm": 0.40780818462371826, + "learning_rate": 2.5506091728760702e-05, + "loss": 1.1187, + "num_input_tokens_seen": 411879348, + "step": 10500 + }, + { + "epoch": 0.5092547727492974, + "grad_norm": 0.3999616503715515, + "learning_rate": 2.546686456470046e-05, + "loss": 1.1775, + "num_input_tokens_seen": 412294412, + "step": 10510 + }, + { + "epoch": 0.5097393158251768, + "grad_norm": 0.4117770195007324, + "learning_rate": 2.5427636250769016e-05, + "loss": 1.1678, + "num_input_tokens_seen": 412688988, + "step": 10520 + }, + { + "epoch": 0.5102238589010563, + "grad_norm": 0.3824278712272644, + "learning_rate": 2.538840688358435e-05, + "loss": 1.1374, + "num_input_tokens_seen": 413069468, + "step": 10530 + }, + { + "epoch": 0.5107084019769358, + "grad_norm": 0.3502415418624878, + "learning_rate": 2.5349176559767007e-05, + "loss": 1.2013, + "num_input_tokens_seen": 413440188, + "step": 10540 + }, + { + "epoch": 0.5111929450528152, + "grad_norm": 0.3400461971759796, + "learning_rate": 2.530994537593994e-05, + "loss": 1.1755, + "num_input_tokens_seen": 413826580, + "step": 10550 + }, + { + "epoch": 0.5116774881286946, + "grad_norm": 0.4381474554538727, + "learning_rate": 2.527071342872817e-05, + "loss": 1.1644, + "num_input_tokens_seen": 414235248, + "step": 10560 + }, + { + "epoch": 0.512162031204574, + "grad_norm": 0.3964703381061554, + "learning_rate": 2.5231480814758633e-05, + "loss": 1.1745, + "num_input_tokens_seen": 414635544, + "step": 10570 + }, + { + "epoch": 0.5126465742804536, + "grad_norm": 0.3687261939048767, + "learning_rate": 2.51922476306599e-05, + "loss": 1.166, + "num_input_tokens_seen": 415043552, + "step": 10580 + }, + { + "epoch": 0.513131117356333, + "grad_norm": 0.3944050669670105, + "learning_rate": 2.5153013973061916e-05, + "loss": 1.2209, + "num_input_tokens_seen": 415413920, + "step": 10590 + }, + { + "epoch": 0.5136156604322124, + "grad_norm": 0.4199015498161316, + "learning_rate": 2.511377993859584e-05, + "loss": 1.1452, + "num_input_tokens_seen": 415791632, + "step": 10600 + }, + { + "epoch": 0.5141002035080918, + "grad_norm": 0.4487917721271515, + "learning_rate": 2.507454562389372e-05, + "loss": 1.1334, + "num_input_tokens_seen": 416204172, + "step": 10610 + }, + { + "epoch": 0.5145847465839714, + "grad_norm": 0.4062841832637787, + "learning_rate": 2.5035311125588322e-05, + "loss": 1.1398, + "num_input_tokens_seen": 416591364, + "step": 10620 + }, + { + "epoch": 0.5150692896598508, + "grad_norm": 0.41555893421173096, + "learning_rate": 2.4996076540312854e-05, + "loss": 1.1504, + "num_input_tokens_seen": 416957332, + "step": 10630 + }, + { + "epoch": 0.5155538327357302, + "grad_norm": 0.35754913091659546, + "learning_rate": 2.4956841964700718e-05, + "loss": 1.1095, + "num_input_tokens_seen": 417326068, + "step": 10640 + }, + { + "epoch": 0.5160383758116096, + "grad_norm": 0.381197065114975, + "learning_rate": 2.4917607495385338e-05, + "loss": 1.1859, + "num_input_tokens_seen": 417756524, + "step": 10650 + }, + { + "epoch": 0.5165229188874891, + "grad_norm": 0.41809821128845215, + "learning_rate": 2.4878373228999835e-05, + "loss": 1.1568, + "num_input_tokens_seen": 418159804, + "step": 10660 + }, + { + "epoch": 0.5170074619633686, + "grad_norm": 0.39222896099090576, + "learning_rate": 2.4839139262176837e-05, + "loss": 1.1618, + "num_input_tokens_seen": 418565656, + "step": 10670 + }, + { + "epoch": 0.517492005039248, + "grad_norm": 0.38486379384994507, + "learning_rate": 2.4799905691548245e-05, + "loss": 1.1738, + "num_input_tokens_seen": 418952312, + "step": 10680 + }, + { + "epoch": 0.5179765481151274, + "grad_norm": 0.3719366788864136, + "learning_rate": 2.476067261374499e-05, + "loss": 1.1973, + "num_input_tokens_seen": 419369500, + "step": 10690 + }, + { + "epoch": 0.5184610911910069, + "grad_norm": 0.3997963070869446, + "learning_rate": 2.4721440125396773e-05, + "loss": 1.1287, + "num_input_tokens_seen": 419730868, + "step": 10700 + }, + { + "epoch": 0.5189456342668863, + "grad_norm": 0.38283371925354004, + "learning_rate": 2.4682208323131852e-05, + "loss": 1.1742, + "num_input_tokens_seen": 420139208, + "step": 10710 + }, + { + "epoch": 0.5194301773427658, + "grad_norm": 0.3874126970767975, + "learning_rate": 2.464297730357678e-05, + "loss": 1.1564, + "num_input_tokens_seen": 420521152, + "step": 10720 + }, + { + "epoch": 0.5199147204186452, + "grad_norm": 0.3984072804450989, + "learning_rate": 2.460374716335622e-05, + "loss": 1.1473, + "num_input_tokens_seen": 420929760, + "step": 10730 + }, + { + "epoch": 0.5203992634945247, + "grad_norm": 0.3916292190551758, + "learning_rate": 2.456451799909263e-05, + "loss": 1.1685, + "num_input_tokens_seen": 421298444, + "step": 10740 + }, + { + "epoch": 0.5208838065704041, + "grad_norm": 0.4141142964363098, + "learning_rate": 2.4525289907406068e-05, + "loss": 1.139, + "num_input_tokens_seen": 421670376, + "step": 10750 + }, + { + "epoch": 0.5213683496462835, + "grad_norm": 0.4091172218322754, + "learning_rate": 2.448606298491399e-05, + "loss": 1.1174, + "num_input_tokens_seen": 422062488, + "step": 10760 + }, + { + "epoch": 0.521852892722163, + "grad_norm": 0.413764089345932, + "learning_rate": 2.4446837328230907e-05, + "loss": 1.157, + "num_input_tokens_seen": 422469080, + "step": 10770 + }, + { + "epoch": 0.5223374357980425, + "grad_norm": 0.35379403829574585, + "learning_rate": 2.4407613033968287e-05, + "loss": 1.1344, + "num_input_tokens_seen": 422867012, + "step": 10780 + }, + { + "epoch": 0.5228219788739219, + "grad_norm": 0.3597969114780426, + "learning_rate": 2.436839019873418e-05, + "loss": 1.1457, + "num_input_tokens_seen": 423225644, + "step": 10790 + }, + { + "epoch": 0.5233065219498013, + "grad_norm": 0.3982402980327606, + "learning_rate": 2.4329168919133062e-05, + "loss": 1.1819, + "num_input_tokens_seen": 423605548, + "step": 10800 + }, + { + "epoch": 0.5237910650256807, + "grad_norm": 0.452061265707016, + "learning_rate": 2.4289949291765608e-05, + "loss": 1.1438, + "num_input_tokens_seen": 423993544, + "step": 10810 + }, + { + "epoch": 0.5242756081015603, + "grad_norm": 0.4269421100616455, + "learning_rate": 2.4250731413228374e-05, + "loss": 1.1333, + "num_input_tokens_seen": 424365256, + "step": 10820 + }, + { + "epoch": 0.5247601511774397, + "grad_norm": 0.3826684057712555, + "learning_rate": 2.421151538011364e-05, + "loss": 1.1673, + "num_input_tokens_seen": 424768824, + "step": 10830 + }, + { + "epoch": 0.5252446942533191, + "grad_norm": 0.3775523602962494, + "learning_rate": 2.4172301289009137e-05, + "loss": 1.1264, + "num_input_tokens_seen": 425175704, + "step": 10840 + }, + { + "epoch": 0.5257292373291985, + "grad_norm": 0.4121640920639038, + "learning_rate": 2.4133089236497804e-05, + "loss": 1.1525, + "num_input_tokens_seen": 425568528, + "step": 10850 + }, + { + "epoch": 0.526213780405078, + "grad_norm": 0.4088398218154907, + "learning_rate": 2.4093879319157572e-05, + "loss": 1.1616, + "num_input_tokens_seen": 425944588, + "step": 10860 + }, + { + "epoch": 0.5266983234809575, + "grad_norm": 0.4086717665195465, + "learning_rate": 2.4054671633561094e-05, + "loss": 1.1615, + "num_input_tokens_seen": 426354932, + "step": 10870 + }, + { + "epoch": 0.5271828665568369, + "grad_norm": 0.40191206336021423, + "learning_rate": 2.401546627627554e-05, + "loss": 1.1574, + "num_input_tokens_seen": 426750536, + "step": 10880 + }, + { + "epoch": 0.5276674096327163, + "grad_norm": 0.4215683043003082, + "learning_rate": 2.3976263343862357e-05, + "loss": 1.1275, + "num_input_tokens_seen": 427145692, + "step": 10890 + }, + { + "epoch": 0.5281519527085958, + "grad_norm": 0.428448885679245, + "learning_rate": 2.393706293287698e-05, + "loss": 1.1513, + "num_input_tokens_seen": 427517468, + "step": 10900 + }, + { + "epoch": 0.5286364957844752, + "grad_norm": 0.41464531421661377, + "learning_rate": 2.3897865139868685e-05, + "loss": 1.2119, + "num_input_tokens_seen": 427940524, + "step": 10910 + }, + { + "epoch": 0.5291210388603547, + "grad_norm": 0.40263885259628296, + "learning_rate": 2.3858670061380267e-05, + "loss": 1.1092, + "num_input_tokens_seen": 428306984, + "step": 10920 + }, + { + "epoch": 0.5296055819362341, + "grad_norm": 0.4086105227470398, + "learning_rate": 2.3819477793947825e-05, + "loss": 1.147, + "num_input_tokens_seen": 428680608, + "step": 10930 + }, + { + "epoch": 0.5300901250121136, + "grad_norm": 0.40625572204589844, + "learning_rate": 2.378028843410058e-05, + "loss": 1.1671, + "num_input_tokens_seen": 429089700, + "step": 10940 + }, + { + "epoch": 0.530574668087993, + "grad_norm": 0.4443352520465851, + "learning_rate": 2.374110207836054e-05, + "loss": 1.1634, + "num_input_tokens_seen": 429511972, + "step": 10950 + }, + { + "epoch": 0.5310592111638724, + "grad_norm": 0.3834231495857239, + "learning_rate": 2.3701918823242357e-05, + "loss": 1.1406, + "num_input_tokens_seen": 429933992, + "step": 10960 + }, + { + "epoch": 0.5315437542397519, + "grad_norm": 0.4218290150165558, + "learning_rate": 2.366273876525302e-05, + "loss": 1.1097, + "num_input_tokens_seen": 430335652, + "step": 10970 + }, + { + "epoch": 0.5320282973156314, + "grad_norm": 0.38524654507637024, + "learning_rate": 2.3623562000891646e-05, + "loss": 1.176, + "num_input_tokens_seen": 430737500, + "step": 10980 + }, + { + "epoch": 0.5325128403915108, + "grad_norm": 0.4242812395095825, + "learning_rate": 2.3584388626649246e-05, + "loss": 1.1395, + "num_input_tokens_seen": 431134616, + "step": 10990 + }, + { + "epoch": 0.5329973834673902, + "grad_norm": 0.42048659920692444, + "learning_rate": 2.3545218739008483e-05, + "loss": 1.1276, + "num_input_tokens_seen": 431534440, + "step": 11000 + }, + { + "epoch": 0.5334819265432696, + "grad_norm": 0.43740230798721313, + "learning_rate": 2.3506052434443436e-05, + "loss": 1.1547, + "num_input_tokens_seen": 431956964, + "step": 11010 + }, + { + "epoch": 0.5339664696191492, + "grad_norm": 0.40744540095329285, + "learning_rate": 2.3466889809419342e-05, + "loss": 1.1674, + "num_input_tokens_seen": 432337932, + "step": 11020 + }, + { + "epoch": 0.5344510126950286, + "grad_norm": 0.4600279927253723, + "learning_rate": 2.342773096039238e-05, + "loss": 1.1226, + "num_input_tokens_seen": 432737924, + "step": 11030 + }, + { + "epoch": 0.534935555770908, + "grad_norm": 0.4031646251678467, + "learning_rate": 2.3388575983809456e-05, + "loss": 1.1667, + "num_input_tokens_seen": 433161476, + "step": 11040 + }, + { + "epoch": 0.5354200988467874, + "grad_norm": 0.3792802691459656, + "learning_rate": 2.3349424976107903e-05, + "loss": 1.2043, + "num_input_tokens_seen": 433547952, + "step": 11050 + }, + { + "epoch": 0.535904641922667, + "grad_norm": 0.4132523536682129, + "learning_rate": 2.3310278033715285e-05, + "loss": 1.1405, + "num_input_tokens_seen": 433943164, + "step": 11060 + }, + { + "epoch": 0.5363891849985464, + "grad_norm": 0.38873347640037537, + "learning_rate": 2.3271135253049183e-05, + "loss": 1.1494, + "num_input_tokens_seen": 434332620, + "step": 11070 + }, + { + "epoch": 0.5368737280744258, + "grad_norm": 0.3987562358379364, + "learning_rate": 2.3231996730516884e-05, + "loss": 1.1497, + "num_input_tokens_seen": 434702280, + "step": 11080 + }, + { + "epoch": 0.5373582711503052, + "grad_norm": 0.41692429780960083, + "learning_rate": 2.3192862562515226e-05, + "loss": 1.1397, + "num_input_tokens_seen": 435102792, + "step": 11090 + }, + { + "epoch": 0.5378428142261847, + "grad_norm": 0.39025530219078064, + "learning_rate": 2.3153732845430302e-05, + "loss": 1.122, + "num_input_tokens_seen": 435481228, + "step": 11100 + }, + { + "epoch": 0.5383273573020642, + "grad_norm": 0.3755995035171509, + "learning_rate": 2.3114607675637233e-05, + "loss": 1.1926, + "num_input_tokens_seen": 435894796, + "step": 11110 + }, + { + "epoch": 0.5388119003779436, + "grad_norm": 0.4043687582015991, + "learning_rate": 2.3075487149499974e-05, + "loss": 1.1358, + "num_input_tokens_seen": 436289408, + "step": 11120 + }, + { + "epoch": 0.539296443453823, + "grad_norm": 0.3815999925136566, + "learning_rate": 2.3036371363371008e-05, + "loss": 1.1443, + "num_input_tokens_seen": 436692436, + "step": 11130 + }, + { + "epoch": 0.5397809865297025, + "grad_norm": 0.38345128297805786, + "learning_rate": 2.2997260413591156e-05, + "loss": 1.1892, + "num_input_tokens_seen": 437100692, + "step": 11140 + }, + { + "epoch": 0.5402655296055819, + "grad_norm": 0.4016791880130768, + "learning_rate": 2.295815439648934e-05, + "loss": 1.1396, + "num_input_tokens_seen": 437475216, + "step": 11150 + }, + { + "epoch": 0.5407500726814614, + "grad_norm": 0.387517511844635, + "learning_rate": 2.2919053408382306e-05, + "loss": 1.1204, + "num_input_tokens_seen": 437882256, + "step": 11160 + }, + { + "epoch": 0.5412346157573408, + "grad_norm": 0.3928276598453522, + "learning_rate": 2.287995754557445e-05, + "loss": 1.1519, + "num_input_tokens_seen": 438262044, + "step": 11170 + }, + { + "epoch": 0.5417191588332203, + "grad_norm": 0.4152311682701111, + "learning_rate": 2.2840866904357495e-05, + "loss": 1.1304, + "num_input_tokens_seen": 438668324, + "step": 11180 + }, + { + "epoch": 0.5422037019090997, + "grad_norm": 0.4021238684654236, + "learning_rate": 2.2801781581010362e-05, + "loss": 1.1578, + "num_input_tokens_seen": 439061528, + "step": 11190 + }, + { + "epoch": 0.5426882449849791, + "grad_norm": 0.3985769748687744, + "learning_rate": 2.2762701671798833e-05, + "loss": 1.1735, + "num_input_tokens_seen": 439445292, + "step": 11200 + }, + { + "epoch": 0.5431727880608586, + "grad_norm": 0.4368831515312195, + "learning_rate": 2.2723627272975352e-05, + "loss": 1.155, + "num_input_tokens_seen": 439862688, + "step": 11210 + }, + { + "epoch": 0.5436573311367381, + "grad_norm": 0.4021163880825043, + "learning_rate": 2.2684558480778833e-05, + "loss": 1.1758, + "num_input_tokens_seen": 440251824, + "step": 11220 + }, + { + "epoch": 0.5441418742126175, + "grad_norm": 0.41846776008605957, + "learning_rate": 2.264549539143434e-05, + "loss": 1.1374, + "num_input_tokens_seen": 440647480, + "step": 11230 + }, + { + "epoch": 0.5446264172884969, + "grad_norm": 0.38505181670188904, + "learning_rate": 2.2606438101152893e-05, + "loss": 1.1702, + "num_input_tokens_seen": 441032116, + "step": 11240 + }, + { + "epoch": 0.5451109603643763, + "grad_norm": 0.3722958564758301, + "learning_rate": 2.2567386706131268e-05, + "loss": 1.1441, + "num_input_tokens_seen": 441435016, + "step": 11250 + }, + { + "epoch": 0.5455955034402559, + "grad_norm": 0.38270628452301025, + "learning_rate": 2.2528341302551666e-05, + "loss": 1.1893, + "num_input_tokens_seen": 441836580, + "step": 11260 + }, + { + "epoch": 0.5460800465161353, + "grad_norm": 0.40590032935142517, + "learning_rate": 2.2489301986581586e-05, + "loss": 1.116, + "num_input_tokens_seen": 442232080, + "step": 11270 + }, + { + "epoch": 0.5465645895920147, + "grad_norm": 0.36881017684936523, + "learning_rate": 2.2450268854373497e-05, + "loss": 1.1247, + "num_input_tokens_seen": 442637844, + "step": 11280 + }, + { + "epoch": 0.5470491326678941, + "grad_norm": 0.38571688532829285, + "learning_rate": 2.2411242002064637e-05, + "loss": 1.1632, + "num_input_tokens_seen": 443047888, + "step": 11290 + }, + { + "epoch": 0.5475336757437737, + "grad_norm": 0.402986079454422, + "learning_rate": 2.23722215257768e-05, + "loss": 1.1353, + "num_input_tokens_seen": 443450848, + "step": 11300 + }, + { + "epoch": 0.5480182188196531, + "grad_norm": 0.43902143836021423, + "learning_rate": 2.2333207521616056e-05, + "loss": 1.1409, + "num_input_tokens_seen": 443825900, + "step": 11310 + }, + { + "epoch": 0.5485027618955325, + "grad_norm": 0.4374961853027344, + "learning_rate": 2.2294200085672552e-05, + "loss": 1.1557, + "num_input_tokens_seen": 444255800, + "step": 11320 + }, + { + "epoch": 0.5489873049714119, + "grad_norm": 0.38471776247024536, + "learning_rate": 2.225519931402024e-05, + "loss": 1.1903, + "num_input_tokens_seen": 444647372, + "step": 11330 + }, + { + "epoch": 0.5494718480472914, + "grad_norm": 0.3870150148868561, + "learning_rate": 2.2216205302716656e-05, + "loss": 1.165, + "num_input_tokens_seen": 445028328, + "step": 11340 + }, + { + "epoch": 0.5499563911231709, + "grad_norm": 0.3954487144947052, + "learning_rate": 2.217721814780272e-05, + "loss": 1.1201, + "num_input_tokens_seen": 445435648, + "step": 11350 + }, + { + "epoch": 0.5504409341990503, + "grad_norm": 0.4321858286857605, + "learning_rate": 2.2138237945302412e-05, + "loss": 1.0994, + "num_input_tokens_seen": 445822760, + "step": 11360 + }, + { + "epoch": 0.5509254772749297, + "grad_norm": 0.3976816236972809, + "learning_rate": 2.2099264791222643e-05, + "loss": 1.1399, + "num_input_tokens_seen": 446231820, + "step": 11370 + }, + { + "epoch": 0.5514100203508092, + "grad_norm": 0.3802758455276489, + "learning_rate": 2.2060298781552927e-05, + "loss": 1.1505, + "num_input_tokens_seen": 446644088, + "step": 11380 + }, + { + "epoch": 0.5518945634266886, + "grad_norm": 0.39267027378082275, + "learning_rate": 2.2021340012265177e-05, + "loss": 1.1583, + "num_input_tokens_seen": 447026360, + "step": 11390 + }, + { + "epoch": 0.552379106502568, + "grad_norm": 0.41610804200172424, + "learning_rate": 2.198238857931352e-05, + "loss": 1.1108, + "num_input_tokens_seen": 447431376, + "step": 11400 + }, + { + "epoch": 0.5528636495784475, + "grad_norm": 0.40714430809020996, + "learning_rate": 2.1943444578633957e-05, + "loss": 1.1341, + "num_input_tokens_seen": 447810844, + "step": 11410 + }, + { + "epoch": 0.553348192654327, + "grad_norm": 0.3614059090614319, + "learning_rate": 2.1904508106144208e-05, + "loss": 1.165, + "num_input_tokens_seen": 448200712, + "step": 11420 + }, + { + "epoch": 0.5538327357302064, + "grad_norm": 0.3718690574169159, + "learning_rate": 2.1865579257743475e-05, + "loss": 1.0994, + "num_input_tokens_seen": 448585024, + "step": 11430 + }, + { + "epoch": 0.5543172788060858, + "grad_norm": 0.4221544861793518, + "learning_rate": 2.1826658129312133e-05, + "loss": 1.1858, + "num_input_tokens_seen": 448991036, + "step": 11440 + }, + { + "epoch": 0.5548018218819653, + "grad_norm": 0.40332892537117004, + "learning_rate": 2.178774481671159e-05, + "loss": 1.1761, + "num_input_tokens_seen": 449381496, + "step": 11450 + }, + { + "epoch": 0.5552863649578448, + "grad_norm": 0.4298264682292938, + "learning_rate": 2.174883941578397e-05, + "loss": 1.1593, + "num_input_tokens_seen": 449808272, + "step": 11460 + }, + { + "epoch": 0.5557709080337242, + "grad_norm": 0.4092737138271332, + "learning_rate": 2.1709942022351924e-05, + "loss": 1.1434, + "num_input_tokens_seen": 450187828, + "step": 11470 + }, + { + "epoch": 0.5562554511096036, + "grad_norm": 0.4071583151817322, + "learning_rate": 2.1671052732218392e-05, + "loss": 1.1578, + "num_input_tokens_seen": 450578040, + "step": 11480 + }, + { + "epoch": 0.5567399941854831, + "grad_norm": 0.4042060673236847, + "learning_rate": 2.1632171641166326e-05, + "loss": 1.114, + "num_input_tokens_seen": 450979104, + "step": 11490 + }, + { + "epoch": 0.5572245372613626, + "grad_norm": 0.41152459383010864, + "learning_rate": 2.1593298844958526e-05, + "loss": 1.1724, + "num_input_tokens_seen": 451369344, + "step": 11500 + }, + { + "epoch": 0.557709080337242, + "grad_norm": 0.3623616099357605, + "learning_rate": 2.1554434439337326e-05, + "loss": 1.1765, + "num_input_tokens_seen": 451771544, + "step": 11510 + }, + { + "epoch": 0.5581936234131214, + "grad_norm": 0.41300123929977417, + "learning_rate": 2.15155785200244e-05, + "loss": 1.1843, + "num_input_tokens_seen": 452152352, + "step": 11520 + }, + { + "epoch": 0.5586781664890009, + "grad_norm": 0.41391250491142273, + "learning_rate": 2.147673118272054e-05, + "loss": 1.141, + "num_input_tokens_seen": 452586876, + "step": 11530 + }, + { + "epoch": 0.5591627095648803, + "grad_norm": 0.37428995966911316, + "learning_rate": 2.1437892523105378e-05, + "loss": 1.1809, + "num_input_tokens_seen": 452992560, + "step": 11540 + }, + { + "epoch": 0.5596472526407598, + "grad_norm": 0.42412886023521423, + "learning_rate": 2.1399062636837197e-05, + "loss": 1.1461, + "num_input_tokens_seen": 453342104, + "step": 11550 + }, + { + "epoch": 0.5601317957166392, + "grad_norm": 0.3886395990848541, + "learning_rate": 2.1360241619552652e-05, + "loss": 1.1342, + "num_input_tokens_seen": 453754860, + "step": 11560 + }, + { + "epoch": 0.5606163387925187, + "grad_norm": 0.3729265332221985, + "learning_rate": 2.1321429566866542e-05, + "loss": 1.1745, + "num_input_tokens_seen": 454158212, + "step": 11570 + }, + { + "epoch": 0.5611008818683981, + "grad_norm": 0.3990989625453949, + "learning_rate": 2.1282626574371635e-05, + "loss": 1.1643, + "num_input_tokens_seen": 454533420, + "step": 11580 + }, + { + "epoch": 0.5615854249442775, + "grad_norm": 0.39453041553497314, + "learning_rate": 2.124383273763834e-05, + "loss": 1.1723, + "num_input_tokens_seen": 454918724, + "step": 11590 + }, + { + "epoch": 0.562069968020157, + "grad_norm": 0.3955458700656891, + "learning_rate": 2.120504815221452e-05, + "loss": 1.1748, + "num_input_tokens_seen": 455320488, + "step": 11600 + }, + { + "epoch": 0.5625545110960365, + "grad_norm": 0.3768325448036194, + "learning_rate": 2.1166272913625273e-05, + "loss": 1.1634, + "num_input_tokens_seen": 455716484, + "step": 11610 + }, + { + "epoch": 0.5630390541719159, + "grad_norm": 0.3783731758594513, + "learning_rate": 2.1127507117372657e-05, + "loss": 1.1953, + "num_input_tokens_seen": 456099764, + "step": 11620 + }, + { + "epoch": 0.5635235972477953, + "grad_norm": 0.41152095794677734, + "learning_rate": 2.1088750858935495e-05, + "loss": 1.1831, + "num_input_tokens_seen": 456506480, + "step": 11630 + }, + { + "epoch": 0.5640081403236747, + "grad_norm": 0.414093017578125, + "learning_rate": 2.1050004233769094e-05, + "loss": 1.1475, + "num_input_tokens_seen": 456916444, + "step": 11640 + }, + { + "epoch": 0.5644926833995543, + "grad_norm": 0.41902777552604675, + "learning_rate": 2.1011267337305036e-05, + "loss": 1.1574, + "num_input_tokens_seen": 457314824, + "step": 11650 + }, + { + "epoch": 0.5649772264754337, + "grad_norm": 0.40867382287979126, + "learning_rate": 2.0972540264950976e-05, + "loss": 1.0926, + "num_input_tokens_seen": 457711980, + "step": 11660 + }, + { + "epoch": 0.5654617695513131, + "grad_norm": 0.3919362425804138, + "learning_rate": 2.0933823112090322e-05, + "loss": 1.2073, + "num_input_tokens_seen": 458128016, + "step": 11670 + }, + { + "epoch": 0.5659463126271925, + "grad_norm": 0.3909580707550049, + "learning_rate": 2.0895115974082106e-05, + "loss": 1.1525, + "num_input_tokens_seen": 458509664, + "step": 11680 + }, + { + "epoch": 0.5664308557030721, + "grad_norm": 0.38012978434562683, + "learning_rate": 2.0856418946260643e-05, + "loss": 1.146, + "num_input_tokens_seen": 458918900, + "step": 11690 + }, + { + "epoch": 0.5669153987789515, + "grad_norm": 0.378833144903183, + "learning_rate": 2.0817732123935364e-05, + "loss": 1.1811, + "num_input_tokens_seen": 459322936, + "step": 11700 + }, + { + "epoch": 0.5673999418548309, + "grad_norm": 0.37148529291152954, + "learning_rate": 2.0779055602390583e-05, + "loss": 1.1755, + "num_input_tokens_seen": 459735016, + "step": 11710 + }, + { + "epoch": 0.5678844849307103, + "grad_norm": 0.3815120458602905, + "learning_rate": 2.0740389476885223e-05, + "loss": 1.1825, + "num_input_tokens_seen": 460131788, + "step": 11720 + }, + { + "epoch": 0.5683690280065898, + "grad_norm": 0.3902805745601654, + "learning_rate": 2.070173384265261e-05, + "loss": 1.1369, + "num_input_tokens_seen": 460515608, + "step": 11730 + }, + { + "epoch": 0.5688535710824693, + "grad_norm": 0.40333831310272217, + "learning_rate": 2.066308879490023e-05, + "loss": 1.1596, + "num_input_tokens_seen": 460897436, + "step": 11740 + }, + { + "epoch": 0.5693381141583487, + "grad_norm": 0.39738982915878296, + "learning_rate": 2.0624454428809484e-05, + "loss": 1.182, + "num_input_tokens_seen": 461288352, + "step": 11750 + }, + { + "epoch": 0.5698226572342281, + "grad_norm": 0.38377317786216736, + "learning_rate": 2.0585830839535487e-05, + "loss": 1.1371, + "num_input_tokens_seen": 461665964, + "step": 11760 + }, + { + "epoch": 0.5703072003101076, + "grad_norm": 0.40340209007263184, + "learning_rate": 2.054721812220678e-05, + "loss": 1.1388, + "num_input_tokens_seen": 462099048, + "step": 11770 + }, + { + "epoch": 0.570791743385987, + "grad_norm": 0.39503028988838196, + "learning_rate": 2.0508616371925156e-05, + "loss": 1.1972, + "num_input_tokens_seen": 462477648, + "step": 11780 + }, + { + "epoch": 0.5712762864618665, + "grad_norm": 0.3878142237663269, + "learning_rate": 2.0470025683765392e-05, + "loss": 1.1394, + "num_input_tokens_seen": 462857744, + "step": 11790 + }, + { + "epoch": 0.5717608295377459, + "grad_norm": 0.430828720331192, + "learning_rate": 2.043144615277499e-05, + "loss": 1.1204, + "num_input_tokens_seen": 463231476, + "step": 11800 + }, + { + "epoch": 0.5722453726136254, + "grad_norm": 0.38779589533805847, + "learning_rate": 2.039287787397402e-05, + "loss": 1.1468, + "num_input_tokens_seen": 463608072, + "step": 11810 + }, + { + "epoch": 0.5727299156895048, + "grad_norm": 0.40600141882896423, + "learning_rate": 2.0354320942354804e-05, + "loss": 1.1389, + "num_input_tokens_seen": 464007424, + "step": 11820 + }, + { + "epoch": 0.5732144587653842, + "grad_norm": 0.3781258463859558, + "learning_rate": 2.0315775452881708e-05, + "loss": 1.172, + "num_input_tokens_seen": 464436116, + "step": 11830 + }, + { + "epoch": 0.5736990018412637, + "grad_norm": 0.41185423731803894, + "learning_rate": 2.027724150049096e-05, + "loss": 1.1674, + "num_input_tokens_seen": 464834636, + "step": 11840 + }, + { + "epoch": 0.5741835449171432, + "grad_norm": 0.39568862318992615, + "learning_rate": 2.0238719180090323e-05, + "loss": 1.1769, + "num_input_tokens_seen": 465219480, + "step": 11850 + }, + { + "epoch": 0.5746680879930226, + "grad_norm": 0.3740871846675873, + "learning_rate": 2.0200208586558954e-05, + "loss": 1.1428, + "num_input_tokens_seen": 465608864, + "step": 11860 + }, + { + "epoch": 0.575152631068902, + "grad_norm": 0.39056453108787537, + "learning_rate": 2.0161709814747102e-05, + "loss": 1.1356, + "num_input_tokens_seen": 466002328, + "step": 11870 + }, + { + "epoch": 0.5756371741447814, + "grad_norm": 0.46836405992507935, + "learning_rate": 2.012322295947589e-05, + "loss": 1.1183, + "num_input_tokens_seen": 466418136, + "step": 11880 + }, + { + "epoch": 0.576121717220661, + "grad_norm": 0.41011425852775574, + "learning_rate": 2.0084748115537126e-05, + "loss": 1.1868, + "num_input_tokens_seen": 466816508, + "step": 11890 + }, + { + "epoch": 0.5766062602965404, + "grad_norm": 0.4022364914417267, + "learning_rate": 2.0046285377692998e-05, + "loss": 1.1622, + "num_input_tokens_seen": 467208132, + "step": 11900 + }, + { + "epoch": 0.5770908033724198, + "grad_norm": 0.4446437656879425, + "learning_rate": 2.0007834840675905e-05, + "loss": 1.2062, + "num_input_tokens_seen": 467607752, + "step": 11910 + }, + { + "epoch": 0.5775753464482992, + "grad_norm": 0.4171381890773773, + "learning_rate": 1.9969396599188177e-05, + "loss": 1.1675, + "num_input_tokens_seen": 467989092, + "step": 11920 + }, + { + "epoch": 0.5780598895241787, + "grad_norm": 0.44026637077331543, + "learning_rate": 1.993097074790186e-05, + "loss": 1.1417, + "num_input_tokens_seen": 468356440, + "step": 11930 + }, + { + "epoch": 0.5785444326000582, + "grad_norm": 0.3875192105770111, + "learning_rate": 1.989255738145851e-05, + "loss": 1.1676, + "num_input_tokens_seen": 468736536, + "step": 11940 + }, + { + "epoch": 0.5790289756759376, + "grad_norm": 0.37714606523513794, + "learning_rate": 1.9854156594468905e-05, + "loss": 1.1685, + "num_input_tokens_seen": 469122032, + "step": 11950 + }, + { + "epoch": 0.579513518751817, + "grad_norm": 0.3938728868961334, + "learning_rate": 1.9815768481512837e-05, + "loss": 1.1166, + "num_input_tokens_seen": 469516476, + "step": 11960 + }, + { + "epoch": 0.5799980618276965, + "grad_norm": 0.4008789360523224, + "learning_rate": 1.9777393137138916e-05, + "loss": 1.1795, + "num_input_tokens_seen": 469938440, + "step": 11970 + }, + { + "epoch": 0.580482604903576, + "grad_norm": 0.3709718585014343, + "learning_rate": 1.9739030655864263e-05, + "loss": 1.1366, + "num_input_tokens_seen": 470313748, + "step": 11980 + }, + { + "epoch": 0.5809671479794554, + "grad_norm": 0.4143054485321045, + "learning_rate": 1.9700681132174356e-05, + "loss": 1.1339, + "num_input_tokens_seen": 470678456, + "step": 11990 + }, + { + "epoch": 0.5814516910553348, + "grad_norm": 0.3943856656551361, + "learning_rate": 1.9662344660522726e-05, + "loss": 1.1755, + "num_input_tokens_seen": 471101508, + "step": 12000 + }, + { + "epoch": 0.5814516910553348, + "eval_loss": 1.1437398195266724, + "eval_runtime": 7.1807, + "eval_samples_per_second": 20.889, + "eval_steps_per_second": 2.646, + "num_input_tokens_seen": 471101508, + "step": 12000 + }, + { + "epoch": 0.5819362341312143, + "grad_norm": 0.40113458037376404, + "learning_rate": 1.9624021335330767e-05, + "loss": 1.1483, + "num_input_tokens_seen": 471482556, + "step": 12010 + }, + { + "epoch": 0.5824207772070937, + "grad_norm": 0.3723471760749817, + "learning_rate": 1.9585711250987515e-05, + "loss": 1.0867, + "num_input_tokens_seen": 471888220, + "step": 12020 + }, + { + "epoch": 0.5829053202829731, + "grad_norm": 0.3816375732421875, + "learning_rate": 1.9547414501849363e-05, + "loss": 1.1351, + "num_input_tokens_seen": 472248060, + "step": 12030 + }, + { + "epoch": 0.5833898633588526, + "grad_norm": 0.41664794087409973, + "learning_rate": 1.9509131182239875e-05, + "loss": 1.1705, + "num_input_tokens_seen": 472639804, + "step": 12040 + }, + { + "epoch": 0.5838744064347321, + "grad_norm": 0.3829415440559387, + "learning_rate": 1.9470861386449546e-05, + "loss": 1.1315, + "num_input_tokens_seen": 473018176, + "step": 12050 + }, + { + "epoch": 0.5843589495106115, + "grad_norm": 0.408346027135849, + "learning_rate": 1.9432605208735543e-05, + "loss": 1.1286, + "num_input_tokens_seen": 473403200, + "step": 12060 + }, + { + "epoch": 0.5848434925864909, + "grad_norm": 0.3734830915927887, + "learning_rate": 1.9394362743321516e-05, + "loss": 1.1347, + "num_input_tokens_seen": 473796676, + "step": 12070 + }, + { + "epoch": 0.5853280356623703, + "grad_norm": 0.3995228111743927, + "learning_rate": 1.9356134084397305e-05, + "loss": 1.159, + "num_input_tokens_seen": 474192924, + "step": 12080 + }, + { + "epoch": 0.5858125787382499, + "grad_norm": 0.38744309544563293, + "learning_rate": 1.9317919326118793e-05, + "loss": 1.1302, + "num_input_tokens_seen": 474587284, + "step": 12090 + }, + { + "epoch": 0.5862971218141293, + "grad_norm": 0.40833401679992676, + "learning_rate": 1.9279718562607595e-05, + "loss": 1.1293, + "num_input_tokens_seen": 474984376, + "step": 12100 + }, + { + "epoch": 0.5867816648900087, + "grad_norm": 0.40676310658454895, + "learning_rate": 1.9241531887950853e-05, + "loss": 1.1125, + "num_input_tokens_seen": 475382208, + "step": 12110 + }, + { + "epoch": 0.5872662079658881, + "grad_norm": 0.4175487756729126, + "learning_rate": 1.9203359396201038e-05, + "loss": 1.1302, + "num_input_tokens_seen": 475803188, + "step": 12120 + }, + { + "epoch": 0.5877507510417677, + "grad_norm": 0.39633074402809143, + "learning_rate": 1.9165201181375663e-05, + "loss": 1.1573, + "num_input_tokens_seen": 476200608, + "step": 12130 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.38707002997398376, + "learning_rate": 1.9127057337457077e-05, + "loss": 1.1641, + "num_input_tokens_seen": 476617248, + "step": 12140 + }, + { + "epoch": 0.5887198371935265, + "grad_norm": 0.38113346695899963, + "learning_rate": 1.908892795839226e-05, + "loss": 1.1381, + "num_input_tokens_seen": 476978804, + "step": 12150 + }, + { + "epoch": 0.5892043802694059, + "grad_norm": 0.40942734479904175, + "learning_rate": 1.905081313809253e-05, + "loss": 1.1546, + "num_input_tokens_seen": 477371880, + "step": 12160 + }, + { + "epoch": 0.5896889233452854, + "grad_norm": 0.3737110495567322, + "learning_rate": 1.9012712970433384e-05, + "loss": 1.1463, + "num_input_tokens_seen": 477751464, + "step": 12170 + }, + { + "epoch": 0.5901734664211649, + "grad_norm": 0.37110334634780884, + "learning_rate": 1.8974627549254205e-05, + "loss": 1.1607, + "num_input_tokens_seen": 478117284, + "step": 12180 + }, + { + "epoch": 0.5906580094970443, + "grad_norm": 0.38362741470336914, + "learning_rate": 1.8936556968358047e-05, + "loss": 1.1784, + "num_input_tokens_seen": 478503476, + "step": 12190 + }, + { + "epoch": 0.5911425525729237, + "grad_norm": 0.4000930190086365, + "learning_rate": 1.8898501321511445e-05, + "loss": 1.0922, + "num_input_tokens_seen": 478897364, + "step": 12200 + }, + { + "epoch": 0.5916270956488032, + "grad_norm": 0.39831140637397766, + "learning_rate": 1.8860460702444126e-05, + "loss": 1.1765, + "num_input_tokens_seen": 479272824, + "step": 12210 + }, + { + "epoch": 0.5921116387246826, + "grad_norm": 0.37732118368148804, + "learning_rate": 1.8822435204848827e-05, + "loss": 1.1729, + "num_input_tokens_seen": 479653192, + "step": 12220 + }, + { + "epoch": 0.5925961818005621, + "grad_norm": 0.3948591649532318, + "learning_rate": 1.8784424922381015e-05, + "loss": 1.136, + "num_input_tokens_seen": 480072752, + "step": 12230 + }, + { + "epoch": 0.5930807248764415, + "grad_norm": 0.3917677700519562, + "learning_rate": 1.8746429948658693e-05, + "loss": 1.1598, + "num_input_tokens_seen": 480459552, + "step": 12240 + }, + { + "epoch": 0.593565267952321, + "grad_norm": 0.4181142747402191, + "learning_rate": 1.8708450377262178e-05, + "loss": 1.109, + "num_input_tokens_seen": 480856544, + "step": 12250 + }, + { + "epoch": 0.5940498110282004, + "grad_norm": 0.4221828281879425, + "learning_rate": 1.867048630173381e-05, + "loss": 1.1264, + "num_input_tokens_seen": 481254676, + "step": 12260 + }, + { + "epoch": 0.5945343541040798, + "grad_norm": 0.3877246379852295, + "learning_rate": 1.8632537815577812e-05, + "loss": 1.1524, + "num_input_tokens_seen": 481664936, + "step": 12270 + }, + { + "epoch": 0.5950188971799593, + "grad_norm": 0.41641271114349365, + "learning_rate": 1.859460501225998e-05, + "loss": 1.1492, + "num_input_tokens_seen": 482076064, + "step": 12280 + }, + { + "epoch": 0.5955034402558388, + "grad_norm": 0.38876596093177795, + "learning_rate": 1.8556687985207473e-05, + "loss": 1.1841, + "num_input_tokens_seen": 482481220, + "step": 12290 + }, + { + "epoch": 0.5959879833317182, + "grad_norm": 0.372016966342926, + "learning_rate": 1.851878682780864e-05, + "loss": 1.1443, + "num_input_tokens_seen": 482858776, + "step": 12300 + }, + { + "epoch": 0.5964725264075976, + "grad_norm": 0.4116594195365906, + "learning_rate": 1.84809016334127e-05, + "loss": 1.168, + "num_input_tokens_seen": 483260116, + "step": 12310 + }, + { + "epoch": 0.596957069483477, + "grad_norm": 0.42143404483795166, + "learning_rate": 1.8443032495329564e-05, + "loss": 1.1707, + "num_input_tokens_seen": 483647520, + "step": 12320 + }, + { + "epoch": 0.5974416125593566, + "grad_norm": 0.41614919900894165, + "learning_rate": 1.8405179506829622e-05, + "loss": 1.1575, + "num_input_tokens_seen": 484057976, + "step": 12330 + }, + { + "epoch": 0.597926155635236, + "grad_norm": 0.39331093430519104, + "learning_rate": 1.836734276114346e-05, + "loss": 1.1582, + "num_input_tokens_seen": 484441132, + "step": 12340 + }, + { + "epoch": 0.5984106987111154, + "grad_norm": 0.41087356209754944, + "learning_rate": 1.832952235146166e-05, + "loss": 1.1851, + "num_input_tokens_seen": 484824128, + "step": 12350 + }, + { + "epoch": 0.5988952417869948, + "grad_norm": 0.3742915093898773, + "learning_rate": 1.829171837093459e-05, + "loss": 1.1018, + "num_input_tokens_seen": 485241940, + "step": 12360 + }, + { + "epoch": 0.5993797848628744, + "grad_norm": 0.3956594169139862, + "learning_rate": 1.8253930912672136e-05, + "loss": 1.1769, + "num_input_tokens_seen": 485653000, + "step": 12370 + }, + { + "epoch": 0.5998643279387538, + "grad_norm": 0.4172208905220032, + "learning_rate": 1.8216160069743498e-05, + "loss": 1.1215, + "num_input_tokens_seen": 486031148, + "step": 12380 + }, + { + "epoch": 0.6003488710146332, + "grad_norm": 0.3961176872253418, + "learning_rate": 1.8178405935176933e-05, + "loss": 1.1929, + "num_input_tokens_seen": 486411448, + "step": 12390 + }, + { + "epoch": 0.6008334140905126, + "grad_norm": 0.3706035315990448, + "learning_rate": 1.8140668601959593e-05, + "loss": 1.0883, + "num_input_tokens_seen": 486792408, + "step": 12400 + }, + { + "epoch": 0.6013179571663921, + "grad_norm": 0.3640589416027069, + "learning_rate": 1.81029481630372e-05, + "loss": 1.1478, + "num_input_tokens_seen": 487182316, + "step": 12410 + }, + { + "epoch": 0.6018025002422716, + "grad_norm": 0.4026361107826233, + "learning_rate": 1.806524471131388e-05, + "loss": 1.1486, + "num_input_tokens_seen": 487604672, + "step": 12420 + }, + { + "epoch": 0.602287043318151, + "grad_norm": 0.41077134013175964, + "learning_rate": 1.8027558339651936e-05, + "loss": 1.1469, + "num_input_tokens_seen": 487986272, + "step": 12430 + }, + { + "epoch": 0.6027715863940304, + "grad_norm": 0.394106924533844, + "learning_rate": 1.7989889140871583e-05, + "loss": 1.0973, + "num_input_tokens_seen": 488373412, + "step": 12440 + }, + { + "epoch": 0.6032561294699099, + "grad_norm": 0.3956287205219269, + "learning_rate": 1.795223720775076e-05, + "loss": 1.118, + "num_input_tokens_seen": 488785752, + "step": 12450 + }, + { + "epoch": 0.6037406725457893, + "grad_norm": 0.3825465142726898, + "learning_rate": 1.791460263302487e-05, + "loss": 1.1358, + "num_input_tokens_seen": 489205628, + "step": 12460 + }, + { + "epoch": 0.6042252156216688, + "grad_norm": 0.3835325539112091, + "learning_rate": 1.7876985509386547e-05, + "loss": 1.1225, + "num_input_tokens_seen": 489595928, + "step": 12470 + }, + { + "epoch": 0.6047097586975482, + "grad_norm": 0.3921869397163391, + "learning_rate": 1.7839385929485482e-05, + "loss": 1.1232, + "num_input_tokens_seen": 490009524, + "step": 12480 + }, + { + "epoch": 0.6051943017734277, + "grad_norm": 0.4052605628967285, + "learning_rate": 1.7801803985928117e-05, + "loss": 1.1823, + "num_input_tokens_seen": 490436288, + "step": 12490 + }, + { + "epoch": 0.6056788448493071, + "grad_norm": 0.37396615743637085, + "learning_rate": 1.7764239771277477e-05, + "loss": 1.1319, + "num_input_tokens_seen": 490820096, + "step": 12500 + }, + { + "epoch": 0.6061633879251865, + "grad_norm": 0.3906390368938446, + "learning_rate": 1.772669337805292e-05, + "loss": 1.1324, + "num_input_tokens_seen": 491197804, + "step": 12510 + }, + { + "epoch": 0.606647931001066, + "grad_norm": 0.3880655765533447, + "learning_rate": 1.768916489872991e-05, + "loss": 1.1389, + "num_input_tokens_seen": 491590844, + "step": 12520 + }, + { + "epoch": 0.6071324740769455, + "grad_norm": 0.42076969146728516, + "learning_rate": 1.765165442573979e-05, + "loss": 1.1312, + "num_input_tokens_seen": 491987104, + "step": 12530 + }, + { + "epoch": 0.6076170171528249, + "grad_norm": 0.4096836745738983, + "learning_rate": 1.7614162051469545e-05, + "loss": 1.1339, + "num_input_tokens_seen": 492387748, + "step": 12540 + }, + { + "epoch": 0.6081015602287043, + "grad_norm": 0.3795408606529236, + "learning_rate": 1.7576687868261587e-05, + "loss": 1.1246, + "num_input_tokens_seen": 492784488, + "step": 12550 + }, + { + "epoch": 0.6085861033045837, + "grad_norm": 0.38204097747802734, + "learning_rate": 1.7539231968413546e-05, + "loss": 1.1139, + "num_input_tokens_seen": 493156420, + "step": 12560 + }, + { + "epoch": 0.6090706463804633, + "grad_norm": 0.4025273621082306, + "learning_rate": 1.7501794444177975e-05, + "loss": 1.1592, + "num_input_tokens_seen": 493529324, + "step": 12570 + }, + { + "epoch": 0.6095551894563427, + "grad_norm": 0.39617040753364563, + "learning_rate": 1.746437538776222e-05, + "loss": 1.1537, + "num_input_tokens_seen": 493929468, + "step": 12580 + }, + { + "epoch": 0.6100397325322221, + "grad_norm": 0.39346054196357727, + "learning_rate": 1.742697489132811e-05, + "loss": 1.1642, + "num_input_tokens_seen": 494356540, + "step": 12590 + }, + { + "epoch": 0.6105242756081015, + "grad_norm": 0.4220488369464874, + "learning_rate": 1.738959304699176e-05, + "loss": 1.1657, + "num_input_tokens_seen": 494737364, + "step": 12600 + }, + { + "epoch": 0.611008818683981, + "grad_norm": 0.38811561465263367, + "learning_rate": 1.735222994682336e-05, + "loss": 1.131, + "num_input_tokens_seen": 495145924, + "step": 12610 + }, + { + "epoch": 0.6114933617598605, + "grad_norm": 0.3839203417301178, + "learning_rate": 1.7314885682846925e-05, + "loss": 1.1502, + "num_input_tokens_seen": 495543252, + "step": 12620 + }, + { + "epoch": 0.6119779048357399, + "grad_norm": 0.4147859811782837, + "learning_rate": 1.7277560347040094e-05, + "loss": 1.219, + "num_input_tokens_seen": 495924020, + "step": 12630 + }, + { + "epoch": 0.6124624479116193, + "grad_norm": 0.3909815549850464, + "learning_rate": 1.7240254031333862e-05, + "loss": 1.1642, + "num_input_tokens_seen": 496317616, + "step": 12640 + }, + { + "epoch": 0.6129469909874988, + "grad_norm": 0.3842679560184479, + "learning_rate": 1.720296682761238e-05, + "loss": 1.142, + "num_input_tokens_seen": 496696256, + "step": 12650 + }, + { + "epoch": 0.6134315340633782, + "grad_norm": 0.4022013545036316, + "learning_rate": 1.716569882771274e-05, + "loss": 1.1284, + "num_input_tokens_seen": 497103872, + "step": 12660 + }, + { + "epoch": 0.6139160771392577, + "grad_norm": 0.39619848132133484, + "learning_rate": 1.7128450123424746e-05, + "loss": 1.1532, + "num_input_tokens_seen": 497505888, + "step": 12670 + }, + { + "epoch": 0.6144006202151371, + "grad_norm": 0.3860088586807251, + "learning_rate": 1.709122080649064e-05, + "loss": 1.1758, + "num_input_tokens_seen": 497903692, + "step": 12680 + }, + { + "epoch": 0.6148851632910166, + "grad_norm": 0.377780020236969, + "learning_rate": 1.705401096860496e-05, + "loss": 1.1119, + "num_input_tokens_seen": 498287500, + "step": 12690 + }, + { + "epoch": 0.615369706366896, + "grad_norm": 0.3980233371257782, + "learning_rate": 1.7016820701414215e-05, + "loss": 1.1614, + "num_input_tokens_seen": 498685408, + "step": 12700 + }, + { + "epoch": 0.6158542494427754, + "grad_norm": 0.399446964263916, + "learning_rate": 1.697965009651677e-05, + "loss": 1.1788, + "num_input_tokens_seen": 499066376, + "step": 12710 + }, + { + "epoch": 0.6163387925186549, + "grad_norm": 0.3826245069503784, + "learning_rate": 1.6942499245462525e-05, + "loss": 1.1421, + "num_input_tokens_seen": 499446672, + "step": 12720 + }, + { + "epoch": 0.6168233355945344, + "grad_norm": 0.39101335406303406, + "learning_rate": 1.6905368239752718e-05, + "loss": 1.1129, + "num_input_tokens_seen": 499832256, + "step": 12730 + }, + { + "epoch": 0.6173078786704138, + "grad_norm": 0.41338834166526794, + "learning_rate": 1.686825717083975e-05, + "loss": 1.1999, + "num_input_tokens_seen": 500220888, + "step": 12740 + }, + { + "epoch": 0.6177924217462932, + "grad_norm": 0.4286590814590454, + "learning_rate": 1.6831166130126872e-05, + "loss": 1.1657, + "num_input_tokens_seen": 500641332, + "step": 12750 + }, + { + "epoch": 0.6182769648221726, + "grad_norm": 0.4031652510166168, + "learning_rate": 1.6794095208968058e-05, + "loss": 1.1097, + "num_input_tokens_seen": 501041940, + "step": 12760 + }, + { + "epoch": 0.6187615078980522, + "grad_norm": 0.3872615098953247, + "learning_rate": 1.675704449866768e-05, + "loss": 1.1681, + "num_input_tokens_seen": 501421856, + "step": 12770 + }, + { + "epoch": 0.6192460509739316, + "grad_norm": 0.4028913676738739, + "learning_rate": 1.672001409048034e-05, + "loss": 1.1452, + "num_input_tokens_seen": 501803240, + "step": 12780 + }, + { + "epoch": 0.619730594049811, + "grad_norm": 0.4178786277770996, + "learning_rate": 1.668300407561067e-05, + "loss": 1.1493, + "num_input_tokens_seen": 502206632, + "step": 12790 + }, + { + "epoch": 0.6202151371256904, + "grad_norm": 0.4024272859096527, + "learning_rate": 1.6646014545213042e-05, + "loss": 1.1246, + "num_input_tokens_seen": 502597668, + "step": 12800 + }, + { + "epoch": 0.62069968020157, + "grad_norm": 0.38439643383026123, + "learning_rate": 1.6609045590391383e-05, + "loss": 1.1421, + "num_input_tokens_seen": 502992140, + "step": 12810 + }, + { + "epoch": 0.6211842232774494, + "grad_norm": 0.44478264451026917, + "learning_rate": 1.6572097302198935e-05, + "loss": 1.116, + "num_input_tokens_seen": 503374384, + "step": 12820 + }, + { + "epoch": 0.6216687663533288, + "grad_norm": 0.44358116388320923, + "learning_rate": 1.6535169771638066e-05, + "loss": 1.127, + "num_input_tokens_seen": 503789460, + "step": 12830 + }, + { + "epoch": 0.6221533094292082, + "grad_norm": 0.3874736428260803, + "learning_rate": 1.6498263089659992e-05, + "loss": 1.1076, + "num_input_tokens_seen": 504149520, + "step": 12840 + }, + { + "epoch": 0.6226378525050877, + "grad_norm": 0.3996816873550415, + "learning_rate": 1.6461377347164593e-05, + "loss": 1.1729, + "num_input_tokens_seen": 504529724, + "step": 12850 + }, + { + "epoch": 0.6231223955809672, + "grad_norm": 0.39820224046707153, + "learning_rate": 1.6424512635000158e-05, + "loss": 1.1639, + "num_input_tokens_seen": 504913244, + "step": 12860 + }, + { + "epoch": 0.6236069386568466, + "grad_norm": 0.4100916385650635, + "learning_rate": 1.638766904396321e-05, + "loss": 1.1485, + "num_input_tokens_seen": 505324308, + "step": 12870 + }, + { + "epoch": 0.624091481732726, + "grad_norm": 0.38851940631866455, + "learning_rate": 1.635084666479822e-05, + "loss": 1.1161, + "num_input_tokens_seen": 505696332, + "step": 12880 + }, + { + "epoch": 0.6245760248086055, + "grad_norm": 0.4355764091014862, + "learning_rate": 1.6314045588197442e-05, + "loss": 1.143, + "num_input_tokens_seen": 506083568, + "step": 12890 + }, + { + "epoch": 0.6250605678844849, + "grad_norm": 0.38407522439956665, + "learning_rate": 1.6277265904800643e-05, + "loss": 1.1396, + "num_input_tokens_seen": 506497556, + "step": 12900 + }, + { + "epoch": 0.6255451109603644, + "grad_norm": 0.38687968254089355, + "learning_rate": 1.6240507705194896e-05, + "loss": 1.1215, + "num_input_tokens_seen": 506881424, + "step": 12910 + }, + { + "epoch": 0.6260296540362438, + "grad_norm": 0.36193808913230896, + "learning_rate": 1.6203771079914387e-05, + "loss": 1.1487, + "num_input_tokens_seen": 507256568, + "step": 12920 + }, + { + "epoch": 0.6265141971121233, + "grad_norm": 0.3949004113674164, + "learning_rate": 1.6167056119440125e-05, + "loss": 1.1569, + "num_input_tokens_seen": 507669200, + "step": 12930 + }, + { + "epoch": 0.6269987401880027, + "grad_norm": 0.39322030544281006, + "learning_rate": 1.6130362914199814e-05, + "loss": 1.0929, + "num_input_tokens_seen": 508034880, + "step": 12940 + }, + { + "epoch": 0.6274832832638821, + "grad_norm": 0.40019360184669495, + "learning_rate": 1.6093691554567524e-05, + "loss": 1.1317, + "num_input_tokens_seen": 508462956, + "step": 12950 + }, + { + "epoch": 0.6279678263397616, + "grad_norm": 0.40196293592453003, + "learning_rate": 1.6057042130863538e-05, + "loss": 1.0989, + "num_input_tokens_seen": 508848948, + "step": 12960 + }, + { + "epoch": 0.6284523694156411, + "grad_norm": 0.4627667963504791, + "learning_rate": 1.6020414733354117e-05, + "loss": 1.1436, + "num_input_tokens_seen": 509221768, + "step": 12970 + }, + { + "epoch": 0.6289369124915205, + "grad_norm": 0.42447420954704285, + "learning_rate": 1.598380945225127e-05, + "loss": 1.1207, + "num_input_tokens_seen": 509643908, + "step": 12980 + }, + { + "epoch": 0.6294214555673999, + "grad_norm": 0.3753610849380493, + "learning_rate": 1.594722637771253e-05, + "loss": 1.1599, + "num_input_tokens_seen": 510024072, + "step": 12990 + }, + { + "epoch": 0.6299059986432793, + "grad_norm": 0.39578545093536377, + "learning_rate": 1.5910665599840745e-05, + "loss": 1.1634, + "num_input_tokens_seen": 510426748, + "step": 13000 + }, + { + "epoch": 0.6303905417191589, + "grad_norm": 0.4332728087902069, + "learning_rate": 1.5874127208683824e-05, + "loss": 1.0877, + "num_input_tokens_seen": 510818184, + "step": 13010 + }, + { + "epoch": 0.6308750847950383, + "grad_norm": 0.3980608284473419, + "learning_rate": 1.5837611294234583e-05, + "loss": 1.102, + "num_input_tokens_seen": 511196012, + "step": 13020 + }, + { + "epoch": 0.6313596278709177, + "grad_norm": 0.3887230455875397, + "learning_rate": 1.5801117946430434e-05, + "loss": 1.1257, + "num_input_tokens_seen": 511590376, + "step": 13030 + }, + { + "epoch": 0.6318441709467971, + "grad_norm": 0.4023469388484955, + "learning_rate": 1.576464725515322e-05, + "loss": 1.13, + "num_input_tokens_seen": 511985844, + "step": 13040 + }, + { + "epoch": 0.6323287140226767, + "grad_norm": 0.3778144121170044, + "learning_rate": 1.5728199310229e-05, + "loss": 1.0993, + "num_input_tokens_seen": 512401748, + "step": 13050 + }, + { + "epoch": 0.6328132570985561, + "grad_norm": 0.3801812529563904, + "learning_rate": 1.5691774201427772e-05, + "loss": 1.0846, + "num_input_tokens_seen": 512818064, + "step": 13060 + }, + { + "epoch": 0.6332978001744355, + "grad_norm": 0.3784249722957611, + "learning_rate": 1.565537201846335e-05, + "loss": 1.1541, + "num_input_tokens_seen": 513220236, + "step": 13070 + }, + { + "epoch": 0.6337823432503149, + "grad_norm": 0.4744737148284912, + "learning_rate": 1.5618992850993025e-05, + "loss": 1.1496, + "num_input_tokens_seen": 513594808, + "step": 13080 + }, + { + "epoch": 0.6342668863261944, + "grad_norm": 0.3975614607334137, + "learning_rate": 1.5582636788617412e-05, + "loss": 1.176, + "num_input_tokens_seen": 513981028, + "step": 13090 + }, + { + "epoch": 0.6347514294020739, + "grad_norm": 0.3835643231868744, + "learning_rate": 1.5546303920880256e-05, + "loss": 1.1276, + "num_input_tokens_seen": 514381624, + "step": 13100 + }, + { + "epoch": 0.6352359724779533, + "grad_norm": 0.3918743431568146, + "learning_rate": 1.5509994337268126e-05, + "loss": 1.1648, + "num_input_tokens_seen": 514799552, + "step": 13110 + }, + { + "epoch": 0.6357205155538327, + "grad_norm": 0.3977094888687134, + "learning_rate": 1.547370812721028e-05, + "loss": 1.0923, + "num_input_tokens_seen": 515216184, + "step": 13120 + }, + { + "epoch": 0.6362050586297122, + "grad_norm": 0.41210541129112244, + "learning_rate": 1.5437445380078383e-05, + "loss": 1.1419, + "num_input_tokens_seen": 515617056, + "step": 13130 + }, + { + "epoch": 0.6366896017055916, + "grad_norm": 0.3935719132423401, + "learning_rate": 1.5401206185186323e-05, + "loss": 1.1472, + "num_input_tokens_seen": 516021628, + "step": 13140 + }, + { + "epoch": 0.637174144781471, + "grad_norm": 0.4016217291355133, + "learning_rate": 1.536499063178999e-05, + "loss": 1.1494, + "num_input_tokens_seen": 516424464, + "step": 13150 + }, + { + "epoch": 0.6376586878573505, + "grad_norm": 0.41335245966911316, + "learning_rate": 1.5328798809087012e-05, + "loss": 1.1859, + "num_input_tokens_seen": 516827724, + "step": 13160 + }, + { + "epoch": 0.63814323093323, + "grad_norm": 0.38563454151153564, + "learning_rate": 1.5292630806216603e-05, + "loss": 1.1049, + "num_input_tokens_seen": 517224524, + "step": 13170 + }, + { + "epoch": 0.6386277740091094, + "grad_norm": 0.4143526256084442, + "learning_rate": 1.5256486712259304e-05, + "loss": 1.1405, + "num_input_tokens_seen": 517641892, + "step": 13180 + }, + { + "epoch": 0.6391123170849888, + "grad_norm": 0.3733633756637573, + "learning_rate": 1.5220366616236748e-05, + "loss": 1.1688, + "num_input_tokens_seen": 518022120, + "step": 13190 + }, + { + "epoch": 0.6395968601608683, + "grad_norm": 0.412308931350708, + "learning_rate": 1.5184270607111494e-05, + "loss": 1.1234, + "num_input_tokens_seen": 518438460, + "step": 13200 + }, + { + "epoch": 0.6400814032367478, + "grad_norm": 0.3862762749195099, + "learning_rate": 1.5148198773786754e-05, + "loss": 1.14, + "num_input_tokens_seen": 518813036, + "step": 13210 + }, + { + "epoch": 0.6405659463126272, + "grad_norm": 0.4147486984729767, + "learning_rate": 1.5112151205106182e-05, + "loss": 1.1246, + "num_input_tokens_seen": 519216952, + "step": 13220 + }, + { + "epoch": 0.6410504893885066, + "grad_norm": 0.40653976798057556, + "learning_rate": 1.507612798985371e-05, + "loss": 1.115, + "num_input_tokens_seen": 519604276, + "step": 13230 + }, + { + "epoch": 0.641535032464386, + "grad_norm": 0.4051119089126587, + "learning_rate": 1.5040129216753257e-05, + "loss": 1.179, + "num_input_tokens_seen": 519988992, + "step": 13240 + }, + { + "epoch": 0.6420195755402656, + "grad_norm": 0.39531031250953674, + "learning_rate": 1.5004154974468548e-05, + "loss": 1.1244, + "num_input_tokens_seen": 520363784, + "step": 13250 + }, + { + "epoch": 0.642504118616145, + "grad_norm": 0.38629281520843506, + "learning_rate": 1.4968205351602899e-05, + "loss": 1.115, + "num_input_tokens_seen": 520766492, + "step": 13260 + }, + { + "epoch": 0.6429886616920244, + "grad_norm": 0.3945416808128357, + "learning_rate": 1.4932280436698976e-05, + "loss": 1.1265, + "num_input_tokens_seen": 521157024, + "step": 13270 + }, + { + "epoch": 0.6434732047679038, + "grad_norm": 0.3776805102825165, + "learning_rate": 1.48963803182386e-05, + "loss": 1.1165, + "num_input_tokens_seen": 521579912, + "step": 13280 + }, + { + "epoch": 0.6439577478437833, + "grad_norm": 0.4211878776550293, + "learning_rate": 1.4860505084642506e-05, + "loss": 1.1249, + "num_input_tokens_seen": 521991792, + "step": 13290 + }, + { + "epoch": 0.6444422909196628, + "grad_norm": 0.4258326292037964, + "learning_rate": 1.4824654824270168e-05, + "loss": 1.1046, + "num_input_tokens_seen": 522392304, + "step": 13300 + }, + { + "epoch": 0.6449268339955422, + "grad_norm": 0.40192142128944397, + "learning_rate": 1.478882962541952e-05, + "loss": 1.1577, + "num_input_tokens_seen": 522783384, + "step": 13310 + }, + { + "epoch": 0.6454113770714216, + "grad_norm": 0.3894496262073517, + "learning_rate": 1.475302957632677e-05, + "loss": 1.1205, + "num_input_tokens_seen": 523150196, + "step": 13320 + }, + { + "epoch": 0.6458959201473011, + "grad_norm": 0.4126818776130676, + "learning_rate": 1.471725476516622e-05, + "loss": 1.1076, + "num_input_tokens_seen": 523551404, + "step": 13330 + }, + { + "epoch": 0.6463804632231805, + "grad_norm": 0.37312477827072144, + "learning_rate": 1.4681505280049974e-05, + "loss": 1.1852, + "num_input_tokens_seen": 523955836, + "step": 13340 + }, + { + "epoch": 0.64686500629906, + "grad_norm": 0.40087607502937317, + "learning_rate": 1.4645781209027764e-05, + "loss": 1.1758, + "num_input_tokens_seen": 524356324, + "step": 13350 + }, + { + "epoch": 0.6473495493749394, + "grad_norm": 0.38394778966903687, + "learning_rate": 1.4610082640086754e-05, + "loss": 1.108, + "num_input_tokens_seen": 524759624, + "step": 13360 + }, + { + "epoch": 0.6478340924508189, + "grad_norm": 0.4227401912212372, + "learning_rate": 1.4574409661151264e-05, + "loss": 1.1216, + "num_input_tokens_seen": 525129624, + "step": 13370 + }, + { + "epoch": 0.6483186355266983, + "grad_norm": 0.3866701126098633, + "learning_rate": 1.4538762360082608e-05, + "loss": 1.1328, + "num_input_tokens_seen": 525533224, + "step": 13380 + }, + { + "epoch": 0.6488031786025777, + "grad_norm": 0.39452120661735535, + "learning_rate": 1.4503140824678863e-05, + "loss": 1.124, + "num_input_tokens_seen": 525921192, + "step": 13390 + }, + { + "epoch": 0.6492877216784572, + "grad_norm": 0.3937327563762665, + "learning_rate": 1.4467545142674599e-05, + "loss": 1.132, + "num_input_tokens_seen": 526287908, + "step": 13400 + }, + { + "epoch": 0.6497722647543367, + "grad_norm": 0.427117258310318, + "learning_rate": 1.4431975401740783e-05, + "loss": 1.1116, + "num_input_tokens_seen": 526650256, + "step": 13410 + }, + { + "epoch": 0.6502568078302161, + "grad_norm": 0.36921098828315735, + "learning_rate": 1.4396431689484416e-05, + "loss": 1.0976, + "num_input_tokens_seen": 527032076, + "step": 13420 + }, + { + "epoch": 0.6507413509060955, + "grad_norm": 0.3914394676685333, + "learning_rate": 1.4360914093448463e-05, + "loss": 1.2131, + "num_input_tokens_seen": 527466664, + "step": 13430 + }, + { + "epoch": 0.651225893981975, + "grad_norm": 0.4258521497249603, + "learning_rate": 1.4325422701111502e-05, + "loss": 1.1817, + "num_input_tokens_seen": 527850952, + "step": 13440 + }, + { + "epoch": 0.6517104370578545, + "grad_norm": 0.401259183883667, + "learning_rate": 1.428995759988761e-05, + "loss": 1.158, + "num_input_tokens_seen": 528263332, + "step": 13450 + }, + { + "epoch": 0.6521949801337339, + "grad_norm": 0.3809134066104889, + "learning_rate": 1.4254518877126095e-05, + "loss": 1.1234, + "num_input_tokens_seen": 528686588, + "step": 13460 + }, + { + "epoch": 0.6526795232096133, + "grad_norm": 0.406120240688324, + "learning_rate": 1.42191066201113e-05, + "loss": 1.1635, + "num_input_tokens_seen": 529070212, + "step": 13470 + }, + { + "epoch": 0.6531640662854927, + "grad_norm": 0.43414533138275146, + "learning_rate": 1.418372091606239e-05, + "loss": 1.1747, + "num_input_tokens_seen": 529477424, + "step": 13480 + }, + { + "epoch": 0.6536486093613723, + "grad_norm": 0.4020240306854248, + "learning_rate": 1.4148361852133129e-05, + "loss": 1.1394, + "num_input_tokens_seen": 529879856, + "step": 13490 + }, + { + "epoch": 0.6541331524372517, + "grad_norm": 0.39953121542930603, + "learning_rate": 1.4113029515411647e-05, + "loss": 1.1363, + "num_input_tokens_seen": 530288184, + "step": 13500 + }, + { + "epoch": 0.6546176955131311, + "grad_norm": 0.39526060223579407, + "learning_rate": 1.407772399292027e-05, + "loss": 1.1483, + "num_input_tokens_seen": 530697004, + "step": 13510 + }, + { + "epoch": 0.6551022385890105, + "grad_norm": 0.4059954583644867, + "learning_rate": 1.4042445371615271e-05, + "loss": 1.125, + "num_input_tokens_seen": 531092792, + "step": 13520 + }, + { + "epoch": 0.65558678166489, + "grad_norm": 0.40413933992385864, + "learning_rate": 1.4007193738386675e-05, + "loss": 1.1345, + "num_input_tokens_seen": 531481940, + "step": 13530 + }, + { + "epoch": 0.6560713247407695, + "grad_norm": 0.3758378326892853, + "learning_rate": 1.3971969180058032e-05, + "loss": 1.139, + "num_input_tokens_seen": 531883224, + "step": 13540 + }, + { + "epoch": 0.6565558678166489, + "grad_norm": 0.3990527093410492, + "learning_rate": 1.3936771783386183e-05, + "loss": 1.1284, + "num_input_tokens_seen": 532298904, + "step": 13550 + }, + { + "epoch": 0.6570404108925283, + "grad_norm": 0.403590589761734, + "learning_rate": 1.390160163506113e-05, + "loss": 1.1493, + "num_input_tokens_seen": 532676364, + "step": 13560 + }, + { + "epoch": 0.6575249539684078, + "grad_norm": 0.41753387451171875, + "learning_rate": 1.3866458821705697e-05, + "loss": 1.1165, + "num_input_tokens_seen": 533090008, + "step": 13570 + }, + { + "epoch": 0.6580094970442872, + "grad_norm": 0.40354228019714355, + "learning_rate": 1.3831343429875421e-05, + "loss": 1.1542, + "num_input_tokens_seen": 533475684, + "step": 13580 + }, + { + "epoch": 0.6584940401201667, + "grad_norm": 0.40463098883628845, + "learning_rate": 1.3796255546058293e-05, + "loss": 1.1574, + "num_input_tokens_seen": 533869320, + "step": 13590 + }, + { + "epoch": 0.6589785831960461, + "grad_norm": 0.40255841612815857, + "learning_rate": 1.3761195256674554e-05, + "loss": 1.1158, + "num_input_tokens_seen": 534226148, + "step": 13600 + }, + { + "epoch": 0.6594631262719256, + "grad_norm": 0.4016367197036743, + "learning_rate": 1.3726162648076474e-05, + "loss": 1.1712, + "num_input_tokens_seen": 534636196, + "step": 13610 + }, + { + "epoch": 0.659947669347805, + "grad_norm": 0.4226371645927429, + "learning_rate": 1.3691157806548167e-05, + "loss": 1.1584, + "num_input_tokens_seen": 535027492, + "step": 13620 + }, + { + "epoch": 0.6604322124236844, + "grad_norm": 0.4005202651023865, + "learning_rate": 1.365618081830532e-05, + "loss": 1.1075, + "num_input_tokens_seen": 535420836, + "step": 13630 + }, + { + "epoch": 0.6609167554995639, + "grad_norm": 0.38361644744873047, + "learning_rate": 1.3621231769495047e-05, + "loss": 1.1318, + "num_input_tokens_seen": 535798348, + "step": 13640 + }, + { + "epoch": 0.6614012985754434, + "grad_norm": 0.36114785075187683, + "learning_rate": 1.3586310746195641e-05, + "loss": 1.1685, + "num_input_tokens_seen": 536163432, + "step": 13650 + }, + { + "epoch": 0.6618858416513228, + "grad_norm": 0.452426940202713, + "learning_rate": 1.3551417834416375e-05, + "loss": 1.1241, + "num_input_tokens_seen": 536559076, + "step": 13660 + }, + { + "epoch": 0.6623703847272022, + "grad_norm": 0.43133866786956787, + "learning_rate": 1.3516553120097281e-05, + "loss": 1.1362, + "num_input_tokens_seen": 536949788, + "step": 13670 + }, + { + "epoch": 0.6628549278030816, + "grad_norm": 0.3915277421474457, + "learning_rate": 1.3481716689108915e-05, + "loss": 1.1215, + "num_input_tokens_seen": 537337428, + "step": 13680 + }, + { + "epoch": 0.6633394708789612, + "grad_norm": 0.38239341974258423, + "learning_rate": 1.3446908627252236e-05, + "loss": 1.1594, + "num_input_tokens_seen": 537726764, + "step": 13690 + }, + { + "epoch": 0.6638240139548406, + "grad_norm": 0.4042087495326996, + "learning_rate": 1.3412129020258257e-05, + "loss": 1.1515, + "num_input_tokens_seen": 538088164, + "step": 13700 + }, + { + "epoch": 0.66430855703072, + "grad_norm": 0.39968568086624146, + "learning_rate": 1.3377377953787956e-05, + "loss": 1.116, + "num_input_tokens_seen": 538462580, + "step": 13710 + }, + { + "epoch": 0.6647931001065994, + "grad_norm": 0.3845781683921814, + "learning_rate": 1.3342655513432001e-05, + "loss": 1.1221, + "num_input_tokens_seen": 538858660, + "step": 13720 + }, + { + "epoch": 0.665277643182479, + "grad_norm": 0.37981632351875305, + "learning_rate": 1.3307961784710554e-05, + "loss": 1.1724, + "num_input_tokens_seen": 539260832, + "step": 13730 + }, + { + "epoch": 0.6657621862583584, + "grad_norm": 0.4112668037414551, + "learning_rate": 1.327329685307307e-05, + "loss": 1.1521, + "num_input_tokens_seen": 539654112, + "step": 13740 + }, + { + "epoch": 0.6662467293342378, + "grad_norm": 0.3671356439590454, + "learning_rate": 1.3238660803898074e-05, + "loss": 1.1909, + "num_input_tokens_seen": 540039560, + "step": 13750 + }, + { + "epoch": 0.6667312724101173, + "grad_norm": 0.3972877562046051, + "learning_rate": 1.3204053722492927e-05, + "loss": 1.1792, + "num_input_tokens_seen": 540430488, + "step": 13760 + }, + { + "epoch": 0.6672158154859967, + "grad_norm": 0.39763057231903076, + "learning_rate": 1.3169475694093703e-05, + "loss": 1.1322, + "num_input_tokens_seen": 540829364, + "step": 13770 + }, + { + "epoch": 0.6677003585618762, + "grad_norm": 0.37179917097091675, + "learning_rate": 1.3134926803864861e-05, + "loss": 1.1592, + "num_input_tokens_seen": 541255364, + "step": 13780 + }, + { + "epoch": 0.6681849016377556, + "grad_norm": 0.3995148241519928, + "learning_rate": 1.3100407136899123e-05, + "loss": 1.103, + "num_input_tokens_seen": 541631036, + "step": 13790 + }, + { + "epoch": 0.6686694447136351, + "grad_norm": 0.4086041748523712, + "learning_rate": 1.3065916778217235e-05, + "loss": 1.1492, + "num_input_tokens_seen": 542006572, + "step": 13800 + }, + { + "epoch": 0.6691539877895145, + "grad_norm": 0.3766721785068512, + "learning_rate": 1.3031455812767746e-05, + "loss": 1.1115, + "num_input_tokens_seen": 542379296, + "step": 13810 + }, + { + "epoch": 0.6696385308653939, + "grad_norm": 0.4227388799190521, + "learning_rate": 1.299702432542683e-05, + "loss": 1.0872, + "num_input_tokens_seen": 542769344, + "step": 13820 + }, + { + "epoch": 0.6701230739412734, + "grad_norm": 0.4330199658870697, + "learning_rate": 1.2962622400998012e-05, + "loss": 1.0849, + "num_input_tokens_seen": 543147408, + "step": 13830 + }, + { + "epoch": 0.6706076170171529, + "grad_norm": 0.37317484617233276, + "learning_rate": 1.292825012421208e-05, + "loss": 1.1378, + "num_input_tokens_seen": 543533556, + "step": 13840 + }, + { + "epoch": 0.6710921600930323, + "grad_norm": 0.39739182591438293, + "learning_rate": 1.2893907579726728e-05, + "loss": 1.1708, + "num_input_tokens_seen": 543906020, + "step": 13850 + }, + { + "epoch": 0.6715767031689117, + "grad_norm": 0.4071529507637024, + "learning_rate": 1.2859594852126456e-05, + "loss": 1.1109, + "num_input_tokens_seen": 544321144, + "step": 13860 + }, + { + "epoch": 0.6720612462447911, + "grad_norm": 0.398400217294693, + "learning_rate": 1.282531202592232e-05, + "loss": 1.0842, + "num_input_tokens_seen": 544708388, + "step": 13870 + }, + { + "epoch": 0.6725457893206707, + "grad_norm": 0.37718960642814636, + "learning_rate": 1.2791059185551744e-05, + "loss": 1.1508, + "num_input_tokens_seen": 545104336, + "step": 13880 + }, + { + "epoch": 0.6730303323965501, + "grad_norm": 0.41166213154792786, + "learning_rate": 1.2756836415378254e-05, + "loss": 1.1302, + "num_input_tokens_seen": 545501896, + "step": 13890 + }, + { + "epoch": 0.6735148754724295, + "grad_norm": 0.4132916033267975, + "learning_rate": 1.2722643799691378e-05, + "loss": 1.1038, + "num_input_tokens_seen": 545881328, + "step": 13900 + }, + { + "epoch": 0.6739994185483089, + "grad_norm": 0.40238797664642334, + "learning_rate": 1.2688481422706309e-05, + "loss": 1.146, + "num_input_tokens_seen": 546279016, + "step": 13910 + }, + { + "epoch": 0.6744839616241884, + "grad_norm": 0.4064416289329529, + "learning_rate": 1.2654349368563828e-05, + "loss": 1.1636, + "num_input_tokens_seen": 546677216, + "step": 13920 + }, + { + "epoch": 0.6749685047000679, + "grad_norm": 0.37984856963157654, + "learning_rate": 1.2620247721329973e-05, + "loss": 1.1299, + "num_input_tokens_seen": 547080472, + "step": 13930 + }, + { + "epoch": 0.6754530477759473, + "grad_norm": 0.4335133135318756, + "learning_rate": 1.2586176564995922e-05, + "loss": 1.1437, + "num_input_tokens_seen": 547472432, + "step": 13940 + }, + { + "epoch": 0.6759375908518267, + "grad_norm": 0.3851531445980072, + "learning_rate": 1.2552135983477756e-05, + "loss": 1.1503, + "num_input_tokens_seen": 547869104, + "step": 13950 + }, + { + "epoch": 0.6764221339277062, + "grad_norm": 0.37984663248062134, + "learning_rate": 1.251812606061624e-05, + "loss": 1.1761, + "num_input_tokens_seen": 548278152, + "step": 13960 + }, + { + "epoch": 0.6769066770035856, + "grad_norm": 0.40836653113365173, + "learning_rate": 1.2484146880176642e-05, + "loss": 1.066, + "num_input_tokens_seen": 548692648, + "step": 13970 + }, + { + "epoch": 0.6773912200794651, + "grad_norm": 0.38975247740745544, + "learning_rate": 1.2450198525848487e-05, + "loss": 1.0798, + "num_input_tokens_seen": 549066912, + "step": 13980 + }, + { + "epoch": 0.6778757631553445, + "grad_norm": 0.3853222727775574, + "learning_rate": 1.2416281081245398e-05, + "loss": 1.1685, + "num_input_tokens_seen": 549440944, + "step": 13990 + }, + { + "epoch": 0.678360306231224, + "grad_norm": 0.45054277777671814, + "learning_rate": 1.2382394629904864e-05, + "loss": 1.1473, + "num_input_tokens_seen": 549831184, + "step": 14000 + }, + { + "epoch": 0.678360306231224, + "eval_loss": 1.1321347951889038, + "eval_runtime": 5.0521, + "eval_samples_per_second": 29.69, + "eval_steps_per_second": 3.761, + "num_input_tokens_seen": 549831184, + "step": 14000 + }, + { + "epoch": 0.6788448493071034, + "grad_norm": 0.3913303315639496, + "learning_rate": 1.2348539255288038e-05, + "loss": 1.1504, + "num_input_tokens_seen": 550197984, + "step": 14010 + }, + { + "epoch": 0.6793293923829828, + "grad_norm": 0.38616275787353516, + "learning_rate": 1.2314715040779534e-05, + "loss": 1.1412, + "num_input_tokens_seen": 550565288, + "step": 14020 + }, + { + "epoch": 0.6798139354588623, + "grad_norm": 0.376402884721756, + "learning_rate": 1.2280922069687225e-05, + "loss": 1.1712, + "num_input_tokens_seen": 550966008, + "step": 14030 + }, + { + "epoch": 0.6802984785347418, + "grad_norm": 0.4074733853340149, + "learning_rate": 1.2247160425241996e-05, + "loss": 1.0974, + "num_input_tokens_seen": 551358528, + "step": 14040 + }, + { + "epoch": 0.6807830216106212, + "grad_norm": 0.41833576560020447, + "learning_rate": 1.221343019059764e-05, + "loss": 1.1279, + "num_input_tokens_seen": 551759700, + "step": 14050 + }, + { + "epoch": 0.6812675646865006, + "grad_norm": 0.36511725187301636, + "learning_rate": 1.217973144883053e-05, + "loss": 1.1514, + "num_input_tokens_seen": 552164368, + "step": 14060 + }, + { + "epoch": 0.68175210776238, + "grad_norm": 0.3876798152923584, + "learning_rate": 1.2146064282939501e-05, + "loss": 1.169, + "num_input_tokens_seen": 552571428, + "step": 14070 + }, + { + "epoch": 0.6822366508382596, + "grad_norm": 0.3926098942756653, + "learning_rate": 1.2112428775845616e-05, + "loss": 1.1665, + "num_input_tokens_seen": 552952628, + "step": 14080 + }, + { + "epoch": 0.682721193914139, + "grad_norm": 0.3909384608268738, + "learning_rate": 1.2078825010391958e-05, + "loss": 1.133, + "num_input_tokens_seen": 553359608, + "step": 14090 + }, + { + "epoch": 0.6832057369900184, + "grad_norm": 0.38221150636672974, + "learning_rate": 1.2045253069343448e-05, + "loss": 1.1773, + "num_input_tokens_seen": 553757744, + "step": 14100 + }, + { + "epoch": 0.6836902800658978, + "grad_norm": 0.42475947737693787, + "learning_rate": 1.2011713035386588e-05, + "loss": 1.1495, + "num_input_tokens_seen": 554165892, + "step": 14110 + }, + { + "epoch": 0.6841748231417774, + "grad_norm": 0.41192540526390076, + "learning_rate": 1.1978204991129324e-05, + "loss": 1.1358, + "num_input_tokens_seen": 554549428, + "step": 14120 + }, + { + "epoch": 0.6846593662176568, + "grad_norm": 0.390726238489151, + "learning_rate": 1.1944729019100808e-05, + "loss": 1.1225, + "num_input_tokens_seen": 554948408, + "step": 14130 + }, + { + "epoch": 0.6851439092935362, + "grad_norm": 0.40107494592666626, + "learning_rate": 1.19112852017512e-05, + "loss": 1.1305, + "num_input_tokens_seen": 555381500, + "step": 14140 + }, + { + "epoch": 0.6856284523694156, + "grad_norm": 0.41240257024765015, + "learning_rate": 1.1877873621451453e-05, + "loss": 1.1169, + "num_input_tokens_seen": 555772300, + "step": 14150 + }, + { + "epoch": 0.6861129954452951, + "grad_norm": 0.42891547083854675, + "learning_rate": 1.1844494360493141e-05, + "loss": 1.095, + "num_input_tokens_seen": 556200948, + "step": 14160 + }, + { + "epoch": 0.6865975385211746, + "grad_norm": 0.4002200663089752, + "learning_rate": 1.1811147501088196e-05, + "loss": 1.188, + "num_input_tokens_seen": 556610528, + "step": 14170 + }, + { + "epoch": 0.687082081597054, + "grad_norm": 0.4111470878124237, + "learning_rate": 1.1777833125368812e-05, + "loss": 1.1927, + "num_input_tokens_seen": 557009708, + "step": 14180 + }, + { + "epoch": 0.6875666246729334, + "grad_norm": 0.43058955669403076, + "learning_rate": 1.17445513153871e-05, + "loss": 1.1684, + "num_input_tokens_seen": 557438504, + "step": 14190 + }, + { + "epoch": 0.6880511677488129, + "grad_norm": 0.36157649755477905, + "learning_rate": 1.1711302153115045e-05, + "loss": 1.1805, + "num_input_tokens_seen": 557855252, + "step": 14200 + }, + { + "epoch": 0.6885357108246923, + "grad_norm": 0.3946475088596344, + "learning_rate": 1.1678085720444142e-05, + "loss": 1.0994, + "num_input_tokens_seen": 558247492, + "step": 14210 + }, + { + "epoch": 0.6890202539005718, + "grad_norm": 0.38171178102493286, + "learning_rate": 1.1644902099185328e-05, + "loss": 1.1378, + "num_input_tokens_seen": 558657816, + "step": 14220 + }, + { + "epoch": 0.6895047969764512, + "grad_norm": 0.38918769359588623, + "learning_rate": 1.1611751371068706e-05, + "loss": 1.1926, + "num_input_tokens_seen": 559061308, + "step": 14230 + }, + { + "epoch": 0.6899893400523307, + "grad_norm": 0.4294280707836151, + "learning_rate": 1.1578633617743373e-05, + "loss": 1.1388, + "num_input_tokens_seen": 559449040, + "step": 14240 + }, + { + "epoch": 0.6904738831282101, + "grad_norm": 0.4078969359397888, + "learning_rate": 1.1545548920777194e-05, + "loss": 1.1432, + "num_input_tokens_seen": 559819924, + "step": 14250 + }, + { + "epoch": 0.6909584262040895, + "grad_norm": 0.3768039643764496, + "learning_rate": 1.1512497361656633e-05, + "loss": 1.1597, + "num_input_tokens_seen": 560234120, + "step": 14260 + }, + { + "epoch": 0.691442969279969, + "grad_norm": 0.3907431364059448, + "learning_rate": 1.1479479021786533e-05, + "loss": 1.1175, + "num_input_tokens_seen": 560648180, + "step": 14270 + }, + { + "epoch": 0.6919275123558485, + "grad_norm": 0.415067195892334, + "learning_rate": 1.1446493982489916e-05, + "loss": 1.1103, + "num_input_tokens_seen": 561025820, + "step": 14280 + }, + { + "epoch": 0.6924120554317279, + "grad_norm": 0.3955281674861908, + "learning_rate": 1.1413542325007804e-05, + "loss": 1.1182, + "num_input_tokens_seen": 561453128, + "step": 14290 + }, + { + "epoch": 0.6928965985076073, + "grad_norm": 0.3895666003227234, + "learning_rate": 1.1380624130498946e-05, + "loss": 1.165, + "num_input_tokens_seen": 561854100, + "step": 14300 + }, + { + "epoch": 0.6933811415834867, + "grad_norm": 0.40961936116218567, + "learning_rate": 1.134773948003976e-05, + "loss": 1.1452, + "num_input_tokens_seen": 562259444, + "step": 14310 + }, + { + "epoch": 0.6938656846593663, + "grad_norm": 0.4259204864501953, + "learning_rate": 1.1314888454623951e-05, + "loss": 1.1066, + "num_input_tokens_seen": 562672632, + "step": 14320 + }, + { + "epoch": 0.6943502277352457, + "grad_norm": 0.44359973073005676, + "learning_rate": 1.1282071135162498e-05, + "loss": 1.1223, + "num_input_tokens_seen": 563095012, + "step": 14330 + }, + { + "epoch": 0.6948347708111251, + "grad_norm": 0.4470522701740265, + "learning_rate": 1.1249287602483285e-05, + "loss": 1.1879, + "num_input_tokens_seen": 563479840, + "step": 14340 + }, + { + "epoch": 0.6953193138870045, + "grad_norm": 0.39908698201179504, + "learning_rate": 1.1216537937331028e-05, + "loss": 1.1204, + "num_input_tokens_seen": 563889984, + "step": 14350 + }, + { + "epoch": 0.695803856962884, + "grad_norm": 0.4189169704914093, + "learning_rate": 1.1183822220367014e-05, + "loss": 1.1517, + "num_input_tokens_seen": 564268180, + "step": 14360 + }, + { + "epoch": 0.6962884000387635, + "grad_norm": 0.3651569187641144, + "learning_rate": 1.1151140532168917e-05, + "loss": 1.1388, + "num_input_tokens_seen": 564649240, + "step": 14370 + }, + { + "epoch": 0.6967729431146429, + "grad_norm": 0.4237881898880005, + "learning_rate": 1.1118492953230594e-05, + "loss": 1.1211, + "num_input_tokens_seen": 565027528, + "step": 14380 + }, + { + "epoch": 0.6972574861905223, + "grad_norm": 0.41956138610839844, + "learning_rate": 1.1085879563961915e-05, + "loss": 1.1297, + "num_input_tokens_seen": 565436040, + "step": 14390 + }, + { + "epoch": 0.6977420292664018, + "grad_norm": 0.4476149082183838, + "learning_rate": 1.1053300444688502e-05, + "loss": 1.108, + "num_input_tokens_seen": 565816684, + "step": 14400 + }, + { + "epoch": 0.6982265723422812, + "grad_norm": 0.40152573585510254, + "learning_rate": 1.1020755675651605e-05, + "loss": 1.1131, + "num_input_tokens_seen": 566192928, + "step": 14410 + }, + { + "epoch": 0.6987111154181607, + "grad_norm": 0.4253696799278259, + "learning_rate": 1.0988245337007863e-05, + "loss": 1.1218, + "num_input_tokens_seen": 566589328, + "step": 14420 + }, + { + "epoch": 0.6991956584940401, + "grad_norm": 0.39764001965522766, + "learning_rate": 1.0955769508829103e-05, + "loss": 1.1141, + "num_input_tokens_seen": 566991652, + "step": 14430 + }, + { + "epoch": 0.6996802015699196, + "grad_norm": 0.3731628656387329, + "learning_rate": 1.0923328271102174e-05, + "loss": 1.1397, + "num_input_tokens_seen": 567388660, + "step": 14440 + }, + { + "epoch": 0.700164744645799, + "grad_norm": 0.4285131096839905, + "learning_rate": 1.0890921703728693e-05, + "loss": 1.1473, + "num_input_tokens_seen": 567783500, + "step": 14450 + }, + { + "epoch": 0.7006492877216784, + "grad_norm": 0.3825373649597168, + "learning_rate": 1.0858549886524944e-05, + "loss": 1.1373, + "num_input_tokens_seen": 568160888, + "step": 14460 + }, + { + "epoch": 0.7011338307975579, + "grad_norm": 0.38323846459388733, + "learning_rate": 1.0826212899221559e-05, + "loss": 1.1175, + "num_input_tokens_seen": 568542356, + "step": 14470 + }, + { + "epoch": 0.7016183738734374, + "grad_norm": 0.4034413695335388, + "learning_rate": 1.0793910821463424e-05, + "loss": 1.1396, + "num_input_tokens_seen": 568922996, + "step": 14480 + }, + { + "epoch": 0.7021029169493168, + "grad_norm": 0.4333634078502655, + "learning_rate": 1.076164373280944e-05, + "loss": 1.117, + "num_input_tokens_seen": 569333292, + "step": 14490 + }, + { + "epoch": 0.7025874600251962, + "grad_norm": 0.39302992820739746, + "learning_rate": 1.0729411712732319e-05, + "loss": 1.1303, + "num_input_tokens_seen": 569719056, + "step": 14500 + }, + { + "epoch": 0.7030720031010756, + "grad_norm": 0.39154550433158875, + "learning_rate": 1.0697214840618409e-05, + "loss": 1.2008, + "num_input_tokens_seen": 570111844, + "step": 14510 + }, + { + "epoch": 0.7035565461769552, + "grad_norm": 0.406896710395813, + "learning_rate": 1.0665053195767493e-05, + "loss": 1.1581, + "num_input_tokens_seen": 570534852, + "step": 14520 + }, + { + "epoch": 0.7040410892528346, + "grad_norm": 0.4043082892894745, + "learning_rate": 1.0632926857392567e-05, + "loss": 1.1445, + "num_input_tokens_seen": 570909832, + "step": 14530 + }, + { + "epoch": 0.704525632328714, + "grad_norm": 0.392946720123291, + "learning_rate": 1.0600835904619713e-05, + "loss": 1.1079, + "num_input_tokens_seen": 571297336, + "step": 14540 + }, + { + "epoch": 0.7050101754045934, + "grad_norm": 0.4595877528190613, + "learning_rate": 1.0568780416487811e-05, + "loss": 1.1688, + "num_input_tokens_seen": 571674808, + "step": 14550 + }, + { + "epoch": 0.705494718480473, + "grad_norm": 0.39945316314697266, + "learning_rate": 1.0536760471948423e-05, + "loss": 1.0807, + "num_input_tokens_seen": 572049484, + "step": 14560 + }, + { + "epoch": 0.7059792615563524, + "grad_norm": 0.4135000705718994, + "learning_rate": 1.0504776149865559e-05, + "loss": 1.0744, + "num_input_tokens_seen": 572426520, + "step": 14570 + }, + { + "epoch": 0.7064638046322318, + "grad_norm": 0.40249672532081604, + "learning_rate": 1.0472827529015494e-05, + "loss": 1.1261, + "num_input_tokens_seen": 572824836, + "step": 14580 + }, + { + "epoch": 0.7069483477081112, + "grad_norm": 0.4083980321884155, + "learning_rate": 1.0440914688086581e-05, + "loss": 1.1333, + "num_input_tokens_seen": 573191304, + "step": 14590 + }, + { + "epoch": 0.7074328907839907, + "grad_norm": 0.43228352069854736, + "learning_rate": 1.0409037705679018e-05, + "loss": 1.1248, + "num_input_tokens_seen": 573607132, + "step": 14600 + }, + { + "epoch": 0.7079174338598702, + "grad_norm": 0.3942347764968872, + "learning_rate": 1.0377196660304717e-05, + "loss": 1.1319, + "num_input_tokens_seen": 573989044, + "step": 14610 + }, + { + "epoch": 0.7084019769357496, + "grad_norm": 0.3767456114292145, + "learning_rate": 1.0345391630387064e-05, + "loss": 1.1459, + "num_input_tokens_seen": 574401880, + "step": 14620 + }, + { + "epoch": 0.708886520011629, + "grad_norm": 0.41538006067276, + "learning_rate": 1.0313622694260747e-05, + "loss": 1.136, + "num_input_tokens_seen": 574809836, + "step": 14630 + }, + { + "epoch": 0.7093710630875085, + "grad_norm": 0.3873327970504761, + "learning_rate": 1.0281889930171546e-05, + "loss": 1.2085, + "num_input_tokens_seen": 575187448, + "step": 14640 + }, + { + "epoch": 0.7098556061633879, + "grad_norm": 0.3875696063041687, + "learning_rate": 1.0250193416276171e-05, + "loss": 1.131, + "num_input_tokens_seen": 575619396, + "step": 14650 + }, + { + "epoch": 0.7103401492392674, + "grad_norm": 0.40218356251716614, + "learning_rate": 1.0218533230642005e-05, + "loss": 1.1638, + "num_input_tokens_seen": 576039756, + "step": 14660 + }, + { + "epoch": 0.7108246923151468, + "grad_norm": 0.360257089138031, + "learning_rate": 1.0186909451247023e-05, + "loss": 1.1354, + "num_input_tokens_seen": 576409768, + "step": 14670 + }, + { + "epoch": 0.7113092353910263, + "grad_norm": 0.3771324157714844, + "learning_rate": 1.0155322155979468e-05, + "loss": 1.1132, + "num_input_tokens_seen": 576806004, + "step": 14680 + }, + { + "epoch": 0.7117937784669057, + "grad_norm": 0.41122815012931824, + "learning_rate": 1.0123771422637757e-05, + "loss": 1.178, + "num_input_tokens_seen": 577197312, + "step": 14690 + }, + { + "epoch": 0.7122783215427851, + "grad_norm": 0.3926630914211273, + "learning_rate": 1.0092257328930255e-05, + "loss": 1.0902, + "num_input_tokens_seen": 577597560, + "step": 14700 + }, + { + "epoch": 0.7127628646186646, + "grad_norm": 0.4445978105068207, + "learning_rate": 1.0060779952475074e-05, + "loss": 1.1167, + "num_input_tokens_seen": 577999220, + "step": 14710 + }, + { + "epoch": 0.7132474076945441, + "grad_norm": 0.42254573106765747, + "learning_rate": 1.0029339370799912e-05, + "loss": 1.129, + "num_input_tokens_seen": 578383384, + "step": 14720 + }, + { + "epoch": 0.7137319507704235, + "grad_norm": 0.4229062795639038, + "learning_rate": 9.997935661341804e-06, + "loss": 1.1274, + "num_input_tokens_seen": 578758652, + "step": 14730 + }, + { + "epoch": 0.7142164938463029, + "grad_norm": 0.39834490418434143, + "learning_rate": 9.966568901447026e-06, + "loss": 1.1414, + "num_input_tokens_seen": 579152316, + "step": 14740 + }, + { + "epoch": 0.7147010369221823, + "grad_norm": 0.3739629089832306, + "learning_rate": 9.935239168370795e-06, + "loss": 1.1303, + "num_input_tokens_seen": 579581096, + "step": 14750 + }, + { + "epoch": 0.7151855799980619, + "grad_norm": 0.412434458732605, + "learning_rate": 9.903946539277162e-06, + "loss": 1.0987, + "num_input_tokens_seen": 579998920, + "step": 14760 + }, + { + "epoch": 0.7156701230739413, + "grad_norm": 0.401985764503479, + "learning_rate": 9.872691091238789e-06, + "loss": 1.0892, + "num_input_tokens_seen": 580392288, + "step": 14770 + }, + { + "epoch": 0.7161546661498207, + "grad_norm": 0.42549392580986023, + "learning_rate": 9.841472901236765e-06, + "loss": 1.1417, + "num_input_tokens_seen": 580778920, + "step": 14780 + }, + { + "epoch": 0.7166392092257001, + "grad_norm": 0.42221537232398987, + "learning_rate": 9.81029204616038e-06, + "loss": 1.1389, + "num_input_tokens_seen": 581160372, + "step": 14790 + }, + { + "epoch": 0.7171237523015797, + "grad_norm": 0.4084275960922241, + "learning_rate": 9.779148602807032e-06, + "loss": 1.1119, + "num_input_tokens_seen": 581518392, + "step": 14800 + }, + { + "epoch": 0.7176082953774591, + "grad_norm": 0.3924039900302887, + "learning_rate": 9.748042647881909e-06, + "loss": 1.1196, + "num_input_tokens_seen": 581916744, + "step": 14810 + }, + { + "epoch": 0.7180928384533385, + "grad_norm": 0.4364875555038452, + "learning_rate": 9.716974257997927e-06, + "loss": 1.1251, + "num_input_tokens_seen": 582306144, + "step": 14820 + }, + { + "epoch": 0.7185773815292179, + "grad_norm": 0.4177420735359192, + "learning_rate": 9.68594350967543e-06, + "loss": 1.1217, + "num_input_tokens_seen": 582701516, + "step": 14830 + }, + { + "epoch": 0.7190619246050974, + "grad_norm": 0.3962678015232086, + "learning_rate": 9.654950479342079e-06, + "loss": 1.1467, + "num_input_tokens_seen": 583110044, + "step": 14840 + }, + { + "epoch": 0.7195464676809769, + "grad_norm": 0.3973855972290039, + "learning_rate": 9.62399524333263e-06, + "loss": 1.0922, + "num_input_tokens_seen": 583517076, + "step": 14850 + }, + { + "epoch": 0.7200310107568563, + "grad_norm": 0.3978596329689026, + "learning_rate": 9.593077877888757e-06, + "loss": 1.1018, + "num_input_tokens_seen": 583892052, + "step": 14860 + }, + { + "epoch": 0.7205155538327357, + "grad_norm": 0.3927263617515564, + "learning_rate": 9.56219845915886e-06, + "loss": 1.1682, + "num_input_tokens_seen": 584282980, + "step": 14870 + }, + { + "epoch": 0.7210000969086152, + "grad_norm": 0.4256937801837921, + "learning_rate": 9.531357063197867e-06, + "loss": 1.1401, + "num_input_tokens_seen": 584680944, + "step": 14880 + }, + { + "epoch": 0.7214846399844946, + "grad_norm": 0.36345407366752625, + "learning_rate": 9.500553765967066e-06, + "loss": 1.1402, + "num_input_tokens_seen": 585085904, + "step": 14890 + }, + { + "epoch": 0.721969183060374, + "grad_norm": 0.38229337334632874, + "learning_rate": 9.46978864333391e-06, + "loss": 1.1655, + "num_input_tokens_seen": 585477160, + "step": 14900 + }, + { + "epoch": 0.7224537261362535, + "grad_norm": 0.41841405630111694, + "learning_rate": 9.439061771071824e-06, + "loss": 1.1179, + "num_input_tokens_seen": 585880592, + "step": 14910 + }, + { + "epoch": 0.722938269212133, + "grad_norm": 0.4083956480026245, + "learning_rate": 9.408373224860035e-06, + "loss": 1.1698, + "num_input_tokens_seen": 586284256, + "step": 14920 + }, + { + "epoch": 0.7234228122880124, + "grad_norm": 0.4051735997200012, + "learning_rate": 9.377723080283368e-06, + "loss": 1.1011, + "num_input_tokens_seen": 586669452, + "step": 14930 + }, + { + "epoch": 0.7239073553638918, + "grad_norm": 0.40003690123558044, + "learning_rate": 9.347111412832041e-06, + "loss": 1.0989, + "num_input_tokens_seen": 587054752, + "step": 14940 + }, + { + "epoch": 0.7243918984397713, + "grad_norm": 0.41723689436912537, + "learning_rate": 9.31653829790156e-06, + "loss": 1.1647, + "num_input_tokens_seen": 587436984, + "step": 14950 + }, + { + "epoch": 0.7248764415156508, + "grad_norm": 0.3850226402282715, + "learning_rate": 9.286003810792423e-06, + "loss": 1.1224, + "num_input_tokens_seen": 587849120, + "step": 14960 + }, + { + "epoch": 0.7253609845915302, + "grad_norm": 0.38576218485832214, + "learning_rate": 9.255508026710017e-06, + "loss": 1.1768, + "num_input_tokens_seen": 588230752, + "step": 14970 + }, + { + "epoch": 0.7258455276674096, + "grad_norm": 0.44978147745132446, + "learning_rate": 9.225051020764396e-06, + "loss": 1.1512, + "num_input_tokens_seen": 588620412, + "step": 14980 + }, + { + "epoch": 0.726330070743289, + "grad_norm": 0.4420928359031677, + "learning_rate": 9.194632867970115e-06, + "loss": 1.1517, + "num_input_tokens_seen": 588987072, + "step": 14990 + }, + { + "epoch": 0.7268146138191686, + "grad_norm": 0.40865370631217957, + "learning_rate": 9.16425364324602e-06, + "loss": 1.1299, + "num_input_tokens_seen": 589381344, + "step": 15000 + }, + { + "epoch": 0.727299156895048, + "grad_norm": 0.4068050682544708, + "learning_rate": 9.133913421415103e-06, + "loss": 1.1, + "num_input_tokens_seen": 589790080, + "step": 15010 + }, + { + "epoch": 0.7277836999709274, + "grad_norm": 0.37048131227493286, + "learning_rate": 9.10361227720425e-06, + "loss": 1.1035, + "num_input_tokens_seen": 590204672, + "step": 15020 + }, + { + "epoch": 0.7282682430468068, + "grad_norm": 0.3541117012500763, + "learning_rate": 9.073350285244142e-06, + "loss": 1.0962, + "num_input_tokens_seen": 590633676, + "step": 15030 + }, + { + "epoch": 0.7287527861226863, + "grad_norm": 0.4002024531364441, + "learning_rate": 9.04312752006901e-06, + "loss": 1.1322, + "num_input_tokens_seen": 590999504, + "step": 15040 + }, + { + "epoch": 0.7292373291985658, + "grad_norm": 0.4075698256492615, + "learning_rate": 9.012944056116477e-06, + "loss": 1.1658, + "num_input_tokens_seen": 591409344, + "step": 15050 + }, + { + "epoch": 0.7297218722744452, + "grad_norm": 0.41059961915016174, + "learning_rate": 8.982799967727374e-06, + "loss": 1.1439, + "num_input_tokens_seen": 591798840, + "step": 15060 + }, + { + "epoch": 0.7302064153503246, + "grad_norm": 0.3906279504299164, + "learning_rate": 8.952695329145517e-06, + "loss": 1.1835, + "num_input_tokens_seen": 592185512, + "step": 15070 + }, + { + "epoch": 0.7306909584262041, + "grad_norm": 0.41761744022369385, + "learning_rate": 8.922630214517618e-06, + "loss": 1.1068, + "num_input_tokens_seen": 592622168, + "step": 15080 + }, + { + "epoch": 0.7311755015020835, + "grad_norm": 0.408543199300766, + "learning_rate": 8.892604697892981e-06, + "loss": 1.1234, + "num_input_tokens_seen": 592987956, + "step": 15090 + }, + { + "epoch": 0.731660044577963, + "grad_norm": 0.4270235002040863, + "learning_rate": 8.862618853223442e-06, + "loss": 1.115, + "num_input_tokens_seen": 593378144, + "step": 15100 + }, + { + "epoch": 0.7321445876538424, + "grad_norm": 0.39898785948753357, + "learning_rate": 8.832672754363066e-06, + "loss": 1.1088, + "num_input_tokens_seen": 593782856, + "step": 15110 + }, + { + "epoch": 0.7326291307297219, + "grad_norm": 0.3919895887374878, + "learning_rate": 8.802766475068066e-06, + "loss": 1.0846, + "num_input_tokens_seen": 594160824, + "step": 15120 + }, + { + "epoch": 0.7331136738056013, + "grad_norm": 0.4317933917045593, + "learning_rate": 8.77290008899657e-06, + "loss": 1.0992, + "num_input_tokens_seen": 594564940, + "step": 15130 + }, + { + "epoch": 0.7335982168814807, + "grad_norm": 0.4144594073295593, + "learning_rate": 8.743073669708454e-06, + "loss": 1.1512, + "num_input_tokens_seen": 594969124, + "step": 15140 + }, + { + "epoch": 0.7340827599573602, + "grad_norm": 0.41548287868499756, + "learning_rate": 8.713287290665139e-06, + "loss": 1.109, + "num_input_tokens_seen": 595337996, + "step": 15150 + }, + { + "epoch": 0.7345673030332397, + "grad_norm": 0.396049827337265, + "learning_rate": 8.68354102522945e-06, + "loss": 1.1377, + "num_input_tokens_seen": 595759896, + "step": 15160 + }, + { + "epoch": 0.7350518461091191, + "grad_norm": 0.3859698176383972, + "learning_rate": 8.6538349466654e-06, + "loss": 1.1388, + "num_input_tokens_seen": 596196968, + "step": 15170 + }, + { + "epoch": 0.7355363891849985, + "grad_norm": 0.4152047634124756, + "learning_rate": 8.624169128138038e-06, + "loss": 1.1374, + "num_input_tokens_seen": 596608088, + "step": 15180 + }, + { + "epoch": 0.736020932260878, + "grad_norm": 0.3938763439655304, + "learning_rate": 8.594543642713245e-06, + "loss": 1.1061, + "num_input_tokens_seen": 596999532, + "step": 15190 + }, + { + "epoch": 0.7365054753367575, + "grad_norm": 0.3672252595424652, + "learning_rate": 8.564958563357543e-06, + "loss": 1.1148, + "num_input_tokens_seen": 597383556, + "step": 15200 + }, + { + "epoch": 0.7369900184126369, + "grad_norm": 0.4167373478412628, + "learning_rate": 8.535413962937983e-06, + "loss": 1.1183, + "num_input_tokens_seen": 597766768, + "step": 15210 + }, + { + "epoch": 0.7374745614885163, + "grad_norm": 0.395749568939209, + "learning_rate": 8.50590991422186e-06, + "loss": 1.1175, + "num_input_tokens_seen": 598156584, + "step": 15220 + }, + { + "epoch": 0.7379591045643957, + "grad_norm": 0.4062468409538269, + "learning_rate": 8.476446489876651e-06, + "loss": 1.1551, + "num_input_tokens_seen": 598562200, + "step": 15230 + }, + { + "epoch": 0.7384436476402753, + "grad_norm": 0.3954980969429016, + "learning_rate": 8.447023762469725e-06, + "loss": 1.1301, + "num_input_tokens_seen": 598961604, + "step": 15240 + }, + { + "epoch": 0.7389281907161547, + "grad_norm": 0.39155393838882446, + "learning_rate": 8.417641804468243e-06, + "loss": 1.1221, + "num_input_tokens_seen": 599354072, + "step": 15250 + }, + { + "epoch": 0.7394127337920341, + "grad_norm": 0.39971819519996643, + "learning_rate": 8.388300688238951e-06, + "loss": 1.127, + "num_input_tokens_seen": 599752608, + "step": 15260 + }, + { + "epoch": 0.7398972768679135, + "grad_norm": 0.4002969563007355, + "learning_rate": 8.359000486047994e-06, + "loss": 1.1052, + "num_input_tokens_seen": 600141532, + "step": 15270 + }, + { + "epoch": 0.740381819943793, + "grad_norm": 0.4160708487033844, + "learning_rate": 8.329741270060754e-06, + "loss": 1.0865, + "num_input_tokens_seen": 600526872, + "step": 15280 + }, + { + "epoch": 0.7408663630196725, + "grad_norm": 0.41618549823760986, + "learning_rate": 8.300523112341674e-06, + "loss": 1.1429, + "num_input_tokens_seen": 600899788, + "step": 15290 + }, + { + "epoch": 0.7413509060955519, + "grad_norm": 0.4318315386772156, + "learning_rate": 8.271346084854042e-06, + "loss": 1.0822, + "num_input_tokens_seen": 601307316, + "step": 15300 + }, + { + "epoch": 0.7418354491714313, + "grad_norm": 0.38331708312034607, + "learning_rate": 8.242210259459872e-06, + "loss": 1.1082, + "num_input_tokens_seen": 601712016, + "step": 15310 + }, + { + "epoch": 0.7423199922473108, + "grad_norm": 0.3820512294769287, + "learning_rate": 8.213115707919692e-06, + "loss": 1.1066, + "num_input_tokens_seen": 602117976, + "step": 15320 + }, + { + "epoch": 0.7428045353231902, + "grad_norm": 0.385928750038147, + "learning_rate": 8.18406250189237e-06, + "loss": 1.1274, + "num_input_tokens_seen": 602497772, + "step": 15330 + }, + { + "epoch": 0.7432890783990697, + "grad_norm": 0.40160906314849854, + "learning_rate": 8.15505071293495e-06, + "loss": 1.1348, + "num_input_tokens_seen": 602904540, + "step": 15340 + }, + { + "epoch": 0.7437736214749491, + "grad_norm": 0.3810666501522064, + "learning_rate": 8.126080412502437e-06, + "loss": 1.1229, + "num_input_tokens_seen": 603301296, + "step": 15350 + }, + { + "epoch": 0.7442581645508286, + "grad_norm": 0.37731239199638367, + "learning_rate": 8.097151671947709e-06, + "loss": 1.1554, + "num_input_tokens_seen": 603671376, + "step": 15360 + }, + { + "epoch": 0.744742707626708, + "grad_norm": 0.3665080666542053, + "learning_rate": 8.068264562521221e-06, + "loss": 1.1406, + "num_input_tokens_seen": 604069780, + "step": 15370 + }, + { + "epoch": 0.7452272507025874, + "grad_norm": 0.37638604640960693, + "learning_rate": 8.039419155370933e-06, + "loss": 1.2152, + "num_input_tokens_seen": 604460932, + "step": 15380 + }, + { + "epoch": 0.7457117937784669, + "grad_norm": 0.4055761396884918, + "learning_rate": 8.010615521542077e-06, + "loss": 1.075, + "num_input_tokens_seen": 604849832, + "step": 15390 + }, + { + "epoch": 0.7461963368543464, + "grad_norm": 0.41014060378074646, + "learning_rate": 7.981853731977005e-06, + "loss": 1.0986, + "num_input_tokens_seen": 605250684, + "step": 15400 + }, + { + "epoch": 0.7466808799302258, + "grad_norm": 0.44504401087760925, + "learning_rate": 7.953133857514999e-06, + "loss": 1.1245, + "num_input_tokens_seen": 605657060, + "step": 15410 + }, + { + "epoch": 0.7471654230061052, + "grad_norm": 0.3899877071380615, + "learning_rate": 7.92445596889213e-06, + "loss": 1.096, + "num_input_tokens_seen": 606046780, + "step": 15420 + }, + { + "epoch": 0.7476499660819846, + "grad_norm": 0.40705353021621704, + "learning_rate": 7.89582013674101e-06, + "loss": 1.1188, + "num_input_tokens_seen": 606446820, + "step": 15430 + }, + { + "epoch": 0.7481345091578642, + "grad_norm": 0.3865094482898712, + "learning_rate": 7.867226431590735e-06, + "loss": 1.1454, + "num_input_tokens_seen": 606828080, + "step": 15440 + }, + { + "epoch": 0.7486190522337436, + "grad_norm": 0.42546987533569336, + "learning_rate": 7.838674923866585e-06, + "loss": 1.0962, + "num_input_tokens_seen": 607213936, + "step": 15450 + }, + { + "epoch": 0.749103595309623, + "grad_norm": 0.39779505133628845, + "learning_rate": 7.81016568388993e-06, + "loss": 1.1534, + "num_input_tokens_seen": 607602128, + "step": 15460 + }, + { + "epoch": 0.7495881383855024, + "grad_norm": 0.43804842233657837, + "learning_rate": 7.78169878187805e-06, + "loss": 1.1426, + "num_input_tokens_seen": 607980840, + "step": 15470 + }, + { + "epoch": 0.750072681461382, + "grad_norm": 0.42345863580703735, + "learning_rate": 7.753274287943927e-06, + "loss": 1.1451, + "num_input_tokens_seen": 608389256, + "step": 15480 + }, + { + "epoch": 0.7505572245372614, + "grad_norm": 0.4104250371456146, + "learning_rate": 7.724892272096115e-06, + "loss": 1.1078, + "num_input_tokens_seen": 608763772, + "step": 15490 + }, + { + "epoch": 0.7510417676131408, + "grad_norm": 0.37295570969581604, + "learning_rate": 7.696552804238514e-06, + "loss": 1.1495, + "num_input_tokens_seen": 609154616, + "step": 15500 + }, + { + "epoch": 0.7515263106890202, + "grad_norm": 0.42971867322921753, + "learning_rate": 7.668255954170258e-06, + "loss": 1.1204, + "num_input_tokens_seen": 609563980, + "step": 15510 + }, + { + "epoch": 0.7520108537648997, + "grad_norm": 0.3849759101867676, + "learning_rate": 7.640001791585507e-06, + "loss": 1.1601, + "num_input_tokens_seen": 609984756, + "step": 15520 + }, + { + "epoch": 0.7524953968407792, + "grad_norm": 0.41971349716186523, + "learning_rate": 7.611790386073281e-06, + "loss": 1.1537, + "num_input_tokens_seen": 610384836, + "step": 15530 + }, + { + "epoch": 0.7529799399166586, + "grad_norm": 0.3679080903530121, + "learning_rate": 7.583621807117288e-06, + "loss": 1.1229, + "num_input_tokens_seen": 610787156, + "step": 15540 + }, + { + "epoch": 0.753464482992538, + "grad_norm": 0.3937299847602844, + "learning_rate": 7.555496124095776e-06, + "loss": 1.1227, + "num_input_tokens_seen": 611161580, + "step": 15550 + }, + { + "epoch": 0.7539490260684175, + "grad_norm": 0.4039948284626007, + "learning_rate": 7.527413406281294e-06, + "loss": 1.163, + "num_input_tokens_seen": 611556308, + "step": 15560 + }, + { + "epoch": 0.7544335691442969, + "grad_norm": 0.38725948333740234, + "learning_rate": 7.499373722840636e-06, + "loss": 1.1422, + "num_input_tokens_seen": 611967848, + "step": 15570 + }, + { + "epoch": 0.7549181122201764, + "grad_norm": 0.3972112536430359, + "learning_rate": 7.471377142834532e-06, + "loss": 1.102, + "num_input_tokens_seen": 612364408, + "step": 15580 + }, + { + "epoch": 0.7554026552960558, + "grad_norm": 0.39917463064193726, + "learning_rate": 7.443423735217622e-06, + "loss": 1.136, + "num_input_tokens_seen": 612775300, + "step": 15590 + }, + { + "epoch": 0.7558871983719353, + "grad_norm": 0.39816567301750183, + "learning_rate": 7.415513568838153e-06, + "loss": 1.1112, + "num_input_tokens_seen": 613151088, + "step": 15600 + }, + { + "epoch": 0.7563717414478147, + "grad_norm": 0.4336942136287689, + "learning_rate": 7.387646712437904e-06, + "loss": 1.1323, + "num_input_tokens_seen": 613546524, + "step": 15610 + }, + { + "epoch": 0.7568562845236941, + "grad_norm": 0.4093795716762543, + "learning_rate": 7.35982323465198e-06, + "loss": 1.081, + "num_input_tokens_seen": 613944468, + "step": 15620 + }, + { + "epoch": 0.7573408275995736, + "grad_norm": 0.3967530131340027, + "learning_rate": 7.332043204008621e-06, + "loss": 1.2024, + "num_input_tokens_seen": 614316784, + "step": 15630 + }, + { + "epoch": 0.7578253706754531, + "grad_norm": 0.3960801362991333, + "learning_rate": 7.304306688929102e-06, + "loss": 1.1074, + "num_input_tokens_seen": 614744716, + "step": 15640 + }, + { + "epoch": 0.7583099137513325, + "grad_norm": 0.40353167057037354, + "learning_rate": 7.2766137577274765e-06, + "loss": 1.1534, + "num_input_tokens_seen": 615162168, + "step": 15650 + }, + { + "epoch": 0.7587944568272119, + "grad_norm": 0.4028729200363159, + "learning_rate": 7.248964478610482e-06, + "loss": 1.1388, + "num_input_tokens_seen": 615552592, + "step": 15660 + }, + { + "epoch": 0.7592789999030913, + "grad_norm": 0.4119277596473694, + "learning_rate": 7.221358919677329e-06, + "loss": 1.1401, + "num_input_tokens_seen": 615985000, + "step": 15670 + }, + { + "epoch": 0.7597635429789709, + "grad_norm": 0.4111025035381317, + "learning_rate": 7.193797148919557e-06, + "loss": 1.1402, + "num_input_tokens_seen": 616372156, + "step": 15680 + }, + { + "epoch": 0.7602480860548503, + "grad_norm": 0.3870091438293457, + "learning_rate": 7.166279234220829e-06, + "loss": 1.1222, + "num_input_tokens_seen": 616794564, + "step": 15690 + }, + { + "epoch": 0.7607326291307297, + "grad_norm": 0.4028170704841614, + "learning_rate": 7.138805243356847e-06, + "loss": 1.1556, + "num_input_tokens_seen": 617166456, + "step": 15700 + }, + { + "epoch": 0.7612171722066091, + "grad_norm": 0.4021094739437103, + "learning_rate": 7.111375243995058e-06, + "loss": 1.1548, + "num_input_tokens_seen": 617553256, + "step": 15710 + }, + { + "epoch": 0.7617017152824886, + "grad_norm": 0.4055640399456024, + "learning_rate": 7.083989303694635e-06, + "loss": 1.1371, + "num_input_tokens_seen": 617951232, + "step": 15720 + }, + { + "epoch": 0.7621862583583681, + "grad_norm": 0.3847959339618683, + "learning_rate": 7.05664748990617e-06, + "loss": 1.1114, + "num_input_tokens_seen": 618343460, + "step": 15730 + }, + { + "epoch": 0.7626708014342475, + "grad_norm": 0.38287290930747986, + "learning_rate": 7.0293498699716105e-06, + "loss": 1.1387, + "num_input_tokens_seen": 618773472, + "step": 15740 + }, + { + "epoch": 0.7631553445101269, + "grad_norm": 0.41064536571502686, + "learning_rate": 7.0020965111240454e-06, + "loss": 1.1396, + "num_input_tokens_seen": 619175808, + "step": 15750 + }, + { + "epoch": 0.7636398875860064, + "grad_norm": 0.42379313707351685, + "learning_rate": 6.9748874804875516e-06, + "loss": 1.1246, + "num_input_tokens_seen": 619583744, + "step": 15760 + }, + { + "epoch": 0.7641244306618858, + "grad_norm": 0.4114890396595001, + "learning_rate": 6.947722845077032e-06, + "loss": 1.1252, + "num_input_tokens_seen": 619983760, + "step": 15770 + }, + { + "epoch": 0.7646089737377653, + "grad_norm": 0.4021173119544983, + "learning_rate": 6.9206026717980265e-06, + "loss": 1.1981, + "num_input_tokens_seen": 620365344, + "step": 15780 + }, + { + "epoch": 0.7650935168136447, + "grad_norm": 0.42168235778808594, + "learning_rate": 6.893527027446589e-06, + "loss": 1.0703, + "num_input_tokens_seen": 620746756, + "step": 15790 + }, + { + "epoch": 0.7655780598895242, + "grad_norm": 0.38521257042884827, + "learning_rate": 6.866495978709087e-06, + "loss": 1.1277, + "num_input_tokens_seen": 621139404, + "step": 15800 + }, + { + "epoch": 0.7660626029654036, + "grad_norm": 0.4103443920612335, + "learning_rate": 6.839509592162055e-06, + "loss": 1.1429, + "num_input_tokens_seen": 621564096, + "step": 15810 + }, + { + "epoch": 0.766547146041283, + "grad_norm": 0.36137986183166504, + "learning_rate": 6.8125679342720294e-06, + "loss": 1.1534, + "num_input_tokens_seen": 621947492, + "step": 15820 + }, + { + "epoch": 0.7670316891171625, + "grad_norm": 0.4212779104709625, + "learning_rate": 6.7856710713953805e-06, + "loss": 1.1169, + "num_input_tokens_seen": 622366700, + "step": 15830 + }, + { + "epoch": 0.767516232193042, + "grad_norm": 0.4411621689796448, + "learning_rate": 6.75881906977813e-06, + "loss": 1.1259, + "num_input_tokens_seen": 622768796, + "step": 15840 + }, + { + "epoch": 0.7680007752689214, + "grad_norm": 0.39934778213500977, + "learning_rate": 6.732011995555851e-06, + "loss": 1.1114, + "num_input_tokens_seen": 623147136, + "step": 15850 + }, + { + "epoch": 0.7684853183448008, + "grad_norm": 0.43150418996810913, + "learning_rate": 6.705249914753414e-06, + "loss": 1.1407, + "num_input_tokens_seen": 623483104, + "step": 15860 + }, + { + "epoch": 0.7689698614206802, + "grad_norm": 0.3736652731895447, + "learning_rate": 6.6785328932849e-06, + "loss": 1.1541, + "num_input_tokens_seen": 623845764, + "step": 15870 + }, + { + "epoch": 0.7694544044965598, + "grad_norm": 0.436292439699173, + "learning_rate": 6.65186099695341e-06, + "loss": 1.1381, + "num_input_tokens_seen": 624231876, + "step": 15880 + }, + { + "epoch": 0.7699389475724392, + "grad_norm": 0.44637537002563477, + "learning_rate": 6.6252342914508965e-06, + "loss": 1.1727, + "num_input_tokens_seen": 624626492, + "step": 15890 + }, + { + "epoch": 0.7704234906483186, + "grad_norm": 0.4382629990577698, + "learning_rate": 6.598652842358008e-06, + "loss": 1.0927, + "num_input_tokens_seen": 625027572, + "step": 15900 + }, + { + "epoch": 0.770908033724198, + "grad_norm": 0.37075766921043396, + "learning_rate": 6.572116715143939e-06, + "loss": 1.1196, + "num_input_tokens_seen": 625436036, + "step": 15910 + }, + { + "epoch": 0.7713925768000776, + "grad_norm": 0.3835522532463074, + "learning_rate": 6.545625975166231e-06, + "loss": 1.1169, + "num_input_tokens_seen": 625817956, + "step": 15920 + }, + { + "epoch": 0.771877119875957, + "grad_norm": 0.4104726016521454, + "learning_rate": 6.51918068767067e-06, + "loss": 1.1782, + "num_input_tokens_seen": 626184948, + "step": 15930 + }, + { + "epoch": 0.7723616629518364, + "grad_norm": 0.4460158944129944, + "learning_rate": 6.492780917791075e-06, + "loss": 1.1137, + "num_input_tokens_seen": 626596420, + "step": 15940 + }, + { + "epoch": 0.7728462060277158, + "grad_norm": 0.4014306664466858, + "learning_rate": 6.466426730549166e-06, + "loss": 1.154, + "num_input_tokens_seen": 627007456, + "step": 15950 + }, + { + "epoch": 0.7733307491035953, + "grad_norm": 0.3667517304420471, + "learning_rate": 6.440118190854394e-06, + "loss": 1.1304, + "num_input_tokens_seen": 627390576, + "step": 15960 + }, + { + "epoch": 0.7738152921794748, + "grad_norm": 0.39722877740859985, + "learning_rate": 6.41385536350376e-06, + "loss": 1.1548, + "num_input_tokens_seen": 627755000, + "step": 15970 + }, + { + "epoch": 0.7742998352553542, + "grad_norm": 0.41055047512054443, + "learning_rate": 6.387638313181721e-06, + "loss": 1.1473, + "num_input_tokens_seen": 628145832, + "step": 15980 + }, + { + "epoch": 0.7747843783312336, + "grad_norm": 0.4256828725337982, + "learning_rate": 6.3614671044599364e-06, + "loss": 1.1001, + "num_input_tokens_seen": 628544212, + "step": 15990 + }, + { + "epoch": 0.7752689214071131, + "grad_norm": 0.4094095230102539, + "learning_rate": 6.335341801797209e-06, + "loss": 1.1743, + "num_input_tokens_seen": 628937800, + "step": 16000 + }, + { + "epoch": 0.7752689214071131, + "eval_loss": 1.1244345903396606, + "eval_runtime": 5.2554, + "eval_samples_per_second": 28.542, + "eval_steps_per_second": 3.615, + "num_input_tokens_seen": 628937800, + "step": 16000 + }, + { + "epoch": 0.7757534644829925, + "grad_norm": 0.44040119647979736, + "learning_rate": 6.309262469539235e-06, + "loss": 1.0965, + "num_input_tokens_seen": 629321000, + "step": 16010 + }, + { + "epoch": 0.776238007558872, + "grad_norm": 0.3980225920677185, + "learning_rate": 6.283229171918506e-06, + "loss": 1.1363, + "num_input_tokens_seen": 629740836, + "step": 16020 + }, + { + "epoch": 0.7767225506347514, + "grad_norm": 0.4174353778362274, + "learning_rate": 6.257241973054132e-06, + "loss": 1.1164, + "num_input_tokens_seen": 630136212, + "step": 16030 + }, + { + "epoch": 0.7772070937106309, + "grad_norm": 0.44722503423690796, + "learning_rate": 6.231300936951686e-06, + "loss": 1.1585, + "num_input_tokens_seen": 630539300, + "step": 16040 + }, + { + "epoch": 0.7776916367865103, + "grad_norm": 0.41615843772888184, + "learning_rate": 6.205406127503021e-06, + "loss": 1.1508, + "num_input_tokens_seen": 630936060, + "step": 16050 + }, + { + "epoch": 0.7781761798623897, + "grad_norm": 0.39735278487205505, + "learning_rate": 6.1795576084861804e-06, + "loss": 1.0761, + "num_input_tokens_seen": 631334484, + "step": 16060 + }, + { + "epoch": 0.7786607229382693, + "grad_norm": 0.4064568281173706, + "learning_rate": 6.153755443565146e-06, + "loss": 1.1212, + "num_input_tokens_seen": 631740948, + "step": 16070 + }, + { + "epoch": 0.7791452660141487, + "grad_norm": 0.4038830101490021, + "learning_rate": 6.12799969628976e-06, + "loss": 1.1316, + "num_input_tokens_seen": 632141196, + "step": 16080 + }, + { + "epoch": 0.7796298090900281, + "grad_norm": 0.41059982776641846, + "learning_rate": 6.102290430095536e-06, + "loss": 1.1087, + "num_input_tokens_seen": 632532608, + "step": 16090 + }, + { + "epoch": 0.7801143521659075, + "grad_norm": 0.40716275572776794, + "learning_rate": 6.0766277083035035e-06, + "loss": 1.1119, + "num_input_tokens_seen": 632916136, + "step": 16100 + }, + { + "epoch": 0.780598895241787, + "grad_norm": 0.39503198862075806, + "learning_rate": 6.05101159412006e-06, + "loss": 1.1406, + "num_input_tokens_seen": 633327552, + "step": 16110 + }, + { + "epoch": 0.7810834383176665, + "grad_norm": 0.3686826229095459, + "learning_rate": 6.025442150636781e-06, + "loss": 1.0964, + "num_input_tokens_seen": 633698396, + "step": 16120 + }, + { + "epoch": 0.7815679813935459, + "grad_norm": 0.4384057819843292, + "learning_rate": 5.999919440830354e-06, + "loss": 1.1353, + "num_input_tokens_seen": 634084992, + "step": 16130 + }, + { + "epoch": 0.7820525244694253, + "grad_norm": 0.4173773527145386, + "learning_rate": 5.974443527562296e-06, + "loss": 1.0915, + "num_input_tokens_seen": 634487672, + "step": 16140 + }, + { + "epoch": 0.7825370675453048, + "grad_norm": 0.4255557358264923, + "learning_rate": 5.949014473578909e-06, + "loss": 1.133, + "num_input_tokens_seen": 634851792, + "step": 16150 + }, + { + "epoch": 0.7830216106211842, + "grad_norm": 0.40748724341392517, + "learning_rate": 5.923632341511071e-06, + "loss": 1.156, + "num_input_tokens_seen": 635237128, + "step": 16160 + }, + { + "epoch": 0.7835061536970637, + "grad_norm": 0.38821229338645935, + "learning_rate": 5.898297193874086e-06, + "loss": 1.1218, + "num_input_tokens_seen": 635617944, + "step": 16170 + }, + { + "epoch": 0.7839906967729431, + "grad_norm": 0.39625513553619385, + "learning_rate": 5.873009093067547e-06, + "loss": 1.1036, + "num_input_tokens_seen": 636028208, + "step": 16180 + }, + { + "epoch": 0.7844752398488226, + "grad_norm": 0.4132024943828583, + "learning_rate": 5.8477681013751724e-06, + "loss": 1.1117, + "num_input_tokens_seen": 636400024, + "step": 16190 + }, + { + "epoch": 0.784959782924702, + "grad_norm": 0.4017508029937744, + "learning_rate": 5.822574280964637e-06, + "loss": 1.1399, + "num_input_tokens_seen": 636803444, + "step": 16200 + }, + { + "epoch": 0.7854443260005814, + "grad_norm": 0.3757273852825165, + "learning_rate": 5.79742769388745e-06, + "loss": 1.1278, + "num_input_tokens_seen": 637208612, + "step": 16210 + }, + { + "epoch": 0.7859288690764609, + "grad_norm": 0.4408009648323059, + "learning_rate": 5.77232840207878e-06, + "loss": 1.1813, + "num_input_tokens_seen": 637569988, + "step": 16220 + }, + { + "epoch": 0.7864134121523404, + "grad_norm": 0.4416961371898651, + "learning_rate": 5.747276467357313e-06, + "loss": 1.0945, + "num_input_tokens_seen": 637969128, + "step": 16230 + }, + { + "epoch": 0.7868979552282198, + "grad_norm": 0.3773948848247528, + "learning_rate": 5.722271951425101e-06, + "loss": 1.097, + "num_input_tokens_seen": 638377296, + "step": 16240 + }, + { + "epoch": 0.7873824983040992, + "grad_norm": 0.4033990502357483, + "learning_rate": 5.6973149158673775e-06, + "loss": 1.1328, + "num_input_tokens_seen": 638763704, + "step": 16250 + }, + { + "epoch": 0.7878670413799786, + "grad_norm": 0.40168893337249756, + "learning_rate": 5.6724054221524825e-06, + "loss": 1.1358, + "num_input_tokens_seen": 639184012, + "step": 16260 + }, + { + "epoch": 0.7883515844558582, + "grad_norm": 0.4118176996707916, + "learning_rate": 5.647543531631611e-06, + "loss": 1.152, + "num_input_tokens_seen": 639586888, + "step": 16270 + }, + { + "epoch": 0.7888361275317376, + "grad_norm": 0.4122507572174072, + "learning_rate": 5.62272930553874e-06, + "loss": 1.1098, + "num_input_tokens_seen": 640002384, + "step": 16280 + }, + { + "epoch": 0.789320670607617, + "grad_norm": 0.42295822501182556, + "learning_rate": 5.597962804990453e-06, + "loss": 1.1196, + "num_input_tokens_seen": 640390420, + "step": 16290 + }, + { + "epoch": 0.7898052136834964, + "grad_norm": 0.37593498826026917, + "learning_rate": 5.573244090985777e-06, + "loss": 1.0817, + "num_input_tokens_seen": 640735112, + "step": 16300 + }, + { + "epoch": 0.790289756759376, + "grad_norm": 0.40528619289398193, + "learning_rate": 5.548573224406045e-06, + "loss": 1.163, + "num_input_tokens_seen": 641116036, + "step": 16310 + }, + { + "epoch": 0.7907742998352554, + "grad_norm": 0.4023245573043823, + "learning_rate": 5.523950266014754e-06, + "loss": 1.1551, + "num_input_tokens_seen": 641501348, + "step": 16320 + }, + { + "epoch": 0.7912588429111348, + "grad_norm": 0.3965412676334381, + "learning_rate": 5.499375276457372e-06, + "loss": 1.0989, + "num_input_tokens_seen": 641919160, + "step": 16330 + }, + { + "epoch": 0.7917433859870142, + "grad_norm": 0.4392828643321991, + "learning_rate": 5.4748483162612716e-06, + "loss": 1.1252, + "num_input_tokens_seen": 642318860, + "step": 16340 + }, + { + "epoch": 0.7922279290628937, + "grad_norm": 0.38923805952072144, + "learning_rate": 5.450369445835485e-06, + "loss": 1.1219, + "num_input_tokens_seen": 642734704, + "step": 16350 + }, + { + "epoch": 0.7927124721387732, + "grad_norm": 0.42891883850097656, + "learning_rate": 5.425938725470628e-06, + "loss": 1.1488, + "num_input_tokens_seen": 643108016, + "step": 16360 + }, + { + "epoch": 0.7931970152146526, + "grad_norm": 0.42537081241607666, + "learning_rate": 5.4015562153387126e-06, + "loss": 1.1634, + "num_input_tokens_seen": 643481944, + "step": 16370 + }, + { + "epoch": 0.793681558290532, + "grad_norm": 0.39477813243865967, + "learning_rate": 5.377221975493016e-06, + "loss": 1.091, + "num_input_tokens_seen": 643855924, + "step": 16380 + }, + { + "epoch": 0.7941661013664115, + "grad_norm": 0.4207918345928192, + "learning_rate": 5.352936065867931e-06, + "loss": 1.0861, + "num_input_tokens_seen": 644257816, + "step": 16390 + }, + { + "epoch": 0.7946506444422909, + "grad_norm": 0.44012096524238586, + "learning_rate": 5.328698546278798e-06, + "loss": 1.1237, + "num_input_tokens_seen": 644645828, + "step": 16400 + }, + { + "epoch": 0.7951351875181704, + "grad_norm": 0.4076230525970459, + "learning_rate": 5.30450947642179e-06, + "loss": 1.1243, + "num_input_tokens_seen": 645029716, + "step": 16410 + }, + { + "epoch": 0.7956197305940498, + "grad_norm": 0.3962908983230591, + "learning_rate": 5.280368915873749e-06, + "loss": 1.149, + "num_input_tokens_seen": 645399720, + "step": 16420 + }, + { + "epoch": 0.7961042736699293, + "grad_norm": 0.4037371575832367, + "learning_rate": 5.256276924092035e-06, + "loss": 1.1561, + "num_input_tokens_seen": 645804128, + "step": 16430 + }, + { + "epoch": 0.7965888167458087, + "grad_norm": 0.4168999493122101, + "learning_rate": 5.232233560414387e-06, + "loss": 1.1377, + "num_input_tokens_seen": 646212012, + "step": 16440 + }, + { + "epoch": 0.7970733598216881, + "grad_norm": 0.40669918060302734, + "learning_rate": 5.208238884058783e-06, + "loss": 1.1135, + "num_input_tokens_seen": 646620356, + "step": 16450 + }, + { + "epoch": 0.7975579028975676, + "grad_norm": 0.3902994692325592, + "learning_rate": 5.1842929541232544e-06, + "loss": 1.1916, + "num_input_tokens_seen": 646994340, + "step": 16460 + }, + { + "epoch": 0.7980424459734471, + "grad_norm": 0.451654314994812, + "learning_rate": 5.1603958295858205e-06, + "loss": 1.1252, + "num_input_tokens_seen": 647370752, + "step": 16470 + }, + { + "epoch": 0.7985269890493265, + "grad_norm": 0.39699918031692505, + "learning_rate": 5.136547569304246e-06, + "loss": 1.1475, + "num_input_tokens_seen": 647750888, + "step": 16480 + }, + { + "epoch": 0.7990115321252059, + "grad_norm": 0.37806710600852966, + "learning_rate": 5.112748232015993e-06, + "loss": 1.1349, + "num_input_tokens_seen": 648103956, + "step": 16490 + }, + { + "epoch": 0.7994960752010853, + "grad_norm": 0.38627612590789795, + "learning_rate": 5.088997876337981e-06, + "loss": 1.1531, + "num_input_tokens_seen": 648508460, + "step": 16500 + }, + { + "epoch": 0.7999806182769649, + "grad_norm": 0.414298951625824, + "learning_rate": 5.065296560766522e-06, + "loss": 1.1369, + "num_input_tokens_seen": 648868944, + "step": 16510 + }, + { + "epoch": 0.8004651613528443, + "grad_norm": 0.38814184069633484, + "learning_rate": 5.041644343677126e-06, + "loss": 1.1623, + "num_input_tokens_seen": 649301060, + "step": 16520 + }, + { + "epoch": 0.8009497044287237, + "grad_norm": 0.380260169506073, + "learning_rate": 5.018041283324387e-06, + "loss": 1.142, + "num_input_tokens_seen": 649700576, + "step": 16530 + }, + { + "epoch": 0.8014342475046031, + "grad_norm": 0.406630277633667, + "learning_rate": 4.994487437841827e-06, + "loss": 1.1287, + "num_input_tokens_seen": 650094908, + "step": 16540 + }, + { + "epoch": 0.8019187905804827, + "grad_norm": 0.4198836088180542, + "learning_rate": 4.9709828652417385e-06, + "loss": 1.1155, + "num_input_tokens_seen": 650509396, + "step": 16550 + }, + { + "epoch": 0.8024033336563621, + "grad_norm": 0.3846551775932312, + "learning_rate": 4.947527623415071e-06, + "loss": 1.084, + "num_input_tokens_seen": 650893528, + "step": 16560 + }, + { + "epoch": 0.8028878767322415, + "grad_norm": 0.36685115098953247, + "learning_rate": 4.924121770131274e-06, + "loss": 1.1465, + "num_input_tokens_seen": 651297116, + "step": 16570 + }, + { + "epoch": 0.8033724198081209, + "grad_norm": 0.4003463387489319, + "learning_rate": 4.900765363038151e-06, + "loss": 1.1613, + "num_input_tokens_seen": 651677048, + "step": 16580 + }, + { + "epoch": 0.8038569628840004, + "grad_norm": 0.4008733928203583, + "learning_rate": 4.87745845966171e-06, + "loss": 1.1023, + "num_input_tokens_seen": 652077724, + "step": 16590 + }, + { + "epoch": 0.8043415059598799, + "grad_norm": 0.40616297721862793, + "learning_rate": 4.854201117406066e-06, + "loss": 1.1272, + "num_input_tokens_seen": 652466264, + "step": 16600 + }, + { + "epoch": 0.8048260490357593, + "grad_norm": 0.41137826442718506, + "learning_rate": 4.830993393553215e-06, + "loss": 1.1, + "num_input_tokens_seen": 652835720, + "step": 16610 + }, + { + "epoch": 0.8053105921116387, + "grad_norm": 0.44767943024635315, + "learning_rate": 4.807835345263009e-06, + "loss": 1.1096, + "num_input_tokens_seen": 653218356, + "step": 16620 + }, + { + "epoch": 0.8057951351875182, + "grad_norm": 0.38211873173713684, + "learning_rate": 4.784727029572894e-06, + "loss": 1.1586, + "num_input_tokens_seen": 653630732, + "step": 16630 + }, + { + "epoch": 0.8062796782633976, + "grad_norm": 0.4150942862033844, + "learning_rate": 4.761668503397851e-06, + "loss": 1.1344, + "num_input_tokens_seen": 654032908, + "step": 16640 + }, + { + "epoch": 0.806764221339277, + "grad_norm": 0.3996165096759796, + "learning_rate": 4.738659823530236e-06, + "loss": 1.1124, + "num_input_tokens_seen": 654438804, + "step": 16650 + }, + { + "epoch": 0.8072487644151565, + "grad_norm": 0.3866037428379059, + "learning_rate": 4.715701046639623e-06, + "loss": 1.1109, + "num_input_tokens_seen": 654823000, + "step": 16660 + }, + { + "epoch": 0.807733307491036, + "grad_norm": 0.4089403450489044, + "learning_rate": 4.6927922292726886e-06, + "loss": 1.1196, + "num_input_tokens_seen": 655212888, + "step": 16670 + }, + { + "epoch": 0.8082178505669154, + "grad_norm": 0.43001115322113037, + "learning_rate": 4.669933427853043e-06, + "loss": 1.0934, + "num_input_tokens_seen": 655622528, + "step": 16680 + }, + { + "epoch": 0.8087023936427948, + "grad_norm": 0.40845033526420593, + "learning_rate": 4.647124698681127e-06, + "loss": 1.1133, + "num_input_tokens_seen": 656007064, + "step": 16690 + }, + { + "epoch": 0.8091869367186743, + "grad_norm": 0.38767552375793457, + "learning_rate": 4.624366097934046e-06, + "loss": 1.1088, + "num_input_tokens_seen": 656399380, + "step": 16700 + }, + { + "epoch": 0.8096714797945538, + "grad_norm": 0.39583420753479004, + "learning_rate": 4.60165768166545e-06, + "loss": 1.1914, + "num_input_tokens_seen": 656798972, + "step": 16710 + }, + { + "epoch": 0.8101560228704332, + "grad_norm": 0.4240066707134247, + "learning_rate": 4.578999505805362e-06, + "loss": 1.1416, + "num_input_tokens_seen": 657198652, + "step": 16720 + }, + { + "epoch": 0.8106405659463126, + "grad_norm": 0.3843323290348053, + "learning_rate": 4.556391626160109e-06, + "loss": 1.1275, + "num_input_tokens_seen": 657637540, + "step": 16730 + }, + { + "epoch": 0.811125109022192, + "grad_norm": 0.41490888595581055, + "learning_rate": 4.533834098412082e-06, + "loss": 1.154, + "num_input_tokens_seen": 658012976, + "step": 16740 + }, + { + "epoch": 0.8116096520980716, + "grad_norm": 0.4508754014968872, + "learning_rate": 4.511326978119718e-06, + "loss": 1.1442, + "num_input_tokens_seen": 658403516, + "step": 16750 + }, + { + "epoch": 0.812094195173951, + "grad_norm": 0.3963550925254822, + "learning_rate": 4.488870320717251e-06, + "loss": 1.1307, + "num_input_tokens_seen": 658799828, + "step": 16760 + }, + { + "epoch": 0.8125787382498304, + "grad_norm": 0.4240289032459259, + "learning_rate": 4.466464181514657e-06, + "loss": 1.1333, + "num_input_tokens_seen": 659188772, + "step": 16770 + }, + { + "epoch": 0.8130632813257098, + "grad_norm": 0.4394795596599579, + "learning_rate": 4.444108615697476e-06, + "loss": 1.1301, + "num_input_tokens_seen": 659605584, + "step": 16780 + }, + { + "epoch": 0.8135478244015893, + "grad_norm": 0.391284704208374, + "learning_rate": 4.421803678326691e-06, + "loss": 1.0913, + "num_input_tokens_seen": 660023780, + "step": 16790 + }, + { + "epoch": 0.8140323674774688, + "grad_norm": 0.37516680359840393, + "learning_rate": 4.399549424338589e-06, + "loss": 1.0831, + "num_input_tokens_seen": 660418884, + "step": 16800 + }, + { + "epoch": 0.8145169105533482, + "grad_norm": 0.415872186422348, + "learning_rate": 4.3773459085446275e-06, + "loss": 1.1123, + "num_input_tokens_seen": 660814748, + "step": 16810 + }, + { + "epoch": 0.8150014536292276, + "grad_norm": 0.4023378789424896, + "learning_rate": 4.355193185631287e-06, + "loss": 1.087, + "num_input_tokens_seen": 661192768, + "step": 16820 + }, + { + "epoch": 0.8154859967051071, + "grad_norm": 0.3767937421798706, + "learning_rate": 4.333091310159956e-06, + "loss": 1.1545, + "num_input_tokens_seen": 661606668, + "step": 16830 + }, + { + "epoch": 0.8159705397809865, + "grad_norm": 0.5043548941612244, + "learning_rate": 4.311040336566791e-06, + "loss": 1.1445, + "num_input_tokens_seen": 661982448, + "step": 16840 + }, + { + "epoch": 0.816455082856866, + "grad_norm": 0.3812048137187958, + "learning_rate": 4.289040319162569e-06, + "loss": 1.136, + "num_input_tokens_seen": 662373168, + "step": 16850 + }, + { + "epoch": 0.8169396259327454, + "grad_norm": 0.40610408782958984, + "learning_rate": 4.267091312132576e-06, + "loss": 1.1151, + "num_input_tokens_seen": 662758704, + "step": 16860 + }, + { + "epoch": 0.8174241690086249, + "grad_norm": 0.4081581234931946, + "learning_rate": 4.245193369536437e-06, + "loss": 1.1532, + "num_input_tokens_seen": 663150068, + "step": 16870 + }, + { + "epoch": 0.8179087120845043, + "grad_norm": 0.41858088970184326, + "learning_rate": 4.2233465453080486e-06, + "loss": 1.1054, + "num_input_tokens_seen": 663562480, + "step": 16880 + }, + { + "epoch": 0.8183932551603837, + "grad_norm": 0.45081600546836853, + "learning_rate": 4.20155089325536e-06, + "loss": 1.128, + "num_input_tokens_seen": 663941916, + "step": 16890 + }, + { + "epoch": 0.8188777982362632, + "grad_norm": 0.41167008876800537, + "learning_rate": 4.17980646706031e-06, + "loss": 1.1354, + "num_input_tokens_seen": 664345336, + "step": 16900 + }, + { + "epoch": 0.8193623413121427, + "grad_norm": 0.39505717158317566, + "learning_rate": 4.158113320278667e-06, + "loss": 1.084, + "num_input_tokens_seen": 664762920, + "step": 16910 + }, + { + "epoch": 0.8198468843880221, + "grad_norm": 0.3788187503814697, + "learning_rate": 4.136471506339901e-06, + "loss": 1.1064, + "num_input_tokens_seen": 665156684, + "step": 16920 + }, + { + "epoch": 0.8203314274639015, + "grad_norm": 0.43902385234832764, + "learning_rate": 4.114881078547042e-06, + "loss": 1.1317, + "num_input_tokens_seen": 665557436, + "step": 16930 + }, + { + "epoch": 0.820815970539781, + "grad_norm": 0.3830069303512573, + "learning_rate": 4.093342090076571e-06, + "loss": 1.127, + "num_input_tokens_seen": 665945288, + "step": 16940 + }, + { + "epoch": 0.8213005136156605, + "grad_norm": 0.4254535734653473, + "learning_rate": 4.071854593978253e-06, + "loss": 1.1208, + "num_input_tokens_seen": 666344140, + "step": 16950 + }, + { + "epoch": 0.8217850566915399, + "grad_norm": 0.38880154490470886, + "learning_rate": 4.050418643175067e-06, + "loss": 1.1566, + "num_input_tokens_seen": 666728028, + "step": 16960 + }, + { + "epoch": 0.8222695997674193, + "grad_norm": 0.4109187126159668, + "learning_rate": 4.029034290462996e-06, + "loss": 1.1369, + "num_input_tokens_seen": 667095588, + "step": 16970 + }, + { + "epoch": 0.8227541428432987, + "grad_norm": 0.36832395195961, + "learning_rate": 4.0077015885109676e-06, + "loss": 1.1458, + "num_input_tokens_seen": 667490212, + "step": 16980 + }, + { + "epoch": 0.8232386859191783, + "grad_norm": 0.38334590196609497, + "learning_rate": 3.986420589860682e-06, + "loss": 1.1087, + "num_input_tokens_seen": 667892264, + "step": 16990 + }, + { + "epoch": 0.8237232289950577, + "grad_norm": 0.41069987416267395, + "learning_rate": 3.9651913469265e-06, + "loss": 1.0849, + "num_input_tokens_seen": 668289636, + "step": 17000 + }, + { + "epoch": 0.8242077720709371, + "grad_norm": 0.3793525695800781, + "learning_rate": 3.944013911995317e-06, + "loss": 1.1524, + "num_input_tokens_seen": 668676344, + "step": 17010 + }, + { + "epoch": 0.8246923151468165, + "grad_norm": 0.41535207629203796, + "learning_rate": 3.922888337226399e-06, + "loss": 1.1035, + "num_input_tokens_seen": 669042236, + "step": 17020 + }, + { + "epoch": 0.825176858222696, + "grad_norm": 0.3818122148513794, + "learning_rate": 3.90181467465133e-06, + "loss": 1.103, + "num_input_tokens_seen": 669438068, + "step": 17030 + }, + { + "epoch": 0.8256614012985755, + "grad_norm": 0.4072776436805725, + "learning_rate": 3.880792976173788e-06, + "loss": 1.1103, + "num_input_tokens_seen": 669833020, + "step": 17040 + }, + { + "epoch": 0.8261459443744549, + "grad_norm": 0.3956544101238251, + "learning_rate": 3.859823293569495e-06, + "loss": 1.1063, + "num_input_tokens_seen": 670234476, + "step": 17050 + }, + { + "epoch": 0.8266304874503343, + "grad_norm": 0.4025129973888397, + "learning_rate": 3.838905678486049e-06, + "loss": 1.0968, + "num_input_tokens_seen": 670625508, + "step": 17060 + }, + { + "epoch": 0.8271150305262138, + "grad_norm": 0.48538658022880554, + "learning_rate": 3.818040182442814e-06, + "loss": 1.1978, + "num_input_tokens_seen": 671013016, + "step": 17070 + }, + { + "epoch": 0.8275995736020932, + "grad_norm": 0.39329686760902405, + "learning_rate": 3.7972268568307685e-06, + "loss": 1.1016, + "num_input_tokens_seen": 671418600, + "step": 17080 + }, + { + "epoch": 0.8280841166779727, + "grad_norm": 0.39455854892730713, + "learning_rate": 3.776465752912428e-06, + "loss": 1.16, + "num_input_tokens_seen": 671802384, + "step": 17090 + }, + { + "epoch": 0.8285686597538521, + "grad_norm": 0.5405839681625366, + "learning_rate": 3.7557569218216516e-06, + "loss": 1.164, + "num_input_tokens_seen": 672200552, + "step": 17100 + }, + { + "epoch": 0.8290532028297316, + "grad_norm": 0.369508296251297, + "learning_rate": 3.735100414563594e-06, + "loss": 1.1282, + "num_input_tokens_seen": 672618664, + "step": 17110 + }, + { + "epoch": 0.829537745905611, + "grad_norm": 0.41533350944519043, + "learning_rate": 3.7144962820144928e-06, + "loss": 1.1445, + "num_input_tokens_seen": 673004068, + "step": 17120 + }, + { + "epoch": 0.8300222889814904, + "grad_norm": 0.38036444783210754, + "learning_rate": 3.6939445749216235e-06, + "loss": 1.1596, + "num_input_tokens_seen": 673389044, + "step": 17130 + }, + { + "epoch": 0.8305068320573699, + "grad_norm": 0.37696197628974915, + "learning_rate": 3.6734453439031257e-06, + "loss": 1.1471, + "num_input_tokens_seen": 673724844, + "step": 17140 + }, + { + "epoch": 0.8309913751332494, + "grad_norm": 0.40430694818496704, + "learning_rate": 3.65299863944788e-06, + "loss": 1.095, + "num_input_tokens_seen": 674106168, + "step": 17150 + }, + { + "epoch": 0.8314759182091288, + "grad_norm": 0.468234658241272, + "learning_rate": 3.6326045119154327e-06, + "loss": 1.108, + "num_input_tokens_seen": 674469340, + "step": 17160 + }, + { + "epoch": 0.8319604612850082, + "grad_norm": 0.40100011229515076, + "learning_rate": 3.612263011535791e-06, + "loss": 1.1609, + "num_input_tokens_seen": 674828188, + "step": 17170 + }, + { + "epoch": 0.8324450043608876, + "grad_norm": 0.42651060223579407, + "learning_rate": 3.5919741884093723e-06, + "loss": 1.1289, + "num_input_tokens_seen": 675221792, + "step": 17180 + }, + { + "epoch": 0.8329295474367672, + "grad_norm": 0.3936256468296051, + "learning_rate": 3.5717380925068405e-06, + "loss": 1.1308, + "num_input_tokens_seen": 675644508, + "step": 17190 + }, + { + "epoch": 0.8334140905126466, + "grad_norm": 0.41212528944015503, + "learning_rate": 3.5515547736689995e-06, + "loss": 1.0939, + "num_input_tokens_seen": 676038672, + "step": 17200 + }, + { + "epoch": 0.833898633588526, + "grad_norm": 0.4407350420951843, + "learning_rate": 3.531424281606663e-06, + "loss": 1.1637, + "num_input_tokens_seen": 676431380, + "step": 17210 + }, + { + "epoch": 0.8343831766644054, + "grad_norm": 0.37873637676239014, + "learning_rate": 3.511346665900536e-06, + "loss": 1.1442, + "num_input_tokens_seen": 676821844, + "step": 17220 + }, + { + "epoch": 0.834867719740285, + "grad_norm": 0.39286068081855774, + "learning_rate": 3.491321976001072e-06, + "loss": 1.1215, + "num_input_tokens_seen": 677199996, + "step": 17230 + }, + { + "epoch": 0.8353522628161644, + "grad_norm": 0.42986762523651123, + "learning_rate": 3.471350261228412e-06, + "loss": 1.0996, + "num_input_tokens_seen": 677592120, + "step": 17240 + }, + { + "epoch": 0.8358368058920438, + "grad_norm": 0.4484102725982666, + "learning_rate": 3.451431570772179e-06, + "loss": 1.1268, + "num_input_tokens_seen": 677999524, + "step": 17250 + }, + { + "epoch": 0.8363213489679232, + "grad_norm": 0.407656729221344, + "learning_rate": 3.431565953691418e-06, + "loss": 1.1505, + "num_input_tokens_seen": 678368908, + "step": 17260 + }, + { + "epoch": 0.8368058920438027, + "grad_norm": 0.398433119058609, + "learning_rate": 3.4117534589144547e-06, + "loss": 1.1351, + "num_input_tokens_seen": 678739932, + "step": 17270 + }, + { + "epoch": 0.8372904351196822, + "grad_norm": 0.47509506344795227, + "learning_rate": 3.3919941352387768e-06, + "loss": 1.0815, + "num_input_tokens_seen": 679131488, + "step": 17280 + }, + { + "epoch": 0.8377749781955616, + "grad_norm": 0.3671634793281555, + "learning_rate": 3.372288031330917e-06, + "loss": 1.1059, + "num_input_tokens_seen": 679542760, + "step": 17290 + }, + { + "epoch": 0.838259521271441, + "grad_norm": 0.4009820818901062, + "learning_rate": 3.3526351957263115e-06, + "loss": 1.1132, + "num_input_tokens_seen": 679961904, + "step": 17300 + }, + { + "epoch": 0.8387440643473205, + "grad_norm": 0.3820185363292694, + "learning_rate": 3.3330356768292215e-06, + "loss": 1.0903, + "num_input_tokens_seen": 680347620, + "step": 17310 + }, + { + "epoch": 0.8392286074231999, + "grad_norm": 0.45867863297462463, + "learning_rate": 3.3134895229125772e-06, + "loss": 1.1581, + "num_input_tokens_seen": 680716452, + "step": 17320 + }, + { + "epoch": 0.8397131504990794, + "grad_norm": 0.4050085246562958, + "learning_rate": 3.293996782117881e-06, + "loss": 1.0828, + "num_input_tokens_seen": 681101364, + "step": 17330 + }, + { + "epoch": 0.8401976935749588, + "grad_norm": 0.39128220081329346, + "learning_rate": 3.2745575024550695e-06, + "loss": 1.0908, + "num_input_tokens_seen": 681520008, + "step": 17340 + }, + { + "epoch": 0.8406822366508383, + "grad_norm": 0.47006210684776306, + "learning_rate": 3.2551717318024255e-06, + "loss": 1.1409, + "num_input_tokens_seen": 681899268, + "step": 17350 + }, + { + "epoch": 0.8411667797267177, + "grad_norm": 0.41448378562927246, + "learning_rate": 3.235839517906411e-06, + "loss": 1.1815, + "num_input_tokens_seen": 682286376, + "step": 17360 + }, + { + "epoch": 0.8416513228025971, + "grad_norm": 0.3959016501903534, + "learning_rate": 3.216560908381616e-06, + "loss": 1.1594, + "num_input_tokens_seen": 682681272, + "step": 17370 + }, + { + "epoch": 0.8421358658784766, + "grad_norm": 0.39144328236579895, + "learning_rate": 3.1973359507105645e-06, + "loss": 1.0814, + "num_input_tokens_seen": 683085540, + "step": 17380 + }, + { + "epoch": 0.8426204089543561, + "grad_norm": 0.5121538043022156, + "learning_rate": 3.1781646922436848e-06, + "loss": 1.1284, + "num_input_tokens_seen": 683457100, + "step": 17390 + }, + { + "epoch": 0.8431049520302355, + "grad_norm": 0.4063737094402313, + "learning_rate": 3.1590471801991012e-06, + "loss": 1.1375, + "num_input_tokens_seen": 683831048, + "step": 17400 + }, + { + "epoch": 0.8435894951061149, + "grad_norm": 0.3836385905742645, + "learning_rate": 3.1399834616625907e-06, + "loss": 1.1183, + "num_input_tokens_seen": 684224324, + "step": 17410 + }, + { + "epoch": 0.8440740381819943, + "grad_norm": 0.39683645963668823, + "learning_rate": 3.120973583587425e-06, + "loss": 1.1197, + "num_input_tokens_seen": 684624492, + "step": 17420 + }, + { + "epoch": 0.8445585812578739, + "grad_norm": 0.3896249234676361, + "learning_rate": 3.102017592794279e-06, + "loss": 1.0956, + "num_input_tokens_seen": 685029460, + "step": 17430 + }, + { + "epoch": 0.8450431243337533, + "grad_norm": 0.39281994104385376, + "learning_rate": 3.0831155359710927e-06, + "loss": 1.158, + "num_input_tokens_seen": 685406276, + "step": 17440 + }, + { + "epoch": 0.8455276674096327, + "grad_norm": 0.4067038893699646, + "learning_rate": 3.0642674596729785e-06, + "loss": 1.1395, + "num_input_tokens_seen": 685780380, + "step": 17450 + }, + { + "epoch": 0.8460122104855121, + "grad_norm": 0.3911641836166382, + "learning_rate": 3.0454734103220942e-06, + "loss": 1.1158, + "num_input_tokens_seen": 686162356, + "step": 17460 + }, + { + "epoch": 0.8464967535613916, + "grad_norm": 0.3925408720970154, + "learning_rate": 3.026733434207532e-06, + "loss": 1.1467, + "num_input_tokens_seen": 686553108, + "step": 17470 + }, + { + "epoch": 0.8469812966372711, + "grad_norm": 0.42923226952552795, + "learning_rate": 3.008047577485204e-06, + "loss": 1.1584, + "num_input_tokens_seen": 686914652, + "step": 17480 + }, + { + "epoch": 0.8474658397131505, + "grad_norm": 0.4102732837200165, + "learning_rate": 2.98941588617771e-06, + "loss": 1.1051, + "num_input_tokens_seen": 687275992, + "step": 17490 + }, + { + "epoch": 0.8479503827890299, + "grad_norm": 0.41153576970100403, + "learning_rate": 2.970838406174284e-06, + "loss": 1.1446, + "num_input_tokens_seen": 687689920, + "step": 17500 + }, + { + "epoch": 0.8484349258649094, + "grad_norm": 0.4032898545265198, + "learning_rate": 2.952315183230589e-06, + "loss": 1.1283, + "num_input_tokens_seen": 688080568, + "step": 17510 + }, + { + "epoch": 0.8489194689407888, + "grad_norm": 0.4179829955101013, + "learning_rate": 2.9338462629687034e-06, + "loss": 1.1031, + "num_input_tokens_seen": 688453284, + "step": 17520 + }, + { + "epoch": 0.8494040120166683, + "grad_norm": 0.4297243058681488, + "learning_rate": 2.915431690876916e-06, + "loss": 1.1442, + "num_input_tokens_seen": 688838776, + "step": 17530 + }, + { + "epoch": 0.8498885550925477, + "grad_norm": 0.4191533029079437, + "learning_rate": 2.8970715123096877e-06, + "loss": 1.1041, + "num_input_tokens_seen": 689225084, + "step": 17540 + }, + { + "epoch": 0.8503730981684272, + "grad_norm": 0.4235047996044159, + "learning_rate": 2.8787657724875016e-06, + "loss": 1.0696, + "num_input_tokens_seen": 689607636, + "step": 17550 + }, + { + "epoch": 0.8508576412443066, + "grad_norm": 0.409962922334671, + "learning_rate": 2.860514516496754e-06, + "loss": 1.1092, + "num_input_tokens_seen": 689951392, + "step": 17560 + }, + { + "epoch": 0.851342184320186, + "grad_norm": 0.4196973443031311, + "learning_rate": 2.8423177892896585e-06, + "loss": 1.0901, + "num_input_tokens_seen": 690348064, + "step": 17570 + }, + { + "epoch": 0.8518267273960655, + "grad_norm": 0.3807368278503418, + "learning_rate": 2.8241756356841233e-06, + "loss": 1.1281, + "num_input_tokens_seen": 690749348, + "step": 17580 + }, + { + "epoch": 0.852311270471945, + "grad_norm": 0.40411800146102905, + "learning_rate": 2.806088100363635e-06, + "loss": 1.0834, + "num_input_tokens_seen": 691138376, + "step": 17590 + }, + { + "epoch": 0.8527958135478244, + "grad_norm": 0.39498665928840637, + "learning_rate": 2.7880552278771703e-06, + "loss": 1.0952, + "num_input_tokens_seen": 691567648, + "step": 17600 + }, + { + "epoch": 0.8532803566237038, + "grad_norm": 0.434725284576416, + "learning_rate": 2.7700770626390677e-06, + "loss": 1.1152, + "num_input_tokens_seen": 691920228, + "step": 17610 + }, + { + "epoch": 0.8537648996995832, + "grad_norm": 0.3900477886199951, + "learning_rate": 2.7521536489289233e-06, + "loss": 1.1517, + "num_input_tokens_seen": 692294800, + "step": 17620 + }, + { + "epoch": 0.8542494427754628, + "grad_norm": 0.42599180340766907, + "learning_rate": 2.7342850308914843e-06, + "loss": 1.1066, + "num_input_tokens_seen": 692670100, + "step": 17630 + }, + { + "epoch": 0.8547339858513422, + "grad_norm": 0.416204571723938, + "learning_rate": 2.716471252536526e-06, + "loss": 1.0739, + "num_input_tokens_seen": 693061084, + "step": 17640 + }, + { + "epoch": 0.8552185289272216, + "grad_norm": 0.40145227313041687, + "learning_rate": 2.6987123577387833e-06, + "loss": 1.1394, + "num_input_tokens_seen": 693458880, + "step": 17650 + }, + { + "epoch": 0.855703072003101, + "grad_norm": 0.38669687509536743, + "learning_rate": 2.6810083902377825e-06, + "loss": 1.1175, + "num_input_tokens_seen": 693867900, + "step": 17660 + }, + { + "epoch": 0.8561876150789806, + "grad_norm": 0.42774924635887146, + "learning_rate": 2.663359393637785e-06, + "loss": 1.1, + "num_input_tokens_seen": 694285092, + "step": 17670 + }, + { + "epoch": 0.85667215815486, + "grad_norm": 0.39861494302749634, + "learning_rate": 2.645765411407655e-06, + "loss": 1.1243, + "num_input_tokens_seen": 694666360, + "step": 17680 + }, + { + "epoch": 0.8571567012307394, + "grad_norm": 0.3979116678237915, + "learning_rate": 2.6282264868807637e-06, + "loss": 1.1226, + "num_input_tokens_seen": 695052444, + "step": 17690 + }, + { + "epoch": 0.8576412443066188, + "grad_norm": 0.4196152985095978, + "learning_rate": 2.61074266325487e-06, + "loss": 1.1283, + "num_input_tokens_seen": 695441168, + "step": 17700 + }, + { + "epoch": 0.8581257873824983, + "grad_norm": 0.41752830147743225, + "learning_rate": 2.59331398359203e-06, + "loss": 1.0917, + "num_input_tokens_seen": 695864664, + "step": 17710 + }, + { + "epoch": 0.8586103304583778, + "grad_norm": 0.41962143778800964, + "learning_rate": 2.5759404908184654e-06, + "loss": 1.1223, + "num_input_tokens_seen": 696267288, + "step": 17720 + }, + { + "epoch": 0.8590948735342572, + "grad_norm": 0.4000556170940399, + "learning_rate": 2.5586222277244887e-06, + "loss": 1.1531, + "num_input_tokens_seen": 696650996, + "step": 17730 + }, + { + "epoch": 0.8595794166101366, + "grad_norm": 0.4215715229511261, + "learning_rate": 2.541359236964386e-06, + "loss": 1.0838, + "num_input_tokens_seen": 697023960, + "step": 17740 + }, + { + "epoch": 0.8600639596860161, + "grad_norm": 0.3923567533493042, + "learning_rate": 2.5241515610562983e-06, + "loss": 1.1086, + "num_input_tokens_seen": 697411632, + "step": 17750 + }, + { + "epoch": 0.8605485027618955, + "grad_norm": 0.39392712712287903, + "learning_rate": 2.506999242382141e-06, + "loss": 1.1011, + "num_input_tokens_seen": 697800768, + "step": 17760 + }, + { + "epoch": 0.861033045837775, + "grad_norm": 0.39785829186439514, + "learning_rate": 2.489902323187465e-06, + "loss": 1.0928, + "num_input_tokens_seen": 698174508, + "step": 17770 + }, + { + "epoch": 0.8615175889136544, + "grad_norm": 0.4119921028614044, + "learning_rate": 2.472860845581404e-06, + "loss": 1.1081, + "num_input_tokens_seen": 698560436, + "step": 17780 + }, + { + "epoch": 0.8620021319895339, + "grad_norm": 0.4029790461063385, + "learning_rate": 2.455874851536516e-06, + "loss": 1.1645, + "num_input_tokens_seen": 698924028, + "step": 17790 + }, + { + "epoch": 0.8624866750654133, + "grad_norm": 0.4168151915073395, + "learning_rate": 2.4389443828887166e-06, + "loss": 1.1385, + "num_input_tokens_seen": 699315512, + "step": 17800 + }, + { + "epoch": 0.8629712181412927, + "grad_norm": 0.44244882464408875, + "learning_rate": 2.422069481337161e-06, + "loss": 1.1355, + "num_input_tokens_seen": 699695032, + "step": 17810 + }, + { + "epoch": 0.8634557612171722, + "grad_norm": 0.3886070251464844, + "learning_rate": 2.405250188444147e-06, + "loss": 1.1019, + "num_input_tokens_seen": 700102396, + "step": 17820 + }, + { + "epoch": 0.8639403042930517, + "grad_norm": 0.4062063992023468, + "learning_rate": 2.3884865456350103e-06, + "loss": 1.1314, + "num_input_tokens_seen": 700467740, + "step": 17830 + }, + { + "epoch": 0.8644248473689311, + "grad_norm": 0.4632991552352905, + "learning_rate": 2.371778594198021e-06, + "loss": 1.1259, + "num_input_tokens_seen": 700860544, + "step": 17840 + }, + { + "epoch": 0.8649093904448105, + "grad_norm": 0.3729296624660492, + "learning_rate": 2.355126375284272e-06, + "loss": 1.1139, + "num_input_tokens_seen": 701260456, + "step": 17850 + }, + { + "epoch": 0.8653939335206899, + "grad_norm": 0.3984206020832062, + "learning_rate": 2.338529929907618e-06, + "loss": 1.1629, + "num_input_tokens_seen": 701665356, + "step": 17860 + }, + { + "epoch": 0.8658784765965695, + "grad_norm": 0.39754825830459595, + "learning_rate": 2.321989298944513e-06, + "loss": 1.1423, + "num_input_tokens_seen": 702049100, + "step": 17870 + }, + { + "epoch": 0.8663630196724489, + "grad_norm": 0.4021519720554352, + "learning_rate": 2.305504523133964e-06, + "loss": 1.1645, + "num_input_tokens_seen": 702418804, + "step": 17880 + }, + { + "epoch": 0.8668475627483283, + "grad_norm": 0.41587236523628235, + "learning_rate": 2.2890756430773956e-06, + "loss": 1.1121, + "num_input_tokens_seen": 702806248, + "step": 17890 + }, + { + "epoch": 0.8673321058242077, + "grad_norm": 0.38616082072257996, + "learning_rate": 2.272702699238574e-06, + "loss": 1.1448, + "num_input_tokens_seen": 703198880, + "step": 17900 + }, + { + "epoch": 0.8678166489000873, + "grad_norm": 0.4032549560070038, + "learning_rate": 2.2563857319434945e-06, + "loss": 1.1484, + "num_input_tokens_seen": 703603228, + "step": 17910 + }, + { + "epoch": 0.8683011919759667, + "grad_norm": 0.4154454171657562, + "learning_rate": 2.2401247813802652e-06, + "loss": 1.0953, + "num_input_tokens_seen": 703977824, + "step": 17920 + }, + { + "epoch": 0.8687857350518461, + "grad_norm": 0.3659140169620514, + "learning_rate": 2.223919887599063e-06, + "loss": 1.136, + "num_input_tokens_seen": 704375196, + "step": 17930 + }, + { + "epoch": 0.8692702781277255, + "grad_norm": 0.4203336834907532, + "learning_rate": 2.2077710905119618e-06, + "loss": 1.1147, + "num_input_tokens_seen": 704771996, + "step": 17940 + }, + { + "epoch": 0.869754821203605, + "grad_norm": 0.4060530364513397, + "learning_rate": 2.191678429892893e-06, + "loss": 1.1429, + "num_input_tokens_seen": 705149948, + "step": 17950 + }, + { + "epoch": 0.8702393642794845, + "grad_norm": 0.4052405059337616, + "learning_rate": 2.175641945377524e-06, + "loss": 1.1261, + "num_input_tokens_seen": 705545528, + "step": 17960 + }, + { + "epoch": 0.8707239073553639, + "grad_norm": 0.39759910106658936, + "learning_rate": 2.1596616764631595e-06, + "loss": 1.1454, + "num_input_tokens_seen": 705922212, + "step": 17970 + }, + { + "epoch": 0.8712084504312433, + "grad_norm": 0.3922360837459564, + "learning_rate": 2.143737662508635e-06, + "loss": 1.0771, + "num_input_tokens_seen": 706339472, + "step": 17980 + }, + { + "epoch": 0.8716929935071228, + "grad_norm": 0.39637112617492676, + "learning_rate": 2.127869942734262e-06, + "loss": 1.0886, + "num_input_tokens_seen": 706709976, + "step": 17990 + }, + { + "epoch": 0.8721775365830022, + "grad_norm": 0.36679452657699585, + "learning_rate": 2.112058556221663e-06, + "loss": 1.137, + "num_input_tokens_seen": 707117360, + "step": 18000 + }, + { + "epoch": 0.8721775365830022, + "eval_loss": 1.1178553104400635, + "eval_runtime": 5.0781, + "eval_samples_per_second": 29.539, + "eval_steps_per_second": 3.742, + "num_input_tokens_seen": 707117360, + "step": 18000 + }, + { + "epoch": 0.8726620796588817, + "grad_norm": 0.3815447986125946, + "learning_rate": 2.0963035419137577e-06, + "loss": 1.1119, + "num_input_tokens_seen": 707514656, + "step": 18010 + }, + { + "epoch": 0.8731466227347611, + "grad_norm": 0.39872461557388306, + "learning_rate": 2.0806049386145774e-06, + "loss": 1.1278, + "num_input_tokens_seen": 707917760, + "step": 18020 + }, + { + "epoch": 0.8736311658106406, + "grad_norm": 0.3805377781391144, + "learning_rate": 2.0649627849892466e-06, + "loss": 1.1064, + "num_input_tokens_seen": 708298628, + "step": 18030 + }, + { + "epoch": 0.87411570888652, + "grad_norm": 0.3966962695121765, + "learning_rate": 2.0493771195638443e-06, + "loss": 1.1155, + "num_input_tokens_seen": 708713480, + "step": 18040 + }, + { + "epoch": 0.8746002519623994, + "grad_norm": 0.42251577973365784, + "learning_rate": 2.033847980725323e-06, + "loss": 1.1489, + "num_input_tokens_seen": 709099180, + "step": 18050 + }, + { + "epoch": 0.8750847950382789, + "grad_norm": 0.4105037450790405, + "learning_rate": 2.018375406721415e-06, + "loss": 1.1305, + "num_input_tokens_seen": 709499996, + "step": 18060 + }, + { + "epoch": 0.8755693381141584, + "grad_norm": 0.4050074517726898, + "learning_rate": 2.0029594356605286e-06, + "loss": 1.1278, + "num_input_tokens_seen": 709902500, + "step": 18070 + }, + { + "epoch": 0.8760538811900378, + "grad_norm": 0.3788783550262451, + "learning_rate": 1.9876001055116664e-06, + "loss": 1.0709, + "num_input_tokens_seen": 710294012, + "step": 18080 + }, + { + "epoch": 0.8765384242659172, + "grad_norm": 0.4039725959300995, + "learning_rate": 1.9722974541043244e-06, + "loss": 1.131, + "num_input_tokens_seen": 710669864, + "step": 18090 + }, + { + "epoch": 0.8770229673417966, + "grad_norm": 0.3935058116912842, + "learning_rate": 1.957051519128403e-06, + "loss": 1.092, + "num_input_tokens_seen": 711071604, + "step": 18100 + }, + { + "epoch": 0.8775075104176762, + "grad_norm": 0.42256423830986023, + "learning_rate": 1.9418623381341094e-06, + "loss": 1.1371, + "num_input_tokens_seen": 711476612, + "step": 18110 + }, + { + "epoch": 0.8779920534935556, + "grad_norm": 0.3739723265171051, + "learning_rate": 1.926729948531872e-06, + "loss": 1.14, + "num_input_tokens_seen": 711872752, + "step": 18120 + }, + { + "epoch": 0.878476596569435, + "grad_norm": 0.3799099624156952, + "learning_rate": 1.9116543875922294e-06, + "loss": 1.1055, + "num_input_tokens_seen": 712244912, + "step": 18130 + }, + { + "epoch": 0.8789611396453144, + "grad_norm": 0.3699980080127716, + "learning_rate": 1.8966356924457833e-06, + "loss": 1.1191, + "num_input_tokens_seen": 712647364, + "step": 18140 + }, + { + "epoch": 0.8794456827211939, + "grad_norm": 0.3966718316078186, + "learning_rate": 1.881673900083042e-06, + "loss": 1.1611, + "num_input_tokens_seen": 713028800, + "step": 18150 + }, + { + "epoch": 0.8799302257970734, + "grad_norm": 0.3989785313606262, + "learning_rate": 1.8667690473543858e-06, + "loss": 1.1063, + "num_input_tokens_seen": 713412044, + "step": 18160 + }, + { + "epoch": 0.8804147688729528, + "grad_norm": 0.3920852541923523, + "learning_rate": 1.8519211709699475e-06, + "loss": 1.1163, + "num_input_tokens_seen": 713796584, + "step": 18170 + }, + { + "epoch": 0.8808993119488322, + "grad_norm": 0.4311419129371643, + "learning_rate": 1.837130307499535e-06, + "loss": 1.0843, + "num_input_tokens_seen": 714199172, + "step": 18180 + }, + { + "epoch": 0.8813838550247117, + "grad_norm": 0.3938891887664795, + "learning_rate": 1.822396493372533e-06, + "loss": 1.0989, + "num_input_tokens_seen": 714590532, + "step": 18190 + }, + { + "epoch": 0.8818683981005911, + "grad_norm": 0.42306891083717346, + "learning_rate": 1.807719764877805e-06, + "loss": 1.0779, + "num_input_tokens_seen": 715001368, + "step": 18200 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.39101406931877136, + "learning_rate": 1.793100158163627e-06, + "loss": 1.1353, + "num_input_tokens_seen": 715383004, + "step": 18210 + }, + { + "epoch": 0.88283748425235, + "grad_norm": 0.3895913362503052, + "learning_rate": 1.7785377092375848e-06, + "loss": 1.1042, + "num_input_tokens_seen": 715769884, + "step": 18220 + }, + { + "epoch": 0.8833220273282295, + "grad_norm": 0.40741434693336487, + "learning_rate": 1.7640324539664827e-06, + "loss": 1.1079, + "num_input_tokens_seen": 716157804, + "step": 18230 + }, + { + "epoch": 0.8838065704041089, + "grad_norm": 0.40758630633354187, + "learning_rate": 1.749584428076262e-06, + "loss": 1.1415, + "num_input_tokens_seen": 716535668, + "step": 18240 + }, + { + "epoch": 0.8842911134799883, + "grad_norm": 0.39152994751930237, + "learning_rate": 1.7351936671519104e-06, + "loss": 1.1472, + "num_input_tokens_seen": 716917736, + "step": 18250 + }, + { + "epoch": 0.8847756565558678, + "grad_norm": 0.3884604275226593, + "learning_rate": 1.7208602066373592e-06, + "loss": 1.1329, + "num_input_tokens_seen": 717307888, + "step": 18260 + }, + { + "epoch": 0.8852601996317473, + "grad_norm": 0.4423676133155823, + "learning_rate": 1.706584081835444e-06, + "loss": 1.154, + "num_input_tokens_seen": 717697032, + "step": 18270 + }, + { + "epoch": 0.8857447427076267, + "grad_norm": 0.39519426226615906, + "learning_rate": 1.6923653279077468e-06, + "loss": 1.106, + "num_input_tokens_seen": 718093096, + "step": 18280 + }, + { + "epoch": 0.8862292857835061, + "grad_norm": 0.4418719708919525, + "learning_rate": 1.6782039798745791e-06, + "loss": 1.1005, + "num_input_tokens_seen": 718458596, + "step": 18290 + }, + { + "epoch": 0.8867138288593855, + "grad_norm": 0.39423951506614685, + "learning_rate": 1.6641000726148353e-06, + "loss": 1.109, + "num_input_tokens_seen": 718874968, + "step": 18300 + }, + { + "epoch": 0.8871983719352651, + "grad_norm": 0.40183213353157043, + "learning_rate": 1.650053640865959e-06, + "loss": 1.1351, + "num_input_tokens_seen": 719282808, + "step": 18310 + }, + { + "epoch": 0.8876829150111445, + "grad_norm": 0.3977072238922119, + "learning_rate": 1.6360647192238176e-06, + "loss": 1.1096, + "num_input_tokens_seen": 719670048, + "step": 18320 + }, + { + "epoch": 0.8881674580870239, + "grad_norm": 0.4405307173728943, + "learning_rate": 1.6221333421426483e-06, + "loss": 1.1682, + "num_input_tokens_seen": 720046844, + "step": 18330 + }, + { + "epoch": 0.8886520011629033, + "grad_norm": 0.38534072041511536, + "learning_rate": 1.6082595439349368e-06, + "loss": 1.1598, + "num_input_tokens_seen": 720440272, + "step": 18340 + }, + { + "epoch": 0.8891365442387829, + "grad_norm": 0.3926476836204529, + "learning_rate": 1.5944433587713693e-06, + "loss": 1.1201, + "num_input_tokens_seen": 720832848, + "step": 18350 + }, + { + "epoch": 0.8896210873146623, + "grad_norm": 0.38580575585365295, + "learning_rate": 1.5806848206807362e-06, + "loss": 1.1723, + "num_input_tokens_seen": 721201392, + "step": 18360 + }, + { + "epoch": 0.8901056303905417, + "grad_norm": 0.41746583580970764, + "learning_rate": 1.566983963549834e-06, + "loss": 1.0699, + "num_input_tokens_seen": 721622692, + "step": 18370 + }, + { + "epoch": 0.8905901734664212, + "grad_norm": 0.3924805223941803, + "learning_rate": 1.5533408211234002e-06, + "loss": 1.1053, + "num_input_tokens_seen": 722027036, + "step": 18380 + }, + { + "epoch": 0.8910747165423006, + "grad_norm": 0.37664535641670227, + "learning_rate": 1.5397554270040137e-06, + "loss": 1.1089, + "num_input_tokens_seen": 722405744, + "step": 18390 + }, + { + "epoch": 0.8915592596181801, + "grad_norm": 0.4099685847759247, + "learning_rate": 1.5262278146520426e-06, + "loss": 1.1115, + "num_input_tokens_seen": 722805360, + "step": 18400 + }, + { + "epoch": 0.8920438026940595, + "grad_norm": 0.3878704607486725, + "learning_rate": 1.5127580173855071e-06, + "loss": 1.1438, + "num_input_tokens_seen": 723211584, + "step": 18410 + }, + { + "epoch": 0.892528345769939, + "grad_norm": 0.438963919878006, + "learning_rate": 1.4993460683800698e-06, + "loss": 1.1211, + "num_input_tokens_seen": 723640776, + "step": 18420 + }, + { + "epoch": 0.8930128888458184, + "grad_norm": 0.3852384090423584, + "learning_rate": 1.4859920006688789e-06, + "loss": 1.1244, + "num_input_tokens_seen": 724035952, + "step": 18430 + }, + { + "epoch": 0.8934974319216978, + "grad_norm": 0.36654970049858093, + "learning_rate": 1.4726958471425495e-06, + "loss": 1.125, + "num_input_tokens_seen": 724439364, + "step": 18440 + }, + { + "epoch": 0.8939819749975773, + "grad_norm": 0.39647069573402405, + "learning_rate": 1.4594576405490417e-06, + "loss": 1.083, + "num_input_tokens_seen": 724842880, + "step": 18450 + }, + { + "epoch": 0.8944665180734568, + "grad_norm": 0.35190171003341675, + "learning_rate": 1.4462774134935963e-06, + "loss": 1.1442, + "num_input_tokens_seen": 725280940, + "step": 18460 + }, + { + "epoch": 0.8949510611493362, + "grad_norm": 0.40093109011650085, + "learning_rate": 1.43315519843866e-06, + "loss": 1.1372, + "num_input_tokens_seen": 725666144, + "step": 18470 + }, + { + "epoch": 0.8954356042252156, + "grad_norm": 0.4389098286628723, + "learning_rate": 1.4200910277037932e-06, + "loss": 1.1656, + "num_input_tokens_seen": 726077688, + "step": 18480 + }, + { + "epoch": 0.895920147301095, + "grad_norm": 0.401603102684021, + "learning_rate": 1.4070849334655883e-06, + "loss": 1.1504, + "num_input_tokens_seen": 726457876, + "step": 18490 + }, + { + "epoch": 0.8964046903769746, + "grad_norm": 0.4019956588745117, + "learning_rate": 1.3941369477576072e-06, + "loss": 1.1493, + "num_input_tokens_seen": 726837760, + "step": 18500 + }, + { + "epoch": 0.896889233452854, + "grad_norm": 0.3990192413330078, + "learning_rate": 1.3812471024702873e-06, + "loss": 1.0712, + "num_input_tokens_seen": 727203844, + "step": 18510 + }, + { + "epoch": 0.8973737765287334, + "grad_norm": 0.396411269903183, + "learning_rate": 1.3684154293508722e-06, + "loss": 1.0972, + "num_input_tokens_seen": 727595240, + "step": 18520 + }, + { + "epoch": 0.8978583196046128, + "grad_norm": 0.3998691737651825, + "learning_rate": 1.3556419600033288e-06, + "loss": 1.0947, + "num_input_tokens_seen": 727979392, + "step": 18530 + }, + { + "epoch": 0.8983428626804923, + "grad_norm": 0.4026288390159607, + "learning_rate": 1.3429267258882578e-06, + "loss": 1.1136, + "num_input_tokens_seen": 728342232, + "step": 18540 + }, + { + "epoch": 0.8988274057563718, + "grad_norm": 0.3861730396747589, + "learning_rate": 1.3302697583228523e-06, + "loss": 1.0925, + "num_input_tokens_seen": 728749172, + "step": 18550 + }, + { + "epoch": 0.8993119488322512, + "grad_norm": 0.38084274530410767, + "learning_rate": 1.3176710884807791e-06, + "loss": 1.115, + "num_input_tokens_seen": 729125920, + "step": 18560 + }, + { + "epoch": 0.8997964919081306, + "grad_norm": 0.43145671486854553, + "learning_rate": 1.3051307473921193e-06, + "loss": 1.1619, + "num_input_tokens_seen": 729532004, + "step": 18570 + }, + { + "epoch": 0.9002810349840101, + "grad_norm": 0.41663506627082825, + "learning_rate": 1.2926487659433024e-06, + "loss": 1.1116, + "num_input_tokens_seen": 729903788, + "step": 18580 + }, + { + "epoch": 0.9007655780598895, + "grad_norm": 0.4152432680130005, + "learning_rate": 1.2802251748770144e-06, + "loss": 1.1174, + "num_input_tokens_seen": 730309520, + "step": 18590 + }, + { + "epoch": 0.901250121135769, + "grad_norm": 0.3903380334377289, + "learning_rate": 1.267860004792129e-06, + "loss": 1.0801, + "num_input_tokens_seen": 730705760, + "step": 18600 + }, + { + "epoch": 0.9017346642116484, + "grad_norm": 0.4450056552886963, + "learning_rate": 1.255553286143632e-06, + "loss": 1.1158, + "num_input_tokens_seen": 731091508, + "step": 18610 + }, + { + "epoch": 0.9022192072875279, + "grad_norm": 0.39127764105796814, + "learning_rate": 1.2433050492425352e-06, + "loss": 1.1237, + "num_input_tokens_seen": 731486328, + "step": 18620 + }, + { + "epoch": 0.9027037503634073, + "grad_norm": 0.3897622227668762, + "learning_rate": 1.2311153242558354e-06, + "loss": 1.0822, + "num_input_tokens_seen": 731875972, + "step": 18630 + }, + { + "epoch": 0.9031882934392867, + "grad_norm": 0.4100290536880493, + "learning_rate": 1.218984141206389e-06, + "loss": 1.0725, + "num_input_tokens_seen": 732293268, + "step": 18640 + }, + { + "epoch": 0.9036728365151662, + "grad_norm": 0.3897705674171448, + "learning_rate": 1.2069115299728845e-06, + "loss": 1.1306, + "num_input_tokens_seen": 732690944, + "step": 18650 + }, + { + "epoch": 0.9041573795910457, + "grad_norm": 0.4191875755786896, + "learning_rate": 1.1948975202897423e-06, + "loss": 1.1358, + "num_input_tokens_seen": 733075324, + "step": 18660 + }, + { + "epoch": 0.9046419226669251, + "grad_norm": 0.3774699568748474, + "learning_rate": 1.1829421417470481e-06, + "loss": 1.1105, + "num_input_tokens_seen": 733466324, + "step": 18670 + }, + { + "epoch": 0.9051264657428045, + "grad_norm": 0.39872831106185913, + "learning_rate": 1.17104542379049e-06, + "loss": 1.1047, + "num_input_tokens_seen": 733853980, + "step": 18680 + }, + { + "epoch": 0.905611008818684, + "grad_norm": 0.38337400555610657, + "learning_rate": 1.159207395721268e-06, + "loss": 1.0915, + "num_input_tokens_seen": 734284336, + "step": 18690 + }, + { + "epoch": 0.9060955518945635, + "grad_norm": 0.4249227046966553, + "learning_rate": 1.1474280866960314e-06, + "loss": 1.1328, + "num_input_tokens_seen": 734691688, + "step": 18700 + }, + { + "epoch": 0.9065800949704429, + "grad_norm": 0.40089496970176697, + "learning_rate": 1.1357075257268147e-06, + "loss": 1.1384, + "num_input_tokens_seen": 735099992, + "step": 18710 + }, + { + "epoch": 0.9070646380463223, + "grad_norm": 0.42720267176628113, + "learning_rate": 1.1240457416809458e-06, + "loss": 1.1603, + "num_input_tokens_seen": 735517812, + "step": 18720 + }, + { + "epoch": 0.9075491811222017, + "grad_norm": 0.41853755712509155, + "learning_rate": 1.112442763280999e-06, + "loss": 1.0719, + "num_input_tokens_seen": 735875148, + "step": 18730 + }, + { + "epoch": 0.9080337241980813, + "grad_norm": 0.4119112491607666, + "learning_rate": 1.1008986191047095e-06, + "loss": 1.12, + "num_input_tokens_seen": 736279692, + "step": 18740 + }, + { + "epoch": 0.9085182672739607, + "grad_norm": 0.40979915857315063, + "learning_rate": 1.0894133375848907e-06, + "loss": 1.0894, + "num_input_tokens_seen": 736678652, + "step": 18750 + }, + { + "epoch": 0.9090028103498401, + "grad_norm": 0.3658255338668823, + "learning_rate": 1.0779869470094072e-06, + "loss": 1.1409, + "num_input_tokens_seen": 737076332, + "step": 18760 + }, + { + "epoch": 0.9094873534257195, + "grad_norm": 0.3819989264011383, + "learning_rate": 1.0666194755210524e-06, + "loss": 1.0877, + "num_input_tokens_seen": 737463700, + "step": 18770 + }, + { + "epoch": 0.909971896501599, + "grad_norm": 0.3775273859500885, + "learning_rate": 1.0553109511175192e-06, + "loss": 1.0883, + "num_input_tokens_seen": 737833988, + "step": 18780 + }, + { + "epoch": 0.9104564395774785, + "grad_norm": 0.4142516553401947, + "learning_rate": 1.0440614016513056e-06, + "loss": 1.0953, + "num_input_tokens_seen": 738251768, + "step": 18790 + }, + { + "epoch": 0.9109409826533579, + "grad_norm": 0.4076782166957855, + "learning_rate": 1.0328708548296622e-06, + "loss": 1.0818, + "num_input_tokens_seen": 738646584, + "step": 18800 + }, + { + "epoch": 0.9114255257292373, + "grad_norm": 0.3837725520133972, + "learning_rate": 1.0217393382145224e-06, + "loss": 1.1542, + "num_input_tokens_seen": 739027612, + "step": 18810 + }, + { + "epoch": 0.9119100688051168, + "grad_norm": 0.4405462145805359, + "learning_rate": 1.0106668792224134e-06, + "loss": 1.1116, + "num_input_tokens_seen": 739421692, + "step": 18820 + }, + { + "epoch": 0.9123946118809962, + "grad_norm": 0.4041633605957031, + "learning_rate": 9.996535051244316e-07, + "loss": 1.062, + "num_input_tokens_seen": 739822296, + "step": 18830 + }, + { + "epoch": 0.9128791549568757, + "grad_norm": 0.4356255829334259, + "learning_rate": 9.88699243046126e-07, + "loss": 1.0675, + "num_input_tokens_seen": 740216416, + "step": 18840 + }, + { + "epoch": 0.9133636980327551, + "grad_norm": 0.365655779838562, + "learning_rate": 9.778041199674626e-07, + "loss": 1.13, + "num_input_tokens_seen": 740584032, + "step": 18850 + }, + { + "epoch": 0.9138482411086346, + "grad_norm": 0.38094544410705566, + "learning_rate": 9.669681627227562e-07, + "loss": 1.1065, + "num_input_tokens_seen": 740973480, + "step": 18860 + }, + { + "epoch": 0.914332784184514, + "grad_norm": 0.41554176807403564, + "learning_rate": 9.561913980005916e-07, + "loss": 1.148, + "num_input_tokens_seen": 741350984, + "step": 18870 + }, + { + "epoch": 0.9148173272603934, + "grad_norm": 0.40062519907951355, + "learning_rate": 9.45473852343759e-07, + "loss": 1.1589, + "num_input_tokens_seen": 741724776, + "step": 18880 + }, + { + "epoch": 0.9153018703362729, + "grad_norm": 0.3932797610759735, + "learning_rate": 9.348155521492125e-07, + "loss": 1.1375, + "num_input_tokens_seen": 742129168, + "step": 18890 + }, + { + "epoch": 0.9157864134121524, + "grad_norm": 0.4039793610572815, + "learning_rate": 9.242165236679646e-07, + "loss": 1.1235, + "num_input_tokens_seen": 742522860, + "step": 18900 + }, + { + "epoch": 0.9162709564880318, + "grad_norm": 0.4156935513019562, + "learning_rate": 9.136767930050666e-07, + "loss": 1.1931, + "num_input_tokens_seen": 742904860, + "step": 18910 + }, + { + "epoch": 0.9167554995639112, + "grad_norm": 0.419506698846817, + "learning_rate": 9.031963861194953e-07, + "loss": 1.0917, + "num_input_tokens_seen": 743285040, + "step": 18920 + }, + { + "epoch": 0.9172400426397906, + "grad_norm": 0.417361855506897, + "learning_rate": 8.927753288241386e-07, + "loss": 1.1108, + "num_input_tokens_seen": 743675480, + "step": 18930 + }, + { + "epoch": 0.9177245857156702, + "grad_norm": 0.38126012682914734, + "learning_rate": 8.82413646785693e-07, + "loss": 1.1074, + "num_input_tokens_seen": 744082016, + "step": 18940 + }, + { + "epoch": 0.9182091287915496, + "grad_norm": 0.401082843542099, + "learning_rate": 8.721113655246222e-07, + "loss": 1.092, + "num_input_tokens_seen": 744477944, + "step": 18950 + }, + { + "epoch": 0.918693671867429, + "grad_norm": 0.41044408082962036, + "learning_rate": 8.618685104150925e-07, + "loss": 1.1382, + "num_input_tokens_seen": 744870004, + "step": 18960 + }, + { + "epoch": 0.9191782149433084, + "grad_norm": 0.40474820137023926, + "learning_rate": 8.516851066848936e-07, + "loss": 1.0912, + "num_input_tokens_seen": 745260580, + "step": 18970 + }, + { + "epoch": 0.919662758019188, + "grad_norm": 0.405483603477478, + "learning_rate": 8.415611794153982e-07, + "loss": 1.1106, + "num_input_tokens_seen": 745671092, + "step": 18980 + }, + { + "epoch": 0.9201473010950674, + "grad_norm": 0.43090713024139404, + "learning_rate": 8.314967535414858e-07, + "loss": 1.076, + "num_input_tokens_seen": 746080284, + "step": 18990 + }, + { + "epoch": 0.9206318441709468, + "grad_norm": 0.4164203107357025, + "learning_rate": 8.214918538514915e-07, + "loss": 1.1442, + "num_input_tokens_seen": 746446148, + "step": 19000 + }, + { + "epoch": 0.9211163872468262, + "grad_norm": 0.4038824737071991, + "learning_rate": 8.11546504987129e-07, + "loss": 1.1099, + "num_input_tokens_seen": 746836508, + "step": 19010 + }, + { + "epoch": 0.9216009303227057, + "grad_norm": 0.39919909834861755, + "learning_rate": 8.016607314434571e-07, + "loss": 1.0966, + "num_input_tokens_seen": 747221668, + "step": 19020 + }, + { + "epoch": 0.9220854733985852, + "grad_norm": 0.4156663417816162, + "learning_rate": 7.918345575687796e-07, + "loss": 1.1268, + "num_input_tokens_seen": 747595608, + "step": 19030 + }, + { + "epoch": 0.9225700164744646, + "grad_norm": 0.40613898634910583, + "learning_rate": 7.820680075646319e-07, + "loss": 1.101, + "num_input_tokens_seen": 747986884, + "step": 19040 + }, + { + "epoch": 0.923054559550344, + "grad_norm": 0.3905068039894104, + "learning_rate": 7.723611054856833e-07, + "loss": 1.1111, + "num_input_tokens_seen": 748374856, + "step": 19050 + }, + { + "epoch": 0.9235391026262235, + "grad_norm": 0.3992663621902466, + "learning_rate": 7.627138752396984e-07, + "loss": 1.0937, + "num_input_tokens_seen": 748752876, + "step": 19060 + }, + { + "epoch": 0.9240236457021029, + "grad_norm": 0.4008621871471405, + "learning_rate": 7.531263405874678e-07, + "loss": 1.0564, + "num_input_tokens_seen": 749125480, + "step": 19070 + }, + { + "epoch": 0.9245081887779824, + "grad_norm": 0.4372722804546356, + "learning_rate": 7.435985251427552e-07, + "loss": 1.1177, + "num_input_tokens_seen": 749506916, + "step": 19080 + }, + { + "epoch": 0.9249927318538618, + "grad_norm": 0.41504257917404175, + "learning_rate": 7.34130452372242e-07, + "loss": 1.0958, + "num_input_tokens_seen": 749895376, + "step": 19090 + }, + { + "epoch": 0.9254772749297413, + "grad_norm": 0.41004976630210876, + "learning_rate": 7.247221455954662e-07, + "loss": 1.109, + "num_input_tokens_seen": 750282460, + "step": 19100 + }, + { + "epoch": 0.9259618180056207, + "grad_norm": 0.40631359815597534, + "learning_rate": 7.1537362798475e-07, + "loss": 1.1214, + "num_input_tokens_seen": 750694156, + "step": 19110 + }, + { + "epoch": 0.9264463610815001, + "grad_norm": 0.3669877350330353, + "learning_rate": 7.060849225651756e-07, + "loss": 1.0916, + "num_input_tokens_seen": 751090800, + "step": 19120 + }, + { + "epoch": 0.9269309041573796, + "grad_norm": 0.39109060168266296, + "learning_rate": 6.968560522145007e-07, + "loss": 1.1302, + "num_input_tokens_seen": 751504408, + "step": 19130 + }, + { + "epoch": 0.9274154472332591, + "grad_norm": 0.4094466269016266, + "learning_rate": 6.876870396631097e-07, + "loss": 1.1174, + "num_input_tokens_seen": 751899136, + "step": 19140 + }, + { + "epoch": 0.9278999903091385, + "grad_norm": 0.39219051599502563, + "learning_rate": 6.785779074939657e-07, + "loss": 1.1205, + "num_input_tokens_seen": 752306356, + "step": 19150 + }, + { + "epoch": 0.9283845333850179, + "grad_norm": 0.39333903789520264, + "learning_rate": 6.695286781425392e-07, + "loss": 1.0771, + "num_input_tokens_seen": 752706400, + "step": 19160 + }, + { + "epoch": 0.9288690764608973, + "grad_norm": 0.4072161614894867, + "learning_rate": 6.605393738967763e-07, + "loss": 1.1592, + "num_input_tokens_seen": 753110792, + "step": 19170 + }, + { + "epoch": 0.9293536195367769, + "grad_norm": 0.39852002263069153, + "learning_rate": 6.516100168970113e-07, + "loss": 1.1284, + "num_input_tokens_seen": 753499472, + "step": 19180 + }, + { + "epoch": 0.9298381626126563, + "grad_norm": 0.4045473039150238, + "learning_rate": 6.427406291359489e-07, + "loss": 1.1153, + "num_input_tokens_seen": 753919864, + "step": 19190 + }, + { + "epoch": 0.9303227056885357, + "grad_norm": 0.37104934453964233, + "learning_rate": 6.339312324585761e-07, + "loss": 1.1329, + "num_input_tokens_seen": 754331304, + "step": 19200 + }, + { + "epoch": 0.9308072487644151, + "grad_norm": 0.40764960646629333, + "learning_rate": 6.251818485621341e-07, + "loss": 1.0779, + "num_input_tokens_seen": 754724460, + "step": 19210 + }, + { + "epoch": 0.9312917918402946, + "grad_norm": 0.3814258277416229, + "learning_rate": 6.164924989960519e-07, + "loss": 1.0927, + "num_input_tokens_seen": 755092192, + "step": 19220 + }, + { + "epoch": 0.9317763349161741, + "grad_norm": 0.4385196566581726, + "learning_rate": 6.078632051618988e-07, + "loss": 1.086, + "num_input_tokens_seen": 755461096, + "step": 19230 + }, + { + "epoch": 0.9322608779920535, + "grad_norm": 0.42675715684890747, + "learning_rate": 5.99293988313318e-07, + "loss": 1.1314, + "num_input_tokens_seen": 755853476, + "step": 19240 + }, + { + "epoch": 0.9327454210679329, + "grad_norm": 0.4081270396709442, + "learning_rate": 5.907848695559964e-07, + "loss": 1.0827, + "num_input_tokens_seen": 756266064, + "step": 19250 + }, + { + "epoch": 0.9332299641438124, + "grad_norm": 0.40428853034973145, + "learning_rate": 5.823358698476e-07, + "loss": 1.1371, + "num_input_tokens_seen": 756658988, + "step": 19260 + }, + { + "epoch": 0.9337145072196918, + "grad_norm": 0.3700270652770996, + "learning_rate": 5.73947009997719e-07, + "loss": 1.1292, + "num_input_tokens_seen": 757041432, + "step": 19270 + }, + { + "epoch": 0.9341990502955713, + "grad_norm": 0.4112299978733063, + "learning_rate": 5.656183106678287e-07, + "loss": 1.0989, + "num_input_tokens_seen": 757439484, + "step": 19280 + }, + { + "epoch": 0.9346835933714507, + "grad_norm": 0.3882913291454315, + "learning_rate": 5.573497923712173e-07, + "loss": 1.1112, + "num_input_tokens_seen": 757858200, + "step": 19290 + }, + { + "epoch": 0.9351681364473302, + "grad_norm": 0.3786594271659851, + "learning_rate": 5.491414754729667e-07, + "loss": 1.0912, + "num_input_tokens_seen": 758250852, + "step": 19300 + }, + { + "epoch": 0.9356526795232096, + "grad_norm": 0.4327850043773651, + "learning_rate": 5.409933801898692e-07, + "loss": 1.1164, + "num_input_tokens_seen": 758642612, + "step": 19310 + }, + { + "epoch": 0.936137222599089, + "grad_norm": 0.38797247409820557, + "learning_rate": 5.329055265904076e-07, + "loss": 1.0734, + "num_input_tokens_seen": 759069940, + "step": 19320 + }, + { + "epoch": 0.9366217656749685, + "grad_norm": 0.40884557366371155, + "learning_rate": 5.248779345946808e-07, + "loss": 1.0806, + "num_input_tokens_seen": 759446920, + "step": 19330 + }, + { + "epoch": 0.937106308750848, + "grad_norm": 0.4048992097377777, + "learning_rate": 5.169106239743648e-07, + "loss": 1.1333, + "num_input_tokens_seen": 759815092, + "step": 19340 + }, + { + "epoch": 0.9375908518267274, + "grad_norm": 0.39969974756240845, + "learning_rate": 5.090036143526767e-07, + "loss": 1.1397, + "num_input_tokens_seen": 760232476, + "step": 19350 + }, + { + "epoch": 0.9380753949026068, + "grad_norm": 0.41090163588523865, + "learning_rate": 5.011569252043019e-07, + "loss": 1.0991, + "num_input_tokens_seen": 760610356, + "step": 19360 + }, + { + "epoch": 0.9385599379784862, + "grad_norm": 0.42378801107406616, + "learning_rate": 4.933705758553619e-07, + "loss": 1.1243, + "num_input_tokens_seen": 761002188, + "step": 19370 + }, + { + "epoch": 0.9390444810543658, + "grad_norm": 0.40979644656181335, + "learning_rate": 4.856445854833719e-07, + "loss": 1.1623, + "num_input_tokens_seen": 761397232, + "step": 19380 + }, + { + "epoch": 0.9395290241302452, + "grad_norm": 0.3961712718009949, + "learning_rate": 4.779789731171713e-07, + "loss": 1.1282, + "num_input_tokens_seen": 761789268, + "step": 19390 + }, + { + "epoch": 0.9400135672061246, + "grad_norm": 0.3944045603275299, + "learning_rate": 4.7037375763689664e-07, + "loss": 1.1045, + "num_input_tokens_seen": 762178272, + "step": 19400 + }, + { + "epoch": 0.940498110282004, + "grad_norm": 0.40869519114494324, + "learning_rate": 4.628289577739309e-07, + "loss": 1.1048, + "num_input_tokens_seen": 762586012, + "step": 19410 + }, + { + "epoch": 0.9409826533578836, + "grad_norm": 0.46123749017715454, + "learning_rate": 4.55344592110854e-07, + "loss": 1.1706, + "num_input_tokens_seen": 763010200, + "step": 19420 + }, + { + "epoch": 0.941467196433763, + "grad_norm": 0.40070682764053345, + "learning_rate": 4.4792067908140387e-07, + "loss": 1.1299, + "num_input_tokens_seen": 763393780, + "step": 19430 + }, + { + "epoch": 0.9419517395096424, + "grad_norm": 0.381905734539032, + "learning_rate": 4.4055723697040976e-07, + "loss": 1.1155, + "num_input_tokens_seen": 763809116, + "step": 19440 + }, + { + "epoch": 0.9424362825855218, + "grad_norm": 0.41140425205230713, + "learning_rate": 4.3325428391378377e-07, + "loss": 1.1394, + "num_input_tokens_seen": 764205148, + "step": 19450 + }, + { + "epoch": 0.9429208256614013, + "grad_norm": 0.43021172285079956, + "learning_rate": 4.260118378984407e-07, + "loss": 1.1463, + "num_input_tokens_seen": 764595480, + "step": 19460 + }, + { + "epoch": 0.9434053687372808, + "grad_norm": 0.43823298811912537, + "learning_rate": 4.188299167622728e-07, + "loss": 1.1693, + "num_input_tokens_seen": 764964992, + "step": 19470 + }, + { + "epoch": 0.9438899118131602, + "grad_norm": 0.3995303213596344, + "learning_rate": 4.117085381941055e-07, + "loss": 1.1092, + "num_input_tokens_seen": 765363980, + "step": 19480 + }, + { + "epoch": 0.9443744548890396, + "grad_norm": 0.36656254529953003, + "learning_rate": 4.0464771973364456e-07, + "loss": 1.128, + "num_input_tokens_seen": 765753044, + "step": 19490 + }, + { + "epoch": 0.9448589979649191, + "grad_norm": 0.34931907057762146, + "learning_rate": 3.9764747877144015e-07, + "loss": 1.1266, + "num_input_tokens_seen": 766153976, + "step": 19500 + }, + { + "epoch": 0.9453435410407985, + "grad_norm": 0.38153091073036194, + "learning_rate": 3.90707832548845e-07, + "loss": 1.0923, + "num_input_tokens_seen": 766544700, + "step": 19510 + }, + { + "epoch": 0.945828084116678, + "grad_norm": 0.3994804620742798, + "learning_rate": 3.838287981579619e-07, + "loss": 1.1353, + "num_input_tokens_seen": 766951124, + "step": 19520 + }, + { + "epoch": 0.9463126271925574, + "grad_norm": 0.4302452504634857, + "learning_rate": 3.7701039254162405e-07, + "loss": 1.1205, + "num_input_tokens_seen": 767361300, + "step": 19530 + }, + { + "epoch": 0.9467971702684369, + "grad_norm": 0.4123125970363617, + "learning_rate": 3.702526324933148e-07, + "loss": 1.1401, + "num_input_tokens_seen": 767727640, + "step": 19540 + }, + { + "epoch": 0.9472817133443163, + "grad_norm": 0.3907496929168701, + "learning_rate": 3.635555346571701e-07, + "loss": 1.1068, + "num_input_tokens_seen": 768096588, + "step": 19550 + }, + { + "epoch": 0.9477662564201957, + "grad_norm": 0.47248342633247375, + "learning_rate": 3.569191155279067e-07, + "loss": 1.0979, + "num_input_tokens_seen": 768467260, + "step": 19560 + }, + { + "epoch": 0.9482507994960752, + "grad_norm": 0.38330796360969543, + "learning_rate": 3.503433914507942e-07, + "loss": 1.0569, + "num_input_tokens_seen": 768874516, + "step": 19570 + }, + { + "epoch": 0.9487353425719547, + "grad_norm": 0.40380969643592834, + "learning_rate": 3.438283786216134e-07, + "loss": 1.1573, + "num_input_tokens_seen": 769280172, + "step": 19580 + }, + { + "epoch": 0.9492198856478341, + "grad_norm": 0.37804293632507324, + "learning_rate": 3.373740930866176e-07, + "loss": 1.1426, + "num_input_tokens_seen": 769668856, + "step": 19590 + }, + { + "epoch": 0.9497044287237135, + "grad_norm": 0.39080822467803955, + "learning_rate": 3.309805507424796e-07, + "loss": 1.0912, + "num_input_tokens_seen": 770088040, + "step": 19600 + }, + { + "epoch": 0.9501889717995929, + "grad_norm": 0.4255511462688446, + "learning_rate": 3.2464776733628075e-07, + "loss": 1.1104, + "num_input_tokens_seen": 770463620, + "step": 19610 + }, + { + "epoch": 0.9506735148754725, + "grad_norm": 0.3901178240776062, + "learning_rate": 3.183757584654418e-07, + "loss": 1.1788, + "num_input_tokens_seen": 770856772, + "step": 19620 + }, + { + "epoch": 0.9511580579513519, + "grad_norm": 0.4149034321308136, + "learning_rate": 3.1216453957770565e-07, + "loss": 1.118, + "num_input_tokens_seen": 771274300, + "step": 19630 + }, + { + "epoch": 0.9516426010272313, + "grad_norm": 0.43007585406303406, + "learning_rate": 3.0601412597108527e-07, + "loss": 1.0905, + "num_input_tokens_seen": 771659776, + "step": 19640 + }, + { + "epoch": 0.9521271441031107, + "grad_norm": 0.41749218106269836, + "learning_rate": 2.9992453279383825e-07, + "loss": 1.1274, + "num_input_tokens_seen": 772049360, + "step": 19650 + }, + { + "epoch": 0.9526116871789903, + "grad_norm": 0.39733201265335083, + "learning_rate": 2.938957750444199e-07, + "loss": 1.0949, + "num_input_tokens_seen": 772424652, + "step": 19660 + }, + { + "epoch": 0.9530962302548697, + "grad_norm": 0.3866783380508423, + "learning_rate": 2.879278675714497e-07, + "loss": 1.1138, + "num_input_tokens_seen": 772839880, + "step": 19670 + }, + { + "epoch": 0.9535807733307491, + "grad_norm": 0.40627822279930115, + "learning_rate": 2.820208250736839e-07, + "loss": 1.1247, + "num_input_tokens_seen": 773238608, + "step": 19680 + }, + { + "epoch": 0.9540653164066285, + "grad_norm": 0.3711201548576355, + "learning_rate": 2.7617466209995115e-07, + "loss": 1.0661, + "num_input_tokens_seen": 773603544, + "step": 19690 + }, + { + "epoch": 0.954549859482508, + "grad_norm": 0.4101531505584717, + "learning_rate": 2.703893930491558e-07, + "loss": 1.1215, + "num_input_tokens_seen": 773975992, + "step": 19700 + }, + { + "epoch": 0.9550344025583875, + "grad_norm": 0.39880767464637756, + "learning_rate": 2.6466503217021654e-07, + "loss": 1.1503, + "num_input_tokens_seen": 774381780, + "step": 19710 + }, + { + "epoch": 0.9555189456342669, + "grad_norm": 0.3977833688259125, + "learning_rate": 2.5900159356202493e-07, + "loss": 1.1096, + "num_input_tokens_seen": 774779660, + "step": 19720 + }, + { + "epoch": 0.9560034887101463, + "grad_norm": 0.3905205726623535, + "learning_rate": 2.5339909117344515e-07, + "loss": 1.1316, + "num_input_tokens_seen": 775185260, + "step": 19730 + }, + { + "epoch": 0.9564880317860258, + "grad_norm": 0.3854650557041168, + "learning_rate": 2.4785753880323945e-07, + "loss": 1.0807, + "num_input_tokens_seen": 775588504, + "step": 19740 + }, + { + "epoch": 0.9569725748619052, + "grad_norm": 0.3917495608329773, + "learning_rate": 2.4237695010005945e-07, + "loss": 1.1197, + "num_input_tokens_seen": 775995628, + "step": 19750 + }, + { + "epoch": 0.9574571179377847, + "grad_norm": 0.401319295167923, + "learning_rate": 2.3695733856240466e-07, + "loss": 1.1239, + "num_input_tokens_seen": 776399704, + "step": 19760 + }, + { + "epoch": 0.9579416610136641, + "grad_norm": 0.3838230073451996, + "learning_rate": 2.3159871753859475e-07, + "loss": 1.1737, + "num_input_tokens_seen": 776806144, + "step": 19770 + }, + { + "epoch": 0.9584262040895436, + "grad_norm": 0.39468008279800415, + "learning_rate": 2.263011002267168e-07, + "loss": 1.1331, + "num_input_tokens_seen": 777150100, + "step": 19780 + }, + { + "epoch": 0.958910747165423, + "grad_norm": 0.390552282333374, + "learning_rate": 2.2106449967463084e-07, + "loss": 1.0929, + "num_input_tokens_seen": 777529280, + "step": 19790 + }, + { + "epoch": 0.9593952902413024, + "grad_norm": 0.38140344619750977, + "learning_rate": 2.158889287798921e-07, + "loss": 1.102, + "num_input_tokens_seen": 777934560, + "step": 19800 + }, + { + "epoch": 0.9598798333171819, + "grad_norm": 0.3968132734298706, + "learning_rate": 2.1077440028975936e-07, + "loss": 1.096, + "num_input_tokens_seen": 778325752, + "step": 19810 + }, + { + "epoch": 0.9603643763930614, + "grad_norm": 0.39363691210746765, + "learning_rate": 2.057209268011312e-07, + "loss": 1.1293, + "num_input_tokens_seen": 778719844, + "step": 19820 + }, + { + "epoch": 0.9608489194689408, + "grad_norm": 0.4126024544239044, + "learning_rate": 2.0072852076054305e-07, + "loss": 1.0956, + "num_input_tokens_seen": 779089772, + "step": 19830 + }, + { + "epoch": 0.9613334625448202, + "grad_norm": 0.4260122776031494, + "learning_rate": 1.957971944641146e-07, + "loss": 1.1148, + "num_input_tokens_seen": 779469980, + "step": 19840 + }, + { + "epoch": 0.9618180056206996, + "grad_norm": 0.43452244997024536, + "learning_rate": 1.9092696005753309e-07, + "loss": 1.1507, + "num_input_tokens_seen": 779871528, + "step": 19850 + }, + { + "epoch": 0.9623025486965792, + "grad_norm": 0.3910650312900543, + "learning_rate": 1.861178295360172e-07, + "loss": 1.1368, + "num_input_tokens_seen": 780275872, + "step": 19860 + }, + { + "epoch": 0.9627870917724586, + "grad_norm": 0.38760048151016235, + "learning_rate": 1.813698147442866e-07, + "loss": 1.1478, + "num_input_tokens_seen": 780644748, + "step": 19870 + }, + { + "epoch": 0.963271634848338, + "grad_norm": 0.4468711316585541, + "learning_rate": 1.7668292737653692e-07, + "loss": 1.0858, + "num_input_tokens_seen": 781036480, + "step": 19880 + }, + { + "epoch": 0.9637561779242174, + "grad_norm": 0.4267124533653259, + "learning_rate": 1.7205717897640638e-07, + "loss": 1.1375, + "num_input_tokens_seen": 781423288, + "step": 19890 + }, + { + "epoch": 0.964240721000097, + "grad_norm": 0.3964008092880249, + "learning_rate": 1.674925809369593e-07, + "loss": 1.1471, + "num_input_tokens_seen": 781830760, + "step": 19900 + }, + { + "epoch": 0.9647252640759764, + "grad_norm": 0.39716637134552, + "learning_rate": 1.6298914450063596e-07, + "loss": 1.1797, + "num_input_tokens_seen": 782212080, + "step": 19910 + }, + { + "epoch": 0.9652098071518558, + "grad_norm": 0.372944712638855, + "learning_rate": 1.5854688075924718e-07, + "loss": 1.1394, + "num_input_tokens_seen": 782640040, + "step": 19920 + }, + { + "epoch": 0.9656943502277352, + "grad_norm": 0.3637344539165497, + "learning_rate": 1.5416580065392984e-07, + "loss": 1.1311, + "num_input_tokens_seen": 783024516, + "step": 19930 + }, + { + "epoch": 0.9661788933036147, + "grad_norm": 0.42081308364868164, + "learning_rate": 1.4984591497513856e-07, + "loss": 1.1429, + "num_input_tokens_seen": 783413016, + "step": 19940 + }, + { + "epoch": 0.9666634363794941, + "grad_norm": 0.4066031873226166, + "learning_rate": 1.4558723436259857e-07, + "loss": 1.0993, + "num_input_tokens_seen": 783799984, + "step": 19950 + }, + { + "epoch": 0.9671479794553736, + "grad_norm": 0.39138638973236084, + "learning_rate": 1.413897693052918e-07, + "loss": 1.1108, + "num_input_tokens_seen": 784171868, + "step": 19960 + }, + { + "epoch": 0.967632522531253, + "grad_norm": 0.39998748898506165, + "learning_rate": 1.3725353014142627e-07, + "loss": 1.1042, + "num_input_tokens_seen": 784567228, + "step": 19970 + }, + { + "epoch": 0.9681170656071325, + "grad_norm": 0.42037269473075867, + "learning_rate": 1.3317852705842239e-07, + "loss": 1.1562, + "num_input_tokens_seen": 784983108, + "step": 19980 + }, + { + "epoch": 0.9686016086830119, + "grad_norm": 0.4196394979953766, + "learning_rate": 1.2916477009286553e-07, + "loss": 1.1435, + "num_input_tokens_seen": 785363296, + "step": 19990 + }, + { + "epoch": 0.9690861517588913, + "grad_norm": 0.405160129070282, + "learning_rate": 1.2521226913050077e-07, + "loss": 1.0713, + "num_input_tokens_seen": 785755388, + "step": 20000 + }, + { + "epoch": 0.9690861517588913, + "eval_loss": 1.1160393953323364, + "eval_runtime": 4.9905, + "eval_samples_per_second": 30.057, + "eval_steps_per_second": 3.807, + "num_input_tokens_seen": 785755388, + "step": 20000 + }, + { + "epoch": 0.9695706948347708, + "grad_norm": 0.38724902272224426, + "learning_rate": 1.2132103390620208e-07, + "loss": 1.1259, + "num_input_tokens_seen": 786168264, + "step": 20010 + }, + { + "epoch": 0.9700552379106503, + "grad_norm": 0.4315202236175537, + "learning_rate": 1.1749107400394477e-07, + "loss": 1.1549, + "num_input_tokens_seen": 786578560, + "step": 20020 + }, + { + "epoch": 0.9705397809865297, + "grad_norm": 0.4061446785926819, + "learning_rate": 1.1372239885678871e-07, + "loss": 1.1396, + "num_input_tokens_seen": 786961092, + "step": 20030 + }, + { + "epoch": 0.9710243240624091, + "grad_norm": 0.38450926542282104, + "learning_rate": 1.1001501774684785e-07, + "loss": 1.119, + "num_input_tokens_seen": 787377820, + "step": 20040 + }, + { + "epoch": 0.9715088671382885, + "grad_norm": 0.3797142803668976, + "learning_rate": 1.063689398052764e-07, + "loss": 1.1298, + "num_input_tokens_seen": 787789388, + "step": 20050 + }, + { + "epoch": 0.9719934102141681, + "grad_norm": 0.38401252031326294, + "learning_rate": 1.0278417401223539e-07, + "loss": 1.1608, + "num_input_tokens_seen": 788175444, + "step": 20060 + }, + { + "epoch": 0.9724779532900475, + "grad_norm": 0.39400792121887207, + "learning_rate": 9.92607291968789e-08, + "loss": 1.1329, + "num_input_tokens_seen": 788565228, + "step": 20070 + }, + { + "epoch": 0.9729624963659269, + "grad_norm": 0.36722561717033386, + "learning_rate": 9.579861403732627e-08, + "loss": 1.1117, + "num_input_tokens_seen": 788948208, + "step": 20080 + }, + { + "epoch": 0.9734470394418063, + "grad_norm": 0.39072954654693604, + "learning_rate": 9.239783706065375e-08, + "loss": 1.1896, + "num_input_tokens_seen": 789332972, + "step": 20090 + }, + { + "epoch": 0.9739315825176859, + "grad_norm": 0.4184485077857971, + "learning_rate": 8.905840664284736e-08, + "loss": 1.114, + "num_input_tokens_seen": 789721976, + "step": 20100 + }, + { + "epoch": 0.9744161255935653, + "grad_norm": 0.3963470160961151, + "learning_rate": 8.578033100881677e-08, + "loss": 1.1298, + "num_input_tokens_seen": 790100792, + "step": 20110 + }, + { + "epoch": 0.9749006686694447, + "grad_norm": 0.39767026901245117, + "learning_rate": 8.256361823234527e-08, + "loss": 1.1244, + "num_input_tokens_seen": 790492440, + "step": 20120 + }, + { + "epoch": 0.9753852117453241, + "grad_norm": 0.4353410005569458, + "learning_rate": 7.940827623608427e-08, + "loss": 1.1209, + "num_input_tokens_seen": 790900392, + "step": 20130 + }, + { + "epoch": 0.9758697548212036, + "grad_norm": 0.4203384518623352, + "learning_rate": 7.631431279153111e-08, + "loss": 1.1472, + "num_input_tokens_seen": 791285628, + "step": 20140 + }, + { + "epoch": 0.9763542978970831, + "grad_norm": 0.3865092098712921, + "learning_rate": 7.328173551901241e-08, + "loss": 1.1548, + "num_input_tokens_seen": 791701900, + "step": 20150 + }, + { + "epoch": 0.9768388409729625, + "grad_norm": 0.4074331521987915, + "learning_rate": 7.031055188765622e-08, + "loss": 1.0969, + "num_input_tokens_seen": 792103252, + "step": 20160 + }, + { + "epoch": 0.9773233840488419, + "grad_norm": 0.4258996546268463, + "learning_rate": 6.74007692153894e-08, + "loss": 1.1043, + "num_input_tokens_seen": 792516300, + "step": 20170 + }, + { + "epoch": 0.9778079271247214, + "grad_norm": 0.42485523223876953, + "learning_rate": 6.455239466890418e-08, + "loss": 1.1026, + "num_input_tokens_seen": 792886424, + "step": 20180 + }, + { + "epoch": 0.9782924702006008, + "grad_norm": 0.4095394015312195, + "learning_rate": 6.176543526364709e-08, + "loss": 1.1432, + "num_input_tokens_seen": 793294236, + "step": 20190 + }, + { + "epoch": 0.9787770132764803, + "grad_norm": 0.38347864151000977, + "learning_rate": 5.903989786380515e-08, + "loss": 1.126, + "num_input_tokens_seen": 793704988, + "step": 20200 + }, + { + "epoch": 0.9792615563523597, + "grad_norm": 0.39956969022750854, + "learning_rate": 5.637578918227526e-08, + "loss": 1.1295, + "num_input_tokens_seen": 794085224, + "step": 20210 + }, + { + "epoch": 0.9797460994282392, + "grad_norm": 0.4015878736972809, + "learning_rate": 5.377311578067257e-08, + "loss": 1.1144, + "num_input_tokens_seen": 794453220, + "step": 20220 + }, + { + "epoch": 0.9802306425041186, + "grad_norm": 0.4096335172653198, + "learning_rate": 5.1231884069288805e-08, + "loss": 1.0987, + "num_input_tokens_seen": 794843816, + "step": 20230 + }, + { + "epoch": 0.980715185579998, + "grad_norm": 0.3710688650608063, + "learning_rate": 4.875210030708677e-08, + "loss": 1.1324, + "num_input_tokens_seen": 795210116, + "step": 20240 + }, + { + "epoch": 0.9811997286558775, + "grad_norm": 0.39916378259658813, + "learning_rate": 4.6333770601689195e-08, + "loss": 1.1467, + "num_input_tokens_seen": 795613888, + "step": 20250 + }, + { + "epoch": 0.981684271731757, + "grad_norm": 0.4291175603866577, + "learning_rate": 4.397690090935935e-08, + "loss": 1.1484, + "num_input_tokens_seen": 796019220, + "step": 20260 + }, + { + "epoch": 0.9821688148076364, + "grad_norm": 0.37853673100471497, + "learning_rate": 4.1681497034984364e-08, + "loss": 1.0957, + "num_input_tokens_seen": 796422340, + "step": 20270 + }, + { + "epoch": 0.9826533578835158, + "grad_norm": 0.3903977572917938, + "learning_rate": 3.9447564632066894e-08, + "loss": 1.1658, + "num_input_tokens_seen": 796813048, + "step": 20280 + }, + { + "epoch": 0.9831379009593952, + "grad_norm": 0.39838188886642456, + "learning_rate": 3.7275109202700165e-08, + "loss": 1.0973, + "num_input_tokens_seen": 797185304, + "step": 20290 + }, + { + "epoch": 0.9836224440352748, + "grad_norm": 0.3897262513637543, + "learning_rate": 3.5164136097567965e-08, + "loss": 1.1272, + "num_input_tokens_seen": 797583336, + "step": 20300 + }, + { + "epoch": 0.9841069871111542, + "grad_norm": 0.400979220867157, + "learning_rate": 3.311465051592522e-08, + "loss": 1.144, + "num_input_tokens_seen": 797981308, + "step": 20310 + }, + { + "epoch": 0.9845915301870336, + "grad_norm": 0.4422793686389923, + "learning_rate": 3.112665750558408e-08, + "loss": 1.07, + "num_input_tokens_seen": 798370388, + "step": 20320 + }, + { + "epoch": 0.985076073262913, + "grad_norm": 0.37973761558532715, + "learning_rate": 2.9200161962900098e-08, + "loss": 1.0582, + "num_input_tokens_seen": 798765140, + "step": 20330 + }, + { + "epoch": 0.9855606163387926, + "grad_norm": 0.42133259773254395, + "learning_rate": 2.7335168632763863e-08, + "loss": 1.1353, + "num_input_tokens_seen": 799159564, + "step": 20340 + }, + { + "epoch": 0.986045159414672, + "grad_norm": 0.5169322490692139, + "learning_rate": 2.5531682108595468e-08, + "loss": 1.1157, + "num_input_tokens_seen": 799545036, + "step": 20350 + }, + { + "epoch": 0.9865297024905514, + "grad_norm": 0.4151138365268707, + "learning_rate": 2.3789706832311186e-08, + "loss": 1.0991, + "num_input_tokens_seen": 799911908, + "step": 20360 + }, + { + "epoch": 0.9870142455664308, + "grad_norm": 0.3700861930847168, + "learning_rate": 2.2109247094342922e-08, + "loss": 1.0973, + "num_input_tokens_seen": 800292720, + "step": 20370 + }, + { + "epoch": 0.9874987886423103, + "grad_norm": 0.3943372070789337, + "learning_rate": 2.049030703359933e-08, + "loss": 1.0944, + "num_input_tokens_seen": 800674764, + "step": 20380 + }, + { + "epoch": 0.9879833317181898, + "grad_norm": 0.40062424540519714, + "learning_rate": 1.893289063747694e-08, + "loss": 1.1316, + "num_input_tokens_seen": 801063864, + "step": 20390 + }, + { + "epoch": 0.9884678747940692, + "grad_norm": 0.40976980328559875, + "learning_rate": 1.7437001741835157e-08, + "loss": 1.1555, + "num_input_tokens_seen": 801463696, + "step": 20400 + }, + { + "epoch": 0.9889524178699486, + "grad_norm": 0.36616411805152893, + "learning_rate": 1.6002644031001823e-08, + "loss": 1.1345, + "num_input_tokens_seen": 801848428, + "step": 20410 + }, + { + "epoch": 0.9894369609458281, + "grad_norm": 0.42311692237854004, + "learning_rate": 1.4629821037742686e-08, + "loss": 1.1274, + "num_input_tokens_seen": 802239180, + "step": 20420 + }, + { + "epoch": 0.9899215040217075, + "grad_norm": 0.39517006278038025, + "learning_rate": 1.331853614327805e-08, + "loss": 1.1543, + "num_input_tokens_seen": 802651824, + "step": 20430 + }, + { + "epoch": 0.990406047097587, + "grad_norm": 0.4204373359680176, + "learning_rate": 1.2068792577255017e-08, + "loss": 1.1141, + "num_input_tokens_seen": 803056060, + "step": 20440 + }, + { + "epoch": 0.9908905901734664, + "grad_norm": 0.42741021513938904, + "learning_rate": 1.0880593417753049e-08, + "loss": 1.107, + "num_input_tokens_seen": 803449480, + "step": 20450 + }, + { + "epoch": 0.9913751332493459, + "grad_norm": 0.3925841450691223, + "learning_rate": 9.753941591258974e-09, + "loss": 1.0623, + "num_input_tokens_seen": 803845032, + "step": 20460 + }, + { + "epoch": 0.9918596763252253, + "grad_norm": 0.38680553436279297, + "learning_rate": 8.6888398726781e-09, + "loss": 1.0997, + "num_input_tokens_seen": 804253704, + "step": 20470 + }, + { + "epoch": 0.9923442194011047, + "grad_norm": 0.3853330612182617, + "learning_rate": 7.685290885323104e-09, + "loss": 1.1338, + "num_input_tokens_seen": 804638340, + "step": 20480 + }, + { + "epoch": 0.9928287624769842, + "grad_norm": 0.378024160861969, + "learning_rate": 6.743297100897383e-09, + "loss": 1.1218, + "num_input_tokens_seen": 805045548, + "step": 20490 + }, + { + "epoch": 0.9933133055528637, + "grad_norm": 0.4238669276237488, + "learning_rate": 5.862860839500606e-09, + "loss": 1.1136, + "num_input_tokens_seen": 805444148, + "step": 20500 + }, + { + "epoch": 0.9937978486287431, + "grad_norm": 0.40654098987579346, + "learning_rate": 5.0439842696148324e-09, + "loss": 1.1313, + "num_input_tokens_seen": 805854612, + "step": 20510 + }, + { + "epoch": 0.9942823917046225, + "grad_norm": 0.38661834597587585, + "learning_rate": 4.286669408104516e-09, + "loss": 1.0887, + "num_input_tokens_seen": 806230656, + "step": 20520 + }, + { + "epoch": 0.9947669347805019, + "grad_norm": 0.42557960748672485, + "learning_rate": 3.5909181202137267e-09, + "loss": 1.1222, + "num_input_tokens_seen": 806626356, + "step": 20530 + }, + { + "epoch": 0.9952514778563815, + "grad_norm": 0.40263572335243225, + "learning_rate": 2.9567321195467236e-09, + "loss": 1.0986, + "num_input_tokens_seen": 807005260, + "step": 20540 + }, + { + "epoch": 0.9957360209322609, + "grad_norm": 0.39904552698135376, + "learning_rate": 2.384112968087382e-09, + "loss": 1.0754, + "num_input_tokens_seen": 807374212, + "step": 20550 + }, + { + "epoch": 0.9962205640081403, + "grad_norm": 0.41701799631118774, + "learning_rate": 1.8730620761742147e-09, + "loss": 1.1052, + "num_input_tokens_seen": 807812204, + "step": 20560 + }, + { + "epoch": 0.9967051070840197, + "grad_norm": 0.3973786532878876, + "learning_rate": 1.4235807025114733e-09, + "loss": 1.1094, + "num_input_tokens_seen": 808181824, + "step": 20570 + }, + { + "epoch": 0.9971896501598992, + "grad_norm": 0.3856273889541626, + "learning_rate": 1.0356699541497205e-09, + "loss": 1.1272, + "num_input_tokens_seen": 808554948, + "step": 20580 + }, + { + "epoch": 0.9976741932357787, + "grad_norm": 0.4207695424556732, + "learning_rate": 7.093307865052578e-10, + "loss": 1.1468, + "num_input_tokens_seen": 808980516, + "step": 20590 + }, + { + "epoch": 0.9981587363116581, + "grad_norm": 0.4193539619445801, + "learning_rate": 4.4456400333514616e-10, + "loss": 1.1278, + "num_input_tokens_seen": 809358268, + "step": 20600 + }, + { + "epoch": 0.9986432793875375, + "grad_norm": 0.4102073609828949, + "learning_rate": 2.4137025675663447e-10, + "loss": 1.0943, + "num_input_tokens_seen": 809733876, + "step": 20610 + }, + { + "epoch": 0.999127822463417, + "grad_norm": 0.41482511162757874, + "learning_rate": 9.975004722495573e-11, + "loss": 1.1314, + "num_input_tokens_seen": 810140020, + "step": 20620 + }, + { + "epoch": 0.9996123655392964, + "grad_norm": 0.3799668252468109, + "learning_rate": 1.9703723547204178e-11, + "loss": 1.1301, + "num_input_tokens_seen": 810535920, + "step": 20630 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 810839096, + "step": 20638, + "total_flos": 4.0772707408255386e+18, + "train_loss": 1.171214375919653, + "train_runtime": 248208.7734, + "train_samples_per_second": 10.643, + "train_steps_per_second": 0.083 + } + ], + "logging_steps": 10, + "max_steps": 20638, + "num_input_tokens_seen": 810839096, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.0772707408255386e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}