{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 42248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023669759515243323, "grad_norm": 0.36582449078559875, "learning_rate": 5.858508151611091e-08, "loss": 2.8441, "step": 100 }, { "epoch": 0.004733951903048665, "grad_norm": 0.36473265290260315, "learning_rate": 1.1657839453205907e-07, "loss": 2.8263, "step": 200 }, { "epoch": 0.007100927854572997, "grad_norm": 0.3368377685546875, "learning_rate": 1.7516347604817e-07, "loss": 2.8367, "step": 300 }, { "epoch": 0.00946790380609733, "grad_norm": 0.3552389442920685, "learning_rate": 2.3434032606444363e-07, "loss": 2.8383, "step": 400 }, { "epoch": 0.011834879757621663, "grad_norm": 0.4023584723472595, "learning_rate": 2.9351717608071723e-07, "loss": 2.8136, "step": 500 }, { "epoch": 0.014201855709145995, "grad_norm": 0.27697062492370605, "learning_rate": 3.526940260969909e-07, "loss": 2.7922, "step": 600 }, { "epoch": 0.016568831660670327, "grad_norm": 0.35471972823143005, "learning_rate": 4.1187087611326455e-07, "loss": 2.7994, "step": 700 }, { "epoch": 0.01893580761219466, "grad_norm": 0.31292667984962463, "learning_rate": 4.710477261295382e-07, "loss": 2.7759, "step": 800 }, { "epoch": 0.021302783563718994, "grad_norm": 0.453121542930603, "learning_rate": 5.302245761458118e-07, "loss": 2.7526, "step": 900 }, { "epoch": 0.023669759515243326, "grad_norm": 0.33600056171417236, "learning_rate": 5.894014261620854e-07, "loss": 2.7212, "step": 1000 }, { "epoch": 0.026036735466767658, "grad_norm": 0.3094422221183777, "learning_rate": 6.485782761783591e-07, "loss": 2.6944, "step": 1100 }, { "epoch": 0.02840371141829199, "grad_norm": 0.3267682194709778, "learning_rate": 7.077551261946328e-07, "loss": 2.6648, "step": 1200 }, { "epoch": 0.03077068736981632, "grad_norm": 0.5763485431671143, "learning_rate": 7.669319762109063e-07, "loss": 2.6594, "step": 1300 }, { "epoch": 0.03313766332134065, "grad_norm": 0.2788572609424591, "learning_rate": 8.255170577270173e-07, "loss": 2.6552, "step": 1400 }, { "epoch": 0.035504639272864985, "grad_norm": 0.38050368428230286, "learning_rate": 8.846939077432909e-07, "loss": 2.6319, "step": 1500 }, { "epoch": 0.03787161522438932, "grad_norm": 0.29289504885673523, "learning_rate": 9.438707577595646e-07, "loss": 2.6371, "step": 1600 }, { "epoch": 0.040238591175913656, "grad_norm": 0.20580381155014038, "learning_rate": 1.0030476077758381e-06, "loss": 2.6054, "step": 1700 }, { "epoch": 0.04260556712743799, "grad_norm": 0.2935289442539215, "learning_rate": 1.0622244577921118e-06, "loss": 2.5963, "step": 1800 }, { "epoch": 0.04497254307896232, "grad_norm": 0.2953510582447052, "learning_rate": 1.1214013078083855e-06, "loss": 2.6056, "step": 1900 }, { "epoch": 0.04733951903048665, "grad_norm": 0.3077057898044586, "learning_rate": 1.180578157824659e-06, "loss": 2.5864, "step": 2000 }, { "epoch": 0.04970649498201098, "grad_norm": 0.25115400552749634, "learning_rate": 1.2397550078409327e-06, "loss": 2.5911, "step": 2100 }, { "epoch": 0.052073470933535315, "grad_norm": 0.2623751759529114, "learning_rate": 1.2989318578572062e-06, "loss": 2.5591, "step": 2200 }, { "epoch": 0.05444044688505965, "grad_norm": 0.30447134375572205, "learning_rate": 1.35810870787348e-06, "loss": 2.5668, "step": 2300 }, { "epoch": 0.05680742283658398, "grad_norm": 0.2120353877544403, "learning_rate": 1.4172855578897537e-06, "loss": 2.5521, "step": 2400 }, { "epoch": 0.05917439878810831, "grad_norm": 0.23940175771713257, "learning_rate": 1.4764624079060272e-06, "loss": 2.5594, "step": 2500 }, { "epoch": 0.06154137473963264, "grad_norm": 0.2214510440826416, "learning_rate": 1.5356392579223009e-06, "loss": 2.5458, "step": 2600 }, { "epoch": 0.06390835069115698, "grad_norm": 0.22601068019866943, "learning_rate": 1.5948161079385746e-06, "loss": 2.538, "step": 2700 }, { "epoch": 0.0662753266426813, "grad_norm": 0.23850201070308685, "learning_rate": 1.6539929579548483e-06, "loss": 2.532, "step": 2800 }, { "epoch": 0.06864230259420565, "grad_norm": 0.20834830403327942, "learning_rate": 1.7131698079711218e-06, "loss": 2.5285, "step": 2900 }, { "epoch": 0.07100927854572997, "grad_norm": 0.21344949305057526, "learning_rate": 1.7723466579873955e-06, "loss": 2.5185, "step": 3000 }, { "epoch": 0.07337625449725431, "grad_norm": 0.21799206733703613, "learning_rate": 1.8315235080036692e-06, "loss": 2.5192, "step": 3100 }, { "epoch": 0.07574323044877863, "grad_norm": 0.21602454781532288, "learning_rate": 1.8907003580199425e-06, "loss": 2.5086, "step": 3200 }, { "epoch": 0.07811020640030297, "grad_norm": 0.2055075764656067, "learning_rate": 1.9498772080362162e-06, "loss": 2.5068, "step": 3300 }, { "epoch": 0.08047718235182731, "grad_norm": 0.21900290250778198, "learning_rate": 2.00905405805249e-06, "loss": 2.5144, "step": 3400 }, { "epoch": 0.08284415830335164, "grad_norm": 0.2083442062139511, "learning_rate": 2.0682309080687637e-06, "loss": 2.5137, "step": 3500 }, { "epoch": 0.08521113425487598, "grad_norm": 0.21810264885425568, "learning_rate": 2.127407758085037e-06, "loss": 2.5075, "step": 3600 }, { "epoch": 0.0875781102064003, "grad_norm": 0.2033359259366989, "learning_rate": 2.186584608101311e-06, "loss": 2.4976, "step": 3700 }, { "epoch": 0.08994508615792464, "grad_norm": 0.20291608572006226, "learning_rate": 2.2457614581175846e-06, "loss": 2.5054, "step": 3800 }, { "epoch": 0.09231206210944896, "grad_norm": 0.21681128442287445, "learning_rate": 2.3049383081338585e-06, "loss": 2.5165, "step": 3900 }, { "epoch": 0.0946790380609733, "grad_norm": 0.23095227777957916, "learning_rate": 2.3641151581501316e-06, "loss": 2.4983, "step": 4000 }, { "epoch": 0.09704601401249763, "grad_norm": 0.23442834615707397, "learning_rate": 2.4232920081664055e-06, "loss": 2.4923, "step": 4100 }, { "epoch": 0.09941298996402197, "grad_norm": 0.22967080771923065, "learning_rate": 2.482468858182679e-06, "loss": 2.4899, "step": 4200 }, { "epoch": 0.10177996591554629, "grad_norm": 0.22393766045570374, "learning_rate": 2.5416457081989525e-06, "loss": 2.4939, "step": 4300 }, { "epoch": 0.10414694186707063, "grad_norm": 0.23877893388271332, "learning_rate": 2.6008225582152264e-06, "loss": 2.4974, "step": 4400 }, { "epoch": 0.10651391781859497, "grad_norm": 0.26197168231010437, "learning_rate": 2.6599994082315e-06, "loss": 2.4773, "step": 4500 }, { "epoch": 0.1088808937701193, "grad_norm": 0.2509444057941437, "learning_rate": 2.719176258247774e-06, "loss": 2.4774, "step": 4600 }, { "epoch": 0.11124786972164363, "grad_norm": 0.22168482840061188, "learning_rate": 2.7783531082640474e-06, "loss": 2.49, "step": 4700 }, { "epoch": 0.11361484567316796, "grad_norm": 0.24707303941249847, "learning_rate": 2.8375299582803213e-06, "loss": 2.468, "step": 4800 }, { "epoch": 0.1159818216246923, "grad_norm": 0.2593018412590027, "learning_rate": 2.8967068082965944e-06, "loss": 2.4749, "step": 4900 }, { "epoch": 0.11834879757621662, "grad_norm": 0.22931291162967682, "learning_rate": 2.955883658312868e-06, "loss": 2.4787, "step": 5000 }, { "epoch": 0.12071577352774096, "grad_norm": 0.2900484502315521, "learning_rate": 3.015060508329142e-06, "loss": 2.4705, "step": 5100 }, { "epoch": 0.12308274947926529, "grad_norm": 0.2222159057855606, "learning_rate": 3.0742373583454153e-06, "loss": 2.4629, "step": 5200 }, { "epoch": 0.1254497254307896, "grad_norm": 0.2193671613931656, "learning_rate": 3.1334142083616892e-06, "loss": 2.4742, "step": 5300 }, { "epoch": 0.12781670138231396, "grad_norm": 0.22836729884147644, "learning_rate": 3.1925910583779627e-06, "loss": 2.4879, "step": 5400 }, { "epoch": 0.1301836773338383, "grad_norm": 0.2218533158302307, "learning_rate": 3.2517679083942367e-06, "loss": 2.4636, "step": 5500 }, { "epoch": 0.1325506532853626, "grad_norm": 0.252085417509079, "learning_rate": 3.31094475841051e-06, "loss": 2.4727, "step": 5600 }, { "epoch": 0.13491762923688697, "grad_norm": 0.26298022270202637, "learning_rate": 3.3701216084267837e-06, "loss": 2.4638, "step": 5700 }, { "epoch": 0.1372846051884113, "grad_norm": 0.23198895156383514, "learning_rate": 3.429298458443057e-06, "loss": 2.4597, "step": 5800 }, { "epoch": 0.13965158113993562, "grad_norm": 0.2724401354789734, "learning_rate": 3.488475308459331e-06, "loss": 2.4671, "step": 5900 }, { "epoch": 0.14201855709145994, "grad_norm": 0.22617186605930328, "learning_rate": 3.5476521584756046e-06, "loss": 2.4665, "step": 6000 }, { "epoch": 0.1443855330429843, "grad_norm": 0.24222290515899658, "learning_rate": 3.6068290084918785e-06, "loss": 2.4729, "step": 6100 }, { "epoch": 0.14675250899450862, "grad_norm": 0.23433572053909302, "learning_rate": 3.666005858508152e-06, "loss": 2.4512, "step": 6200 }, { "epoch": 0.14911948494603294, "grad_norm": 0.23977671563625336, "learning_rate": 3.725182708524426e-06, "loss": 2.464, "step": 6300 }, { "epoch": 0.15148646089755727, "grad_norm": 0.23321278393268585, "learning_rate": 3.784359558540699e-06, "loss": 2.4798, "step": 6400 }, { "epoch": 0.15385343684908162, "grad_norm": 0.27208179235458374, "learning_rate": 3.843536408556973e-06, "loss": 2.4705, "step": 6500 }, { "epoch": 0.15622041280060595, "grad_norm": 0.23790614306926727, "learning_rate": 3.902713258573246e-06, "loss": 2.4555, "step": 6600 }, { "epoch": 0.15858738875213027, "grad_norm": 0.2843892276287079, "learning_rate": 3.96189010858952e-06, "loss": 2.4725, "step": 6700 }, { "epoch": 0.16095436470365462, "grad_norm": 0.2643658220767975, "learning_rate": 4.021066958605794e-06, "loss": 2.4687, "step": 6800 }, { "epoch": 0.16332134065517895, "grad_norm": 0.29611462354660034, "learning_rate": 4.080243808622068e-06, "loss": 2.4594, "step": 6900 }, { "epoch": 0.16568831660670327, "grad_norm": 0.2879164218902588, "learning_rate": 4.139420658638341e-06, "loss": 2.4642, "step": 7000 }, { "epoch": 0.1680552925582276, "grad_norm": 0.27046066522598267, "learning_rate": 4.198597508654615e-06, "loss": 2.4689, "step": 7100 }, { "epoch": 0.17042226850975195, "grad_norm": 0.24744383990764618, "learning_rate": 4.257774358670888e-06, "loss": 2.4526, "step": 7200 }, { "epoch": 0.17278924446127628, "grad_norm": 0.2348434180021286, "learning_rate": 4.316951208687162e-06, "loss": 2.4539, "step": 7300 }, { "epoch": 0.1751562204128006, "grad_norm": 0.295792818069458, "learning_rate": 4.376128058703436e-06, "loss": 2.4491, "step": 7400 }, { "epoch": 0.17752319636432493, "grad_norm": 0.2649165093898773, "learning_rate": 4.435304908719709e-06, "loss": 2.449, "step": 7500 }, { "epoch": 0.17989017231584928, "grad_norm": 0.23758557438850403, "learning_rate": 4.494481758735983e-06, "loss": 2.455, "step": 7600 }, { "epoch": 0.1822571482673736, "grad_norm": 0.27746689319610596, "learning_rate": 4.553658608752257e-06, "loss": 2.4493, "step": 7700 }, { "epoch": 0.18462412421889793, "grad_norm": 0.2592689096927643, "learning_rate": 4.6128354587685306e-06, "loss": 2.4571, "step": 7800 }, { "epoch": 0.18699110017042228, "grad_norm": 0.2735172510147095, "learning_rate": 4.672012308784804e-06, "loss": 2.4539, "step": 7900 }, { "epoch": 0.1893580761219466, "grad_norm": 0.2739349603652954, "learning_rate": 4.7311891588010776e-06, "loss": 2.4604, "step": 8000 }, { "epoch": 0.19172505207347093, "grad_norm": 0.271176815032959, "learning_rate": 4.790366008817351e-06, "loss": 2.448, "step": 8100 }, { "epoch": 0.19409202802499526, "grad_norm": 0.2696959674358368, "learning_rate": 4.8495428588336246e-06, "loss": 2.4563, "step": 8200 }, { "epoch": 0.1964590039765196, "grad_norm": 0.30911239981651306, "learning_rate": 4.9087197088498985e-06, "loss": 2.4614, "step": 8300 }, { "epoch": 0.19882597992804393, "grad_norm": 0.2745211720466614, "learning_rate": 4.967896558866172e-06, "loss": 2.4462, "step": 8400 }, { "epoch": 0.20119295587956826, "grad_norm": 0.29566124081611633, "learning_rate": 5.0270734088824455e-06, "loss": 2.451, "step": 8500 }, { "epoch": 0.20355993183109258, "grad_norm": 0.28213486075401306, "learning_rate": 5.086250258898719e-06, "loss": 2.4407, "step": 8600 }, { "epoch": 0.20592690778261694, "grad_norm": 0.2758745849132538, "learning_rate": 5.145427108914993e-06, "loss": 2.4438, "step": 8700 }, { "epoch": 0.20829388373414126, "grad_norm": 0.2921348810195923, "learning_rate": 5.204603958931267e-06, "loss": 2.449, "step": 8800 }, { "epoch": 0.21066085968566559, "grad_norm": 0.26501932740211487, "learning_rate": 5.26378080894754e-06, "loss": 2.4486, "step": 8900 }, { "epoch": 0.21302783563718994, "grad_norm": 0.2748875617980957, "learning_rate": 5.322957658963814e-06, "loss": 2.4424, "step": 9000 }, { "epoch": 0.21539481158871426, "grad_norm": 0.28109443187713623, "learning_rate": 5.382134508980087e-06, "loss": 2.4513, "step": 9100 }, { "epoch": 0.2177617875402386, "grad_norm": 0.27431926131248474, "learning_rate": 5.44131135899636e-06, "loss": 2.4437, "step": 9200 }, { "epoch": 0.2201287634917629, "grad_norm": 0.2729012668132782, "learning_rate": 5.500488209012634e-06, "loss": 2.4538, "step": 9300 }, { "epoch": 0.22249573944328727, "grad_norm": 0.2898072600364685, "learning_rate": 5.559665059028908e-06, "loss": 2.4546, "step": 9400 }, { "epoch": 0.2248627153948116, "grad_norm": 0.3519386649131775, "learning_rate": 5.618841909045182e-06, "loss": 2.4462, "step": 9500 }, { "epoch": 0.22722969134633592, "grad_norm": 0.2779889404773712, "learning_rate": 5.678018759061455e-06, "loss": 2.443, "step": 9600 }, { "epoch": 0.22959666729786024, "grad_norm": 0.2758658826351166, "learning_rate": 5.737195609077729e-06, "loss": 2.4392, "step": 9700 }, { "epoch": 0.2319636432493846, "grad_norm": 0.3754834532737732, "learning_rate": 5.796372459094003e-06, "loss": 2.4352, "step": 9800 }, { "epoch": 0.23433061920090892, "grad_norm": 0.27345120906829834, "learning_rate": 5.855549309110277e-06, "loss": 2.4523, "step": 9900 }, { "epoch": 0.23669759515243324, "grad_norm": 0.32833969593048096, "learning_rate": 5.91472615912655e-06, "loss": 2.4497, "step": 10000 }, { "epoch": 0.2390645711039576, "grad_norm": 0.2878655791282654, "learning_rate": 5.973903009142824e-06, "loss": 2.451, "step": 10100 }, { "epoch": 0.24143154705548192, "grad_norm": 0.31419286131858826, "learning_rate": 6.033079859159098e-06, "loss": 2.4429, "step": 10200 }, { "epoch": 0.24379852300700625, "grad_norm": 0.2996383607387543, "learning_rate": 6.092256709175372e-06, "loss": 2.4349, "step": 10300 }, { "epoch": 0.24616549895853057, "grad_norm": 0.308442085981369, "learning_rate": 6.151433559191645e-06, "loss": 2.4495, "step": 10400 }, { "epoch": 0.24853247491005492, "grad_norm": 0.2972429394721985, "learning_rate": 6.210610409207919e-06, "loss": 2.433, "step": 10500 }, { "epoch": 0.2508994508615792, "grad_norm": 0.30551430583000183, "learning_rate": 6.269787259224191e-06, "loss": 2.447, "step": 10600 }, { "epoch": 0.2532664268131036, "grad_norm": 0.3082588016986847, "learning_rate": 6.328964109240465e-06, "loss": 2.4458, "step": 10700 }, { "epoch": 0.2556334027646279, "grad_norm": 0.29121455550193787, "learning_rate": 6.388140959256739e-06, "loss": 2.4208, "step": 10800 }, { "epoch": 0.2580003787161522, "grad_norm": 0.32775169610977173, "learning_rate": 6.447317809273013e-06, "loss": 2.4263, "step": 10900 }, { "epoch": 0.2603673546676766, "grad_norm": 0.32109200954437256, "learning_rate": 6.506494659289286e-06, "loss": 2.4385, "step": 11000 }, { "epoch": 0.26273433061920093, "grad_norm": 0.4912450313568115, "learning_rate": 6.56567150930556e-06, "loss": 2.4331, "step": 11100 }, { "epoch": 0.2651013065707252, "grad_norm": 0.30363771319389343, "learning_rate": 6.624848359321834e-06, "loss": 2.4339, "step": 11200 }, { "epoch": 0.2674682825222496, "grad_norm": 0.30812105536460876, "learning_rate": 6.684025209338108e-06, "loss": 2.4373, "step": 11300 }, { "epoch": 0.26983525847377393, "grad_norm": 0.3601232171058655, "learning_rate": 6.743202059354381e-06, "loss": 2.4292, "step": 11400 }, { "epoch": 0.27220223442529823, "grad_norm": 0.3195793926715851, "learning_rate": 6.802378909370655e-06, "loss": 2.438, "step": 11500 }, { "epoch": 0.2745692103768226, "grad_norm": 0.31187400221824646, "learning_rate": 6.861555759386929e-06, "loss": 2.4413, "step": 11600 }, { "epoch": 0.2769361863283469, "grad_norm": 0.3234810531139374, "learning_rate": 6.920732609403203e-06, "loss": 2.4502, "step": 11700 }, { "epoch": 0.27930316227987123, "grad_norm": 0.3229145109653473, "learning_rate": 6.979909459419476e-06, "loss": 2.4369, "step": 11800 }, { "epoch": 0.2816701382313956, "grad_norm": 0.30176299810409546, "learning_rate": 7.03908630943575e-06, "loss": 2.439, "step": 11900 }, { "epoch": 0.2840371141829199, "grad_norm": 0.3238876461982727, "learning_rate": 7.0982631594520235e-06, "loss": 2.4441, "step": 12000 }, { "epoch": 0.28640409013444423, "grad_norm": 0.3230147063732147, "learning_rate": 7.157440009468296e-06, "loss": 2.4395, "step": 12100 }, { "epoch": 0.2887710660859686, "grad_norm": 0.33063408732414246, "learning_rate": 7.21661685948457e-06, "loss": 2.4332, "step": 12200 }, { "epoch": 0.2911380420374929, "grad_norm": 0.32114726305007935, "learning_rate": 7.275793709500844e-06, "loss": 2.4301, "step": 12300 }, { "epoch": 0.29350501798901724, "grad_norm": 0.4075353741645813, "learning_rate": 7.3349705595171175e-06, "loss": 2.4333, "step": 12400 }, { "epoch": 0.2958719939405416, "grad_norm": 0.3239745497703552, "learning_rate": 7.394147409533391e-06, "loss": 2.4323, "step": 12500 }, { "epoch": 0.2982389698920659, "grad_norm": 0.4447726011276245, "learning_rate": 7.4533242595496645e-06, "loss": 2.4321, "step": 12600 }, { "epoch": 0.30060594584359024, "grad_norm": 0.3478521406650543, "learning_rate": 7.5125011095659385e-06, "loss": 2.4246, "step": 12700 }, { "epoch": 0.30297292179511454, "grad_norm": 0.35203248262405396, "learning_rate": 7.571677959582212e-06, "loss": 2.425, "step": 12800 }, { "epoch": 0.3053398977466389, "grad_norm": 0.328659325838089, "learning_rate": 7.630854809598486e-06, "loss": 2.4367, "step": 12900 }, { "epoch": 0.30770687369816324, "grad_norm": 0.3298031985759735, "learning_rate": 7.69003165961476e-06, "loss": 2.4273, "step": 13000 }, { "epoch": 0.31007384964968754, "grad_norm": 0.3143956661224365, "learning_rate": 7.749208509631032e-06, "loss": 2.4292, "step": 13100 }, { "epoch": 0.3124408256012119, "grad_norm": 0.33441880345344543, "learning_rate": 7.808385359647306e-06, "loss": 2.437, "step": 13200 }, { "epoch": 0.31480780155273624, "grad_norm": 0.335602730512619, "learning_rate": 7.86756220966358e-06, "loss": 2.438, "step": 13300 }, { "epoch": 0.31717477750426054, "grad_norm": 0.3256273865699768, "learning_rate": 7.926739059679854e-06, "loss": 2.4324, "step": 13400 }, { "epoch": 0.3195417534557849, "grad_norm": 0.3533662259578705, "learning_rate": 7.985915909696128e-06, "loss": 2.4312, "step": 13500 }, { "epoch": 0.32190872940730925, "grad_norm": 0.34541791677474976, "learning_rate": 8.0450927597124e-06, "loss": 2.4294, "step": 13600 }, { "epoch": 0.32427570535883354, "grad_norm": 0.33559226989746094, "learning_rate": 8.104269609728674e-06, "loss": 2.4206, "step": 13700 }, { "epoch": 0.3266426813103579, "grad_norm": 0.34667766094207764, "learning_rate": 8.163446459744948e-06, "loss": 2.4289, "step": 13800 }, { "epoch": 0.3290096572618822, "grad_norm": 0.3094275891780853, "learning_rate": 8.222623309761222e-06, "loss": 2.4335, "step": 13900 }, { "epoch": 0.33137663321340655, "grad_norm": 0.32228076457977295, "learning_rate": 8.281800159777496e-06, "loss": 2.4348, "step": 14000 }, { "epoch": 0.3337436091649309, "grad_norm": 0.3154647946357727, "learning_rate": 8.34097700979377e-06, "loss": 2.4195, "step": 14100 }, { "epoch": 0.3361105851164552, "grad_norm": 0.380206823348999, "learning_rate": 8.400153859810042e-06, "loss": 2.4257, "step": 14200 }, { "epoch": 0.33847756106797955, "grad_norm": 0.32707059383392334, "learning_rate": 8.459330709826316e-06, "loss": 2.4279, "step": 14300 }, { "epoch": 0.3408445370195039, "grad_norm": 0.3562242090702057, "learning_rate": 8.51850755984259e-06, "loss": 2.4433, "step": 14400 }, { "epoch": 0.3432115129710282, "grad_norm": 0.3338697552680969, "learning_rate": 8.577684409858864e-06, "loss": 2.4378, "step": 14500 }, { "epoch": 0.34557848892255255, "grad_norm": 0.3395216166973114, "learning_rate": 8.636861259875138e-06, "loss": 2.4274, "step": 14600 }, { "epoch": 0.3479454648740769, "grad_norm": 0.32426705956459045, "learning_rate": 8.696038109891412e-06, "loss": 2.4268, "step": 14700 }, { "epoch": 0.3503124408256012, "grad_norm": 0.3478586673736572, "learning_rate": 8.755214959907686e-06, "loss": 2.4247, "step": 14800 }, { "epoch": 0.35267941677712555, "grad_norm": 0.3790106475353241, "learning_rate": 8.81439180992396e-06, "loss": 2.4372, "step": 14900 }, { "epoch": 0.35504639272864985, "grad_norm": 0.3437531888484955, "learning_rate": 8.873568659940232e-06, "loss": 2.4193, "step": 15000 }, { "epoch": 0.3574133686801742, "grad_norm": 0.3627135753631592, "learning_rate": 8.932745509956506e-06, "loss": 2.4343, "step": 15100 }, { "epoch": 0.35978034463169856, "grad_norm": 0.3435176610946655, "learning_rate": 8.99192235997278e-06, "loss": 2.4231, "step": 15200 }, { "epoch": 0.36214732058322285, "grad_norm": 0.3540484309196472, "learning_rate": 9.051099209989052e-06, "loss": 2.426, "step": 15300 }, { "epoch": 0.3645142965347472, "grad_norm": 0.3281879723072052, "learning_rate": 9.110276060005326e-06, "loss": 2.4262, "step": 15400 }, { "epoch": 0.36688127248627156, "grad_norm": 0.419574499130249, "learning_rate": 9.1694529100216e-06, "loss": 2.4103, "step": 15500 }, { "epoch": 0.36924824843779586, "grad_norm": 0.38810306787490845, "learning_rate": 9.228629760037874e-06, "loss": 2.4288, "step": 15600 }, { "epoch": 0.3716152243893202, "grad_norm": 0.3265315592288971, "learning_rate": 9.287214841553986e-06, "loss": 2.431, "step": 15700 }, { "epoch": 0.37398220034084456, "grad_norm": 0.3964623510837555, "learning_rate": 9.346391691570258e-06, "loss": 2.4272, "step": 15800 }, { "epoch": 0.37634917629236886, "grad_norm": 0.3374871611595154, "learning_rate": 9.405568541586532e-06, "loss": 2.4326, "step": 15900 }, { "epoch": 0.3787161522438932, "grad_norm": 0.34002941846847534, "learning_rate": 9.464745391602806e-06, "loss": 2.4256, "step": 16000 }, { "epoch": 0.3810831281954175, "grad_norm": 0.3714279234409332, "learning_rate": 9.52392224161908e-06, "loss": 2.4202, "step": 16100 }, { "epoch": 0.38345010414694186, "grad_norm": 0.343189537525177, "learning_rate": 9.583099091635353e-06, "loss": 2.4168, "step": 16200 }, { "epoch": 0.3858170800984662, "grad_norm": 0.33741703629493713, "learning_rate": 9.642275941651626e-06, "loss": 2.4185, "step": 16300 }, { "epoch": 0.3881840560499905, "grad_norm": 0.3652304708957672, "learning_rate": 9.7014527916679e-06, "loss": 2.4272, "step": 16400 }, { "epoch": 0.39055103200151486, "grad_norm": 0.3449861407279968, "learning_rate": 9.760629641684174e-06, "loss": 2.4048, "step": 16500 }, { "epoch": 0.3929180079530392, "grad_norm": 0.344180703163147, "learning_rate": 9.819806491700447e-06, "loss": 2.4201, "step": 16600 }, { "epoch": 0.3952849839045635, "grad_norm": 0.328961044549942, "learning_rate": 9.878983341716721e-06, "loss": 2.4252, "step": 16700 }, { "epoch": 0.39765195985608787, "grad_norm": 0.3466714918613434, "learning_rate": 9.938160191732995e-06, "loss": 2.4082, "step": 16800 }, { "epoch": 0.4000189358076122, "grad_norm": 0.3624398112297058, "learning_rate": 9.99733704174927e-06, "loss": 2.4275, "step": 16900 }, { "epoch": 0.4023859117591365, "grad_norm": 0.35927194356918335, "learning_rate": 1.0056513891765543e-05, "loss": 2.4183, "step": 17000 }, { "epoch": 0.40475288771066087, "grad_norm": 0.3643719255924225, "learning_rate": 1.0115690741781815e-05, "loss": 2.4299, "step": 17100 }, { "epoch": 0.40711986366218517, "grad_norm": 0.3489636182785034, "learning_rate": 1.017486759179809e-05, "loss": 2.4105, "step": 17200 }, { "epoch": 0.4094868396137095, "grad_norm": 0.3617055118083954, "learning_rate": 1.0234044441814363e-05, "loss": 2.4262, "step": 17300 }, { "epoch": 0.41185381556523387, "grad_norm": 0.3670959174633026, "learning_rate": 1.0293221291830637e-05, "loss": 2.4253, "step": 17400 }, { "epoch": 0.41422079151675817, "grad_norm": 0.4054628610610962, "learning_rate": 1.0352398141846911e-05, "loss": 2.4165, "step": 17500 }, { "epoch": 0.4165877674682825, "grad_norm": 0.32820406556129456, "learning_rate": 1.0411574991863185e-05, "loss": 2.4156, "step": 17600 }, { "epoch": 0.4189547434198069, "grad_norm": 0.3387589752674103, "learning_rate": 1.0470751841879459e-05, "loss": 2.4273, "step": 17700 }, { "epoch": 0.42132171937133117, "grad_norm": 0.3759928047657013, "learning_rate": 1.0529928691895733e-05, "loss": 2.4311, "step": 17800 }, { "epoch": 0.4236886953228555, "grad_norm": 0.38023602962493896, "learning_rate": 1.0589105541912005e-05, "loss": 2.4243, "step": 17900 }, { "epoch": 0.4260556712743799, "grad_norm": 0.34721675515174866, "learning_rate": 1.0648282391928279e-05, "loss": 2.4188, "step": 18000 }, { "epoch": 0.4284226472259042, "grad_norm": 0.34966644644737244, "learning_rate": 1.0707459241944551e-05, "loss": 2.4086, "step": 18100 }, { "epoch": 0.4307896231774285, "grad_norm": 0.38616931438446045, "learning_rate": 1.0766636091960825e-05, "loss": 2.412, "step": 18200 }, { "epoch": 0.4331565991289528, "grad_norm": 0.3381541967391968, "learning_rate": 1.0825812941977099e-05, "loss": 2.414, "step": 18300 }, { "epoch": 0.4355235750804772, "grad_norm": 0.4827527105808258, "learning_rate": 1.0884989791993373e-05, "loss": 2.4125, "step": 18400 }, { "epoch": 0.43789055103200153, "grad_norm": 0.3514668941497803, "learning_rate": 1.0944166642009645e-05, "loss": 2.4137, "step": 18500 }, { "epoch": 0.4402575269835258, "grad_norm": 0.3542225956916809, "learning_rate": 1.100334349202592e-05, "loss": 2.4087, "step": 18600 }, { "epoch": 0.4426245029350502, "grad_norm": 0.40214431285858154, "learning_rate": 1.1062520342042193e-05, "loss": 2.4242, "step": 18700 }, { "epoch": 0.44499147888657453, "grad_norm": 0.34530532360076904, "learning_rate": 1.1121697192058467e-05, "loss": 2.4115, "step": 18800 }, { "epoch": 0.44735845483809883, "grad_norm": 0.3892427384853363, "learning_rate": 1.1180874042074741e-05, "loss": 2.4158, "step": 18900 }, { "epoch": 0.4497254307896232, "grad_norm": 0.3698406219482422, "learning_rate": 1.1240050892091015e-05, "loss": 2.4136, "step": 19000 }, { "epoch": 0.45209240674114753, "grad_norm": 0.3435867726802826, "learning_rate": 1.1299227742107289e-05, "loss": 2.4181, "step": 19100 }, { "epoch": 0.45445938269267183, "grad_norm": 0.3343878388404846, "learning_rate": 1.1358404592123563e-05, "loss": 2.4123, "step": 19200 }, { "epoch": 0.4568263586441962, "grad_norm": 0.3319224417209625, "learning_rate": 1.1417581442139835e-05, "loss": 2.4179, "step": 19300 }, { "epoch": 0.4591933345957205, "grad_norm": 0.36949145793914795, "learning_rate": 1.1476758292156109e-05, "loss": 2.4288, "step": 19400 }, { "epoch": 0.46156031054724483, "grad_norm": 0.33672720193862915, "learning_rate": 1.1535935142172383e-05, "loss": 2.4283, "step": 19500 }, { "epoch": 0.4639272864987692, "grad_norm": 0.36359962821006775, "learning_rate": 1.1595111992188657e-05, "loss": 2.4104, "step": 19600 }, { "epoch": 0.4662942624502935, "grad_norm": 0.357768714427948, "learning_rate": 1.165428884220493e-05, "loss": 2.4005, "step": 19700 }, { "epoch": 0.46866123840181784, "grad_norm": 0.35632389783859253, "learning_rate": 1.1713465692221205e-05, "loss": 2.4156, "step": 19800 }, { "epoch": 0.4710282143533422, "grad_norm": 0.35454291105270386, "learning_rate": 1.1772642542237479e-05, "loss": 2.4075, "step": 19900 }, { "epoch": 0.4733951903048665, "grad_norm": 0.337933212518692, "learning_rate": 1.1831819392253752e-05, "loss": 2.4119, "step": 20000 }, { "epoch": 0.47576216625639084, "grad_norm": 0.36804336309432983, "learning_rate": 1.1890996242270025e-05, "loss": 2.4112, "step": 20100 }, { "epoch": 0.4781291422079152, "grad_norm": 0.3589170575141907, "learning_rate": 1.1950173092286299e-05, "loss": 2.4111, "step": 20200 }, { "epoch": 0.4804961181594395, "grad_norm": 0.4138932228088379, "learning_rate": 1.2009349942302573e-05, "loss": 2.4147, "step": 20300 }, { "epoch": 0.48286309411096384, "grad_norm": 0.37294042110443115, "learning_rate": 1.2068526792318846e-05, "loss": 2.4199, "step": 20400 }, { "epoch": 0.48523007006248814, "grad_norm": 0.34787285327911377, "learning_rate": 1.212770364233512e-05, "loss": 2.4125, "step": 20500 }, { "epoch": 0.4875970460140125, "grad_norm": 0.33219948410987854, "learning_rate": 1.2186880492351394e-05, "loss": 2.4046, "step": 20600 }, { "epoch": 0.48996402196553684, "grad_norm": 0.3547484278678894, "learning_rate": 1.2246057342367668e-05, "loss": 2.4178, "step": 20700 }, { "epoch": 0.49233099791706114, "grad_norm": 0.33837926387786865, "learning_rate": 1.2305234192383942e-05, "loss": 2.403, "step": 20800 }, { "epoch": 0.4946979738685855, "grad_norm": 0.35077232122421265, "learning_rate": 1.2364411042400214e-05, "loss": 2.4139, "step": 20900 }, { "epoch": 0.49706494982010985, "grad_norm": 0.3571261167526245, "learning_rate": 1.2422996123916324e-05, "loss": 2.4001, "step": 21000 }, { "epoch": 0.49943192577163414, "grad_norm": 0.36656296253204346, "learning_rate": 1.2482172973932598e-05, "loss": 2.406, "step": 21100 }, { "epoch": 0.5017989017231584, "grad_norm": 0.3557038903236389, "learning_rate": 1.2541349823948872e-05, "loss": 2.41, "step": 21200 }, { "epoch": 0.5041658776746828, "grad_norm": 0.361907035112381, "learning_rate": 1.2600526673965146e-05, "loss": 2.4106, "step": 21300 }, { "epoch": 0.5065328536262071, "grad_norm": 0.34070518612861633, "learning_rate": 1.2659703523981418e-05, "loss": 2.4121, "step": 21400 }, { "epoch": 0.5088998295777315, "grad_norm": 0.35266879200935364, "learning_rate": 1.2718880373997692e-05, "loss": 2.4051, "step": 21500 }, { "epoch": 0.5112668055292559, "grad_norm": 0.39729219675064087, "learning_rate": 1.2778057224013966e-05, "loss": 2.4004, "step": 21600 }, { "epoch": 0.5136337814807802, "grad_norm": 0.34886813163757324, "learning_rate": 1.283723407403024e-05, "loss": 2.4171, "step": 21700 }, { "epoch": 0.5160007574323044, "grad_norm": 0.33244648575782776, "learning_rate": 1.2896410924046514e-05, "loss": 2.3979, "step": 21800 }, { "epoch": 0.5183677333838288, "grad_norm": 0.3533230423927307, "learning_rate": 1.2955587774062788e-05, "loss": 2.4039, "step": 21900 }, { "epoch": 0.5207347093353532, "grad_norm": 0.3643980920314789, "learning_rate": 1.3014764624079062e-05, "loss": 2.417, "step": 22000 }, { "epoch": 0.5231016852868775, "grad_norm": 0.3681216835975647, "learning_rate": 1.3073941474095336e-05, "loss": 2.4028, "step": 22100 }, { "epoch": 0.5254686612384019, "grad_norm": 0.3376631438732147, "learning_rate": 1.3133118324111608e-05, "loss": 2.4044, "step": 22200 }, { "epoch": 0.5278356371899261, "grad_norm": 0.3588080108165741, "learning_rate": 1.3192295174127882e-05, "loss": 2.4152, "step": 22300 }, { "epoch": 0.5302026131414505, "grad_norm": 0.35474061965942383, "learning_rate": 1.3251472024144156e-05, "loss": 2.3962, "step": 22400 }, { "epoch": 0.5325695890929748, "grad_norm": 0.36065080761909485, "learning_rate": 1.331064887416043e-05, "loss": 2.4035, "step": 22500 }, { "epoch": 0.5349365650444992, "grad_norm": 0.34817591309547424, "learning_rate": 1.3369825724176704e-05, "loss": 2.4108, "step": 22600 }, { "epoch": 0.5373035409960235, "grad_norm": 0.33565661311149597, "learning_rate": 1.3429002574192978e-05, "loss": 2.403, "step": 22700 }, { "epoch": 0.5396705169475479, "grad_norm": 0.34676095843315125, "learning_rate": 1.3488179424209252e-05, "loss": 2.4056, "step": 22800 }, { "epoch": 0.5420374928990721, "grad_norm": 0.3674164116382599, "learning_rate": 1.3547356274225526e-05, "loss": 2.4061, "step": 22900 }, { "epoch": 0.5444044688505965, "grad_norm": 0.3376142978668213, "learning_rate": 1.3605941355741634e-05, "loss": 2.4158, "step": 23000 }, { "epoch": 0.5467714448021208, "grad_norm": 0.3908544182777405, "learning_rate": 1.3665118205757908e-05, "loss": 2.4022, "step": 23100 }, { "epoch": 0.5491384207536452, "grad_norm": 0.38587990403175354, "learning_rate": 1.3724295055774182e-05, "loss": 2.4171, "step": 23200 }, { "epoch": 0.5515053967051695, "grad_norm": 0.3695133924484253, "learning_rate": 1.3783471905790456e-05, "loss": 2.3997, "step": 23300 }, { "epoch": 0.5538723726566938, "grad_norm": 0.3392127454280853, "learning_rate": 1.384264875580673e-05, "loss": 2.4157, "step": 23400 }, { "epoch": 0.5562393486082181, "grad_norm": 0.3664696216583252, "learning_rate": 1.3901825605823004e-05, "loss": 2.4123, "step": 23500 }, { "epoch": 0.5586063245597425, "grad_norm": 0.3691762387752533, "learning_rate": 1.3961002455839276e-05, "loss": 2.3994, "step": 23600 }, { "epoch": 0.5609733005112668, "grad_norm": 0.3565746247768402, "learning_rate": 1.402017930585555e-05, "loss": 2.4027, "step": 23700 }, { "epoch": 0.5633402764627912, "grad_norm": 0.3518475890159607, "learning_rate": 1.4079356155871824e-05, "loss": 2.3937, "step": 23800 }, { "epoch": 0.5657072524143155, "grad_norm": 0.34867557883262634, "learning_rate": 1.4138533005888098e-05, "loss": 2.4, "step": 23900 }, { "epoch": 0.5680742283658398, "grad_norm": 0.35145652294158936, "learning_rate": 1.4197709855904371e-05, "loss": 2.4044, "step": 24000 }, { "epoch": 0.5704412043173641, "grad_norm": 0.3380683958530426, "learning_rate": 1.4256886705920645e-05, "loss": 2.4139, "step": 24100 }, { "epoch": 0.5728081802688885, "grad_norm": 0.3554782569408417, "learning_rate": 1.431606355593692e-05, "loss": 2.395, "step": 24200 }, { "epoch": 0.5751751562204128, "grad_norm": 0.39881500601768494, "learning_rate": 1.4375240405953193e-05, "loss": 2.3942, "step": 24300 }, { "epoch": 0.5775421321719372, "grad_norm": 0.37088507413864136, "learning_rate": 1.4434417255969465e-05, "loss": 2.4092, "step": 24400 }, { "epoch": 0.5799091081234614, "grad_norm": 0.3711656630039215, "learning_rate": 1.449359410598574e-05, "loss": 2.4184, "step": 24500 }, { "epoch": 0.5822760840749858, "grad_norm": 0.33910948038101196, "learning_rate": 1.4552770956002013e-05, "loss": 2.3916, "step": 24600 }, { "epoch": 0.5846430600265101, "grad_norm": 0.35600873827934265, "learning_rate": 1.4611947806018287e-05, "loss": 2.4008, "step": 24700 }, { "epoch": 0.5870100359780345, "grad_norm": 0.35309475660324097, "learning_rate": 1.4671124656034561e-05, "loss": 2.3979, "step": 24800 }, { "epoch": 0.5893770119295588, "grad_norm": 0.3425716459751129, "learning_rate": 1.4730301506050835e-05, "loss": 2.4015, "step": 24900 }, { "epoch": 0.5917439878810832, "grad_norm": 0.3652407228946686, "learning_rate": 1.4789478356067109e-05, "loss": 2.3957, "step": 25000 }, { "epoch": 0.5941109638326074, "grad_norm": 0.3365596830844879, "learning_rate": 1.4848655206083383e-05, "loss": 2.3913, "step": 25100 }, { "epoch": 0.5964779397841318, "grad_norm": 0.35885608196258545, "learning_rate": 1.4907832056099655e-05, "loss": 2.3903, "step": 25200 }, { "epoch": 0.5988449157356561, "grad_norm": 0.38684821128845215, "learning_rate": 1.4966417137615765e-05, "loss": 2.3876, "step": 25300 }, { "epoch": 0.6012118916871805, "grad_norm": 0.3497035503387451, "learning_rate": 1.5025593987632039e-05, "loss": 2.3874, "step": 25400 }, { "epoch": 0.6035788676387048, "grad_norm": 0.3431876599788666, "learning_rate": 1.5084770837648313e-05, "loss": 2.39, "step": 25500 }, { "epoch": 0.6059458435902291, "grad_norm": 0.35600966215133667, "learning_rate": 1.5143947687664587e-05, "loss": 2.4009, "step": 25600 }, { "epoch": 0.6083128195417534, "grad_norm": 0.33623310923576355, "learning_rate": 1.5203124537680861e-05, "loss": 2.3981, "step": 25700 }, { "epoch": 0.6106797954932778, "grad_norm": 0.33237648010253906, "learning_rate": 1.5262301387697135e-05, "loss": 2.4036, "step": 25800 }, { "epoch": 0.6130467714448021, "grad_norm": 0.35398033261299133, "learning_rate": 1.532147823771341e-05, "loss": 2.3988, "step": 25900 }, { "epoch": 0.6154137473963265, "grad_norm": 0.47366973757743835, "learning_rate": 1.5380655087729683e-05, "loss": 2.4013, "step": 26000 }, { "epoch": 0.6177807233478508, "grad_norm": 0.339417427778244, "learning_rate": 1.5439831937745957e-05, "loss": 2.4069, "step": 26100 }, { "epoch": 0.6201476992993751, "grad_norm": 0.3327637016773224, "learning_rate": 1.5499008787762227e-05, "loss": 2.3921, "step": 26200 }, { "epoch": 0.6225146752508994, "grad_norm": 0.3412494659423828, "learning_rate": 1.55581856377785e-05, "loss": 2.379, "step": 26300 }, { "epoch": 0.6248816512024238, "grad_norm": 0.3637641668319702, "learning_rate": 1.5617362487794775e-05, "loss": 2.3911, "step": 26400 }, { "epoch": 0.6272486271539481, "grad_norm": 0.4117577373981476, "learning_rate": 1.567653933781105e-05, "loss": 2.391, "step": 26500 }, { "epoch": 0.6296156031054725, "grad_norm": 0.3605392575263977, "learning_rate": 1.5735716187827323e-05, "loss": 2.3961, "step": 26600 }, { "epoch": 0.6319825790569967, "grad_norm": 0.35646742582321167, "learning_rate": 1.5794893037843597e-05, "loss": 2.3969, "step": 26700 }, { "epoch": 0.6343495550085211, "grad_norm": 0.3432878851890564, "learning_rate": 1.585406988785987e-05, "loss": 2.3939, "step": 26800 }, { "epoch": 0.6367165309600454, "grad_norm": 0.3541545569896698, "learning_rate": 1.5913246737876145e-05, "loss": 2.4079, "step": 26900 }, { "epoch": 0.6390835069115698, "grad_norm": 0.3709736168384552, "learning_rate": 1.597242358789242e-05, "loss": 2.4119, "step": 27000 }, { "epoch": 0.6414504828630941, "grad_norm": 0.32629159092903137, "learning_rate": 1.6031600437908692e-05, "loss": 2.3905, "step": 27100 }, { "epoch": 0.6438174588146185, "grad_norm": 0.4810309410095215, "learning_rate": 1.6090777287924966e-05, "loss": 2.3926, "step": 27200 }, { "epoch": 0.6461844347661427, "grad_norm": 0.37358030676841736, "learning_rate": 1.614995413794124e-05, "loss": 2.3836, "step": 27300 }, { "epoch": 0.6485514107176671, "grad_norm": 0.36473044753074646, "learning_rate": 1.6209130987957514e-05, "loss": 2.39, "step": 27400 }, { "epoch": 0.6509183866691914, "grad_norm": 0.32987740635871887, "learning_rate": 1.6268307837973788e-05, "loss": 2.3925, "step": 27500 }, { "epoch": 0.6532853626207158, "grad_norm": 0.34442269802093506, "learning_rate": 1.6327484687990062e-05, "loss": 2.4023, "step": 27600 }, { "epoch": 0.6556523385722401, "grad_norm": 0.3745739161968231, "learning_rate": 1.6386661538006333e-05, "loss": 2.4047, "step": 27700 }, { "epoch": 0.6580193145237644, "grad_norm": 0.3746493458747864, "learning_rate": 1.6445838388022607e-05, "loss": 2.4005, "step": 27800 }, { "epoch": 0.6603862904752887, "grad_norm": 0.32949355244636536, "learning_rate": 1.650501523803888e-05, "loss": 2.3875, "step": 27900 }, { "epoch": 0.6627532664268131, "grad_norm": 0.331719309091568, "learning_rate": 1.6564192088055154e-05, "loss": 2.3876, "step": 28000 }, { "epoch": 0.6651202423783374, "grad_norm": 0.34970593452453613, "learning_rate": 1.662336893807143e-05, "loss": 2.3995, "step": 28100 }, { "epoch": 0.6674872183298618, "grad_norm": 0.3494050204753876, "learning_rate": 1.6682545788087702e-05, "loss": 2.3852, "step": 28200 }, { "epoch": 0.6698541942813862, "grad_norm": 0.31740233302116394, "learning_rate": 1.6741722638103973e-05, "loss": 2.3953, "step": 28300 }, { "epoch": 0.6722211702329104, "grad_norm": 0.3360515236854553, "learning_rate": 1.6800899488120247e-05, "loss": 2.3911, "step": 28400 }, { "epoch": 0.6745881461844347, "grad_norm": 0.3421274721622467, "learning_rate": 1.685948456963636e-05, "loss": 2.404, "step": 28500 }, { "epoch": 0.6769551221359591, "grad_norm": 0.33647575974464417, "learning_rate": 1.6918661419652632e-05, "loss": 2.3986, "step": 28600 }, { "epoch": 0.6793220980874835, "grad_norm": 0.33582180738449097, "learning_rate": 1.6977838269668906e-05, "loss": 2.3948, "step": 28700 }, { "epoch": 0.6816890740390078, "grad_norm": 0.34744688868522644, "learning_rate": 1.703701511968518e-05, "loss": 2.3921, "step": 28800 }, { "epoch": 0.684056049990532, "grad_norm": 0.3513332009315491, "learning_rate": 1.7096191969701454e-05, "loss": 2.397, "step": 28900 }, { "epoch": 0.6864230259420564, "grad_norm": 0.35616153478622437, "learning_rate": 1.7155368819717728e-05, "loss": 2.3922, "step": 29000 }, { "epoch": 0.6887900018935808, "grad_norm": 0.3601691424846649, "learning_rate": 1.7214545669734002e-05, "loss": 2.3886, "step": 29100 }, { "epoch": 0.6911569778451051, "grad_norm": 0.3415214419364929, "learning_rate": 1.7273722519750276e-05, "loss": 2.3836, "step": 29200 }, { "epoch": 0.6935239537966295, "grad_norm": 0.3496253788471222, "learning_rate": 1.733289936976655e-05, "loss": 2.3832, "step": 29300 }, { "epoch": 0.6958909297481538, "grad_norm": 0.32848358154296875, "learning_rate": 1.7392076219782824e-05, "loss": 2.3839, "step": 29400 }, { "epoch": 0.698257905699678, "grad_norm": 0.3362344801425934, "learning_rate": 1.7451253069799098e-05, "loss": 2.3878, "step": 29500 }, { "epoch": 0.7006248816512024, "grad_norm": 0.34034013748168945, "learning_rate": 1.751042991981537e-05, "loss": 2.3841, "step": 29600 }, { "epoch": 0.7029918576027268, "grad_norm": 0.34850838780403137, "learning_rate": 1.7569606769831646e-05, "loss": 2.3893, "step": 29700 }, { "epoch": 0.7053588335542511, "grad_norm": 0.34481024742126465, "learning_rate": 1.762878361984792e-05, "loss": 2.3746, "step": 29800 }, { "epoch": 0.7077258095057755, "grad_norm": 0.319324254989624, "learning_rate": 1.768796046986419e-05, "loss": 2.3909, "step": 29900 }, { "epoch": 0.7100927854572997, "grad_norm": 0.3310067057609558, "learning_rate": 1.7747137319880464e-05, "loss": 2.3859, "step": 30000 }, { "epoch": 0.712459761408824, "grad_norm": 0.34449535608291626, "learning_rate": 1.7806314169896738e-05, "loss": 2.4031, "step": 30100 }, { "epoch": 0.7148267373603484, "grad_norm": 0.36738091707229614, "learning_rate": 1.7865491019913012e-05, "loss": 2.3877, "step": 30200 }, { "epoch": 0.7171937133118728, "grad_norm": 0.3570147752761841, "learning_rate": 1.7924667869929286e-05, "loss": 2.3921, "step": 30300 }, { "epoch": 0.7195606892633971, "grad_norm": 0.32705631852149963, "learning_rate": 1.798384471994556e-05, "loss": 2.3844, "step": 30400 }, { "epoch": 0.7219276652149215, "grad_norm": 0.3508467972278595, "learning_rate": 1.804302156996183e-05, "loss": 2.375, "step": 30500 }, { "epoch": 0.7242946411664457, "grad_norm": 0.3959505558013916, "learning_rate": 1.8102198419978104e-05, "loss": 2.3893, "step": 30600 }, { "epoch": 0.7266616171179701, "grad_norm": 0.3338560163974762, "learning_rate": 1.8161375269994378e-05, "loss": 2.3803, "step": 30700 }, { "epoch": 0.7290285930694944, "grad_norm": 0.3438529968261719, "learning_rate": 1.8220552120010652e-05, "loss": 2.383, "step": 30800 }, { "epoch": 0.7313955690210188, "grad_norm": 0.34159713983535767, "learning_rate": 1.8279728970026926e-05, "loss": 2.381, "step": 30900 }, { "epoch": 0.7337625449725431, "grad_norm": 0.38974571228027344, "learning_rate": 1.83389058200432e-05, "loss": 2.3779, "step": 31000 }, { "epoch": 0.7361295209240674, "grad_norm": 0.3364710211753845, "learning_rate": 1.8398082670059474e-05, "loss": 2.3846, "step": 31100 }, { "epoch": 0.7384964968755917, "grad_norm": 0.39294859766960144, "learning_rate": 1.8457259520075748e-05, "loss": 2.3857, "step": 31200 }, { "epoch": 0.7408634728271161, "grad_norm": 0.35359159111976624, "learning_rate": 1.851643637009202e-05, "loss": 2.3821, "step": 31300 }, { "epoch": 0.7432304487786404, "grad_norm": 0.37089574337005615, "learning_rate": 1.8575613220108295e-05, "loss": 2.394, "step": 31400 }, { "epoch": 0.7455974247301648, "grad_norm": 0.32074281573295593, "learning_rate": 1.863479007012457e-05, "loss": 2.3854, "step": 31500 }, { "epoch": 0.7479644006816891, "grad_norm": 0.3406684696674347, "learning_rate": 1.8693966920140843e-05, "loss": 2.3822, "step": 31600 }, { "epoch": 0.7503313766332134, "grad_norm": 0.3442894220352173, "learning_rate": 1.8753143770157117e-05, "loss": 2.3782, "step": 31700 }, { "epoch": 0.7526983525847377, "grad_norm": 0.3537774682044983, "learning_rate": 1.881172885167323e-05, "loss": 2.3846, "step": 31800 }, { "epoch": 0.7550653285362621, "grad_norm": 0.31586501002311707, "learning_rate": 1.8870905701689503e-05, "loss": 2.3876, "step": 31900 }, { "epoch": 0.7574323044877864, "grad_norm": 0.35079076886177063, "learning_rate": 1.8930082551705777e-05, "loss": 2.3891, "step": 32000 }, { "epoch": 0.7597992804393108, "grad_norm": 0.3363019824028015, "learning_rate": 1.8989259401722047e-05, "loss": 2.3933, "step": 32100 }, { "epoch": 0.762166256390835, "grad_norm": 0.32039549946784973, "learning_rate": 1.904843625173832e-05, "loss": 2.3585, "step": 32200 }, { "epoch": 0.7645332323423594, "grad_norm": 0.33742275834083557, "learning_rate": 1.9107613101754595e-05, "loss": 2.3809, "step": 32300 }, { "epoch": 0.7669002082938837, "grad_norm": 0.3437131941318512, "learning_rate": 1.916678995177087e-05, "loss": 2.3811, "step": 32400 }, { "epoch": 0.7692671842454081, "grad_norm": 0.3589881658554077, "learning_rate": 1.9225966801787143e-05, "loss": 2.3763, "step": 32500 }, { "epoch": 0.7716341601969324, "grad_norm": 0.36550530791282654, "learning_rate": 1.9285143651803414e-05, "loss": 2.3804, "step": 32600 }, { "epoch": 0.7740011361484568, "grad_norm": 0.3241026699542999, "learning_rate": 1.9344320501819688e-05, "loss": 2.3908, "step": 32700 }, { "epoch": 0.776368112099981, "grad_norm": 0.33091387152671814, "learning_rate": 1.940349735183596e-05, "loss": 2.378, "step": 32800 }, { "epoch": 0.7787350880515054, "grad_norm": 0.31871795654296875, "learning_rate": 1.9462674201852235e-05, "loss": 2.3837, "step": 32900 }, { "epoch": 0.7811020640030297, "grad_norm": 0.331828773021698, "learning_rate": 1.952185105186851e-05, "loss": 2.3774, "step": 33000 }, { "epoch": 0.7834690399545541, "grad_norm": 0.33192068338394165, "learning_rate": 1.9581027901884783e-05, "loss": 2.3812, "step": 33100 }, { "epoch": 0.7858360159060784, "grad_norm": 0.3415600657463074, "learning_rate": 1.9640204751901057e-05, "loss": 2.3754, "step": 33200 }, { "epoch": 0.7882029918576027, "grad_norm": 0.30927810072898865, "learning_rate": 1.969938160191733e-05, "loss": 2.3844, "step": 33300 }, { "epoch": 0.790569967809127, "grad_norm": 0.3214524984359741, "learning_rate": 1.9758558451933605e-05, "loss": 2.3678, "step": 33400 }, { "epoch": 0.7929369437606514, "grad_norm": 0.3286936581134796, "learning_rate": 1.981773530194988e-05, "loss": 2.3848, "step": 33500 }, { "epoch": 0.7953039197121757, "grad_norm": 0.33375072479248047, "learning_rate": 1.9876912151966153e-05, "loss": 2.3737, "step": 33600 }, { "epoch": 0.7976708956637001, "grad_norm": 0.3241300582885742, "learning_rate": 1.9936089001982427e-05, "loss": 2.3662, "step": 33700 }, { "epoch": 0.8000378716152244, "grad_norm": 0.34323224425315857, "learning_rate": 1.99952658519987e-05, "loss": 2.3809, "step": 33800 }, { "epoch": 0.8024048475667487, "grad_norm": 0.3225324749946594, "learning_rate": 1.9994152275965527e-05, "loss": 2.3724, "step": 33900 }, { "epoch": 0.804771823518273, "grad_norm": 0.3365699350833893, "learning_rate": 1.997453922456623e-05, "loss": 2.3759, "step": 34000 }, { "epoch": 0.8071387994697974, "grad_norm": 0.32580050826072693, "learning_rate": 1.994114372491635e-05, "loss": 2.3733, "step": 34100 }, { "epoch": 0.8095057754213217, "grad_norm": 0.3402758836746216, "learning_rate": 1.989455103627163e-05, "loss": 2.3742, "step": 34200 }, { "epoch": 0.8118727513728461, "grad_norm": 0.3205104470252991, "learning_rate": 1.983388438172617e-05, "loss": 2.3704, "step": 34300 }, { "epoch": 0.8142397273243703, "grad_norm": 0.3125210404396057, "learning_rate": 1.975962963057375e-05, "loss": 2.3652, "step": 34400 }, { "epoch": 0.8166067032758947, "grad_norm": 0.3083760142326355, "learning_rate": 1.9671889385274698e-05, "loss": 2.3782, "step": 34500 }, { "epoch": 0.818973679227419, "grad_norm": 0.3169231116771698, "learning_rate": 1.9570784882044856e-05, "loss": 2.3826, "step": 34600 }, { "epoch": 0.8213406551789434, "grad_norm": 0.30974331498146057, "learning_rate": 1.945645582333587e-05, "loss": 2.3741, "step": 34700 }, { "epoch": 0.8237076311304677, "grad_norm": 0.34712207317352295, "learning_rate": 1.93290601847995e-05, "loss": 2.3839, "step": 34800 }, { "epoch": 0.8260746070819921, "grad_norm": 0.3297557234764099, "learning_rate": 1.918877399700279e-05, "loss": 2.3762, "step": 34900 }, { "epoch": 0.8284415830335163, "grad_norm": 0.3331148326396942, "learning_rate": 1.9035791102195484e-05, "loss": 2.3759, "step": 35000 }, { "epoch": 0.8308085589850407, "grad_norm": 0.3134233057498932, "learning_rate": 1.8870322886466053e-05, "loss": 2.3715, "step": 35100 }, { "epoch": 0.833175534936565, "grad_norm": 0.3077858090400696, "learning_rate": 1.8692597987656205e-05, "loss": 2.3652, "step": 35200 }, { "epoch": 0.8355425108880894, "grad_norm": 0.3141195476055145, "learning_rate": 1.8502861979437626e-05, "loss": 2.3677, "step": 35300 }, { "epoch": 0.8379094868396137, "grad_norm": 0.3238203525543213, "learning_rate": 1.8301377031987363e-05, "loss": 2.368, "step": 35400 }, { "epoch": 0.840276462791138, "grad_norm": 0.32180941104888916, "learning_rate": 1.8088421549730826e-05, "loss": 2.3654, "step": 35500 }, { "epoch": 0.8426434387426623, "grad_norm": 0.3173375427722931, "learning_rate": 1.7864289786652865e-05, "loss": 2.3708, "step": 35600 }, { "epoch": 0.8450104146941867, "grad_norm": 0.3098245859146118, "learning_rate": 1.762929143970854e-05, "loss": 2.3847, "step": 35700 }, { "epoch": 0.847377390645711, "grad_norm": 0.3169116675853729, "learning_rate": 1.7383751220895348e-05, "loss": 2.3849, "step": 35800 }, { "epoch": 0.8497443665972354, "grad_norm": 0.2940201461315155, "learning_rate": 1.7128008408578232e-05, "loss": 2.3777, "step": 35900 }, { "epoch": 0.8521113425487598, "grad_norm": 0.3399713635444641, "learning_rate": 1.686241637868734e-05, "loss": 2.3686, "step": 36000 }, { "epoch": 0.854478318500284, "grad_norm": 0.319431871175766, "learning_rate": 1.658734211643625e-05, "loss": 2.3656, "step": 36100 }, { "epoch": 0.8568452944518083, "grad_norm": 0.3360809087753296, "learning_rate": 1.6303165709235443e-05, "loss": 2.3782, "step": 36200 }, { "epoch": 0.8592122704033327, "grad_norm": 0.3362599015235901, "learning_rate": 1.6010279821501603e-05, "loss": 2.3838, "step": 36300 }, { "epoch": 0.861579246354857, "grad_norm": 0.32247358560562134, "learning_rate": 1.5709089152088488e-05, "loss": 2.3708, "step": 36400 }, { "epoch": 0.8639462223063814, "grad_norm": 0.3041239380836487, "learning_rate": 1.5400009875089087e-05, "loss": 2.3754, "step": 36500 }, { "epoch": 0.8663131982579056, "grad_norm": 0.3329671323299408, "learning_rate": 1.5083469064781687e-05, "loss": 2.3611, "step": 36600 }, { "epoch": 0.86868017420943, "grad_norm": 0.3081216812133789, "learning_rate": 1.475990410551448e-05, "loss": 2.3697, "step": 36700 }, { "epoch": 0.8710471501609544, "grad_norm": 0.3056845963001251, "learning_rate": 1.4429762087344101e-05, "loss": 2.3602, "step": 36800 }, { "epoch": 0.8734141261124787, "grad_norm": 0.3348017632961273, "learning_rate": 1.4093499188263166e-05, "loss": 2.3688, "step": 36900 }, { "epoch": 0.8757811020640031, "grad_norm": 0.3079584240913391, "learning_rate": 1.3751580043870465e-05, "loss": 2.3741, "step": 37000 }, { "epoch": 0.8781480780155274, "grad_norm": 0.33923518657684326, "learning_rate": 1.3407972225319847e-05, "loss": 2.3628, "step": 37100 }, { "epoch": 0.8805150539670517, "grad_norm": 0.325127512216568, "learning_rate": 1.3056209752459611e-05, "loss": 2.3621, "step": 37200 }, { "epoch": 0.882882029918576, "grad_norm": 0.32378092408180237, "learning_rate": 1.270022432234713e-05, "loss": 2.3662, "step": 37300 }, { "epoch": 0.8852490058701004, "grad_norm": 0.3274565637111664, "learning_rate": 1.2340507822442868e-05, "loss": 2.3665, "step": 37400 }, { "epoch": 0.8876159818216247, "grad_norm": 0.33395031094551086, "learning_rate": 1.1977557295661108e-05, "loss": 2.3616, "step": 37500 }, { "epoch": 0.8899829577731491, "grad_norm": 0.3172805607318878, "learning_rate": 1.1611874253574492e-05, "loss": 2.3676, "step": 37600 }, { "epoch": 0.8923499337246733, "grad_norm": 0.34134411811828613, "learning_rate": 1.1243963983443936e-05, "loss": 2.361, "step": 37700 }, { "epoch": 0.8947169096761977, "grad_norm": 0.3161686658859253, "learning_rate": 1.0874334850031435e-05, "loss": 2.3653, "step": 37800 }, { "epoch": 0.897083885627722, "grad_norm": 0.31952333450317383, "learning_rate": 1.0503497593160507e-05, "loss": 2.3689, "step": 37900 }, { "epoch": 0.8994508615792464, "grad_norm": 0.3199727237224579, "learning_rate": 1.0131964621994832e-05, "loss": 2.3679, "step": 38000 }, { "epoch": 0.9018178375307707, "grad_norm": 0.3248251676559448, "learning_rate": 9.760249307010301e-06, "loss": 2.3718, "step": 38100 }, { "epoch": 0.9041848134822951, "grad_norm": 0.32015731930732727, "learning_rate": 9.388865270638724e-06, "loss": 2.3594, "step": 38200 }, { "epoch": 0.9065517894338193, "grad_norm": 0.336444228887558, "learning_rate": 9.018325677563413e-06, "loss": 2.3677, "step": 38300 }, { "epoch": 0.9089187653853437, "grad_norm": 0.3233816623687744, "learning_rate": 8.649142525647271e-06, "loss": 2.3651, "step": 38400 }, { "epoch": 0.911285741336868, "grad_norm": 0.3261754512786865, "learning_rate": 8.281825938473116e-06, "loss": 2.3586, "step": 38500 }, { "epoch": 0.9136527172883924, "grad_norm": 0.30703264474868774, "learning_rate": 7.916883460473865e-06, "loss": 2.3668, "step": 38600 }, { "epoch": 0.9160196932399167, "grad_norm": 0.3308265507221222, "learning_rate": 7.554819355626455e-06, "loss": 2.3536, "step": 38700 }, { "epoch": 0.918386669191441, "grad_norm": 0.35222479701042175, "learning_rate": 7.196133910678582e-06, "loss": 2.3635, "step": 38800 }, { "epoch": 0.9207536451429653, "grad_norm": 0.3244943618774414, "learning_rate": 6.841322743871041e-06, "loss": 2.3705, "step": 38900 }, { "epoch": 0.9231206210944897, "grad_norm": 0.3116552233695984, "learning_rate": 6.490876120110827e-06, "loss": 2.3611, "step": 39000 }, { "epoch": 0.925487597046014, "grad_norm": 0.33249032497406006, "learning_rate": 6.145278273541281e-06, "loss": 2.3585, "step": 39100 }, { "epoch": 0.9278545729975384, "grad_norm": 0.3132554888725281, "learning_rate": 5.805006738445294e-06, "loss": 2.368, "step": 39200 }, { "epoch": 0.9302215489490627, "grad_norm": 0.3264056444168091, "learning_rate": 5.4705316894061765e-06, "loss": 2.3635, "step": 39300 }, { "epoch": 0.932588524900587, "grad_norm": 0.3539658188819885, "learning_rate": 5.142315291637857e-06, "loss": 2.3624, "step": 39400 }, { "epoch": 0.9349555008521113, "grad_norm": 0.32513052225112915, "learning_rate": 4.823991412773918e-06, "loss": 2.3714, "step": 39500 }, { "epoch": 0.9373224768036357, "grad_norm": 0.3148360252380371, "learning_rate": 4.509569863501355e-06, "loss": 2.3587, "step": 39600 }, { "epoch": 0.93968945275516, "grad_norm": 0.33431729674339294, "learning_rate": 4.202734786899464e-06, "loss": 2.3719, "step": 39700 }, { "epoch": 0.9420564287066844, "grad_norm": 0.3179948031902313, "learning_rate": 3.903910156293686e-06, "loss": 2.3668, "step": 39800 }, { "epoch": 0.9444234046582086, "grad_norm": 0.3322046101093292, "learning_rate": 3.613508876472357e-06, "loss": 2.3645, "step": 39900 }, { "epoch": 0.946790380609733, "grad_norm": 0.328708291053772, "learning_rate": 3.331932213150203e-06, "loss": 2.3592, "step": 40000 }, { "epoch": 0.9491573565612573, "grad_norm": 0.31054648756980896, "learning_rate": 3.0595692385142717e-06, "loss": 2.373, "step": 40100 }, { "epoch": 0.9515243325127817, "grad_norm": 0.3367001414299011, "learning_rate": 2.79679629361839e-06, "loss": 2.3614, "step": 40200 }, { "epoch": 0.953891308464306, "grad_norm": 0.32708635926246643, "learning_rate": 2.543976468369088e-06, "loss": 2.3541, "step": 40300 }, { "epoch": 0.9562582844158304, "grad_norm": 0.32008349895477295, "learning_rate": 2.301459099821417e-06, "loss": 2.3742, "step": 40400 }, { "epoch": 0.9586252603673546, "grad_norm": 0.3486650288105011, "learning_rate": 2.0695792894779788e-06, "loss": 2.3553, "step": 40500 }, { "epoch": 0.960992236318879, "grad_norm": 0.32483014464378357, "learning_rate": 1.8486574402580858e-06, "loss": 2.3573, "step": 40600 }, { "epoch": 0.9633592122704033, "grad_norm": 0.3283023536205292, "learning_rate": 1.6389988137769153e-06, "loss": 2.3715, "step": 40700 }, { "epoch": 0.9657261882219277, "grad_norm": 0.3350400924682617, "learning_rate": 1.4408931085463206e-06, "loss": 2.3757, "step": 40800 }, { "epoch": 0.968093164173452, "grad_norm": 0.32112061977386475, "learning_rate": 1.2564174493396274e-06, "loss": 2.3816, "step": 40900 }, { "epoch": 0.9704601401249763, "grad_norm": 0.3097105324268341, "learning_rate": 1.0821003902626947e-06, "loss": 2.365, "step": 41000 }, { "epoch": 0.9728271160765006, "grad_norm": 0.3082149028778076, "learning_rate": 9.201057540173219e-07, "loss": 2.3691, "step": 41100 }, { "epoch": 0.975194092028025, "grad_norm": 0.31467440724372864, "learning_rate": 7.706573787819616e-07, "loss": 2.3787, "step": 41200 }, { "epoch": 0.9775610679795493, "grad_norm": 0.3216908872127533, "learning_rate": 6.339617667770615e-07, "loss": 2.3821, "step": 41300 }, { "epoch": 0.9799280439310737, "grad_norm": 0.32618415355682373, "learning_rate": 5.102077989279552e-07, "loss": 2.3609, "step": 41400 }, { "epoch": 0.982295019882598, "grad_norm": 0.32504287362098694, "learning_rate": 3.9956647387621507e-07, "loss": 2.3646, "step": 41500 }, { "epoch": 0.9846619958341223, "grad_norm": 0.31874212622642517, "learning_rate": 3.0219067170006445e-07, "loss": 2.3579, "step": 41600 }, { "epoch": 0.9870289717856466, "grad_norm": 0.34173473715782166, "learning_rate": 2.182149426703606e-07, "loss": 2.3719, "step": 41700 }, { "epoch": 0.989395947737171, "grad_norm": 0.32773253321647644, "learning_rate": 1.4775532133402547e-07, "loss": 2.3625, "step": 41800 }, { "epoch": 0.9917629236886953, "grad_norm": 0.34616851806640625, "learning_rate": 9.090916618180623e-08, "loss": 2.3645, "step": 41900 }, { "epoch": 0.9941298996402197, "grad_norm": 0.3184524476528168, "learning_rate": 4.775502512193164e-08, "loss": 2.3658, "step": 42000 }, { "epoch": 0.9964968755917439, "grad_norm": 0.349345862865448, "learning_rate": 1.835252694552425e-08, "loss": 2.3604, "step": 42100 }, { "epoch": 0.9988638515432683, "grad_norm": 0.33537670969963074, "learning_rate": 2.742298933747778e-09, "loss": 2.3624, "step": 42200 }, { "epoch": 1.0, "step": 42248, "total_flos": 6.236990962447417e+18, "train_loss": 2.428118334464832, "train_runtime": 22646.6509, "train_samples_per_second": 29.848, "train_steps_per_second": 1.866 } ], "logging_steps": 100, "max_steps": 42248, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.236990962447417e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }