{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 3728, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000536480686695279, "grad_norm": 2.1193658839109535, "learning_rate": 2.6809651474530834e-08, "loss": 0.2801, "step": 1 }, { "epoch": 0.002682403433476395, "grad_norm": 2.0015902310793314, "learning_rate": 1.3404825737265417e-07, "loss": 0.2807, "step": 5 }, { "epoch": 0.00536480686695279, "grad_norm": 2.073451625086016, "learning_rate": 2.6809651474530835e-07, "loss": 0.2747, "step": 10 }, { "epoch": 0.008047210300429184, "grad_norm": 1.8760840710412692, "learning_rate": 4.021447721179625e-07, "loss": 0.2718, "step": 15 }, { "epoch": 0.01072961373390558, "grad_norm": 1.8514498316277004, "learning_rate": 5.361930294906167e-07, "loss": 0.2744, "step": 20 }, { "epoch": 0.013412017167381975, "grad_norm": 2.3107741406506377, "learning_rate": 6.702412868632709e-07, "loss": 0.2717, "step": 25 }, { "epoch": 0.016094420600858368, "grad_norm": 2.9666341479845557, "learning_rate": 8.04289544235925e-07, "loss": 0.2782, "step": 30 }, { "epoch": 0.018776824034334765, "grad_norm": 1.8534767524547542, "learning_rate": 9.383378016085791e-07, "loss": 0.2734, "step": 35 }, { "epoch": 0.02145922746781116, "grad_norm": 1.823032428950144, "learning_rate": 1.0723860589812334e-06, "loss": 0.2617, "step": 40 }, { "epoch": 0.024141630901287552, "grad_norm": 1.775612988969821, "learning_rate": 1.2064343163538874e-06, "loss": 0.276, "step": 45 }, { "epoch": 0.02682403433476395, "grad_norm": 1.929975715410071, "learning_rate": 1.3404825737265418e-06, "loss": 0.2662, "step": 50 }, { "epoch": 0.029506437768240343, "grad_norm": 1.8087913414979522, "learning_rate": 1.4745308310991958e-06, "loss": 0.2669, "step": 55 }, { "epoch": 0.032188841201716736, "grad_norm": 2.022204524546991, "learning_rate": 1.60857908847185e-06, "loss": 0.2627, "step": 60 }, { "epoch": 0.03487124463519313, "grad_norm": 2.1463881824702753, "learning_rate": 1.7426273458445042e-06, "loss": 0.2696, "step": 65 }, { "epoch": 0.03755364806866953, "grad_norm": 2.0857951929046483, "learning_rate": 1.8766756032171582e-06, "loss": 0.2687, "step": 70 }, { "epoch": 0.040236051502145924, "grad_norm": 1.9633020298954502, "learning_rate": 2.0107238605898126e-06, "loss": 0.2624, "step": 75 }, { "epoch": 0.04291845493562232, "grad_norm": 1.8157453645459625, "learning_rate": 2.1447721179624668e-06, "loss": 0.2656, "step": 80 }, { "epoch": 0.04560085836909871, "grad_norm": 2.0537449419682523, "learning_rate": 2.278820375335121e-06, "loss": 0.2654, "step": 85 }, { "epoch": 0.048283261802575105, "grad_norm": 1.9165430368004193, "learning_rate": 2.4128686327077747e-06, "loss": 0.2702, "step": 90 }, { "epoch": 0.050965665236051505, "grad_norm": 2.099005086864125, "learning_rate": 2.5469168900804294e-06, "loss": 0.2708, "step": 95 }, { "epoch": 0.0536480686695279, "grad_norm": 2.204477969609587, "learning_rate": 2.6809651474530836e-06, "loss": 0.2793, "step": 100 }, { "epoch": 0.05633047210300429, "grad_norm": 1.8884219618666596, "learning_rate": 2.8150134048257378e-06, "loss": 0.2705, "step": 105 }, { "epoch": 0.059012875536480686, "grad_norm": 1.956758686772309, "learning_rate": 2.9490616621983915e-06, "loss": 0.273, "step": 110 }, { "epoch": 0.06169527896995708, "grad_norm": 2.1563070425827973, "learning_rate": 3.0831099195710457e-06, "loss": 0.2731, "step": 115 }, { "epoch": 0.06437768240343347, "grad_norm": 1.858725074682674, "learning_rate": 3.2171581769437e-06, "loss": 0.2759, "step": 120 }, { "epoch": 0.06706008583690987, "grad_norm": 2.005262259307075, "learning_rate": 3.351206434316354e-06, "loss": 0.2724, "step": 125 }, { "epoch": 0.06974248927038626, "grad_norm": 1.9910932655550024, "learning_rate": 3.4852546916890083e-06, "loss": 0.2733, "step": 130 }, { "epoch": 0.07242489270386267, "grad_norm": 2.1163561816144165, "learning_rate": 3.6193029490616625e-06, "loss": 0.2701, "step": 135 }, { "epoch": 0.07510729613733906, "grad_norm": 2.0289917692414727, "learning_rate": 3.7533512064343163e-06, "loss": 0.2809, "step": 140 }, { "epoch": 0.07778969957081545, "grad_norm": 2.0542532816858246, "learning_rate": 3.8873994638069705e-06, "loss": 0.2845, "step": 145 }, { "epoch": 0.08047210300429185, "grad_norm": 2.2160380804123703, "learning_rate": 4.021447721179625e-06, "loss": 0.2718, "step": 150 }, { "epoch": 0.08315450643776824, "grad_norm": 2.010718604868917, "learning_rate": 4.155495978552279e-06, "loss": 0.2743, "step": 155 }, { "epoch": 0.08583690987124463, "grad_norm": 2.095619735005895, "learning_rate": 4.2895442359249335e-06, "loss": 0.2746, "step": 160 }, { "epoch": 0.08851931330472103, "grad_norm": 2.094029007639187, "learning_rate": 4.423592493297587e-06, "loss": 0.2711, "step": 165 }, { "epoch": 0.09120171673819742, "grad_norm": 2.351687226421416, "learning_rate": 4.557640750670242e-06, "loss": 0.2827, "step": 170 }, { "epoch": 0.09388412017167382, "grad_norm": 2.1865063248573633, "learning_rate": 4.691689008042896e-06, "loss": 0.2805, "step": 175 }, { "epoch": 0.09656652360515021, "grad_norm": 2.29052738745751, "learning_rate": 4.8257372654155495e-06, "loss": 0.2823, "step": 180 }, { "epoch": 0.0992489270386266, "grad_norm": 2.070148947789941, "learning_rate": 4.959785522788204e-06, "loss": 0.2744, "step": 185 }, { "epoch": 0.10193133047210301, "grad_norm": 2.077129309115267, "learning_rate": 5.093833780160859e-06, "loss": 0.2895, "step": 190 }, { "epoch": 0.1046137339055794, "grad_norm": 2.148889376498599, "learning_rate": 5.2278820375335125e-06, "loss": 0.2839, "step": 195 }, { "epoch": 0.1072961373390558, "grad_norm": 2.2661600118200362, "learning_rate": 5.361930294906167e-06, "loss": 0.2849, "step": 200 }, { "epoch": 0.10997854077253219, "grad_norm": 2.4516992204705845, "learning_rate": 5.495978552278821e-06, "loss": 0.2905, "step": 205 }, { "epoch": 0.11266094420600858, "grad_norm": 2.278752706949854, "learning_rate": 5.6300268096514755e-06, "loss": 0.2976, "step": 210 }, { "epoch": 0.11534334763948498, "grad_norm": 2.228118304205806, "learning_rate": 5.764075067024129e-06, "loss": 0.2872, "step": 215 }, { "epoch": 0.11802575107296137, "grad_norm": 2.3264366180297955, "learning_rate": 5.898123324396783e-06, "loss": 0.2827, "step": 220 }, { "epoch": 0.12070815450643776, "grad_norm": 2.307669959933898, "learning_rate": 6.032171581769437e-06, "loss": 0.2792, "step": 225 }, { "epoch": 0.12339055793991416, "grad_norm": 2.2266960480281237, "learning_rate": 6.1662198391420915e-06, "loss": 0.2874, "step": 230 }, { "epoch": 0.12607296137339055, "grad_norm": 2.2629479071659127, "learning_rate": 6.300268096514745e-06, "loss": 0.2909, "step": 235 }, { "epoch": 0.12875536480686695, "grad_norm": 2.365422069984688, "learning_rate": 6.4343163538874e-06, "loss": 0.2927, "step": 240 }, { "epoch": 0.13143776824034334, "grad_norm": 2.2527300701494837, "learning_rate": 6.5683646112600545e-06, "loss": 0.2883, "step": 245 }, { "epoch": 0.13412017167381973, "grad_norm": 2.4274972440039293, "learning_rate": 6.702412868632708e-06, "loss": 0.2907, "step": 250 }, { "epoch": 0.13680257510729613, "grad_norm": 2.1457321190107206, "learning_rate": 6.836461126005363e-06, "loss": 0.2843, "step": 255 }, { "epoch": 0.13948497854077252, "grad_norm": 2.387953890047213, "learning_rate": 6.970509383378017e-06, "loss": 0.293, "step": 260 }, { "epoch": 0.14216738197424894, "grad_norm": 2.2814653060043906, "learning_rate": 7.104557640750671e-06, "loss": 0.2946, "step": 265 }, { "epoch": 0.14484978540772533, "grad_norm": 2.370046047924481, "learning_rate": 7.238605898123325e-06, "loss": 0.3013, "step": 270 }, { "epoch": 0.14753218884120173, "grad_norm": 2.2771297604435943, "learning_rate": 7.37265415549598e-06, "loss": 0.2972, "step": 275 }, { "epoch": 0.15021459227467812, "grad_norm": 2.4040269748067122, "learning_rate": 7.506702412868633e-06, "loss": 0.2981, "step": 280 }, { "epoch": 0.15289699570815452, "grad_norm": 2.2181354741056025, "learning_rate": 7.640750670241287e-06, "loss": 0.2874, "step": 285 }, { "epoch": 0.1555793991416309, "grad_norm": 2.3224226868793547, "learning_rate": 7.774798927613941e-06, "loss": 0.2942, "step": 290 }, { "epoch": 0.1582618025751073, "grad_norm": 2.306896775766709, "learning_rate": 7.908847184986595e-06, "loss": 0.2965, "step": 295 }, { "epoch": 0.1609442060085837, "grad_norm": 2.4727900310215065, "learning_rate": 8.04289544235925e-06, "loss": 0.298, "step": 300 }, { "epoch": 0.1636266094420601, "grad_norm": 2.5327361432436133, "learning_rate": 8.176943699731904e-06, "loss": 0.3062, "step": 305 }, { "epoch": 0.16630901287553648, "grad_norm": 2.382656988199917, "learning_rate": 8.310991957104558e-06, "loss": 0.3098, "step": 310 }, { "epoch": 0.16899141630901288, "grad_norm": 2.5408461164910023, "learning_rate": 8.445040214477213e-06, "loss": 0.3095, "step": 315 }, { "epoch": 0.17167381974248927, "grad_norm": 2.509775729676357, "learning_rate": 8.579088471849867e-06, "loss": 0.3071, "step": 320 }, { "epoch": 0.17435622317596566, "grad_norm": 2.3746080279147135, "learning_rate": 8.71313672922252e-06, "loss": 0.3036, "step": 325 }, { "epoch": 0.17703862660944206, "grad_norm": 2.676758001381311, "learning_rate": 8.847184986595175e-06, "loss": 0.3112, "step": 330 }, { "epoch": 0.17972103004291845, "grad_norm": 2.3311511336790844, "learning_rate": 8.98123324396783e-06, "loss": 0.2943, "step": 335 }, { "epoch": 0.18240343347639484, "grad_norm": 2.5612823504248268, "learning_rate": 9.115281501340484e-06, "loss": 0.3007, "step": 340 }, { "epoch": 0.18508583690987124, "grad_norm": 2.489998305969664, "learning_rate": 9.249329758713138e-06, "loss": 0.3221, "step": 345 }, { "epoch": 0.18776824034334763, "grad_norm": 2.565835844383068, "learning_rate": 9.383378016085791e-06, "loss": 0.3142, "step": 350 }, { "epoch": 0.19045064377682402, "grad_norm": 2.5575470827530786, "learning_rate": 9.517426273458445e-06, "loss": 0.3195, "step": 355 }, { "epoch": 0.19313304721030042, "grad_norm": 2.5709996724338238, "learning_rate": 9.651474530831099e-06, "loss": 0.3197, "step": 360 }, { "epoch": 0.1958154506437768, "grad_norm": 2.61230510657174, "learning_rate": 9.785522788203754e-06, "loss": 0.3178, "step": 365 }, { "epoch": 0.1984978540772532, "grad_norm": 2.4243619066809763, "learning_rate": 9.919571045576408e-06, "loss": 0.3235, "step": 370 }, { "epoch": 0.20118025751072963, "grad_norm": 2.5656565045851134, "learning_rate": 9.999991231716779e-06, "loss": 0.3262, "step": 375 }, { "epoch": 0.20386266094420602, "grad_norm": 2.504711268754622, "learning_rate": 9.999892588883699e-06, "loss": 0.3129, "step": 380 }, { "epoch": 0.2065450643776824, "grad_norm": 2.6368432321178394, "learning_rate": 9.99968434503304e-06, "loss": 0.3187, "step": 385 }, { "epoch": 0.2092274678111588, "grad_norm": 2.6401120959685764, "learning_rate": 9.999366504729645e-06, "loss": 0.3307, "step": 390 }, { "epoch": 0.2119098712446352, "grad_norm": 2.446108433760747, "learning_rate": 9.998939074940788e-06, "loss": 0.312, "step": 395 }, { "epoch": 0.2145922746781116, "grad_norm": 2.7244856222722857, "learning_rate": 9.998402065036018e-06, "loss": 0.3253, "step": 400 }, { "epoch": 0.217274678111588, "grad_norm": 2.7127769547322145, "learning_rate": 9.997755486786954e-06, "loss": 0.3091, "step": 405 }, { "epoch": 0.21995708154506438, "grad_norm": 2.5371567830259636, "learning_rate": 9.996999354367028e-06, "loss": 0.3199, "step": 410 }, { "epoch": 0.22263948497854077, "grad_norm": 2.6088498753645353, "learning_rate": 9.996133684351172e-06, "loss": 0.3232, "step": 415 }, { "epoch": 0.22532188841201717, "grad_norm": 7.217139015065728, "learning_rate": 9.995158495715459e-06, "loss": 0.327, "step": 420 }, { "epoch": 0.22800429184549356, "grad_norm": 3.554479318340767, "learning_rate": 9.994073809836677e-06, "loss": 0.3256, "step": 425 }, { "epoch": 0.23068669527896996, "grad_norm": 2.559539610962939, "learning_rate": 9.992879650491877e-06, "loss": 0.3213, "step": 430 }, { "epoch": 0.23336909871244635, "grad_norm": 2.557571984377656, "learning_rate": 9.991576043857833e-06, "loss": 0.3237, "step": 435 }, { "epoch": 0.23605150214592274, "grad_norm": 2.6391378035627375, "learning_rate": 9.990163018510484e-06, "loss": 0.3202, "step": 440 }, { "epoch": 0.23873390557939914, "grad_norm": 2.6671621213459678, "learning_rate": 9.988640605424298e-06, "loss": 0.3284, "step": 445 }, { "epoch": 0.24141630901287553, "grad_norm": 2.4904914439261883, "learning_rate": 9.987008837971595e-06, "loss": 0.3193, "step": 450 }, { "epoch": 0.24409871244635192, "grad_norm": 3.1179943187719363, "learning_rate": 9.98526775192182e-06, "loss": 0.328, "step": 455 }, { "epoch": 0.24678111587982832, "grad_norm": 2.595197759246731, "learning_rate": 9.983417385440755e-06, "loss": 0.3264, "step": 460 }, { "epoch": 0.2494635193133047, "grad_norm": 2.746547849364462, "learning_rate": 9.981457779089678e-06, "loss": 0.3396, "step": 465 }, { "epoch": 0.2521459227467811, "grad_norm": 2.722019747471792, "learning_rate": 9.979388975824485e-06, "loss": 0.3272, "step": 470 }, { "epoch": 0.2548283261802575, "grad_norm": 2.486350126619608, "learning_rate": 9.977211020994735e-06, "loss": 0.324, "step": 475 }, { "epoch": 0.2575107296137339, "grad_norm": 2.5526327242594555, "learning_rate": 9.97492396234267e-06, "loss": 0.3223, "step": 480 }, { "epoch": 0.2601931330472103, "grad_norm": 2.5930764574443512, "learning_rate": 9.972527850002154e-06, "loss": 0.3247, "step": 485 }, { "epoch": 0.2628755364806867, "grad_norm": 2.5928820204873575, "learning_rate": 9.970022736497588e-06, "loss": 0.3301, "step": 490 }, { "epoch": 0.2655579399141631, "grad_norm": 2.5784218179882386, "learning_rate": 9.96740867674275e-06, "loss": 0.3194, "step": 495 }, { "epoch": 0.26824034334763946, "grad_norm": 2.658880048400765, "learning_rate": 9.964685728039596e-06, "loss": 0.3272, "step": 500 }, { "epoch": 0.2709227467811159, "grad_norm": 2.463519790065412, "learning_rate": 9.961853950076992e-06, "loss": 0.3289, "step": 505 }, { "epoch": 0.27360515021459225, "grad_norm": 2.6146614884524255, "learning_rate": 9.958913404929423e-06, "loss": 0.3299, "step": 510 }, { "epoch": 0.2762875536480687, "grad_norm": 2.5798821970457086, "learning_rate": 9.955864157055623e-06, "loss": 0.3323, "step": 515 }, { "epoch": 0.27896995708154504, "grad_norm": 2.590993175319231, "learning_rate": 9.95270627329716e-06, "loss": 0.3225, "step": 520 }, { "epoch": 0.28165236051502146, "grad_norm": 2.7224033196200557, "learning_rate": 9.949439822876975e-06, "loss": 0.335, "step": 525 }, { "epoch": 0.2843347639484979, "grad_norm": 2.5981695298735756, "learning_rate": 9.94606487739787e-06, "loss": 0.3415, "step": 530 }, { "epoch": 0.28701716738197425, "grad_norm": 2.473905568804743, "learning_rate": 9.942581510840919e-06, "loss": 0.3361, "step": 535 }, { "epoch": 0.28969957081545067, "grad_norm": 2.507624964074619, "learning_rate": 9.93898979956387e-06, "loss": 0.3243, "step": 540 }, { "epoch": 0.29238197424892703, "grad_norm": 2.5170360727284247, "learning_rate": 9.935289822299456e-06, "loss": 0.3332, "step": 545 }, { "epoch": 0.29506437768240346, "grad_norm": 2.6442710714593303, "learning_rate": 9.931481660153672e-06, "loss": 0.3282, "step": 550 }, { "epoch": 0.2977467811158798, "grad_norm": 2.5155871741574893, "learning_rate": 9.927565396604001e-06, "loss": 0.3304, "step": 555 }, { "epoch": 0.30042918454935624, "grad_norm": 2.4727988504009244, "learning_rate": 9.923541117497586e-06, "loss": 0.3383, "step": 560 }, { "epoch": 0.3031115879828326, "grad_norm": 2.5793736113029264, "learning_rate": 9.919408911049333e-06, "loss": 0.3348, "step": 565 }, { "epoch": 0.30579399141630903, "grad_norm": 2.611112917618138, "learning_rate": 9.915168867839997e-06, "loss": 0.3335, "step": 570 }, { "epoch": 0.3084763948497854, "grad_norm": 2.4707000489730464, "learning_rate": 9.910821080814184e-06, "loss": 0.3329, "step": 575 }, { "epoch": 0.3111587982832618, "grad_norm": 2.580213753748582, "learning_rate": 9.90636564527832e-06, "loss": 0.3357, "step": 580 }, { "epoch": 0.3138412017167382, "grad_norm": 2.458103213642892, "learning_rate": 9.901802658898552e-06, "loss": 0.3283, "step": 585 }, { "epoch": 0.3165236051502146, "grad_norm": 2.6446412175211442, "learning_rate": 9.897132221698624e-06, "loss": 0.3317, "step": 590 }, { "epoch": 0.31920600858369097, "grad_norm": 2.600054325837593, "learning_rate": 9.892354436057665e-06, "loss": 0.3349, "step": 595 }, { "epoch": 0.3218884120171674, "grad_norm": 2.762247660251928, "learning_rate": 9.887469406707962e-06, "loss": 0.3389, "step": 600 }, { "epoch": 0.32457081545064376, "grad_norm": 2.6896055716205622, "learning_rate": 9.882477240732652e-06, "loss": 0.3366, "step": 605 }, { "epoch": 0.3272532188841202, "grad_norm": 2.512025914738532, "learning_rate": 9.877378047563378e-06, "loss": 0.3375, "step": 610 }, { "epoch": 0.32993562231759654, "grad_norm": 2.5061656043715392, "learning_rate": 9.872171938977895e-06, "loss": 0.3443, "step": 615 }, { "epoch": 0.33261802575107297, "grad_norm": 2.437928130863239, "learning_rate": 9.866859029097613e-06, "loss": 0.3416, "step": 620 }, { "epoch": 0.33530042918454933, "grad_norm": 2.3508365072923127, "learning_rate": 9.8614394343851e-06, "loss": 0.34, "step": 625 }, { "epoch": 0.33798283261802575, "grad_norm": 2.6530719820749846, "learning_rate": 9.855913273641531e-06, "loss": 0.3434, "step": 630 }, { "epoch": 0.3406652360515021, "grad_norm": 2.5500422975767467, "learning_rate": 9.850280668004072e-06, "loss": 0.338, "step": 635 }, { "epoch": 0.34334763948497854, "grad_norm": 2.5804793225270277, "learning_rate": 9.844541740943239e-06, "loss": 0.3174, "step": 640 }, { "epoch": 0.34603004291845496, "grad_norm": 2.5633885980424207, "learning_rate": 9.838696618260182e-06, "loss": 0.3316, "step": 645 }, { "epoch": 0.3487124463519313, "grad_norm": 2.446470211512751, "learning_rate": 9.832745428083934e-06, "loss": 0.3332, "step": 650 }, { "epoch": 0.35139484978540775, "grad_norm": 2.448412365397892, "learning_rate": 9.826688300868597e-06, "loss": 0.3389, "step": 655 }, { "epoch": 0.3540772532188841, "grad_norm": 2.6322710209414466, "learning_rate": 9.820525369390486e-06, "loss": 0.3369, "step": 660 }, { "epoch": 0.35675965665236054, "grad_norm": 2.511193215977049, "learning_rate": 9.814256768745212e-06, "loss": 0.3261, "step": 665 }, { "epoch": 0.3594420600858369, "grad_norm": 2.5352827679444716, "learning_rate": 9.80788263634473e-06, "loss": 0.3341, "step": 670 }, { "epoch": 0.3621244635193133, "grad_norm": 2.61220305673336, "learning_rate": 9.801403111914324e-06, "loss": 0.3386, "step": 675 }, { "epoch": 0.3648068669527897, "grad_norm": 2.4921902626593186, "learning_rate": 9.794818337489535e-06, "loss": 0.3351, "step": 680 }, { "epoch": 0.3674892703862661, "grad_norm": 2.5869281548130827, "learning_rate": 9.788128457413064e-06, "loss": 0.3343, "step": 685 }, { "epoch": 0.3701716738197425, "grad_norm": 2.8402615755811413, "learning_rate": 9.78133361833159e-06, "loss": 0.3368, "step": 690 }, { "epoch": 0.3728540772532189, "grad_norm": 2.3935150249105126, "learning_rate": 9.774433969192569e-06, "loss": 0.3416, "step": 695 }, { "epoch": 0.37553648068669526, "grad_norm": 2.602269232425324, "learning_rate": 9.767429661240966e-06, "loss": 0.3333, "step": 700 }, { "epoch": 0.3782188841201717, "grad_norm": 2.5611730414611076, "learning_rate": 9.760320848015932e-06, "loss": 0.3366, "step": 705 }, { "epoch": 0.38090128755364805, "grad_norm": 2.58256056801785, "learning_rate": 9.75310768534745e-06, "loss": 0.326, "step": 710 }, { "epoch": 0.38358369098712447, "grad_norm": 2.610706387259647, "learning_rate": 9.745790331352907e-06, "loss": 0.3316, "step": 715 }, { "epoch": 0.38626609442060084, "grad_norm": 2.5317785996531894, "learning_rate": 9.73836894643364e-06, "loss": 0.3441, "step": 720 }, { "epoch": 0.38894849785407726, "grad_norm": 2.53159279905397, "learning_rate": 9.730843693271413e-06, "loss": 0.3421, "step": 725 }, { "epoch": 0.3916309012875536, "grad_norm": 2.685183324993938, "learning_rate": 9.723214736824847e-06, "loss": 0.3296, "step": 730 }, { "epoch": 0.39431330472103004, "grad_norm": 2.565999747275768, "learning_rate": 9.715482244325816e-06, "loss": 0.3438, "step": 735 }, { "epoch": 0.3969957081545064, "grad_norm": 2.484493208541512, "learning_rate": 9.707646385275766e-06, "loss": 0.336, "step": 740 }, { "epoch": 0.39967811158798283, "grad_norm": 2.3771195907818123, "learning_rate": 9.699707331442016e-06, "loss": 0.3337, "step": 745 }, { "epoch": 0.40236051502145925, "grad_norm": 2.465742112959504, "learning_rate": 9.691665256853978e-06, "loss": 0.332, "step": 750 }, { "epoch": 0.4050429184549356, "grad_norm": 2.4010231701635, "learning_rate": 9.683520337799353e-06, "loss": 0.3345, "step": 755 }, { "epoch": 0.40772532188841204, "grad_norm": 2.6592548561925793, "learning_rate": 9.675272752820258e-06, "loss": 0.3347, "step": 760 }, { "epoch": 0.4104077253218884, "grad_norm": 2.4815416560056125, "learning_rate": 9.666922682709317e-06, "loss": 0.3439, "step": 765 }, { "epoch": 0.4130901287553648, "grad_norm": 2.558230830272978, "learning_rate": 9.6584703105057e-06, "loss": 0.3324, "step": 770 }, { "epoch": 0.4157725321888412, "grad_norm": 2.4774260975478155, "learning_rate": 9.649915821491107e-06, "loss": 0.3326, "step": 775 }, { "epoch": 0.4184549356223176, "grad_norm": 2.371202097931529, "learning_rate": 9.641259403185706e-06, "loss": 0.3369, "step": 780 }, { "epoch": 0.421137339055794, "grad_norm": 2.405819539304012, "learning_rate": 9.632501245344024e-06, "loss": 0.3408, "step": 785 }, { "epoch": 0.4238197424892704, "grad_norm": 2.431257636296087, "learning_rate": 9.623641539950787e-06, "loss": 0.3501, "step": 790 }, { "epoch": 0.42650214592274677, "grad_norm": 2.4851593087836013, "learning_rate": 9.614680481216712e-06, "loss": 0.338, "step": 795 }, { "epoch": 0.4291845493562232, "grad_norm": 2.490245676643186, "learning_rate": 9.60561826557425e-06, "loss": 0.3388, "step": 800 }, { "epoch": 0.43186695278969955, "grad_norm": 2.653622799913057, "learning_rate": 9.596455091673282e-06, "loss": 0.3304, "step": 805 }, { "epoch": 0.434549356223176, "grad_norm": 2.5823090892360407, "learning_rate": 9.587191160376758e-06, "loss": 0.3354, "step": 810 }, { "epoch": 0.43723175965665234, "grad_norm": 2.3819860431289, "learning_rate": 9.577826674756301e-06, "loss": 0.3352, "step": 815 }, { "epoch": 0.43991416309012876, "grad_norm": 2.3080073471823632, "learning_rate": 9.56836184008775e-06, "loss": 0.3361, "step": 820 }, { "epoch": 0.44259656652360513, "grad_norm": 2.43418727053637, "learning_rate": 9.558796863846663e-06, "loss": 0.3343, "step": 825 }, { "epoch": 0.44527896995708155, "grad_norm": 2.543683903747536, "learning_rate": 9.549131955703772e-06, "loss": 0.3226, "step": 830 }, { "epoch": 0.4479613733905579, "grad_norm": 2.4761354386803154, "learning_rate": 9.539367327520382e-06, "loss": 0.3421, "step": 835 }, { "epoch": 0.45064377682403434, "grad_norm": 2.570039809535249, "learning_rate": 9.529503193343726e-06, "loss": 0.342, "step": 840 }, { "epoch": 0.4533261802575107, "grad_norm": 2.4065111294753048, "learning_rate": 9.519539769402282e-06, "loss": 0.3363, "step": 845 }, { "epoch": 0.4560085836909871, "grad_norm": 2.522022790991193, "learning_rate": 9.509477274101019e-06, "loss": 0.3387, "step": 850 }, { "epoch": 0.45869098712446355, "grad_norm": 2.3858997562388318, "learning_rate": 9.499315928016619e-06, "loss": 0.3295, "step": 855 }, { "epoch": 0.4613733905579399, "grad_norm": 2.505858567708606, "learning_rate": 9.489055953892644e-06, "loss": 0.3341, "step": 860 }, { "epoch": 0.46405579399141633, "grad_norm": 2.4452476068678908, "learning_rate": 9.478697576634646e-06, "loss": 0.3316, "step": 865 }, { "epoch": 0.4667381974248927, "grad_norm": 2.5452395525308744, "learning_rate": 9.46824102330524e-06, "loss": 0.3364, "step": 870 }, { "epoch": 0.4694206008583691, "grad_norm": 2.4420944612073123, "learning_rate": 9.457686523119128e-06, "loss": 0.3315, "step": 875 }, { "epoch": 0.4721030042918455, "grad_norm": 2.483173454795082, "learning_rate": 9.447034307438068e-06, "loss": 0.3299, "step": 880 }, { "epoch": 0.4747854077253219, "grad_norm": 2.493452761177253, "learning_rate": 9.436284609765818e-06, "loss": 0.3353, "step": 885 }, { "epoch": 0.47746781115879827, "grad_norm": 2.453176864303856, "learning_rate": 9.425437665742998e-06, "loss": 0.3357, "step": 890 }, { "epoch": 0.4801502145922747, "grad_norm": 2.4727667972924734, "learning_rate": 9.414493713141936e-06, "loss": 0.3336, "step": 895 }, { "epoch": 0.48283261802575106, "grad_norm": 2.458171758440651, "learning_rate": 9.403452991861452e-06, "loss": 0.3339, "step": 900 }, { "epoch": 0.4855150214592275, "grad_norm": 2.3736391910133157, "learning_rate": 9.392315743921606e-06, "loss": 0.3395, "step": 905 }, { "epoch": 0.48819742489270385, "grad_norm": 2.305554364897578, "learning_rate": 9.381082213458384e-06, "loss": 0.3368, "step": 910 }, { "epoch": 0.49087982832618027, "grad_norm": 2.5405659444544137, "learning_rate": 9.36975264671835e-06, "loss": 0.3367, "step": 915 }, { "epoch": 0.49356223175965663, "grad_norm": 2.486366217248514, "learning_rate": 9.358327292053244e-06, "loss": 0.3352, "step": 920 }, { "epoch": 0.49624463519313305, "grad_norm": 2.4553374965820014, "learning_rate": 9.346806399914547e-06, "loss": 0.3307, "step": 925 }, { "epoch": 0.4989270386266094, "grad_norm": 2.475216219975318, "learning_rate": 9.335190222847988e-06, "loss": 0.3174, "step": 930 }, { "epoch": 0.5016094420600858, "grad_norm": 2.3789571302193973, "learning_rate": 9.323479015488e-06, "loss": 0.3237, "step": 935 }, { "epoch": 0.5042918454935622, "grad_norm": 2.3230391837350086, "learning_rate": 9.311673034552146e-06, "loss": 0.3319, "step": 940 }, { "epoch": 0.5069742489270386, "grad_norm": 2.4118551244456317, "learning_rate": 9.299772538835492e-06, "loss": 0.3321, "step": 945 }, { "epoch": 0.509656652360515, "grad_norm": 2.292868894219552, "learning_rate": 9.28777778920493e-06, "loss": 0.3249, "step": 950 }, { "epoch": 0.5123390557939914, "grad_norm": 2.4306806304945563, "learning_rate": 9.27568904859346e-06, "loss": 0.338, "step": 955 }, { "epoch": 0.5150214592274678, "grad_norm": 2.357372710792746, "learning_rate": 9.26350658199443e-06, "loss": 0.3333, "step": 960 }, { "epoch": 0.5177038626609443, "grad_norm": 2.421036965572546, "learning_rate": 9.251230656455722e-06, "loss": 0.3402, "step": 965 }, { "epoch": 0.5203862660944206, "grad_norm": 2.5045313756852554, "learning_rate": 9.238861541073909e-06, "loss": 0.3392, "step": 970 }, { "epoch": 0.523068669527897, "grad_norm": 2.4355093742610916, "learning_rate": 9.226399506988336e-06, "loss": 0.335, "step": 975 }, { "epoch": 0.5257510729613734, "grad_norm": 2.330543332243749, "learning_rate": 9.213844827375196e-06, "loss": 0.3335, "step": 980 }, { "epoch": 0.5284334763948498, "grad_norm": 2.536018676820137, "learning_rate": 9.201197777441533e-06, "loss": 0.3238, "step": 985 }, { "epoch": 0.5311158798283262, "grad_norm": 2.399579098411104, "learning_rate": 9.188458634419213e-06, "loss": 0.3323, "step": 990 }, { "epoch": 0.5337982832618026, "grad_norm": 2.43498389222907, "learning_rate": 9.175627677558842e-06, "loss": 0.321, "step": 995 }, { "epoch": 0.5364806866952789, "grad_norm": 2.26672581616932, "learning_rate": 9.162705188123647e-06, "loss": 0.323, "step": 1000 }, { "epoch": 0.5391630901287554, "grad_norm": 2.325071753856254, "learning_rate": 9.149691449383313e-06, "loss": 0.3336, "step": 1005 }, { "epoch": 0.5418454935622318, "grad_norm": 2.2946404475202735, "learning_rate": 9.136586746607767e-06, "loss": 0.3233, "step": 1010 }, { "epoch": 0.5445278969957081, "grad_norm": 2.442316505275983, "learning_rate": 9.123391367060937e-06, "loss": 0.3393, "step": 1015 }, { "epoch": 0.5472103004291845, "grad_norm": 2.5521490255701864, "learning_rate": 9.110105599994436e-06, "loss": 0.343, "step": 1020 }, { "epoch": 0.549892703862661, "grad_norm": 2.3305497472508288, "learning_rate": 9.096729736641242e-06, "loss": 0.3367, "step": 1025 }, { "epoch": 0.5525751072961373, "grad_norm": 2.25720935413374, "learning_rate": 9.0832640702093e-06, "loss": 0.3422, "step": 1030 }, { "epoch": 0.5552575107296137, "grad_norm": 2.415906092571449, "learning_rate": 9.0697088958751e-06, "loss": 0.335, "step": 1035 }, { "epoch": 0.5579399141630901, "grad_norm": 2.2375929147259783, "learning_rate": 9.056064510777204e-06, "loss": 0.3291, "step": 1040 }, { "epoch": 0.5606223175965666, "grad_norm": 2.677950195880169, "learning_rate": 9.042331214009736e-06, "loss": 0.3335, "step": 1045 }, { "epoch": 0.5633047210300429, "grad_norm": 2.311823280122463, "learning_rate": 9.028509306615825e-06, "loss": 0.3308, "step": 1050 }, { "epoch": 0.5659871244635193, "grad_norm": 2.396970916065724, "learning_rate": 9.014599091581e-06, "loss": 0.3287, "step": 1055 }, { "epoch": 0.5686695278969958, "grad_norm": 2.2847365866080467, "learning_rate": 9.000600873826558e-06, "loss": 0.337, "step": 1060 }, { "epoch": 0.5713519313304721, "grad_norm": 2.292999633924368, "learning_rate": 8.98651496020287e-06, "loss": 0.3301, "step": 1065 }, { "epoch": 0.5740343347639485, "grad_norm": 2.2933175284284015, "learning_rate": 8.972341659482666e-06, "loss": 0.3359, "step": 1070 }, { "epoch": 0.5767167381974249, "grad_norm": 2.2283344742754903, "learning_rate": 8.958081282354253e-06, "loss": 0.3336, "step": 1075 }, { "epoch": 0.5793991416309013, "grad_norm": 2.5070943845944855, "learning_rate": 8.943734141414719e-06, "loss": 0.3324, "step": 1080 }, { "epoch": 0.5820815450643777, "grad_norm": 2.267683842345449, "learning_rate": 8.929300551163068e-06, "loss": 0.3314, "step": 1085 }, { "epoch": 0.5847639484978541, "grad_norm": 2.4677926709787474, "learning_rate": 8.914780827993332e-06, "loss": 0.3271, "step": 1090 }, { "epoch": 0.5874463519313304, "grad_norm": 2.43703846838283, "learning_rate": 8.900175290187636e-06, "loss": 0.3379, "step": 1095 }, { "epoch": 0.5901287553648069, "grad_norm": 2.482475990091307, "learning_rate": 8.885484257909218e-06, "loss": 0.3368, "step": 1100 }, { "epoch": 0.5928111587982833, "grad_norm": 2.3823185834033382, "learning_rate": 8.870708053195414e-06, "loss": 0.3289, "step": 1105 }, { "epoch": 0.5954935622317596, "grad_norm": 2.334521807088892, "learning_rate": 8.855846999950595e-06, "loss": 0.3307, "step": 1110 }, { "epoch": 0.598175965665236, "grad_norm": 2.340735623017881, "learning_rate": 8.840901423939075e-06, "loss": 0.3263, "step": 1115 }, { "epoch": 0.6008583690987125, "grad_norm": 2.33972855755703, "learning_rate": 8.825871652777955e-06, "loss": 0.3304, "step": 1120 }, { "epoch": 0.6035407725321889, "grad_norm": 2.4211001060502912, "learning_rate": 8.81075801592996e-06, "loss": 0.3311, "step": 1125 }, { "epoch": 0.6062231759656652, "grad_norm": 2.389080751970636, "learning_rate": 8.795560844696198e-06, "loss": 0.3297, "step": 1130 }, { "epoch": 0.6089055793991416, "grad_norm": 2.305861078874869, "learning_rate": 8.780280472208915e-06, "loss": 0.3234, "step": 1135 }, { "epoch": 0.6115879828326181, "grad_norm": 2.269973580460903, "learning_rate": 8.764917233424179e-06, "loss": 0.3221, "step": 1140 }, { "epoch": 0.6142703862660944, "grad_norm": 2.244789255783976, "learning_rate": 8.749471465114548e-06, "loss": 0.332, "step": 1145 }, { "epoch": 0.6169527896995708, "grad_norm": 2.5737698599599583, "learning_rate": 8.73394350586168e-06, "loss": 0.3287, "step": 1150 }, { "epoch": 0.6196351931330472, "grad_norm": 2.2480057718644018, "learning_rate": 8.71833369604891e-06, "loss": 0.333, "step": 1155 }, { "epoch": 0.6223175965665236, "grad_norm": 2.3155246440956336, "learning_rate": 8.702642377853803e-06, "loss": 0.3262, "step": 1160 }, { "epoch": 0.625, "grad_norm": 2.233547973333564, "learning_rate": 8.686869895240631e-06, "loss": 0.3244, "step": 1165 }, { "epoch": 0.6276824034334764, "grad_norm": 2.3464269986000272, "learning_rate": 8.671016593952853e-06, "loss": 0.3322, "step": 1170 }, { "epoch": 0.6303648068669528, "grad_norm": 2.2816655658038547, "learning_rate": 8.655082821505524e-06, "loss": 0.3207, "step": 1175 }, { "epoch": 0.6330472103004292, "grad_norm": 2.334006714492497, "learning_rate": 8.639068927177684e-06, "loss": 0.3189, "step": 1180 }, { "epoch": 0.6357296137339056, "grad_norm": 2.282379047008958, "learning_rate": 8.622975262004694e-06, "loss": 0.3243, "step": 1185 }, { "epoch": 0.6384120171673819, "grad_norm": 2.3407091854966167, "learning_rate": 8.606802178770551e-06, "loss": 0.3142, "step": 1190 }, { "epoch": 0.6410944206008584, "grad_norm": 2.2572106758010655, "learning_rate": 8.590550032000146e-06, "loss": 0.3214, "step": 1195 }, { "epoch": 0.6437768240343348, "grad_norm": 2.2748714163816977, "learning_rate": 8.574219177951495e-06, "loss": 0.3208, "step": 1200 }, { "epoch": 0.6464592274678111, "grad_norm": 2.3835860967064266, "learning_rate": 8.557809974607936e-06, "loss": 0.3156, "step": 1205 }, { "epoch": 0.6491416309012875, "grad_norm": 2.219631549314143, "learning_rate": 8.541322781670272e-06, "loss": 0.3224, "step": 1210 }, { "epoch": 0.651824034334764, "grad_norm": 2.195766054412958, "learning_rate": 8.524757960548888e-06, "loss": 0.3266, "step": 1215 }, { "epoch": 0.6545064377682404, "grad_norm": 2.289035430876054, "learning_rate": 8.50811587435584e-06, "loss": 0.3319, "step": 1220 }, { "epoch": 0.6571888412017167, "grad_norm": 2.2247473050344833, "learning_rate": 8.491396887896878e-06, "loss": 0.3219, "step": 1225 }, { "epoch": 0.6598712446351931, "grad_norm": 2.1315097672742174, "learning_rate": 8.474601367663463e-06, "loss": 0.3366, "step": 1230 }, { "epoch": 0.6625536480686696, "grad_norm": 2.3388502053664624, "learning_rate": 8.457729681824722e-06, "loss": 0.3338, "step": 1235 }, { "epoch": 0.6652360515021459, "grad_norm": 2.4148894464728916, "learning_rate": 8.440782200219391e-06, "loss": 0.3327, "step": 1240 }, { "epoch": 0.6679184549356223, "grad_norm": 3.8302928644643344, "learning_rate": 8.423759294347693e-06, "loss": 0.3331, "step": 1245 }, { "epoch": 0.6706008583690987, "grad_norm": 2.429237188345722, "learning_rate": 8.40666133736321e-06, "loss": 0.3324, "step": 1250 }, { "epoch": 0.6732832618025751, "grad_norm": 2.1025491872375794, "learning_rate": 8.389488704064686e-06, "loss": 0.318, "step": 1255 }, { "epoch": 0.6759656652360515, "grad_norm": 2.262233034421088, "learning_rate": 8.372241770887826e-06, "loss": 0.3296, "step": 1260 }, { "epoch": 0.6786480686695279, "grad_norm": 2.264871201049536, "learning_rate": 8.354920915897038e-06, "loss": 0.3203, "step": 1265 }, { "epoch": 0.6813304721030042, "grad_norm": 2.076909015535434, "learning_rate": 8.337526518777143e-06, "loss": 0.3188, "step": 1270 }, { "epoch": 0.6840128755364807, "grad_norm": 2.1428586849753026, "learning_rate": 8.32005896082506e-06, "loss": 0.327, "step": 1275 }, { "epoch": 0.6866952789699571, "grad_norm": 2.127948506611697, "learning_rate": 8.302518624941435e-06, "loss": 0.3294, "step": 1280 }, { "epoch": 0.6893776824034334, "grad_norm": 2.3201539807593483, "learning_rate": 8.284905895622265e-06, "loss": 0.3273, "step": 1285 }, { "epoch": 0.6920600858369099, "grad_norm": 2.088162375885553, "learning_rate": 8.26722115895045e-06, "loss": 0.3114, "step": 1290 }, { "epoch": 0.6947424892703863, "grad_norm": 2.264838313054302, "learning_rate": 8.249464802587353e-06, "loss": 0.3354, "step": 1295 }, { "epoch": 0.6974248927038627, "grad_norm": 2.123718589579232, "learning_rate": 8.231637215764273e-06, "loss": 0.3259, "step": 1300 }, { "epoch": 0.700107296137339, "grad_norm": 2.0627388841699332, "learning_rate": 8.21373878927394e-06, "loss": 0.318, "step": 1305 }, { "epoch": 0.7027896995708155, "grad_norm": 2.1640157174478984, "learning_rate": 8.195769915461931e-06, "loss": 0.3177, "step": 1310 }, { "epoch": 0.7054721030042919, "grad_norm": 2.269514848951079, "learning_rate": 8.177730988218083e-06, "loss": 0.3209, "step": 1315 }, { "epoch": 0.7081545064377682, "grad_norm": 2.122207553121707, "learning_rate": 8.159622402967841e-06, "loss": 0.3231, "step": 1320 }, { "epoch": 0.7108369098712446, "grad_norm": 2.179205043969236, "learning_rate": 8.141444556663612e-06, "loss": 0.3256, "step": 1325 }, { "epoch": 0.7135193133047211, "grad_norm": 2.3230582464480096, "learning_rate": 8.123197847776043e-06, "loss": 0.3145, "step": 1330 }, { "epoch": 0.7162017167381974, "grad_norm": 2.1333711500312686, "learning_rate": 8.104882676285301e-06, "loss": 0.3217, "step": 1335 }, { "epoch": 0.7188841201716738, "grad_norm": 2.245766068620901, "learning_rate": 8.086499443672297e-06, "loss": 0.3309, "step": 1340 }, { "epoch": 0.7215665236051502, "grad_norm": 2.3431306789729227, "learning_rate": 8.068048552909887e-06, "loss": 0.312, "step": 1345 }, { "epoch": 0.7242489270386266, "grad_norm": 2.2346832053248646, "learning_rate": 8.049530408454041e-06, "loss": 0.317, "step": 1350 }, { "epoch": 0.726931330472103, "grad_norm": 2.0292338590090147, "learning_rate": 8.030945416234971e-06, "loss": 0.309, "step": 1355 }, { "epoch": 0.7296137339055794, "grad_norm": 2.158639341677383, "learning_rate": 8.012293983648247e-06, "loss": 0.3137, "step": 1360 }, { "epoch": 0.7322961373390557, "grad_norm": 2.1689628455177794, "learning_rate": 7.993576519545844e-06, "loss": 0.3133, "step": 1365 }, { "epoch": 0.7349785407725322, "grad_norm": 2.3090614011323933, "learning_rate": 7.974793434227203e-06, "loss": 0.3238, "step": 1370 }, { "epoch": 0.7376609442060086, "grad_norm": 2.1217112214975673, "learning_rate": 7.955945139430221e-06, "loss": 0.3167, "step": 1375 }, { "epoch": 0.740343347639485, "grad_norm": 2.1215030056703004, "learning_rate": 7.937032048322231e-06, "loss": 0.3127, "step": 1380 }, { "epoch": 0.7430257510729614, "grad_norm": 2.2142767774411016, "learning_rate": 7.918054575490943e-06, "loss": 0.3265, "step": 1385 }, { "epoch": 0.7457081545064378, "grad_norm": 2.1780400492888354, "learning_rate": 7.899013136935365e-06, "loss": 0.3085, "step": 1390 }, { "epoch": 0.7483905579399142, "grad_norm": 2.0298352979187575, "learning_rate": 7.879908150056668e-06, "loss": 0.3091, "step": 1395 }, { "epoch": 0.7510729613733905, "grad_norm": 2.2026581596650243, "learning_rate": 7.860740033649053e-06, "loss": 0.3212, "step": 1400 }, { "epoch": 0.753755364806867, "grad_norm": 2.025023000154723, "learning_rate": 7.841509207890555e-06, "loss": 0.3094, "step": 1405 }, { "epoch": 0.7564377682403434, "grad_norm": 2.2724658958003805, "learning_rate": 7.822216094333847e-06, "loss": 0.3102, "step": 1410 }, { "epoch": 0.7591201716738197, "grad_norm": 2.0586319723016637, "learning_rate": 7.802861115896988e-06, "loss": 0.3113, "step": 1415 }, { "epoch": 0.7618025751072961, "grad_norm": 2.114190748749228, "learning_rate": 7.783444696854161e-06, "loss": 0.316, "step": 1420 }, { "epoch": 0.7644849785407726, "grad_norm": 2.1915835233585397, "learning_rate": 7.763967262826363e-06, "loss": 0.3068, "step": 1425 }, { "epoch": 0.7671673819742489, "grad_norm": 2.106341062609497, "learning_rate": 7.74442924077209e-06, "loss": 0.3121, "step": 1430 }, { "epoch": 0.7698497854077253, "grad_norm": 2.2367399439878217, "learning_rate": 7.724831058977955e-06, "loss": 0.3161, "step": 1435 }, { "epoch": 0.7725321888412017, "grad_norm": 2.138786537260804, "learning_rate": 7.705173147049326e-06, "loss": 0.3177, "step": 1440 }, { "epoch": 0.7752145922746781, "grad_norm": 1.9953199910560304, "learning_rate": 7.685455935900886e-06, "loss": 0.3079, "step": 1445 }, { "epoch": 0.7778969957081545, "grad_norm": 2.0920590199358533, "learning_rate": 7.665679857747204e-06, "loss": 0.3222, "step": 1450 }, { "epoch": 0.7805793991416309, "grad_norm": 2.1092103888387204, "learning_rate": 7.645845346093246e-06, "loss": 0.316, "step": 1455 }, { "epoch": 0.7832618025751072, "grad_norm": 2.042467621524812, "learning_rate": 7.625952835724892e-06, "loss": 0.3164, "step": 1460 }, { "epoch": 0.7859442060085837, "grad_norm": 2.2854070209855135, "learning_rate": 7.606002762699378e-06, "loss": 0.3209, "step": 1465 }, { "epoch": 0.7886266094420601, "grad_norm": 2.129201075036926, "learning_rate": 7.585995564335764e-06, "loss": 0.3117, "step": 1470 }, { "epoch": 0.7913090128755365, "grad_norm": 2.2563643165040905, "learning_rate": 7.565931679205329e-06, "loss": 0.3191, "step": 1475 }, { "epoch": 0.7939914163090128, "grad_norm": 2.104554696249556, "learning_rate": 7.545811547121969e-06, "loss": 0.3134, "step": 1480 }, { "epoch": 0.7966738197424893, "grad_norm": 2.1306942215262747, "learning_rate": 7.525635609132543e-06, "loss": 0.3258, "step": 1485 }, { "epoch": 0.7993562231759657, "grad_norm": 2.135154550941403, "learning_rate": 7.505404307507227e-06, "loss": 0.3074, "step": 1490 }, { "epoch": 0.802038626609442, "grad_norm": 2.123478070304184, "learning_rate": 7.48511808572979e-06, "loss": 0.3113, "step": 1495 }, { "epoch": 0.8047210300429185, "grad_norm": 2.1029629790480677, "learning_rate": 7.464777388487899e-06, "loss": 0.3119, "step": 1500 }, { "epoch": 0.8074034334763949, "grad_norm": 2.064717882237083, "learning_rate": 7.4443826616633555e-06, "loss": 0.3178, "step": 1505 }, { "epoch": 0.8100858369098712, "grad_norm": 2.2821056948735183, "learning_rate": 7.423934352322324e-06, "loss": 0.3184, "step": 1510 }, { "epoch": 0.8127682403433476, "grad_norm": 1.9432859276082908, "learning_rate": 7.403432908705537e-06, "loss": 0.317, "step": 1515 }, { "epoch": 0.8154506437768241, "grad_norm": 2.1596795779966262, "learning_rate": 7.382878780218466e-06, "loss": 0.312, "step": 1520 }, { "epoch": 0.8181330472103004, "grad_norm": 2.2263428321455723, "learning_rate": 7.362272417421467e-06, "loss": 0.3053, "step": 1525 }, { "epoch": 0.8208154506437768, "grad_norm": 2.052397724990397, "learning_rate": 7.341614272019912e-06, "loss": 0.312, "step": 1530 }, { "epoch": 0.8234978540772532, "grad_norm": 2.0250517824449252, "learning_rate": 7.3209047968542815e-06, "loss": 0.3133, "step": 1535 }, { "epoch": 0.8261802575107297, "grad_norm": 2.1078510422026886, "learning_rate": 7.300144445890236e-06, "loss": 0.3088, "step": 1540 }, { "epoch": 0.828862660944206, "grad_norm": 2.0603861634003873, "learning_rate": 7.279333674208671e-06, "loss": 0.3125, "step": 1545 }, { "epoch": 0.8315450643776824, "grad_norm": 2.0394959875230376, "learning_rate": 7.258472937995736e-06, "loss": 0.3057, "step": 1550 }, { "epoch": 0.8342274678111588, "grad_norm": 2.1154958189964246, "learning_rate": 7.23756269453284e-06, "loss": 0.3229, "step": 1555 }, { "epoch": 0.8369098712446352, "grad_norm": 2.1061740410536736, "learning_rate": 7.216603402186618e-06, "loss": 0.3223, "step": 1560 }, { "epoch": 0.8395922746781116, "grad_norm": 2.1666694294371447, "learning_rate": 7.195595520398898e-06, "loss": 0.3203, "step": 1565 }, { "epoch": 0.842274678111588, "grad_norm": 2.0544672340134684, "learning_rate": 7.174539509676612e-06, "loss": 0.3123, "step": 1570 }, { "epoch": 0.8449570815450643, "grad_norm": 2.060820804792574, "learning_rate": 7.153435831581722e-06, "loss": 0.3176, "step": 1575 }, { "epoch": 0.8476394849785408, "grad_norm": 1.9310023216744814, "learning_rate": 7.132284948721079e-06, "loss": 0.3111, "step": 1580 }, { "epoch": 0.8503218884120172, "grad_norm": 2.1274231776005896, "learning_rate": 7.1110873247363035e-06, "loss": 0.306, "step": 1585 }, { "epoch": 0.8530042918454935, "grad_norm": 2.147749360638321, "learning_rate": 7.089843424293606e-06, "loss": 0.3129, "step": 1590 }, { "epoch": 0.85568669527897, "grad_norm": 1.9192451256579208, "learning_rate": 7.0685537130736145e-06, "loss": 0.3171, "step": 1595 }, { "epoch": 0.8583690987124464, "grad_norm": 2.006562509961495, "learning_rate": 7.047218657761156e-06, "loss": 0.3079, "step": 1600 }, { "epoch": 0.8610515021459227, "grad_norm": 1.9976147103903519, "learning_rate": 7.025838726035032e-06, "loss": 0.3176, "step": 1605 }, { "epoch": 0.8637339055793991, "grad_norm": 1.9212023443333308, "learning_rate": 7.004414386557765e-06, "loss": 0.3104, "step": 1610 }, { "epoch": 0.8664163090128756, "grad_norm": 2.1402465900938115, "learning_rate": 6.982946108965326e-06, "loss": 0.3096, "step": 1615 }, { "epoch": 0.869098712446352, "grad_norm": 2.010681271869721, "learning_rate": 6.961434363856836e-06, "loss": 0.3057, "step": 1620 }, { "epoch": 0.8717811158798283, "grad_norm": 2.0204223918270814, "learning_rate": 6.939879622784259e-06, "loss": 0.314, "step": 1625 }, { "epoch": 0.8744635193133047, "grad_norm": 1.9316355248793622, "learning_rate": 6.918282358242053e-06, "loss": 0.3087, "step": 1630 }, { "epoch": 0.8771459227467812, "grad_norm": 2.0156913222609263, "learning_rate": 6.896643043656826e-06, "loss": 0.2991, "step": 1635 }, { "epoch": 0.8798283261802575, "grad_norm": 1.8786303347180955, "learning_rate": 6.874962153376945e-06, "loss": 0.2988, "step": 1640 }, { "epoch": 0.8825107296137339, "grad_norm": 2.026958173248558, "learning_rate": 6.853240162662149e-06, "loss": 0.304, "step": 1645 }, { "epoch": 0.8851931330472103, "grad_norm": 1.9294190561716569, "learning_rate": 6.831477547673122e-06, "loss": 0.3054, "step": 1650 }, { "epoch": 0.8878755364806867, "grad_norm": 2.138444156957396, "learning_rate": 6.8096747854610634e-06, "loss": 0.3061, "step": 1655 }, { "epoch": 0.8905579399141631, "grad_norm": 1.9329582418293967, "learning_rate": 6.787832353957225e-06, "loss": 0.3158, "step": 1660 }, { "epoch": 0.8932403433476395, "grad_norm": 1.9473598026947105, "learning_rate": 6.7659507319624355e-06, "loss": 0.314, "step": 1665 }, { "epoch": 0.8959227467811158, "grad_norm": 1.9222468117438298, "learning_rate": 6.744030399136606e-06, "loss": 0.3051, "step": 1670 }, { "epoch": 0.8986051502145923, "grad_norm": 2.0241538727369752, "learning_rate": 6.722071835988217e-06, "loss": 0.3161, "step": 1675 }, { "epoch": 0.9012875536480687, "grad_norm": 1.9593352526633656, "learning_rate": 6.700075523863783e-06, "loss": 0.3029, "step": 1680 }, { "epoch": 0.903969957081545, "grad_norm": 1.9447267787452907, "learning_rate": 6.678041944937297e-06, "loss": 0.3057, "step": 1685 }, { "epoch": 0.9066523605150214, "grad_norm": 1.9006711851063618, "learning_rate": 6.655971582199672e-06, "loss": 0.3049, "step": 1690 }, { "epoch": 0.9093347639484979, "grad_norm": 2.0215374329425106, "learning_rate": 6.633864919448143e-06, "loss": 0.3182, "step": 1695 }, { "epoch": 0.9120171673819742, "grad_norm": 2.098630005043553, "learning_rate": 6.611722441275666e-06, "loss": 0.3025, "step": 1700 }, { "epoch": 0.9146995708154506, "grad_norm": 2.1127598840757438, "learning_rate": 6.589544633060298e-06, "loss": 0.304, "step": 1705 }, { "epoch": 0.9173819742489271, "grad_norm": 1.9889146049121293, "learning_rate": 6.5673319809545496e-06, "loss": 0.3095, "step": 1710 }, { "epoch": 0.9200643776824035, "grad_norm": 2.087177436054678, "learning_rate": 6.545084971874738e-06, "loss": 0.3164, "step": 1715 }, { "epoch": 0.9227467811158798, "grad_norm": 1.9241219937062042, "learning_rate": 6.522804093490305e-06, "loss": 0.3065, "step": 1720 }, { "epoch": 0.9254291845493562, "grad_norm": 1.965286294685453, "learning_rate": 6.50048983421313e-06, "loss": 0.3127, "step": 1725 }, { "epoch": 0.9281115879828327, "grad_norm": 2.101887531148685, "learning_rate": 6.478142683186827e-06, "loss": 0.3021, "step": 1730 }, { "epoch": 0.930793991416309, "grad_norm": 2.026014742718446, "learning_rate": 6.455763130276019e-06, "loss": 0.3082, "step": 1735 }, { "epoch": 0.9334763948497854, "grad_norm": 1.9201233096208363, "learning_rate": 6.433351666055598e-06, "loss": 0.2972, "step": 1740 }, { "epoch": 0.9361587982832618, "grad_norm": 1.9217768515294535, "learning_rate": 6.410908781799974e-06, "loss": 0.2995, "step": 1745 }, { "epoch": 0.9388412017167382, "grad_norm": 1.999671813664054, "learning_rate": 6.388434969472307e-06, "loss": 0.3071, "step": 1750 }, { "epoch": 0.9415236051502146, "grad_norm": 1.791293771696864, "learning_rate": 6.365930721713718e-06, "loss": 0.308, "step": 1755 }, { "epoch": 0.944206008583691, "grad_norm": 2.0291022086947517, "learning_rate": 6.343396531832497e-06, "loss": 0.3127, "step": 1760 }, { "epoch": 0.9468884120171673, "grad_norm": 1.941721220274983, "learning_rate": 6.320832893793285e-06, "loss": 0.3123, "step": 1765 }, { "epoch": 0.9495708154506438, "grad_norm": 2.0683501972359206, "learning_rate": 6.298240302206242e-06, "loss": 0.3009, "step": 1770 }, { "epoch": 0.9522532188841202, "grad_norm": 1.8993937810214891, "learning_rate": 6.275619252316213e-06, "loss": 0.3058, "step": 1775 }, { "epoch": 0.9549356223175965, "grad_norm": 1.8984190017829987, "learning_rate": 6.25297023999187e-06, "loss": 0.3118, "step": 1780 }, { "epoch": 0.9576180257510729, "grad_norm": 1.995180760043102, "learning_rate": 6.2302937617148365e-06, "loss": 0.3079, "step": 1785 }, { "epoch": 0.9603004291845494, "grad_norm": 1.8676656033530143, "learning_rate": 6.20759031456881e-06, "loss": 0.3077, "step": 1790 }, { "epoch": 0.9629828326180258, "grad_norm": 1.8278398077002314, "learning_rate": 6.184860396228664e-06, "loss": 0.3063, "step": 1795 }, { "epoch": 0.9656652360515021, "grad_norm": 1.8048374517283314, "learning_rate": 6.1621045049495376e-06, "loss": 0.2857, "step": 1800 }, { "epoch": 0.9683476394849786, "grad_norm": 1.944999842829527, "learning_rate": 6.139323139555914e-06, "loss": 0.3024, "step": 1805 }, { "epoch": 0.971030042918455, "grad_norm": 1.8832607970350839, "learning_rate": 6.116516799430689e-06, "loss": 0.3036, "step": 1810 }, { "epoch": 0.9737124463519313, "grad_norm": 1.9338585968796769, "learning_rate": 6.0936859845042164e-06, "loss": 0.2975, "step": 1815 }, { "epoch": 0.9763948497854077, "grad_norm": 1.882527757996201, "learning_rate": 6.07083119524336e-06, "loss": 0.3057, "step": 1820 }, { "epoch": 0.9790772532188842, "grad_norm": 1.9678086753998791, "learning_rate": 6.047952932640513e-06, "loss": 0.297, "step": 1825 }, { "epoch": 0.9817596566523605, "grad_norm": 1.9149379294568063, "learning_rate": 6.0250516982026205e-06, "loss": 0.3115, "step": 1830 }, { "epoch": 0.9844420600858369, "grad_norm": 1.9981298804152876, "learning_rate": 6.002127993940187e-06, "loss": 0.2894, "step": 1835 }, { "epoch": 0.9871244635193133, "grad_norm": 2.0009718614650978, "learning_rate": 5.979182322356269e-06, "loss": 0.3158, "step": 1840 }, { "epoch": 0.9898068669527897, "grad_norm": 1.9391820355717926, "learning_rate": 5.956215186435464e-06, "loss": 0.3055, "step": 1845 }, { "epoch": 0.9924892703862661, "grad_norm": 1.9437317790715514, "learning_rate": 5.9332270896328815e-06, "loss": 0.2948, "step": 1850 }, { "epoch": 0.9951716738197425, "grad_norm": 1.8651628866078283, "learning_rate": 5.910218535863106e-06, "loss": 0.2929, "step": 1855 }, { "epoch": 0.9978540772532188, "grad_norm": 1.8374878014597336, "learning_rate": 5.8871900294891525e-06, "loss": 0.2971, "step": 1860 }, { "epoch": 1.0, "eval_runtime": 264.8676, "eval_samples_per_second": 3.775, "eval_steps_per_second": 0.944, "step": 1864 }, { "epoch": 1.0005364806866952, "grad_norm": 2.1465231831949194, "learning_rate": 5.864142075311414e-06, "loss": 0.2823, "step": 1865 }, { "epoch": 1.0032188841201717, "grad_norm": 2.835907131571581, "learning_rate": 5.84107517855659e-06, "loss": 0.2054, "step": 1870 }, { "epoch": 1.0059012875536482, "grad_norm": 1.9826942171677255, "learning_rate": 5.817989844866613e-06, "loss": 0.1922, "step": 1875 }, { "epoch": 1.0085836909871244, "grad_norm": 2.1591088746556317, "learning_rate": 5.794886580287565e-06, "loss": 0.1996, "step": 1880 }, { "epoch": 1.011266094420601, "grad_norm": 1.9963566164082294, "learning_rate": 5.77176589125859e-06, "loss": 0.2, "step": 1885 }, { "epoch": 1.0139484978540771, "grad_norm": 2.226264123202166, "learning_rate": 5.7486282846007835e-06, "loss": 0.1908, "step": 1890 }, { "epoch": 1.0166309012875536, "grad_norm": 1.90926028127329, "learning_rate": 5.725474267506088e-06, "loss": 0.2034, "step": 1895 }, { "epoch": 1.01931330472103, "grad_norm": 2.219206268718368, "learning_rate": 5.702304347526172e-06, "loss": 0.1879, "step": 1900 }, { "epoch": 1.0219957081545064, "grad_norm": 2.0360503333559183, "learning_rate": 5.679119032561311e-06, "loss": 0.1852, "step": 1905 }, { "epoch": 1.0246781115879828, "grad_norm": 1.81221408386523, "learning_rate": 5.655918830849243e-06, "loss": 0.1882, "step": 1910 }, { "epoch": 1.0273605150214593, "grad_norm": 1.8036898984239804, "learning_rate": 5.632704250954039e-06, "loss": 0.1956, "step": 1915 }, { "epoch": 1.0300429184549356, "grad_norm": 2.011741953472937, "learning_rate": 5.6094758017549436e-06, "loss": 0.1921, "step": 1920 }, { "epoch": 1.032725321888412, "grad_norm": 1.8763118588406138, "learning_rate": 5.5862339924352306e-06, "loss": 0.1941, "step": 1925 }, { "epoch": 1.0354077253218885, "grad_norm": 1.8972603788347657, "learning_rate": 5.562979332471035e-06, "loss": 0.183, "step": 1930 }, { "epoch": 1.0380901287553648, "grad_norm": 1.8996557084164813, "learning_rate": 5.539712331620186e-06, "loss": 0.1948, "step": 1935 }, { "epoch": 1.0407725321888412, "grad_norm": 1.884964348275225, "learning_rate": 5.516433499911035e-06, "loss": 0.1912, "step": 1940 }, { "epoch": 1.0434549356223175, "grad_norm": 2.0109538940169496, "learning_rate": 5.493143347631272e-06, "loss": 0.1894, "step": 1945 }, { "epoch": 1.046137339055794, "grad_norm": 1.9435415532263924, "learning_rate": 5.4698423853167425e-06, "loss": 0.1913, "step": 1950 }, { "epoch": 1.0488197424892705, "grad_norm": 1.9021619609069815, "learning_rate": 5.446531123740257e-06, "loss": 0.1898, "step": 1955 }, { "epoch": 1.0515021459227467, "grad_norm": 1.8691859260241384, "learning_rate": 5.4232100739003855e-06, "loss": 0.1947, "step": 1960 }, { "epoch": 1.0541845493562232, "grad_norm": 1.9278006073349867, "learning_rate": 5.399879747010275e-06, "loss": 0.2016, "step": 1965 }, { "epoch": 1.0568669527896997, "grad_norm": 1.8962668797019708, "learning_rate": 5.376540654486422e-06, "loss": 0.1879, "step": 1970 }, { "epoch": 1.059549356223176, "grad_norm": 2.140834654747942, "learning_rate": 5.353193307937477e-06, "loss": 0.2021, "step": 1975 }, { "epoch": 1.0622317596566524, "grad_norm": 1.9563740759017338, "learning_rate": 5.32983821915302e-06, "loss": 0.1907, "step": 1980 }, { "epoch": 1.0649141630901287, "grad_norm": 1.9312909115421, "learning_rate": 5.306475900092348e-06, "loss": 0.1889, "step": 1985 }, { "epoch": 1.0675965665236051, "grad_norm": 1.8461568136911246, "learning_rate": 5.283106862873253e-06, "loss": 0.1861, "step": 1990 }, { "epoch": 1.0702789699570816, "grad_norm": 1.9611737417143975, "learning_rate": 5.259731619760792e-06, "loss": 0.1846, "step": 1995 }, { "epoch": 1.0729613733905579, "grad_norm": 1.9222420116013819, "learning_rate": 5.236350683156055e-06, "loss": 0.189, "step": 2000 }, { "epoch": 1.0756437768240343, "grad_norm": 1.8398929156199553, "learning_rate": 5.212964565584944e-06, "loss": 0.1925, "step": 2005 }, { "epoch": 1.0783261802575108, "grad_norm": 1.8756803869158039, "learning_rate": 5.189573779686929e-06, "loss": 0.1952, "step": 2010 }, { "epoch": 1.081008583690987, "grad_norm": 1.9193238743297847, "learning_rate": 5.166178838203808e-06, "loss": 0.196, "step": 2015 }, { "epoch": 1.0836909871244635, "grad_norm": 1.8933730256658954, "learning_rate": 5.142780253968481e-06, "loss": 0.1871, "step": 2020 }, { "epoch": 1.0863733905579398, "grad_norm": 1.971927316697686, "learning_rate": 5.119378539893693e-06, "loss": 0.1913, "step": 2025 }, { "epoch": 1.0890557939914163, "grad_norm": 1.9574781217785189, "learning_rate": 5.095974208960799e-06, "loss": 0.1959, "step": 2030 }, { "epoch": 1.0917381974248928, "grad_norm": 2.168409304530154, "learning_rate": 5.072567774208518e-06, "loss": 0.1904, "step": 2035 }, { "epoch": 1.094420600858369, "grad_norm": 1.8100899333654916, "learning_rate": 5.049159748721685e-06, "loss": 0.1862, "step": 2040 }, { "epoch": 1.0971030042918455, "grad_norm": 2.070676104731243, "learning_rate": 5.025750645620004e-06, "loss": 0.1835, "step": 2045 }, { "epoch": 1.099785407725322, "grad_norm": 1.8392297513288312, "learning_rate": 5.002340978046807e-06, "loss": 0.1853, "step": 2050 }, { "epoch": 1.1024678111587982, "grad_norm": 1.8971251627665042, "learning_rate": 4.978931259157791e-06, "loss": 0.1858, "step": 2055 }, { "epoch": 1.1051502145922747, "grad_norm": 1.923105775536507, "learning_rate": 4.955522002109782e-06, "loss": 0.1913, "step": 2060 }, { "epoch": 1.1078326180257512, "grad_norm": 1.8547858179770311, "learning_rate": 4.932113720049485e-06, "loss": 0.196, "step": 2065 }, { "epoch": 1.1105150214592274, "grad_norm": 1.7715573310321981, "learning_rate": 4.908706926102229e-06, "loss": 0.184, "step": 2070 }, { "epoch": 1.113197424892704, "grad_norm": 1.9483276780491752, "learning_rate": 4.885302133360722e-06, "loss": 0.1928, "step": 2075 }, { "epoch": 1.1158798283261802, "grad_norm": 1.955128881315733, "learning_rate": 4.8618998548738065e-06, "loss": 0.1899, "step": 2080 }, { "epoch": 1.1185622317596566, "grad_norm": 1.8143501780550073, "learning_rate": 4.8385006036352104e-06, "loss": 0.1802, "step": 2085 }, { "epoch": 1.121244635193133, "grad_norm": 1.914782046325854, "learning_rate": 4.8151048925723014e-06, "loss": 0.1969, "step": 2090 }, { "epoch": 1.1239270386266094, "grad_norm": 1.8580160050691163, "learning_rate": 4.791713234534844e-06, "loss": 0.1857, "step": 2095 }, { "epoch": 1.1266094420600858, "grad_norm": 1.7456925046922118, "learning_rate": 4.768326142283757e-06, "loss": 0.1827, "step": 2100 }, { "epoch": 1.1292918454935623, "grad_norm": 1.9059969926415221, "learning_rate": 4.744944128479879e-06, "loss": 0.1885, "step": 2105 }, { "epoch": 1.1319742489270386, "grad_norm": 1.8503800835661866, "learning_rate": 4.7215677056727185e-06, "loss": 0.1909, "step": 2110 }, { "epoch": 1.134656652360515, "grad_norm": 1.925446147659633, "learning_rate": 4.698197386289232e-06, "loss": 0.1882, "step": 2115 }, { "epoch": 1.1373390557939915, "grad_norm": 2.026187494704868, "learning_rate": 4.674833682622577e-06, "loss": 0.1928, "step": 2120 }, { "epoch": 1.1400214592274678, "grad_norm": 2.1197969410621216, "learning_rate": 4.6514771068209e-06, "loss": 0.1866, "step": 2125 }, { "epoch": 1.1427038626609443, "grad_norm": 1.8690779309375414, "learning_rate": 4.628128170876093e-06, "loss": 0.1901, "step": 2130 }, { "epoch": 1.1453862660944205, "grad_norm": 1.884559663910041, "learning_rate": 4.604787386612579e-06, "loss": 0.189, "step": 2135 }, { "epoch": 1.148068669527897, "grad_norm": 1.8046390167534845, "learning_rate": 4.581455265676089e-06, "loss": 0.1945, "step": 2140 }, { "epoch": 1.1507510729613735, "grad_norm": 1.8683227420775612, "learning_rate": 4.558132319522451e-06, "loss": 0.1734, "step": 2145 }, { "epoch": 1.1534334763948497, "grad_norm": 1.7381200436139665, "learning_rate": 4.534819059406374e-06, "loss": 0.1845, "step": 2150 }, { "epoch": 1.1561158798283262, "grad_norm": 2.025453682495107, "learning_rate": 4.511515996370244e-06, "loss": 0.1973, "step": 2155 }, { "epoch": 1.1587982832618025, "grad_norm": 2.069820396761178, "learning_rate": 4.488223641232915e-06, "loss": 0.1935, "step": 2160 }, { "epoch": 1.161480686695279, "grad_norm": 1.972809924154737, "learning_rate": 4.464942504578524e-06, "loss": 0.1903, "step": 2165 }, { "epoch": 1.1641630901287554, "grad_norm": 1.8963041505297642, "learning_rate": 4.441673096745287e-06, "loss": 0.1933, "step": 2170 }, { "epoch": 1.1668454935622319, "grad_norm": 1.949719913762463, "learning_rate": 4.418415927814315e-06, "loss": 0.1854, "step": 2175 }, { "epoch": 1.1695278969957081, "grad_norm": 1.838505943416919, "learning_rate": 4.395171507598441e-06, "loss": 0.1826, "step": 2180 }, { "epoch": 1.1722103004291846, "grad_norm": 2.066384468146897, "learning_rate": 4.371940345631027e-06, "loss": 0.1946, "step": 2185 }, { "epoch": 1.1748927038626609, "grad_norm": 1.8351688532447725, "learning_rate": 4.348722951154816e-06, "loss": 0.1842, "step": 2190 }, { "epoch": 1.1775751072961373, "grad_norm": 1.872111653127208, "learning_rate": 4.3255198331107485e-06, "loss": 0.1883, "step": 2195 }, { "epoch": 1.1802575107296138, "grad_norm": 1.886316421614782, "learning_rate": 4.302331500126824e-06, "loss": 0.1859, "step": 2200 }, { "epoch": 1.18293991416309, "grad_norm": 1.8252623302278148, "learning_rate": 4.279158460506939e-06, "loss": 0.1827, "step": 2205 }, { "epoch": 1.1856223175965666, "grad_norm": 1.8400802753502186, "learning_rate": 4.256001222219751e-06, "loss": 0.1844, "step": 2210 }, { "epoch": 1.1883047210300428, "grad_norm": 1.8818344201338155, "learning_rate": 4.232860292887537e-06, "loss": 0.179, "step": 2215 }, { "epoch": 1.1909871244635193, "grad_norm": 1.9741627465587812, "learning_rate": 4.2097361797750815e-06, "loss": 0.1804, "step": 2220 }, { "epoch": 1.1936695278969958, "grad_norm": 1.8275554011726929, "learning_rate": 4.1866293897785356e-06, "loss": 0.1859, "step": 2225 }, { "epoch": 1.196351931330472, "grad_norm": 1.7451512261577948, "learning_rate": 4.16354042941432e-06, "loss": 0.1827, "step": 2230 }, { "epoch": 1.1990343347639485, "grad_norm": 1.7461056439001637, "learning_rate": 4.1404698048080175e-06, "loss": 0.1924, "step": 2235 }, { "epoch": 1.201716738197425, "grad_norm": 1.9631829460204946, "learning_rate": 4.117418021683278e-06, "loss": 0.1901, "step": 2240 }, { "epoch": 1.2043991416309012, "grad_norm": 1.9959602820602425, "learning_rate": 4.094385585350736e-06, "loss": 0.1878, "step": 2245 }, { "epoch": 1.2070815450643777, "grad_norm": 1.868353082690321, "learning_rate": 4.0713730006969285e-06, "loss": 0.1837, "step": 2250 }, { "epoch": 1.2097639484978542, "grad_norm": 1.8647123845921825, "learning_rate": 4.048380772173231e-06, "loss": 0.1952, "step": 2255 }, { "epoch": 1.2124463519313304, "grad_norm": 1.9424516201873914, "learning_rate": 4.0254094037848005e-06, "loss": 0.1842, "step": 2260 }, { "epoch": 1.215128755364807, "grad_norm": 1.8740890147227733, "learning_rate": 4.002459399079523e-06, "loss": 0.1856, "step": 2265 }, { "epoch": 1.2178111587982832, "grad_norm": 1.8960654007326538, "learning_rate": 3.979531261136981e-06, "loss": 0.1904, "step": 2270 }, { "epoch": 1.2204935622317596, "grad_norm": 1.893645018391927, "learning_rate": 3.956625492557417e-06, "loss": 0.1839, "step": 2275 }, { "epoch": 1.2231759656652361, "grad_norm": 1.9010652191534456, "learning_rate": 3.933742595450733e-06, "loss": 0.1845, "step": 2280 }, { "epoch": 1.2258583690987124, "grad_norm": 2.001816005554033, "learning_rate": 3.910883071425463e-06, "loss": 0.1892, "step": 2285 }, { "epoch": 1.2285407725321889, "grad_norm": 1.8924184483931448, "learning_rate": 3.8880474215777915e-06, "loss": 0.1838, "step": 2290 }, { "epoch": 1.2312231759656653, "grad_norm": 1.7498884870572744, "learning_rate": 3.865236146480562e-06, "loss": 0.1869, "step": 2295 }, { "epoch": 1.2339055793991416, "grad_norm": 1.8794585911654438, "learning_rate": 3.842449746172311e-06, "loss": 0.1821, "step": 2300 }, { "epoch": 1.236587982832618, "grad_norm": 2.0520617557462444, "learning_rate": 3.8196887201463e-06, "loss": 0.1908, "step": 2305 }, { "epoch": 1.2392703862660945, "grad_norm": 1.9391936764467865, "learning_rate": 3.796953567339571e-06, "loss": 0.1939, "step": 2310 }, { "epoch": 1.2419527896995708, "grad_norm": 1.8748693105705567, "learning_rate": 3.7742447861220027e-06, "loss": 0.1833, "step": 2315 }, { "epoch": 1.2446351931330473, "grad_norm": 1.9974397227990535, "learning_rate": 3.7515628742854006e-06, "loss": 0.1836, "step": 2320 }, { "epoch": 1.2473175965665235, "grad_norm": 1.9673663691547625, "learning_rate": 3.7289083290325668e-06, "loss": 0.1879, "step": 2325 }, { "epoch": 1.25, "grad_norm": 1.741852807857151, "learning_rate": 3.706281646966409e-06, "loss": 0.18, "step": 2330 }, { "epoch": 1.2526824034334765, "grad_norm": 1.845317314457415, "learning_rate": 3.6836833240790625e-06, "loss": 0.1844, "step": 2335 }, { "epoch": 1.2553648068669527, "grad_norm": 1.8928195985644114, "learning_rate": 3.6611138557410047e-06, "loss": 0.1861, "step": 2340 }, { "epoch": 1.2580472103004292, "grad_norm": 1.9135160408381562, "learning_rate": 3.638573736690202e-06, "loss": 0.1867, "step": 2345 }, { "epoch": 1.2607296137339055, "grad_norm": 1.8715854861816341, "learning_rate": 3.6160634610212642e-06, "loss": 0.1795, "step": 2350 }, { "epoch": 1.263412017167382, "grad_norm": 1.941735004373864, "learning_rate": 3.5935835221746183e-06, "loss": 0.1792, "step": 2355 }, { "epoch": 1.2660944206008584, "grad_norm": 1.8602874406371905, "learning_rate": 3.5711344129256832e-06, "loss": 0.1834, "step": 2360 }, { "epoch": 1.268776824034335, "grad_norm": 1.8934431804634135, "learning_rate": 3.548716625374074e-06, "loss": 0.1878, "step": 2365 }, { "epoch": 1.2714592274678111, "grad_norm": 1.8674302371875071, "learning_rate": 3.5263306509328103e-06, "loss": 0.1887, "step": 2370 }, { "epoch": 1.2741416309012876, "grad_norm": 1.830454228744595, "learning_rate": 3.5039769803175545e-06, "loss": 0.1832, "step": 2375 }, { "epoch": 1.2768240343347639, "grad_norm": 1.8983910015313787, "learning_rate": 3.481656103535839e-06, "loss": 0.1866, "step": 2380 }, { "epoch": 1.2795064377682404, "grad_norm": 1.8935396043689916, "learning_rate": 3.459368509876338e-06, "loss": 0.1894, "step": 2385 }, { "epoch": 1.2821888412017168, "grad_norm": 1.9182258648131894, "learning_rate": 3.437114687898132e-06, "loss": 0.1854, "step": 2390 }, { "epoch": 1.284871244635193, "grad_norm": 1.7928082461331274, "learning_rate": 3.414895125420013e-06, "loss": 0.1812, "step": 2395 }, { "epoch": 1.2875536480686696, "grad_norm": 1.935408410751667, "learning_rate": 3.3927103095097725e-06, "loss": 0.1814, "step": 2400 }, { "epoch": 1.2902360515021458, "grad_norm": 1.9003287879335953, "learning_rate": 3.370560726473537e-06, "loss": 0.1903, "step": 2405 }, { "epoch": 1.2929184549356223, "grad_norm": 2.0555183420575327, "learning_rate": 3.348446861845106e-06, "loss": 0.1828, "step": 2410 }, { "epoch": 1.2956008583690988, "grad_norm": 1.8617361794963694, "learning_rate": 3.3263692003753056e-06, "loss": 0.1808, "step": 2415 }, { "epoch": 1.298283261802575, "grad_norm": 1.9721539542652242, "learning_rate": 3.304328226021365e-06, "loss": 0.1919, "step": 2420 }, { "epoch": 1.3009656652360515, "grad_norm": 1.7235452784628431, "learning_rate": 3.282324421936307e-06, "loss": 0.183, "step": 2425 }, { "epoch": 1.3036480686695278, "grad_norm": 1.9224755804378313, "learning_rate": 3.2603582704583547e-06, "loss": 0.1794, "step": 2430 }, { "epoch": 1.3063304721030042, "grad_norm": 1.9296894237667428, "learning_rate": 3.2384302531003676e-06, "loss": 0.1819, "step": 2435 }, { "epoch": 1.3090128755364807, "grad_norm": 1.8488160096499893, "learning_rate": 3.216540850539272e-06, "loss": 0.184, "step": 2440 }, { "epoch": 1.3116952789699572, "grad_norm": 1.9650298752372577, "learning_rate": 3.1946905426055353e-06, "loss": 0.1865, "step": 2445 }, { "epoch": 1.3143776824034334, "grad_norm": 1.8416341894181925, "learning_rate": 3.172879808272642e-06, "loss": 0.1867, "step": 2450 }, { "epoch": 1.31706008583691, "grad_norm": 1.8305031917576464, "learning_rate": 3.151109125646601e-06, "loss": 0.1798, "step": 2455 }, { "epoch": 1.3197424892703862, "grad_norm": 1.7570726165570256, "learning_rate": 3.1293789719554562e-06, "loss": 0.1871, "step": 2460 }, { "epoch": 1.3224248927038627, "grad_norm": 1.9141085633525485, "learning_rate": 3.107689823538833e-06, "loss": 0.1843, "step": 2465 }, { "epoch": 1.3251072961373391, "grad_norm": 1.8197849342377277, "learning_rate": 3.086042155837491e-06, "loss": 0.1822, "step": 2470 }, { "epoch": 1.3277896995708154, "grad_norm": 1.7732550666037066, "learning_rate": 3.0644364433829076e-06, "loss": 0.1886, "step": 2475 }, { "epoch": 1.3304721030042919, "grad_norm": 1.736138201516796, "learning_rate": 3.0428731597868706e-06, "loss": 0.1778, "step": 2480 }, { "epoch": 1.3331545064377681, "grad_norm": 1.8890101293385348, "learning_rate": 3.021352777731096e-06, "loss": 0.1898, "step": 2485 }, { "epoch": 1.3358369098712446, "grad_norm": 1.9863756874762624, "learning_rate": 2.9998757689568775e-06, "loss": 0.1827, "step": 2490 }, { "epoch": 1.338519313304721, "grad_norm": 1.8003677020892024, "learning_rate": 2.978442604254729e-06, "loss": 0.1817, "step": 2495 }, { "epoch": 1.3412017167381975, "grad_norm": 1.8826968690903354, "learning_rate": 2.9570537534540765e-06, "loss": 0.1852, "step": 2500 }, { "epoch": 1.3438841201716738, "grad_norm": 1.7536228897017827, "learning_rate": 2.935709685412954e-06, "loss": 0.1841, "step": 2505 }, { "epoch": 1.3465665236051503, "grad_norm": 1.8929318222613714, "learning_rate": 2.9144108680077288e-06, "loss": 0.1822, "step": 2510 }, { "epoch": 1.3492489270386265, "grad_norm": 1.9442758921901246, "learning_rate": 2.8931577681228407e-06, "loss": 0.1804, "step": 2515 }, { "epoch": 1.351931330472103, "grad_norm": 2.0042642224278504, "learning_rate": 2.871950851640577e-06, "loss": 0.1889, "step": 2520 }, { "epoch": 1.3546137339055795, "grad_norm": 1.7418683543232312, "learning_rate": 2.8507905834308417e-06, "loss": 0.1842, "step": 2525 }, { "epoch": 1.3572961373390557, "grad_norm": 1.6799217908123405, "learning_rate": 2.8296774273409944e-06, "loss": 0.1792, "step": 2530 }, { "epoch": 1.3599785407725322, "grad_norm": 1.8835878261854089, "learning_rate": 2.8086118461856494e-06, "loss": 0.1783, "step": 2535 }, { "epoch": 1.3626609442060085, "grad_norm": 1.8803546675146086, "learning_rate": 2.787594301736556e-06, "loss": 0.1904, "step": 2540 }, { "epoch": 1.365343347639485, "grad_norm": 1.8532904145098434, "learning_rate": 2.7666252547124596e-06, "loss": 0.1814, "step": 2545 }, { "epoch": 1.3680257510729614, "grad_norm": 1.7786107649574128, "learning_rate": 2.745705164769015e-06, "loss": 0.1789, "step": 2550 }, { "epoch": 1.370708154506438, "grad_norm": 1.8374049198264844, "learning_rate": 2.724834490488705e-06, "loss": 0.184, "step": 2555 }, { "epoch": 1.3733905579399142, "grad_norm": 1.7865980582369325, "learning_rate": 2.7040136893707813e-06, "loss": 0.1788, "step": 2560 }, { "epoch": 1.3760729613733906, "grad_norm": 1.845948134552065, "learning_rate": 2.683243217821248e-06, "loss": 0.1761, "step": 2565 }, { "epoch": 1.378755364806867, "grad_norm": 1.7851346175445124, "learning_rate": 2.66252353114285e-06, "loss": 0.186, "step": 2570 }, { "epoch": 1.3814377682403434, "grad_norm": 1.8246015252435668, "learning_rate": 2.6418550835250946e-06, "loss": 0.1851, "step": 2575 }, { "epoch": 1.3841201716738198, "grad_norm": 1.8559371914450862, "learning_rate": 2.621238328034289e-06, "loss": 0.1802, "step": 2580 }, { "epoch": 1.386802575107296, "grad_norm": 1.6806542607012862, "learning_rate": 2.60067371660362e-06, "loss": 0.1736, "step": 2585 }, { "epoch": 1.3894849785407726, "grad_norm": 1.8686143641511146, "learning_rate": 2.5801617000232416e-06, "loss": 0.1877, "step": 2590 }, { "epoch": 1.3921673819742488, "grad_norm": 1.8128693858787677, "learning_rate": 2.559702727930386e-06, "loss": 0.1765, "step": 2595 }, { "epoch": 1.3948497854077253, "grad_norm": 1.7767602734245427, "learning_rate": 2.5392972487995247e-06, "loss": 0.1788, "step": 2600 }, { "epoch": 1.3975321888412018, "grad_norm": 1.801535269669841, "learning_rate": 2.5189457099325153e-06, "loss": 0.1758, "step": 2605 }, { "epoch": 1.400214592274678, "grad_norm": 1.8540923660851476, "learning_rate": 2.498648557448824e-06, "loss": 0.176, "step": 2610 }, { "epoch": 1.4028969957081545, "grad_norm": 1.782258369250198, "learning_rate": 2.4784062362757156e-06, "loss": 0.1753, "step": 2615 }, { "epoch": 1.4055793991416308, "grad_norm": 1.9756868732773234, "learning_rate": 2.458219190138526e-06, "loss": 0.1831, "step": 2620 }, { "epoch": 1.4082618025751072, "grad_norm": 1.8615216161202677, "learning_rate": 2.4380878615509156e-06, "loss": 0.1789, "step": 2625 }, { "epoch": 1.4109442060085837, "grad_norm": 1.7784799710308095, "learning_rate": 2.418012691805191e-06, "loss": 0.1798, "step": 2630 }, { "epoch": 1.4136266094420602, "grad_norm": 1.7728377054053102, "learning_rate": 2.3979941209626072e-06, "loss": 0.1771, "step": 2635 }, { "epoch": 1.4163090128755365, "grad_norm": 1.9983557573599398, "learning_rate": 2.3780325878437415e-06, "loss": 0.1765, "step": 2640 }, { "epoch": 1.418991416309013, "grad_norm": 1.8582307461530765, "learning_rate": 2.358128530018858e-06, "loss": 0.1853, "step": 2645 }, { "epoch": 1.4216738197424892, "grad_norm": 2.003406816815176, "learning_rate": 2.3382823837983314e-06, "loss": 0.1774, "step": 2650 }, { "epoch": 1.4243562231759657, "grad_norm": 1.869592497513551, "learning_rate": 2.318494584223072e-06, "loss": 0.1862, "step": 2655 }, { "epoch": 1.4270386266094421, "grad_norm": 1.8173356540308416, "learning_rate": 2.2987655650549862e-06, "loss": 0.1752, "step": 2660 }, { "epoch": 1.4297210300429184, "grad_norm": 1.801602630583928, "learning_rate": 2.2790957587674876e-06, "loss": 0.1776, "step": 2665 }, { "epoch": 1.4324034334763949, "grad_norm": 1.7795073214192665, "learning_rate": 2.2594855965359906e-06, "loss": 0.1797, "step": 2670 }, { "epoch": 1.4350858369098711, "grad_norm": 1.8235939944536936, "learning_rate": 2.2399355082284804e-06, "loss": 0.1828, "step": 2675 }, { "epoch": 1.4377682403433476, "grad_norm": 1.8577120027984566, "learning_rate": 2.2204459223960716e-06, "loss": 0.1755, "step": 2680 }, { "epoch": 1.440450643776824, "grad_norm": 1.8441638113430847, "learning_rate": 2.2010172662636377e-06, "loss": 0.1861, "step": 2685 }, { "epoch": 1.4431330472103006, "grad_norm": 1.8478925551724383, "learning_rate": 2.1816499657204183e-06, "loss": 0.1762, "step": 2690 }, { "epoch": 1.4458154506437768, "grad_norm": 1.7525203705768637, "learning_rate": 2.1623444453107067e-06, "loss": 0.1816, "step": 2695 }, { "epoch": 1.4484978540772533, "grad_norm": 1.8211338807635666, "learning_rate": 2.1431011282245274e-06, "loss": 0.1838, "step": 2700 }, { "epoch": 1.4511802575107295, "grad_norm": 1.7799090336948569, "learning_rate": 2.12392043628837e-06, "loss": 0.1771, "step": 2705 }, { "epoch": 1.453862660944206, "grad_norm": 1.7639414117794536, "learning_rate": 2.10480278995594e-06, "loss": 0.1831, "step": 2710 }, { "epoch": 1.4565450643776825, "grad_norm": 1.7531623587359995, "learning_rate": 2.0857486082989344e-06, "loss": 0.1843, "step": 2715 }, { "epoch": 1.4592274678111588, "grad_norm": 1.855907772291884, "learning_rate": 2.0667583089978673e-06, "loss": 0.1802, "step": 2720 }, { "epoch": 1.4619098712446352, "grad_norm": 1.8684669706778945, "learning_rate": 2.0478323083329072e-06, "loss": 0.1721, "step": 2725 }, { "epoch": 1.4645922746781115, "grad_norm": 1.8147519194803996, "learning_rate": 2.028971021174754e-06, "loss": 0.1853, "step": 2730 }, { "epoch": 1.467274678111588, "grad_norm": 1.8098133512911396, "learning_rate": 2.0101748609755407e-06, "loss": 0.1857, "step": 2735 }, { "epoch": 1.4699570815450644, "grad_norm": 1.820346979346647, "learning_rate": 1.9914442397597756e-06, "loss": 0.1781, "step": 2740 }, { "epoch": 1.4726394849785407, "grad_norm": 1.866870124366536, "learning_rate": 1.9727795681153083e-06, "loss": 0.1699, "step": 2745 }, { "epoch": 1.4753218884120172, "grad_norm": 1.7258506096579789, "learning_rate": 1.954181255184331e-06, "loss": 0.176, "step": 2750 }, { "epoch": 1.4780042918454936, "grad_norm": 1.711526424745769, "learning_rate": 1.935649708654403e-06, "loss": 0.1772, "step": 2755 }, { "epoch": 1.48068669527897, "grad_norm": 1.8609006211079224, "learning_rate": 1.9171853347495234e-06, "loss": 0.1777, "step": 2760 }, { "epoch": 1.4833690987124464, "grad_norm": 1.6755745022336013, "learning_rate": 1.8987885382212235e-06, "loss": 0.1798, "step": 2765 }, { "epoch": 1.4860515021459229, "grad_norm": 1.7601304020764486, "learning_rate": 1.8804597223396865e-06, "loss": 0.1815, "step": 2770 }, { "epoch": 1.488733905579399, "grad_norm": 1.811311124699501, "learning_rate": 1.8621992888849217e-06, "loss": 0.1831, "step": 2775 }, { "epoch": 1.4914163090128756, "grad_norm": 1.6708390106679016, "learning_rate": 1.8440076381379395e-06, "loss": 0.1762, "step": 2780 }, { "epoch": 1.4940987124463518, "grad_norm": 1.7531230138831804, "learning_rate": 1.8258851688720009e-06, "loss": 0.1713, "step": 2785 }, { "epoch": 1.4967811158798283, "grad_norm": 1.9394223869956364, "learning_rate": 1.807832278343849e-06, "loss": 0.183, "step": 2790 }, { "epoch": 1.4994635193133048, "grad_norm": 1.8048082568446313, "learning_rate": 1.7898493622850227e-06, "loss": 0.1768, "step": 2795 }, { "epoch": 1.5021459227467813, "grad_norm": 1.7551888740169181, "learning_rate": 1.771936814893167e-06, "loss": 0.1764, "step": 2800 }, { "epoch": 1.5048283261802575, "grad_norm": 1.8875754401735638, "learning_rate": 1.7540950288234033e-06, "loss": 0.1845, "step": 2805 }, { "epoch": 1.5075107296137338, "grad_norm": 1.8524080040006463, "learning_rate": 1.7363243951797155e-06, "loss": 0.1756, "step": 2810 }, { "epoch": 1.5101931330472103, "grad_norm": 1.7691884608701913, "learning_rate": 1.7186253035063738e-06, "loss": 0.1703, "step": 2815 }, { "epoch": 1.5128755364806867, "grad_norm": 1.709705848421099, "learning_rate": 1.7009981417794114e-06, "loss": 0.173, "step": 2820 }, { "epoch": 1.5155579399141632, "grad_norm": 1.7493695311267172, "learning_rate": 1.6834432963980957e-06, "loss": 0.1761, "step": 2825 }, { "epoch": 1.5182403433476395, "grad_norm": 1.817658314934476, "learning_rate": 1.6659611521764807e-06, "loss": 0.1766, "step": 2830 }, { "epoch": 1.5209227467811157, "grad_norm": 1.8124121386173149, "learning_rate": 1.6485520923349529e-06, "loss": 0.1711, "step": 2835 }, { "epoch": 1.5236051502145922, "grad_norm": 1.8036388934020569, "learning_rate": 1.6312164984918516e-06, "loss": 0.1632, "step": 2840 }, { "epoch": 1.5262875536480687, "grad_norm": 1.7721585365938846, "learning_rate": 1.6139547506550808e-06, "loss": 0.1827, "step": 2845 }, { "epoch": 1.5289699570815452, "grad_norm": 1.7879152977426878, "learning_rate": 1.5967672272137968e-06, "loss": 0.1831, "step": 2850 }, { "epoch": 1.5316523605150214, "grad_norm": 1.758941967973733, "learning_rate": 1.5796543049301033e-06, "loss": 0.173, "step": 2855 }, { "epoch": 1.5343347639484979, "grad_norm": 1.9267006298361062, "learning_rate": 1.5626163589307991e-06, "loss": 0.1814, "step": 2860 }, { "epoch": 1.5370171673819741, "grad_norm": 1.7464407607171644, "learning_rate": 1.5456537626991525e-06, "loss": 0.1704, "step": 2865 }, { "epoch": 1.5396995708154506, "grad_norm": 1.7464375420338283, "learning_rate": 1.5287668880667107e-06, "loss": 0.1708, "step": 2870 }, { "epoch": 1.542381974248927, "grad_norm": 1.8740461070429053, "learning_rate": 1.5119561052051546e-06, "loss": 0.1751, "step": 2875 }, { "epoch": 1.5450643776824036, "grad_norm": 1.8623897260848234, "learning_rate": 1.495221782618183e-06, "loss": 0.1802, "step": 2880 }, { "epoch": 1.5477467811158798, "grad_norm": 1.8806738023084306, "learning_rate": 1.4785642871334349e-06, "loss": 0.1701, "step": 2885 }, { "epoch": 1.550429184549356, "grad_norm": 1.7909625741938127, "learning_rate": 1.4619839838944416e-06, "loss": 0.1784, "step": 2890 }, { "epoch": 1.5531115879828326, "grad_norm": 1.7357651631494229, "learning_rate": 1.4454812363526339e-06, "loss": 0.1732, "step": 2895 }, { "epoch": 1.555793991416309, "grad_norm": 1.7581678834791308, "learning_rate": 1.429056406259368e-06, "loss": 0.1633, "step": 2900 }, { "epoch": 1.5584763948497855, "grad_norm": 1.836631568460111, "learning_rate": 1.4127098536579982e-06, "loss": 0.1815, "step": 2905 }, { "epoch": 1.5611587982832618, "grad_norm": 1.8009902649006466, "learning_rate": 1.3964419368759786e-06, "loss": 0.1727, "step": 2910 }, { "epoch": 1.5638412017167382, "grad_norm": 1.8219251551094937, "learning_rate": 1.380253012517019e-06, "loss": 0.1698, "step": 2915 }, { "epoch": 1.5665236051502145, "grad_norm": 1.7312674449206737, "learning_rate": 1.3641434354532595e-06, "loss": 0.1741, "step": 2920 }, { "epoch": 1.569206008583691, "grad_norm": 1.7950433845906226, "learning_rate": 1.3481135588174926e-06, "loss": 0.173, "step": 2925 }, { "epoch": 1.5718884120171674, "grad_norm": 1.7927705418884003, "learning_rate": 1.332163733995427e-06, "loss": 0.1783, "step": 2930 }, { "epoch": 1.574570815450644, "grad_norm": 1.8716599881468707, "learning_rate": 1.3162943106179748e-06, "loss": 0.1777, "step": 2935 }, { "epoch": 1.5772532188841202, "grad_norm": 1.8801243933764982, "learning_rate": 1.3005056365536067e-06, "loss": 0.1779, "step": 2940 }, { "epoch": 1.5799356223175964, "grad_norm": 1.8020425970562752, "learning_rate": 1.2847980579007003e-06, "loss": 0.1707, "step": 2945 }, { "epoch": 1.582618025751073, "grad_norm": 1.872972664773768, "learning_rate": 1.2691719189799774e-06, "loss": 0.1801, "step": 2950 }, { "epoch": 1.5853004291845494, "grad_norm": 1.6825595039109138, "learning_rate": 1.253627562326936e-06, "loss": 0.1678, "step": 2955 }, { "epoch": 1.5879828326180259, "grad_norm": 1.671241827748479, "learning_rate": 1.2381653286843648e-06, "loss": 0.1719, "step": 2960 }, { "epoch": 1.5906652360515021, "grad_norm": 1.793255581230876, "learning_rate": 1.2227855569948477e-06, "loss": 0.1774, "step": 2965 }, { "epoch": 1.5933476394849786, "grad_norm": 1.6638694140276173, "learning_rate": 1.2074885843933542e-06, "loss": 0.1649, "step": 2970 }, { "epoch": 1.5960300429184548, "grad_norm": 1.8655315847180594, "learning_rate": 1.1922747461998425e-06, "loss": 0.1756, "step": 2975 }, { "epoch": 1.5987124463519313, "grad_norm": 1.6935733709105691, "learning_rate": 1.1771443759119028e-06, "loss": 0.1753, "step": 2980 }, { "epoch": 1.6013948497854078, "grad_norm": 1.7020009025089284, "learning_rate": 1.162097805197459e-06, "loss": 0.17, "step": 2985 }, { "epoch": 1.6040772532188843, "grad_norm": 1.6885484043441028, "learning_rate": 1.147135363887485e-06, "loss": 0.1735, "step": 2990 }, { "epoch": 1.6067596566523605, "grad_norm": 1.7655269813249368, "learning_rate": 1.1322573799687904e-06, "loss": 0.1831, "step": 2995 }, { "epoch": 1.6094420600858368, "grad_norm": 1.7993834138767326, "learning_rate": 1.1174641795768132e-06, "loss": 0.1761, "step": 3000 }, { "epoch": 1.6121244635193133, "grad_norm": 1.715209068714748, "learning_rate": 1.1027560869884845e-06, "loss": 0.1792, "step": 3005 }, { "epoch": 1.6148068669527897, "grad_norm": 1.7858030345864855, "learning_rate": 1.0881334246151114e-06, "loss": 0.1797, "step": 3010 }, { "epoch": 1.6174892703862662, "grad_norm": 1.8696512838527688, "learning_rate": 1.073596512995317e-06, "loss": 0.1769, "step": 3015 }, { "epoch": 1.6201716738197425, "grad_norm": 1.6742157162384825, "learning_rate": 1.0591456707880077e-06, "loss": 0.1711, "step": 3020 }, { "epoch": 1.6228540772532187, "grad_norm": 1.8438739760376024, "learning_rate": 1.0447812147653885e-06, "loss": 0.1692, "step": 3025 }, { "epoch": 1.6255364806866952, "grad_norm": 1.866018619333295, "learning_rate": 1.0305034598060254e-06, "loss": 0.1835, "step": 3030 }, { "epoch": 1.6282188841201717, "grad_norm": 1.7015347298679881, "learning_rate": 1.0163127188879352e-06, "loss": 0.1723, "step": 3035 }, { "epoch": 1.6309012875536482, "grad_norm": 1.7680749094941124, "learning_rate": 1.0022093030817316e-06, "loss": 0.1737, "step": 3040 }, { "epoch": 1.6335836909871244, "grad_norm": 1.7566186598853142, "learning_rate": 9.88193521543797e-07, "loss": 0.1726, "step": 3045 }, { "epoch": 1.636266094420601, "grad_norm": 1.6378738587893038, "learning_rate": 9.742656815095175e-07, "loss": 0.1707, "step": 3050 }, { "epoch": 1.6389484978540771, "grad_norm": 1.7202264347511544, "learning_rate": 9.604260882865395e-07, "loss": 0.1722, "step": 3055 }, { "epoch": 1.6416309012875536, "grad_norm": 1.8200254670288298, "learning_rate": 9.466750452480816e-07, "loss": 0.1693, "step": 3060 }, { "epoch": 1.64431330472103, "grad_norm": 1.7312812411647176, "learning_rate": 9.330128538262784e-07, "loss": 0.1691, "step": 3065 }, { "epoch": 1.6469957081545066, "grad_norm": 1.7315554623411147, "learning_rate": 9.194398135055815e-07, "loss": 0.1751, "step": 3070 }, { "epoch": 1.6496781115879828, "grad_norm": 1.7543811126295066, "learning_rate": 9.059562218161894e-07, "loss": 0.172, "step": 3075 }, { "epoch": 1.652360515021459, "grad_norm": 1.6634656601819895, "learning_rate": 8.925623743275235e-07, "loss": 0.177, "step": 3080 }, { "epoch": 1.6550429184549356, "grad_norm": 1.7357349125240544, "learning_rate": 8.792585646417568e-07, "loss": 0.1774, "step": 3085 }, { "epoch": 1.657725321888412, "grad_norm": 1.769627938345817, "learning_rate": 8.660450843873647e-07, "loss": 0.1781, "step": 3090 }, { "epoch": 1.6604077253218885, "grad_norm": 1.7738444995348275, "learning_rate": 8.529222232127526e-07, "loss": 0.1741, "step": 3095 }, { "epoch": 1.6630901287553648, "grad_norm": 1.8078545330039417, "learning_rate": 8.398902687798832e-07, "loss": 0.172, "step": 3100 }, { "epoch": 1.6657725321888412, "grad_norm": 1.7954176425736126, "learning_rate": 8.269495067579891e-07, "loss": 0.1837, "step": 3105 }, { "epoch": 1.6684549356223175, "grad_norm": 1.8046939747053379, "learning_rate": 8.141002208172977e-07, "loss": 0.1756, "step": 3110 }, { "epoch": 1.671137339055794, "grad_norm": 1.7579566353635228, "learning_rate": 8.013426926228274e-07, "loss": 0.1733, "step": 3115 }, { "epoch": 1.6738197424892705, "grad_norm": 1.9891948383446052, "learning_rate": 7.886772018281969e-07, "loss": 0.1789, "step": 3120 }, { "epoch": 1.676502145922747, "grad_norm": 1.7720251235238305, "learning_rate": 7.761040260695074e-07, "loss": 0.1666, "step": 3125 }, { "epoch": 1.6791845493562232, "grad_norm": 1.804180840883795, "learning_rate": 7.636234409592524e-07, "loss": 0.1682, "step": 3130 }, { "epoch": 1.6818669527896994, "grad_norm": 1.7827982629041568, "learning_rate": 7.512357200802722e-07, "loss": 0.1734, "step": 3135 }, { "epoch": 1.684549356223176, "grad_norm": 1.8435636730729283, "learning_rate": 7.389411349797654e-07, "loss": 0.1729, "step": 3140 }, { "epoch": 1.6872317596566524, "grad_norm": 1.731693467400855, "learning_rate": 7.267399551633253e-07, "loss": 0.1715, "step": 3145 }, { "epoch": 1.6899141630901289, "grad_norm": 1.896921182532337, "learning_rate": 7.146324480890476e-07, "loss": 0.1777, "step": 3150 }, { "epoch": 1.6925965665236051, "grad_norm": 1.7406155992474135, "learning_rate": 7.026188791616484e-07, "loss": 0.1784, "step": 3155 }, { "epoch": 1.6952789699570814, "grad_norm": 1.695450435696037, "learning_rate": 6.906995117266641e-07, "loss": 0.1715, "step": 3160 }, { "epoch": 1.6979613733905579, "grad_norm": 1.8253483868244234, "learning_rate": 6.788746070646646e-07, "loss": 0.178, "step": 3165 }, { "epoch": 1.7006437768240343, "grad_norm": 1.7426331705172873, "learning_rate": 6.671444243855368e-07, "loss": 0.1729, "step": 3170 }, { "epoch": 1.7033261802575108, "grad_norm": 1.7830965054941021, "learning_rate": 6.555092208227953e-07, "loss": 0.1701, "step": 3175 }, { "epoch": 1.7060085836909873, "grad_norm": 1.7709532602427074, "learning_rate": 6.439692514279516e-07, "loss": 0.1655, "step": 3180 }, { "epoch": 1.7086909871244635, "grad_norm": 1.7457888434905895, "learning_rate": 6.325247691649139e-07, "loss": 0.169, "step": 3185 }, { "epoch": 1.7113733905579398, "grad_norm": 1.845129639957685, "learning_rate": 6.211760249044535e-07, "loss": 0.1683, "step": 3190 }, { "epoch": 1.7140557939914163, "grad_norm": 1.7406237571394934, "learning_rate": 6.099232674187e-07, "loss": 0.1693, "step": 3195 }, { "epoch": 1.7167381974248928, "grad_norm": 1.6558560175079584, "learning_rate": 5.987667433756844e-07, "loss": 0.171, "step": 3200 }, { "epoch": 1.7194206008583692, "grad_norm": 1.7619787504212168, "learning_rate": 5.877066973339379e-07, "loss": 0.1762, "step": 3205 }, { "epoch": 1.7221030042918455, "grad_norm": 1.712187106314034, "learning_rate": 5.767433717371301e-07, "loss": 0.1729, "step": 3210 }, { "epoch": 1.7247854077253217, "grad_norm": 1.744673710241909, "learning_rate": 5.658770069087521e-07, "loss": 0.1706, "step": 3215 }, { "epoch": 1.7274678111587982, "grad_norm": 1.6897781826250886, "learning_rate": 5.551078410468486e-07, "loss": 0.1674, "step": 3220 }, { "epoch": 1.7301502145922747, "grad_norm": 1.836757276470344, "learning_rate": 5.444361102187979e-07, "loss": 0.1727, "step": 3225 }, { "epoch": 1.7328326180257512, "grad_norm": 1.7933807550358414, "learning_rate": 5.338620483561386e-07, "loss": 0.1762, "step": 3230 }, { "epoch": 1.7355150214592274, "grad_norm": 1.7339487780920857, "learning_rate": 5.233858872494357e-07, "loss": 0.1746, "step": 3235 }, { "epoch": 1.738197424892704, "grad_norm": 1.824200302131874, "learning_rate": 5.130078565432089e-07, "loss": 0.1706, "step": 3240 }, { "epoch": 1.7408798283261802, "grad_norm": 1.8298886758608333, "learning_rate": 5.027281837308873e-07, "loss": 0.1645, "step": 3245 }, { "epoch": 1.7435622317596566, "grad_norm": 1.8098426926543671, "learning_rate": 4.925470941498345e-07, "loss": 0.1692, "step": 3250 }, { "epoch": 1.746244635193133, "grad_norm": 1.6085152250171215, "learning_rate": 4.824648109763991e-07, "loss": 0.1705, "step": 3255 }, { "epoch": 1.7489270386266096, "grad_norm": 1.7432884236898376, "learning_rate": 4.724815552210288e-07, "loss": 0.1686, "step": 3260 }, { "epoch": 1.7516094420600858, "grad_norm": 1.6699883001696092, "learning_rate": 4.6259754572342e-07, "loss": 0.1731, "step": 3265 }, { "epoch": 1.754291845493562, "grad_norm": 1.7187172541851534, "learning_rate": 4.5281299914773146e-07, "loss": 0.1725, "step": 3270 }, { "epoch": 1.7569742489270386, "grad_norm": 1.6982676525729474, "learning_rate": 4.43128129977819e-07, "loss": 0.1766, "step": 3275 }, { "epoch": 1.759656652360515, "grad_norm": 1.6727940157589543, "learning_rate": 4.3354315051254927e-07, "loss": 0.1784, "step": 3280 }, { "epoch": 1.7623390557939915, "grad_norm": 1.7282228982144165, "learning_rate": 4.2405827086113406e-07, "loss": 0.1741, "step": 3285 }, { "epoch": 1.7650214592274678, "grad_norm": 1.8182216293075124, "learning_rate": 4.146736989385336e-07, "loss": 0.163, "step": 3290 }, { "epoch": 1.7677038626609443, "grad_norm": 1.689273374533355, "learning_rate": 4.0538964046089426e-07, "loss": 0.178, "step": 3295 }, { "epoch": 1.7703862660944205, "grad_norm": 1.8090493850015277, "learning_rate": 3.962062989410359e-07, "loss": 0.1723, "step": 3300 }, { "epoch": 1.773068669527897, "grad_norm": 1.8486291817369827, "learning_rate": 3.871238756840029e-07, "loss": 0.1757, "step": 3305 }, { "epoch": 1.7757510729613735, "grad_norm": 1.7621710071809544, "learning_rate": 3.7814256978263465e-07, "loss": 0.1716, "step": 3310 }, { "epoch": 1.77843347639485, "grad_norm": 1.7003521211490562, "learning_rate": 3.6926257811321585e-07, "loss": 0.1656, "step": 3315 }, { "epoch": 1.7811158798283262, "grad_norm": 1.7719108557141303, "learning_rate": 3.604840953311506e-07, "loss": 0.1697, "step": 3320 }, { "epoch": 1.7837982832618025, "grad_norm": 1.7205613637549066, "learning_rate": 3.518073138667044e-07, "loss": 0.1724, "step": 3325 }, { "epoch": 1.786480686695279, "grad_norm": 1.8084109102411703, "learning_rate": 3.4323242392077737e-07, "loss": 0.1691, "step": 3330 }, { "epoch": 1.7891630901287554, "grad_norm": 1.6325936133590864, "learning_rate": 3.347596134607406e-07, "loss": 0.169, "step": 3335 }, { "epoch": 1.7918454935622319, "grad_norm": 1.7801884532973724, "learning_rate": 3.263890682163129e-07, "loss": 0.1738, "step": 3340 }, { "epoch": 1.7945278969957081, "grad_norm": 1.7508032875718211, "learning_rate": 3.1812097167549127e-07, "loss": 0.1725, "step": 3345 }, { "epoch": 1.7972103004291844, "grad_norm": 1.6635694650855373, "learning_rate": 3.0995550508052976e-07, "loss": 0.1729, "step": 3350 }, { "epoch": 1.7998927038626609, "grad_norm": 1.6797657597761915, "learning_rate": 3.018928474239613e-07, "loss": 0.1665, "step": 3355 }, { "epoch": 1.8025751072961373, "grad_norm": 1.709310741789826, "learning_rate": 2.9393317544468003e-07, "loss": 0.177, "step": 3360 }, { "epoch": 1.8052575107296138, "grad_norm": 1.7381407138927376, "learning_rate": 2.860766636240636e-07, "loss": 0.1746, "step": 3365 }, { "epoch": 1.80793991416309, "grad_norm": 1.6152398510351698, "learning_rate": 2.7832348418215084e-07, "loss": 0.1713, "step": 3370 }, { "epoch": 1.8106223175965666, "grad_norm": 1.7713457299557143, "learning_rate": 2.7067380707386235e-07, "loss": 0.1751, "step": 3375 }, { "epoch": 1.8133047210300428, "grad_norm": 1.6975116402326647, "learning_rate": 2.631277999852799e-07, "loss": 0.1718, "step": 3380 }, { "epoch": 1.8159871244635193, "grad_norm": 1.730923253560568, "learning_rate": 2.556856283299691e-07, "loss": 0.1696, "step": 3385 }, { "epoch": 1.8186695278969958, "grad_norm": 1.6311862215280322, "learning_rate": 2.483474552453513e-07, "loss": 0.1755, "step": 3390 }, { "epoch": 1.8213519313304722, "grad_norm": 1.8268074745707832, "learning_rate": 2.4111344158912863e-07, "loss": 0.174, "step": 3395 }, { "epoch": 1.8240343347639485, "grad_norm": 1.7223098307234075, "learning_rate": 2.3398374593576022e-07, "loss": 0.1636, "step": 3400 }, { "epoch": 1.8267167381974247, "grad_norm": 1.7099494978625052, "learning_rate": 2.2695852457298328e-07, "loss": 0.1725, "step": 3405 }, { "epoch": 1.8293991416309012, "grad_norm": 1.7831248984652448, "learning_rate": 2.2003793149838692e-07, "loss": 0.1731, "step": 3410 }, { "epoch": 1.8320815450643777, "grad_norm": 1.8017259843202504, "learning_rate": 2.1322211841604046e-07, "loss": 0.1699, "step": 3415 }, { "epoch": 1.8347639484978542, "grad_norm": 1.8570689059641385, "learning_rate": 2.0651123473316103e-07, "loss": 0.1779, "step": 3420 }, { "epoch": 1.8374463519313304, "grad_norm": 1.7150879901759284, "learning_rate": 1.9990542755684738e-07, "loss": 0.1642, "step": 3425 }, { "epoch": 1.840128755364807, "grad_norm": 1.7925956709417157, "learning_rate": 1.9340484169084627e-07, "loss": 0.1728, "step": 3430 }, { "epoch": 1.8428111587982832, "grad_norm": 1.7271961500114719, "learning_rate": 1.870096196323856e-07, "loss": 0.174, "step": 3435 }, { "epoch": 1.8454935622317596, "grad_norm": 1.6999393264082368, "learning_rate": 1.8071990156904362e-07, "loss": 0.1712, "step": 3440 }, { "epoch": 1.8481759656652361, "grad_norm": 1.711677924442901, "learning_rate": 1.7453582537568404e-07, "loss": 0.1683, "step": 3445 }, { "epoch": 1.8508583690987126, "grad_norm": 1.7373150398027093, "learning_rate": 1.6845752661142744e-07, "loss": 0.1746, "step": 3450 }, { "epoch": 1.8535407725321889, "grad_norm": 1.6825849720769612, "learning_rate": 1.624851385166809e-07, "loss": 0.1692, "step": 3455 }, { "epoch": 1.856223175965665, "grad_norm": 1.7405361349977913, "learning_rate": 1.5661879201022135e-07, "loss": 0.1635, "step": 3460 }, { "epoch": 1.8589055793991416, "grad_norm": 1.8597174774773906, "learning_rate": 1.5085861568631845e-07, "loss": 0.1749, "step": 3465 }, { "epoch": 1.861587982832618, "grad_norm": 1.691769036794149, "learning_rate": 1.4520473581192407e-07, "loss": 0.1646, "step": 3470 }, { "epoch": 1.8642703862660945, "grad_norm": 1.8330270391633918, "learning_rate": 1.396572763238957e-07, "loss": 0.1696, "step": 3475 }, { "epoch": 1.8669527896995708, "grad_norm": 1.6678130714491481, "learning_rate": 1.3421635882628958e-07, "loss": 0.1697, "step": 3480 }, { "epoch": 1.869635193133047, "grad_norm": 1.874926450346368, "learning_rate": 1.2888210258768464e-07, "loss": 0.1729, "step": 3485 }, { "epoch": 1.8723175965665235, "grad_norm": 1.7207040138451901, "learning_rate": 1.2365462453857612e-07, "loss": 0.1712, "step": 3490 }, { "epoch": 1.875, "grad_norm": 1.7895874415323925, "learning_rate": 1.1853403926880725e-07, "loss": 0.178, "step": 3495 }, { "epoch": 1.8776824034334765, "grad_norm": 1.8069520613275543, "learning_rate": 1.1352045902506158e-07, "loss": 0.1781, "step": 3500 }, { "epoch": 1.880364806866953, "grad_norm": 1.698996774909848, "learning_rate": 1.0861399370839964e-07, "loss": 0.1677, "step": 3505 }, { "epoch": 1.8830472103004292, "grad_norm": 1.7570852714691856, "learning_rate": 1.03814750871849e-07, "loss": 0.1729, "step": 3510 }, { "epoch": 1.8857296137339055, "grad_norm": 1.7836667455428648, "learning_rate": 9.912283571805015e-08, "loss": 0.1706, "step": 3515 }, { "epoch": 1.888412017167382, "grad_norm": 1.8336984549540354, "learning_rate": 9.45383510969472e-08, "loss": 0.1791, "step": 3520 }, { "epoch": 1.8910944206008584, "grad_norm": 1.759998721870761, "learning_rate": 9.006139750353526e-08, "loss": 0.1682, "step": 3525 }, { "epoch": 1.893776824034335, "grad_norm": 1.7345566492947497, "learning_rate": 8.569207307565664e-08, "loss": 0.1758, "step": 3530 }, { "epoch": 1.8964592274678111, "grad_norm": 1.7936369740428832, "learning_rate": 8.143047359184863e-08, "loss": 0.1764, "step": 3535 }, { "epoch": 1.8991416309012874, "grad_norm": 1.784540157506738, "learning_rate": 7.727669246924697e-08, "loss": 0.1731, "step": 3540 }, { "epoch": 1.9018240343347639, "grad_norm": 1.829063622283105, "learning_rate": 7.32308207615351e-08, "loss": 0.1752, "step": 3545 }, { "epoch": 1.9045064377682404, "grad_norm": 1.6871978641908676, "learning_rate": 6.929294715694923e-08, "loss": 0.1683, "step": 3550 }, { "epoch": 1.9071888412017168, "grad_norm": 1.6868622193197653, "learning_rate": 6.54631579763343e-08, "loss": 0.1691, "step": 3555 }, { "epoch": 1.909871244635193, "grad_norm": 1.680779201660084, "learning_rate": 6.174153717125264e-08, "loss": 0.1648, "step": 3560 }, { "epoch": 1.9125536480686696, "grad_norm": 1.6748335213189753, "learning_rate": 5.812816632214169e-08, "loss": 0.1738, "step": 3565 }, { "epoch": 1.9152360515021458, "grad_norm": 1.8292206738323757, "learning_rate": 5.4623124636528635e-08, "loss": 0.1745, "step": 3570 }, { "epoch": 1.9179184549356223, "grad_norm": 1.7189630297488938, "learning_rate": 5.122648894728854e-08, "loss": 0.1597, "step": 3575 }, { "epoch": 1.9206008583690988, "grad_norm": 1.8798559179522052, "learning_rate": 4.7938333710969564e-08, "loss": 0.1666, "step": 3580 }, { "epoch": 1.9232832618025753, "grad_norm": 1.613502521171826, "learning_rate": 4.4758731006149804e-08, "loss": 0.1665, "step": 3585 }, { "epoch": 1.9259656652360515, "grad_norm": 1.7022983189249141, "learning_rate": 4.16877505318658e-08, "loss": 0.1659, "step": 3590 }, { "epoch": 1.9286480686695278, "grad_norm": 1.711513268173503, "learning_rate": 3.872545960608099e-08, "loss": 0.1704, "step": 3595 }, { "epoch": 1.9313304721030042, "grad_norm": 1.8474264290514621, "learning_rate": 3.587192316420962e-08, "loss": 0.1837, "step": 3600 }, { "epoch": 1.9340128755364807, "grad_norm": 1.7781807411963892, "learning_rate": 3.312720375769518e-08, "loss": 0.1707, "step": 3605 }, { "epoch": 1.9366952789699572, "grad_norm": 1.7301580232656828, "learning_rate": 3.04913615526381e-08, "loss": 0.1733, "step": 3610 }, { "epoch": 1.9393776824034334, "grad_norm": 1.7247734229734726, "learning_rate": 2.796445432847794e-08, "loss": 0.1665, "step": 3615 }, { "epoch": 1.94206008583691, "grad_norm": 1.705280886912048, "learning_rate": 2.554653747672442e-08, "loss": 0.1723, "step": 3620 }, { "epoch": 1.9447424892703862, "grad_norm": 1.75822408454372, "learning_rate": 2.323766399974614e-08, "loss": 0.1695, "step": 3625 }, { "epoch": 1.9474248927038627, "grad_norm": 1.7328976784349883, "learning_rate": 2.1037884509605976e-08, "loss": 0.1732, "step": 3630 }, { "epoch": 1.9501072961373391, "grad_norm": 1.7693954674590804, "learning_rate": 1.8947247226954736e-08, "loss": 0.1699, "step": 3635 }, { "epoch": 1.9527896995708156, "grad_norm": 1.7777015681081694, "learning_rate": 1.6965797979971442e-08, "loss": 0.1739, "step": 3640 }, { "epoch": 1.9554721030042919, "grad_norm": 1.6605608018230513, "learning_rate": 1.509358020336027e-08, "loss": 0.1668, "step": 3645 }, { "epoch": 1.9581545064377681, "grad_norm": 1.79866532080048, "learning_rate": 1.3330634937396835e-08, "loss": 0.1634, "step": 3650 }, { "epoch": 1.9608369098712446, "grad_norm": 1.7313998733328226, "learning_rate": 1.1677000827030604e-08, "loss": 0.1713, "step": 3655 }, { "epoch": 1.963519313304721, "grad_norm": 1.7883976851314054, "learning_rate": 1.0132714121037223e-08, "loss": 0.1761, "step": 3660 }, { "epoch": 1.9662017167381975, "grad_norm": 1.611702372634109, "learning_rate": 8.697808671221385e-09, "loss": 0.1734, "step": 3665 }, { "epoch": 1.9688841201716738, "grad_norm": 1.615875588269734, "learning_rate": 7.3723159316796414e-09, "loss": 0.1683, "step": 3670 }, { "epoch": 1.97156652360515, "grad_norm": 1.6850939463410233, "learning_rate": 6.1562649581059505e-09, "loss": 0.1683, "step": 3675 }, { "epoch": 1.9742489270386265, "grad_norm": 1.7194604331458, "learning_rate": 5.049682407157752e-09, "loss": 0.1736, "step": 3680 }, { "epoch": 1.976931330472103, "grad_norm": 1.6946799124574234, "learning_rate": 4.052592535871425e-09, "loss": 0.1653, "step": 3685 }, { "epoch": 1.9796137339055795, "grad_norm": 1.867427898238158, "learning_rate": 3.1650172011293834e-09, "loss": 0.1795, "step": 3690 }, { "epoch": 1.9822961373390557, "grad_norm": 1.677977662561145, "learning_rate": 2.3869758591810177e-09, "loss": 0.1635, "step": 3695 }, { "epoch": 1.9849785407725322, "grad_norm": 1.690688570445271, "learning_rate": 1.718485565218031e-09, "loss": 0.1658, "step": 3700 }, { "epoch": 1.9876609442060085, "grad_norm": 1.7364686218822993, "learning_rate": 1.15956097299752e-09, "loss": 0.1642, "step": 3705 }, { "epoch": 1.990343347639485, "grad_norm": 1.771831402767419, "learning_rate": 7.102143345238955e-10, "loss": 0.1717, "step": 3710 }, { "epoch": 1.9930257510729614, "grad_norm": 1.7240723585657705, "learning_rate": 3.7045549977909877e-10, "loss": 0.1712, "step": 3715 }, { "epoch": 1.995708154506438, "grad_norm": 1.663368730452181, "learning_rate": 1.4029191650555274e-10, "loss": 0.1728, "step": 3720 }, { "epoch": 1.9983905579399142, "grad_norm": 1.8123761305092458, "learning_rate": 1.9728630044069107e-11, "loss": 0.1727, "step": 3725 }, { "epoch": 2.0, "eval_runtime": 264.5222, "eval_samples_per_second": 3.78, "eval_steps_per_second": 0.945, "step": 3728 }, { "epoch": 2.0, "step": 3728, "total_flos": 390283678187520.0, "train_loss": 0.24741445771166695, "train_runtime": 33475.4316, "train_samples_per_second": 1.781, "train_steps_per_second": 0.111 } ], "logging_steps": 5, "max_steps": 3728, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 390283678187520.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }