{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.24731352553220048, "eval_steps": 500, "global_step": 355600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00013909647105298114, "grad_norm": 3.490684747695923, "learning_rate": 1.9999999832914018e-05, "loss": 1.8403, "step": 200 }, { "epoch": 0.0002781929421059623, "grad_norm": 3.975701332092285, "learning_rate": 1.9999999331656075e-05, "loss": 1.7405, "step": 400 }, { "epoch": 0.00041728941315894347, "grad_norm": 1.8542802333831787, "learning_rate": 1.9999998496226195e-05, "loss": 1.7381, "step": 600 }, { "epoch": 0.0005563858842119246, "grad_norm": 2.3053829669952393, "learning_rate": 1.999999732662442e-05, "loss": 1.7488, "step": 800 }, { "epoch": 0.0006954823552649058, "grad_norm": 3.304736614227295, "learning_rate": 1.99999958228508e-05, "loss": 1.6913, "step": 1000 }, { "epoch": 0.0008345788263178869, "grad_norm": 2.547065019607544, "learning_rate": 1.999999398490542e-05, "loss": 1.7454, "step": 1200 }, { "epoch": 0.0009736752973708681, "grad_norm": 3.4149389266967773, "learning_rate": 1.999999181278835e-05, "loss": 1.7446, "step": 1400 }, { "epoch": 0.001112771768423849, "grad_norm": 7.536021709442139, "learning_rate": 1.999998930649971e-05, "loss": 1.7236, "step": 1600 }, { "epoch": 0.0012518682394768305, "grad_norm": 3.5668132305145264, "learning_rate": 1.9999986466039608e-05, "loss": 1.7183, "step": 1800 }, { "epoch": 0.0013909647105298116, "grad_norm": 2.099731683731079, "learning_rate": 1.999998329140819e-05, "loss": 1.6953, "step": 2000 }, { "epoch": 0.0015300611815827927, "grad_norm": 2.7119176387786865, "learning_rate": 1.99999797826056e-05, "loss": 1.7412, "step": 2200 }, { "epoch": 0.0016691576526357739, "grad_norm": 3.5231118202209473, "learning_rate": 1.9999975939632005e-05, "loss": 1.6678, "step": 2400 }, { "epoch": 0.001808254123688755, "grad_norm": 2.289564371109009, "learning_rate": 1.9999971762487593e-05, "loss": 1.7012, "step": 2600 }, { "epoch": 0.0019473505947417361, "grad_norm": 2.248680591583252, "learning_rate": 1.999996725117256e-05, "loss": 1.69, "step": 2800 }, { "epoch": 0.0020864470657947173, "grad_norm": 1.9502891302108765, "learning_rate": 1.9999962405687122e-05, "loss": 1.7891, "step": 3000 }, { "epoch": 0.002225543536847698, "grad_norm": 2.609624147415161, "learning_rate": 1.9999957226031515e-05, "loss": 1.6898, "step": 3200 }, { "epoch": 0.0023646400079006796, "grad_norm": 3.5334463119506836, "learning_rate": 1.9999951712205977e-05, "loss": 1.664, "step": 3400 }, { "epoch": 0.002503736478953661, "grad_norm": 4.190518856048584, "learning_rate": 1.9999945864210777e-05, "loss": 1.7189, "step": 3600 }, { "epoch": 0.002642832950006642, "grad_norm": 1.869044542312622, "learning_rate": 1.999993968204619e-05, "loss": 1.7454, "step": 3800 }, { "epoch": 0.002781929421059623, "grad_norm": 3.599501609802246, "learning_rate": 1.9999933165712525e-05, "loss": 1.7142, "step": 4000 }, { "epoch": 0.002921025892112604, "grad_norm": 4.190791130065918, "learning_rate": 1.999992631521008e-05, "loss": 1.6922, "step": 4200 }, { "epoch": 0.0030601223631655855, "grad_norm": 1.6306347846984863, "learning_rate": 1.999991913053918e-05, "loss": 1.7296, "step": 4400 }, { "epoch": 0.0031992188342185664, "grad_norm": 2.6920559406280518, "learning_rate": 1.9999911611700172e-05, "loss": 1.6584, "step": 4600 }, { "epoch": 0.0033383153052715477, "grad_norm": 2.2550413608551025, "learning_rate": 1.999990375869342e-05, "loss": 1.7011, "step": 4800 }, { "epoch": 0.0034774117763245287, "grad_norm": 2.0802359580993652, "learning_rate": 1.9999895571519294e-05, "loss": 1.7183, "step": 5000 }, { "epoch": 0.00361650824737751, "grad_norm": 1.8050875663757324, "learning_rate": 1.9999887050178187e-05, "loss": 1.6972, "step": 5200 }, { "epoch": 0.003755604718430491, "grad_norm": 1.8249893188476562, "learning_rate": 1.99998781946705e-05, "loss": 1.7097, "step": 5400 }, { "epoch": 0.0038947011894834723, "grad_norm": 2.246152639389038, "learning_rate": 1.9999869004996664e-05, "loss": 1.687, "step": 5600 }, { "epoch": 0.004033797660536453, "grad_norm": 2.287285089492798, "learning_rate": 1.9999859481157112e-05, "loss": 1.7333, "step": 5800 }, { "epoch": 0.004172894131589435, "grad_norm": 3.4363529682159424, "learning_rate": 1.99998496231523e-05, "loss": 1.8065, "step": 6000 }, { "epoch": 0.004311990602642416, "grad_norm": 1.944610834121704, "learning_rate": 1.9999839430982698e-05, "loss": 1.6932, "step": 6200 }, { "epoch": 0.004451087073695396, "grad_norm": 6.126698017120361, "learning_rate": 1.9999828904648794e-05, "loss": 1.7195, "step": 6400 }, { "epoch": 0.004590183544748378, "grad_norm": 5.03230619430542, "learning_rate": 1.9999818044151088e-05, "loss": 1.6625, "step": 6600 }, { "epoch": 0.004729280015801359, "grad_norm": 1.7065147161483765, "learning_rate": 1.9999806849490103e-05, "loss": 1.7005, "step": 6800 }, { "epoch": 0.0048683764868543405, "grad_norm": 2.811397075653076, "learning_rate": 1.9999795320666373e-05, "loss": 1.7432, "step": 7000 }, { "epoch": 0.005007472957907322, "grad_norm": 2.421945810317993, "learning_rate": 1.9999783457680448e-05, "loss": 1.6493, "step": 7200 }, { "epoch": 0.005146569428960302, "grad_norm": 3.027940273284912, "learning_rate": 1.9999771260532886e-05, "loss": 1.674, "step": 7400 }, { "epoch": 0.005285665900013284, "grad_norm": 2.6853113174438477, "learning_rate": 1.9999758729224277e-05, "loss": 1.7261, "step": 7600 }, { "epoch": 0.005424762371066265, "grad_norm": 1.7555756568908691, "learning_rate": 1.9999745863755225e-05, "loss": 1.6795, "step": 7800 }, { "epoch": 0.005563858842119246, "grad_norm": 2.3382883071899414, "learning_rate": 1.999973266412633e-05, "loss": 1.7256, "step": 8000 }, { "epoch": 0.005702955313172227, "grad_norm": 1.9959968328475952, "learning_rate": 1.9999719130338235e-05, "loss": 1.6529, "step": 8200 }, { "epoch": 0.005842051784225208, "grad_norm": 3.996197462081909, "learning_rate": 1.9999705262391574e-05, "loss": 1.689, "step": 8400 }, { "epoch": 0.00598114825527819, "grad_norm": 2.0444586277008057, "learning_rate": 1.999969106028702e-05, "loss": 1.6629, "step": 8600 }, { "epoch": 0.006120244726331171, "grad_norm": 3.6032233238220215, "learning_rate": 1.999967652402525e-05, "loss": 1.6322, "step": 8800 }, { "epoch": 0.006259341197384151, "grad_norm": 3.2948379516601562, "learning_rate": 1.9999661653606947e-05, "loss": 1.657, "step": 9000 }, { "epoch": 0.006398437668437133, "grad_norm": 2.703350067138672, "learning_rate": 1.9999646449032833e-05, "loss": 1.7157, "step": 9200 }, { "epoch": 0.006537534139490114, "grad_norm": 3.6379008293151855, "learning_rate": 1.9999630910303628e-05, "loss": 1.7283, "step": 9400 }, { "epoch": 0.0066766306105430955, "grad_norm": 3.558755397796631, "learning_rate": 1.9999615037420078e-05, "loss": 1.6915, "step": 9600 }, { "epoch": 0.006815727081596076, "grad_norm": 1.7843396663665771, "learning_rate": 1.9999598830382934e-05, "loss": 1.7236, "step": 9800 }, { "epoch": 0.006954823552649057, "grad_norm": 4.09860372543335, "learning_rate": 1.9999582289192974e-05, "loss": 1.704, "step": 10000 }, { "epoch": 0.007093920023702039, "grad_norm": 1.7402437925338745, "learning_rate": 1.9999565413850993e-05, "loss": 1.722, "step": 10200 }, { "epoch": 0.00723301649475502, "grad_norm": 2.323296546936035, "learning_rate": 1.9999548204357783e-05, "loss": 1.686, "step": 10400 }, { "epoch": 0.007372112965808001, "grad_norm": 2.715653419494629, "learning_rate": 1.9999530660714175e-05, "loss": 1.702, "step": 10600 }, { "epoch": 0.007511209436860982, "grad_norm": 1.7570403814315796, "learning_rate": 1.999951278292101e-05, "loss": 1.7388, "step": 10800 }, { "epoch": 0.007650305907913963, "grad_norm": 3.93326735496521, "learning_rate": 1.999949457097913e-05, "loss": 1.6729, "step": 11000 }, { "epoch": 0.007789402378966945, "grad_norm": 3.1733429431915283, "learning_rate": 1.9999476024889414e-05, "loss": 1.6785, "step": 11200 }, { "epoch": 0.007928498850019925, "grad_norm": 1.2749360799789429, "learning_rate": 1.9999457144652746e-05, "loss": 1.6974, "step": 11400 }, { "epoch": 0.008067595321072906, "grad_norm": 1.4741085767745972, "learning_rate": 1.999943793027002e-05, "loss": 1.7071, "step": 11600 }, { "epoch": 0.008206691792125888, "grad_norm": 1.8536938428878784, "learning_rate": 1.999941838174216e-05, "loss": 1.669, "step": 11800 }, { "epoch": 0.00834578826317887, "grad_norm": 4.066403388977051, "learning_rate": 1.9999398499070103e-05, "loss": 1.6679, "step": 12000 }, { "epoch": 0.00848488473423185, "grad_norm": 2.6502602100372314, "learning_rate": 1.9999378282254787e-05, "loss": 1.6914, "step": 12200 }, { "epoch": 0.008623981205284832, "grad_norm": 1.7778935432434082, "learning_rate": 1.999935773129719e-05, "loss": 1.7033, "step": 12400 }, { "epoch": 0.008763077676337813, "grad_norm": 5.382219314575195, "learning_rate": 1.999933684619828e-05, "loss": 1.6723, "step": 12600 }, { "epoch": 0.008902174147390793, "grad_norm": 2.359602928161621, "learning_rate": 1.9999315626959067e-05, "loss": 1.7246, "step": 12800 }, { "epoch": 0.009041270618443774, "grad_norm": 2.1279995441436768, "learning_rate": 1.999929407358055e-05, "loss": 1.7077, "step": 13000 }, { "epoch": 0.009180367089496756, "grad_norm": 2.6688473224639893, "learning_rate": 1.9999272186063767e-05, "loss": 1.6788, "step": 13200 }, { "epoch": 0.009319463560549737, "grad_norm": 1.7765421867370605, "learning_rate": 1.9999249964409763e-05, "loss": 1.6995, "step": 13400 }, { "epoch": 0.009458560031602718, "grad_norm": 1.976920485496521, "learning_rate": 1.9999227408619597e-05, "loss": 1.6848, "step": 13600 }, { "epoch": 0.0095976565026557, "grad_norm": 1.5781333446502686, "learning_rate": 1.9999204518694348e-05, "loss": 1.6838, "step": 13800 }, { "epoch": 0.009736752973708681, "grad_norm": 3.1562576293945312, "learning_rate": 1.9999181294635103e-05, "loss": 1.697, "step": 14000 }, { "epoch": 0.009875849444761662, "grad_norm": 3.2943713665008545, "learning_rate": 1.9999157736442973e-05, "loss": 1.6949, "step": 14200 }, { "epoch": 0.010014945915814644, "grad_norm": 2.8242108821868896, "learning_rate": 1.999913384411909e-05, "loss": 1.6555, "step": 14400 }, { "epoch": 0.010154042386867623, "grad_norm": 3.441899299621582, "learning_rate": 1.9999109617664585e-05, "loss": 1.717, "step": 14600 }, { "epoch": 0.010293138857920605, "grad_norm": 1.9739881753921509, "learning_rate": 1.9999085057080614e-05, "loss": 1.6838, "step": 14800 }, { "epoch": 0.010432235328973586, "grad_norm": 2.0629451274871826, "learning_rate": 1.999906016236836e-05, "loss": 1.6777, "step": 15000 }, { "epoch": 0.010571331800026567, "grad_norm": 2.920884847640991, "learning_rate": 1.9999034933529003e-05, "loss": 1.7185, "step": 15200 }, { "epoch": 0.010710428271079549, "grad_norm": 3.8650007247924805, "learning_rate": 1.9999009370563746e-05, "loss": 1.717, "step": 15400 }, { "epoch": 0.01084952474213253, "grad_norm": 2.3857533931732178, "learning_rate": 1.9998983473473816e-05, "loss": 1.7108, "step": 15600 }, { "epoch": 0.010988621213185511, "grad_norm": 1.6580150127410889, "learning_rate": 1.9998957242260447e-05, "loss": 1.7263, "step": 15800 }, { "epoch": 0.011127717684238493, "grad_norm": 2.27158784866333, "learning_rate": 1.9998930676924887e-05, "loss": 1.6676, "step": 16000 }, { "epoch": 0.011266814155291472, "grad_norm": 2.073591947555542, "learning_rate": 1.999890377746841e-05, "loss": 1.6653, "step": 16200 }, { "epoch": 0.011405910626344454, "grad_norm": 2.6413662433624268, "learning_rate": 1.9998876543892297e-05, "loss": 1.7, "step": 16400 }, { "epoch": 0.011545007097397435, "grad_norm": 1.7674572467803955, "learning_rate": 1.9998848976197845e-05, "loss": 1.655, "step": 16600 }, { "epoch": 0.011684103568450416, "grad_norm": 3.325094699859619, "learning_rate": 1.9998821074386376e-05, "loss": 1.6966, "step": 16800 }, { "epoch": 0.011823200039503398, "grad_norm": 4.600919246673584, "learning_rate": 1.9998792838459226e-05, "loss": 1.6826, "step": 17000 }, { "epoch": 0.01196229651055638, "grad_norm": 2.438291072845459, "learning_rate": 1.9998764268417728e-05, "loss": 1.6938, "step": 17200 }, { "epoch": 0.01210139298160936, "grad_norm": 3.072421073913574, "learning_rate": 1.9998735364263257e-05, "loss": 1.6398, "step": 17400 }, { "epoch": 0.012240489452662342, "grad_norm": 3.100858688354492, "learning_rate": 1.9998706125997193e-05, "loss": 1.7201, "step": 17600 }, { "epoch": 0.012379585923715323, "grad_norm": 1.6366045475006104, "learning_rate": 1.9998676553620927e-05, "loss": 1.7063, "step": 17800 }, { "epoch": 0.012518682394768303, "grad_norm": 3.026857852935791, "learning_rate": 1.9998646647135877e-05, "loss": 1.7723, "step": 18000 }, { "epoch": 0.012657778865821284, "grad_norm": 2.204956293106079, "learning_rate": 1.9998616406543457e-05, "loss": 1.7373, "step": 18200 }, { "epoch": 0.012796875336874266, "grad_norm": 3.4815049171447754, "learning_rate": 1.999858583184513e-05, "loss": 1.7032, "step": 18400 }, { "epoch": 0.012935971807927247, "grad_norm": 2.680816411972046, "learning_rate": 1.9998554923042343e-05, "loss": 1.7167, "step": 18600 }, { "epoch": 0.013075068278980228, "grad_norm": 2.0747711658477783, "learning_rate": 1.9998523680136575e-05, "loss": 1.7054, "step": 18800 }, { "epoch": 0.01321416475003321, "grad_norm": 2.943026065826416, "learning_rate": 1.9998492103129314e-05, "loss": 1.7296, "step": 19000 }, { "epoch": 0.013353261221086191, "grad_norm": 3.2096681594848633, "learning_rate": 1.9998460192022073e-05, "loss": 1.7288, "step": 19200 }, { "epoch": 0.013492357692139172, "grad_norm": 1.9929059743881226, "learning_rate": 1.999842794681637e-05, "loss": 1.6747, "step": 19400 }, { "epoch": 0.013631454163192152, "grad_norm": 2.7822391986846924, "learning_rate": 1.9998395367513753e-05, "loss": 1.6426, "step": 19600 }, { "epoch": 0.013770550634245133, "grad_norm": 2.433687686920166, "learning_rate": 1.9998362454115767e-05, "loss": 1.6781, "step": 19800 }, { "epoch": 0.013909647105298115, "grad_norm": 2.0973353385925293, "learning_rate": 1.999832920662399e-05, "loss": 1.7132, "step": 20000 }, { "epoch": 0.014048743576351096, "grad_norm": 1.9057769775390625, "learning_rate": 1.9998295625040006e-05, "loss": 1.7141, "step": 20200 }, { "epoch": 0.014187840047404077, "grad_norm": 3.1140308380126953, "learning_rate": 1.9998261709365422e-05, "loss": 1.6266, "step": 20400 }, { "epoch": 0.014326936518457059, "grad_norm": 5.554874420166016, "learning_rate": 1.9998227459601847e-05, "loss": 1.7396, "step": 20600 }, { "epoch": 0.01446603298951004, "grad_norm": 3.160214424133301, "learning_rate": 1.9998192875750928e-05, "loss": 1.6973, "step": 20800 }, { "epoch": 0.014605129460563021, "grad_norm": 5.784990310668945, "learning_rate": 1.999815795781431e-05, "loss": 1.7528, "step": 21000 }, { "epoch": 0.014744225931616003, "grad_norm": 2.038553237915039, "learning_rate": 1.9998122705793667e-05, "loss": 1.6435, "step": 21200 }, { "epoch": 0.014883322402668982, "grad_norm": 2.000013589859009, "learning_rate": 1.9998087119690667e-05, "loss": 1.746, "step": 21400 }, { "epoch": 0.015022418873721964, "grad_norm": 4.159511566162109, "learning_rate": 1.9998051199507023e-05, "loss": 1.6741, "step": 21600 }, { "epoch": 0.015161515344774945, "grad_norm": 4.24055814743042, "learning_rate": 1.9998014945244445e-05, "loss": 1.6825, "step": 21800 }, { "epoch": 0.015300611815827926, "grad_norm": 2.4057905673980713, "learning_rate": 1.9997978356904658e-05, "loss": 1.7085, "step": 22000 }, { "epoch": 0.015439708286880908, "grad_norm": 1.9155327081680298, "learning_rate": 1.999794143448942e-05, "loss": 1.7391, "step": 22200 }, { "epoch": 0.01557880475793389, "grad_norm": 1.7514386177062988, "learning_rate": 1.9997904178000485e-05, "loss": 1.7169, "step": 22400 }, { "epoch": 0.01571790122898687, "grad_norm": 2.172142744064331, "learning_rate": 1.9997866587439633e-05, "loss": 1.6361, "step": 22600 }, { "epoch": 0.01585699770003985, "grad_norm": 2.026682138442993, "learning_rate": 1.9997828662808665e-05, "loss": 1.7017, "step": 22800 }, { "epoch": 0.015996094171092833, "grad_norm": 3.7516732215881348, "learning_rate": 1.999779040410938e-05, "loss": 1.6982, "step": 23000 }, { "epoch": 0.016135190642145813, "grad_norm": 1.6760916709899902, "learning_rate": 1.9997751811343617e-05, "loss": 1.6747, "step": 23200 }, { "epoch": 0.016274287113198796, "grad_norm": 4.183455944061279, "learning_rate": 1.9997712884513206e-05, "loss": 1.6745, "step": 23400 }, { "epoch": 0.016413383584251776, "grad_norm": 4.69508695602417, "learning_rate": 1.9997673623620018e-05, "loss": 1.6605, "step": 23600 }, { "epoch": 0.016552480055304755, "grad_norm": 4.241724014282227, "learning_rate": 1.9997634028665915e-05, "loss": 1.6547, "step": 23800 }, { "epoch": 0.01669157652635774, "grad_norm": 2.174389362335205, "learning_rate": 1.9997594099652796e-05, "loss": 1.6315, "step": 24000 }, { "epoch": 0.016830672997410718, "grad_norm": 1.7669682502746582, "learning_rate": 1.9997553836582564e-05, "loss": 1.6784, "step": 24200 }, { "epoch": 0.0169697694684637, "grad_norm": 1.653138279914856, "learning_rate": 1.9997513239457138e-05, "loss": 1.6759, "step": 24400 }, { "epoch": 0.01710886593951668, "grad_norm": 3.8766541481018066, "learning_rate": 1.999747230827846e-05, "loss": 1.6885, "step": 24600 }, { "epoch": 0.017247962410569664, "grad_norm": 1.8397940397262573, "learning_rate": 1.999743104304849e-05, "loss": 1.6495, "step": 24800 }, { "epoch": 0.017387058881622643, "grad_norm": 2.1748106479644775, "learning_rate": 1.9997389443769183e-05, "loss": 1.7696, "step": 25000 }, { "epoch": 0.017526155352675626, "grad_norm": 2.0195181369781494, "learning_rate": 1.9997347510442536e-05, "loss": 1.681, "step": 25200 }, { "epoch": 0.017665251823728606, "grad_norm": 2.446232557296753, "learning_rate": 1.9997305243070547e-05, "loss": 1.6504, "step": 25400 }, { "epoch": 0.017804348294781586, "grad_norm": 1.763822317123413, "learning_rate": 1.999726264165524e-05, "loss": 1.7779, "step": 25600 }, { "epoch": 0.01794344476583457, "grad_norm": 4.14453125, "learning_rate": 1.9997219706198637e-05, "loss": 1.7237, "step": 25800 }, { "epoch": 0.01808254123688755, "grad_norm": 2.8632140159606934, "learning_rate": 1.9997176436702798e-05, "loss": 1.7078, "step": 26000 }, { "epoch": 0.01822163770794053, "grad_norm": 2.8407750129699707, "learning_rate": 1.9997132833169782e-05, "loss": 1.7034, "step": 26200 }, { "epoch": 0.01836073417899351, "grad_norm": 2.290100574493408, "learning_rate": 1.999708889560168e-05, "loss": 1.6465, "step": 26400 }, { "epoch": 0.018499830650046494, "grad_norm": 2.3949410915374756, "learning_rate": 1.999704462400058e-05, "loss": 1.6693, "step": 26600 }, { "epoch": 0.018638927121099474, "grad_norm": 1.8822963237762451, "learning_rate": 1.9997000018368592e-05, "loss": 1.6824, "step": 26800 }, { "epoch": 0.018778023592152457, "grad_norm": 2.2643373012542725, "learning_rate": 1.9996955078707855e-05, "loss": 1.6599, "step": 27000 }, { "epoch": 0.018917120063205436, "grad_norm": 2.3489792346954346, "learning_rate": 1.9996909805020516e-05, "loss": 1.7212, "step": 27200 }, { "epoch": 0.019056216534258416, "grad_norm": 2.2388651371002197, "learning_rate": 1.999686419730873e-05, "loss": 1.6677, "step": 27400 }, { "epoch": 0.0191953130053114, "grad_norm": 2.217224597930908, "learning_rate": 1.9996818255574672e-05, "loss": 1.6785, "step": 27600 }, { "epoch": 0.01933440947636438, "grad_norm": 4.4212212562561035, "learning_rate": 1.9996771979820543e-05, "loss": 1.6515, "step": 27800 }, { "epoch": 0.019473505947417362, "grad_norm": 2.4725615978240967, "learning_rate": 1.9996725370048545e-05, "loss": 1.66, "step": 28000 }, { "epoch": 0.01961260241847034, "grad_norm": 2.4805498123168945, "learning_rate": 1.9996678426260905e-05, "loss": 1.6954, "step": 28200 }, { "epoch": 0.019751698889523325, "grad_norm": 1.793663501739502, "learning_rate": 1.999663114845987e-05, "loss": 1.6956, "step": 28400 }, { "epoch": 0.019890795360576304, "grad_norm": 2.178234100341797, "learning_rate": 1.999658353664769e-05, "loss": 1.6676, "step": 28600 }, { "epoch": 0.020029891831629287, "grad_norm": 2.034973382949829, "learning_rate": 1.999653559082664e-05, "loss": 1.6977, "step": 28800 }, { "epoch": 0.020168988302682267, "grad_norm": 3.669325828552246, "learning_rate": 1.9996487310999007e-05, "loss": 1.685, "step": 29000 }, { "epoch": 0.020308084773735247, "grad_norm": 2.070687770843506, "learning_rate": 1.99964386971671e-05, "loss": 1.7383, "step": 29200 }, { "epoch": 0.02044718124478823, "grad_norm": 1.6653083562850952, "learning_rate": 1.999638974933324e-05, "loss": 1.7003, "step": 29400 }, { "epoch": 0.02058627771584121, "grad_norm": 2.3347225189208984, "learning_rate": 1.999634046749976e-05, "loss": 1.7457, "step": 29600 }, { "epoch": 0.020725374186894192, "grad_norm": 1.8311301469802856, "learning_rate": 1.9996290851669012e-05, "loss": 1.6243, "step": 29800 }, { "epoch": 0.020864470657947172, "grad_norm": 4.241110324859619, "learning_rate": 1.9996240901843368e-05, "loss": 1.7011, "step": 30000 }, { "epoch": 0.021003567129000155, "grad_norm": 3.895310878753662, "learning_rate": 1.9996190618025216e-05, "loss": 1.713, "step": 30200 }, { "epoch": 0.021142663600053135, "grad_norm": 2.71415638923645, "learning_rate": 1.9996140000216945e-05, "loss": 1.7247, "step": 30400 }, { "epoch": 0.021281760071106114, "grad_norm": 2.7343320846557617, "learning_rate": 1.9996089048420977e-05, "loss": 1.7241, "step": 30600 }, { "epoch": 0.021420856542159097, "grad_norm": 1.6846824884414673, "learning_rate": 1.9996037762639752e-05, "loss": 1.7244, "step": 30800 }, { "epoch": 0.021559953013212077, "grad_norm": 4.142380237579346, "learning_rate": 1.999598614287571e-05, "loss": 1.73, "step": 31000 }, { "epoch": 0.02169904948426506, "grad_norm": 2.5720510482788086, "learning_rate": 1.9995934189131313e-05, "loss": 1.6542, "step": 31200 }, { "epoch": 0.02183814595531804, "grad_norm": 4.803544044494629, "learning_rate": 1.999588190140905e-05, "loss": 1.6693, "step": 31400 }, { "epoch": 0.021977242426371023, "grad_norm": 1.6169124841690063, "learning_rate": 1.9995829279711406e-05, "loss": 1.7223, "step": 31600 }, { "epoch": 0.022116338897424002, "grad_norm": 2.43363618850708, "learning_rate": 1.999577632404091e-05, "loss": 1.7493, "step": 31800 }, { "epoch": 0.022255435368476986, "grad_norm": 1.6128485202789307, "learning_rate": 1.9995723034400073e-05, "loss": 1.6658, "step": 32000 }, { "epoch": 0.022394531839529965, "grad_norm": 2.0989410877227783, "learning_rate": 1.9995669410791448e-05, "loss": 1.6943, "step": 32200 }, { "epoch": 0.022533628310582945, "grad_norm": 1.8325483798980713, "learning_rate": 1.999561545321759e-05, "loss": 1.6779, "step": 32400 }, { "epoch": 0.022672724781635928, "grad_norm": 2.0416502952575684, "learning_rate": 1.999556116168108e-05, "loss": 1.6609, "step": 32600 }, { "epoch": 0.022811821252688907, "grad_norm": 1.8695884943008423, "learning_rate": 1.9995506536184505e-05, "loss": 1.7289, "step": 32800 }, { "epoch": 0.02295091772374189, "grad_norm": 1.682106614112854, "learning_rate": 1.999545157673047e-05, "loss": 1.6637, "step": 33000 }, { "epoch": 0.02309001419479487, "grad_norm": 2.3423938751220703, "learning_rate": 1.9995396283321612e-05, "loss": 1.7022, "step": 33200 }, { "epoch": 0.023229110665847853, "grad_norm": 2.1338844299316406, "learning_rate": 1.999534065596056e-05, "loss": 1.6286, "step": 33400 }, { "epoch": 0.023368207136900833, "grad_norm": 1.8180948495864868, "learning_rate": 1.999528469464997e-05, "loss": 1.6822, "step": 33600 }, { "epoch": 0.023507303607953816, "grad_norm": 4.408015727996826, "learning_rate": 1.999522839939252e-05, "loss": 1.6647, "step": 33800 }, { "epoch": 0.023646400079006796, "grad_norm": 1.913683533668518, "learning_rate": 1.999517177019089e-05, "loss": 1.662, "step": 34000 }, { "epoch": 0.023785496550059775, "grad_norm": 2.642828941345215, "learning_rate": 1.9995114807047784e-05, "loss": 1.6511, "step": 34200 }, { "epoch": 0.02392459302111276, "grad_norm": 2.2986598014831543, "learning_rate": 1.9995057509965927e-05, "loss": 1.627, "step": 34400 }, { "epoch": 0.024063689492165738, "grad_norm": 1.6629712581634521, "learning_rate": 1.999499987894805e-05, "loss": 1.6492, "step": 34600 }, { "epoch": 0.02420278596321872, "grad_norm": 3.302642583847046, "learning_rate": 1.9994941913996905e-05, "loss": 1.7311, "step": 34800 }, { "epoch": 0.0243418824342717, "grad_norm": 2.2457594871520996, "learning_rate": 1.9994883615115256e-05, "loss": 1.7053, "step": 35000 }, { "epoch": 0.024480978905324684, "grad_norm": 3.1490752696990967, "learning_rate": 1.9994824982305894e-05, "loss": 1.7145, "step": 35200 }, { "epoch": 0.024620075376377663, "grad_norm": 2.5022294521331787, "learning_rate": 1.9994766015571617e-05, "loss": 1.6747, "step": 35400 }, { "epoch": 0.024759171847430646, "grad_norm": 2.0420994758605957, "learning_rate": 1.999470671491523e-05, "loss": 1.6354, "step": 35600 }, { "epoch": 0.024898268318483626, "grad_norm": 2.904489278793335, "learning_rate": 1.9994647080339576e-05, "loss": 1.6739, "step": 35800 }, { "epoch": 0.025037364789536606, "grad_norm": 3.5906155109405518, "learning_rate": 1.9994587111847493e-05, "loss": 1.6882, "step": 36000 }, { "epoch": 0.02517646126058959, "grad_norm": 2.0279171466827393, "learning_rate": 1.9994526809441846e-05, "loss": 1.6274, "step": 36200 }, { "epoch": 0.02531555773164257, "grad_norm": 1.3273375034332275, "learning_rate": 1.999446617312551e-05, "loss": 1.6795, "step": 36400 }, { "epoch": 0.02545465420269555, "grad_norm": 3.7897446155548096, "learning_rate": 1.9994405202901396e-05, "loss": 1.6904, "step": 36600 }, { "epoch": 0.02559375067374853, "grad_norm": 2.1297788619995117, "learning_rate": 1.99943438987724e-05, "loss": 1.7057, "step": 36800 }, { "epoch": 0.025732847144801514, "grad_norm": 2.5796139240264893, "learning_rate": 1.9994282260741447e-05, "loss": 1.6695, "step": 37000 }, { "epoch": 0.025871943615854494, "grad_norm": 3.6038098335266113, "learning_rate": 1.9994220288811493e-05, "loss": 1.6608, "step": 37200 }, { "epoch": 0.026011040086907473, "grad_norm": 2.2663841247558594, "learning_rate": 1.9994157982985484e-05, "loss": 1.6834, "step": 37400 }, { "epoch": 0.026150136557960457, "grad_norm": 2.462709426879883, "learning_rate": 1.9994095343266396e-05, "loss": 1.6946, "step": 37600 }, { "epoch": 0.026289233029013436, "grad_norm": 2.082554817199707, "learning_rate": 1.9994032369657223e-05, "loss": 1.6931, "step": 37800 }, { "epoch": 0.02642832950006642, "grad_norm": 3.0539119243621826, "learning_rate": 1.999396906216097e-05, "loss": 1.6796, "step": 38000 }, { "epoch": 0.0265674259711194, "grad_norm": 2.478039026260376, "learning_rate": 1.9993905420780658e-05, "loss": 1.7081, "step": 38200 }, { "epoch": 0.026706522442172382, "grad_norm": 5.309814453125, "learning_rate": 1.9993841445519327e-05, "loss": 1.7076, "step": 38400 }, { "epoch": 0.02684561891322536, "grad_norm": 2.58223032951355, "learning_rate": 1.999377713638003e-05, "loss": 1.7193, "step": 38600 }, { "epoch": 0.026984715384278345, "grad_norm": 4.54454231262207, "learning_rate": 1.999371249336584e-05, "loss": 1.7087, "step": 38800 }, { "epoch": 0.027123811855331324, "grad_norm": 6.4016876220703125, "learning_rate": 1.999364751647984e-05, "loss": 1.6778, "step": 39000 }, { "epoch": 0.027262908326384304, "grad_norm": 2.3655877113342285, "learning_rate": 1.999358220572513e-05, "loss": 1.6927, "step": 39200 }, { "epoch": 0.027402004797437287, "grad_norm": 3.2086055278778076, "learning_rate": 1.9993516561104834e-05, "loss": 1.6915, "step": 39400 }, { "epoch": 0.027541101268490267, "grad_norm": 4.550060749053955, "learning_rate": 1.999345058262208e-05, "loss": 1.6706, "step": 39600 }, { "epoch": 0.02768019773954325, "grad_norm": 5.274898529052734, "learning_rate": 1.999338427028002e-05, "loss": 1.6722, "step": 39800 }, { "epoch": 0.02781929421059623, "grad_norm": 4.817627429962158, "learning_rate": 1.9993317624081818e-05, "loss": 1.6831, "step": 40000 }, { "epoch": 0.027958390681649212, "grad_norm": 3.387507200241089, "learning_rate": 1.999325064403066e-05, "loss": 1.6987, "step": 40200 }, { "epoch": 0.028097487152702192, "grad_norm": 5.216610431671143, "learning_rate": 1.999318333012974e-05, "loss": 1.732, "step": 40400 }, { "epoch": 0.028236583623755175, "grad_norm": 3.5860650539398193, "learning_rate": 1.9993115682382273e-05, "loss": 1.626, "step": 40600 }, { "epoch": 0.028375680094808155, "grad_norm": 3.4120900630950928, "learning_rate": 1.9993047700791484e-05, "loss": 1.6541, "step": 40800 }, { "epoch": 0.028514776565861134, "grad_norm": 2.7464821338653564, "learning_rate": 1.9992979385360627e-05, "loss": 1.6558, "step": 41000 }, { "epoch": 0.028653873036914117, "grad_norm": 2.974565267562866, "learning_rate": 1.9992910736092956e-05, "loss": 1.7827, "step": 41200 }, { "epoch": 0.028792969507967097, "grad_norm": 1.62872314453125, "learning_rate": 1.9992841752991746e-05, "loss": 1.6925, "step": 41400 }, { "epoch": 0.02893206597902008, "grad_norm": 2.0491859912872314, "learning_rate": 1.9992772436060303e-05, "loss": 1.6699, "step": 41600 }, { "epoch": 0.02907116245007306, "grad_norm": 4.282040596008301, "learning_rate": 1.999270278530192e-05, "loss": 1.6866, "step": 41800 }, { "epoch": 0.029210258921126043, "grad_norm": 1.5519753694534302, "learning_rate": 1.9992632800719933e-05, "loss": 1.734, "step": 42000 }, { "epoch": 0.029349355392179022, "grad_norm": 2.028379201889038, "learning_rate": 1.9992562482317682e-05, "loss": 1.6825, "step": 42200 }, { "epoch": 0.029488451863232006, "grad_norm": 2.8444042205810547, "learning_rate": 1.999249183009852e-05, "loss": 1.671, "step": 42400 }, { "epoch": 0.029627548334284985, "grad_norm": 2.0103445053100586, "learning_rate": 1.999242084406582e-05, "loss": 1.707, "step": 42600 }, { "epoch": 0.029766644805337965, "grad_norm": 1.7135134935379028, "learning_rate": 1.9992349524222974e-05, "loss": 1.6703, "step": 42800 }, { "epoch": 0.029905741276390948, "grad_norm": 6.601833343505859, "learning_rate": 1.9992277870573384e-05, "loss": 1.7094, "step": 43000 }, { "epoch": 0.030044837747443928, "grad_norm": 3.613133430480957, "learning_rate": 1.9992205883120476e-05, "loss": 1.66, "step": 43200 }, { "epoch": 0.03018393421849691, "grad_norm": 3.0776429176330566, "learning_rate": 1.999213356186768e-05, "loss": 1.7313, "step": 43400 }, { "epoch": 0.03032303068954989, "grad_norm": 4.370647430419922, "learning_rate": 1.999206090681845e-05, "loss": 1.6906, "step": 43600 }, { "epoch": 0.030462127160602873, "grad_norm": 3.666454315185547, "learning_rate": 1.999198791797625e-05, "loss": 1.6623, "step": 43800 }, { "epoch": 0.030601223631655853, "grad_norm": 5.570009231567383, "learning_rate": 1.999191459534458e-05, "loss": 1.6862, "step": 44000 }, { "epoch": 0.030740320102708833, "grad_norm": 8.811431884765625, "learning_rate": 1.9991840938926925e-05, "loss": 1.6893, "step": 44200 }, { "epoch": 0.030879416573761816, "grad_norm": 4.8162617683410645, "learning_rate": 1.999176694872681e-05, "loss": 1.687, "step": 44400 }, { "epoch": 0.031018513044814795, "grad_norm": 2.9438042640686035, "learning_rate": 1.9991692624747756e-05, "loss": 1.714, "step": 44600 }, { "epoch": 0.03115760951586778, "grad_norm": 5.454644680023193, "learning_rate": 1.9991617966993324e-05, "loss": 1.6707, "step": 44800 }, { "epoch": 0.03129670598692076, "grad_norm": 5.090953826904297, "learning_rate": 1.9991542975467074e-05, "loss": 1.6852, "step": 45000 }, { "epoch": 0.03143580245797374, "grad_norm": 4.892083168029785, "learning_rate": 1.9991467650172576e-05, "loss": 1.636, "step": 45200 }, { "epoch": 0.03157489892902672, "grad_norm": 5.967046737670898, "learning_rate": 1.9991391991113443e-05, "loss": 1.7314, "step": 45400 }, { "epoch": 0.0317139954000797, "grad_norm": 4.124074935913086, "learning_rate": 1.999131599829328e-05, "loss": 1.7064, "step": 45600 }, { "epoch": 0.03185309187113269, "grad_norm": 2.1032886505126953, "learning_rate": 1.999123967171571e-05, "loss": 1.6755, "step": 45800 }, { "epoch": 0.031992188342185666, "grad_norm": 2.694011688232422, "learning_rate": 1.9991163011384382e-05, "loss": 1.6889, "step": 46000 }, { "epoch": 0.032131284813238646, "grad_norm": 3.628530263900757, "learning_rate": 1.9991086017302953e-05, "loss": 1.6233, "step": 46200 }, { "epoch": 0.032270381284291626, "grad_norm": 3.5236215591430664, "learning_rate": 1.99910086894751e-05, "loss": 1.7024, "step": 46400 }, { "epoch": 0.032409477755344605, "grad_norm": 3.225529432296753, "learning_rate": 1.999093102790451e-05, "loss": 1.6857, "step": 46600 }, { "epoch": 0.03254857422639759, "grad_norm": 3.014922618865967, "learning_rate": 1.9990853032594902e-05, "loss": 1.6712, "step": 46800 }, { "epoch": 0.03268767069745057, "grad_norm": 3.654637336730957, "learning_rate": 1.9990774703549988e-05, "loss": 1.7464, "step": 47000 }, { "epoch": 0.03282676716850355, "grad_norm": 3.2240869998931885, "learning_rate": 1.999069604077351e-05, "loss": 1.6179, "step": 47200 }, { "epoch": 0.03296586363955653, "grad_norm": 2.993143081665039, "learning_rate": 1.9990617044269226e-05, "loss": 1.6679, "step": 47400 }, { "epoch": 0.03310496011060951, "grad_norm": 5.778799057006836, "learning_rate": 1.9990537714040906e-05, "loss": 1.6547, "step": 47600 }, { "epoch": 0.0332440565816625, "grad_norm": 2.6785480976104736, "learning_rate": 1.9990458050092335e-05, "loss": 1.6496, "step": 47800 }, { "epoch": 0.03338315305271548, "grad_norm": 6.086597442626953, "learning_rate": 1.999037805242732e-05, "loss": 1.6859, "step": 48000 }, { "epoch": 0.033522249523768456, "grad_norm": 2.1356918811798096, "learning_rate": 1.9990297721049674e-05, "loss": 1.6864, "step": 48200 }, { "epoch": 0.033661345994821436, "grad_norm": 5.611029624938965, "learning_rate": 1.999021705596324e-05, "loss": 1.6508, "step": 48400 }, { "epoch": 0.03380044246587442, "grad_norm": 2.953913927078247, "learning_rate": 1.9990136057171866e-05, "loss": 1.6805, "step": 48600 }, { "epoch": 0.0339395389369274, "grad_norm": 4.642039775848389, "learning_rate": 1.9990054724679412e-05, "loss": 1.6394, "step": 48800 }, { "epoch": 0.03407863540798038, "grad_norm": 4.601811408996582, "learning_rate": 1.9989973058489766e-05, "loss": 1.6854, "step": 49000 }, { "epoch": 0.03421773187903336, "grad_norm": 6.185051441192627, "learning_rate": 1.9989891058606835e-05, "loss": 1.7445, "step": 49200 }, { "epoch": 0.03435682835008634, "grad_norm": 6.0281877517700195, "learning_rate": 1.998980872503452e-05, "loss": 1.6916, "step": 49400 }, { "epoch": 0.03449592482113933, "grad_norm": 2.0409023761749268, "learning_rate": 1.9989726057776755e-05, "loss": 1.6718, "step": 49600 }, { "epoch": 0.03463502129219231, "grad_norm": 6.715494155883789, "learning_rate": 1.998964305683749e-05, "loss": 1.6913, "step": 49800 }, { "epoch": 0.03477411776324529, "grad_norm": 5.204497337341309, "learning_rate": 1.9989559722220688e-05, "loss": 1.7206, "step": 50000 }, { "epoch": 0.034913214234298266, "grad_norm": 4.236083984375, "learning_rate": 1.998947605393032e-05, "loss": 1.6304, "step": 50200 }, { "epoch": 0.03505231070535125, "grad_norm": 4.496490955352783, "learning_rate": 1.998939205197039e-05, "loss": 1.6468, "step": 50400 }, { "epoch": 0.03519140717640423, "grad_norm": 4.949439525604248, "learning_rate": 1.9989307716344902e-05, "loss": 1.6992, "step": 50600 }, { "epoch": 0.03533050364745721, "grad_norm": 4.516083240509033, "learning_rate": 1.9989223047057886e-05, "loss": 1.6655, "step": 50800 }, { "epoch": 0.03546960011851019, "grad_norm": 3.174182176589966, "learning_rate": 1.998913804411338e-05, "loss": 1.7408, "step": 51000 }, { "epoch": 0.03560869658956317, "grad_norm": 3.5125887393951416, "learning_rate": 1.998905270751544e-05, "loss": 1.6859, "step": 51200 }, { "epoch": 0.03574779306061616, "grad_norm": 3.4447836875915527, "learning_rate": 1.998896703726815e-05, "loss": 1.7248, "step": 51400 }, { "epoch": 0.03588688953166914, "grad_norm": 6.76048469543457, "learning_rate": 1.998888103337559e-05, "loss": 1.7037, "step": 51600 }, { "epoch": 0.03602598600272212, "grad_norm": 5.462390422821045, "learning_rate": 1.998879469584187e-05, "loss": 1.6771, "step": 51800 }, { "epoch": 0.0361650824737751, "grad_norm": 4.145532131195068, "learning_rate": 1.9988708024671108e-05, "loss": 1.6732, "step": 52000 }, { "epoch": 0.03630417894482808, "grad_norm": 2.3257813453674316, "learning_rate": 1.9988621019867444e-05, "loss": 1.68, "step": 52200 }, { "epoch": 0.03644327541588106, "grad_norm": 2.8310391902923584, "learning_rate": 1.9988533681435035e-05, "loss": 1.7028, "step": 52400 }, { "epoch": 0.03658237188693404, "grad_norm": 3.2486252784729004, "learning_rate": 1.9988446009378043e-05, "loss": 1.7134, "step": 52600 }, { "epoch": 0.03672146835798702, "grad_norm": 4.35382604598999, "learning_rate": 1.998835800370066e-05, "loss": 1.6279, "step": 52800 }, { "epoch": 0.03686056482904, "grad_norm": 4.6429243087768555, "learning_rate": 1.9988269664407082e-05, "loss": 1.7032, "step": 53000 }, { "epoch": 0.03699966130009299, "grad_norm": 7.826554775238037, "learning_rate": 1.998818099150153e-05, "loss": 1.7485, "step": 53200 }, { "epoch": 0.03713875777114597, "grad_norm": 3.7032411098480225, "learning_rate": 1.998809198498824e-05, "loss": 1.6475, "step": 53400 }, { "epoch": 0.03727785424219895, "grad_norm": 4.902835369110107, "learning_rate": 1.9988002644871453e-05, "loss": 1.6881, "step": 53600 }, { "epoch": 0.03741695071325193, "grad_norm": 4.872868061065674, "learning_rate": 1.9987912971155436e-05, "loss": 1.6455, "step": 53800 }, { "epoch": 0.037556047184304914, "grad_norm": 4.0793962478637695, "learning_rate": 1.998782296384448e-05, "loss": 1.7448, "step": 54000 }, { "epoch": 0.03769514365535789, "grad_norm": 3.5870656967163086, "learning_rate": 1.998773262294287e-05, "loss": 1.6967, "step": 54200 }, { "epoch": 0.03783424012641087, "grad_norm": 3.6290981769561768, "learning_rate": 1.9987641948454918e-05, "loss": 1.6429, "step": 54400 }, { "epoch": 0.03797333659746385, "grad_norm": 2.646336555480957, "learning_rate": 1.9987550940384965e-05, "loss": 1.6551, "step": 54600 }, { "epoch": 0.03811243306851683, "grad_norm": 3.4167566299438477, "learning_rate": 1.9987459598737344e-05, "loss": 1.6922, "step": 54800 }, { "epoch": 0.03825152953956982, "grad_norm": 3.9670467376708984, "learning_rate": 1.9987367923516418e-05, "loss": 1.6568, "step": 55000 }, { "epoch": 0.0383906260106228, "grad_norm": 4.334254741668701, "learning_rate": 1.998727591472657e-05, "loss": 1.7244, "step": 55200 }, { "epoch": 0.03852972248167578, "grad_norm": 4.030015468597412, "learning_rate": 1.998718357237218e-05, "loss": 1.6908, "step": 55400 }, { "epoch": 0.03866881895272876, "grad_norm": 2.921394109725952, "learning_rate": 1.998709089645767e-05, "loss": 1.6995, "step": 55600 }, { "epoch": 0.038807915423781744, "grad_norm": 4.17643928527832, "learning_rate": 1.9986997886987453e-05, "loss": 1.7459, "step": 55800 }, { "epoch": 0.038947011894834724, "grad_norm": 5.148770809173584, "learning_rate": 1.9986904543965983e-05, "loss": 1.6433, "step": 56000 }, { "epoch": 0.0390861083658877, "grad_norm": 3.664881467819214, "learning_rate": 1.9986810867397697e-05, "loss": 1.6391, "step": 56200 }, { "epoch": 0.03922520483694068, "grad_norm": 4.360840320587158, "learning_rate": 1.9986716857287084e-05, "loss": 1.7352, "step": 56400 }, { "epoch": 0.03936430130799366, "grad_norm": 4.004901885986328, "learning_rate": 1.9986622513638624e-05, "loss": 1.6717, "step": 56600 }, { "epoch": 0.03950339777904665, "grad_norm": 2.7370033264160156, "learning_rate": 1.9986527836456816e-05, "loss": 1.613, "step": 56800 }, { "epoch": 0.03964249425009963, "grad_norm": 4.080074787139893, "learning_rate": 1.9986432825746193e-05, "loss": 1.6511, "step": 57000 }, { "epoch": 0.03978159072115261, "grad_norm": 3.5836145877838135, "learning_rate": 1.998633748151128e-05, "loss": 1.6844, "step": 57200 }, { "epoch": 0.03992068719220559, "grad_norm": 8.140799522399902, "learning_rate": 1.9986241803756635e-05, "loss": 1.6506, "step": 57400 }, { "epoch": 0.040059783663258575, "grad_norm": 5.401699542999268, "learning_rate": 1.998614579248682e-05, "loss": 1.6886, "step": 57600 }, { "epoch": 0.040198880134311554, "grad_norm": 3.747342109680176, "learning_rate": 1.9986049447706426e-05, "loss": 1.7271, "step": 57800 }, { "epoch": 0.040337976605364534, "grad_norm": 3.5322844982147217, "learning_rate": 1.998595276942004e-05, "loss": 1.6014, "step": 58000 }, { "epoch": 0.040477073076417514, "grad_norm": 3.0417582988739014, "learning_rate": 1.9985855757632287e-05, "loss": 1.6338, "step": 58200 }, { "epoch": 0.04061616954747049, "grad_norm": 3.487562417984009, "learning_rate": 1.99857584123478e-05, "loss": 1.7305, "step": 58400 }, { "epoch": 0.04075526601852348, "grad_norm": 1.9068052768707275, "learning_rate": 1.9985660733571222e-05, "loss": 1.7155, "step": 58600 }, { "epoch": 0.04089436248957646, "grad_norm": 8.823533058166504, "learning_rate": 1.9985562721307213e-05, "loss": 1.6578, "step": 58800 }, { "epoch": 0.04103345896062944, "grad_norm": 5.038608551025391, "learning_rate": 1.9985464375560455e-05, "loss": 1.6595, "step": 59000 }, { "epoch": 0.04117255543168242, "grad_norm": 4.362532138824463, "learning_rate": 1.9985365696335646e-05, "loss": 1.7077, "step": 59200 }, { "epoch": 0.041311651902735405, "grad_norm": 4.135485649108887, "learning_rate": 1.9985266683637493e-05, "loss": 1.6595, "step": 59400 }, { "epoch": 0.041450748373788385, "grad_norm": 3.2375669479370117, "learning_rate": 1.9985167337470723e-05, "loss": 1.6731, "step": 59600 }, { "epoch": 0.041589844844841364, "grad_norm": 7.285531044006348, "learning_rate": 1.9985067657840077e-05, "loss": 1.6671, "step": 59800 }, { "epoch": 0.041728941315894344, "grad_norm": 5.653309345245361, "learning_rate": 1.998496764475032e-05, "loss": 1.6902, "step": 60000 }, { "epoch": 0.041868037786947324, "grad_norm": 6.675251007080078, "learning_rate": 1.9984867298206225e-05, "loss": 1.6838, "step": 60200 }, { "epoch": 0.04200713425800031, "grad_norm": 4.708273887634277, "learning_rate": 1.9984766618212576e-05, "loss": 1.615, "step": 60400 }, { "epoch": 0.04214623072905329, "grad_norm": 2.906949281692505, "learning_rate": 1.9984665604774185e-05, "loss": 1.6764, "step": 60600 }, { "epoch": 0.04228532720010627, "grad_norm": 3.628300666809082, "learning_rate": 1.9984564257895867e-05, "loss": 1.689, "step": 60800 }, { "epoch": 0.04242442367115925, "grad_norm": 3.8123619556427, "learning_rate": 1.9984462577582474e-05, "loss": 1.6421, "step": 61000 }, { "epoch": 0.04256352014221223, "grad_norm": 3.2274296283721924, "learning_rate": 1.9984360563838847e-05, "loss": 1.649, "step": 61200 }, { "epoch": 0.042702616613265215, "grad_norm": 1.9272345304489136, "learning_rate": 1.9984258216669862e-05, "loss": 1.7022, "step": 61400 }, { "epoch": 0.042841713084318195, "grad_norm": 3.6718461513519287, "learning_rate": 1.9984155536080404e-05, "loss": 1.6884, "step": 61600 }, { "epoch": 0.042980809555371174, "grad_norm": 5.313728332519531, "learning_rate": 1.9984052522075377e-05, "loss": 1.6437, "step": 61800 }, { "epoch": 0.043119906026424154, "grad_norm": 4.368004322052002, "learning_rate": 1.9983949174659692e-05, "loss": 1.7634, "step": 62000 }, { "epoch": 0.04325900249747714, "grad_norm": 2.9768130779266357, "learning_rate": 1.998384549383829e-05, "loss": 1.67, "step": 62200 }, { "epoch": 0.04339809896853012, "grad_norm": 4.6725897789001465, "learning_rate": 1.9983741479616118e-05, "loss": 1.6884, "step": 62400 }, { "epoch": 0.0435371954395831, "grad_norm": 3.639035940170288, "learning_rate": 1.998363713199814e-05, "loss": 1.6921, "step": 62600 }, { "epoch": 0.04367629191063608, "grad_norm": 4.510855197906494, "learning_rate": 1.998353245098934e-05, "loss": 1.6833, "step": 62800 }, { "epoch": 0.04381538838168906, "grad_norm": 4.581980228424072, "learning_rate": 1.9983427436594716e-05, "loss": 1.686, "step": 63000 }, { "epoch": 0.043954484852742046, "grad_norm": 4.461788654327393, "learning_rate": 1.9983322088819274e-05, "loss": 1.6919, "step": 63200 }, { "epoch": 0.044093581323795025, "grad_norm": 3.806112766265869, "learning_rate": 1.9983216407668053e-05, "loss": 1.6954, "step": 63400 }, { "epoch": 0.044232677794848005, "grad_norm": 4.456310749053955, "learning_rate": 1.9983110393146093e-05, "loss": 1.7317, "step": 63600 }, { "epoch": 0.044371774265900985, "grad_norm": 6.300820827484131, "learning_rate": 1.998300404525845e-05, "loss": 1.7097, "step": 63800 }, { "epoch": 0.04451087073695397, "grad_norm": 3.2175614833831787, "learning_rate": 1.9982897364010213e-05, "loss": 1.6765, "step": 64000 }, { "epoch": 0.04464996720800695, "grad_norm": 5.659944534301758, "learning_rate": 1.9982790349406468e-05, "loss": 1.6226, "step": 64200 }, { "epoch": 0.04478906367905993, "grad_norm": 3.648066759109497, "learning_rate": 1.998268300145232e-05, "loss": 1.6363, "step": 64400 }, { "epoch": 0.04492816015011291, "grad_norm": 6.237667083740234, "learning_rate": 1.99825753201529e-05, "loss": 1.656, "step": 64600 }, { "epoch": 0.04506725662116589, "grad_norm": 4.146718502044678, "learning_rate": 1.9982467305513346e-05, "loss": 1.6788, "step": 64800 }, { "epoch": 0.045206353092218876, "grad_norm": 3.9391887187957764, "learning_rate": 1.9982358957538816e-05, "loss": 1.7115, "step": 65000 }, { "epoch": 0.045345449563271856, "grad_norm": 4.433906555175781, "learning_rate": 1.9982250276234483e-05, "loss": 1.6697, "step": 65200 }, { "epoch": 0.045484546034324835, "grad_norm": 4.5281171798706055, "learning_rate": 1.998214126160553e-05, "loss": 1.6247, "step": 65400 }, { "epoch": 0.045623642505377815, "grad_norm": 2.758456230163574, "learning_rate": 1.9982031913657168e-05, "loss": 1.651, "step": 65600 }, { "epoch": 0.0457627389764308, "grad_norm": 4.08587121963501, "learning_rate": 1.9981922232394612e-05, "loss": 1.702, "step": 65800 }, { "epoch": 0.04590183544748378, "grad_norm": 2.943826913833618, "learning_rate": 1.9981812217823103e-05, "loss": 1.7346, "step": 66000 }, { "epoch": 0.04604093191853676, "grad_norm": 5.124936580657959, "learning_rate": 1.9981701869947887e-05, "loss": 1.6869, "step": 66200 }, { "epoch": 0.04618002838958974, "grad_norm": 5.860598087310791, "learning_rate": 1.9981591188774236e-05, "loss": 1.699, "step": 66400 }, { "epoch": 0.04631912486064272, "grad_norm": 5.8782172203063965, "learning_rate": 1.9981480174307434e-05, "loss": 1.7112, "step": 66600 }, { "epoch": 0.04645822133169571, "grad_norm": 3.9649393558502197, "learning_rate": 1.9981368826552775e-05, "loss": 1.6556, "step": 66800 }, { "epoch": 0.046597317802748686, "grad_norm": 4.633118629455566, "learning_rate": 1.998125714551558e-05, "loss": 1.6687, "step": 67000 }, { "epoch": 0.046736414273801666, "grad_norm": 2.720689535140991, "learning_rate": 1.9981145131201183e-05, "loss": 1.6155, "step": 67200 }, { "epoch": 0.046875510744854645, "grad_norm": 2.554689407348633, "learning_rate": 1.9981032783614928e-05, "loss": 1.7186, "step": 67400 }, { "epoch": 0.04701460721590763, "grad_norm": 6.52652645111084, "learning_rate": 1.9980920102762174e-05, "loss": 1.7065, "step": 67600 }, { "epoch": 0.04715370368696061, "grad_norm": 4.655409336090088, "learning_rate": 1.998080708864831e-05, "loss": 1.6714, "step": 67800 }, { "epoch": 0.04729280015801359, "grad_norm": 2.7842917442321777, "learning_rate": 1.998069374127872e-05, "loss": 1.6298, "step": 68000 }, { "epoch": 0.04743189662906657, "grad_norm": 3.6234030723571777, "learning_rate": 1.9980580060658826e-05, "loss": 1.6689, "step": 68200 }, { "epoch": 0.04757099310011955, "grad_norm": 4.62315559387207, "learning_rate": 1.9980466046794044e-05, "loss": 1.7086, "step": 68400 }, { "epoch": 0.04771008957117254, "grad_norm": 5.461521148681641, "learning_rate": 1.998035169968983e-05, "loss": 1.6662, "step": 68600 }, { "epoch": 0.04784918604222552, "grad_norm": 7.280538558959961, "learning_rate": 1.998023701935163e-05, "loss": 1.7044, "step": 68800 }, { "epoch": 0.047988282513278496, "grad_norm": 3.66982102394104, "learning_rate": 1.998012200578493e-05, "loss": 1.6707, "step": 69000 }, { "epoch": 0.048127378984331476, "grad_norm": 3.3503715991973877, "learning_rate": 1.998000665899521e-05, "loss": 1.6544, "step": 69200 }, { "epoch": 0.04826647545538446, "grad_norm": 3.640894889831543, "learning_rate": 1.9979890978987976e-05, "loss": 1.6518, "step": 69400 }, { "epoch": 0.04840557192643744, "grad_norm": 6.372598648071289, "learning_rate": 1.9979774965768762e-05, "loss": 1.6693, "step": 69600 }, { "epoch": 0.04854466839749042, "grad_norm": 4.334291458129883, "learning_rate": 1.99796586193431e-05, "loss": 1.7179, "step": 69800 }, { "epoch": 0.0486837648685434, "grad_norm": 3.7265446186065674, "learning_rate": 1.997954193971655e-05, "loss": 1.6597, "step": 70000 }, { "epoch": 0.04882286133959638, "grad_norm": 5.195233345031738, "learning_rate": 1.997942492689467e-05, "loss": 1.7107, "step": 70200 }, { "epoch": 0.04896195781064937, "grad_norm": 3.7042763233184814, "learning_rate": 1.9979307580883048e-05, "loss": 1.7004, "step": 70400 }, { "epoch": 0.04910105428170235, "grad_norm": 5.705939292907715, "learning_rate": 1.9979189901687298e-05, "loss": 1.6756, "step": 70600 }, { "epoch": 0.04924015075275533, "grad_norm": 8.152667999267578, "learning_rate": 1.9979071889313028e-05, "loss": 1.6684, "step": 70800 }, { "epoch": 0.049379247223808306, "grad_norm": 4.426294326782227, "learning_rate": 1.9978953543765876e-05, "loss": 1.7269, "step": 71000 }, { "epoch": 0.04951834369486129, "grad_norm": 3.9523568153381348, "learning_rate": 1.9978834865051492e-05, "loss": 1.7212, "step": 71200 }, { "epoch": 0.04965744016591427, "grad_norm": 4.159371376037598, "learning_rate": 1.9978715853175532e-05, "loss": 1.6869, "step": 71400 }, { "epoch": 0.04979653663696725, "grad_norm": 5.7590155601501465, "learning_rate": 1.997859650814369e-05, "loss": 1.6258, "step": 71600 }, { "epoch": 0.04993563310802023, "grad_norm": 6.574980735778809, "learning_rate": 1.997847682996165e-05, "loss": 1.7416, "step": 71800 }, { "epoch": 0.05007472957907321, "grad_norm": 2.2273080348968506, "learning_rate": 1.9978356818635146e-05, "loss": 1.6869, "step": 72000 }, { "epoch": 0.0502138260501262, "grad_norm": 2.451094150543213, "learning_rate": 1.997823647416989e-05, "loss": 1.6537, "step": 72200 }, { "epoch": 0.05035292252117918, "grad_norm": 2.1979329586029053, "learning_rate": 1.997811579657163e-05, "loss": 1.6154, "step": 72400 }, { "epoch": 0.05049201899223216, "grad_norm": 5.160410404205322, "learning_rate": 1.997799478584613e-05, "loss": 1.7159, "step": 72600 }, { "epoch": 0.05063111546328514, "grad_norm": 4.327044486999512, "learning_rate": 1.997787344199917e-05, "loss": 1.7153, "step": 72800 }, { "epoch": 0.050770211934338116, "grad_norm": 3.2752387523651123, "learning_rate": 1.997775176503653e-05, "loss": 1.6477, "step": 73000 }, { "epoch": 0.0509093084053911, "grad_norm": 3.984393358230591, "learning_rate": 1.9977629754964036e-05, "loss": 1.6563, "step": 73200 }, { "epoch": 0.05104840487644408, "grad_norm": 5.828344345092773, "learning_rate": 1.99775074117875e-05, "loss": 1.6841, "step": 73400 }, { "epoch": 0.05118750134749706, "grad_norm": 3.39604115486145, "learning_rate": 1.9977384735512765e-05, "loss": 1.6379, "step": 73600 }, { "epoch": 0.05132659781855004, "grad_norm": 4.018509387969971, "learning_rate": 1.9977261726145692e-05, "loss": 1.6195, "step": 73800 }, { "epoch": 0.05146569428960303, "grad_norm": 5.31196403503418, "learning_rate": 1.997713838369215e-05, "loss": 1.6841, "step": 74000 }, { "epoch": 0.05160479076065601, "grad_norm": 3.7980597019195557, "learning_rate": 1.9977014708158027e-05, "loss": 1.727, "step": 74200 }, { "epoch": 0.05174388723170899, "grad_norm": 4.492558479309082, "learning_rate": 1.997689069954923e-05, "loss": 1.6307, "step": 74400 }, { "epoch": 0.05188298370276197, "grad_norm": 5.221099853515625, "learning_rate": 1.997676635787167e-05, "loss": 1.6814, "step": 74600 }, { "epoch": 0.05202208017381495, "grad_norm": 3.387580633163452, "learning_rate": 1.9976641683131293e-05, "loss": 1.6883, "step": 74800 }, { "epoch": 0.05216117664486793, "grad_norm": 2.536865472793579, "learning_rate": 1.9976516675334044e-05, "loss": 1.6756, "step": 75000 }, { "epoch": 0.05230027311592091, "grad_norm": 4.267054557800293, "learning_rate": 1.99763913344859e-05, "loss": 1.659, "step": 75200 }, { "epoch": 0.05243936958697389, "grad_norm": 4.1718034744262695, "learning_rate": 1.997626566059284e-05, "loss": 1.6593, "step": 75400 }, { "epoch": 0.05257846605802687, "grad_norm": 4.38176965713501, "learning_rate": 1.9976139653660858e-05, "loss": 1.7247, "step": 75600 }, { "epoch": 0.05271756252907986, "grad_norm": 5.4682936668396, "learning_rate": 1.997601331369597e-05, "loss": 1.6676, "step": 75800 }, { "epoch": 0.05285665900013284, "grad_norm": 4.038586616516113, "learning_rate": 1.9975886640704223e-05, "loss": 1.6884, "step": 76000 }, { "epoch": 0.05299575547118582, "grad_norm": 5.104429721832275, "learning_rate": 1.9975759634691644e-05, "loss": 1.6482, "step": 76200 }, { "epoch": 0.0531348519422388, "grad_norm": 5.605188369750977, "learning_rate": 1.9975632295664304e-05, "loss": 1.7038, "step": 76400 }, { "epoch": 0.05327394841329178, "grad_norm": 3.7522759437561035, "learning_rate": 1.9975504623628285e-05, "loss": 1.6708, "step": 76600 }, { "epoch": 0.053413044884344764, "grad_norm": 4.757645606994629, "learning_rate": 1.9975376618589682e-05, "loss": 1.658, "step": 76800 }, { "epoch": 0.053552141355397744, "grad_norm": 4.041986465454102, "learning_rate": 1.9975248280554598e-05, "loss": 1.6976, "step": 77000 }, { "epoch": 0.05369123782645072, "grad_norm": 4.330105781555176, "learning_rate": 1.9975119609529164e-05, "loss": 1.6851, "step": 77200 }, { "epoch": 0.0538303342975037, "grad_norm": 5.530795097351074, "learning_rate": 1.9974990605519528e-05, "loss": 1.6883, "step": 77400 }, { "epoch": 0.05396943076855669, "grad_norm": 3.5975773334503174, "learning_rate": 1.997486126853184e-05, "loss": 1.6485, "step": 77600 }, { "epoch": 0.05410852723960967, "grad_norm": 2.303584098815918, "learning_rate": 1.9974731598572283e-05, "loss": 1.6799, "step": 77800 }, { "epoch": 0.05424762371066265, "grad_norm": 3.4903829097747803, "learning_rate": 1.997460159564704e-05, "loss": 1.6218, "step": 78000 }, { "epoch": 0.05438672018171563, "grad_norm": 3.64200758934021, "learning_rate": 1.997447125976232e-05, "loss": 1.6829, "step": 78200 }, { "epoch": 0.05452581665276861, "grad_norm": 3.231588840484619, "learning_rate": 1.9974340590924342e-05, "loss": 1.66, "step": 78400 }, { "epoch": 0.054664913123821594, "grad_norm": 6.966937065124512, "learning_rate": 1.997420958913935e-05, "loss": 1.6452, "step": 78600 }, { "epoch": 0.054804009594874574, "grad_norm": 3.31689190864563, "learning_rate": 1.9974078254413595e-05, "loss": 1.6315, "step": 78800 }, { "epoch": 0.054943106065927554, "grad_norm": 3.8332653045654297, "learning_rate": 1.9973946586753347e-05, "loss": 1.6833, "step": 79000 }, { "epoch": 0.05508220253698053, "grad_norm": 2.8052940368652344, "learning_rate": 1.997381458616489e-05, "loss": 1.7328, "step": 79200 }, { "epoch": 0.05522129900803352, "grad_norm": 3.172034978866577, "learning_rate": 1.9973682252654528e-05, "loss": 1.7078, "step": 79400 }, { "epoch": 0.0553603954790865, "grad_norm": 4.48649263381958, "learning_rate": 1.9973549586228574e-05, "loss": 1.663, "step": 79600 }, { "epoch": 0.05549949195013948, "grad_norm": 5.504746437072754, "learning_rate": 1.9973416586893366e-05, "loss": 1.6735, "step": 79800 }, { "epoch": 0.05563858842119246, "grad_norm": 9.966243743896484, "learning_rate": 1.9973283254655252e-05, "loss": 1.6366, "step": 80000 }, { "epoch": 0.05577768489224544, "grad_norm": 3.5485918521881104, "learning_rate": 1.99731495895206e-05, "loss": 1.6717, "step": 80200 }, { "epoch": 0.055916781363298425, "grad_norm": 3.4408681392669678, "learning_rate": 1.9973015591495787e-05, "loss": 1.6275, "step": 80400 }, { "epoch": 0.056055877834351404, "grad_norm": 4.678125858306885, "learning_rate": 1.9972881260587207e-05, "loss": 1.7055, "step": 80600 }, { "epoch": 0.056194974305404384, "grad_norm": 3.339069366455078, "learning_rate": 1.997274659680128e-05, "loss": 1.6691, "step": 80800 }, { "epoch": 0.056334070776457364, "grad_norm": 5.560351848602295, "learning_rate": 1.997261160014443e-05, "loss": 1.6484, "step": 81000 }, { "epoch": 0.05647316724751035, "grad_norm": 3.7626335620880127, "learning_rate": 1.9972476270623103e-05, "loss": 1.6868, "step": 81200 }, { "epoch": 0.05661226371856333, "grad_norm": 4.2480340003967285, "learning_rate": 1.9972340608243763e-05, "loss": 1.642, "step": 81400 }, { "epoch": 0.05675136018961631, "grad_norm": 3.7375171184539795, "learning_rate": 1.997220461301288e-05, "loss": 1.6068, "step": 81600 }, { "epoch": 0.05689045666066929, "grad_norm": 7.013606548309326, "learning_rate": 1.997206828493695e-05, "loss": 1.6926, "step": 81800 }, { "epoch": 0.05702955313172227, "grad_norm": 5.370455741882324, "learning_rate": 1.9971931624022477e-05, "loss": 1.7026, "step": 82000 }, { "epoch": 0.057168649602775255, "grad_norm": 4.551242828369141, "learning_rate": 1.997179463027599e-05, "loss": 1.7326, "step": 82200 }, { "epoch": 0.057307746073828235, "grad_norm": 4.156348705291748, "learning_rate": 1.997165730370403e-05, "loss": 1.7013, "step": 82400 }, { "epoch": 0.057446842544881214, "grad_norm": 3.648369550704956, "learning_rate": 1.9971519644313147e-05, "loss": 1.6619, "step": 82600 }, { "epoch": 0.057585939015934194, "grad_norm": 3.6276497840881348, "learning_rate": 1.9971381652109915e-05, "loss": 1.6558, "step": 82800 }, { "epoch": 0.05772503548698718, "grad_norm": 3.5667388439178467, "learning_rate": 1.9971243327100923e-05, "loss": 1.6633, "step": 83000 }, { "epoch": 0.05786413195804016, "grad_norm": 4.294811725616455, "learning_rate": 1.9971104669292777e-05, "loss": 1.6865, "step": 83200 }, { "epoch": 0.05800322842909314, "grad_norm": 3.2815868854522705, "learning_rate": 1.997096567869209e-05, "loss": 1.6464, "step": 83400 }, { "epoch": 0.05814232490014612, "grad_norm": 3.6213953495025635, "learning_rate": 1.99708263553055e-05, "loss": 1.6875, "step": 83600 }, { "epoch": 0.0582814213711991, "grad_norm": 13.829986572265625, "learning_rate": 1.997068669913966e-05, "loss": 1.669, "step": 83800 }, { "epoch": 0.058420517842252086, "grad_norm": 4.447632312774658, "learning_rate": 1.9970546710201236e-05, "loss": 1.7151, "step": 84000 }, { "epoch": 0.058559614313305065, "grad_norm": 3.175236940383911, "learning_rate": 1.9970406388496907e-05, "loss": 1.5914, "step": 84200 }, { "epoch": 0.058698710784358045, "grad_norm": 6.262105464935303, "learning_rate": 1.9970265734033377e-05, "loss": 1.6959, "step": 84400 }, { "epoch": 0.058837807255411025, "grad_norm": 7.8032917976379395, "learning_rate": 1.997012474681736e-05, "loss": 1.6684, "step": 84600 }, { "epoch": 0.05897690372646401, "grad_norm": 3.615449905395508, "learning_rate": 1.9969983426855583e-05, "loss": 1.657, "step": 84800 }, { "epoch": 0.05911600019751699, "grad_norm": 3.9366722106933594, "learning_rate": 1.9969841774154797e-05, "loss": 1.6866, "step": 85000 }, { "epoch": 0.05925509666856997, "grad_norm": 4.699201583862305, "learning_rate": 1.9969699788721763e-05, "loss": 1.6711, "step": 85200 }, { "epoch": 0.05939419313962295, "grad_norm": 3.251643657684326, "learning_rate": 1.9969557470563257e-05, "loss": 1.6349, "step": 85400 }, { "epoch": 0.05953328961067593, "grad_norm": 5.28709602355957, "learning_rate": 1.9969414819686076e-05, "loss": 1.6651, "step": 85600 }, { "epoch": 0.059672386081728916, "grad_norm": 3.1515891551971436, "learning_rate": 1.996927183609703e-05, "loss": 1.7007, "step": 85800 }, { "epoch": 0.059811482552781896, "grad_norm": 5.949617385864258, "learning_rate": 1.9969128519802942e-05, "loss": 1.6807, "step": 86000 }, { "epoch": 0.059950579023834875, "grad_norm": 4.615626335144043, "learning_rate": 1.9968984870810654e-05, "loss": 1.6803, "step": 86200 }, { "epoch": 0.060089675494887855, "grad_norm": 2.7743518352508545, "learning_rate": 1.9968840889127022e-05, "loss": 1.6932, "step": 86400 }, { "epoch": 0.060228771965940835, "grad_norm": 4.365586757659912, "learning_rate": 1.996869657475893e-05, "loss": 1.6411, "step": 86600 }, { "epoch": 0.06036786843699382, "grad_norm": 3.0346338748931885, "learning_rate": 1.9968551927713252e-05, "loss": 1.6896, "step": 86800 }, { "epoch": 0.0605069649080468, "grad_norm": 8.06223201751709, "learning_rate": 1.9968406947996906e-05, "loss": 1.6605, "step": 87000 }, { "epoch": 0.06064606137909978, "grad_norm": 8.650927543640137, "learning_rate": 1.9968261635616807e-05, "loss": 1.6567, "step": 87200 }, { "epoch": 0.06078515785015276, "grad_norm": 3.195859432220459, "learning_rate": 1.9968115990579892e-05, "loss": 1.6775, "step": 87400 }, { "epoch": 0.06092425432120575, "grad_norm": 5.3510565757751465, "learning_rate": 1.996797001289312e-05, "loss": 1.646, "step": 87600 }, { "epoch": 0.061063350792258726, "grad_norm": 7.507141590118408, "learning_rate": 1.996782370256345e-05, "loss": 1.6355, "step": 87800 }, { "epoch": 0.061202447263311706, "grad_norm": 3.331545114517212, "learning_rate": 1.996767705959787e-05, "loss": 1.7598, "step": 88000 }, { "epoch": 0.061341543734364685, "grad_norm": 4.220621109008789, "learning_rate": 1.9967530084003388e-05, "loss": 1.7096, "step": 88200 }, { "epoch": 0.061480640205417665, "grad_norm": 9.295842170715332, "learning_rate": 1.9967382775787013e-05, "loss": 1.6514, "step": 88400 }, { "epoch": 0.06161973667647065, "grad_norm": 4.063211441040039, "learning_rate": 1.9967235134955777e-05, "loss": 1.658, "step": 88600 }, { "epoch": 0.06175883314752363, "grad_norm": 6.070775508880615, "learning_rate": 1.996708716151673e-05, "loss": 1.7261, "step": 88800 }, { "epoch": 0.06189792961857661, "grad_norm": 3.954354763031006, "learning_rate": 1.996693885547694e-05, "loss": 1.705, "step": 89000 }, { "epoch": 0.06203702608962959, "grad_norm": 2.7232494354248047, "learning_rate": 1.9966790216843476e-05, "loss": 1.6655, "step": 89200 }, { "epoch": 0.06217612256068258, "grad_norm": 5.40230131149292, "learning_rate": 1.996664124562345e-05, "loss": 1.6839, "step": 89400 }, { "epoch": 0.06231521903173556, "grad_norm": 4.149807453155518, "learning_rate": 1.9966491941823962e-05, "loss": 1.6702, "step": 89600 }, { "epoch": 0.062454315502788536, "grad_norm": 3.431563138961792, "learning_rate": 1.996634230545214e-05, "loss": 1.6595, "step": 89800 }, { "epoch": 0.06259341197384152, "grad_norm": 5.329677104949951, "learning_rate": 1.9966192336515128e-05, "loss": 1.7017, "step": 90000 }, { "epoch": 0.0627325084448945, "grad_norm": 4.849306583404541, "learning_rate": 1.9966042035020093e-05, "loss": 1.6811, "step": 90200 }, { "epoch": 0.06287160491594748, "grad_norm": 3.2907044887542725, "learning_rate": 1.9965891400974203e-05, "loss": 1.6413, "step": 90400 }, { "epoch": 0.06301070138700046, "grad_norm": 4.458028793334961, "learning_rate": 1.996574043438465e-05, "loss": 1.5886, "step": 90600 }, { "epoch": 0.06314979785805344, "grad_norm": 4.1903486251831055, "learning_rate": 1.996558913525864e-05, "loss": 1.6198, "step": 90800 }, { "epoch": 0.06328889432910642, "grad_norm": 5.75989294052124, "learning_rate": 1.9965437503603396e-05, "loss": 1.679, "step": 91000 }, { "epoch": 0.0634279908001594, "grad_norm": 5.585878849029541, "learning_rate": 1.996528553942616e-05, "loss": 1.6739, "step": 91200 }, { "epoch": 0.06356708727121238, "grad_norm": 4.55380916595459, "learning_rate": 1.9965133242734188e-05, "loss": 1.6697, "step": 91400 }, { "epoch": 0.06370618374226537, "grad_norm": 4.409550189971924, "learning_rate": 1.9964980613534744e-05, "loss": 1.6748, "step": 91600 }, { "epoch": 0.06384528021331835, "grad_norm": 3.8981220722198486, "learning_rate": 1.9964827651835115e-05, "loss": 1.6714, "step": 91800 }, { "epoch": 0.06398437668437133, "grad_norm": 4.1529436111450195, "learning_rate": 1.9964674357642614e-05, "loss": 1.6678, "step": 92000 }, { "epoch": 0.06412347315542431, "grad_norm": 4.378027439117432, "learning_rate": 1.9964520730964544e-05, "loss": 1.6718, "step": 92200 }, { "epoch": 0.06426256962647729, "grad_norm": 3.563565969467163, "learning_rate": 1.9964366771808244e-05, "loss": 1.6212, "step": 92400 }, { "epoch": 0.06440166609753027, "grad_norm": 4.523779392242432, "learning_rate": 1.9964212480181067e-05, "loss": 1.7021, "step": 92600 }, { "epoch": 0.06454076256858325, "grad_norm": 4.6933979988098145, "learning_rate": 1.9964057856090382e-05, "loss": 1.6333, "step": 92800 }, { "epoch": 0.06467985903963623, "grad_norm": 4.0696001052856445, "learning_rate": 1.9963902899543565e-05, "loss": 1.627, "step": 93000 }, { "epoch": 0.06481895551068921, "grad_norm": 4.228518486022949, "learning_rate": 1.996374761054801e-05, "loss": 1.6936, "step": 93200 }, { "epoch": 0.06495805198174219, "grad_norm": 5.702962398529053, "learning_rate": 1.996359198911114e-05, "loss": 1.6592, "step": 93400 }, { "epoch": 0.06509714845279518, "grad_norm": 4.182117462158203, "learning_rate": 1.9963436035240377e-05, "loss": 1.6758, "step": 93600 }, { "epoch": 0.06523624492384816, "grad_norm": 3.498380661010742, "learning_rate": 1.9963279748943166e-05, "loss": 1.7096, "step": 93800 }, { "epoch": 0.06537534139490114, "grad_norm": 3.446587324142456, "learning_rate": 1.996312313022697e-05, "loss": 1.6608, "step": 94000 }, { "epoch": 0.06551443786595412, "grad_norm": 4.699007511138916, "learning_rate": 1.9962966179099264e-05, "loss": 1.6602, "step": 94200 }, { "epoch": 0.0656535343370071, "grad_norm": 5.520283222198486, "learning_rate": 1.9962808895567545e-05, "loss": 1.6837, "step": 94400 }, { "epoch": 0.06579263080806008, "grad_norm": 3.68941593170166, "learning_rate": 1.996265127963932e-05, "loss": 1.7134, "step": 94600 }, { "epoch": 0.06593172727911306, "grad_norm": 4.174427032470703, "learning_rate": 1.996249333132211e-05, "loss": 1.6899, "step": 94800 }, { "epoch": 0.06607082375016604, "grad_norm": 4.726343631744385, "learning_rate": 1.996233505062346e-05, "loss": 1.6605, "step": 95000 }, { "epoch": 0.06620992022121902, "grad_norm": 2.8353991508483887, "learning_rate": 1.9962176437550923e-05, "loss": 1.7237, "step": 95200 }, { "epoch": 0.06634901669227201, "grad_norm": 4.429863452911377, "learning_rate": 1.9962017492112068e-05, "loss": 1.6411, "step": 95400 }, { "epoch": 0.066488113163325, "grad_norm": 5.652978897094727, "learning_rate": 1.996185821431449e-05, "loss": 1.5914, "step": 95600 }, { "epoch": 0.06662720963437797, "grad_norm": 5.26047945022583, "learning_rate": 1.9961698604165786e-05, "loss": 1.6873, "step": 95800 }, { "epoch": 0.06676630610543095, "grad_norm": 3.1413629055023193, "learning_rate": 1.996153866167358e-05, "loss": 1.6972, "step": 96000 }, { "epoch": 0.06690540257648393, "grad_norm": 4.40495491027832, "learning_rate": 1.996137838684551e-05, "loss": 1.6656, "step": 96200 }, { "epoch": 0.06704449904753691, "grad_norm": 2.3306381702423096, "learning_rate": 1.9961217779689218e-05, "loss": 1.656, "step": 96400 }, { "epoch": 0.06718359551858989, "grad_norm": 3.7499096393585205, "learning_rate": 1.9961056840212385e-05, "loss": 1.643, "step": 96600 }, { "epoch": 0.06732269198964287, "grad_norm": 3.8473072052001953, "learning_rate": 1.9960895568422676e-05, "loss": 1.6766, "step": 96800 }, { "epoch": 0.06746178846069585, "grad_norm": 4.882070064544678, "learning_rate": 1.9960733964327807e-05, "loss": 1.6488, "step": 97000 }, { "epoch": 0.06760088493174884, "grad_norm": 3.859436511993408, "learning_rate": 1.9960572027935482e-05, "loss": 1.6849, "step": 97200 }, { "epoch": 0.06773998140280182, "grad_norm": 4.725877285003662, "learning_rate": 1.9960409759253436e-05, "loss": 1.719, "step": 97400 }, { "epoch": 0.0678790778738548, "grad_norm": 5.153317451477051, "learning_rate": 1.996024715828942e-05, "loss": 1.6654, "step": 97600 }, { "epoch": 0.06801817434490778, "grad_norm": 2.4250733852386475, "learning_rate": 1.9960084225051184e-05, "loss": 1.6848, "step": 97800 }, { "epoch": 0.06815727081596076, "grad_norm": 2.9795289039611816, "learning_rate": 1.9959920959546515e-05, "loss": 1.6365, "step": 98000 }, { "epoch": 0.06829636728701374, "grad_norm": 5.835683345794678, "learning_rate": 1.9959757361783212e-05, "loss": 1.6644, "step": 98200 }, { "epoch": 0.06843546375806672, "grad_norm": 3.1525168418884277, "learning_rate": 1.995959343176907e-05, "loss": 1.6032, "step": 98400 }, { "epoch": 0.0685745602291197, "grad_norm": 5.7228474617004395, "learning_rate": 1.9959429169511926e-05, "loss": 1.6478, "step": 98600 }, { "epoch": 0.06871365670017268, "grad_norm": 6.300377368927002, "learning_rate": 1.995926457501962e-05, "loss": 1.6064, "step": 98800 }, { "epoch": 0.06885275317122568, "grad_norm": 3.665008544921875, "learning_rate": 1.995909964830001e-05, "loss": 1.6926, "step": 99000 }, { "epoch": 0.06899184964227865, "grad_norm": 4.594838619232178, "learning_rate": 1.9958934389360967e-05, "loss": 1.6776, "step": 99200 }, { "epoch": 0.06913094611333163, "grad_norm": 7.23288631439209, "learning_rate": 1.995876879821038e-05, "loss": 1.6946, "step": 99400 }, { "epoch": 0.06927004258438461, "grad_norm": 5.666861534118652, "learning_rate": 1.9958602874856158e-05, "loss": 1.71, "step": 99600 }, { "epoch": 0.0694091390554376, "grad_norm": 4.7979559898376465, "learning_rate": 1.995843661930622e-05, "loss": 1.659, "step": 99800 }, { "epoch": 0.06954823552649057, "grad_norm": 2.9751641750335693, "learning_rate": 1.9958270031568497e-05, "loss": 1.679, "step": 100000 }, { "epoch": 0.06968733199754355, "grad_norm": 5.1251654624938965, "learning_rate": 1.995810311165095e-05, "loss": 1.6381, "step": 100200 }, { "epoch": 0.06982642846859653, "grad_norm": 3.9431636333465576, "learning_rate": 1.9957935859561544e-05, "loss": 1.6567, "step": 100400 }, { "epoch": 0.06996552493964951, "grad_norm": 5.359188556671143, "learning_rate": 1.9957768275308266e-05, "loss": 1.6625, "step": 100600 }, { "epoch": 0.0701046214107025, "grad_norm": 3.7686452865600586, "learning_rate": 1.9957600358899115e-05, "loss": 1.6645, "step": 100800 }, { "epoch": 0.07024371788175549, "grad_norm": 2.9700372219085693, "learning_rate": 1.9957432110342102e-05, "loss": 1.6857, "step": 101000 }, { "epoch": 0.07038281435280846, "grad_norm": 3.2254884243011475, "learning_rate": 1.9957263529645265e-05, "loss": 1.671, "step": 101200 }, { "epoch": 0.07052191082386144, "grad_norm": 5.474912643432617, "learning_rate": 1.9957094616816652e-05, "loss": 1.7433, "step": 101400 }, { "epoch": 0.07066100729491442, "grad_norm": 2.335407018661499, "learning_rate": 1.9956925371864324e-05, "loss": 1.6785, "step": 101600 }, { "epoch": 0.0708001037659674, "grad_norm": 8.107941627502441, "learning_rate": 1.9956755794796363e-05, "loss": 1.6447, "step": 101800 }, { "epoch": 0.07093920023702038, "grad_norm": 4.152973175048828, "learning_rate": 1.995658588562086e-05, "loss": 1.6381, "step": 102000 }, { "epoch": 0.07107829670807336, "grad_norm": 3.2355446815490723, "learning_rate": 1.995641564434593e-05, "loss": 1.6433, "step": 102200 }, { "epoch": 0.07121739317912634, "grad_norm": 6.418330192565918, "learning_rate": 1.9956245070979698e-05, "loss": 1.7079, "step": 102400 }, { "epoch": 0.07135648965017934, "grad_norm": 4.378702640533447, "learning_rate": 1.995607416553031e-05, "loss": 1.7091, "step": 102600 }, { "epoch": 0.07149558612123232, "grad_norm": 5.313634872436523, "learning_rate": 1.9955902928005924e-05, "loss": 1.7109, "step": 102800 }, { "epoch": 0.0716346825922853, "grad_norm": 4.048514366149902, "learning_rate": 1.9955731358414715e-05, "loss": 1.6557, "step": 103000 }, { "epoch": 0.07177377906333827, "grad_norm": 3.6237123012542725, "learning_rate": 1.9955559456764872e-05, "loss": 1.6738, "step": 103200 }, { "epoch": 0.07191287553439125, "grad_norm": 4.541780948638916, "learning_rate": 1.99553872230646e-05, "loss": 1.7196, "step": 103400 }, { "epoch": 0.07205197200544423, "grad_norm": 4.659724235534668, "learning_rate": 1.995521465732212e-05, "loss": 1.6151, "step": 103600 }, { "epoch": 0.07219106847649721, "grad_norm": 5.4621100425720215, "learning_rate": 1.9955041759545675e-05, "loss": 1.7023, "step": 103800 }, { "epoch": 0.0723301649475502, "grad_norm": 4.281054973602295, "learning_rate": 1.995486852974352e-05, "loss": 1.6322, "step": 104000 }, { "epoch": 0.07246926141860317, "grad_norm": 5.630712985992432, "learning_rate": 1.9954694967923923e-05, "loss": 1.7311, "step": 104200 }, { "epoch": 0.07260835788965617, "grad_norm": 4.253119468688965, "learning_rate": 1.9954521074095163e-05, "loss": 1.6776, "step": 104400 }, { "epoch": 0.07274745436070915, "grad_norm": 5.049556255340576, "learning_rate": 1.9954346848265545e-05, "loss": 1.6611, "step": 104600 }, { "epoch": 0.07288655083176213, "grad_norm": 5.372382640838623, "learning_rate": 1.99541722904434e-05, "loss": 1.7059, "step": 104800 }, { "epoch": 0.0730256473028151, "grad_norm": 4.119580268859863, "learning_rate": 1.9953997400637037e-05, "loss": 1.6605, "step": 105000 }, { "epoch": 0.07316474377386809, "grad_norm": 5.1215291023254395, "learning_rate": 1.9953822178854824e-05, "loss": 1.6663, "step": 105200 }, { "epoch": 0.07330384024492106, "grad_norm": 3.7207326889038086, "learning_rate": 1.995364662510512e-05, "loss": 1.6526, "step": 105400 }, { "epoch": 0.07344293671597404, "grad_norm": 11.24229907989502, "learning_rate": 1.9953470739396302e-05, "loss": 1.6699, "step": 105600 }, { "epoch": 0.07358203318702702, "grad_norm": 5.274993419647217, "learning_rate": 1.995329452173677e-05, "loss": 1.67, "step": 105800 }, { "epoch": 0.07372112965808, "grad_norm": 5.523107528686523, "learning_rate": 1.9953117972134937e-05, "loss": 1.6571, "step": 106000 }, { "epoch": 0.073860226129133, "grad_norm": 4.517809867858887, "learning_rate": 1.995294109059923e-05, "loss": 1.6567, "step": 106200 }, { "epoch": 0.07399932260018598, "grad_norm": 4.909901142120361, "learning_rate": 1.9952763877138093e-05, "loss": 1.6905, "step": 106400 }, { "epoch": 0.07413841907123896, "grad_norm": 3.8286750316619873, "learning_rate": 1.9952586331759987e-05, "loss": 1.7379, "step": 106600 }, { "epoch": 0.07427751554229194, "grad_norm": 2.9494268894195557, "learning_rate": 1.9952408454473387e-05, "loss": 1.6404, "step": 106800 }, { "epoch": 0.07441661201334492, "grad_norm": 4.137836933135986, "learning_rate": 1.9952230245286786e-05, "loss": 1.7068, "step": 107000 }, { "epoch": 0.0745557084843979, "grad_norm": 4.852845191955566, "learning_rate": 1.995205170420869e-05, "loss": 1.6395, "step": 107200 }, { "epoch": 0.07469480495545087, "grad_norm": 4.205634593963623, "learning_rate": 1.9951872831247626e-05, "loss": 1.6057, "step": 107400 }, { "epoch": 0.07483390142650385, "grad_norm": 65.30146789550781, "learning_rate": 1.9951693626412125e-05, "loss": 1.6569, "step": 107600 }, { "epoch": 0.07497299789755683, "grad_norm": 3.216360092163086, "learning_rate": 1.995151408971075e-05, "loss": 1.7085, "step": 107800 }, { "epoch": 0.07511209436860983, "grad_norm": 5.793154716491699, "learning_rate": 1.9951334221152066e-05, "loss": 1.664, "step": 108000 }, { "epoch": 0.07525119083966281, "grad_norm": 5.157447814941406, "learning_rate": 1.995115402074466e-05, "loss": 1.6386, "step": 108200 }, { "epoch": 0.07539028731071579, "grad_norm": 3.836587429046631, "learning_rate": 1.9950973488497144e-05, "loss": 1.6608, "step": 108400 }, { "epoch": 0.07552938378176877, "grad_norm": 5.336925983428955, "learning_rate": 1.995079262441813e-05, "loss": 1.7036, "step": 108600 }, { "epoch": 0.07566848025282175, "grad_norm": 3.538271188735962, "learning_rate": 1.9950611428516247e-05, "loss": 1.5952, "step": 108800 }, { "epoch": 0.07580757672387473, "grad_norm": 5.387715816497803, "learning_rate": 1.9950429900800153e-05, "loss": 1.6764, "step": 109000 }, { "epoch": 0.0759466731949277, "grad_norm": 5.670628070831299, "learning_rate": 1.9950248041278506e-05, "loss": 1.6749, "step": 109200 }, { "epoch": 0.07608576966598068, "grad_norm": 4.447868824005127, "learning_rate": 1.995006584996e-05, "loss": 1.6717, "step": 109400 }, { "epoch": 0.07622486613703366, "grad_norm": 4.092779159545898, "learning_rate": 1.9949883326853318e-05, "loss": 1.6879, "step": 109600 }, { "epoch": 0.07636396260808666, "grad_norm": 3.9257493019104004, "learning_rate": 1.9949700471967185e-05, "loss": 1.7153, "step": 109800 }, { "epoch": 0.07650305907913964, "grad_norm": 3.0709376335144043, "learning_rate": 1.9949517285310325e-05, "loss": 1.636, "step": 110000 }, { "epoch": 0.07664215555019262, "grad_norm": 7.4014177322387695, "learning_rate": 1.9949333766891487e-05, "loss": 1.6939, "step": 110200 }, { "epoch": 0.0767812520212456, "grad_norm": 4.5453572273254395, "learning_rate": 1.9949149916719424e-05, "loss": 1.6761, "step": 110400 }, { "epoch": 0.07692034849229858, "grad_norm": 7.345770835876465, "learning_rate": 1.994896573480292e-05, "loss": 1.6474, "step": 110600 }, { "epoch": 0.07705944496335156, "grad_norm": 4.953390598297119, "learning_rate": 1.9948781221150764e-05, "loss": 1.6564, "step": 110800 }, { "epoch": 0.07719854143440454, "grad_norm": 4.344512462615967, "learning_rate": 1.994859637577177e-05, "loss": 1.7007, "step": 111000 }, { "epoch": 0.07733763790545752, "grad_norm": 6.857558250427246, "learning_rate": 1.9948411198674753e-05, "loss": 1.6557, "step": 111200 }, { "epoch": 0.0774767343765105, "grad_norm": 4.807548999786377, "learning_rate": 1.994822568986856e-05, "loss": 1.609, "step": 111400 }, { "epoch": 0.07761583084756349, "grad_norm": 3.974553108215332, "learning_rate": 1.994803984936205e-05, "loss": 1.6618, "step": 111600 }, { "epoch": 0.07775492731861647, "grad_norm": 3.7942748069763184, "learning_rate": 1.9947853677164085e-05, "loss": 1.6961, "step": 111800 }, { "epoch": 0.07789402378966945, "grad_norm": 4.514380931854248, "learning_rate": 1.994766717328356e-05, "loss": 1.6466, "step": 112000 }, { "epoch": 0.07803312026072243, "grad_norm": 4.735954761505127, "learning_rate": 1.9947480337729373e-05, "loss": 1.6127, "step": 112200 }, { "epoch": 0.0781722167317754, "grad_norm": 3.0629405975341797, "learning_rate": 1.9947293170510452e-05, "loss": 1.665, "step": 112400 }, { "epoch": 0.07831131320282839, "grad_norm": 4.7660017013549805, "learning_rate": 1.9947105671635722e-05, "loss": 1.6796, "step": 112600 }, { "epoch": 0.07845040967388137, "grad_norm": 5.615001678466797, "learning_rate": 1.9946917841114142e-05, "loss": 1.6658, "step": 112800 }, { "epoch": 0.07858950614493435, "grad_norm": 2.553229570388794, "learning_rate": 1.994672967895467e-05, "loss": 1.6455, "step": 113000 }, { "epoch": 0.07872860261598733, "grad_norm": 5.308156490325928, "learning_rate": 1.9946541185166303e-05, "loss": 1.6565, "step": 113200 }, { "epoch": 0.07886769908704032, "grad_norm": 2.3234572410583496, "learning_rate": 1.9946352359758025e-05, "loss": 1.6863, "step": 113400 }, { "epoch": 0.0790067955580933, "grad_norm": 4.731029033660889, "learning_rate": 1.9946163202738855e-05, "loss": 1.6302, "step": 113600 }, { "epoch": 0.07914589202914628, "grad_norm": 4.099117279052734, "learning_rate": 1.9945973714117826e-05, "loss": 1.69, "step": 113800 }, { "epoch": 0.07928498850019926, "grad_norm": 3.5221517086029053, "learning_rate": 1.994578389390398e-05, "loss": 1.7052, "step": 114000 }, { "epoch": 0.07942408497125224, "grad_norm": 4.142230987548828, "learning_rate": 1.9945593742106384e-05, "loss": 1.6812, "step": 114200 }, { "epoch": 0.07956318144230522, "grad_norm": 3.0929555892944336, "learning_rate": 1.9945403258734114e-05, "loss": 1.7257, "step": 114400 }, { "epoch": 0.0797022779133582, "grad_norm": 3.2204084396362305, "learning_rate": 1.9945212443796258e-05, "loss": 1.6641, "step": 114600 }, { "epoch": 0.07984137438441118, "grad_norm": 6.502091884613037, "learning_rate": 1.994502129730193e-05, "loss": 1.6554, "step": 114800 }, { "epoch": 0.07998047085546416, "grad_norm": 11.513239860534668, "learning_rate": 1.9944829819260257e-05, "loss": 1.6928, "step": 115000 }, { "epoch": 0.08011956732651715, "grad_norm": 4.377542018890381, "learning_rate": 1.9944638009680376e-05, "loss": 1.6582, "step": 115200 }, { "epoch": 0.08025866379757013, "grad_norm": 4.931886196136475, "learning_rate": 1.9944445868571445e-05, "loss": 1.7059, "step": 115400 }, { "epoch": 0.08039776026862311, "grad_norm": 3.6294796466827393, "learning_rate": 1.9944253395942634e-05, "loss": 1.6306, "step": 115600 }, { "epoch": 0.08053685673967609, "grad_norm": 4.6588239669799805, "learning_rate": 1.9944060591803138e-05, "loss": 1.7307, "step": 115800 }, { "epoch": 0.08067595321072907, "grad_norm": 4.7038445472717285, "learning_rate": 1.9943867456162157e-05, "loss": 1.6457, "step": 116000 }, { "epoch": 0.08081504968178205, "grad_norm": 4.736629962921143, "learning_rate": 1.994367398902891e-05, "loss": 1.6645, "step": 116200 }, { "epoch": 0.08095414615283503, "grad_norm": 4.819729804992676, "learning_rate": 1.994348019041263e-05, "loss": 1.6779, "step": 116400 }, { "epoch": 0.081093242623888, "grad_norm": 3.38431453704834, "learning_rate": 1.9943286060322577e-05, "loss": 1.6987, "step": 116600 }, { "epoch": 0.08123233909494099, "grad_norm": 3.7647595405578613, "learning_rate": 1.9943091598768015e-05, "loss": 1.6974, "step": 116800 }, { "epoch": 0.08137143556599398, "grad_norm": 5.142828464508057, "learning_rate": 1.994289680575823e-05, "loss": 1.6299, "step": 117000 }, { "epoch": 0.08151053203704696, "grad_norm": 4.299983024597168, "learning_rate": 1.9942701681302515e-05, "loss": 1.6841, "step": 117200 }, { "epoch": 0.08164962850809994, "grad_norm": 3.810953140258789, "learning_rate": 1.9942506225410186e-05, "loss": 1.661, "step": 117400 }, { "epoch": 0.08178872497915292, "grad_norm": 6.211481094360352, "learning_rate": 1.9942310438090575e-05, "loss": 1.6983, "step": 117600 }, { "epoch": 0.0819278214502059, "grad_norm": 4.454127788543701, "learning_rate": 1.9942114319353036e-05, "loss": 1.6027, "step": 117800 }, { "epoch": 0.08206691792125888, "grad_norm": 5.321477890014648, "learning_rate": 1.994191786920692e-05, "loss": 1.6627, "step": 118000 }, { "epoch": 0.08220601439231186, "grad_norm": 2.7003893852233887, "learning_rate": 1.9941721087661615e-05, "loss": 1.6073, "step": 118200 }, { "epoch": 0.08234511086336484, "grad_norm": 4.267752647399902, "learning_rate": 1.9941523974726506e-05, "loss": 1.6672, "step": 118400 }, { "epoch": 0.08248420733441782, "grad_norm": 3.5116851329803467, "learning_rate": 1.9941326530411008e-05, "loss": 1.6696, "step": 118600 }, { "epoch": 0.08262330380547081, "grad_norm": 4.231411933898926, "learning_rate": 1.994112875472455e-05, "loss": 1.6839, "step": 118800 }, { "epoch": 0.08276240027652379, "grad_norm": 3.529834032058716, "learning_rate": 1.9940930647676567e-05, "loss": 1.7259, "step": 119000 }, { "epoch": 0.08290149674757677, "grad_norm": 3.043891668319702, "learning_rate": 1.994073220927652e-05, "loss": 1.6588, "step": 119200 }, { "epoch": 0.08304059321862975, "grad_norm": 3.1819875240325928, "learning_rate": 1.9940533439533884e-05, "loss": 1.6443, "step": 119400 }, { "epoch": 0.08317968968968273, "grad_norm": 4.941934108734131, "learning_rate": 1.994033433845814e-05, "loss": 1.7036, "step": 119600 }, { "epoch": 0.08331878616073571, "grad_norm": 5.279249668121338, "learning_rate": 1.9940134906058803e-05, "loss": 1.6768, "step": 119800 }, { "epoch": 0.08345788263178869, "grad_norm": 5.617332935333252, "learning_rate": 1.993993514234539e-05, "loss": 1.6257, "step": 120000 }, { "epoch": 0.08359697910284167, "grad_norm": 8.196516990661621, "learning_rate": 1.9939735047327435e-05, "loss": 1.7018, "step": 120200 }, { "epoch": 0.08373607557389465, "grad_norm": 5.056685924530029, "learning_rate": 1.9939534621014488e-05, "loss": 1.626, "step": 120400 }, { "epoch": 0.08387517204494763, "grad_norm": 3.8837344646453857, "learning_rate": 1.9939333863416124e-05, "loss": 1.6686, "step": 120600 }, { "epoch": 0.08401426851600062, "grad_norm": 4.073197841644287, "learning_rate": 1.9939132774541924e-05, "loss": 1.6336, "step": 120800 }, { "epoch": 0.0841533649870536, "grad_norm": 2.8845059871673584, "learning_rate": 1.9938931354401487e-05, "loss": 1.7264, "step": 121000 }, { "epoch": 0.08429246145810658, "grad_norm": 6.454279899597168, "learning_rate": 1.993872960300443e-05, "loss": 1.632, "step": 121200 }, { "epoch": 0.08443155792915956, "grad_norm": 3.558516263961792, "learning_rate": 1.9938527520360386e-05, "loss": 1.6358, "step": 121400 }, { "epoch": 0.08457065440021254, "grad_norm": 2.6663408279418945, "learning_rate": 1.9938325106478994e-05, "loss": 1.6713, "step": 121600 }, { "epoch": 0.08470975087126552, "grad_norm": 2.0985066890716553, "learning_rate": 1.9938122361369925e-05, "loss": 1.6949, "step": 121800 }, { "epoch": 0.0848488473423185, "grad_norm": 7.359683036804199, "learning_rate": 1.993791928504286e-05, "loss": 1.686, "step": 122000 }, { "epoch": 0.08498794381337148, "grad_norm": 4.560733795166016, "learning_rate": 1.9937715877507483e-05, "loss": 1.6698, "step": 122200 }, { "epoch": 0.08512704028442446, "grad_norm": 9.444819450378418, "learning_rate": 1.9937512138773517e-05, "loss": 1.7594, "step": 122400 }, { "epoch": 0.08526613675547745, "grad_norm": 4.619702339172363, "learning_rate": 1.9937308068850675e-05, "loss": 1.6439, "step": 122600 }, { "epoch": 0.08540523322653043, "grad_norm": 4.782458305358887, "learning_rate": 1.993710366774871e-05, "loss": 1.69, "step": 122800 }, { "epoch": 0.08554432969758341, "grad_norm": 3.538214683532715, "learning_rate": 1.993689893547738e-05, "loss": 1.6316, "step": 123000 }, { "epoch": 0.08568342616863639, "grad_norm": 4.764087677001953, "learning_rate": 1.9936693872046446e-05, "loss": 1.6292, "step": 123200 }, { "epoch": 0.08582252263968937, "grad_norm": 2.1407108306884766, "learning_rate": 1.993648847746571e-05, "loss": 1.6813, "step": 123400 }, { "epoch": 0.08596161911074235, "grad_norm": 4.375445365905762, "learning_rate": 1.9936282751744978e-05, "loss": 1.658, "step": 123600 }, { "epoch": 0.08610071558179533, "grad_norm": 2.881798267364502, "learning_rate": 1.993607669489406e-05, "loss": 1.7203, "step": 123800 }, { "epoch": 0.08623981205284831, "grad_norm": 4.478676795959473, "learning_rate": 1.9935870306922803e-05, "loss": 1.7229, "step": 124000 }, { "epoch": 0.08637890852390129, "grad_norm": 3.7552738189697266, "learning_rate": 1.9935663587841057e-05, "loss": 1.6726, "step": 124200 }, { "epoch": 0.08651800499495428, "grad_norm": 5.035639762878418, "learning_rate": 1.9935456537658685e-05, "loss": 1.6308, "step": 124400 }, { "epoch": 0.08665710146600726, "grad_norm": 8.277714729309082, "learning_rate": 1.9935249156385578e-05, "loss": 1.645, "step": 124600 }, { "epoch": 0.08679619793706024, "grad_norm": 2.7229197025299072, "learning_rate": 1.9935041444031636e-05, "loss": 1.6117, "step": 124800 }, { "epoch": 0.08693529440811322, "grad_norm": 4.553562641143799, "learning_rate": 1.993483340060677e-05, "loss": 1.6732, "step": 125000 }, { "epoch": 0.0870743908791662, "grad_norm": 9.091089248657227, "learning_rate": 1.9934625026120916e-05, "loss": 1.6204, "step": 125200 }, { "epoch": 0.08721348735021918, "grad_norm": 5.536111831665039, "learning_rate": 1.9934416320584024e-05, "loss": 1.6774, "step": 125400 }, { "epoch": 0.08735258382127216, "grad_norm": 4.71929407119751, "learning_rate": 1.9934207284006053e-05, "loss": 1.6723, "step": 125600 }, { "epoch": 0.08749168029232514, "grad_norm": 3.918471336364746, "learning_rate": 1.993399791639698e-05, "loss": 1.6483, "step": 125800 }, { "epoch": 0.08763077676337812, "grad_norm": 4.748182773590088, "learning_rate": 1.99337882177668e-05, "loss": 1.6951, "step": 126000 }, { "epoch": 0.08776987323443111, "grad_norm": 5.15871524810791, "learning_rate": 1.993357818812553e-05, "loss": 1.6363, "step": 126200 }, { "epoch": 0.08790896970548409, "grad_norm": 3.576403856277466, "learning_rate": 1.9933367827483194e-05, "loss": 1.7353, "step": 126400 }, { "epoch": 0.08804806617653707, "grad_norm": 2.5834105014801025, "learning_rate": 1.9933157135849838e-05, "loss": 1.7252, "step": 126600 }, { "epoch": 0.08818716264759005, "grad_norm": 6.441946983337402, "learning_rate": 1.9932946113235507e-05, "loss": 1.7092, "step": 126800 }, { "epoch": 0.08832625911864303, "grad_norm": 3.035785436630249, "learning_rate": 1.9932734759650287e-05, "loss": 1.6231, "step": 127000 }, { "epoch": 0.08846535558969601, "grad_norm": 4.124936103820801, "learning_rate": 1.9932523075104266e-05, "loss": 1.6873, "step": 127200 }, { "epoch": 0.08860445206074899, "grad_norm": 3.943493366241455, "learning_rate": 1.993231105960755e-05, "loss": 1.7114, "step": 127400 }, { "epoch": 0.08874354853180197, "grad_norm": 5.265536785125732, "learning_rate": 1.9932098713170255e-05, "loss": 1.6584, "step": 127600 }, { "epoch": 0.08888264500285495, "grad_norm": 6.496641159057617, "learning_rate": 1.993188603580252e-05, "loss": 1.7036, "step": 127800 }, { "epoch": 0.08902174147390794, "grad_norm": 3.9704132080078125, "learning_rate": 1.9931673027514504e-05, "loss": 1.62, "step": 128000 }, { "epoch": 0.08916083794496092, "grad_norm": 3.230464220046997, "learning_rate": 1.9931459688316376e-05, "loss": 1.6721, "step": 128200 }, { "epoch": 0.0892999344160139, "grad_norm": 3.8911783695220947, "learning_rate": 1.9931246018218306e-05, "loss": 1.6456, "step": 128400 }, { "epoch": 0.08943903088706688, "grad_norm": 4.141741752624512, "learning_rate": 1.993103201723051e-05, "loss": 1.6524, "step": 128600 }, { "epoch": 0.08957812735811986, "grad_norm": 6.229700088500977, "learning_rate": 1.9930817685363196e-05, "loss": 1.646, "step": 128800 }, { "epoch": 0.08971722382917284, "grad_norm": 4.1471781730651855, "learning_rate": 1.9930603022626598e-05, "loss": 1.686, "step": 129000 }, { "epoch": 0.08985632030022582, "grad_norm": 3.428189277648926, "learning_rate": 1.993038802903097e-05, "loss": 1.6643, "step": 129200 }, { "epoch": 0.0899954167712788, "grad_norm": 3.0590615272521973, "learning_rate": 1.993017270458656e-05, "loss": 1.7002, "step": 129400 }, { "epoch": 0.09013451324233178, "grad_norm": 6.491008758544922, "learning_rate": 1.9929957049303663e-05, "loss": 1.6552, "step": 129600 }, { "epoch": 0.09027360971338477, "grad_norm": 4.609399795532227, "learning_rate": 1.9929741063192573e-05, "loss": 1.6732, "step": 129800 }, { "epoch": 0.09041270618443775, "grad_norm": 5.212355136871338, "learning_rate": 1.992952474626359e-05, "loss": 1.6594, "step": 130000 }, { "epoch": 0.09055180265549073, "grad_norm": 4.249414920806885, "learning_rate": 1.9929308098527046e-05, "loss": 1.6839, "step": 130200 }, { "epoch": 0.09069089912654371, "grad_norm": 5.197662353515625, "learning_rate": 1.9929091119993287e-05, "loss": 1.6776, "step": 130400 }, { "epoch": 0.09082999559759669, "grad_norm": 2.6278629302978516, "learning_rate": 1.992887381067267e-05, "loss": 1.7051, "step": 130600 }, { "epoch": 0.09096909206864967, "grad_norm": 6.805337429046631, "learning_rate": 1.9928656170575563e-05, "loss": 1.6536, "step": 130800 }, { "epoch": 0.09110818853970265, "grad_norm": 5.952248573303223, "learning_rate": 1.9928438199712366e-05, "loss": 1.6415, "step": 131000 }, { "epoch": 0.09124728501075563, "grad_norm": 4.186747074127197, "learning_rate": 1.9928219898093478e-05, "loss": 1.6267, "step": 131200 }, { "epoch": 0.09138638148180861, "grad_norm": 3.9875595569610596, "learning_rate": 1.9928001265729322e-05, "loss": 1.6717, "step": 131400 }, { "epoch": 0.0915254779528616, "grad_norm": 4.136226654052734, "learning_rate": 1.9927782302630337e-05, "loss": 1.6396, "step": 131600 }, { "epoch": 0.09166457442391458, "grad_norm": 4.574390411376953, "learning_rate": 1.992756300880697e-05, "loss": 1.6513, "step": 131800 }, { "epoch": 0.09180367089496756, "grad_norm": 5.300370693206787, "learning_rate": 1.9927343384269692e-05, "loss": 1.6643, "step": 132000 }, { "epoch": 0.09194276736602054, "grad_norm": 5.590688705444336, "learning_rate": 1.9927123429028995e-05, "loss": 1.6855, "step": 132200 }, { "epoch": 0.09208186383707352, "grad_norm": 5.214639186859131, "learning_rate": 1.9926903143095373e-05, "loss": 1.6301, "step": 132400 }, { "epoch": 0.0922209603081265, "grad_norm": 5.071218490600586, "learning_rate": 1.992668252647934e-05, "loss": 1.7033, "step": 132600 }, { "epoch": 0.09236005677917948, "grad_norm": 10.156872749328613, "learning_rate": 1.9926461579191436e-05, "loss": 1.6589, "step": 132800 }, { "epoch": 0.09249915325023246, "grad_norm": 5.648166179656982, "learning_rate": 1.9926240301242198e-05, "loss": 1.5749, "step": 133000 }, { "epoch": 0.09263824972128544, "grad_norm": 3.7916784286499023, "learning_rate": 1.99260186926422e-05, "loss": 1.7162, "step": 133200 }, { "epoch": 0.09277734619233843, "grad_norm": 6.712701320648193, "learning_rate": 1.9925796753402017e-05, "loss": 1.6956, "step": 133400 }, { "epoch": 0.09291644266339141, "grad_norm": 4.044628143310547, "learning_rate": 1.9925574483532248e-05, "loss": 1.6459, "step": 133600 }, { "epoch": 0.09305553913444439, "grad_norm": 3.9863102436065674, "learning_rate": 1.9925351883043492e-05, "loss": 1.6497, "step": 133800 }, { "epoch": 0.09319463560549737, "grad_norm": 4.047595024108887, "learning_rate": 1.9925128951946387e-05, "loss": 1.6704, "step": 134000 }, { "epoch": 0.09333373207655035, "grad_norm": 7.6964216232299805, "learning_rate": 1.9924905690251574e-05, "loss": 1.679, "step": 134200 }, { "epoch": 0.09347282854760333, "grad_norm": 5.07405424118042, "learning_rate": 1.9924682097969708e-05, "loss": 1.6106, "step": 134400 }, { "epoch": 0.09361192501865631, "grad_norm": 3.565641164779663, "learning_rate": 1.9924458175111462e-05, "loss": 1.672, "step": 134600 }, { "epoch": 0.09375102148970929, "grad_norm": 4.344285011291504, "learning_rate": 1.992423392168753e-05, "loss": 1.611, "step": 134800 }, { "epoch": 0.09389011796076227, "grad_norm": 3.279279947280884, "learning_rate": 1.9924009337708616e-05, "loss": 1.6553, "step": 135000 }, { "epoch": 0.09402921443181526, "grad_norm": 2.899543046951294, "learning_rate": 1.9923784423185443e-05, "loss": 1.6453, "step": 135200 }, { "epoch": 0.09416831090286824, "grad_norm": 3.4522011280059814, "learning_rate": 1.9923559178128743e-05, "loss": 1.6669, "step": 135400 }, { "epoch": 0.09430740737392122, "grad_norm": 4.811175346374512, "learning_rate": 1.9923333602549274e-05, "loss": 1.6536, "step": 135600 }, { "epoch": 0.0944465038449742, "grad_norm": 5.7362470626831055, "learning_rate": 1.99231076964578e-05, "loss": 1.6271, "step": 135800 }, { "epoch": 0.09458560031602718, "grad_norm": 3.7457661628723145, "learning_rate": 1.9922881459865114e-05, "loss": 1.6952, "step": 136000 }, { "epoch": 0.09472469678708016, "grad_norm": 4.152240753173828, "learning_rate": 1.9922654892782006e-05, "loss": 1.6562, "step": 136200 }, { "epoch": 0.09486379325813314, "grad_norm": 6.101974964141846, "learning_rate": 1.99224279952193e-05, "loss": 1.6818, "step": 136400 }, { "epoch": 0.09500288972918612, "grad_norm": 5.957167625427246, "learning_rate": 1.9922200767187824e-05, "loss": 1.5804, "step": 136600 }, { "epoch": 0.0951419862002391, "grad_norm": 3.512366771697998, "learning_rate": 1.9921973208698425e-05, "loss": 1.6287, "step": 136800 }, { "epoch": 0.0952810826712921, "grad_norm": 5.7234320640563965, "learning_rate": 1.992174531976197e-05, "loss": 1.6764, "step": 137000 }, { "epoch": 0.09542017914234507, "grad_norm": 4.115810871124268, "learning_rate": 1.9921517100389334e-05, "loss": 1.6838, "step": 137200 }, { "epoch": 0.09555927561339805, "grad_norm": 6.415590763092041, "learning_rate": 1.992128855059141e-05, "loss": 1.6411, "step": 137400 }, { "epoch": 0.09569837208445103, "grad_norm": 7.095560073852539, "learning_rate": 1.9921059670379123e-05, "loss": 1.6978, "step": 137600 }, { "epoch": 0.09583746855550401, "grad_norm": 6.347951889038086, "learning_rate": 1.9920830459763378e-05, "loss": 1.7948, "step": 137800 }, { "epoch": 0.09597656502655699, "grad_norm": 5.269043445587158, "learning_rate": 1.9920600918755134e-05, "loss": 1.6336, "step": 138000 }, { "epoch": 0.09611566149760997, "grad_norm": 2.732041597366333, "learning_rate": 1.992037104736534e-05, "loss": 1.6809, "step": 138200 }, { "epoch": 0.09625475796866295, "grad_norm": 3.8831331729888916, "learning_rate": 1.9920140845604977e-05, "loss": 1.7038, "step": 138400 }, { "epoch": 0.09639385443971593, "grad_norm": 3.255678415298462, "learning_rate": 1.9919910313485026e-05, "loss": 1.6552, "step": 138600 }, { "epoch": 0.09653295091076892, "grad_norm": 3.9140212535858154, "learning_rate": 1.9919679451016504e-05, "loss": 1.6221, "step": 138800 }, { "epoch": 0.0966720473818219, "grad_norm": 3.5831167697906494, "learning_rate": 1.9919448258210417e-05, "loss": 1.6112, "step": 139000 }, { "epoch": 0.09681114385287488, "grad_norm": 7.086493968963623, "learning_rate": 1.9919216735077812e-05, "loss": 1.6798, "step": 139200 }, { "epoch": 0.09695024032392786, "grad_norm": 3.7616541385650635, "learning_rate": 1.991898488162974e-05, "loss": 1.6874, "step": 139400 }, { "epoch": 0.09708933679498084, "grad_norm": 3.6000430583953857, "learning_rate": 1.9918752697877274e-05, "loss": 1.6736, "step": 139600 }, { "epoch": 0.09722843326603382, "grad_norm": 3.622159242630005, "learning_rate": 1.9918520183831487e-05, "loss": 1.6773, "step": 139800 }, { "epoch": 0.0973675297370868, "grad_norm": 4.9817118644714355, "learning_rate": 1.9918287339503487e-05, "loss": 1.6567, "step": 140000 }, { "epoch": 0.09750662620813978, "grad_norm": 6.518868923187256, "learning_rate": 1.9918054164904385e-05, "loss": 1.6965, "step": 140200 }, { "epoch": 0.09764572267919276, "grad_norm": 0.4949752986431122, "learning_rate": 1.991782066004532e-05, "loss": 1.6403, "step": 140400 }, { "epoch": 0.09778481915024576, "grad_norm": 4.795136451721191, "learning_rate": 1.9917586824937434e-05, "loss": 1.6665, "step": 140600 }, { "epoch": 0.09792391562129873, "grad_norm": 3.7141456604003906, "learning_rate": 1.991735265959189e-05, "loss": 1.6325, "step": 140800 }, { "epoch": 0.09806301209235171, "grad_norm": 4.222114086151123, "learning_rate": 1.9917118164019865e-05, "loss": 1.6767, "step": 141000 }, { "epoch": 0.0982021085634047, "grad_norm": 4.53550910949707, "learning_rate": 1.991688333823256e-05, "loss": 1.6214, "step": 141200 }, { "epoch": 0.09834120503445767, "grad_norm": 7.71623420715332, "learning_rate": 1.991664818224118e-05, "loss": 1.6656, "step": 141400 }, { "epoch": 0.09848030150551065, "grad_norm": 3.8144173622131348, "learning_rate": 1.991641269605695e-05, "loss": 1.6721, "step": 141600 }, { "epoch": 0.09861939797656363, "grad_norm": 5.038686275482178, "learning_rate": 1.9916176879691114e-05, "loss": 1.6655, "step": 141800 }, { "epoch": 0.09875849444761661, "grad_norm": 5.067854881286621, "learning_rate": 1.9915940733154934e-05, "loss": 1.7198, "step": 142000 }, { "epoch": 0.09889759091866959, "grad_norm": 4.443403720855713, "learning_rate": 1.991570425645968e-05, "loss": 1.67, "step": 142200 }, { "epoch": 0.09903668738972259, "grad_norm": 3.2373218536376953, "learning_rate": 1.991546744961663e-05, "loss": 1.6561, "step": 142400 }, { "epoch": 0.09917578386077557, "grad_norm": 3.6027562618255615, "learning_rate": 1.9915230312637107e-05, "loss": 1.7026, "step": 142600 }, { "epoch": 0.09931488033182855, "grad_norm": 4.93851375579834, "learning_rate": 1.991499284553242e-05, "loss": 1.641, "step": 142800 }, { "epoch": 0.09945397680288152, "grad_norm": 3.5570874214172363, "learning_rate": 1.9914755048313914e-05, "loss": 1.6767, "step": 143000 }, { "epoch": 0.0995930732739345, "grad_norm": 3.8586952686309814, "learning_rate": 1.991451692099293e-05, "loss": 1.6606, "step": 143200 }, { "epoch": 0.09973216974498748, "grad_norm": 4.373999118804932, "learning_rate": 1.991427846358085e-05, "loss": 1.7067, "step": 143400 }, { "epoch": 0.09987126621604046, "grad_norm": 3.3770077228546143, "learning_rate": 1.9914039676089047e-05, "loss": 1.6772, "step": 143600 }, { "epoch": 0.10001036268709344, "grad_norm": 5.1856231689453125, "learning_rate": 1.9913800558528923e-05, "loss": 1.6616, "step": 143800 }, { "epoch": 0.10014945915814642, "grad_norm": 4.421877861022949, "learning_rate": 1.9913561110911895e-05, "loss": 1.6897, "step": 144000 }, { "epoch": 0.10028855562919942, "grad_norm": 4.459238529205322, "learning_rate": 1.991332133324939e-05, "loss": 1.6637, "step": 144200 }, { "epoch": 0.1004276521002524, "grad_norm": 6.31149959564209, "learning_rate": 1.991308122555286e-05, "loss": 1.7043, "step": 144400 }, { "epoch": 0.10056674857130538, "grad_norm": 6.5246968269348145, "learning_rate": 1.9912840787833763e-05, "loss": 1.6994, "step": 144600 }, { "epoch": 0.10070584504235836, "grad_norm": 3.816343069076538, "learning_rate": 1.9912600020103578e-05, "loss": 1.6694, "step": 144800 }, { "epoch": 0.10084494151341133, "grad_norm": 7.002806663513184, "learning_rate": 1.99123589223738e-05, "loss": 1.6703, "step": 145000 }, { "epoch": 0.10098403798446431, "grad_norm": 10.060843467712402, "learning_rate": 1.991211749465594e-05, "loss": 1.6675, "step": 145200 }, { "epoch": 0.1011231344555173, "grad_norm": 4.098082542419434, "learning_rate": 1.991187573696152e-05, "loss": 1.6776, "step": 145400 }, { "epoch": 0.10126223092657027, "grad_norm": 3.1275992393493652, "learning_rate": 1.9911633649302085e-05, "loss": 1.6661, "step": 145600 }, { "epoch": 0.10140132739762325, "grad_norm": 5.146686553955078, "learning_rate": 1.9911391231689192e-05, "loss": 1.6658, "step": 145800 }, { "epoch": 0.10154042386867623, "grad_norm": 5.116509437561035, "learning_rate": 1.991114848413441e-05, "loss": 1.6676, "step": 146000 }, { "epoch": 0.10167952033972923, "grad_norm": 6.305995464324951, "learning_rate": 1.9910905406649327e-05, "loss": 1.6426, "step": 146200 }, { "epoch": 0.1018186168107822, "grad_norm": 4.751062393188477, "learning_rate": 1.9910661999245556e-05, "loss": 1.644, "step": 146400 }, { "epoch": 0.10195771328183519, "grad_norm": 4.653252601623535, "learning_rate": 1.9910418261934708e-05, "loss": 1.6646, "step": 146600 }, { "epoch": 0.10209680975288817, "grad_norm": 2.5268571376800537, "learning_rate": 1.9910174194728423e-05, "loss": 1.6318, "step": 146800 }, { "epoch": 0.10223590622394114, "grad_norm": 3.8986382484436035, "learning_rate": 1.990992979763835e-05, "loss": 1.6516, "step": 147000 }, { "epoch": 0.10237500269499412, "grad_norm": 3.6918115615844727, "learning_rate": 1.9909685070676157e-05, "loss": 1.6711, "step": 147200 }, { "epoch": 0.1025140991660471, "grad_norm": 5.962430477142334, "learning_rate": 1.9909440013853528e-05, "loss": 1.6529, "step": 147400 }, { "epoch": 0.10265319563710008, "grad_norm": 3.823061227798462, "learning_rate": 1.9909194627182157e-05, "loss": 1.6611, "step": 147600 }, { "epoch": 0.10279229210815306, "grad_norm": 4.465606212615967, "learning_rate": 1.9908948910673765e-05, "loss": 1.7157, "step": 147800 }, { "epoch": 0.10293138857920606, "grad_norm": 5.531731128692627, "learning_rate": 1.990870286434008e-05, "loss": 1.6073, "step": 148000 }, { "epoch": 0.10307048505025904, "grad_norm": 4.213583946228027, "learning_rate": 1.9908456488192847e-05, "loss": 1.7168, "step": 148200 }, { "epoch": 0.10320958152131202, "grad_norm": 3.7597150802612305, "learning_rate": 1.9908209782243832e-05, "loss": 1.6871, "step": 148400 }, { "epoch": 0.103348677992365, "grad_norm": 5.788958549499512, "learning_rate": 1.9907962746504806e-05, "loss": 1.642, "step": 148600 }, { "epoch": 0.10348777446341798, "grad_norm": 7.341514587402344, "learning_rate": 1.9907715380987566e-05, "loss": 1.6443, "step": 148800 }, { "epoch": 0.10362687093447095, "grad_norm": 4.443752765655518, "learning_rate": 1.9907467685703917e-05, "loss": 1.6884, "step": 149000 }, { "epoch": 0.10376596740552393, "grad_norm": 5.186500549316406, "learning_rate": 1.990721966066569e-05, "loss": 1.6674, "step": 149200 }, { "epoch": 0.10390506387657691, "grad_norm": 4.551130771636963, "learning_rate": 1.990697130588472e-05, "loss": 1.6119, "step": 149400 }, { "epoch": 0.1040441603476299, "grad_norm": 9.415721893310547, "learning_rate": 1.9906722621372867e-05, "loss": 1.63, "step": 149600 }, { "epoch": 0.10418325681868289, "grad_norm": 4.156707286834717, "learning_rate": 1.9906473607142e-05, "loss": 1.6697, "step": 149800 }, { "epoch": 0.10432235328973587, "grad_norm": 4.572336196899414, "learning_rate": 1.990622426320401e-05, "loss": 1.6057, "step": 150000 }, { "epoch": 0.10446144976078885, "grad_norm": 2.854973077774048, "learning_rate": 1.9905974589570792e-05, "loss": 1.6366, "step": 150200 }, { "epoch": 0.10460054623184183, "grad_norm": 4.791712760925293, "learning_rate": 1.990572458625428e-05, "loss": 1.654, "step": 150400 }, { "epoch": 0.1047396427028948, "grad_norm": 5.711724758148193, "learning_rate": 1.99054742532664e-05, "loss": 1.6476, "step": 150600 }, { "epoch": 0.10487873917394779, "grad_norm": 4.505434036254883, "learning_rate": 1.9905223590619096e-05, "loss": 1.6667, "step": 150800 }, { "epoch": 0.10501783564500076, "grad_norm": 4.14932107925415, "learning_rate": 1.9904972598324345e-05, "loss": 1.6195, "step": 151000 }, { "epoch": 0.10515693211605374, "grad_norm": 3.72538161277771, "learning_rate": 1.9904721276394122e-05, "loss": 1.6965, "step": 151200 }, { "epoch": 0.10529602858710672, "grad_norm": 2.7313473224639893, "learning_rate": 1.990446962484043e-05, "loss": 1.6402, "step": 151400 }, { "epoch": 0.10543512505815972, "grad_norm": 10.523667335510254, "learning_rate": 1.9904217643675287e-05, "loss": 1.6339, "step": 151600 }, { "epoch": 0.1055742215292127, "grad_norm": 4.01319694519043, "learning_rate": 1.9903965332910706e-05, "loss": 1.6149, "step": 151800 }, { "epoch": 0.10571331800026568, "grad_norm": 4.70330810546875, "learning_rate": 1.990371269255875e-05, "loss": 1.6127, "step": 152000 }, { "epoch": 0.10585241447131866, "grad_norm": 4.441647052764893, "learning_rate": 1.9903459722631466e-05, "loss": 1.6897, "step": 152200 }, { "epoch": 0.10599151094237164, "grad_norm": 6.007661819458008, "learning_rate": 1.9903206423140936e-05, "loss": 1.6853, "step": 152400 }, { "epoch": 0.10613060741342462, "grad_norm": 5.524699687957764, "learning_rate": 1.9902952794099257e-05, "loss": 1.6925, "step": 152600 }, { "epoch": 0.1062697038844776, "grad_norm": 4.7537665367126465, "learning_rate": 1.9902698835518533e-05, "loss": 1.6677, "step": 152800 }, { "epoch": 0.10640880035553058, "grad_norm": 8.036478996276855, "learning_rate": 1.9902444547410883e-05, "loss": 1.6759, "step": 153000 }, { "epoch": 0.10654789682658355, "grad_norm": 3.9966869354248047, "learning_rate": 1.9902189929788453e-05, "loss": 1.6993, "step": 153200 }, { "epoch": 0.10668699329763655, "grad_norm": 1.9982924461364746, "learning_rate": 1.9901934982663393e-05, "loss": 1.6157, "step": 153400 }, { "epoch": 0.10682608976868953, "grad_norm": 3.47050142288208, "learning_rate": 1.990167970604788e-05, "loss": 1.6414, "step": 153600 }, { "epoch": 0.10696518623974251, "grad_norm": 5.552868366241455, "learning_rate": 1.9901424099954094e-05, "loss": 1.7033, "step": 153800 }, { "epoch": 0.10710428271079549, "grad_norm": 4.4384565353393555, "learning_rate": 1.9901168164394242e-05, "loss": 1.7138, "step": 154000 }, { "epoch": 0.10724337918184847, "grad_norm": 3.6611669063568115, "learning_rate": 1.990091189938054e-05, "loss": 1.6361, "step": 154200 }, { "epoch": 0.10738247565290145, "grad_norm": 4.8382368087768555, "learning_rate": 1.9900655304925225e-05, "loss": 1.6241, "step": 154400 }, { "epoch": 0.10752157212395443, "grad_norm": 5.479028224945068, "learning_rate": 1.990039838104054e-05, "loss": 1.6736, "step": 154600 }, { "epoch": 0.1076606685950074, "grad_norm": 3.484339475631714, "learning_rate": 1.9900141127738757e-05, "loss": 1.6641, "step": 154800 }, { "epoch": 0.10779976506606039, "grad_norm": 4.040769577026367, "learning_rate": 1.989988354503215e-05, "loss": 1.6383, "step": 155000 }, { "epoch": 0.10793886153711338, "grad_norm": 4.502687931060791, "learning_rate": 1.9899625632933027e-05, "loss": 1.683, "step": 155200 }, { "epoch": 0.10807795800816636, "grad_norm": 5.479760646820068, "learning_rate": 1.989936739145369e-05, "loss": 1.6648, "step": 155400 }, { "epoch": 0.10821705447921934, "grad_norm": 5.040559768676758, "learning_rate": 1.989910882060647e-05, "loss": 1.6041, "step": 155600 }, { "epoch": 0.10835615095027232, "grad_norm": 4.883344650268555, "learning_rate": 1.9898849920403708e-05, "loss": 1.7287, "step": 155800 }, { "epoch": 0.1084952474213253, "grad_norm": 2.35851788520813, "learning_rate": 1.9898590690857774e-05, "loss": 1.6952, "step": 156000 }, { "epoch": 0.10863434389237828, "grad_norm": 4.935690879821777, "learning_rate": 1.989833113198103e-05, "loss": 1.6755, "step": 156200 }, { "epoch": 0.10877344036343126, "grad_norm": 4.803808689117432, "learning_rate": 1.9898071243785876e-05, "loss": 1.6358, "step": 156400 }, { "epoch": 0.10891253683448424, "grad_norm": 4.6056437492370605, "learning_rate": 1.9897811026284718e-05, "loss": 1.6624, "step": 156600 }, { "epoch": 0.10905163330553722, "grad_norm": 2.542036294937134, "learning_rate": 1.989755047948997e-05, "loss": 1.6909, "step": 156800 }, { "epoch": 0.10919072977659021, "grad_norm": 4.400025844573975, "learning_rate": 1.989728960341408e-05, "loss": 1.6712, "step": 157000 }, { "epoch": 0.10932982624764319, "grad_norm": 4.760671138763428, "learning_rate": 1.9897028398069503e-05, "loss": 1.6264, "step": 157200 }, { "epoch": 0.10946892271869617, "grad_norm": 5.346454620361328, "learning_rate": 1.98967668634687e-05, "loss": 1.6852, "step": 157400 }, { "epoch": 0.10960801918974915, "grad_norm": 5.924408912658691, "learning_rate": 1.9896504999624163e-05, "loss": 1.652, "step": 157600 }, { "epoch": 0.10974711566080213, "grad_norm": 4.676943778991699, "learning_rate": 1.989624280654839e-05, "loss": 1.6512, "step": 157800 }, { "epoch": 0.10988621213185511, "grad_norm": 5.802811145782471, "learning_rate": 1.98959802842539e-05, "loss": 1.6289, "step": 158000 }, { "epoch": 0.11002530860290809, "grad_norm": 3.330751657485962, "learning_rate": 1.9895717432753222e-05, "loss": 1.66, "step": 158200 }, { "epoch": 0.11016440507396107, "grad_norm": 4.629538059234619, "learning_rate": 1.9895454252058903e-05, "loss": 1.6574, "step": 158400 }, { "epoch": 0.11030350154501405, "grad_norm": 3.5124754905700684, "learning_rate": 1.9895190742183518e-05, "loss": 1.6347, "step": 158600 }, { "epoch": 0.11044259801606704, "grad_norm": 8.707324981689453, "learning_rate": 1.9894926903139633e-05, "loss": 1.6752, "step": 158800 }, { "epoch": 0.11058169448712002, "grad_norm": 4.8549981117248535, "learning_rate": 1.9894662734939847e-05, "loss": 1.6396, "step": 159000 }, { "epoch": 0.110720790958173, "grad_norm": 3.1186676025390625, "learning_rate": 1.989439823759678e-05, "loss": 1.664, "step": 159200 }, { "epoch": 0.11085988742922598, "grad_norm": 7.87991189956665, "learning_rate": 1.9894133411123047e-05, "loss": 1.6081, "step": 159400 }, { "epoch": 0.11099898390027896, "grad_norm": 2.4709787368774414, "learning_rate": 1.9893868255531295e-05, "loss": 1.644, "step": 159600 }, { "epoch": 0.11113808037133194, "grad_norm": 6.009008407592773, "learning_rate": 1.989360277083419e-05, "loss": 1.6496, "step": 159800 }, { "epoch": 0.11127717684238492, "grad_norm": 3.176150321960449, "learning_rate": 1.9893336957044394e-05, "loss": 1.725, "step": 160000 }, { "epoch": 0.1114162733134379, "grad_norm": 4.720170021057129, "learning_rate": 1.9893070814174604e-05, "loss": 1.663, "step": 160200 }, { "epoch": 0.11155536978449088, "grad_norm": 5.095078468322754, "learning_rate": 1.9892804342237518e-05, "loss": 1.6613, "step": 160400 }, { "epoch": 0.11169446625554387, "grad_norm": 3.53840708732605, "learning_rate": 1.9892537541245865e-05, "loss": 1.6817, "step": 160600 }, { "epoch": 0.11183356272659685, "grad_norm": 3.987409830093384, "learning_rate": 1.989227041121238e-05, "loss": 1.6603, "step": 160800 }, { "epoch": 0.11197265919764983, "grad_norm": 6.548856735229492, "learning_rate": 1.9892002952149815e-05, "loss": 1.6185, "step": 161000 }, { "epoch": 0.11211175566870281, "grad_norm": 6.083203315734863, "learning_rate": 1.989173516407093e-05, "loss": 1.6531, "step": 161200 }, { "epoch": 0.11225085213975579, "grad_norm": 4.503881931304932, "learning_rate": 1.9891467046988523e-05, "loss": 1.6681, "step": 161400 }, { "epoch": 0.11238994861080877, "grad_norm": 4.560178279876709, "learning_rate": 1.9891198600915383e-05, "loss": 1.6566, "step": 161600 }, { "epoch": 0.11252904508186175, "grad_norm": 4.399378299713135, "learning_rate": 1.9890929825864332e-05, "loss": 1.6344, "step": 161800 }, { "epoch": 0.11266814155291473, "grad_norm": 4.852181911468506, "learning_rate": 1.98906607218482e-05, "loss": 1.6884, "step": 162000 }, { "epoch": 0.11280723802396771, "grad_norm": 5.365102291107178, "learning_rate": 1.9890391288879828e-05, "loss": 1.6742, "step": 162200 }, { "epoch": 0.1129463344950207, "grad_norm": 4.318151473999023, "learning_rate": 1.989012152697208e-05, "loss": 1.7019, "step": 162400 }, { "epoch": 0.11308543096607368, "grad_norm": 4.111001491546631, "learning_rate": 1.988985143613784e-05, "loss": 1.6992, "step": 162600 }, { "epoch": 0.11322452743712666, "grad_norm": 3.2463414669036865, "learning_rate": 1.9889581016389998e-05, "loss": 1.687, "step": 162800 }, { "epoch": 0.11336362390817964, "grad_norm": 5.382221698760986, "learning_rate": 1.9889310267741467e-05, "loss": 1.6845, "step": 163000 }, { "epoch": 0.11350272037923262, "grad_norm": 3.3540163040161133, "learning_rate": 1.9889039190205164e-05, "loss": 1.64, "step": 163200 }, { "epoch": 0.1136418168502856, "grad_norm": 5.109584808349609, "learning_rate": 1.9888767783794035e-05, "loss": 1.6635, "step": 163400 }, { "epoch": 0.11378091332133858, "grad_norm": 4.127386093139648, "learning_rate": 1.9888496048521042e-05, "loss": 1.6542, "step": 163600 }, { "epoch": 0.11392000979239156, "grad_norm": 5.195967197418213, "learning_rate": 1.9888223984399143e-05, "loss": 1.6009, "step": 163800 }, { "epoch": 0.11405910626344454, "grad_norm": 3.4713029861450195, "learning_rate": 1.988795159144134e-05, "loss": 1.631, "step": 164000 }, { "epoch": 0.11419820273449753, "grad_norm": 3.14530086517334, "learning_rate": 1.9887678869660635e-05, "loss": 1.705, "step": 164200 }, { "epoch": 0.11433729920555051, "grad_norm": 5.993960857391357, "learning_rate": 1.988740581907004e-05, "loss": 1.6979, "step": 164400 }, { "epoch": 0.11447639567660349, "grad_norm": 3.825427532196045, "learning_rate": 1.9887132439682598e-05, "loss": 1.638, "step": 164600 }, { "epoch": 0.11461549214765647, "grad_norm": 3.7680768966674805, "learning_rate": 1.988685873151135e-05, "loss": 1.6357, "step": 164800 }, { "epoch": 0.11475458861870945, "grad_norm": 6.444240093231201, "learning_rate": 1.988658469456937e-05, "loss": 1.6663, "step": 165000 }, { "epoch": 0.11489368508976243, "grad_norm": 4.023960113525391, "learning_rate": 1.988631032886974e-05, "loss": 1.6737, "step": 165200 }, { "epoch": 0.11503278156081541, "grad_norm": 5.864124298095703, "learning_rate": 1.9886035634425558e-05, "loss": 1.6105, "step": 165400 }, { "epoch": 0.11517187803186839, "grad_norm": 5.171482086181641, "learning_rate": 1.9885760611249935e-05, "loss": 1.6624, "step": 165600 }, { "epoch": 0.11531097450292137, "grad_norm": 4.948298931121826, "learning_rate": 1.9885485259356e-05, "loss": 1.7219, "step": 165800 }, { "epoch": 0.11545007097397436, "grad_norm": 4.3434600830078125, "learning_rate": 1.9885209578756903e-05, "loss": 1.648, "step": 166000 }, { "epoch": 0.11558916744502734, "grad_norm": 2.8798773288726807, "learning_rate": 1.9884933569465797e-05, "loss": 1.6342, "step": 166200 }, { "epoch": 0.11572826391608032, "grad_norm": 4.3150858879089355, "learning_rate": 1.9884657231495868e-05, "loss": 1.6727, "step": 166400 }, { "epoch": 0.1158673603871333, "grad_norm": 3.4034597873687744, "learning_rate": 1.9884380564860297e-05, "loss": 1.6918, "step": 166600 }, { "epoch": 0.11600645685818628, "grad_norm": 3.551673650741577, "learning_rate": 1.9884103569572303e-05, "loss": 1.5462, "step": 166800 }, { "epoch": 0.11614555332923926, "grad_norm": 4.2407426834106445, "learning_rate": 1.9883826245645096e-05, "loss": 1.6682, "step": 167000 }, { "epoch": 0.11628464980029224, "grad_norm": 4.91990852355957, "learning_rate": 1.988354859309193e-05, "loss": 1.7025, "step": 167200 }, { "epoch": 0.11642374627134522, "grad_norm": 5.354189872741699, "learning_rate": 1.988327061192605e-05, "loss": 1.6579, "step": 167400 }, { "epoch": 0.1165628427423982, "grad_norm": 4.774054527282715, "learning_rate": 1.9882992302160726e-05, "loss": 1.679, "step": 167600 }, { "epoch": 0.11670193921345119, "grad_norm": 4.016425609588623, "learning_rate": 1.9882713663809256e-05, "loss": 1.6782, "step": 167800 }, { "epoch": 0.11684103568450417, "grad_norm": 4.55750036239624, "learning_rate": 1.9882434696884923e-05, "loss": 1.655, "step": 168000 }, { "epoch": 0.11698013215555715, "grad_norm": 3.620004653930664, "learning_rate": 1.988215540140106e-05, "loss": 1.7069, "step": 168200 }, { "epoch": 0.11711922862661013, "grad_norm": 4.133694171905518, "learning_rate": 1.9881875777370994e-05, "loss": 1.7015, "step": 168400 }, { "epoch": 0.11725832509766311, "grad_norm": 3.9172167778015137, "learning_rate": 1.988159582480808e-05, "loss": 1.6491, "step": 168600 }, { "epoch": 0.11739742156871609, "grad_norm": 5.854298114776611, "learning_rate": 1.988131554372567e-05, "loss": 1.6523, "step": 168800 }, { "epoch": 0.11753651803976907, "grad_norm": 4.317502975463867, "learning_rate": 1.9881034934137155e-05, "loss": 1.607, "step": 169000 }, { "epoch": 0.11767561451082205, "grad_norm": 7.027503967285156, "learning_rate": 1.9880753996055932e-05, "loss": 1.6405, "step": 169200 }, { "epoch": 0.11781471098187503, "grad_norm": 5.615001201629639, "learning_rate": 1.9880472729495403e-05, "loss": 1.6242, "step": 169400 }, { "epoch": 0.11795380745292802, "grad_norm": 4.811971664428711, "learning_rate": 1.9880191134469003e-05, "loss": 1.6899, "step": 169600 }, { "epoch": 0.118092903923981, "grad_norm": 3.679595708847046, "learning_rate": 1.9879909210990175e-05, "loss": 1.7084, "step": 169800 }, { "epoch": 0.11823200039503398, "grad_norm": 5.554993152618408, "learning_rate": 1.9879626959072373e-05, "loss": 1.6376, "step": 170000 }, { "epoch": 0.11837109686608696, "grad_norm": 3.7283120155334473, "learning_rate": 1.9879344378729076e-05, "loss": 1.7121, "step": 170200 }, { "epoch": 0.11851019333713994, "grad_norm": 4.596302032470703, "learning_rate": 1.987906146997377e-05, "loss": 1.6736, "step": 170400 }, { "epoch": 0.11864928980819292, "grad_norm": 4.5579752922058105, "learning_rate": 1.9878778232819962e-05, "loss": 1.6364, "step": 170600 }, { "epoch": 0.1187883862792459, "grad_norm": 4.720639705657959, "learning_rate": 1.9878494667281178e-05, "loss": 1.6624, "step": 170800 }, { "epoch": 0.11892748275029888, "grad_norm": 6.185659408569336, "learning_rate": 1.9878210773370947e-05, "loss": 1.6507, "step": 171000 }, { "epoch": 0.11906657922135186, "grad_norm": 4.2074875831604, "learning_rate": 1.987792655110283e-05, "loss": 1.6032, "step": 171200 }, { "epoch": 0.11920567569240485, "grad_norm": 6.382288455963135, "learning_rate": 1.9877642000490387e-05, "loss": 1.6729, "step": 171400 }, { "epoch": 0.11934477216345783, "grad_norm": 3.7348427772521973, "learning_rate": 1.9877357121547208e-05, "loss": 1.6801, "step": 171600 }, { "epoch": 0.11948386863451081, "grad_norm": 8.150784492492676, "learning_rate": 1.98770719142869e-05, "loss": 1.7321, "step": 171800 }, { "epoch": 0.11962296510556379, "grad_norm": 4.987847805023193, "learning_rate": 1.987678637872306e-05, "loss": 1.6517, "step": 172000 }, { "epoch": 0.11976206157661677, "grad_norm": 4.735747814178467, "learning_rate": 1.987650051486933e-05, "loss": 1.7061, "step": 172200 }, { "epoch": 0.11990115804766975, "grad_norm": 3.1466925144195557, "learning_rate": 1.987621432273936e-05, "loss": 1.6599, "step": 172400 }, { "epoch": 0.12004025451872273, "grad_norm": 3.5573043823242188, "learning_rate": 1.9875927802346804e-05, "loss": 1.6252, "step": 172600 }, { "epoch": 0.12017935098977571, "grad_norm": 9.280091285705566, "learning_rate": 1.9875640953705352e-05, "loss": 1.6821, "step": 172800 }, { "epoch": 0.12031844746082869, "grad_norm": 4.753815174102783, "learning_rate": 1.9875353776828682e-05, "loss": 1.6954, "step": 173000 }, { "epoch": 0.12045754393188167, "grad_norm": 3.200784206390381, "learning_rate": 1.9875066271730513e-05, "loss": 1.6659, "step": 173200 }, { "epoch": 0.12059664040293466, "grad_norm": 3.785381317138672, "learning_rate": 1.987477843842457e-05, "loss": 1.6944, "step": 173400 }, { "epoch": 0.12073573687398764, "grad_norm": 4.492599964141846, "learning_rate": 1.9874490276924598e-05, "loss": 1.6643, "step": 173600 }, { "epoch": 0.12087483334504062, "grad_norm": 3.9844391345977783, "learning_rate": 1.9874201787244344e-05, "loss": 1.6518, "step": 173800 }, { "epoch": 0.1210139298160936, "grad_norm": 7.214785099029541, "learning_rate": 1.9873912969397586e-05, "loss": 1.6384, "step": 174000 }, { "epoch": 0.12115302628714658, "grad_norm": 3.2598700523376465, "learning_rate": 1.9873623823398107e-05, "loss": 1.7138, "step": 174200 }, { "epoch": 0.12129212275819956, "grad_norm": 3.9542293548583984, "learning_rate": 1.9873334349259715e-05, "loss": 1.636, "step": 174400 }, { "epoch": 0.12143121922925254, "grad_norm": 3.9196441173553467, "learning_rate": 1.987304454699623e-05, "loss": 1.686, "step": 174600 }, { "epoch": 0.12157031570030552, "grad_norm": 3.9339749813079834, "learning_rate": 1.987275441662148e-05, "loss": 1.6941, "step": 174800 }, { "epoch": 0.1217094121713585, "grad_norm": 5.266628265380859, "learning_rate": 1.987246395814933e-05, "loss": 1.6873, "step": 175000 }, { "epoch": 0.1218485086424115, "grad_norm": 3.3184611797332764, "learning_rate": 1.9872173171593627e-05, "loss": 1.6133, "step": 175200 }, { "epoch": 0.12198760511346447, "grad_norm": 6.511892318725586, "learning_rate": 1.9871882056968264e-05, "loss": 1.6107, "step": 175400 }, { "epoch": 0.12212670158451745, "grad_norm": 3.9185543060302734, "learning_rate": 1.9871590614287146e-05, "loss": 1.6659, "step": 175600 }, { "epoch": 0.12226579805557043, "grad_norm": 4.299091815948486, "learning_rate": 1.9871298843564163e-05, "loss": 1.6664, "step": 175800 }, { "epoch": 0.12240489452662341, "grad_norm": 6.176652431488037, "learning_rate": 1.9871006744813265e-05, "loss": 1.6531, "step": 176000 }, { "epoch": 0.12254399099767639, "grad_norm": 10.306751251220703, "learning_rate": 1.987071431804839e-05, "loss": 1.6285, "step": 176200 }, { "epoch": 0.12268308746872937, "grad_norm": 5.622617721557617, "learning_rate": 1.9870421563283493e-05, "loss": 1.6667, "step": 176400 }, { "epoch": 0.12282218393978235, "grad_norm": 3.5828144550323486, "learning_rate": 1.9870128480532556e-05, "loss": 1.6789, "step": 176600 }, { "epoch": 0.12296128041083533, "grad_norm": 7.108849048614502, "learning_rate": 1.9869835069809572e-05, "loss": 1.6318, "step": 176800 }, { "epoch": 0.12310037688188832, "grad_norm": 4.111839294433594, "learning_rate": 1.9869541331128543e-05, "loss": 1.666, "step": 177000 }, { "epoch": 0.1232394733529413, "grad_norm": 4.206272602081299, "learning_rate": 1.986924726450349e-05, "loss": 1.6683, "step": 177200 }, { "epoch": 0.12337856982399428, "grad_norm": 5.93388032913208, "learning_rate": 1.986895286994846e-05, "loss": 1.6857, "step": 177400 }, { "epoch": 0.12351766629504726, "grad_norm": 3.5534212589263916, "learning_rate": 1.9868658147477496e-05, "loss": 1.664, "step": 177600 }, { "epoch": 0.12365676276610024, "grad_norm": 4.549261569976807, "learning_rate": 1.9868363097104673e-05, "loss": 1.6206, "step": 177800 }, { "epoch": 0.12379585923715322, "grad_norm": 3.7361629009246826, "learning_rate": 1.9868067718844086e-05, "loss": 1.6629, "step": 178000 }, { "epoch": 0.1239349557082062, "grad_norm": 5.74630880355835, "learning_rate": 1.9867772012709817e-05, "loss": 1.6722, "step": 178200 }, { "epoch": 0.12407405217925918, "grad_norm": 3.605379581451416, "learning_rate": 1.9867475978715997e-05, "loss": 1.5986, "step": 178400 }, { "epoch": 0.12421314865031216, "grad_norm": 6.727748870849609, "learning_rate": 1.9867179616876756e-05, "loss": 1.6251, "step": 178600 }, { "epoch": 0.12435224512136515, "grad_norm": 2.8364996910095215, "learning_rate": 1.9866882927206234e-05, "loss": 1.657, "step": 178800 }, { "epoch": 0.12449134159241813, "grad_norm": 4.181228160858154, "learning_rate": 1.9866585909718606e-05, "loss": 1.687, "step": 179000 }, { "epoch": 0.12463043806347111, "grad_norm": 4.9627180099487305, "learning_rate": 1.9866288564428045e-05, "loss": 1.6855, "step": 179200 }, { "epoch": 0.12476953453452409, "grad_norm": 3.6266908645629883, "learning_rate": 1.9865990891348744e-05, "loss": 1.6203, "step": 179400 }, { "epoch": 0.12490863100557707, "grad_norm": 3.5489137172698975, "learning_rate": 1.986569289049492e-05, "loss": 1.6975, "step": 179600 }, { "epoch": 0.12504772747663007, "grad_norm": 4.391172409057617, "learning_rate": 1.986539456188079e-05, "loss": 1.6629, "step": 179800 }, { "epoch": 0.12518682394768305, "grad_norm": 5.877786636352539, "learning_rate": 1.9865095905520606e-05, "loss": 1.6257, "step": 180000 }, { "epoch": 0.12532592041873603, "grad_norm": 3.692997932434082, "learning_rate": 1.986479692142862e-05, "loss": 1.6024, "step": 180200 }, { "epoch": 0.125465016889789, "grad_norm": 5.69085168838501, "learning_rate": 1.9864497609619103e-05, "loss": 1.6724, "step": 180400 }, { "epoch": 0.12560411336084198, "grad_norm": 3.840266704559326, "learning_rate": 1.986419797010635e-05, "loss": 1.7007, "step": 180600 }, { "epoch": 0.12574320983189496, "grad_norm": 4.066295623779297, "learning_rate": 1.986389800290466e-05, "loss": 1.6899, "step": 180800 }, { "epoch": 0.12588230630294794, "grad_norm": 6.250936031341553, "learning_rate": 1.9863597708028358e-05, "loss": 1.6156, "step": 181000 }, { "epoch": 0.12602140277400092, "grad_norm": 6.361692428588867, "learning_rate": 1.9863297085491776e-05, "loss": 1.7042, "step": 181200 }, { "epoch": 0.1261604992450539, "grad_norm": 6.459218978881836, "learning_rate": 1.986299613530927e-05, "loss": 1.6804, "step": 181400 }, { "epoch": 0.12629959571610688, "grad_norm": 4.83172082901001, "learning_rate": 1.9862694857495196e-05, "loss": 1.6259, "step": 181600 }, { "epoch": 0.12643869218715986, "grad_norm": 6.288012981414795, "learning_rate": 1.9862393252063953e-05, "loss": 1.6328, "step": 181800 }, { "epoch": 0.12657778865821284, "grad_norm": 3.8257863521575928, "learning_rate": 1.9862091319029918e-05, "loss": 1.6375, "step": 182000 }, { "epoch": 0.12671688512926582, "grad_norm": 5.976084232330322, "learning_rate": 1.9861789058407526e-05, "loss": 1.7077, "step": 182200 }, { "epoch": 0.1268559816003188, "grad_norm": 5.479156970977783, "learning_rate": 1.98614864702112e-05, "loss": 1.665, "step": 182400 }, { "epoch": 0.12699507807137178, "grad_norm": 7.1532511711120605, "learning_rate": 1.986118355445538e-05, "loss": 1.6432, "step": 182600 }, { "epoch": 0.12713417454242476, "grad_norm": 7.886996269226074, "learning_rate": 1.986088031115453e-05, "loss": 1.6781, "step": 182800 }, { "epoch": 0.12727327101347774, "grad_norm": 3.760727882385254, "learning_rate": 1.9860576740323125e-05, "loss": 1.67, "step": 183000 }, { "epoch": 0.12741236748453075, "grad_norm": 5.325572490692139, "learning_rate": 1.986027284197566e-05, "loss": 1.6597, "step": 183200 }, { "epoch": 0.12755146395558373, "grad_norm": 8.754809379577637, "learning_rate": 1.985996861612664e-05, "loss": 1.643, "step": 183400 }, { "epoch": 0.1276905604266367, "grad_norm": 2.888570547103882, "learning_rate": 1.9859664062790594e-05, "loss": 1.6941, "step": 183600 }, { "epoch": 0.1278296568976897, "grad_norm": 3.755387306213379, "learning_rate": 1.9859359181982053e-05, "loss": 1.6699, "step": 183800 }, { "epoch": 0.12796875336874267, "grad_norm": 4.906467914581299, "learning_rate": 1.9859053973715573e-05, "loss": 1.6656, "step": 184000 }, { "epoch": 0.12810784983979565, "grad_norm": 3.2837483882904053, "learning_rate": 1.985874843800573e-05, "loss": 1.6658, "step": 184200 }, { "epoch": 0.12824694631084863, "grad_norm": 4.631415843963623, "learning_rate": 1.985844257486711e-05, "loss": 1.688, "step": 184400 }, { "epoch": 0.1283860427819016, "grad_norm": 5.126579284667969, "learning_rate": 1.985813638431431e-05, "loss": 1.7093, "step": 184600 }, { "epoch": 0.12852513925295458, "grad_norm": 3.632361650466919, "learning_rate": 1.9857829866361945e-05, "loss": 1.6936, "step": 184800 }, { "epoch": 0.12866423572400756, "grad_norm": 3.591118097305298, "learning_rate": 1.9857523021024652e-05, "loss": 1.6639, "step": 185000 }, { "epoch": 0.12880333219506054, "grad_norm": 5.093484401702881, "learning_rate": 1.985721584831708e-05, "loss": 1.6375, "step": 185200 }, { "epoch": 0.12894242866611352, "grad_norm": 5.599116802215576, "learning_rate": 1.9856908348253888e-05, "loss": 1.6667, "step": 185400 }, { "epoch": 0.1290815251371665, "grad_norm": 4.788045406341553, "learning_rate": 1.9856600520849767e-05, "loss": 1.6249, "step": 185600 }, { "epoch": 0.12922062160821948, "grad_norm": 3.3481669425964355, "learning_rate": 1.98562923661194e-05, "loss": 1.6966, "step": 185800 }, { "epoch": 0.12935971807927246, "grad_norm": 6.194669246673584, "learning_rate": 1.98559838840775e-05, "loss": 1.6743, "step": 186000 }, { "epoch": 0.12949881455032544, "grad_norm": 3.6862707138061523, "learning_rate": 1.98556750747388e-05, "loss": 1.6835, "step": 186200 }, { "epoch": 0.12963791102137842, "grad_norm": 4.1235127449035645, "learning_rate": 1.985536593811804e-05, "loss": 1.6472, "step": 186400 }, { "epoch": 0.1297770074924314, "grad_norm": 4.046864986419678, "learning_rate": 1.9855056474229975e-05, "loss": 1.6885, "step": 186600 }, { "epoch": 0.12991610396348438, "grad_norm": 4.1069746017456055, "learning_rate": 1.9854746683089375e-05, "loss": 1.6985, "step": 186800 }, { "epoch": 0.1300552004345374, "grad_norm": 2.7363228797912598, "learning_rate": 1.9854436564711038e-05, "loss": 1.6469, "step": 187000 }, { "epoch": 0.13019429690559037, "grad_norm": 4.64224910736084, "learning_rate": 1.9854126119109764e-05, "loss": 1.691, "step": 187200 }, { "epoch": 0.13033339337664335, "grad_norm": 4.297823905944824, "learning_rate": 1.9853815346300375e-05, "loss": 1.7174, "step": 187400 }, { "epoch": 0.13047248984769633, "grad_norm": 5.539035797119141, "learning_rate": 1.9853504246297706e-05, "loss": 1.6865, "step": 187600 }, { "epoch": 0.1306115863187493, "grad_norm": 6.837066650390625, "learning_rate": 1.9853192819116604e-05, "loss": 1.6603, "step": 187800 }, { "epoch": 0.13075068278980229, "grad_norm": 5.642529487609863, "learning_rate": 1.9852881064771944e-05, "loss": 1.6914, "step": 188000 }, { "epoch": 0.13088977926085527, "grad_norm": 5.806487560272217, "learning_rate": 1.985256898327861e-05, "loss": 1.6721, "step": 188200 }, { "epoch": 0.13102887573190825, "grad_norm": 4.210550308227539, "learning_rate": 1.985225657465149e-05, "loss": 1.6453, "step": 188400 }, { "epoch": 0.13116797220296122, "grad_norm": 2.4431819915771484, "learning_rate": 1.9851943838905507e-05, "loss": 1.6577, "step": 188600 }, { "epoch": 0.1313070686740142, "grad_norm": 4.8577880859375, "learning_rate": 1.985163077605558e-05, "loss": 1.6024, "step": 188800 }, { "epoch": 0.13144616514506718, "grad_norm": 5.452568054199219, "learning_rate": 1.985131738611667e-05, "loss": 1.6769, "step": 189000 }, { "epoch": 0.13158526161612016, "grad_norm": 9.054600715637207, "learning_rate": 1.9851003669103724e-05, "loss": 1.6333, "step": 189200 }, { "epoch": 0.13172435808717314, "grad_norm": 4.141605854034424, "learning_rate": 1.985068962503173e-05, "loss": 1.6546, "step": 189400 }, { "epoch": 0.13186345455822612, "grad_norm": 6.251303672790527, "learning_rate": 1.985037525391567e-05, "loss": 1.6554, "step": 189600 }, { "epoch": 0.1320025510292791, "grad_norm": 4.922019958496094, "learning_rate": 1.9850060555770554e-05, "loss": 1.6644, "step": 189800 }, { "epoch": 0.13214164750033208, "grad_norm": 4.558050155639648, "learning_rate": 1.9849745530611413e-05, "loss": 1.6846, "step": 190000 }, { "epoch": 0.13228074397138506, "grad_norm": 6.424139499664307, "learning_rate": 1.9849430178453276e-05, "loss": 1.6354, "step": 190200 }, { "epoch": 0.13241984044243804, "grad_norm": 3.612401247024536, "learning_rate": 1.9849114499311205e-05, "loss": 1.6416, "step": 190400 }, { "epoch": 0.13255893691349105, "grad_norm": 5.989872932434082, "learning_rate": 1.984879849320026e-05, "loss": 1.6757, "step": 190600 }, { "epoch": 0.13269803338454403, "grad_norm": 3.9288203716278076, "learning_rate": 1.9848482160135538e-05, "loss": 1.6961, "step": 190800 }, { "epoch": 0.132837129855597, "grad_norm": 4.6295294761657715, "learning_rate": 1.9848165500132137e-05, "loss": 1.6701, "step": 191000 }, { "epoch": 0.13297622632665, "grad_norm": 4.14421272277832, "learning_rate": 1.9847848513205172e-05, "loss": 1.674, "step": 191200 }, { "epoch": 0.13311532279770297, "grad_norm": 4.674405097961426, "learning_rate": 1.9847531199369778e-05, "loss": 1.7052, "step": 191400 }, { "epoch": 0.13325441926875595, "grad_norm": 6.369792461395264, "learning_rate": 1.9847213558641103e-05, "loss": 1.7116, "step": 191600 }, { "epoch": 0.13339351573980893, "grad_norm": 4.456991672515869, "learning_rate": 1.9846895591034303e-05, "loss": 1.6899, "step": 191800 }, { "epoch": 0.1335326122108619, "grad_norm": 7.715414047241211, "learning_rate": 1.9846577296564573e-05, "loss": 1.627, "step": 192000 }, { "epoch": 0.13367170868191489, "grad_norm": 3.531667709350586, "learning_rate": 1.9846258675247088e-05, "loss": 1.6544, "step": 192200 }, { "epoch": 0.13381080515296787, "grad_norm": 4.7513203620910645, "learning_rate": 1.984593972709708e-05, "loss": 1.6266, "step": 192400 }, { "epoch": 0.13394990162402085, "grad_norm": 4.610849857330322, "learning_rate": 1.9845620452129758e-05, "loss": 1.6541, "step": 192600 }, { "epoch": 0.13408899809507382, "grad_norm": 5.611361503601074, "learning_rate": 1.9845300850360375e-05, "loss": 1.6377, "step": 192800 }, { "epoch": 0.1342280945661268, "grad_norm": 7.1261444091796875, "learning_rate": 1.984498092180418e-05, "loss": 1.6643, "step": 193000 }, { "epoch": 0.13436719103717978, "grad_norm": 6.222692012786865, "learning_rate": 1.984466066647645e-05, "loss": 1.6668, "step": 193200 }, { "epoch": 0.13450628750823276, "grad_norm": 5.6644744873046875, "learning_rate": 1.984434008439248e-05, "loss": 1.7046, "step": 193400 }, { "epoch": 0.13464538397928574, "grad_norm": 3.9181125164031982, "learning_rate": 1.9844019175567565e-05, "loss": 1.7023, "step": 193600 }, { "epoch": 0.13478448045033872, "grad_norm": 2.7055773735046387, "learning_rate": 1.9843697940017025e-05, "loss": 1.6359, "step": 193800 }, { "epoch": 0.1349235769213917, "grad_norm": 6.363256931304932, "learning_rate": 1.98433763777562e-05, "loss": 1.6489, "step": 194000 }, { "epoch": 0.1350626733924447, "grad_norm": 3.7628371715545654, "learning_rate": 1.984305448880044e-05, "loss": 1.6814, "step": 194200 }, { "epoch": 0.1352017698634977, "grad_norm": 3.9720499515533447, "learning_rate": 1.984273227316511e-05, "loss": 1.6726, "step": 194400 }, { "epoch": 0.13534086633455067, "grad_norm": 4.181582927703857, "learning_rate": 1.9842409730865593e-05, "loss": 1.7293, "step": 194600 }, { "epoch": 0.13547996280560365, "grad_norm": 4.109186172485352, "learning_rate": 1.9842086861917284e-05, "loss": 1.6844, "step": 194800 }, { "epoch": 0.13561905927665663, "grad_norm": 5.157780647277832, "learning_rate": 1.9841763666335604e-05, "loss": 1.7285, "step": 195000 }, { "epoch": 0.1357581557477096, "grad_norm": 3.560006618499756, "learning_rate": 1.9841440144135974e-05, "loss": 1.6334, "step": 195200 }, { "epoch": 0.1358972522187626, "grad_norm": 5.187409400939941, "learning_rate": 1.9841116295333842e-05, "loss": 1.6215, "step": 195400 }, { "epoch": 0.13603634868981557, "grad_norm": 3.573528528213501, "learning_rate": 1.984079211994467e-05, "loss": 1.6543, "step": 195600 }, { "epoch": 0.13617544516086855, "grad_norm": 7.436543941497803, "learning_rate": 1.9840467617983933e-05, "loss": 1.6384, "step": 195800 }, { "epoch": 0.13631454163192153, "grad_norm": 7.499304294586182, "learning_rate": 1.9840142789467113e-05, "loss": 1.6457, "step": 196000 }, { "epoch": 0.1364536381029745, "grad_norm": 4.699829578399658, "learning_rate": 1.9839817634409733e-05, "loss": 1.6635, "step": 196200 }, { "epoch": 0.13659273457402749, "grad_norm": 4.371967792510986, "learning_rate": 1.9839492152827305e-05, "loss": 1.6795, "step": 196400 }, { "epoch": 0.13673183104508047, "grad_norm": 9.641822814941406, "learning_rate": 1.983916634473537e-05, "loss": 1.6681, "step": 196600 }, { "epoch": 0.13687092751613344, "grad_norm": 3.316873550415039, "learning_rate": 1.983884021014948e-05, "loss": 1.6579, "step": 196800 }, { "epoch": 0.13701002398718642, "grad_norm": 3.2673499584198, "learning_rate": 1.983851374908521e-05, "loss": 1.6427, "step": 197000 }, { "epoch": 0.1371491204582394, "grad_norm": 4.909361839294434, "learning_rate": 1.9838186961558135e-05, "loss": 1.6636, "step": 197200 }, { "epoch": 0.13728821692929238, "grad_norm": 6.074746131896973, "learning_rate": 1.9837859847583864e-05, "loss": 1.699, "step": 197400 }, { "epoch": 0.13742731340034536, "grad_norm": 3.8565711975097656, "learning_rate": 1.9837532407178006e-05, "loss": 1.6797, "step": 197600 }, { "epoch": 0.13756640987139837, "grad_norm": 3.768479824066162, "learning_rate": 1.98372046403562e-05, "loss": 1.6711, "step": 197800 }, { "epoch": 0.13770550634245135, "grad_norm": 5.409492492675781, "learning_rate": 1.9836876547134092e-05, "loss": 1.6492, "step": 198000 }, { "epoch": 0.13784460281350433, "grad_norm": 3.9303548336029053, "learning_rate": 1.9836548127527343e-05, "loss": 1.5974, "step": 198200 }, { "epoch": 0.1379836992845573, "grad_norm": 4.712244033813477, "learning_rate": 1.9836219381551626e-05, "loss": 1.6403, "step": 198400 }, { "epoch": 0.1381227957556103, "grad_norm": 2.6144423484802246, "learning_rate": 1.9835890309222642e-05, "loss": 1.703, "step": 198600 }, { "epoch": 0.13826189222666327, "grad_norm": 7.963686943054199, "learning_rate": 1.9835560910556096e-05, "loss": 1.6376, "step": 198800 }, { "epoch": 0.13840098869771625, "grad_norm": 7.453803539276123, "learning_rate": 1.983523118556772e-05, "loss": 1.6707, "step": 199000 }, { "epoch": 0.13854008516876923, "grad_norm": 4.660197734832764, "learning_rate": 1.9834901134273247e-05, "loss": 1.6914, "step": 199200 }, { "epoch": 0.1386791816398222, "grad_norm": 4.454774379730225, "learning_rate": 1.983457075668844e-05, "loss": 1.699, "step": 199400 }, { "epoch": 0.1388182781108752, "grad_norm": 4.318262577056885, "learning_rate": 1.9834240052829065e-05, "loss": 1.6629, "step": 199600 }, { "epoch": 0.13895737458192817, "grad_norm": 3.4751460552215576, "learning_rate": 1.9833909022710913e-05, "loss": 1.6609, "step": 199800 }, { "epoch": 0.13909647105298115, "grad_norm": 5.436317443847656, "learning_rate": 1.9833577666349784e-05, "loss": 1.6809, "step": 200000 }, { "epoch": 0.13923556752403413, "grad_norm": 5.233275890350342, "learning_rate": 1.9833245983761502e-05, "loss": 1.7008, "step": 200200 }, { "epoch": 0.1393746639950871, "grad_norm": 5.347460746765137, "learning_rate": 1.9832913974961895e-05, "loss": 1.7149, "step": 200400 }, { "epoch": 0.13951376046614009, "grad_norm": 2.830404043197632, "learning_rate": 1.9832581639966814e-05, "loss": 1.6777, "step": 200600 }, { "epoch": 0.13965285693719307, "grad_norm": 5.6208815574646, "learning_rate": 1.983224897879213e-05, "loss": 1.6505, "step": 200800 }, { "epoch": 0.13979195340824604, "grad_norm": 3.5405380725860596, "learning_rate": 1.9831915991453715e-05, "loss": 1.6412, "step": 201000 }, { "epoch": 0.13993104987929902, "grad_norm": 3.0910046100616455, "learning_rate": 1.983158267796747e-05, "loss": 1.6859, "step": 201200 }, { "epoch": 0.14007014635035203, "grad_norm": 5.490592956542969, "learning_rate": 1.9831249038349313e-05, "loss": 1.6568, "step": 201400 }, { "epoch": 0.140209242821405, "grad_norm": 5.224388122558594, "learning_rate": 1.983091507261516e-05, "loss": 1.6622, "step": 201600 }, { "epoch": 0.140348339292458, "grad_norm": 4.169788360595703, "learning_rate": 1.983058078078096e-05, "loss": 1.6532, "step": 201800 }, { "epoch": 0.14048743576351097, "grad_norm": 3.974165916442871, "learning_rate": 1.9830246162862668e-05, "loss": 1.6186, "step": 202000 }, { "epoch": 0.14062653223456395, "grad_norm": 3.8338491916656494, "learning_rate": 1.982991121887627e-05, "loss": 1.651, "step": 202200 }, { "epoch": 0.14076562870561693, "grad_norm": 9.311616897583008, "learning_rate": 1.982957594883774e-05, "loss": 1.6013, "step": 202400 }, { "epoch": 0.1409047251766699, "grad_norm": 6.677712917327881, "learning_rate": 1.9829240352763096e-05, "loss": 1.6737, "step": 202600 }, { "epoch": 0.1410438216477229, "grad_norm": 4.073410987854004, "learning_rate": 1.9828904430668352e-05, "loss": 1.6654, "step": 202800 }, { "epoch": 0.14118291811877587, "grad_norm": 3.849587917327881, "learning_rate": 1.9828568182569547e-05, "loss": 1.6426, "step": 203000 }, { "epoch": 0.14132201458982885, "grad_norm": 7.148254871368408, "learning_rate": 1.982823160848273e-05, "loss": 1.7172, "step": 203200 }, { "epoch": 0.14146111106088183, "grad_norm": 4.130821228027344, "learning_rate": 1.982789470842397e-05, "loss": 1.6831, "step": 203400 }, { "epoch": 0.1416002075319348, "grad_norm": 4.6689066886901855, "learning_rate": 1.9827557482409355e-05, "loss": 1.7194, "step": 203600 }, { "epoch": 0.1417393040029878, "grad_norm": 3.741334915161133, "learning_rate": 1.982721993045498e-05, "loss": 1.6994, "step": 203800 }, { "epoch": 0.14187840047404077, "grad_norm": 2.899327039718628, "learning_rate": 1.982688205257696e-05, "loss": 1.6241, "step": 204000 }, { "epoch": 0.14201749694509375, "grad_norm": 4.312928676605225, "learning_rate": 1.9826543848791423e-05, "loss": 1.583, "step": 204200 }, { "epoch": 0.14215659341614673, "grad_norm": 5.321345329284668, "learning_rate": 1.9826205319114513e-05, "loss": 1.6599, "step": 204400 }, { "epoch": 0.1422956898871997, "grad_norm": 4.168429374694824, "learning_rate": 1.98258664635624e-05, "loss": 1.6851, "step": 204600 }, { "epoch": 0.14243478635825269, "grad_norm": 5.454354286193848, "learning_rate": 1.9825527282151244e-05, "loss": 1.6046, "step": 204800 }, { "epoch": 0.1425738828293057, "grad_norm": 4.223209857940674, "learning_rate": 1.982518777489726e-05, "loss": 1.6851, "step": 205000 }, { "epoch": 0.14271297930035867, "grad_norm": 5.312924385070801, "learning_rate": 1.9824847941816638e-05, "loss": 1.6433, "step": 205200 }, { "epoch": 0.14285207577141165, "grad_norm": 4.545292854309082, "learning_rate": 1.9824507782925603e-05, "loss": 1.6801, "step": 205400 }, { "epoch": 0.14299117224246463, "grad_norm": 6.492091655731201, "learning_rate": 1.9824167298240398e-05, "loss": 1.7089, "step": 205600 }, { "epoch": 0.1431302687135176, "grad_norm": 9.169800758361816, "learning_rate": 1.982382648777728e-05, "loss": 1.7032, "step": 205800 }, { "epoch": 0.1432693651845706, "grad_norm": 6.819125652313232, "learning_rate": 1.9823485351552514e-05, "loss": 1.7034, "step": 206000 }, { "epoch": 0.14340846165562357, "grad_norm": 4.523713111877441, "learning_rate": 1.9823143889582388e-05, "loss": 1.6439, "step": 206200 }, { "epoch": 0.14354755812667655, "grad_norm": 8.794898986816406, "learning_rate": 1.98228021018832e-05, "loss": 1.6834, "step": 206400 }, { "epoch": 0.14368665459772953, "grad_norm": 2.6776249408721924, "learning_rate": 1.982245998847127e-05, "loss": 1.6695, "step": 206600 }, { "epoch": 0.1438257510687825, "grad_norm": 9.791224479675293, "learning_rate": 1.982211754936293e-05, "loss": 1.6351, "step": 206800 }, { "epoch": 0.1439648475398355, "grad_norm": 5.362414360046387, "learning_rate": 1.9821774784574526e-05, "loss": 1.6948, "step": 207000 }, { "epoch": 0.14410394401088847, "grad_norm": 3.5498175621032715, "learning_rate": 1.982143169412242e-05, "loss": 1.7167, "step": 207200 }, { "epoch": 0.14424304048194145, "grad_norm": 6.9086012840271, "learning_rate": 1.982108827802299e-05, "loss": 1.7414, "step": 207400 }, { "epoch": 0.14438213695299443, "grad_norm": 3.778520107269287, "learning_rate": 1.9820744536292634e-05, "loss": 1.6731, "step": 207600 }, { "epoch": 0.1445212334240474, "grad_norm": 3.5077078342437744, "learning_rate": 1.982040046894776e-05, "loss": 1.6416, "step": 207800 }, { "epoch": 0.1446603298951004, "grad_norm": 4.8149261474609375, "learning_rate": 1.9820056076004798e-05, "loss": 1.6852, "step": 208000 }, { "epoch": 0.14479942636615337, "grad_norm": 3.931675910949707, "learning_rate": 1.9819711357480182e-05, "loss": 1.6737, "step": 208200 }, { "epoch": 0.14493852283720635, "grad_norm": 6.161195278167725, "learning_rate": 1.9819366313390368e-05, "loss": 1.6781, "step": 208400 }, { "epoch": 0.14507761930825935, "grad_norm": 5.015232086181641, "learning_rate": 1.9819020943751837e-05, "loss": 1.6852, "step": 208600 }, { "epoch": 0.14521671577931233, "grad_norm": 5.486717700958252, "learning_rate": 1.9818675248581062e-05, "loss": 1.6469, "step": 208800 }, { "epoch": 0.1453558122503653, "grad_norm": 6.021148204803467, "learning_rate": 1.9818329227894562e-05, "loss": 1.648, "step": 209000 }, { "epoch": 0.1454949087214183, "grad_norm": 3.5915627479553223, "learning_rate": 1.9817982881708846e-05, "loss": 1.6666, "step": 209200 }, { "epoch": 0.14563400519247127, "grad_norm": 5.235846519470215, "learning_rate": 1.981763621004045e-05, "loss": 1.7183, "step": 209400 }, { "epoch": 0.14577310166352425, "grad_norm": 6.775132656097412, "learning_rate": 1.9817289212905927e-05, "loss": 1.6497, "step": 209600 }, { "epoch": 0.14591219813457723, "grad_norm": 5.555789947509766, "learning_rate": 1.981694189032184e-05, "loss": 1.6312, "step": 209800 }, { "epoch": 0.1460512946056302, "grad_norm": 3.7014245986938477, "learning_rate": 1.981659424230477e-05, "loss": 1.6909, "step": 210000 }, { "epoch": 0.1461903910766832, "grad_norm": 6.824801921844482, "learning_rate": 1.9816246268871304e-05, "loss": 1.6501, "step": 210200 }, { "epoch": 0.14632948754773617, "grad_norm": 3.271857500076294, "learning_rate": 1.981589797003807e-05, "loss": 1.6675, "step": 210400 }, { "epoch": 0.14646858401878915, "grad_norm": 1.9796348810195923, "learning_rate": 1.9815549345821686e-05, "loss": 1.6666, "step": 210600 }, { "epoch": 0.14660768048984213, "grad_norm": 4.542928218841553, "learning_rate": 1.9815200396238797e-05, "loss": 1.6367, "step": 210800 }, { "epoch": 0.1467467769608951, "grad_norm": 4.7348761558532715, "learning_rate": 1.981485112130606e-05, "loss": 1.5782, "step": 211000 }, { "epoch": 0.1468858734319481, "grad_norm": 4.386654376983643, "learning_rate": 1.981450152104015e-05, "loss": 1.6155, "step": 211200 }, { "epoch": 0.14702496990300107, "grad_norm": 4.197893142700195, "learning_rate": 1.981415159545776e-05, "loss": 1.6381, "step": 211400 }, { "epoch": 0.14716406637405405, "grad_norm": 6.583975315093994, "learning_rate": 1.9813801344575587e-05, "loss": 1.6626, "step": 211600 }, { "epoch": 0.14730316284510703, "grad_norm": 4.117565155029297, "learning_rate": 1.981345076841036e-05, "loss": 1.7211, "step": 211800 }, { "epoch": 0.14744225931616, "grad_norm": 6.546325206756592, "learning_rate": 1.981309986697881e-05, "loss": 1.6327, "step": 212000 }, { "epoch": 0.147581355787213, "grad_norm": 4.35620641708374, "learning_rate": 1.9812748640297693e-05, "loss": 1.7034, "step": 212200 }, { "epoch": 0.147720452258266, "grad_norm": 7.925172805786133, "learning_rate": 1.981239708838377e-05, "loss": 1.6408, "step": 212400 }, { "epoch": 0.14785954872931897, "grad_norm": 3.4991142749786377, "learning_rate": 1.9812045211253824e-05, "loss": 1.6417, "step": 212600 }, { "epoch": 0.14799864520037195, "grad_norm": 3.4862661361694336, "learning_rate": 1.981169300892466e-05, "loss": 1.6432, "step": 212800 }, { "epoch": 0.14813774167142493, "grad_norm": 4.1005425453186035, "learning_rate": 1.9811340481413094e-05, "loss": 1.647, "step": 213000 }, { "epoch": 0.1482768381424779, "grad_norm": 4.734884738922119, "learning_rate": 1.981098762873594e-05, "loss": 1.6615, "step": 213200 }, { "epoch": 0.1484159346135309, "grad_norm": 5.329046726226807, "learning_rate": 1.9810634450910054e-05, "loss": 1.673, "step": 213400 }, { "epoch": 0.14855503108458387, "grad_norm": 4.662872314453125, "learning_rate": 1.9810280947952292e-05, "loss": 1.6696, "step": 213600 }, { "epoch": 0.14869412755563685, "grad_norm": 6.65009880065918, "learning_rate": 1.980992711987954e-05, "loss": 1.6561, "step": 213800 }, { "epoch": 0.14883322402668983, "grad_norm": 5.254068374633789, "learning_rate": 1.980957296670867e-05, "loss": 1.6387, "step": 214000 }, { "epoch": 0.1489723204977428, "grad_norm": 5.0962042808532715, "learning_rate": 1.980921848845661e-05, "loss": 1.6657, "step": 214200 }, { "epoch": 0.1491114169687958, "grad_norm": 3.870120048522949, "learning_rate": 1.980886368514027e-05, "loss": 1.6548, "step": 214400 }, { "epoch": 0.14925051343984877, "grad_norm": 5.185789585113525, "learning_rate": 1.980850855677659e-05, "loss": 1.6998, "step": 214600 }, { "epoch": 0.14938960991090175, "grad_norm": 2.880143642425537, "learning_rate": 1.9808153103382522e-05, "loss": 1.6384, "step": 214800 }, { "epoch": 0.14952870638195473, "grad_norm": 4.383133411407471, "learning_rate": 1.980779732497504e-05, "loss": 1.6412, "step": 215000 }, { "epoch": 0.1496678028530077, "grad_norm": 4.409369945526123, "learning_rate": 1.9807441221571122e-05, "loss": 1.7487, "step": 215200 }, { "epoch": 0.1498068993240607, "grad_norm": 8.62210750579834, "learning_rate": 1.9807084793187776e-05, "loss": 1.6259, "step": 215400 }, { "epoch": 0.14994599579511367, "grad_norm": 5.083277702331543, "learning_rate": 1.980672803984201e-05, "loss": 1.6404, "step": 215600 }, { "epoch": 0.15008509226616665, "grad_norm": 11.217997550964355, "learning_rate": 1.9806370961550855e-05, "loss": 1.6553, "step": 215800 }, { "epoch": 0.15022418873721965, "grad_norm": 3.5800321102142334, "learning_rate": 1.9806013558331364e-05, "loss": 1.6939, "step": 216000 }, { "epoch": 0.15036328520827263, "grad_norm": 4.781148910522461, "learning_rate": 1.9805655830200593e-05, "loss": 1.6453, "step": 216200 }, { "epoch": 0.15050238167932561, "grad_norm": 2.5723118782043457, "learning_rate": 1.9805297777175626e-05, "loss": 1.6093, "step": 216400 }, { "epoch": 0.1506414781503786, "grad_norm": 3.8561720848083496, "learning_rate": 1.9804939399273548e-05, "loss": 1.6449, "step": 216600 }, { "epoch": 0.15078057462143157, "grad_norm": 7.823061466217041, "learning_rate": 1.9804580696511474e-05, "loss": 1.6499, "step": 216800 }, { "epoch": 0.15091967109248455, "grad_norm": 3.7385520935058594, "learning_rate": 1.9804221668906526e-05, "loss": 1.6327, "step": 217000 }, { "epoch": 0.15105876756353753, "grad_norm": 7.89150857925415, "learning_rate": 1.9803862316475837e-05, "loss": 1.6593, "step": 217200 }, { "epoch": 0.1511978640345905, "grad_norm": 3.5468215942382812, "learning_rate": 1.9803502639236575e-05, "loss": 1.564, "step": 217400 }, { "epoch": 0.1513369605056435, "grad_norm": 4.701639175415039, "learning_rate": 1.98031426372059e-05, "loss": 1.6785, "step": 217600 }, { "epoch": 0.15147605697669647, "grad_norm": 3.7408063411712646, "learning_rate": 1.9802782310401004e-05, "loss": 1.6944, "step": 217800 }, { "epoch": 0.15161515344774945, "grad_norm": 5.009680271148682, "learning_rate": 1.9802421658839085e-05, "loss": 1.6944, "step": 218000 }, { "epoch": 0.15175424991880243, "grad_norm": 6.934037685394287, "learning_rate": 1.9802060682537362e-05, "loss": 1.6356, "step": 218200 }, { "epoch": 0.1518933463898554, "grad_norm": 5.480565547943115, "learning_rate": 1.9801699381513066e-05, "loss": 1.661, "step": 218400 }, { "epoch": 0.1520324428609084, "grad_norm": 3.7553224563598633, "learning_rate": 1.9801337755783447e-05, "loss": 1.6771, "step": 218600 }, { "epoch": 0.15217153933196137, "grad_norm": 6.108246803283691, "learning_rate": 1.980097580536577e-05, "loss": 1.6398, "step": 218800 }, { "epoch": 0.15231063580301435, "grad_norm": 2.7904536724090576, "learning_rate": 1.980061353027731e-05, "loss": 1.6457, "step": 219000 }, { "epoch": 0.15244973227406733, "grad_norm": 5.668387413024902, "learning_rate": 1.980025093053536e-05, "loss": 1.6924, "step": 219200 }, { "epoch": 0.1525888287451203, "grad_norm": 3.5706393718719482, "learning_rate": 1.979988800615724e-05, "loss": 1.6739, "step": 219400 }, { "epoch": 0.15272792521617332, "grad_norm": 4.151406288146973, "learning_rate": 1.9799524757160263e-05, "loss": 1.7423, "step": 219600 }, { "epoch": 0.1528670216872263, "grad_norm": 6.009292125701904, "learning_rate": 1.9799161183561776e-05, "loss": 1.6735, "step": 219800 }, { "epoch": 0.15300611815827928, "grad_norm": 4.633773326873779, "learning_rate": 1.979879728537914e-05, "loss": 1.6354, "step": 220000 }, { "epoch": 0.15314521462933225, "grad_norm": 7.916836261749268, "learning_rate": 1.979843306262972e-05, "loss": 1.7064, "step": 220200 }, { "epoch": 0.15328431110038523, "grad_norm": 2.3723714351654053, "learning_rate": 1.9798068515330913e-05, "loss": 1.6533, "step": 220400 }, { "epoch": 0.15342340757143821, "grad_norm": 4.5149993896484375, "learning_rate": 1.979770364350011e-05, "loss": 1.6492, "step": 220600 }, { "epoch": 0.1535625040424912, "grad_norm": 7.221534729003906, "learning_rate": 1.9797338447154732e-05, "loss": 1.6773, "step": 220800 }, { "epoch": 0.15370160051354417, "grad_norm": 5.349063873291016, "learning_rate": 1.9796972926312223e-05, "loss": 1.6608, "step": 221000 }, { "epoch": 0.15384069698459715, "grad_norm": 5.269424915313721, "learning_rate": 1.9796607080990023e-05, "loss": 1.7, "step": 221200 }, { "epoch": 0.15397979345565013, "grad_norm": 4.772226810455322, "learning_rate": 1.97962409112056e-05, "loss": 1.6641, "step": 221400 }, { "epoch": 0.1541188899267031, "grad_norm": 4.610666751861572, "learning_rate": 1.979587441697643e-05, "loss": 1.6373, "step": 221600 }, { "epoch": 0.1542579863977561, "grad_norm": 5.623795986175537, "learning_rate": 1.979550759832002e-05, "loss": 1.7177, "step": 221800 }, { "epoch": 0.15439708286880907, "grad_norm": 5.421236038208008, "learning_rate": 1.9795140455253872e-05, "loss": 1.6423, "step": 222000 }, { "epoch": 0.15453617933986205, "grad_norm": 4.521764278411865, "learning_rate": 1.9794772987795517e-05, "loss": 1.615, "step": 222200 }, { "epoch": 0.15467527581091503, "grad_norm": 5.754947662353516, "learning_rate": 1.979440519596249e-05, "loss": 1.6762, "step": 222400 }, { "epoch": 0.154814372281968, "grad_norm": 4.5179443359375, "learning_rate": 1.9794037079772362e-05, "loss": 1.6119, "step": 222600 }, { "epoch": 0.154953468753021, "grad_norm": 4.03549861907959, "learning_rate": 1.97936686392427e-05, "loss": 1.647, "step": 222800 }, { "epoch": 0.15509256522407397, "grad_norm": 2.628634214401245, "learning_rate": 1.9793299874391088e-05, "loss": 1.7272, "step": 223000 }, { "epoch": 0.15523166169512698, "grad_norm": 5.5775933265686035, "learning_rate": 1.9792930785235138e-05, "loss": 1.6965, "step": 223200 }, { "epoch": 0.15537075816617996, "grad_norm": 4.014023780822754, "learning_rate": 1.979256137179247e-05, "loss": 1.6548, "step": 223400 }, { "epoch": 0.15550985463723294, "grad_norm": 2.4270730018615723, "learning_rate": 1.9792191634080708e-05, "loss": 1.6754, "step": 223600 }, { "epoch": 0.15564895110828592, "grad_norm": 5.167558193206787, "learning_rate": 1.9791821572117515e-05, "loss": 1.6311, "step": 223800 }, { "epoch": 0.1557880475793389, "grad_norm": 3.3084609508514404, "learning_rate": 1.979145118592056e-05, "loss": 1.6325, "step": 224000 }, { "epoch": 0.15592714405039187, "grad_norm": 5.884838581085205, "learning_rate": 1.9791080475507508e-05, "loss": 1.6533, "step": 224200 }, { "epoch": 0.15606624052144485, "grad_norm": 4.583337783813477, "learning_rate": 1.9790709440896073e-05, "loss": 1.6061, "step": 224400 }, { "epoch": 0.15620533699249783, "grad_norm": 3.328657865524292, "learning_rate": 1.979033808210396e-05, "loss": 1.642, "step": 224600 }, { "epoch": 0.1563444334635508, "grad_norm": 4.5754852294921875, "learning_rate": 1.9789966399148897e-05, "loss": 1.6176, "step": 224800 }, { "epoch": 0.1564835299346038, "grad_norm": 3.455652952194214, "learning_rate": 1.9789594392048625e-05, "loss": 1.6757, "step": 225000 }, { "epoch": 0.15662262640565677, "grad_norm": 7.790688991546631, "learning_rate": 1.978922206082092e-05, "loss": 1.7227, "step": 225200 }, { "epoch": 0.15676172287670975, "grad_norm": 5.9034423828125, "learning_rate": 1.9788849405483535e-05, "loss": 1.6887, "step": 225400 }, { "epoch": 0.15690081934776273, "grad_norm": 3.6279520988464355, "learning_rate": 1.978847642605427e-05, "loss": 1.6682, "step": 225600 }, { "epoch": 0.1570399158188157, "grad_norm": 5.0766072273254395, "learning_rate": 1.9788103122550932e-05, "loss": 1.7731, "step": 225800 }, { "epoch": 0.1571790122898687, "grad_norm": 6.360209941864014, "learning_rate": 1.978772949499134e-05, "loss": 1.6643, "step": 226000 }, { "epoch": 0.15731810876092167, "grad_norm": 3.8049356937408447, "learning_rate": 1.9787355543393326e-05, "loss": 1.6699, "step": 226200 }, { "epoch": 0.15745720523197465, "grad_norm": 3.7539074420928955, "learning_rate": 1.978698126777475e-05, "loss": 1.6631, "step": 226400 }, { "epoch": 0.15759630170302763, "grad_norm": 5.550812244415283, "learning_rate": 1.9786606668153478e-05, "loss": 1.6711, "step": 226600 }, { "epoch": 0.15773539817408064, "grad_norm": 3.5647733211517334, "learning_rate": 1.9786231744547387e-05, "loss": 1.6213, "step": 226800 }, { "epoch": 0.15787449464513362, "grad_norm": 5.059593200683594, "learning_rate": 1.9785856496974382e-05, "loss": 1.6958, "step": 227000 }, { "epoch": 0.1580135911161866, "grad_norm": 3.6114261150360107, "learning_rate": 1.9785480925452373e-05, "loss": 1.6591, "step": 227200 }, { "epoch": 0.15815268758723958, "grad_norm": 2.9995696544647217, "learning_rate": 1.978510502999929e-05, "loss": 1.6269, "step": 227400 }, { "epoch": 0.15829178405829256, "grad_norm": 3.680255174636841, "learning_rate": 1.9784728810633082e-05, "loss": 1.6531, "step": 227600 }, { "epoch": 0.15843088052934554, "grad_norm": 4.238405227661133, "learning_rate": 1.97843522673717e-05, "loss": 1.6931, "step": 227800 }, { "epoch": 0.15856997700039852, "grad_norm": 3.1069247722625732, "learning_rate": 1.978397540023313e-05, "loss": 1.7058, "step": 228000 }, { "epoch": 0.1587090734714515, "grad_norm": 3.084455966949463, "learning_rate": 1.9783598209235362e-05, "loss": 1.6536, "step": 228200 }, { "epoch": 0.15884816994250447, "grad_norm": 12.894168853759766, "learning_rate": 1.978322069439639e-05, "loss": 1.6928, "step": 228400 }, { "epoch": 0.15898726641355745, "grad_norm": 4.349207401275635, "learning_rate": 1.9782842855734254e-05, "loss": 1.6267, "step": 228600 }, { "epoch": 0.15912636288461043, "grad_norm": 2.0193252563476562, "learning_rate": 1.9782464693266978e-05, "loss": 1.6574, "step": 228800 }, { "epoch": 0.1592654593556634, "grad_norm": 4.83428955078125, "learning_rate": 1.978208620701262e-05, "loss": 1.7005, "step": 229000 }, { "epoch": 0.1594045558267164, "grad_norm": 4.400110721588135, "learning_rate": 1.9781707396989255e-05, "loss": 1.6096, "step": 229200 }, { "epoch": 0.15954365229776937, "grad_norm": 4.364837169647217, "learning_rate": 1.9781328263214953e-05, "loss": 1.66, "step": 229400 }, { "epoch": 0.15968274876882235, "grad_norm": 4.045548915863037, "learning_rate": 1.9780948805707823e-05, "loss": 1.6176, "step": 229600 }, { "epoch": 0.15982184523987533, "grad_norm": 5.348182201385498, "learning_rate": 1.978056902448598e-05, "loss": 1.6951, "step": 229800 }, { "epoch": 0.1599609417109283, "grad_norm": 5.402227878570557, "learning_rate": 1.9780188919567548e-05, "loss": 1.6262, "step": 230000 }, { "epoch": 0.1601000381819813, "grad_norm": 5.651025772094727, "learning_rate": 1.9779808490970683e-05, "loss": 1.6461, "step": 230200 }, { "epoch": 0.1602391346530343, "grad_norm": 3.296238422393799, "learning_rate": 1.977942773871354e-05, "loss": 1.6568, "step": 230400 }, { "epoch": 0.16037823112408728, "grad_norm": 4.984383583068848, "learning_rate": 1.977904666281429e-05, "loss": 1.6582, "step": 230600 }, { "epoch": 0.16051732759514026, "grad_norm": 6.059391021728516, "learning_rate": 1.9778665263291134e-05, "loss": 1.694, "step": 230800 }, { "epoch": 0.16065642406619324, "grad_norm": 2.1012871265411377, "learning_rate": 1.977828354016228e-05, "loss": 1.6614, "step": 231000 }, { "epoch": 0.16079552053724622, "grad_norm": 2.401557445526123, "learning_rate": 1.977790149344594e-05, "loss": 1.6463, "step": 231200 }, { "epoch": 0.1609346170082992, "grad_norm": 4.753235340118408, "learning_rate": 1.9777519123160363e-05, "loss": 1.707, "step": 231400 }, { "epoch": 0.16107371347935218, "grad_norm": 3.4306366443634033, "learning_rate": 1.9777136429323805e-05, "loss": 1.6765, "step": 231600 }, { "epoch": 0.16121280995040516, "grad_norm": 2.307401180267334, "learning_rate": 1.9776753411954523e-05, "loss": 1.7106, "step": 231800 }, { "epoch": 0.16135190642145814, "grad_norm": 4.67930269241333, "learning_rate": 1.9776370071070815e-05, "loss": 1.6444, "step": 232000 }, { "epoch": 0.16149100289251112, "grad_norm": 4.696917533874512, "learning_rate": 1.977598640669097e-05, "loss": 1.7106, "step": 232200 }, { "epoch": 0.1616300993635641, "grad_norm": 4.120449066162109, "learning_rate": 1.9775602418833313e-05, "loss": 1.6614, "step": 232400 }, { "epoch": 0.16176919583461707, "grad_norm": 4.078285217285156, "learning_rate": 1.977521810751617e-05, "loss": 1.6652, "step": 232600 }, { "epoch": 0.16190829230567005, "grad_norm": 4.968871116638184, "learning_rate": 1.9774833472757886e-05, "loss": 1.6464, "step": 232800 }, { "epoch": 0.16204738877672303, "grad_norm": 2.427725076675415, "learning_rate": 1.9774448514576828e-05, "loss": 1.6473, "step": 233000 }, { "epoch": 0.162186485247776, "grad_norm": 6.255926132202148, "learning_rate": 1.9774063232991377e-05, "loss": 1.6456, "step": 233200 }, { "epoch": 0.162325581718829, "grad_norm": 7.306273937225342, "learning_rate": 1.9773677628019914e-05, "loss": 1.6192, "step": 233400 }, { "epoch": 0.16246467818988197, "grad_norm": 6.341434478759766, "learning_rate": 1.9773291699680857e-05, "loss": 1.661, "step": 233600 }, { "epoch": 0.16260377466093495, "grad_norm": 4.520346641540527, "learning_rate": 1.9772905447992624e-05, "loss": 1.6731, "step": 233800 }, { "epoch": 0.16274287113198796, "grad_norm": 4.508842468261719, "learning_rate": 1.9772518872973653e-05, "loss": 1.6526, "step": 234000 }, { "epoch": 0.16288196760304094, "grad_norm": 3.734306812286377, "learning_rate": 1.9772131974642406e-05, "loss": 1.6493, "step": 234200 }, { "epoch": 0.16302106407409392, "grad_norm": 6.192078590393066, "learning_rate": 1.9771744753017348e-05, "loss": 1.6857, "step": 234400 }, { "epoch": 0.1631601605451469, "grad_norm": 2.771817207336426, "learning_rate": 1.977135720811697e-05, "loss": 1.6534, "step": 234600 }, { "epoch": 0.16329925701619988, "grad_norm": 4.116189479827881, "learning_rate": 1.9770969339959763e-05, "loss": 1.6348, "step": 234800 }, { "epoch": 0.16343835348725286, "grad_norm": 6.450043678283691, "learning_rate": 1.9770581148564254e-05, "loss": 1.6792, "step": 235000 }, { "epoch": 0.16357744995830584, "grad_norm": 3.6712119579315186, "learning_rate": 1.9770192633948966e-05, "loss": 1.6559, "step": 235200 }, { "epoch": 0.16371654642935882, "grad_norm": 3.8811490535736084, "learning_rate": 1.976980379613245e-05, "loss": 1.6473, "step": 235400 }, { "epoch": 0.1638556429004118, "grad_norm": 8.544036865234375, "learning_rate": 1.9769414635133272e-05, "loss": 1.6666, "step": 235600 }, { "epoch": 0.16399473937146478, "grad_norm": 2.6414921283721924, "learning_rate": 1.9769025150970004e-05, "loss": 1.7163, "step": 235800 }, { "epoch": 0.16413383584251776, "grad_norm": 3.8313961029052734, "learning_rate": 1.976863534366124e-05, "loss": 1.6673, "step": 236000 }, { "epoch": 0.16427293231357074, "grad_norm": 4.338851451873779, "learning_rate": 1.97682452132256e-05, "loss": 1.6587, "step": 236200 }, { "epoch": 0.16441202878462371, "grad_norm": 5.920814514160156, "learning_rate": 1.9767854759681694e-05, "loss": 1.7192, "step": 236400 }, { "epoch": 0.1645511252556767, "grad_norm": 7.062288761138916, "learning_rate": 1.976746398304817e-05, "loss": 1.6747, "step": 236600 }, { "epoch": 0.16469022172672967, "grad_norm": 4.87226676940918, "learning_rate": 1.976707288334368e-05, "loss": 1.7216, "step": 236800 }, { "epoch": 0.16482931819778265, "grad_norm": 4.253633499145508, "learning_rate": 1.9766681460586894e-05, "loss": 1.6602, "step": 237000 }, { "epoch": 0.16496841466883563, "grad_norm": 5.2997822761535645, "learning_rate": 1.9766289714796502e-05, "loss": 1.6209, "step": 237200 }, { "epoch": 0.1651075111398886, "grad_norm": 8.48527717590332, "learning_rate": 1.97658976459912e-05, "loss": 1.6526, "step": 237400 }, { "epoch": 0.16524660761094162, "grad_norm": 3.7595603466033936, "learning_rate": 1.9765505254189708e-05, "loss": 1.718, "step": 237600 }, { "epoch": 0.1653857040819946, "grad_norm": 2.8959290981292725, "learning_rate": 1.9765112539410758e-05, "loss": 1.729, "step": 237800 }, { "epoch": 0.16552480055304758, "grad_norm": 3.3761868476867676, "learning_rate": 1.97647195016731e-05, "loss": 1.6648, "step": 238000 }, { "epoch": 0.16566389702410056, "grad_norm": 6.8370585441589355, "learning_rate": 1.9764326140995496e-05, "loss": 1.6535, "step": 238200 }, { "epoch": 0.16580299349515354, "grad_norm": 4.386465072631836, "learning_rate": 1.976393245739672e-05, "loss": 1.6181, "step": 238400 }, { "epoch": 0.16594208996620652, "grad_norm": 2.054741144180298, "learning_rate": 1.9763538450895576e-05, "loss": 1.6094, "step": 238600 }, { "epoch": 0.1660811864372595, "grad_norm": 4.956938743591309, "learning_rate": 1.976314412151086e-05, "loss": 1.7039, "step": 238800 }, { "epoch": 0.16622028290831248, "grad_norm": 3.4034650325775146, "learning_rate": 1.976274946926141e-05, "loss": 1.7075, "step": 239000 }, { "epoch": 0.16635937937936546, "grad_norm": 5.052691459655762, "learning_rate": 1.976235449416606e-05, "loss": 1.606, "step": 239200 }, { "epoch": 0.16649847585041844, "grad_norm": 5.3564372062683105, "learning_rate": 1.9761959196243662e-05, "loss": 1.637, "step": 239400 }, { "epoch": 0.16663757232147142, "grad_norm": 4.567344665527344, "learning_rate": 1.9761563575513093e-05, "loss": 1.6433, "step": 239600 }, { "epoch": 0.1667766687925244, "grad_norm": 6.281781196594238, "learning_rate": 1.9761167631993237e-05, "loss": 1.6451, "step": 239800 }, { "epoch": 0.16691576526357738, "grad_norm": 3.993034839630127, "learning_rate": 1.9760771365703e-05, "loss": 1.6326, "step": 240000 }, { "epoch": 0.16705486173463036, "grad_norm": 6.029495716094971, "learning_rate": 1.9760374776661288e-05, "loss": 1.6587, "step": 240200 }, { "epoch": 0.16719395820568334, "grad_norm": 4.002114772796631, "learning_rate": 1.9759977864887044e-05, "loss": 1.6484, "step": 240400 }, { "epoch": 0.16733305467673631, "grad_norm": 4.767657279968262, "learning_rate": 1.9759580630399218e-05, "loss": 1.6874, "step": 240600 }, { "epoch": 0.1674721511477893, "grad_norm": 5.881978511810303, "learning_rate": 1.9759183073216768e-05, "loss": 1.6754, "step": 240800 }, { "epoch": 0.16761124761884227, "grad_norm": 4.917115688323975, "learning_rate": 1.9758785193358672e-05, "loss": 1.6594, "step": 241000 }, { "epoch": 0.16775034408989525, "grad_norm": 7.00810432434082, "learning_rate": 1.9758386990843928e-05, "loss": 1.6253, "step": 241200 }, { "epoch": 0.16788944056094826, "grad_norm": 3.8624985218048096, "learning_rate": 1.9757988465691542e-05, "loss": 1.6543, "step": 241400 }, { "epoch": 0.16802853703200124, "grad_norm": 6.821996212005615, "learning_rate": 1.9757589617920542e-05, "loss": 1.674, "step": 241600 }, { "epoch": 0.16816763350305422, "grad_norm": 7.101013660430908, "learning_rate": 1.9757190447549967e-05, "loss": 1.6478, "step": 241800 }, { "epoch": 0.1683067299741072, "grad_norm": 3.3569910526275635, "learning_rate": 1.9756790954598874e-05, "loss": 1.6584, "step": 242000 }, { "epoch": 0.16844582644516018, "grad_norm": 3.9595654010772705, "learning_rate": 1.9756391139086332e-05, "loss": 1.6638, "step": 242200 }, { "epoch": 0.16858492291621316, "grad_norm": 7.490050315856934, "learning_rate": 1.9755991001031433e-05, "loss": 1.6073, "step": 242400 }, { "epoch": 0.16872401938726614, "grad_norm": 6.08933162689209, "learning_rate": 1.9755590540453275e-05, "loss": 1.6431, "step": 242600 }, { "epoch": 0.16886311585831912, "grad_norm": 5.067729473114014, "learning_rate": 1.9755189757370973e-05, "loss": 1.6102, "step": 242800 }, { "epoch": 0.1690022123293721, "grad_norm": 5.07871150970459, "learning_rate": 1.9754788651803664e-05, "loss": 1.6425, "step": 243000 }, { "epoch": 0.16914130880042508, "grad_norm": 4.212064743041992, "learning_rate": 1.97543872237705e-05, "loss": 1.6382, "step": 243200 }, { "epoch": 0.16928040527147806, "grad_norm": 5.310381889343262, "learning_rate": 1.9753985473290637e-05, "loss": 1.649, "step": 243400 }, { "epoch": 0.16941950174253104, "grad_norm": 5.563880443572998, "learning_rate": 1.9753583400383262e-05, "loss": 1.68, "step": 243600 }, { "epoch": 0.16955859821358402, "grad_norm": 3.346017837524414, "learning_rate": 1.975318100506756e-05, "loss": 1.6314, "step": 243800 }, { "epoch": 0.169697694684637, "grad_norm": 3.5837533473968506, "learning_rate": 1.9752778287362746e-05, "loss": 1.6668, "step": 244000 }, { "epoch": 0.16983679115568998, "grad_norm": 8.021364212036133, "learning_rate": 1.9752375247288046e-05, "loss": 1.6824, "step": 244200 }, { "epoch": 0.16997588762674296, "grad_norm": 6.544102191925049, "learning_rate": 1.97519718848627e-05, "loss": 1.6961, "step": 244400 }, { "epoch": 0.17011498409779593, "grad_norm": 6.292764663696289, "learning_rate": 1.9751568200105962e-05, "loss": 1.6397, "step": 244600 }, { "epoch": 0.17025408056884891, "grad_norm": 7.250925064086914, "learning_rate": 1.9751164193037104e-05, "loss": 1.7036, "step": 244800 }, { "epoch": 0.17039317703990192, "grad_norm": 4.996527194976807, "learning_rate": 1.975075986367542e-05, "loss": 1.609, "step": 245000 }, { "epoch": 0.1705322735109549, "grad_norm": 5.131301403045654, "learning_rate": 1.97503552120402e-05, "loss": 1.7069, "step": 245200 }, { "epoch": 0.17067136998200788, "grad_norm": 8.829029083251953, "learning_rate": 1.9749950238150776e-05, "loss": 1.6709, "step": 245400 }, { "epoch": 0.17081046645306086, "grad_norm": 5.728978633880615, "learning_rate": 1.9749544942026467e-05, "loss": 1.672, "step": 245600 }, { "epoch": 0.17094956292411384, "grad_norm": 5.395960330963135, "learning_rate": 1.9749139323686628e-05, "loss": 1.6404, "step": 245800 }, { "epoch": 0.17108865939516682, "grad_norm": 3.34220027923584, "learning_rate": 1.9748733383150624e-05, "loss": 1.6915, "step": 246000 }, { "epoch": 0.1712277558662198, "grad_norm": 2.9125590324401855, "learning_rate": 1.974832712043783e-05, "loss": 1.6057, "step": 246200 }, { "epoch": 0.17136685233727278, "grad_norm": 5.961441516876221, "learning_rate": 1.974792053556764e-05, "loss": 1.6639, "step": 246400 }, { "epoch": 0.17150594880832576, "grad_norm": 3.4587485790252686, "learning_rate": 1.9747513628559473e-05, "loss": 1.667, "step": 246600 }, { "epoch": 0.17164504527937874, "grad_norm": 3.578892946243286, "learning_rate": 1.974710639943274e-05, "loss": 1.6889, "step": 246800 }, { "epoch": 0.17178414175043172, "grad_norm": 4.567336082458496, "learning_rate": 1.9746698848206897e-05, "loss": 1.6884, "step": 247000 }, { "epoch": 0.1719232382214847, "grad_norm": 4.0480523109436035, "learning_rate": 1.974629097490139e-05, "loss": 1.6463, "step": 247200 }, { "epoch": 0.17206233469253768, "grad_norm": 4.559631824493408, "learning_rate": 1.974588277953569e-05, "loss": 1.6653, "step": 247400 }, { "epoch": 0.17220143116359066, "grad_norm": 5.548762798309326, "learning_rate": 1.974547426212929e-05, "loss": 1.7268, "step": 247600 }, { "epoch": 0.17234052763464364, "grad_norm": 5.385092735290527, "learning_rate": 1.9745065422701688e-05, "loss": 1.6544, "step": 247800 }, { "epoch": 0.17247962410569662, "grad_norm": 4.914535999298096, "learning_rate": 1.9744656261272402e-05, "loss": 1.6029, "step": 248000 }, { "epoch": 0.1726187205767496, "grad_norm": 4.936513423919678, "learning_rate": 1.9744246777860966e-05, "loss": 1.6773, "step": 248200 }, { "epoch": 0.17275781704780258, "grad_norm": 6.90836238861084, "learning_rate": 1.9743836972486927e-05, "loss": 1.6969, "step": 248400 }, { "epoch": 0.17289691351885558, "grad_norm": 2.9647436141967773, "learning_rate": 1.974342684516985e-05, "loss": 1.6023, "step": 248600 }, { "epoch": 0.17303600998990856, "grad_norm": 5.508109092712402, "learning_rate": 1.9743016395929313e-05, "loss": 1.6765, "step": 248800 }, { "epoch": 0.17317510646096154, "grad_norm": 3.3895699977874756, "learning_rate": 1.974260562478491e-05, "loss": 1.5906, "step": 249000 }, { "epoch": 0.17331420293201452, "grad_norm": 5.579315185546875, "learning_rate": 1.9742194531756248e-05, "loss": 1.6525, "step": 249200 }, { "epoch": 0.1734532994030675, "grad_norm": 8.646828651428223, "learning_rate": 1.9741783116862962e-05, "loss": 1.6733, "step": 249400 }, { "epoch": 0.17359239587412048, "grad_norm": 3.6953933238983154, "learning_rate": 1.974137138012468e-05, "loss": 1.5955, "step": 249600 }, { "epoch": 0.17373149234517346, "grad_norm": 4.18018913269043, "learning_rate": 1.974095932156107e-05, "loss": 1.6134, "step": 249800 }, { "epoch": 0.17387058881622644, "grad_norm": 3.936396837234497, "learning_rate": 1.9740546941191794e-05, "loss": 1.6412, "step": 250000 }, { "epoch": 0.17400968528727942, "grad_norm": 4.915919780731201, "learning_rate": 1.974013423903654e-05, "loss": 1.6542, "step": 250200 }, { "epoch": 0.1741487817583324, "grad_norm": 3.191892385482788, "learning_rate": 1.9739721215115012e-05, "loss": 1.6389, "step": 250400 }, { "epoch": 0.17428787822938538, "grad_norm": 3.418745517730713, "learning_rate": 1.9739307869446928e-05, "loss": 1.6424, "step": 250600 }, { "epoch": 0.17442697470043836, "grad_norm": 6.035984992980957, "learning_rate": 1.9738894202052017e-05, "loss": 1.6995, "step": 250800 }, { "epoch": 0.17456607117149134, "grad_norm": 4.395087242126465, "learning_rate": 1.9738480212950032e-05, "loss": 1.6077, "step": 251000 }, { "epoch": 0.17470516764254432, "grad_norm": 5.528264045715332, "learning_rate": 1.973806590216073e-05, "loss": 1.6664, "step": 251200 }, { "epoch": 0.1748442641135973, "grad_norm": 5.614687919616699, "learning_rate": 1.9737651269703898e-05, "loss": 1.6981, "step": 251400 }, { "epoch": 0.17498336058465028, "grad_norm": 5.233060836791992, "learning_rate": 1.9737236315599323e-05, "loss": 1.6558, "step": 251600 }, { "epoch": 0.17512245705570326, "grad_norm": 4.24580717086792, "learning_rate": 1.9736821039866817e-05, "loss": 1.6643, "step": 251800 }, { "epoch": 0.17526155352675624, "grad_norm": 6.44655179977417, "learning_rate": 1.9736405442526204e-05, "loss": 1.619, "step": 252000 }, { "epoch": 0.17540064999780924, "grad_norm": 6.081040382385254, "learning_rate": 1.9735989523597325e-05, "loss": 1.6087, "step": 252200 }, { "epoch": 0.17553974646886222, "grad_norm": 5.6498260498046875, "learning_rate": 1.9735573283100038e-05, "loss": 1.576, "step": 252400 }, { "epoch": 0.1756788429399152, "grad_norm": 3.5974433422088623, "learning_rate": 1.973515672105421e-05, "loss": 1.648, "step": 252600 }, { "epoch": 0.17581793941096818, "grad_norm": 6.066366672515869, "learning_rate": 1.973473983747973e-05, "loss": 1.6922, "step": 252800 }, { "epoch": 0.17595703588202116, "grad_norm": 7.197040557861328, "learning_rate": 1.973432263239649e-05, "loss": 1.6944, "step": 253000 }, { "epoch": 0.17609613235307414, "grad_norm": 5.857459545135498, "learning_rate": 1.9733905105824425e-05, "loss": 1.6374, "step": 253200 }, { "epoch": 0.17623522882412712, "grad_norm": 9.407792091369629, "learning_rate": 1.9733487257783454e-05, "loss": 1.601, "step": 253400 }, { "epoch": 0.1763743252951801, "grad_norm": 5.07534646987915, "learning_rate": 1.973306908829353e-05, "loss": 1.6582, "step": 253600 }, { "epoch": 0.17651342176623308, "grad_norm": 5.791580677032471, "learning_rate": 1.9732650597374606e-05, "loss": 1.6761, "step": 253800 }, { "epoch": 0.17665251823728606, "grad_norm": 3.480356454849243, "learning_rate": 1.973223178504667e-05, "loss": 1.6725, "step": 254000 }, { "epoch": 0.17679161470833904, "grad_norm": 6.098285675048828, "learning_rate": 1.9731812651329714e-05, "loss": 1.6698, "step": 254200 }, { "epoch": 0.17693071117939202, "grad_norm": 5.017908096313477, "learning_rate": 1.9731393196243754e-05, "loss": 1.6495, "step": 254400 }, { "epoch": 0.177069807650445, "grad_norm": 5.378422260284424, "learning_rate": 1.9730973419808798e-05, "loss": 1.6464, "step": 254600 }, { "epoch": 0.17720890412149798, "grad_norm": 4.927765369415283, "learning_rate": 1.97305533220449e-05, "loss": 1.6137, "step": 254800 }, { "epoch": 0.17734800059255096, "grad_norm": 3.2673325538635254, "learning_rate": 1.9730132902972104e-05, "loss": 1.6225, "step": 255000 }, { "epoch": 0.17748709706360394, "grad_norm": 3.544149160385132, "learning_rate": 1.972971216261049e-05, "loss": 1.6482, "step": 255200 }, { "epoch": 0.17762619353465692, "grad_norm": 4.093089580535889, "learning_rate": 1.9729291100980137e-05, "loss": 1.5997, "step": 255400 }, { "epoch": 0.1777652900057099, "grad_norm": 5.751675128936768, "learning_rate": 1.972886971810115e-05, "loss": 1.6099, "step": 255600 }, { "epoch": 0.1779043864767629, "grad_norm": 3.959738254547119, "learning_rate": 1.9728448013993642e-05, "loss": 1.6256, "step": 255800 }, { "epoch": 0.17804348294781588, "grad_norm": 4.163662910461426, "learning_rate": 1.9728025988677747e-05, "loss": 1.6385, "step": 256000 }, { "epoch": 0.17818257941886886, "grad_norm": 7.837775707244873, "learning_rate": 1.9727603642173613e-05, "loss": 1.6774, "step": 256200 }, { "epoch": 0.17832167588992184, "grad_norm": 4.611356258392334, "learning_rate": 1.9727180974501402e-05, "loss": 1.6491, "step": 256400 }, { "epoch": 0.17846077236097482, "grad_norm": 3.0810539722442627, "learning_rate": 1.9726757985681288e-05, "loss": 1.6984, "step": 256600 }, { "epoch": 0.1785998688320278, "grad_norm": 4.1702351570129395, "learning_rate": 1.9726334675733464e-05, "loss": 1.633, "step": 256800 }, { "epoch": 0.17873896530308078, "grad_norm": 4.221049785614014, "learning_rate": 1.9725911044678146e-05, "loss": 1.6527, "step": 257000 }, { "epoch": 0.17887806177413376, "grad_norm": 6.744946002960205, "learning_rate": 1.9725487092535548e-05, "loss": 1.6481, "step": 257200 }, { "epoch": 0.17901715824518674, "grad_norm": 4.854006290435791, "learning_rate": 1.9725062819325918e-05, "loss": 1.6068, "step": 257400 }, { "epoch": 0.17915625471623972, "grad_norm": 4.671850681304932, "learning_rate": 1.97246382250695e-05, "loss": 1.6917, "step": 257600 }, { "epoch": 0.1792953511872927, "grad_norm": 5.779367923736572, "learning_rate": 1.972421330978657e-05, "loss": 1.6526, "step": 257800 }, { "epoch": 0.17943444765834568, "grad_norm": 3.814084768295288, "learning_rate": 1.9723788073497414e-05, "loss": 1.6146, "step": 258000 }, { "epoch": 0.17957354412939866, "grad_norm": 3.730072259902954, "learning_rate": 1.9723362516222333e-05, "loss": 1.6518, "step": 258200 }, { "epoch": 0.17971264060045164, "grad_norm": 4.372803211212158, "learning_rate": 1.9722936637981637e-05, "loss": 1.6634, "step": 258400 }, { "epoch": 0.17985173707150462, "grad_norm": 4.720413684844971, "learning_rate": 1.9722510438795664e-05, "loss": 1.6809, "step": 258600 }, { "epoch": 0.1799908335425576, "grad_norm": 5.205902099609375, "learning_rate": 1.9722083918684754e-05, "loss": 1.6719, "step": 258800 }, { "epoch": 0.18012993001361058, "grad_norm": 4.575140476226807, "learning_rate": 1.972165707766927e-05, "loss": 1.6258, "step": 259000 }, { "epoch": 0.18026902648466356, "grad_norm": 5.279964447021484, "learning_rate": 1.972122991576959e-05, "loss": 1.6899, "step": 259200 }, { "epoch": 0.18040812295571657, "grad_norm": 4.771291255950928, "learning_rate": 1.9720802433006107e-05, "loss": 1.6392, "step": 259400 }, { "epoch": 0.18054721942676955, "grad_norm": 4.930868148803711, "learning_rate": 1.972037462939923e-05, "loss": 1.6513, "step": 259600 }, { "epoch": 0.18068631589782252, "grad_norm": 2.3612167835235596, "learning_rate": 1.9719946504969374e-05, "loss": 1.6017, "step": 259800 }, { "epoch": 0.1808254123688755, "grad_norm": 6.671528339385986, "learning_rate": 1.9719518059736987e-05, "loss": 1.6377, "step": 260000 }, { "epoch": 0.18096450883992848, "grad_norm": 6.053910255432129, "learning_rate": 1.9719089293722517e-05, "loss": 1.6561, "step": 260200 }, { "epoch": 0.18110360531098146, "grad_norm": 8.888887405395508, "learning_rate": 1.9718660206946434e-05, "loss": 1.6484, "step": 260400 }, { "epoch": 0.18124270178203444, "grad_norm": 8.163064002990723, "learning_rate": 1.9718230799429224e-05, "loss": 1.6584, "step": 260600 }, { "epoch": 0.18138179825308742, "grad_norm": 3.8536617755889893, "learning_rate": 1.971780107119138e-05, "loss": 1.632, "step": 260800 }, { "epoch": 0.1815208947241404, "grad_norm": 5.531630992889404, "learning_rate": 1.9717371022253425e-05, "loss": 1.6318, "step": 261000 }, { "epoch": 0.18165999119519338, "grad_norm": 7.324404716491699, "learning_rate": 1.9716940652635884e-05, "loss": 1.6994, "step": 261200 }, { "epoch": 0.18179908766624636, "grad_norm": 7.374257564544678, "learning_rate": 1.9716509962359303e-05, "loss": 1.6858, "step": 261400 }, { "epoch": 0.18193818413729934, "grad_norm": 4.538333415985107, "learning_rate": 1.9716078951444248e-05, "loss": 1.624, "step": 261600 }, { "epoch": 0.18207728060835232, "grad_norm": 6.309831142425537, "learning_rate": 1.9715647619911284e-05, "loss": 1.6307, "step": 261800 }, { "epoch": 0.1822163770794053, "grad_norm": 4.1088128089904785, "learning_rate": 1.971521596778101e-05, "loss": 1.6736, "step": 262000 }, { "epoch": 0.18235547355045828, "grad_norm": 4.381863117218018, "learning_rate": 1.9714783995074032e-05, "loss": 1.6639, "step": 262200 }, { "epoch": 0.18249457002151126, "grad_norm": 3.055105209350586, "learning_rate": 1.971435170181097e-05, "loss": 1.668, "step": 262400 }, { "epoch": 0.18263366649256424, "grad_norm": 3.906743049621582, "learning_rate": 1.9713919088012462e-05, "loss": 1.6917, "step": 262600 }, { "epoch": 0.18277276296361722, "grad_norm": 7.914857864379883, "learning_rate": 1.9713486153699165e-05, "loss": 1.632, "step": 262800 }, { "epoch": 0.18291185943467023, "grad_norm": 4.47186803817749, "learning_rate": 1.9713052898891738e-05, "loss": 1.6699, "step": 263000 }, { "epoch": 0.1830509559057232, "grad_norm": 4.610714912414551, "learning_rate": 1.971261932361087e-05, "loss": 1.6515, "step": 263200 }, { "epoch": 0.18319005237677619, "grad_norm": 4.2848663330078125, "learning_rate": 1.9712185427877256e-05, "loss": 1.6598, "step": 263400 }, { "epoch": 0.18332914884782917, "grad_norm": 3.241429328918457, "learning_rate": 1.9711751211711615e-05, "loss": 1.6509, "step": 263600 }, { "epoch": 0.18346824531888214, "grad_norm": 4.7403388023376465, "learning_rate": 1.9711316675134667e-05, "loss": 1.5936, "step": 263800 }, { "epoch": 0.18360734178993512, "grad_norm": 6.96336030960083, "learning_rate": 1.9710881818167165e-05, "loss": 1.69, "step": 264000 }, { "epoch": 0.1837464382609881, "grad_norm": 6.949305534362793, "learning_rate": 1.971044664082987e-05, "loss": 1.7313, "step": 264200 }, { "epoch": 0.18388553473204108, "grad_norm": 5.006408214569092, "learning_rate": 1.9710011143143543e-05, "loss": 1.6883, "step": 264400 }, { "epoch": 0.18402463120309406, "grad_norm": 6.867736339569092, "learning_rate": 1.9709575325128992e-05, "loss": 1.6202, "step": 264600 }, { "epoch": 0.18416372767414704, "grad_norm": 4.534424781799316, "learning_rate": 1.970913918680701e-05, "loss": 1.6482, "step": 264800 }, { "epoch": 0.18430282414520002, "grad_norm": 3.9042301177978516, "learning_rate": 1.9708702728198425e-05, "loss": 1.6978, "step": 265000 }, { "epoch": 0.184441920616253, "grad_norm": 6.262451171875, "learning_rate": 1.9708265949324065e-05, "loss": 1.6781, "step": 265200 }, { "epoch": 0.18458101708730598, "grad_norm": 4.745058536529541, "learning_rate": 1.9707828850204785e-05, "loss": 1.6822, "step": 265400 }, { "epoch": 0.18472011355835896, "grad_norm": 2.7927439212799072, "learning_rate": 1.9707391430861458e-05, "loss": 1.7232, "step": 265600 }, { "epoch": 0.18485921002941194, "grad_norm": 4.233567714691162, "learning_rate": 1.9706953691314957e-05, "loss": 1.6036, "step": 265800 }, { "epoch": 0.18499830650046492, "grad_norm": 3.4694018363952637, "learning_rate": 1.9706515631586186e-05, "loss": 1.7019, "step": 266000 }, { "epoch": 0.1851374029715179, "grad_norm": 10.153181076049805, "learning_rate": 1.9706077251696052e-05, "loss": 1.6461, "step": 266200 }, { "epoch": 0.18527649944257088, "grad_norm": 5.279789447784424, "learning_rate": 1.9705638551665488e-05, "loss": 1.6111, "step": 266400 }, { "epoch": 0.18541559591362386, "grad_norm": 6.009661674499512, "learning_rate": 1.9705199531515427e-05, "loss": 1.6265, "step": 266600 }, { "epoch": 0.18555469238467687, "grad_norm": 7.22796106338501, "learning_rate": 1.970476019126684e-05, "loss": 1.715, "step": 266800 }, { "epoch": 0.18569378885572985, "grad_norm": 3.068493127822876, "learning_rate": 1.9704320530940694e-05, "loss": 1.6825, "step": 267000 }, { "epoch": 0.18583288532678283, "grad_norm": 2.814833879470825, "learning_rate": 1.9703880550557976e-05, "loss": 1.6431, "step": 267200 }, { "epoch": 0.1859719817978358, "grad_norm": 10.685017585754395, "learning_rate": 1.9703440250139696e-05, "loss": 1.6924, "step": 267400 }, { "epoch": 0.18611107826888879, "grad_norm": 4.870309829711914, "learning_rate": 1.9702999629706866e-05, "loss": 1.6582, "step": 267600 }, { "epoch": 0.18625017473994177, "grad_norm": 7.665128707885742, "learning_rate": 1.970255868928053e-05, "loss": 1.6855, "step": 267800 }, { "epoch": 0.18638927121099474, "grad_norm": 5.275532245635986, "learning_rate": 1.970211742888173e-05, "loss": 1.6763, "step": 268000 }, { "epoch": 0.18652836768204772, "grad_norm": 3.489755392074585, "learning_rate": 1.9701675848531535e-05, "loss": 1.6366, "step": 268200 }, { "epoch": 0.1866674641531007, "grad_norm": 5.010584354400635, "learning_rate": 1.970123394825103e-05, "loss": 1.6195, "step": 268400 }, { "epoch": 0.18680656062415368, "grad_norm": 2.8063342571258545, "learning_rate": 1.9700791728061296e-05, "loss": 1.6696, "step": 268600 }, { "epoch": 0.18694565709520666, "grad_norm": 5.581388473510742, "learning_rate": 1.970034918798346e-05, "loss": 1.7241, "step": 268800 }, { "epoch": 0.18708475356625964, "grad_norm": 9.602514266967773, "learning_rate": 1.9699906328038643e-05, "loss": 1.6885, "step": 269000 }, { "epoch": 0.18722385003731262, "grad_norm": 3.747452735900879, "learning_rate": 1.9699463148247983e-05, "loss": 1.6347, "step": 269200 }, { "epoch": 0.1873629465083656, "grad_norm": 4.583897590637207, "learning_rate": 1.969901964863264e-05, "loss": 1.6469, "step": 269400 }, { "epoch": 0.18750204297941858, "grad_norm": 4.23382568359375, "learning_rate": 1.9698575829213786e-05, "loss": 1.5978, "step": 269600 }, { "epoch": 0.18764113945047156, "grad_norm": 7.168055057525635, "learning_rate": 1.969813169001261e-05, "loss": 1.6768, "step": 269800 }, { "epoch": 0.18778023592152454, "grad_norm": 6.323521614074707, "learning_rate": 1.9697687231050314e-05, "loss": 1.6085, "step": 270000 }, { "epoch": 0.18791933239257752, "grad_norm": 6.865908145904541, "learning_rate": 1.969724245234811e-05, "loss": 1.5905, "step": 270200 }, { "epoch": 0.18805842886363053, "grad_norm": 5.0147294998168945, "learning_rate": 1.9696797353927234e-05, "loss": 1.6379, "step": 270400 }, { "epoch": 0.1881975253346835, "grad_norm": 5.6992316246032715, "learning_rate": 1.9696351935808945e-05, "loss": 1.6266, "step": 270600 }, { "epoch": 0.1883366218057365, "grad_norm": 3.931854248046875, "learning_rate": 1.9695906198014496e-05, "loss": 1.6416, "step": 270800 }, { "epoch": 0.18847571827678947, "grad_norm": 4.03151798248291, "learning_rate": 1.9695460140565164e-05, "loss": 1.6011, "step": 271000 }, { "epoch": 0.18861481474784245, "grad_norm": 3.1182284355163574, "learning_rate": 1.969501376348225e-05, "loss": 1.6026, "step": 271200 }, { "epoch": 0.18875391121889543, "grad_norm": 5.573946952819824, "learning_rate": 1.969456706678706e-05, "loss": 1.6923, "step": 271400 }, { "epoch": 0.1888930076899484, "grad_norm": 3.697075843811035, "learning_rate": 1.969412005050092e-05, "loss": 1.5972, "step": 271600 }, { "epoch": 0.18903210416100139, "grad_norm": 6.2319722175598145, "learning_rate": 1.9693672714645175e-05, "loss": 1.6819, "step": 271800 }, { "epoch": 0.18917120063205436, "grad_norm": 5.601167678833008, "learning_rate": 1.969322505924117e-05, "loss": 1.6835, "step": 272000 }, { "epoch": 0.18931029710310734, "grad_norm": 3.6777641773223877, "learning_rate": 1.969277708431028e-05, "loss": 1.5691, "step": 272200 }, { "epoch": 0.18944939357416032, "grad_norm": 6.166382789611816, "learning_rate": 1.9692328789873895e-05, "loss": 1.7033, "step": 272400 }, { "epoch": 0.1895884900452133, "grad_norm": 7.177475452423096, "learning_rate": 1.9691880175953418e-05, "loss": 1.6617, "step": 272600 }, { "epoch": 0.18972758651626628, "grad_norm": 5.140500068664551, "learning_rate": 1.969143124257025e-05, "loss": 1.655, "step": 272800 }, { "epoch": 0.18986668298731926, "grad_norm": 4.847902774810791, "learning_rate": 1.969098198974584e-05, "loss": 1.6835, "step": 273000 }, { "epoch": 0.19000577945837224, "grad_norm": 5.792107105255127, "learning_rate": 1.969053241750162e-05, "loss": 1.7271, "step": 273200 }, { "epoch": 0.19014487592942522, "grad_norm": 4.143697738647461, "learning_rate": 1.9690082525859067e-05, "loss": 1.7298, "step": 273400 }, { "epoch": 0.1902839724004782, "grad_norm": 4.511526584625244, "learning_rate": 1.9689632314839642e-05, "loss": 1.6701, "step": 273600 }, { "epoch": 0.19042306887153118, "grad_norm": 3.1369123458862305, "learning_rate": 1.9689181784464852e-05, "loss": 1.6691, "step": 273800 }, { "epoch": 0.1905621653425842, "grad_norm": 8.698987007141113, "learning_rate": 1.9688730934756198e-05, "loss": 1.6709, "step": 274000 }, { "epoch": 0.19070126181363717, "grad_norm": 2.9548964500427246, "learning_rate": 1.9688279765735204e-05, "loss": 1.6716, "step": 274200 }, { "epoch": 0.19084035828469015, "grad_norm": 5.724959373474121, "learning_rate": 1.968782827742341e-05, "loss": 1.7065, "step": 274400 }, { "epoch": 0.19097945475574313, "grad_norm": 3.5652151107788086, "learning_rate": 1.968737646984237e-05, "loss": 1.6608, "step": 274600 }, { "epoch": 0.1911185512267961, "grad_norm": 6.696306228637695, "learning_rate": 1.9686924343013648e-05, "loss": 1.5816, "step": 274800 }, { "epoch": 0.1912576476978491, "grad_norm": 5.117129325866699, "learning_rate": 1.968647189695883e-05, "loss": 1.6743, "step": 275000 }, { "epoch": 0.19139674416890207, "grad_norm": 3.5398950576782227, "learning_rate": 1.9686019131699518e-05, "loss": 1.6958, "step": 275200 }, { "epoch": 0.19153584063995505, "grad_norm": 3.010068655014038, "learning_rate": 1.9685566047257324e-05, "loss": 1.648, "step": 275400 }, { "epoch": 0.19167493711100803, "grad_norm": 6.521470069885254, "learning_rate": 1.9685112643653877e-05, "loss": 1.6843, "step": 275600 }, { "epoch": 0.191814033582061, "grad_norm": 2.959944725036621, "learning_rate": 1.9684658920910822e-05, "loss": 1.6299, "step": 275800 }, { "epoch": 0.19195313005311399, "grad_norm": 4.166990756988525, "learning_rate": 1.9684204879049825e-05, "loss": 1.607, "step": 276000 }, { "epoch": 0.19209222652416696, "grad_norm": 3.578461170196533, "learning_rate": 1.9683750518092553e-05, "loss": 1.659, "step": 276200 }, { "epoch": 0.19223132299521994, "grad_norm": 4.841921329498291, "learning_rate": 1.96832958380607e-05, "loss": 1.6197, "step": 276400 }, { "epoch": 0.19237041946627292, "grad_norm": 11.981287956237793, "learning_rate": 1.9682840838975977e-05, "loss": 1.6229, "step": 276600 }, { "epoch": 0.1925095159373259, "grad_norm": 2.8661787509918213, "learning_rate": 1.9682385520860097e-05, "loss": 1.6775, "step": 276800 }, { "epoch": 0.19264861240837888, "grad_norm": 4.213954925537109, "learning_rate": 1.96819298837348e-05, "loss": 1.6627, "step": 277000 }, { "epoch": 0.19278770887943186, "grad_norm": 5.1365156173706055, "learning_rate": 1.9681473927621838e-05, "loss": 1.6678, "step": 277200 }, { "epoch": 0.19292680535048484, "grad_norm": 2.4719531536102295, "learning_rate": 1.968101765254298e-05, "loss": 1.6817, "step": 277400 }, { "epoch": 0.19306590182153785, "grad_norm": 4.936479091644287, "learning_rate": 1.9680561058520005e-05, "loss": 1.6649, "step": 277600 }, { "epoch": 0.19320499829259083, "grad_norm": 4.365041255950928, "learning_rate": 1.9680104145574707e-05, "loss": 1.6396, "step": 277800 }, { "epoch": 0.1933440947636438, "grad_norm": 3.9776573181152344, "learning_rate": 1.9679646913728904e-05, "loss": 1.6523, "step": 278000 }, { "epoch": 0.1934831912346968, "grad_norm": 5.670855522155762, "learning_rate": 1.9679189363004422e-05, "loss": 1.6252, "step": 278200 }, { "epoch": 0.19362228770574977, "grad_norm": 4.940858364105225, "learning_rate": 1.9678731493423106e-05, "loss": 1.6336, "step": 278400 }, { "epoch": 0.19376138417680275, "grad_norm": 5.0531535148620605, "learning_rate": 1.967827330500681e-05, "loss": 1.6045, "step": 278600 }, { "epoch": 0.19390048064785573, "grad_norm": 5.485026836395264, "learning_rate": 1.9677814797777407e-05, "loss": 1.6662, "step": 278800 }, { "epoch": 0.1940395771189087, "grad_norm": 7.188427448272705, "learning_rate": 1.9677355971756796e-05, "loss": 1.6233, "step": 279000 }, { "epoch": 0.1941786735899617, "grad_norm": 4.312156677246094, "learning_rate": 1.9676896826966866e-05, "loss": 1.7074, "step": 279200 }, { "epoch": 0.19431777006101467, "grad_norm": 4.042320251464844, "learning_rate": 1.9676437363429547e-05, "loss": 1.6669, "step": 279400 }, { "epoch": 0.19445686653206765, "grad_norm": 6.421662330627441, "learning_rate": 1.9675977581166764e-05, "loss": 1.6734, "step": 279600 }, { "epoch": 0.19459596300312063, "grad_norm": 6.484320163726807, "learning_rate": 1.9675517480200472e-05, "loss": 1.6547, "step": 279800 }, { "epoch": 0.1947350594741736, "grad_norm": 5.941833972930908, "learning_rate": 1.9675057060552637e-05, "loss": 1.7146, "step": 280000 }, { "epoch": 0.19487415594522658, "grad_norm": 5.892642974853516, "learning_rate": 1.9674596322245237e-05, "loss": 1.64, "step": 280200 }, { "epoch": 0.19501325241627956, "grad_norm": 4.300663471221924, "learning_rate": 1.9674135265300268e-05, "loss": 1.7107, "step": 280400 }, { "epoch": 0.19515234888733254, "grad_norm": 3.902686357498169, "learning_rate": 1.9673673889739742e-05, "loss": 1.672, "step": 280600 }, { "epoch": 0.19529144535838552, "grad_norm": 4.7465314865112305, "learning_rate": 1.967321219558568e-05, "loss": 1.6237, "step": 280800 }, { "epoch": 0.1954305418294385, "grad_norm": 3.3442418575286865, "learning_rate": 1.9672750182860128e-05, "loss": 1.6684, "step": 281000 }, { "epoch": 0.1955696383004915, "grad_norm": 6.627809524536133, "learning_rate": 1.9672287851585135e-05, "loss": 1.6514, "step": 281200 }, { "epoch": 0.1957087347715445, "grad_norm": 3.9617884159088135, "learning_rate": 1.9671825201782776e-05, "loss": 1.6773, "step": 281400 }, { "epoch": 0.19584783124259747, "grad_norm": 7.082869529724121, "learning_rate": 1.967136223347514e-05, "loss": 1.7027, "step": 281600 }, { "epoch": 0.19598692771365045, "grad_norm": 3.1545870304107666, "learning_rate": 1.9670898946684323e-05, "loss": 1.6829, "step": 281800 }, { "epoch": 0.19612602418470343, "grad_norm": 5.681509971618652, "learning_rate": 1.967043534143245e-05, "loss": 1.6874, "step": 282000 }, { "epoch": 0.1962651206557564, "grad_norm": 4.982570171356201, "learning_rate": 1.9669971417741644e-05, "loss": 1.671, "step": 282200 }, { "epoch": 0.1964042171268094, "grad_norm": 4.787365436553955, "learning_rate": 1.9669507175634055e-05, "loss": 1.6774, "step": 282400 }, { "epoch": 0.19654331359786237, "grad_norm": 3.468003988265991, "learning_rate": 1.9669042615131846e-05, "loss": 1.6844, "step": 282600 }, { "epoch": 0.19668241006891535, "grad_norm": 4.703042507171631, "learning_rate": 1.9668577736257196e-05, "loss": 1.6502, "step": 282800 }, { "epoch": 0.19682150653996833, "grad_norm": 3.5980827808380127, "learning_rate": 1.96681125390323e-05, "loss": 1.6675, "step": 283000 }, { "epoch": 0.1969606030110213, "grad_norm": 5.123573303222656, "learning_rate": 1.966764702347936e-05, "loss": 1.6239, "step": 283200 }, { "epoch": 0.1970996994820743, "grad_norm": 3.511472225189209, "learning_rate": 1.9667181189620602e-05, "loss": 1.6676, "step": 283400 }, { "epoch": 0.19723879595312727, "grad_norm": 5.761998176574707, "learning_rate": 1.9666715037478264e-05, "loss": 1.6952, "step": 283600 }, { "epoch": 0.19737789242418025, "grad_norm": 4.794870376586914, "learning_rate": 1.9666248567074602e-05, "loss": 1.6473, "step": 283800 }, { "epoch": 0.19751698889523323, "grad_norm": 3.9736626148223877, "learning_rate": 1.966578177843188e-05, "loss": 1.6795, "step": 284000 }, { "epoch": 0.1976560853662862, "grad_norm": 4.012296676635742, "learning_rate": 1.9665314671572388e-05, "loss": 1.6281, "step": 284200 }, { "epoch": 0.19779518183733918, "grad_norm": 5.5282182693481445, "learning_rate": 1.9664847246518416e-05, "loss": 1.6658, "step": 284400 }, { "epoch": 0.19793427830839216, "grad_norm": 3.9729387760162354, "learning_rate": 1.9664379503292287e-05, "loss": 1.5987, "step": 284600 }, { "epoch": 0.19807337477944517, "grad_norm": 6.152369022369385, "learning_rate": 1.9663911441916333e-05, "loss": 1.6623, "step": 284800 }, { "epoch": 0.19821247125049815, "grad_norm": 4.116359710693359, "learning_rate": 1.9663443062412884e-05, "loss": 1.7005, "step": 285000 }, { "epoch": 0.19835156772155113, "grad_norm": 3.8620975017547607, "learning_rate": 1.9662974364804314e-05, "loss": 1.6533, "step": 285200 }, { "epoch": 0.1984906641926041, "grad_norm": 8.122262954711914, "learning_rate": 1.9662505349112995e-05, "loss": 1.6503, "step": 285400 }, { "epoch": 0.1986297606636571, "grad_norm": 5.698178291320801, "learning_rate": 1.9662036015361313e-05, "loss": 1.6512, "step": 285600 }, { "epoch": 0.19876885713471007, "grad_norm": 3.9581122398376465, "learning_rate": 1.9661566363571673e-05, "loss": 1.6597, "step": 285800 }, { "epoch": 0.19890795360576305, "grad_norm": 4.182572841644287, "learning_rate": 1.9661096393766504e-05, "loss": 1.6463, "step": 286000 }, { "epoch": 0.19904705007681603, "grad_norm": 6.348291397094727, "learning_rate": 1.9660626105968234e-05, "loss": 1.6712, "step": 286200 }, { "epoch": 0.199186146547869, "grad_norm": 3.3880882263183594, "learning_rate": 1.9660155500199317e-05, "loss": 1.6846, "step": 286400 }, { "epoch": 0.199325243018922, "grad_norm": 3.741093397140503, "learning_rate": 1.965968457648222e-05, "loss": 1.6929, "step": 286600 }, { "epoch": 0.19946433948997497, "grad_norm": 6.330505847930908, "learning_rate": 1.965921333483942e-05, "loss": 1.673, "step": 286800 }, { "epoch": 0.19960343596102795, "grad_norm": 6.309572219848633, "learning_rate": 1.9658741775293418e-05, "loss": 1.612, "step": 287000 }, { "epoch": 0.19974253243208093, "grad_norm": 6.026462554931641, "learning_rate": 1.9658269897866728e-05, "loss": 1.6255, "step": 287200 }, { "epoch": 0.1998816289031339, "grad_norm": 3.66117000579834, "learning_rate": 1.9657797702581868e-05, "loss": 1.6389, "step": 287400 }, { "epoch": 0.2000207253741869, "grad_norm": 4.466751575469971, "learning_rate": 1.9657325189461386e-05, "loss": 1.6851, "step": 287600 }, { "epoch": 0.20015982184523987, "grad_norm": 4.952764511108398, "learning_rate": 1.9656852358527837e-05, "loss": 1.6372, "step": 287800 }, { "epoch": 0.20029891831629285, "grad_norm": 3.539882183074951, "learning_rate": 1.9656379209803796e-05, "loss": 1.657, "step": 288000 }, { "epoch": 0.20043801478734583, "grad_norm": 4.36411190032959, "learning_rate": 1.9655905743311855e-05, "loss": 1.6512, "step": 288200 }, { "epoch": 0.20057711125839883, "grad_norm": 5.147241115570068, "learning_rate": 1.9655431959074606e-05, "loss": 1.6875, "step": 288400 }, { "epoch": 0.2007162077294518, "grad_norm": 10.555268287658691, "learning_rate": 1.9654957857114675e-05, "loss": 1.6896, "step": 288600 }, { "epoch": 0.2008553042005048, "grad_norm": 4.449903964996338, "learning_rate": 1.9654483437454685e-05, "loss": 1.6585, "step": 288800 }, { "epoch": 0.20099440067155777, "grad_norm": 7.236289978027344, "learning_rate": 1.96540087001173e-05, "loss": 1.6599, "step": 289000 }, { "epoch": 0.20113349714261075, "grad_norm": 5.000361442565918, "learning_rate": 1.9653533645125173e-05, "loss": 1.5978, "step": 289200 }, { "epoch": 0.20127259361366373, "grad_norm": 3.5129621028900146, "learning_rate": 1.9653058272500977e-05, "loss": 1.6843, "step": 289400 }, { "epoch": 0.2014116900847167, "grad_norm": 3.642213821411133, "learning_rate": 1.965258258226742e-05, "loss": 1.7248, "step": 289600 }, { "epoch": 0.2015507865557697, "grad_norm": 3.3052613735198975, "learning_rate": 1.9652106574447205e-05, "loss": 1.6429, "step": 289800 }, { "epoch": 0.20168988302682267, "grad_norm": 9.101256370544434, "learning_rate": 1.965163024906305e-05, "loss": 1.6753, "step": 290000 }, { "epoch": 0.20182897949787565, "grad_norm": 4.371603965759277, "learning_rate": 1.9651153606137706e-05, "loss": 1.6967, "step": 290200 }, { "epoch": 0.20196807596892863, "grad_norm": 5.514856338500977, "learning_rate": 1.9650676645693916e-05, "loss": 1.5965, "step": 290400 }, { "epoch": 0.2021071724399816, "grad_norm": 6.857879638671875, "learning_rate": 1.9650199367754453e-05, "loss": 1.652, "step": 290600 }, { "epoch": 0.2022462689110346, "grad_norm": 3.5766725540161133, "learning_rate": 1.9649721772342104e-05, "loss": 1.6358, "step": 290800 }, { "epoch": 0.20238536538208757, "grad_norm": 4.538419723510742, "learning_rate": 1.964924385947967e-05, "loss": 1.6559, "step": 291000 }, { "epoch": 0.20252446185314055, "grad_norm": 3.8856050968170166, "learning_rate": 1.9648765629189963e-05, "loss": 1.6379, "step": 291200 }, { "epoch": 0.20266355832419353, "grad_norm": 4.216159343719482, "learning_rate": 1.9648287081495815e-05, "loss": 1.6622, "step": 291400 }, { "epoch": 0.2028026547952465, "grad_norm": 5.262803554534912, "learning_rate": 1.964780821642007e-05, "loss": 1.6383, "step": 291600 }, { "epoch": 0.20294175126629949, "grad_norm": 4.314306259155273, "learning_rate": 1.9647329033985588e-05, "loss": 1.6859, "step": 291800 }, { "epoch": 0.20308084773735247, "grad_norm": 5.328889846801758, "learning_rate": 1.9646849534215247e-05, "loss": 1.6393, "step": 292000 }, { "epoch": 0.20321994420840547, "grad_norm": 4.870936870574951, "learning_rate": 1.964636971713193e-05, "loss": 1.6896, "step": 292200 }, { "epoch": 0.20335904067945845, "grad_norm": 4.600678443908691, "learning_rate": 1.9645889582758557e-05, "loss": 1.5797, "step": 292400 }, { "epoch": 0.20349813715051143, "grad_norm": 4.977414131164551, "learning_rate": 1.9645409131118042e-05, "loss": 1.6682, "step": 292600 }, { "epoch": 0.2036372336215644, "grad_norm": 3.468921422958374, "learning_rate": 1.964492836223331e-05, "loss": 1.6754, "step": 292800 }, { "epoch": 0.2037763300926174, "grad_norm": 4.028801918029785, "learning_rate": 1.9644447276127333e-05, "loss": 1.6543, "step": 293000 }, { "epoch": 0.20391542656367037, "grad_norm": 4.293079376220703, "learning_rate": 1.9643965872823062e-05, "loss": 1.6697, "step": 293200 }, { "epoch": 0.20405452303472335, "grad_norm": 3.8638367652893066, "learning_rate": 1.9643484152343486e-05, "loss": 1.6864, "step": 293400 }, { "epoch": 0.20419361950577633, "grad_norm": 3.492187023162842, "learning_rate": 1.96430021147116e-05, "loss": 1.6544, "step": 293600 }, { "epoch": 0.2043327159768293, "grad_norm": 5.296926975250244, "learning_rate": 1.9642519759950414e-05, "loss": 1.6679, "step": 293800 }, { "epoch": 0.2044718124478823, "grad_norm": 4.00054407119751, "learning_rate": 1.9642037088082957e-05, "loss": 1.6582, "step": 294000 }, { "epoch": 0.20461090891893527, "grad_norm": 4.101848125457764, "learning_rate": 1.964155409913227e-05, "loss": 1.6128, "step": 294200 }, { "epoch": 0.20475000538998825, "grad_norm": 4.239231586456299, "learning_rate": 1.9641070793121413e-05, "loss": 1.6433, "step": 294400 }, { "epoch": 0.20488910186104123, "grad_norm": 5.212873935699463, "learning_rate": 1.9640587170073452e-05, "loss": 1.6895, "step": 294600 }, { "epoch": 0.2050281983320942, "grad_norm": 6.059063911437988, "learning_rate": 1.9640103230011485e-05, "loss": 1.6838, "step": 294800 }, { "epoch": 0.2051672948031472, "grad_norm": 5.193586349487305, "learning_rate": 1.9639618972958608e-05, "loss": 1.6634, "step": 295000 }, { "epoch": 0.20530639127420017, "grad_norm": 6.4489946365356445, "learning_rate": 1.963913439893794e-05, "loss": 1.6601, "step": 295200 }, { "epoch": 0.20544548774525315, "grad_norm": 3.689493417739868, "learning_rate": 1.963864950797261e-05, "loss": 1.6345, "step": 295400 }, { "epoch": 0.20558458421630613, "grad_norm": 6.346654891967773, "learning_rate": 1.9638164300085775e-05, "loss": 1.558, "step": 295600 }, { "epoch": 0.20572368068735913, "grad_norm": 3.1460225582122803, "learning_rate": 1.9637678775300593e-05, "loss": 1.6409, "step": 295800 }, { "epoch": 0.2058627771584121, "grad_norm": 4.389196872711182, "learning_rate": 1.9637192933640237e-05, "loss": 1.711, "step": 296000 }, { "epoch": 0.2060018736294651, "grad_norm": 3.85598087310791, "learning_rate": 1.9636706775127915e-05, "loss": 1.6426, "step": 296200 }, { "epoch": 0.20614097010051807, "grad_norm": 4.471874713897705, "learning_rate": 1.963622029978682e-05, "loss": 1.6757, "step": 296400 }, { "epoch": 0.20628006657157105, "grad_norm": 5.590032577514648, "learning_rate": 1.9635733507640184e-05, "loss": 1.6918, "step": 296600 }, { "epoch": 0.20641916304262403, "grad_norm": 3.761293411254883, "learning_rate": 1.9635246398711246e-05, "loss": 1.6482, "step": 296800 }, { "epoch": 0.206558259513677, "grad_norm": 3.6025288105010986, "learning_rate": 1.963475897302326e-05, "loss": 1.6329, "step": 297000 }, { "epoch": 0.20669735598473, "grad_norm": 7.067914009094238, "learning_rate": 1.963427123059949e-05, "loss": 1.6513, "step": 297200 }, { "epoch": 0.20683645245578297, "grad_norm": 3.880753755569458, "learning_rate": 1.9633783171463225e-05, "loss": 1.6508, "step": 297400 }, { "epoch": 0.20697554892683595, "grad_norm": 4.853597164154053, "learning_rate": 1.9633294795637764e-05, "loss": 1.6835, "step": 297600 }, { "epoch": 0.20711464539788893, "grad_norm": 4.499924659729004, "learning_rate": 1.9632806103146417e-05, "loss": 1.6409, "step": 297800 }, { "epoch": 0.2072537418689419, "grad_norm": 3.5251362323760986, "learning_rate": 1.9632317094012524e-05, "loss": 1.6415, "step": 298000 }, { "epoch": 0.2073928383399949, "grad_norm": 4.933137893676758, "learning_rate": 1.9631827768259423e-05, "loss": 1.6775, "step": 298200 }, { "epoch": 0.20753193481104787, "grad_norm": 5.984827041625977, "learning_rate": 1.9631338125910472e-05, "loss": 1.639, "step": 298400 }, { "epoch": 0.20767103128210085, "grad_norm": 5.204819679260254, "learning_rate": 1.9630848166989045e-05, "loss": 1.6886, "step": 298600 }, { "epoch": 0.20781012775315383, "grad_norm": 4.68050479888916, "learning_rate": 1.9630357891518538e-05, "loss": 1.6155, "step": 298800 }, { "epoch": 0.2079492242242068, "grad_norm": 3.6042912006378174, "learning_rate": 1.9629867299522352e-05, "loss": 1.5673, "step": 299000 }, { "epoch": 0.2080883206952598, "grad_norm": 6.580783843994141, "learning_rate": 1.962937639102391e-05, "loss": 1.6707, "step": 299200 }, { "epoch": 0.2082274171663128, "grad_norm": 6.235212802886963, "learning_rate": 1.9628885166046644e-05, "loss": 1.6507, "step": 299400 }, { "epoch": 0.20836651363736577, "grad_norm": 4.769062519073486, "learning_rate": 1.9628393624614012e-05, "loss": 1.6523, "step": 299600 }, { "epoch": 0.20850561010841875, "grad_norm": 3.149367332458496, "learning_rate": 1.962790176674947e-05, "loss": 1.6742, "step": 299800 }, { "epoch": 0.20864470657947173, "grad_norm": 6.838754653930664, "learning_rate": 1.96274095924765e-05, "loss": 1.6519, "step": 300000 }, { "epoch": 0.2087838030505247, "grad_norm": 5.285316467285156, "learning_rate": 1.9626917101818605e-05, "loss": 1.6303, "step": 300200 }, { "epoch": 0.2089228995215777, "grad_norm": 6.633730411529541, "learning_rate": 1.962642429479929e-05, "loss": 1.6582, "step": 300400 }, { "epoch": 0.20906199599263067, "grad_norm": 2.6784451007843018, "learning_rate": 1.9625931171442084e-05, "loss": 1.6345, "step": 300600 }, { "epoch": 0.20920109246368365, "grad_norm": 8.77546501159668, "learning_rate": 1.9625437731770525e-05, "loss": 1.6969, "step": 300800 }, { "epoch": 0.20934018893473663, "grad_norm": 4.31553840637207, "learning_rate": 1.9624943975808173e-05, "loss": 1.7221, "step": 301000 }, { "epoch": 0.2094792854057896, "grad_norm": 4.152640342712402, "learning_rate": 1.9624449903578595e-05, "loss": 1.6443, "step": 301200 }, { "epoch": 0.2096183818768426, "grad_norm": 4.746995449066162, "learning_rate": 1.962395551510538e-05, "loss": 1.6309, "step": 301400 }, { "epoch": 0.20975747834789557, "grad_norm": 3.393209934234619, "learning_rate": 1.962346081041213e-05, "loss": 1.6638, "step": 301600 }, { "epoch": 0.20989657481894855, "grad_norm": 5.5710039138793945, "learning_rate": 1.9622965789522465e-05, "loss": 1.6157, "step": 301800 }, { "epoch": 0.21003567129000153, "grad_norm": 3.797006845474243, "learning_rate": 1.9622470452460008e-05, "loss": 1.7037, "step": 302000 }, { "epoch": 0.2101747677610545, "grad_norm": 6.540433406829834, "learning_rate": 1.9621974799248415e-05, "loss": 1.6766, "step": 302200 }, { "epoch": 0.2103138642321075, "grad_norm": 2.9818971157073975, "learning_rate": 1.962147882991134e-05, "loss": 1.6521, "step": 302400 }, { "epoch": 0.21045296070316047, "grad_norm": 5.232878684997559, "learning_rate": 1.962098254447246e-05, "loss": 1.6709, "step": 302600 }, { "epoch": 0.21059205717421345, "grad_norm": 4.583964824676514, "learning_rate": 1.962048594295548e-05, "loss": 1.6536, "step": 302800 }, { "epoch": 0.21073115364526646, "grad_norm": 4.246820449829102, "learning_rate": 1.9619989025384095e-05, "loss": 1.6869, "step": 303000 }, { "epoch": 0.21087025011631944, "grad_norm": 3.0684456825256348, "learning_rate": 1.9619491791782024e-05, "loss": 1.5866, "step": 303200 }, { "epoch": 0.21100934658737242, "grad_norm": 4.3238043785095215, "learning_rate": 1.961899424217301e-05, "loss": 1.7015, "step": 303400 }, { "epoch": 0.2111484430584254, "grad_norm": 7.376230716705322, "learning_rate": 1.961849637658081e-05, "loss": 1.6651, "step": 303600 }, { "epoch": 0.21128753952947837, "grad_norm": 4.386313438415527, "learning_rate": 1.9617998195029193e-05, "loss": 1.7015, "step": 303800 }, { "epoch": 0.21142663600053135, "grad_norm": 3.8081557750701904, "learning_rate": 1.961749969754193e-05, "loss": 1.642, "step": 304000 }, { "epoch": 0.21156573247158433, "grad_norm": 5.479763984680176, "learning_rate": 1.961700088414282e-05, "loss": 1.65, "step": 304200 }, { "epoch": 0.2117048289426373, "grad_norm": 3.5513412952423096, "learning_rate": 1.961650175485569e-05, "loss": 1.6974, "step": 304400 }, { "epoch": 0.2118439254136903, "grad_norm": 4.357348442077637, "learning_rate": 1.9616002309704355e-05, "loss": 1.6613, "step": 304600 }, { "epoch": 0.21198302188474327, "grad_norm": 4.734049320220947, "learning_rate": 1.961550254871266e-05, "loss": 1.6765, "step": 304800 }, { "epoch": 0.21212211835579625, "grad_norm": 5.617879867553711, "learning_rate": 1.961500247190446e-05, "loss": 1.6549, "step": 305000 }, { "epoch": 0.21226121482684923, "grad_norm": 5.3027825355529785, "learning_rate": 1.9614502079303638e-05, "loss": 1.7074, "step": 305200 }, { "epoch": 0.2124003112979022, "grad_norm": 5.7485880851745605, "learning_rate": 1.9614001370934076e-05, "loss": 1.6845, "step": 305400 }, { "epoch": 0.2125394077689552, "grad_norm": 5.49586296081543, "learning_rate": 1.961350034681968e-05, "loss": 1.6251, "step": 305600 }, { "epoch": 0.21267850424000817, "grad_norm": 4.661770343780518, "learning_rate": 1.9612999006984363e-05, "loss": 1.6661, "step": 305800 }, { "epoch": 0.21281760071106115, "grad_norm": 5.290302753448486, "learning_rate": 1.9612497351452063e-05, "loss": 1.6457, "step": 306000 }, { "epoch": 0.21295669718211413, "grad_norm": 10.018582344055176, "learning_rate": 1.9611995380246728e-05, "loss": 1.6699, "step": 306200 }, { "epoch": 0.2130957936531671, "grad_norm": 4.365277290344238, "learning_rate": 1.9611493093392316e-05, "loss": 1.6775, "step": 306400 }, { "epoch": 0.21323489012422012, "grad_norm": 4.452324390411377, "learning_rate": 1.9610990490912813e-05, "loss": 1.6628, "step": 306600 }, { "epoch": 0.2133739865952731, "grad_norm": 5.4576873779296875, "learning_rate": 1.9610487572832206e-05, "loss": 1.6385, "step": 306800 }, { "epoch": 0.21351308306632608, "grad_norm": 6.880828380584717, "learning_rate": 1.960998433917451e-05, "loss": 1.6811, "step": 307000 }, { "epoch": 0.21365217953737906, "grad_norm": 4.674741744995117, "learning_rate": 1.960948078996375e-05, "loss": 1.6333, "step": 307200 }, { "epoch": 0.21379127600843204, "grad_norm": 4.738868713378906, "learning_rate": 1.9608976925223957e-05, "loss": 1.7225, "step": 307400 }, { "epoch": 0.21393037247948501, "grad_norm": 4.7039923667907715, "learning_rate": 1.960847274497919e-05, "loss": 1.6416, "step": 307600 }, { "epoch": 0.214069468950538, "grad_norm": 4.587064266204834, "learning_rate": 1.960796824925352e-05, "loss": 1.6528, "step": 307800 }, { "epoch": 0.21420856542159097, "grad_norm": 4.1382598876953125, "learning_rate": 1.9607463438071024e-05, "loss": 1.6532, "step": 308000 }, { "epoch": 0.21434766189264395, "grad_norm": 5.715200901031494, "learning_rate": 1.960695831145581e-05, "loss": 1.6668, "step": 308200 }, { "epoch": 0.21448675836369693, "grad_norm": 5.354598045349121, "learning_rate": 1.9606452869431983e-05, "loss": 1.6533, "step": 308400 }, { "epoch": 0.2146258548347499, "grad_norm": 3.983879804611206, "learning_rate": 1.960594711202368e-05, "loss": 1.6296, "step": 308600 }, { "epoch": 0.2147649513058029, "grad_norm": 5.253481864929199, "learning_rate": 1.960544103925504e-05, "loss": 1.7002, "step": 308800 }, { "epoch": 0.21490404777685587, "grad_norm": 5.409156799316406, "learning_rate": 1.960493465115022e-05, "loss": 1.666, "step": 309000 }, { "epoch": 0.21504314424790885, "grad_norm": 4.206005096435547, "learning_rate": 1.9604427947733403e-05, "loss": 1.7152, "step": 309200 }, { "epoch": 0.21518224071896183, "grad_norm": 3.160194158554077, "learning_rate": 1.960392092902877e-05, "loss": 1.6177, "step": 309400 }, { "epoch": 0.2153213371900148, "grad_norm": 5.920907497406006, "learning_rate": 1.9603413595060535e-05, "loss": 1.6326, "step": 309600 }, { "epoch": 0.2154604336610678, "grad_norm": 11.446234703063965, "learning_rate": 1.9602905945852915e-05, "loss": 1.6339, "step": 309800 }, { "epoch": 0.21559953013212077, "grad_norm": 3.757387161254883, "learning_rate": 1.9602397981430136e-05, "loss": 1.6972, "step": 310000 }, { "epoch": 0.21573862660317378, "grad_norm": 3.476900339126587, "learning_rate": 1.9601889701816456e-05, "loss": 1.7043, "step": 310200 }, { "epoch": 0.21587772307422676, "grad_norm": 4.498541831970215, "learning_rate": 1.960138110703613e-05, "loss": 1.7175, "step": 310400 }, { "epoch": 0.21601681954527974, "grad_norm": 9.986011505126953, "learning_rate": 1.9600872197113455e-05, "loss": 1.7123, "step": 310600 }, { "epoch": 0.21615591601633272, "grad_norm": 4.537940502166748, "learning_rate": 1.9600362972072712e-05, "loss": 1.6774, "step": 310800 }, { "epoch": 0.2162950124873857, "grad_norm": 2.205578327178955, "learning_rate": 1.959985343193821e-05, "loss": 1.6965, "step": 311000 }, { "epoch": 0.21643410895843868, "grad_norm": 4.593422889709473, "learning_rate": 1.959934357673428e-05, "loss": 1.6569, "step": 311200 }, { "epoch": 0.21657320542949166, "grad_norm": 6.6114277839660645, "learning_rate": 1.959883340648526e-05, "loss": 1.6924, "step": 311400 }, { "epoch": 0.21671230190054463, "grad_norm": 9.484886169433594, "learning_rate": 1.9598322921215505e-05, "loss": 1.7288, "step": 311600 }, { "epoch": 0.21685139837159761, "grad_norm": 5.591075420379639, "learning_rate": 1.959781212094939e-05, "loss": 1.6314, "step": 311800 }, { "epoch": 0.2169904948426506, "grad_norm": 5.16893196105957, "learning_rate": 1.9597301005711295e-05, "loss": 1.6509, "step": 312000 }, { "epoch": 0.21712959131370357, "grad_norm": 3.7571005821228027, "learning_rate": 1.959678957552561e-05, "loss": 1.6595, "step": 312200 }, { "epoch": 0.21726868778475655, "grad_norm": 5.437869071960449, "learning_rate": 1.9596277830416767e-05, "loss": 1.6596, "step": 312400 }, { "epoch": 0.21740778425580953, "grad_norm": 2.9518752098083496, "learning_rate": 1.959576577040919e-05, "loss": 1.6525, "step": 312600 }, { "epoch": 0.2175468807268625, "grad_norm": 5.927860736846924, "learning_rate": 1.9595253395527324e-05, "loss": 1.6702, "step": 312800 }, { "epoch": 0.2176859771979155, "grad_norm": 4.122495174407959, "learning_rate": 1.9594740705795624e-05, "loss": 1.5983, "step": 313000 }, { "epoch": 0.21782507366896847, "grad_norm": 5.69853401184082, "learning_rate": 1.9594227701238577e-05, "loss": 1.6853, "step": 313200 }, { "epoch": 0.21796417014002145, "grad_norm": 4.321805477142334, "learning_rate": 1.959371438188066e-05, "loss": 1.6881, "step": 313400 }, { "epoch": 0.21810326661107443, "grad_norm": 5.1796555519104, "learning_rate": 1.9593200747746385e-05, "loss": 1.6462, "step": 313600 }, { "epoch": 0.21824236308212744, "grad_norm": 4.2316484451293945, "learning_rate": 1.9592686798860274e-05, "loss": 1.7556, "step": 313800 }, { "epoch": 0.21838145955318042, "grad_norm": 4.078604698181152, "learning_rate": 1.959217253524686e-05, "loss": 1.6371, "step": 314000 }, { "epoch": 0.2185205560242334, "grad_norm": 3.3813111782073975, "learning_rate": 1.9591657956930695e-05, "loss": 1.5876, "step": 314200 }, { "epoch": 0.21865965249528638, "grad_norm": 5.239701271057129, "learning_rate": 1.9591143063936338e-05, "loss": 1.6864, "step": 314400 }, { "epoch": 0.21879874896633936, "grad_norm": 4.93508243560791, "learning_rate": 1.9590627856288377e-05, "loss": 1.6344, "step": 314600 }, { "epoch": 0.21893784543739234, "grad_norm": 2.18503737449646, "learning_rate": 1.9590112334011404e-05, "loss": 1.7201, "step": 314800 }, { "epoch": 0.21907694190844532, "grad_norm": 3.247765064239502, "learning_rate": 1.958959649713003e-05, "loss": 1.656, "step": 315000 }, { "epoch": 0.2192160383794983, "grad_norm": 3.3061673641204834, "learning_rate": 1.958908034566888e-05, "loss": 1.6291, "step": 315200 }, { "epoch": 0.21935513485055128, "grad_norm": 3.5833218097686768, "learning_rate": 1.9588563879652596e-05, "loss": 1.7141, "step": 315400 }, { "epoch": 0.21949423132160426, "grad_norm": 6.012962818145752, "learning_rate": 1.958804709910583e-05, "loss": 1.6554, "step": 315600 }, { "epoch": 0.21963332779265723, "grad_norm": 3.7320868968963623, "learning_rate": 1.958753000405326e-05, "loss": 1.6457, "step": 315800 }, { "epoch": 0.21977242426371021, "grad_norm": 2.3896939754486084, "learning_rate": 1.958701259451956e-05, "loss": 1.6131, "step": 316000 }, { "epoch": 0.2199115207347632, "grad_norm": 4.528538703918457, "learning_rate": 1.958649487052944e-05, "loss": 1.6262, "step": 316200 }, { "epoch": 0.22005061720581617, "grad_norm": 5.505406379699707, "learning_rate": 1.9585976832107616e-05, "loss": 1.7339, "step": 316400 }, { "epoch": 0.22018971367686915, "grad_norm": 7.858295440673828, "learning_rate": 1.958545847927881e-05, "loss": 1.6797, "step": 316600 }, { "epoch": 0.22032881014792213, "grad_norm": 6.450380802154541, "learning_rate": 1.9584939812067776e-05, "loss": 1.6696, "step": 316800 }, { "epoch": 0.2204679066189751, "grad_norm": 4.586244583129883, "learning_rate": 1.9584420830499274e-05, "loss": 1.6595, "step": 317000 }, { "epoch": 0.2206070030900281, "grad_norm": 6.858448505401611, "learning_rate": 1.958390153459807e-05, "loss": 1.6334, "step": 317200 }, { "epoch": 0.2207460995610811, "grad_norm": 7.538793087005615, "learning_rate": 1.9583381924388964e-05, "loss": 1.6413, "step": 317400 }, { "epoch": 0.22088519603213408, "grad_norm": 3.9224114418029785, "learning_rate": 1.958286199989676e-05, "loss": 1.6679, "step": 317600 }, { "epoch": 0.22102429250318706, "grad_norm": 4.8906121253967285, "learning_rate": 1.958234176114628e-05, "loss": 1.6815, "step": 317800 }, { "epoch": 0.22116338897424004, "grad_norm": 4.209512233734131, "learning_rate": 1.9581821208162352e-05, "loss": 1.6344, "step": 318000 }, { "epoch": 0.22130248544529302, "grad_norm": 4.522933006286621, "learning_rate": 1.958130034096983e-05, "loss": 1.6656, "step": 318200 }, { "epoch": 0.221441581916346, "grad_norm": 5.4859232902526855, "learning_rate": 1.958077915959359e-05, "loss": 1.6618, "step": 318400 }, { "epoch": 0.22158067838739898, "grad_norm": 2.8956458568573, "learning_rate": 1.95802576640585e-05, "loss": 1.6673, "step": 318600 }, { "epoch": 0.22171977485845196, "grad_norm": 3.687004804611206, "learning_rate": 1.9579735854389463e-05, "loss": 1.6202, "step": 318800 }, { "epoch": 0.22185887132950494, "grad_norm": 5.116758346557617, "learning_rate": 1.957921373061138e-05, "loss": 1.6749, "step": 319000 }, { "epoch": 0.22199796780055792, "grad_norm": 4.444911956787109, "learning_rate": 1.9578691292749184e-05, "loss": 1.6106, "step": 319200 }, { "epoch": 0.2221370642716109, "grad_norm": 3.0427870750427246, "learning_rate": 1.9578168540827817e-05, "loss": 1.6919, "step": 319400 }, { "epoch": 0.22227616074266388, "grad_norm": 5.375112056732178, "learning_rate": 1.9577645474872234e-05, "loss": 1.6891, "step": 319600 }, { "epoch": 0.22241525721371685, "grad_norm": 6.9308061599731445, "learning_rate": 1.95771220949074e-05, "loss": 1.6316, "step": 319800 }, { "epoch": 0.22255435368476983, "grad_norm": 7.145652770996094, "learning_rate": 1.9576598400958305e-05, "loss": 1.6319, "step": 320000 }, { "epoch": 0.22269345015582281, "grad_norm": 6.354142189025879, "learning_rate": 1.9576074393049945e-05, "loss": 1.6554, "step": 320200 }, { "epoch": 0.2228325466268758, "grad_norm": 9.545760154724121, "learning_rate": 1.9575550071207345e-05, "loss": 1.6547, "step": 320400 }, { "epoch": 0.22297164309792877, "grad_norm": 3.492077112197876, "learning_rate": 1.9575025435455525e-05, "loss": 1.6255, "step": 320600 }, { "epoch": 0.22311073956898175, "grad_norm": 5.889474391937256, "learning_rate": 1.9574500485819542e-05, "loss": 1.6045, "step": 320800 }, { "epoch": 0.22324983604003473, "grad_norm": 5.40513801574707, "learning_rate": 1.9573975222324445e-05, "loss": 1.6844, "step": 321000 }, { "epoch": 0.22338893251108774, "grad_norm": 5.019711017608643, "learning_rate": 1.9573449644995313e-05, "loss": 1.627, "step": 321200 }, { "epoch": 0.22352802898214072, "grad_norm": 4.842776298522949, "learning_rate": 1.9572923753857243e-05, "loss": 1.6743, "step": 321400 }, { "epoch": 0.2236671254531937, "grad_norm": 3.935666561126709, "learning_rate": 1.9572397548935332e-05, "loss": 1.6871, "step": 321600 }, { "epoch": 0.22380622192424668, "grad_norm": 4.007401466369629, "learning_rate": 1.9571871030254698e-05, "loss": 1.694, "step": 321800 }, { "epoch": 0.22394531839529966, "grad_norm": 4.626333236694336, "learning_rate": 1.957134419784049e-05, "loss": 1.6435, "step": 322000 }, { "epoch": 0.22408441486635264, "grad_norm": 4.324277877807617, "learning_rate": 1.957081705171785e-05, "loss": 1.6935, "step": 322200 }, { "epoch": 0.22422351133740562, "grad_norm": 7.599738597869873, "learning_rate": 1.957028959191194e-05, "loss": 1.6648, "step": 322400 }, { "epoch": 0.2243626078084586, "grad_norm": 9.507134437561035, "learning_rate": 1.9569761818447947e-05, "loss": 1.6628, "step": 322600 }, { "epoch": 0.22450170427951158, "grad_norm": 5.793203830718994, "learning_rate": 1.9569233731351058e-05, "loss": 1.6294, "step": 322800 }, { "epoch": 0.22464080075056456, "grad_norm": 3.9718925952911377, "learning_rate": 1.9568705330646492e-05, "loss": 1.7046, "step": 323000 }, { "epoch": 0.22477989722161754, "grad_norm": 6.051609516143799, "learning_rate": 1.956817661635947e-05, "loss": 1.599, "step": 323200 }, { "epoch": 0.22491899369267052, "grad_norm": 3.4729206562042236, "learning_rate": 1.9567647588515236e-05, "loss": 1.681, "step": 323400 }, { "epoch": 0.2250580901637235, "grad_norm": 3.9161744117736816, "learning_rate": 1.9567118247139037e-05, "loss": 1.6401, "step": 323600 }, { "epoch": 0.22519718663477648, "grad_norm": 10.702912330627441, "learning_rate": 1.9566588592256154e-05, "loss": 1.6639, "step": 323800 }, { "epoch": 0.22533628310582945, "grad_norm": 4.2679009437561035, "learning_rate": 1.9566058623891862e-05, "loss": 1.6799, "step": 324000 }, { "epoch": 0.22547537957688243, "grad_norm": 4.640923976898193, "learning_rate": 1.9565528342071468e-05, "loss": 1.6343, "step": 324200 }, { "epoch": 0.22561447604793541, "grad_norm": 8.389718055725098, "learning_rate": 1.9564997746820284e-05, "loss": 1.6922, "step": 324400 }, { "epoch": 0.2257535725189884, "grad_norm": 6.336442470550537, "learning_rate": 1.9564466838163644e-05, "loss": 1.6746, "step": 324600 }, { "epoch": 0.2258926689900414, "grad_norm": 8.480534553527832, "learning_rate": 1.9563935616126883e-05, "loss": 1.6402, "step": 324800 }, { "epoch": 0.22603176546109438, "grad_norm": 3.2278125286102295, "learning_rate": 1.956340408073537e-05, "loss": 1.6153, "step": 325000 }, { "epoch": 0.22617086193214736, "grad_norm": 5.15024995803833, "learning_rate": 1.956287223201448e-05, "loss": 1.6464, "step": 325200 }, { "epoch": 0.22630995840320034, "grad_norm": 4.23423433303833, "learning_rate": 1.9562340069989597e-05, "loss": 1.6766, "step": 325400 }, { "epoch": 0.22644905487425332, "grad_norm": 5.7771897315979, "learning_rate": 1.9561807594686134e-05, "loss": 1.6951, "step": 325600 }, { "epoch": 0.2265881513453063, "grad_norm": 3.168464183807373, "learning_rate": 1.9561274806129497e-05, "loss": 1.6412, "step": 325800 }, { "epoch": 0.22672724781635928, "grad_norm": 5.302984714508057, "learning_rate": 1.9560741704345134e-05, "loss": 1.6193, "step": 326000 }, { "epoch": 0.22686634428741226, "grad_norm": 6.367260456085205, "learning_rate": 1.956020828935849e-05, "loss": 1.6941, "step": 326200 }, { "epoch": 0.22700544075846524, "grad_norm": 5.027446746826172, "learning_rate": 1.9559674561195036e-05, "loss": 1.7103, "step": 326400 }, { "epoch": 0.22714453722951822, "grad_norm": 5.662863731384277, "learning_rate": 1.9559140519880237e-05, "loss": 1.6231, "step": 326600 }, { "epoch": 0.2272836337005712, "grad_norm": 5.539330005645752, "learning_rate": 1.95586061654396e-05, "loss": 1.6932, "step": 326800 }, { "epoch": 0.22742273017162418, "grad_norm": 3.2600276470184326, "learning_rate": 1.955807149789863e-05, "loss": 1.6232, "step": 327000 }, { "epoch": 0.22756182664267716, "grad_norm": 7.053085803985596, "learning_rate": 1.955753651728285e-05, "loss": 1.5793, "step": 327200 }, { "epoch": 0.22770092311373014, "grad_norm": 2.6475093364715576, "learning_rate": 1.95570012236178e-05, "loss": 1.6645, "step": 327400 }, { "epoch": 0.22784001958478312, "grad_norm": 5.7703423500061035, "learning_rate": 1.955646561692904e-05, "loss": 1.6229, "step": 327600 }, { "epoch": 0.2279791160558361, "grad_norm": 0.4474673271179199, "learning_rate": 1.9555929697242133e-05, "loss": 1.6597, "step": 327800 }, { "epoch": 0.22811821252688907, "grad_norm": 5.461849212646484, "learning_rate": 1.955539346458266e-05, "loss": 1.6588, "step": 328000 }, { "epoch": 0.22825730899794205, "grad_norm": 4.46312141418457, "learning_rate": 1.9554856918976234e-05, "loss": 1.6809, "step": 328200 }, { "epoch": 0.22839640546899506, "grad_norm": 2.568528413772583, "learning_rate": 1.9554320060448454e-05, "loss": 1.625, "step": 328400 }, { "epoch": 0.22853550194004804, "grad_norm": 4.714695930480957, "learning_rate": 1.9553782889024954e-05, "loss": 1.6627, "step": 328600 }, { "epoch": 0.22867459841110102, "grad_norm": 4.660126686096191, "learning_rate": 1.9553245404731384e-05, "loss": 1.637, "step": 328800 }, { "epoch": 0.228813694882154, "grad_norm": 4.772575378417969, "learning_rate": 1.95527076075934e-05, "loss": 1.5978, "step": 329000 }, { "epoch": 0.22895279135320698, "grad_norm": 5.092039585113525, "learning_rate": 1.9552169497636665e-05, "loss": 1.627, "step": 329200 }, { "epoch": 0.22909188782425996, "grad_norm": 4.213221073150635, "learning_rate": 1.955163107488688e-05, "loss": 1.6581, "step": 329400 }, { "epoch": 0.22923098429531294, "grad_norm": 5.284055233001709, "learning_rate": 1.9551092339369745e-05, "loss": 1.6842, "step": 329600 }, { "epoch": 0.22937008076636592, "grad_norm": 2.6570284366607666, "learning_rate": 1.9550553291110984e-05, "loss": 1.6129, "step": 329800 }, { "epoch": 0.2295091772374189, "grad_norm": 5.607264995574951, "learning_rate": 1.955001393013632e-05, "loss": 1.682, "step": 330000 }, { "epoch": 0.22964827370847188, "grad_norm": 3.649813413619995, "learning_rate": 1.954947425647151e-05, "loss": 1.6495, "step": 330200 }, { "epoch": 0.22978737017952486, "grad_norm": 5.09197998046875, "learning_rate": 1.954893427014231e-05, "loss": 1.6972, "step": 330400 }, { "epoch": 0.22992646665057784, "grad_norm": 2.6789989471435547, "learning_rate": 1.9548393971174504e-05, "loss": 1.6026, "step": 330600 }, { "epoch": 0.23006556312163082, "grad_norm": 3.048635721206665, "learning_rate": 1.9547853359593883e-05, "loss": 1.6314, "step": 330800 }, { "epoch": 0.2302046595926838, "grad_norm": 3.098134756088257, "learning_rate": 1.9547312435426256e-05, "loss": 1.6932, "step": 331000 }, { "epoch": 0.23034375606373678, "grad_norm": 4.221190452575684, "learning_rate": 1.9546771198697447e-05, "loss": 1.6767, "step": 331200 }, { "epoch": 0.23048285253478976, "grad_norm": 3.3209691047668457, "learning_rate": 1.9546229649433295e-05, "loss": 1.7118, "step": 331400 }, { "epoch": 0.23062194900584274, "grad_norm": 3.279834508895874, "learning_rate": 1.9545687787659654e-05, "loss": 1.6302, "step": 331600 }, { "epoch": 0.23076104547689572, "grad_norm": 3.822474718093872, "learning_rate": 1.9545145613402382e-05, "loss": 1.7327, "step": 331800 }, { "epoch": 0.23090014194794872, "grad_norm": 7.678128242492676, "learning_rate": 1.9544603126687368e-05, "loss": 1.6799, "step": 332000 }, { "epoch": 0.2310392384190017, "grad_norm": 3.414104700088501, "learning_rate": 1.9544060327540518e-05, "loss": 1.6382, "step": 332200 }, { "epoch": 0.23117833489005468, "grad_norm": 4.272618770599365, "learning_rate": 1.954351721598773e-05, "loss": 1.6937, "step": 332400 }, { "epoch": 0.23131743136110766, "grad_norm": 7.377589702606201, "learning_rate": 1.9542973792054937e-05, "loss": 1.6368, "step": 332600 }, { "epoch": 0.23145652783216064, "grad_norm": 4.968153476715088, "learning_rate": 1.954243005576809e-05, "loss": 1.6656, "step": 332800 }, { "epoch": 0.23159562430321362, "grad_norm": 3.5833253860473633, "learning_rate": 1.9541886007153134e-05, "loss": 1.6268, "step": 333000 }, { "epoch": 0.2317347207742666, "grad_norm": 5.1467390060424805, "learning_rate": 1.954134164623605e-05, "loss": 1.6661, "step": 333200 }, { "epoch": 0.23187381724531958, "grad_norm": 5.069991588592529, "learning_rate": 1.954079697304282e-05, "loss": 1.6153, "step": 333400 }, { "epoch": 0.23201291371637256, "grad_norm": 5.196010589599609, "learning_rate": 1.9540251987599453e-05, "loss": 1.6555, "step": 333600 }, { "epoch": 0.23215201018742554, "grad_norm": 3.9568543434143066, "learning_rate": 1.9539706689931953e-05, "loss": 1.6272, "step": 333800 }, { "epoch": 0.23229110665847852, "grad_norm": 3.6363978385925293, "learning_rate": 1.9539161080066363e-05, "loss": 1.6389, "step": 334000 }, { "epoch": 0.2324302031295315, "grad_norm": 6.969249248504639, "learning_rate": 1.9538615158028725e-05, "loss": 1.6611, "step": 334200 }, { "epoch": 0.23256929960058448, "grad_norm": 5.5380072593688965, "learning_rate": 1.9538068923845104e-05, "loss": 1.6576, "step": 334400 }, { "epoch": 0.23270839607163746, "grad_norm": 4.26546049118042, "learning_rate": 1.9537522377541576e-05, "loss": 1.6308, "step": 334600 }, { "epoch": 0.23284749254269044, "grad_norm": 4.9990973472595215, "learning_rate": 1.9536975519144228e-05, "loss": 1.673, "step": 334800 }, { "epoch": 0.23298658901374342, "grad_norm": 5.484916687011719, "learning_rate": 1.953642834867917e-05, "loss": 1.6216, "step": 335000 }, { "epoch": 0.2331256854847964, "grad_norm": 3.4473931789398193, "learning_rate": 1.9535880866172526e-05, "loss": 1.6519, "step": 335200 }, { "epoch": 0.23326478195584938, "grad_norm": 4.119619369506836, "learning_rate": 1.9535333071650425e-05, "loss": 1.6383, "step": 335400 }, { "epoch": 0.23340387842690238, "grad_norm": 4.20940637588501, "learning_rate": 1.953478496513902e-05, "loss": 1.6113, "step": 335600 }, { "epoch": 0.23354297489795536, "grad_norm": 5.950852870941162, "learning_rate": 1.953423654666449e-05, "loss": 1.6016, "step": 335800 }, { "epoch": 0.23368207136900834, "grad_norm": 3.4197518825531006, "learning_rate": 1.9533687816252997e-05, "loss": 1.638, "step": 336000 }, { "epoch": 0.23382116784006132, "grad_norm": 6.697747230529785, "learning_rate": 1.9533138773930743e-05, "loss": 1.6561, "step": 336200 }, { "epoch": 0.2339602643111143, "grad_norm": 4.98708963394165, "learning_rate": 1.9532589419723944e-05, "loss": 1.7161, "step": 336400 }, { "epoch": 0.23409936078216728, "grad_norm": 5.75113582611084, "learning_rate": 1.9532039753658822e-05, "loss": 1.6752, "step": 336600 }, { "epoch": 0.23423845725322026, "grad_norm": 3.8082878589630127, "learning_rate": 1.9531489775761617e-05, "loss": 1.6679, "step": 336800 }, { "epoch": 0.23437755372427324, "grad_norm": 4.37647819519043, "learning_rate": 1.953093948605858e-05, "loss": 1.6643, "step": 337000 }, { "epoch": 0.23451665019532622, "grad_norm": 5.018675327301025, "learning_rate": 1.953038888457599e-05, "loss": 1.6606, "step": 337200 }, { "epoch": 0.2346557466663792, "grad_norm": 5.047998905181885, "learning_rate": 1.952983797134013e-05, "loss": 1.6508, "step": 337400 }, { "epoch": 0.23479484313743218, "grad_norm": 7.279408931732178, "learning_rate": 1.95292867463773e-05, "loss": 1.6547, "step": 337600 }, { "epoch": 0.23493393960848516, "grad_norm": 6.7975382804870605, "learning_rate": 1.9528735209713808e-05, "loss": 1.6461, "step": 337800 }, { "epoch": 0.23507303607953814, "grad_norm": 7.198062896728516, "learning_rate": 1.9528183361375986e-05, "loss": 1.6954, "step": 338000 }, { "epoch": 0.23521213255059112, "grad_norm": 4.493501663208008, "learning_rate": 1.9527631201390185e-05, "loss": 1.6956, "step": 338200 }, { "epoch": 0.2353512290216441, "grad_norm": 4.0898118019104, "learning_rate": 1.952707872978276e-05, "loss": 1.6233, "step": 338400 }, { "epoch": 0.23549032549269708, "grad_norm": 3.5022025108337402, "learning_rate": 1.952652594658009e-05, "loss": 1.6675, "step": 338600 }, { "epoch": 0.23562942196375006, "grad_norm": 3.9198243618011475, "learning_rate": 1.9525972851808555e-05, "loss": 1.6433, "step": 338800 }, { "epoch": 0.23576851843480304, "grad_norm": 4.736083507537842, "learning_rate": 1.9525419445494563e-05, "loss": 1.6486, "step": 339000 }, { "epoch": 0.23590761490585604, "grad_norm": 3.913604259490967, "learning_rate": 1.952486572766454e-05, "loss": 1.5873, "step": 339200 }, { "epoch": 0.23604671137690902, "grad_norm": 4.593210220336914, "learning_rate": 1.9524311698344908e-05, "loss": 1.696, "step": 339400 }, { "epoch": 0.236185807847962, "grad_norm": 12.825864791870117, "learning_rate": 1.9523757357562124e-05, "loss": 1.6756, "step": 339600 }, { "epoch": 0.23632490431901498, "grad_norm": 3.4124608039855957, "learning_rate": 1.9523202705342653e-05, "loss": 1.6614, "step": 339800 }, { "epoch": 0.23646400079006796, "grad_norm": 3.605181932449341, "learning_rate": 1.9522647741712966e-05, "loss": 1.6916, "step": 340000 }, { "epoch": 0.23660309726112094, "grad_norm": 5.278689384460449, "learning_rate": 1.952209246669956e-05, "loss": 1.6617, "step": 340200 }, { "epoch": 0.23674219373217392, "grad_norm": 5.578737258911133, "learning_rate": 1.9521536880328943e-05, "loss": 1.7077, "step": 340400 }, { "epoch": 0.2368812902032269, "grad_norm": 4.157208442687988, "learning_rate": 1.9520980982627642e-05, "loss": 1.6824, "step": 340600 }, { "epoch": 0.23702038667427988, "grad_norm": 3.1329407691955566, "learning_rate": 1.9520424773622193e-05, "loss": 1.6559, "step": 340800 }, { "epoch": 0.23715948314533286, "grad_norm": 4.475450038909912, "learning_rate": 1.951986825333914e-05, "loss": 1.7017, "step": 341000 }, { "epoch": 0.23729857961638584, "grad_norm": 4.912330627441406, "learning_rate": 1.9519311421805062e-05, "loss": 1.6263, "step": 341200 }, { "epoch": 0.23743767608743882, "grad_norm": 6.892397403717041, "learning_rate": 1.951875427904654e-05, "loss": 1.7071, "step": 341400 }, { "epoch": 0.2375767725584918, "grad_norm": 4.659296989440918, "learning_rate": 1.9518196825090167e-05, "loss": 1.6526, "step": 341600 }, { "epoch": 0.23771586902954478, "grad_norm": 7.2321977615356445, "learning_rate": 1.9517639059962558e-05, "loss": 1.619, "step": 341800 }, { "epoch": 0.23785496550059776, "grad_norm": 4.7723283767700195, "learning_rate": 1.951708098369033e-05, "loss": 1.6601, "step": 342000 }, { "epoch": 0.23799406197165074, "grad_norm": 4.46943473815918, "learning_rate": 1.951652259630014e-05, "loss": 1.6552, "step": 342200 }, { "epoch": 0.23813315844270372, "grad_norm": 3.9207563400268555, "learning_rate": 1.951596389781864e-05, "loss": 1.6588, "step": 342400 }, { "epoch": 0.2382722549137567, "grad_norm": 4.317783355712891, "learning_rate": 1.95154048882725e-05, "loss": 1.6362, "step": 342600 }, { "epoch": 0.2384113513848097, "grad_norm": 4.8455939292907715, "learning_rate": 1.9514845567688408e-05, "loss": 1.6518, "step": 342800 }, { "epoch": 0.23855044785586269, "grad_norm": 7.664321422576904, "learning_rate": 1.9514285936093064e-05, "loss": 1.6889, "step": 343000 }, { "epoch": 0.23868954432691566, "grad_norm": 3.7890496253967285, "learning_rate": 1.951372599351318e-05, "loss": 1.6764, "step": 343200 }, { "epoch": 0.23882864079796864, "grad_norm": 4.050221920013428, "learning_rate": 1.9513165739975493e-05, "loss": 1.6499, "step": 343400 }, { "epoch": 0.23896773726902162, "grad_norm": 7.2723388671875, "learning_rate": 1.951260517550675e-05, "loss": 1.6283, "step": 343600 }, { "epoch": 0.2391068337400746, "grad_norm": 4.328615665435791, "learning_rate": 1.951204430013371e-05, "loss": 1.6806, "step": 343800 }, { "epoch": 0.23924593021112758, "grad_norm": 6.319999694824219, "learning_rate": 1.9511483113883144e-05, "loss": 1.6528, "step": 344000 }, { "epoch": 0.23938502668218056, "grad_norm": 3.773545265197754, "learning_rate": 1.9510921616781844e-05, "loss": 1.643, "step": 344200 }, { "epoch": 0.23952412315323354, "grad_norm": 4.095102787017822, "learning_rate": 1.9510359808856623e-05, "loss": 1.6922, "step": 344400 }, { "epoch": 0.23966321962428652, "grad_norm": 5.804976463317871, "learning_rate": 1.950979769013429e-05, "loss": 1.6055, "step": 344600 }, { "epoch": 0.2398023160953395, "grad_norm": 9.323091506958008, "learning_rate": 1.9509235260641682e-05, "loss": 1.6792, "step": 344800 }, { "epoch": 0.23994141256639248, "grad_norm": 3.1342084407806396, "learning_rate": 1.950867252040566e-05, "loss": 1.6612, "step": 345000 }, { "epoch": 0.24008050903744546, "grad_norm": 5.803501605987549, "learning_rate": 1.9508109469453075e-05, "loss": 1.6417, "step": 345200 }, { "epoch": 0.24021960550849844, "grad_norm": 2.7702744007110596, "learning_rate": 1.9507546107810813e-05, "loss": 1.6397, "step": 345400 }, { "epoch": 0.24035870197955142, "grad_norm": 6.073428153991699, "learning_rate": 1.9506982435505766e-05, "loss": 1.6321, "step": 345600 }, { "epoch": 0.2404977984506044, "grad_norm": 4.152920246124268, "learning_rate": 1.9506418452564844e-05, "loss": 1.6407, "step": 345800 }, { "epoch": 0.24063689492165738, "grad_norm": 3.3686416149139404, "learning_rate": 1.9505854159014972e-05, "loss": 1.7123, "step": 346000 }, { "epoch": 0.24077599139271036, "grad_norm": 4.510013103485107, "learning_rate": 1.950528955488309e-05, "loss": 1.6391, "step": 346200 }, { "epoch": 0.24091508786376334, "grad_norm": 4.174516201019287, "learning_rate": 1.9504724640196143e-05, "loss": 1.669, "step": 346400 }, { "epoch": 0.24105418433481635, "grad_norm": 4.815258979797363, "learning_rate": 1.9504159414981112e-05, "loss": 1.6308, "step": 346600 }, { "epoch": 0.24119328080586933, "grad_norm": 3.8510098457336426, "learning_rate": 1.950359387926497e-05, "loss": 1.6688, "step": 346800 }, { "epoch": 0.2413323772769223, "grad_norm": 6.116521835327148, "learning_rate": 1.950302803307472e-05, "loss": 1.6655, "step": 347000 }, { "epoch": 0.24147147374797528, "grad_norm": 4.095193862915039, "learning_rate": 1.9502461876437376e-05, "loss": 1.6587, "step": 347200 }, { "epoch": 0.24161057021902826, "grad_norm": 3.3929877281188965, "learning_rate": 1.9501895409379958e-05, "loss": 1.5897, "step": 347400 }, { "epoch": 0.24174966669008124, "grad_norm": 4.79518461227417, "learning_rate": 1.9501328631929515e-05, "loss": 1.6804, "step": 347600 }, { "epoch": 0.24188876316113422, "grad_norm": 4.848894119262695, "learning_rate": 1.9500761544113106e-05, "loss": 1.6742, "step": 347800 }, { "epoch": 0.2420278596321872, "grad_norm": 4.406215667724609, "learning_rate": 1.9500194145957797e-05, "loss": 1.711, "step": 348000 }, { "epoch": 0.24216695610324018, "grad_norm": 7.045769214630127, "learning_rate": 1.949962643749068e-05, "loss": 1.591, "step": 348200 }, { "epoch": 0.24230605257429316, "grad_norm": 5.135491847991943, "learning_rate": 1.9499058418738855e-05, "loss": 1.6447, "step": 348400 }, { "epoch": 0.24244514904534614, "grad_norm": 4.513916492462158, "learning_rate": 1.9498490089729438e-05, "loss": 1.6319, "step": 348600 }, { "epoch": 0.24258424551639912, "grad_norm": 3.753251791000366, "learning_rate": 1.949792145048956e-05, "loss": 1.6632, "step": 348800 }, { "epoch": 0.2427233419874521, "grad_norm": 3.935469150543213, "learning_rate": 1.949735250104637e-05, "loss": 1.678, "step": 349000 }, { "epoch": 0.24286243845850508, "grad_norm": 5.35392951965332, "learning_rate": 1.9496783241427026e-05, "loss": 1.6673, "step": 349200 }, { "epoch": 0.24300153492955806, "grad_norm": 4.7084879875183105, "learning_rate": 1.9496213671658703e-05, "loss": 1.6702, "step": 349400 }, { "epoch": 0.24314063140061104, "grad_norm": 4.929116249084473, "learning_rate": 1.94956437917686e-05, "loss": 1.6643, "step": 349600 }, { "epoch": 0.24327972787166402, "grad_norm": 4.202829837799072, "learning_rate": 1.949507360178391e-05, "loss": 1.6291, "step": 349800 }, { "epoch": 0.243418824342717, "grad_norm": 6.402523517608643, "learning_rate": 1.949450310173186e-05, "loss": 1.6366, "step": 350000 }, { "epoch": 0.24355792081377, "grad_norm": 5.324528694152832, "learning_rate": 1.9493932291639685e-05, "loss": 1.6421, "step": 350200 }, { "epoch": 0.243697017284823, "grad_norm": 0.4841703772544861, "learning_rate": 1.9493361171534633e-05, "loss": 1.6182, "step": 350400 }, { "epoch": 0.24383611375587597, "grad_norm": 4.352268218994141, "learning_rate": 1.949278974144397e-05, "loss": 1.6553, "step": 350600 }, { "epoch": 0.24397521022692895, "grad_norm": 4.935366630554199, "learning_rate": 1.9492218001394977e-05, "loss": 1.6541, "step": 350800 }, { "epoch": 0.24411430669798193, "grad_norm": 3.1909875869750977, "learning_rate": 1.9491645951414943e-05, "loss": 1.7019, "step": 351000 }, { "epoch": 0.2442534031690349, "grad_norm": 4.752991199493408, "learning_rate": 1.9491073591531178e-05, "loss": 1.6571, "step": 351200 }, { "epoch": 0.24439249964008788, "grad_norm": 6.217344284057617, "learning_rate": 1.949050092177101e-05, "loss": 1.723, "step": 351400 }, { "epoch": 0.24453159611114086, "grad_norm": 6.201177597045898, "learning_rate": 1.948992794216178e-05, "loss": 1.6802, "step": 351600 }, { "epoch": 0.24467069258219384, "grad_norm": 2.825784683227539, "learning_rate": 1.948935465273083e-05, "loss": 1.5842, "step": 351800 }, { "epoch": 0.24480978905324682, "grad_norm": 4.810288429260254, "learning_rate": 1.948878105350554e-05, "loss": 1.7099, "step": 352000 }, { "epoch": 0.2449488855242998, "grad_norm": 4.089555263519287, "learning_rate": 1.9488207144513284e-05, "loss": 1.7244, "step": 352200 }, { "epoch": 0.24508798199535278, "grad_norm": 6.472701549530029, "learning_rate": 1.948763292578147e-05, "loss": 1.6518, "step": 352400 }, { "epoch": 0.24522707846640576, "grad_norm": 3.4730377197265625, "learning_rate": 1.9487058397337498e-05, "loss": 1.708, "step": 352600 }, { "epoch": 0.24536617493745874, "grad_norm": 3.145690441131592, "learning_rate": 1.9486483559208805e-05, "loss": 1.6469, "step": 352800 }, { "epoch": 0.24550527140851172, "grad_norm": 3.899048089981079, "learning_rate": 1.948590841142283e-05, "loss": 1.6595, "step": 353000 }, { "epoch": 0.2456443678795647, "grad_norm": 5.847311973571777, "learning_rate": 1.948533295400703e-05, "loss": 1.6287, "step": 353200 }, { "epoch": 0.24578346435061768, "grad_norm": 3.397681713104248, "learning_rate": 1.9484757186988875e-05, "loss": 1.6518, "step": 353400 }, { "epoch": 0.24592256082167066, "grad_norm": 4.522960662841797, "learning_rate": 1.9484181110395854e-05, "loss": 1.6843, "step": 353600 }, { "epoch": 0.24606165729272367, "grad_norm": 4.118208408355713, "learning_rate": 1.9483604724255466e-05, "loss": 1.6792, "step": 353800 }, { "epoch": 0.24620075376377665, "grad_norm": 3.8641409873962402, "learning_rate": 1.9483028028595225e-05, "loss": 1.6484, "step": 354000 }, { "epoch": 0.24633985023482963, "grad_norm": 4.879488468170166, "learning_rate": 1.948245102344267e-05, "loss": 1.6843, "step": 354200 }, { "epoch": 0.2464789467058826, "grad_norm": 4.3141255378723145, "learning_rate": 1.948187370882534e-05, "loss": 1.6977, "step": 354400 }, { "epoch": 0.2466180431769356, "grad_norm": 3.124896764755249, "learning_rate": 1.9481296084770798e-05, "loss": 1.672, "step": 354600 }, { "epoch": 0.24675713964798857, "grad_norm": 7.436282634735107, "learning_rate": 1.9480718151306614e-05, "loss": 1.6572, "step": 354800 }, { "epoch": 0.24689623611904155, "grad_norm": 5.373355388641357, "learning_rate": 1.9480139908460387e-05, "loss": 1.7045, "step": 355000 }, { "epoch": 0.24703533259009453, "grad_norm": 4.849045276641846, "learning_rate": 1.9479561356259715e-05, "loss": 1.6751, "step": 355200 }, { "epoch": 0.2471744290611475, "grad_norm": 3.8025639057159424, "learning_rate": 1.9478982494732217e-05, "loss": 1.6516, "step": 355400 }, { "epoch": 0.24731352553220048, "grad_norm": 4.946913242340088, "learning_rate": 1.947840332390553e-05, "loss": 1.7016, "step": 355600 } ], "logging_steps": 200, "max_steps": 2875702, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.734126673569939e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }