{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 493, "global_step": 1969, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005078720162519045, "grad_norm": 0.8817383050918579, "learning_rate": 2.0000000000000003e-06, "loss": 1.9482, "step": 1 }, { "epoch": 0.0005078720162519045, "eval_loss": 1.4919016361236572, "eval_runtime": 167.0655, "eval_samples_per_second": 5.89, "eval_steps_per_second": 1.472, "step": 1 }, { "epoch": 0.001015744032503809, "grad_norm": 1.0007613897323608, "learning_rate": 4.000000000000001e-06, "loss": 1.7274, "step": 2 }, { "epoch": 0.0015236160487557136, "grad_norm": 0.8447886109352112, "learning_rate": 6e-06, "loss": 1.8784, "step": 3 }, { "epoch": 0.002031488065007618, "grad_norm": 0.9496368765830994, "learning_rate": 8.000000000000001e-06, "loss": 1.709, "step": 4 }, { "epoch": 0.0025393600812595226, "grad_norm": 0.9440109729766846, "learning_rate": 1e-05, "loss": 1.4441, "step": 5 }, { "epoch": 0.0030472320975114273, "grad_norm": 0.9887049794197083, "learning_rate": 1.2e-05, "loss": 1.6845, "step": 6 }, { "epoch": 0.0035551041137633316, "grad_norm": 1.1271413564682007, "learning_rate": 1.4e-05, "loss": 1.6197, "step": 7 }, { "epoch": 0.004062976130015236, "grad_norm": 1.0455557107925415, "learning_rate": 1.6000000000000003e-05, "loss": 1.8842, "step": 8 }, { "epoch": 0.0045708481462671405, "grad_norm": 0.9560151100158691, "learning_rate": 1.8e-05, "loss": 1.8362, "step": 9 }, { "epoch": 0.005078720162519045, "grad_norm": 0.97687166929245, "learning_rate": 2e-05, "loss": 1.6373, "step": 10 }, { "epoch": 0.00558659217877095, "grad_norm": 0.9685382843017578, "learning_rate": 1.9999987141189844e-05, "loss": 1.6555, "step": 11 }, { "epoch": 0.006094464195022855, "grad_norm": 0.8788787722587585, "learning_rate": 1.9999948564792438e-05, "loss": 1.8128, "step": 12 }, { "epoch": 0.006602336211274758, "grad_norm": 0.9693357944488525, "learning_rate": 1.9999884270906992e-05, "loss": 1.6594, "step": 13 }, { "epoch": 0.007110208227526663, "grad_norm": 0.8221359252929688, "learning_rate": 1.9999794259698856e-05, "loss": 1.7551, "step": 14 }, { "epoch": 0.007618080243778568, "grad_norm": 0.9802096486091614, "learning_rate": 1.9999678531399517e-05, "loss": 1.7172, "step": 15 }, { "epoch": 0.008125952260030472, "grad_norm": 0.8529878854751587, "learning_rate": 1.99995370863066e-05, "loss": 1.6958, "step": 16 }, { "epoch": 0.008633824276282377, "grad_norm": 1.0876967906951904, "learning_rate": 1.9999369924783872e-05, "loss": 1.7709, "step": 17 }, { "epoch": 0.009141696292534281, "grad_norm": 1.0921239852905273, "learning_rate": 1.9999177047261227e-05, "loss": 1.8711, "step": 18 }, { "epoch": 0.009649568308786187, "grad_norm": 0.9870465993881226, "learning_rate": 1.9998958454234704e-05, "loss": 1.5542, "step": 19 }, { "epoch": 0.01015744032503809, "grad_norm": 0.8051080703735352, "learning_rate": 1.999871414626647e-05, "loss": 1.8549, "step": 20 }, { "epoch": 0.010665312341289994, "grad_norm": 0.9436357021331787, "learning_rate": 1.999844412398483e-05, "loss": 1.8625, "step": 21 }, { "epoch": 0.0111731843575419, "grad_norm": 0.870090901851654, "learning_rate": 1.9998148388084213e-05, "loss": 1.8584, "step": 22 }, { "epoch": 0.011681056373793804, "grad_norm": 0.89214026927948, "learning_rate": 1.9997826939325184e-05, "loss": 1.7535, "step": 23 }, { "epoch": 0.01218892839004571, "grad_norm": 1.119433879852295, "learning_rate": 1.9997479778534432e-05, "loss": 1.6597, "step": 24 }, { "epoch": 0.012696800406297613, "grad_norm": 0.7907304167747498, "learning_rate": 1.9997106906604773e-05, "loss": 1.8413, "step": 25 }, { "epoch": 0.013204672422549517, "grad_norm": 1.2375823259353638, "learning_rate": 1.9996708324495144e-05, "loss": 1.1807, "step": 26 }, { "epoch": 0.013712544438801422, "grad_norm": 1.1361809968948364, "learning_rate": 1.99962840332306e-05, "loss": 1.6539, "step": 27 }, { "epoch": 0.014220416455053326, "grad_norm": 0.967383861541748, "learning_rate": 1.9995834033902323e-05, "loss": 1.7859, "step": 28 }, { "epoch": 0.014728288471305232, "grad_norm": 0.797429621219635, "learning_rate": 1.99953583276676e-05, "loss": 1.5485, "step": 29 }, { "epoch": 0.015236160487557136, "grad_norm": 1.13390052318573, "learning_rate": 1.999485691574984e-05, "loss": 1.5869, "step": 30 }, { "epoch": 0.01574403250380904, "grad_norm": 0.9043781161308289, "learning_rate": 1.9994329799438547e-05, "loss": 1.8953, "step": 31 }, { "epoch": 0.016251904520060943, "grad_norm": 0.9030641317367554, "learning_rate": 1.9993776980089344e-05, "loss": 1.6764, "step": 32 }, { "epoch": 0.01675977653631285, "grad_norm": 0.8708584308624268, "learning_rate": 1.9993198459123948e-05, "loss": 1.763, "step": 33 }, { "epoch": 0.017267648552564754, "grad_norm": 0.8852054476737976, "learning_rate": 1.999259423803018e-05, "loss": 1.7905, "step": 34 }, { "epoch": 0.017775520568816656, "grad_norm": 1.2012213468551636, "learning_rate": 1.9991964318361954e-05, "loss": 1.6094, "step": 35 }, { "epoch": 0.018283392585068562, "grad_norm": 0.8909621834754944, "learning_rate": 1.999130870173927e-05, "loss": 1.5457, "step": 36 }, { "epoch": 0.018791264601320468, "grad_norm": 0.9592351317405701, "learning_rate": 1.9990627389848217e-05, "loss": 1.7552, "step": 37 }, { "epoch": 0.019299136617572373, "grad_norm": 0.8920623660087585, "learning_rate": 1.9989920384440968e-05, "loss": 1.5726, "step": 38 }, { "epoch": 0.019807008633824275, "grad_norm": 0.9171638488769531, "learning_rate": 1.9989187687335777e-05, "loss": 1.5285, "step": 39 }, { "epoch": 0.02031488065007618, "grad_norm": 1.3621869087219238, "learning_rate": 1.998842930041696e-05, "loss": 1.4605, "step": 40 }, { "epoch": 0.020822752666328086, "grad_norm": 1.1085385084152222, "learning_rate": 1.9987645225634915e-05, "loss": 1.6428, "step": 41 }, { "epoch": 0.02133062468257999, "grad_norm": 1.2515841722488403, "learning_rate": 1.9986835465006088e-05, "loss": 1.3788, "step": 42 }, { "epoch": 0.021838496698831894, "grad_norm": 1.1154283285140991, "learning_rate": 1.9986000020613e-05, "loss": 1.8016, "step": 43 }, { "epoch": 0.0223463687150838, "grad_norm": 1.0101381540298462, "learning_rate": 1.9985138894604205e-05, "loss": 1.712, "step": 44 }, { "epoch": 0.0228542407313357, "grad_norm": 1.146767020225525, "learning_rate": 1.9984252089194317e-05, "loss": 1.7227, "step": 45 }, { "epoch": 0.023362112747587607, "grad_norm": 0.4284367263317108, "learning_rate": 1.9983339606663993e-05, "loss": 0.2803, "step": 46 }, { "epoch": 0.023869984763839513, "grad_norm": 1.1581532955169678, "learning_rate": 1.998240144935992e-05, "loss": 1.7686, "step": 47 }, { "epoch": 0.02437785678009142, "grad_norm": 1.028751015663147, "learning_rate": 1.9981437619694813e-05, "loss": 1.4811, "step": 48 }, { "epoch": 0.02488572879634332, "grad_norm": 1.0959101915359497, "learning_rate": 1.998044812014741e-05, "loss": 1.516, "step": 49 }, { "epoch": 0.025393600812595226, "grad_norm": 1.1301296949386597, "learning_rate": 1.997943295326247e-05, "loss": 1.5932, "step": 50 }, { "epoch": 0.02590147282884713, "grad_norm": 1.0573782920837402, "learning_rate": 1.9978392121650767e-05, "loss": 1.6835, "step": 51 }, { "epoch": 0.026409344845099034, "grad_norm": 1.027848720550537, "learning_rate": 1.9977325627989066e-05, "loss": 1.6119, "step": 52 }, { "epoch": 0.02691721686135094, "grad_norm": 1.2942439317703247, "learning_rate": 1.9976233475020136e-05, "loss": 1.685, "step": 53 }, { "epoch": 0.027425088877602845, "grad_norm": 1.1388881206512451, "learning_rate": 1.9975115665552732e-05, "loss": 1.5779, "step": 54 }, { "epoch": 0.027932960893854747, "grad_norm": 1.0829545259475708, "learning_rate": 1.99739722024616e-05, "loss": 1.3557, "step": 55 }, { "epoch": 0.028440832910106652, "grad_norm": 1.2315815687179565, "learning_rate": 1.997280308868745e-05, "loss": 1.2091, "step": 56 }, { "epoch": 0.028948704926358558, "grad_norm": 1.185844898223877, "learning_rate": 1.9971608327236968e-05, "loss": 1.5351, "step": 57 }, { "epoch": 0.029456576942610464, "grad_norm": 0.9325307011604309, "learning_rate": 1.9970387921182793e-05, "loss": 1.6289, "step": 58 }, { "epoch": 0.029964448958862366, "grad_norm": 1.5263203382492065, "learning_rate": 1.996914187366352e-05, "loss": 1.4187, "step": 59 }, { "epoch": 0.03047232097511427, "grad_norm": 1.1244592666625977, "learning_rate": 1.9967870187883685e-05, "loss": 1.5894, "step": 60 }, { "epoch": 0.030980192991366177, "grad_norm": 1.3506087064743042, "learning_rate": 1.9966572867113764e-05, "loss": 1.5138, "step": 61 }, { "epoch": 0.03148806500761808, "grad_norm": 1.2022825479507446, "learning_rate": 1.996524991469016e-05, "loss": 1.6979, "step": 62 }, { "epoch": 0.031995937023869984, "grad_norm": 1.4655373096466064, "learning_rate": 1.9963901334015184e-05, "loss": 1.5341, "step": 63 }, { "epoch": 0.03250380904012189, "grad_norm": 1.375972867012024, "learning_rate": 1.996252712855707e-05, "loss": 1.2517, "step": 64 }, { "epoch": 0.033011681056373796, "grad_norm": 1.1926246881484985, "learning_rate": 1.9961127301849947e-05, "loss": 1.7264, "step": 65 }, { "epoch": 0.0335195530726257, "grad_norm": 1.2697643041610718, "learning_rate": 1.9959701857493835e-05, "loss": 1.6141, "step": 66 }, { "epoch": 0.0340274250888776, "grad_norm": 1.122188925743103, "learning_rate": 1.995825079915464e-05, "loss": 1.5998, "step": 67 }, { "epoch": 0.03453529710512951, "grad_norm": 1.2801930904388428, "learning_rate": 1.9956774130564132e-05, "loss": 1.6505, "step": 68 }, { "epoch": 0.03504316912138141, "grad_norm": 1.1839287281036377, "learning_rate": 1.995527185551996e-05, "loss": 1.6169, "step": 69 }, { "epoch": 0.03555104113763331, "grad_norm": 1.2095578908920288, "learning_rate": 1.995374397788561e-05, "loss": 1.754, "step": 70 }, { "epoch": 0.03605891315388522, "grad_norm": 1.2569252252578735, "learning_rate": 1.9952190501590425e-05, "loss": 1.5258, "step": 71 }, { "epoch": 0.036566785170137124, "grad_norm": 1.2527538537979126, "learning_rate": 1.9950611430629577e-05, "loss": 1.7069, "step": 72 }, { "epoch": 0.03707465718638903, "grad_norm": 1.2936222553253174, "learning_rate": 1.9949006769064057e-05, "loss": 1.447, "step": 73 }, { "epoch": 0.037582529202640935, "grad_norm": 1.5045548677444458, "learning_rate": 1.9947376521020675e-05, "loss": 1.5354, "step": 74 }, { "epoch": 0.03809040121889284, "grad_norm": 1.201399564743042, "learning_rate": 1.994572069069204e-05, "loss": 1.583, "step": 75 }, { "epoch": 0.038598273235144746, "grad_norm": 1.2044777870178223, "learning_rate": 1.9944039282336553e-05, "loss": 1.4365, "step": 76 }, { "epoch": 0.03910614525139665, "grad_norm": 1.3132214546203613, "learning_rate": 1.99423323002784e-05, "loss": 1.6793, "step": 77 }, { "epoch": 0.03961401726764855, "grad_norm": 1.4409575462341309, "learning_rate": 1.994059974890753e-05, "loss": 1.6365, "step": 78 }, { "epoch": 0.04012188928390046, "grad_norm": 1.273102879524231, "learning_rate": 1.9938841632679647e-05, "loss": 1.6006, "step": 79 }, { "epoch": 0.04062976130015236, "grad_norm": 1.4772100448608398, "learning_rate": 1.9937057956116217e-05, "loss": 1.5227, "step": 80 }, { "epoch": 0.041137633316404264, "grad_norm": 1.306842565536499, "learning_rate": 1.993524872380443e-05, "loss": 1.491, "step": 81 }, { "epoch": 0.04164550533265617, "grad_norm": 1.2774075269699097, "learning_rate": 1.993341394039719e-05, "loss": 1.7854, "step": 82 }, { "epoch": 0.042153377348908075, "grad_norm": 1.4964087009429932, "learning_rate": 1.993155361061314e-05, "loss": 1.6788, "step": 83 }, { "epoch": 0.04266124936515998, "grad_norm": 1.237638235092163, "learning_rate": 1.9929667739236597e-05, "loss": 1.8024, "step": 84 }, { "epoch": 0.043169121381411886, "grad_norm": 1.7545373439788818, "learning_rate": 1.9927756331117575e-05, "loss": 1.1102, "step": 85 }, { "epoch": 0.04367699339766379, "grad_norm": 1.4063007831573486, "learning_rate": 1.992581939117176e-05, "loss": 1.2884, "step": 86 }, { "epoch": 0.04418486541391569, "grad_norm": 1.5220825672149658, "learning_rate": 1.9923856924380496e-05, "loss": 1.4794, "step": 87 }, { "epoch": 0.0446927374301676, "grad_norm": 1.3065992593765259, "learning_rate": 1.9921868935790788e-05, "loss": 1.6815, "step": 88 }, { "epoch": 0.0452006094464195, "grad_norm": 1.5807737112045288, "learning_rate": 1.9919855430515265e-05, "loss": 1.3399, "step": 89 }, { "epoch": 0.0457084814626714, "grad_norm": 1.4048736095428467, "learning_rate": 1.9917816413732184e-05, "loss": 1.7414, "step": 90 }, { "epoch": 0.04621635347892331, "grad_norm": 1.6283764839172363, "learning_rate": 1.9915751890685416e-05, "loss": 1.5164, "step": 91 }, { "epoch": 0.046724225495175215, "grad_norm": 2.085670232772827, "learning_rate": 1.9913661866684415e-05, "loss": 1.4067, "step": 92 }, { "epoch": 0.047232097511427124, "grad_norm": 1.2962095737457275, "learning_rate": 1.9911546347104232e-05, "loss": 1.3445, "step": 93 }, { "epoch": 0.047739969527679026, "grad_norm": 1.3311445713043213, "learning_rate": 1.9909405337385474e-05, "loss": 1.6662, "step": 94 }, { "epoch": 0.04824784154393093, "grad_norm": 1.7060577869415283, "learning_rate": 1.9907238843034317e-05, "loss": 1.5718, "step": 95 }, { "epoch": 0.04875571356018284, "grad_norm": 1.2818933725357056, "learning_rate": 1.990504686962246e-05, "loss": 1.6056, "step": 96 }, { "epoch": 0.04926358557643474, "grad_norm": 1.5512213706970215, "learning_rate": 1.990282942278714e-05, "loss": 1.6164, "step": 97 }, { "epoch": 0.04977145759268664, "grad_norm": 1.6890562772750854, "learning_rate": 1.9900586508231107e-05, "loss": 1.4841, "step": 98 }, { "epoch": 0.05027932960893855, "grad_norm": 1.4586176872253418, "learning_rate": 1.9898318131722597e-05, "loss": 1.5043, "step": 99 }, { "epoch": 0.05078720162519045, "grad_norm": 1.9947463274002075, "learning_rate": 1.9896024299095336e-05, "loss": 1.11, "step": 100 }, { "epoch": 0.051295073641442354, "grad_norm": 1.809464454650879, "learning_rate": 1.989370501624852e-05, "loss": 1.4424, "step": 101 }, { "epoch": 0.05180294565769426, "grad_norm": 1.4731370210647583, "learning_rate": 1.9891360289146787e-05, "loss": 1.4353, "step": 102 }, { "epoch": 0.052310817673946165, "grad_norm": 1.6137124300003052, "learning_rate": 1.9888990123820223e-05, "loss": 1.4225, "step": 103 }, { "epoch": 0.05281868969019807, "grad_norm": 1.5897164344787598, "learning_rate": 1.9886594526364323e-05, "loss": 1.5393, "step": 104 }, { "epoch": 0.053326561706449976, "grad_norm": 1.5969938039779663, "learning_rate": 1.988417350294e-05, "loss": 1.6938, "step": 105 }, { "epoch": 0.05383443372270188, "grad_norm": 1.7524147033691406, "learning_rate": 1.988172705977354e-05, "loss": 1.4503, "step": 106 }, { "epoch": 0.05434230573895378, "grad_norm": 1.5974395275115967, "learning_rate": 1.9879255203156626e-05, "loss": 1.6654, "step": 107 }, { "epoch": 0.05485017775520569, "grad_norm": 1.6734551191329956, "learning_rate": 1.9876757939446273e-05, "loss": 1.7293, "step": 108 }, { "epoch": 0.05535804977145759, "grad_norm": 1.6386557817459106, "learning_rate": 1.9874235275064855e-05, "loss": 1.6286, "step": 109 }, { "epoch": 0.055865921787709494, "grad_norm": 1.3887147903442383, "learning_rate": 1.9871687216500065e-05, "loss": 1.5914, "step": 110 }, { "epoch": 0.0563737938039614, "grad_norm": 1.4445892572402954, "learning_rate": 1.9869113770304903e-05, "loss": 1.8198, "step": 111 }, { "epoch": 0.056881665820213305, "grad_norm": 1.891767978668213, "learning_rate": 1.986651494309766e-05, "loss": 1.5657, "step": 112 }, { "epoch": 0.057389537836465214, "grad_norm": 1.4021755456924438, "learning_rate": 1.9863890741561896e-05, "loss": 1.5877, "step": 113 }, { "epoch": 0.057897409852717116, "grad_norm": 1.3860752582550049, "learning_rate": 1.9861241172446443e-05, "loss": 1.5665, "step": 114 }, { "epoch": 0.05840528186896902, "grad_norm": 1.810243844985962, "learning_rate": 1.985856624256535e-05, "loss": 1.6717, "step": 115 }, { "epoch": 0.05891315388522093, "grad_norm": 1.4407758712768555, "learning_rate": 1.985586595879791e-05, "loss": 1.6235, "step": 116 }, { "epoch": 0.05942102590147283, "grad_norm": 1.3299332857131958, "learning_rate": 1.9853140328088608e-05, "loss": 1.4443, "step": 117 }, { "epoch": 0.05992889791772473, "grad_norm": 1.984378457069397, "learning_rate": 1.9850389357447116e-05, "loss": 1.3835, "step": 118 }, { "epoch": 0.06043676993397664, "grad_norm": 1.7486463785171509, "learning_rate": 1.9847613053948272e-05, "loss": 1.6556, "step": 119 }, { "epoch": 0.06094464195022854, "grad_norm": 1.381593108177185, "learning_rate": 1.9844811424732078e-05, "loss": 1.6322, "step": 120 }, { "epoch": 0.061452513966480445, "grad_norm": 1.4813175201416016, "learning_rate": 1.9841984477003647e-05, "loss": 1.3805, "step": 121 }, { "epoch": 0.061960385982732354, "grad_norm": 1.7013801336288452, "learning_rate": 1.9839132218033225e-05, "loss": 1.4291, "step": 122 }, { "epoch": 0.062468257998984256, "grad_norm": 1.7752875089645386, "learning_rate": 1.9836254655156135e-05, "loss": 1.4994, "step": 123 }, { "epoch": 0.06297613001523616, "grad_norm": 1.5406975746154785, "learning_rate": 1.9833351795772788e-05, "loss": 1.6347, "step": 124 }, { "epoch": 0.06348400203148806, "grad_norm": 2.0284907817840576, "learning_rate": 1.9830423647348652e-05, "loss": 1.4502, "step": 125 }, { "epoch": 0.06399187404773997, "grad_norm": 1.535901665687561, "learning_rate": 1.982747021741422e-05, "loss": 1.5141, "step": 126 }, { "epoch": 0.06449974606399188, "grad_norm": 1.566285252571106, "learning_rate": 1.9824491513565016e-05, "loss": 1.2966, "step": 127 }, { "epoch": 0.06500761808024377, "grad_norm": 1.9784098863601685, "learning_rate": 1.9821487543461554e-05, "loss": 1.5775, "step": 128 }, { "epoch": 0.06551549009649568, "grad_norm": 1.4289242029190063, "learning_rate": 1.981845831482933e-05, "loss": 1.4593, "step": 129 }, { "epoch": 0.06602336211274759, "grad_norm": 1.6669222116470337, "learning_rate": 1.9815403835458803e-05, "loss": 1.691, "step": 130 }, { "epoch": 0.06653123412899949, "grad_norm": 1.4953184127807617, "learning_rate": 1.9812324113205366e-05, "loss": 1.5924, "step": 131 }, { "epoch": 0.0670391061452514, "grad_norm": 2.38492488861084, "learning_rate": 1.9809219155989335e-05, "loss": 1.5344, "step": 132 }, { "epoch": 0.0675469781615033, "grad_norm": 2.0180890560150146, "learning_rate": 1.9806088971795914e-05, "loss": 1.5879, "step": 133 }, { "epoch": 0.0680548501777552, "grad_norm": 1.7884663343429565, "learning_rate": 1.9802933568675193e-05, "loss": 1.4413, "step": 134 }, { "epoch": 0.06856272219400711, "grad_norm": 1.3341087102890015, "learning_rate": 1.9799752954742123e-05, "loss": 1.329, "step": 135 }, { "epoch": 0.06907059421025902, "grad_norm": 2.230578660964966, "learning_rate": 1.979654713817648e-05, "loss": 1.4909, "step": 136 }, { "epoch": 0.06957846622651091, "grad_norm": 1.7144185304641724, "learning_rate": 1.9793316127222864e-05, "loss": 1.2556, "step": 137 }, { "epoch": 0.07008633824276282, "grad_norm": 1.8278745412826538, "learning_rate": 1.979005993019067e-05, "loss": 1.4866, "step": 138 }, { "epoch": 0.07059421025901473, "grad_norm": 1.563427448272705, "learning_rate": 1.978677855545406e-05, "loss": 1.5526, "step": 139 }, { "epoch": 0.07110208227526663, "grad_norm": 1.5219600200653076, "learning_rate": 1.9783472011451944e-05, "loss": 1.6047, "step": 140 }, { "epoch": 0.07160995429151853, "grad_norm": 1.6114145517349243, "learning_rate": 1.978014030668797e-05, "loss": 1.6691, "step": 141 }, { "epoch": 0.07211782630777044, "grad_norm": 2.0263187885284424, "learning_rate": 1.9776783449730494e-05, "loss": 1.5557, "step": 142 }, { "epoch": 0.07262569832402235, "grad_norm": 1.488948106765747, "learning_rate": 1.9773401449212545e-05, "loss": 1.6008, "step": 143 }, { "epoch": 0.07313357034027425, "grad_norm": 1.7363940477371216, "learning_rate": 1.976999431383183e-05, "loss": 1.5921, "step": 144 }, { "epoch": 0.07364144235652616, "grad_norm": 1.658381700515747, "learning_rate": 1.9766562052350682e-05, "loss": 1.7221, "step": 145 }, { "epoch": 0.07414931437277807, "grad_norm": 1.9282320737838745, "learning_rate": 1.9763104673596075e-05, "loss": 1.4233, "step": 146 }, { "epoch": 0.07465718638902996, "grad_norm": 2.092620372772217, "learning_rate": 1.9759622186459548e-05, "loss": 1.3735, "step": 147 }, { "epoch": 0.07516505840528187, "grad_norm": 1.7551285028457642, "learning_rate": 1.975611459989724e-05, "loss": 1.4086, "step": 148 }, { "epoch": 0.07567293042153378, "grad_norm": 1.4977078437805176, "learning_rate": 1.9752581922929833e-05, "loss": 1.5749, "step": 149 }, { "epoch": 0.07618080243778567, "grad_norm": 1.57418692111969, "learning_rate": 1.974902416464252e-05, "loss": 1.4769, "step": 150 }, { "epoch": 0.07668867445403758, "grad_norm": 1.671919584274292, "learning_rate": 1.9745441334185012e-05, "loss": 1.5281, "step": 151 }, { "epoch": 0.07719654647028949, "grad_norm": 1.7778983116149902, "learning_rate": 1.9741833440771495e-05, "loss": 1.6097, "step": 152 }, { "epoch": 0.07770441848654139, "grad_norm": 1.651626706123352, "learning_rate": 1.973820049368062e-05, "loss": 1.4676, "step": 153 }, { "epoch": 0.0782122905027933, "grad_norm": 1.7673014402389526, "learning_rate": 1.9734542502255457e-05, "loss": 1.4988, "step": 154 }, { "epoch": 0.0787201625190452, "grad_norm": 2.4674925804138184, "learning_rate": 1.973085947590349e-05, "loss": 1.4203, "step": 155 }, { "epoch": 0.0792280345352971, "grad_norm": 1.5197545289993286, "learning_rate": 1.9727151424096585e-05, "loss": 1.6221, "step": 156 }, { "epoch": 0.07973590655154901, "grad_norm": 2.3471405506134033, "learning_rate": 1.972341835637097e-05, "loss": 1.3436, "step": 157 }, { "epoch": 0.08024377856780092, "grad_norm": 2.414863348007202, "learning_rate": 1.9719660282327205e-05, "loss": 1.5306, "step": 158 }, { "epoch": 0.08075165058405281, "grad_norm": 2.0253164768218994, "learning_rate": 1.9715877211630168e-05, "loss": 1.1846, "step": 159 }, { "epoch": 0.08125952260030472, "grad_norm": 2.0572662353515625, "learning_rate": 1.971206915400901e-05, "loss": 1.7391, "step": 160 }, { "epoch": 0.08176739461655663, "grad_norm": 1.8957135677337646, "learning_rate": 1.9708236119257154e-05, "loss": 1.3965, "step": 161 }, { "epoch": 0.08227526663280853, "grad_norm": 1.7855340242385864, "learning_rate": 1.9704378117232247e-05, "loss": 1.3442, "step": 162 }, { "epoch": 0.08278313864906044, "grad_norm": 2.447350263595581, "learning_rate": 1.9700495157856162e-05, "loss": 1.3476, "step": 163 }, { "epoch": 0.08329101066531235, "grad_norm": 2.002796173095703, "learning_rate": 1.9696587251114936e-05, "loss": 1.6351, "step": 164 }, { "epoch": 0.08379888268156424, "grad_norm": 1.765633225440979, "learning_rate": 1.969265440705878e-05, "loss": 1.3999, "step": 165 }, { "epoch": 0.08430675469781615, "grad_norm": 1.6512019634246826, "learning_rate": 1.9688696635802035e-05, "loss": 1.5612, "step": 166 }, { "epoch": 0.08481462671406806, "grad_norm": 2.272329330444336, "learning_rate": 1.968471394752314e-05, "loss": 1.286, "step": 167 }, { "epoch": 0.08532249873031995, "grad_norm": 2.384737730026245, "learning_rate": 1.9680706352464632e-05, "loss": 1.299, "step": 168 }, { "epoch": 0.08583037074657186, "grad_norm": 0.558256208896637, "learning_rate": 1.9676673860933083e-05, "loss": 0.2597, "step": 169 }, { "epoch": 0.08633824276282377, "grad_norm": 2.099900007247925, "learning_rate": 1.9672616483299106e-05, "loss": 1.3598, "step": 170 }, { "epoch": 0.08684611477907567, "grad_norm": 2.465440273284912, "learning_rate": 1.9668534229997305e-05, "loss": 1.201, "step": 171 }, { "epoch": 0.08735398679532758, "grad_norm": 1.5782811641693115, "learning_rate": 1.966442711152627e-05, "loss": 1.3735, "step": 172 }, { "epoch": 0.08786185881157949, "grad_norm": 1.8597896099090576, "learning_rate": 1.9660295138448533e-05, "loss": 1.4313, "step": 173 }, { "epoch": 0.08836973082783138, "grad_norm": 1.765465259552002, "learning_rate": 1.965613832139054e-05, "loss": 1.4627, "step": 174 }, { "epoch": 0.08887760284408329, "grad_norm": 1.6649190187454224, "learning_rate": 1.9651956671042642e-05, "loss": 1.633, "step": 175 }, { "epoch": 0.0893854748603352, "grad_norm": 1.8893892765045166, "learning_rate": 1.9647750198159043e-05, "loss": 1.597, "step": 176 }, { "epoch": 0.0898933468765871, "grad_norm": 1.6628423929214478, "learning_rate": 1.9643518913557792e-05, "loss": 1.6589, "step": 177 }, { "epoch": 0.090401218892839, "grad_norm": 1.9656569957733154, "learning_rate": 1.9639262828120748e-05, "loss": 1.436, "step": 178 }, { "epoch": 0.09090909090909091, "grad_norm": 1.9969520568847656, "learning_rate": 1.9634981952793544e-05, "loss": 1.4815, "step": 179 }, { "epoch": 0.0914169629253428, "grad_norm": 2.0888116359710693, "learning_rate": 1.963067629858558e-05, "loss": 1.4511, "step": 180 }, { "epoch": 0.09192483494159472, "grad_norm": 1.9178920984268188, "learning_rate": 1.9626345876569974e-05, "loss": 1.367, "step": 181 }, { "epoch": 0.09243270695784662, "grad_norm": 2.327199935913086, "learning_rate": 1.9621990697883536e-05, "loss": 1.3266, "step": 182 }, { "epoch": 0.09294057897409853, "grad_norm": 1.8885021209716797, "learning_rate": 1.9617610773726746e-05, "loss": 1.3464, "step": 183 }, { "epoch": 0.09344845099035043, "grad_norm": 2.1692769527435303, "learning_rate": 1.9613206115363736e-05, "loss": 1.5941, "step": 184 }, { "epoch": 0.09395632300660234, "grad_norm": 1.5790897607803345, "learning_rate": 1.9608776734122234e-05, "loss": 1.5025, "step": 185 }, { "epoch": 0.09446419502285425, "grad_norm": 2.0684800148010254, "learning_rate": 1.9604322641393556e-05, "loss": 1.5111, "step": 186 }, { "epoch": 0.09497206703910614, "grad_norm": 2.5110013484954834, "learning_rate": 1.9599843848632568e-05, "loss": 1.4123, "step": 187 }, { "epoch": 0.09547993905535805, "grad_norm": 1.8878841400146484, "learning_rate": 1.959534036735766e-05, "loss": 1.4055, "step": 188 }, { "epoch": 0.09598781107160996, "grad_norm": 1.6657261848449707, "learning_rate": 1.959081220915071e-05, "loss": 1.7038, "step": 189 }, { "epoch": 0.09649568308786186, "grad_norm": 2.0334601402282715, "learning_rate": 1.9586259385657066e-05, "loss": 1.5934, "step": 190 }, { "epoch": 0.09700355510411376, "grad_norm": 1.8854690790176392, "learning_rate": 1.958168190858551e-05, "loss": 1.7543, "step": 191 }, { "epoch": 0.09751142712036567, "grad_norm": 1.7778549194335938, "learning_rate": 1.957707978970822e-05, "loss": 1.4673, "step": 192 }, { "epoch": 0.09801929913661757, "grad_norm": 1.6139943599700928, "learning_rate": 1.957245304086075e-05, "loss": 1.5247, "step": 193 }, { "epoch": 0.09852717115286948, "grad_norm": 2.0140955448150635, "learning_rate": 1.9567801673942e-05, "loss": 1.6386, "step": 194 }, { "epoch": 0.09903504316912139, "grad_norm": 1.8387404680252075, "learning_rate": 1.9563125700914178e-05, "loss": 1.5482, "step": 195 }, { "epoch": 0.09954291518537328, "grad_norm": 2.418991804122925, "learning_rate": 1.9558425133802773e-05, "loss": 1.272, "step": 196 }, { "epoch": 0.10005078720162519, "grad_norm": 1.6561681032180786, "learning_rate": 1.9553699984696526e-05, "loss": 1.588, "step": 197 }, { "epoch": 0.1005586592178771, "grad_norm": 1.572954773902893, "learning_rate": 1.954895026574739e-05, "loss": 1.8015, "step": 198 }, { "epoch": 0.101066531234129, "grad_norm": 1.933001160621643, "learning_rate": 1.9544175989170524e-05, "loss": 1.4857, "step": 199 }, { "epoch": 0.1015744032503809, "grad_norm": 1.5697592496871948, "learning_rate": 1.953937716724422e-05, "loss": 1.5252, "step": 200 }, { "epoch": 0.10208227526663281, "grad_norm": 2.5573041439056396, "learning_rate": 1.9534553812309915e-05, "loss": 1.4089, "step": 201 }, { "epoch": 0.10259014728288471, "grad_norm": 2.043156623840332, "learning_rate": 1.952970593677212e-05, "loss": 1.5563, "step": 202 }, { "epoch": 0.10309801929913662, "grad_norm": 2.2919042110443115, "learning_rate": 1.9524833553098427e-05, "loss": 1.3823, "step": 203 }, { "epoch": 0.10360589131538853, "grad_norm": 1.8231582641601562, "learning_rate": 1.951993667381944e-05, "loss": 1.6335, "step": 204 }, { "epoch": 0.10411376333164042, "grad_norm": 2.7078065872192383, "learning_rate": 1.951501531152877e-05, "loss": 1.3149, "step": 205 }, { "epoch": 0.10462163534789233, "grad_norm": 1.694219946861267, "learning_rate": 1.9510069478882992e-05, "loss": 1.5816, "step": 206 }, { "epoch": 0.10512950736414424, "grad_norm": 1.9638980627059937, "learning_rate": 1.9505099188601608e-05, "loss": 1.4232, "step": 207 }, { "epoch": 0.10563737938039613, "grad_norm": 1.720871925354004, "learning_rate": 1.950010445346702e-05, "loss": 1.5198, "step": 208 }, { "epoch": 0.10614525139664804, "grad_norm": 1.6361560821533203, "learning_rate": 1.9495085286324507e-05, "loss": 1.5962, "step": 209 }, { "epoch": 0.10665312341289995, "grad_norm": 2.454713821411133, "learning_rate": 1.949004170008216e-05, "loss": 1.1473, "step": 210 }, { "epoch": 0.10716099542915185, "grad_norm": 1.7801949977874756, "learning_rate": 1.948497370771089e-05, "loss": 1.2549, "step": 211 }, { "epoch": 0.10766886744540376, "grad_norm": 2.275209665298462, "learning_rate": 1.947988132224437e-05, "loss": 1.4693, "step": 212 }, { "epoch": 0.10817673946165567, "grad_norm": 2.096883535385132, "learning_rate": 1.9474764556778995e-05, "loss": 1.6335, "step": 213 }, { "epoch": 0.10868461147790756, "grad_norm": 1.7155479192733765, "learning_rate": 1.9469623424473872e-05, "loss": 1.3781, "step": 214 }, { "epoch": 0.10919248349415947, "grad_norm": 2.3470253944396973, "learning_rate": 1.9464457938550777e-05, "loss": 1.3552, "step": 215 }, { "epoch": 0.10970035551041138, "grad_norm": 1.6862317323684692, "learning_rate": 1.94592681122941e-05, "loss": 1.5208, "step": 216 }, { "epoch": 0.11020822752666327, "grad_norm": 1.914600133895874, "learning_rate": 1.945405395905084e-05, "loss": 1.5037, "step": 217 }, { "epoch": 0.11071609954291518, "grad_norm": 1.9791793823242188, "learning_rate": 1.9448815492230565e-05, "loss": 1.4713, "step": 218 }, { "epoch": 0.11122397155916709, "grad_norm": 2.2346582412719727, "learning_rate": 1.9443552725305365e-05, "loss": 1.4726, "step": 219 }, { "epoch": 0.11173184357541899, "grad_norm": 2.368903398513794, "learning_rate": 1.9438265671809814e-05, "loss": 1.0064, "step": 220 }, { "epoch": 0.1122397155916709, "grad_norm": 2.2253165245056152, "learning_rate": 1.943295434534097e-05, "loss": 1.5413, "step": 221 }, { "epoch": 0.1127475876079228, "grad_norm": 2.02980637550354, "learning_rate": 1.942761875955829e-05, "loss": 1.3832, "step": 222 }, { "epoch": 0.11325545962417471, "grad_norm": 1.889204502105713, "learning_rate": 1.942225892818363e-05, "loss": 1.4722, "step": 223 }, { "epoch": 0.11376333164042661, "grad_norm": 2.5287232398986816, "learning_rate": 1.941687486500121e-05, "loss": 1.3209, "step": 224 }, { "epoch": 0.11427120365667852, "grad_norm": 1.8791983127593994, "learning_rate": 1.9411466583857553e-05, "loss": 1.4135, "step": 225 }, { "epoch": 0.11477907567293043, "grad_norm": 2.2831013202667236, "learning_rate": 1.940603409866147e-05, "loss": 1.3759, "step": 226 }, { "epoch": 0.11528694768918232, "grad_norm": 2.0638115406036377, "learning_rate": 1.9400577423384024e-05, "loss": 1.567, "step": 227 }, { "epoch": 0.11579481970543423, "grad_norm": 1.8206024169921875, "learning_rate": 1.9395096572058482e-05, "loss": 1.3574, "step": 228 }, { "epoch": 0.11630269172168614, "grad_norm": 2.0053048133850098, "learning_rate": 1.9389591558780293e-05, "loss": 1.59, "step": 229 }, { "epoch": 0.11681056373793804, "grad_norm": 1.729561686515808, "learning_rate": 1.938406239770704e-05, "loss": 1.4971, "step": 230 }, { "epoch": 0.11731843575418995, "grad_norm": 2.176643133163452, "learning_rate": 1.9378509103058406e-05, "loss": 1.5178, "step": 231 }, { "epoch": 0.11782630777044185, "grad_norm": 1.8067771196365356, "learning_rate": 1.9372931689116147e-05, "loss": 1.5915, "step": 232 }, { "epoch": 0.11833417978669375, "grad_norm": 2.685448408126831, "learning_rate": 1.9367330170224043e-05, "loss": 1.0551, "step": 233 }, { "epoch": 0.11884205180294566, "grad_norm": 1.656010389328003, "learning_rate": 1.9361704560787872e-05, "loss": 1.4525, "step": 234 }, { "epoch": 0.11934992381919757, "grad_norm": 1.7234470844268799, "learning_rate": 1.9356054875275353e-05, "loss": 1.393, "step": 235 }, { "epoch": 0.11985779583544946, "grad_norm": 2.0601320266723633, "learning_rate": 1.935038112821614e-05, "loss": 1.375, "step": 236 }, { "epoch": 0.12036566785170137, "grad_norm": 1.6750483512878418, "learning_rate": 1.9344683334201764e-05, "loss": 1.7454, "step": 237 }, { "epoch": 0.12087353986795328, "grad_norm": 1.990843415260315, "learning_rate": 1.9338961507885586e-05, "loss": 1.5386, "step": 238 }, { "epoch": 0.12138141188420518, "grad_norm": 2.328029155731201, "learning_rate": 1.9333215663982786e-05, "loss": 1.3595, "step": 239 }, { "epoch": 0.12188928390045708, "grad_norm": 2.0774049758911133, "learning_rate": 1.9327445817270304e-05, "loss": 1.366, "step": 240 }, { "epoch": 0.122397155916709, "grad_norm": 2.1023411750793457, "learning_rate": 1.932165198258682e-05, "loss": 1.306, "step": 241 }, { "epoch": 0.12290502793296089, "grad_norm": 1.935680866241455, "learning_rate": 1.931583417483269e-05, "loss": 1.8575, "step": 242 }, { "epoch": 0.1234128999492128, "grad_norm": 1.8602055311203003, "learning_rate": 1.9309992408969938e-05, "loss": 1.3252, "step": 243 }, { "epoch": 0.12392077196546471, "grad_norm": 2.098102569580078, "learning_rate": 1.9304126700022188e-05, "loss": 1.5078, "step": 244 }, { "epoch": 0.1244286439817166, "grad_norm": 2.305022954940796, "learning_rate": 1.929823706307466e-05, "loss": 1.5595, "step": 245 }, { "epoch": 0.12493651599796851, "grad_norm": 1.9000664949417114, "learning_rate": 1.9292323513274083e-05, "loss": 1.4804, "step": 246 }, { "epoch": 0.12544438801422042, "grad_norm": 2.0502431392669678, "learning_rate": 1.928638606582871e-05, "loss": 1.4233, "step": 247 }, { "epoch": 0.12595226003047233, "grad_norm": 1.6554498672485352, "learning_rate": 1.928042473600824e-05, "loss": 1.56, "step": 248 }, { "epoch": 0.12646013204672424, "grad_norm": 1.7347345352172852, "learning_rate": 1.9274439539143796e-05, "loss": 1.6861, "step": 249 }, { "epoch": 0.12696800406297612, "grad_norm": 1.698886513710022, "learning_rate": 1.9268430490627882e-05, "loss": 1.4357, "step": 250 }, { "epoch": 0.12747587607922803, "grad_norm": 1.5664055347442627, "learning_rate": 1.9262397605914335e-05, "loss": 1.5929, "step": 251 }, { "epoch": 0.12798374809547994, "grad_norm": 2.1273763179779053, "learning_rate": 1.9256340900518297e-05, "loss": 1.6916, "step": 252 }, { "epoch": 0.12849162011173185, "grad_norm": 1.813781499862671, "learning_rate": 1.9250260390016183e-05, "loss": 1.6156, "step": 253 }, { "epoch": 0.12899949212798376, "grad_norm": 2.629873275756836, "learning_rate": 1.9244156090045614e-05, "loss": 1.3784, "step": 254 }, { "epoch": 0.12950736414423566, "grad_norm": 1.9181764125823975, "learning_rate": 1.9238028016305395e-05, "loss": 1.5728, "step": 255 }, { "epoch": 0.13001523616048755, "grad_norm": 1.8989523649215698, "learning_rate": 1.9231876184555478e-05, "loss": 1.4443, "step": 256 }, { "epoch": 0.13052310817673946, "grad_norm": 2.246098279953003, "learning_rate": 1.9225700610616905e-05, "loss": 1.373, "step": 257 }, { "epoch": 0.13103098019299136, "grad_norm": 1.9531599283218384, "learning_rate": 1.9219501310371785e-05, "loss": 1.4015, "step": 258 }, { "epoch": 0.13153885220924327, "grad_norm": 1.9696428775787354, "learning_rate": 1.9213278299763247e-05, "loss": 1.4289, "step": 259 }, { "epoch": 0.13204672422549518, "grad_norm": 2.4111690521240234, "learning_rate": 1.920703159479538e-05, "loss": 1.3672, "step": 260 }, { "epoch": 0.1325545962417471, "grad_norm": 1.8648861646652222, "learning_rate": 1.9200761211533242e-05, "loss": 1.4389, "step": 261 }, { "epoch": 0.13306246825799897, "grad_norm": 1.9896775484085083, "learning_rate": 1.919446716610275e-05, "loss": 1.5265, "step": 262 }, { "epoch": 0.13357034027425088, "grad_norm": 1.900830626487732, "learning_rate": 1.9188149474690702e-05, "loss": 1.4751, "step": 263 }, { "epoch": 0.1340782122905028, "grad_norm": 2.4136266708374023, "learning_rate": 1.9181808153544693e-05, "loss": 1.2784, "step": 264 }, { "epoch": 0.1345860843067547, "grad_norm": 2.2382752895355225, "learning_rate": 1.9175443218973088e-05, "loss": 1.3597, "step": 265 }, { "epoch": 0.1350939563230066, "grad_norm": 2.2490243911743164, "learning_rate": 1.916905468734499e-05, "loss": 1.4279, "step": 266 }, { "epoch": 0.13560182833925852, "grad_norm": 2.242929458618164, "learning_rate": 1.9162642575090177e-05, "loss": 1.4268, "step": 267 }, { "epoch": 0.1361097003555104, "grad_norm": 2.0219061374664307, "learning_rate": 1.915620689869908e-05, "loss": 1.5397, "step": 268 }, { "epoch": 0.1366175723717623, "grad_norm": 2.5027315616607666, "learning_rate": 1.9149747674722728e-05, "loss": 1.3594, "step": 269 }, { "epoch": 0.13712544438801422, "grad_norm": 2.541565418243408, "learning_rate": 1.9143264919772706e-05, "loss": 1.6437, "step": 270 }, { "epoch": 0.13763331640426613, "grad_norm": 1.9443773031234741, "learning_rate": 1.9136758650521115e-05, "loss": 1.4999, "step": 271 }, { "epoch": 0.13814118842051804, "grad_norm": 2.0174036026000977, "learning_rate": 1.913022888370053e-05, "loss": 1.4428, "step": 272 }, { "epoch": 0.13864906043676994, "grad_norm": 2.9639198780059814, "learning_rate": 1.9123675636103965e-05, "loss": 1.0563, "step": 273 }, { "epoch": 0.13915693245302183, "grad_norm": 2.349069118499756, "learning_rate": 1.911709892458481e-05, "loss": 1.4261, "step": 274 }, { "epoch": 0.13966480446927373, "grad_norm": 2.796888828277588, "learning_rate": 1.9110498766056795e-05, "loss": 1.3594, "step": 275 }, { "epoch": 0.14017267648552564, "grad_norm": 1.9367467164993286, "learning_rate": 1.9103875177493965e-05, "loss": 1.2106, "step": 276 }, { "epoch": 0.14068054850177755, "grad_norm": 2.081439256668091, "learning_rate": 1.9097228175930614e-05, "loss": 1.7384, "step": 277 }, { "epoch": 0.14118842051802946, "grad_norm": 2.059976100921631, "learning_rate": 1.909055777846124e-05, "loss": 1.6941, "step": 278 }, { "epoch": 0.14169629253428137, "grad_norm": 2.0120351314544678, "learning_rate": 1.9083864002240528e-05, "loss": 1.6482, "step": 279 }, { "epoch": 0.14220416455053325, "grad_norm": 2.1116793155670166, "learning_rate": 1.9077146864483276e-05, "loss": 1.4704, "step": 280 }, { "epoch": 0.14271203656678516, "grad_norm": 1.8414626121520996, "learning_rate": 1.9070406382464357e-05, "loss": 1.5199, "step": 281 }, { "epoch": 0.14321990858303707, "grad_norm": 2.3146474361419678, "learning_rate": 1.906364257351869e-05, "loss": 1.3412, "step": 282 }, { "epoch": 0.14372778059928898, "grad_norm": 3.1995961666107178, "learning_rate": 1.9056855455041184e-05, "loss": 1.1945, "step": 283 }, { "epoch": 0.1442356526155409, "grad_norm": 2.307894229888916, "learning_rate": 1.905004504448669e-05, "loss": 1.4024, "step": 284 }, { "epoch": 0.1447435246317928, "grad_norm": 2.1073455810546875, "learning_rate": 1.9043211359369968e-05, "loss": 1.4765, "step": 285 }, { "epoch": 0.1452513966480447, "grad_norm": 2.1424057483673096, "learning_rate": 1.9036354417265625e-05, "loss": 1.4475, "step": 286 }, { "epoch": 0.1457592686642966, "grad_norm": 2.280087471008301, "learning_rate": 1.9029474235808084e-05, "loss": 1.4765, "step": 287 }, { "epoch": 0.1462671406805485, "grad_norm": 2.5047733783721924, "learning_rate": 1.902257083269154e-05, "loss": 1.4661, "step": 288 }, { "epoch": 0.1467750126968004, "grad_norm": 2.0760483741760254, "learning_rate": 1.9015644225669898e-05, "loss": 1.3863, "step": 289 }, { "epoch": 0.14728288471305231, "grad_norm": 2.574784755706787, "learning_rate": 1.900869443255674e-05, "loss": 1.3514, "step": 290 }, { "epoch": 0.14779075672930422, "grad_norm": 1.9647001028060913, "learning_rate": 1.9001721471225288e-05, "loss": 1.6358, "step": 291 }, { "epoch": 0.14829862874555613, "grad_norm": 2.3125619888305664, "learning_rate": 1.8994725359608337e-05, "loss": 1.2977, "step": 292 }, { "epoch": 0.148806500761808, "grad_norm": 2.3946354389190674, "learning_rate": 1.898770611569822e-05, "loss": 1.1799, "step": 293 }, { "epoch": 0.14931437277805992, "grad_norm": 2.557532548904419, "learning_rate": 1.898066375754676e-05, "loss": 1.2864, "step": 294 }, { "epoch": 0.14982224479431183, "grad_norm": 1.8328592777252197, "learning_rate": 1.8973598303265227e-05, "loss": 1.5309, "step": 295 }, { "epoch": 0.15033011681056374, "grad_norm": 2.9495561122894287, "learning_rate": 1.896650977102429e-05, "loss": 1.3253, "step": 296 }, { "epoch": 0.15083798882681565, "grad_norm": 2.62872576713562, "learning_rate": 1.8959398179053967e-05, "loss": 1.5211, "step": 297 }, { "epoch": 0.15134586084306756, "grad_norm": 2.3727867603302, "learning_rate": 1.895226354564358e-05, "loss": 1.3479, "step": 298 }, { "epoch": 0.15185373285931944, "grad_norm": 1.9571844339370728, "learning_rate": 1.894510588914171e-05, "loss": 1.4394, "step": 299 }, { "epoch": 0.15236160487557135, "grad_norm": 2.380986213684082, "learning_rate": 1.8937925227956138e-05, "loss": 1.4961, "step": 300 }, { "epoch": 0.15286947689182326, "grad_norm": 2.1453568935394287, "learning_rate": 1.8930721580553823e-05, "loss": 1.4865, "step": 301 }, { "epoch": 0.15337734890807517, "grad_norm": 2.148232936859131, "learning_rate": 1.8923494965460835e-05, "loss": 1.4999, "step": 302 }, { "epoch": 0.15388522092432708, "grad_norm": 2.053626298904419, "learning_rate": 1.8916245401262302e-05, "loss": 1.4948, "step": 303 }, { "epoch": 0.15439309294057899, "grad_norm": 2.15143084526062, "learning_rate": 1.8908972906602377e-05, "loss": 1.3438, "step": 304 }, { "epoch": 0.15490096495683087, "grad_norm": 1.9328835010528564, "learning_rate": 1.8901677500184193e-05, "loss": 1.6481, "step": 305 }, { "epoch": 0.15540883697308278, "grad_norm": 2.4750730991363525, "learning_rate": 1.8894359200769795e-05, "loss": 1.414, "step": 306 }, { "epoch": 0.15591670898933468, "grad_norm": 2.187335968017578, "learning_rate": 1.88870180271801e-05, "loss": 1.6663, "step": 307 }, { "epoch": 0.1564245810055866, "grad_norm": 1.9440557956695557, "learning_rate": 1.8879653998294874e-05, "loss": 1.574, "step": 308 }, { "epoch": 0.1569324530218385, "grad_norm": 2.0348010063171387, "learning_rate": 1.887226713305264e-05, "loss": 1.5302, "step": 309 }, { "epoch": 0.1574403250380904, "grad_norm": 2.1762094497680664, "learning_rate": 1.8864857450450656e-05, "loss": 1.3841, "step": 310 }, { "epoch": 0.1579481970543423, "grad_norm": 2.0718870162963867, "learning_rate": 1.885742496954486e-05, "loss": 1.4259, "step": 311 }, { "epoch": 0.1584560690705942, "grad_norm": 2.1066360473632812, "learning_rate": 1.8849969709449832e-05, "loss": 1.4205, "step": 312 }, { "epoch": 0.1589639410868461, "grad_norm": 2.104362726211548, "learning_rate": 1.8842491689338723e-05, "loss": 1.4979, "step": 313 }, { "epoch": 0.15947181310309802, "grad_norm": 2.0534019470214844, "learning_rate": 1.8834990928443223e-05, "loss": 1.4079, "step": 314 }, { "epoch": 0.15997968511934993, "grad_norm": 2.407684087753296, "learning_rate": 1.8827467446053495e-05, "loss": 1.2353, "step": 315 }, { "epoch": 0.16048755713560184, "grad_norm": 1.9434267282485962, "learning_rate": 1.881992126151816e-05, "loss": 1.4704, "step": 316 }, { "epoch": 0.16099542915185372, "grad_norm": 2.3935511112213135, "learning_rate": 1.88123523942442e-05, "loss": 1.5185, "step": 317 }, { "epoch": 0.16150330116810563, "grad_norm": 2.342310667037964, "learning_rate": 1.8804760863696935e-05, "loss": 1.4252, "step": 318 }, { "epoch": 0.16201117318435754, "grad_norm": 2.0927910804748535, "learning_rate": 1.8797146689399986e-05, "loss": 1.4787, "step": 319 }, { "epoch": 0.16251904520060945, "grad_norm": 2.240511417388916, "learning_rate": 1.8789509890935195e-05, "loss": 1.4495, "step": 320 }, { "epoch": 0.16302691721686136, "grad_norm": 2.1083669662475586, "learning_rate": 1.878185048794259e-05, "loss": 1.3573, "step": 321 }, { "epoch": 0.16353478923311326, "grad_norm": 2.706357002258301, "learning_rate": 1.8774168500120327e-05, "loss": 1.1445, "step": 322 }, { "epoch": 0.16404266124936517, "grad_norm": 2.1849541664123535, "learning_rate": 1.8766463947224656e-05, "loss": 1.5005, "step": 323 }, { "epoch": 0.16455053326561705, "grad_norm": 1.9791027307510376, "learning_rate": 1.875873684906985e-05, "loss": 1.522, "step": 324 }, { "epoch": 0.16505840528186896, "grad_norm": 3.4028851985931396, "learning_rate": 1.8750987225528172e-05, "loss": 1.2043, "step": 325 }, { "epoch": 0.16556627729812087, "grad_norm": 2.6301186084747314, "learning_rate": 1.874321509652981e-05, "loss": 1.3337, "step": 326 }, { "epoch": 0.16607414931437278, "grad_norm": 2.031865119934082, "learning_rate": 1.8735420482062823e-05, "loss": 1.7132, "step": 327 }, { "epoch": 0.1665820213306247, "grad_norm": 2.090599298477173, "learning_rate": 1.872760340217311e-05, "loss": 1.577, "step": 328 }, { "epoch": 0.1670898933468766, "grad_norm": 2.234833240509033, "learning_rate": 1.8719763876964336e-05, "loss": 1.2331, "step": 329 }, { "epoch": 0.16759776536312848, "grad_norm": 2.4577765464782715, "learning_rate": 1.87119019265979e-05, "loss": 1.4527, "step": 330 }, { "epoch": 0.1681056373793804, "grad_norm": 1.89016592502594, "learning_rate": 1.8704017571292863e-05, "loss": 1.4465, "step": 331 }, { "epoch": 0.1686135093956323, "grad_norm": 2.2821614742279053, "learning_rate": 1.8696110831325913e-05, "loss": 1.3836, "step": 332 }, { "epoch": 0.1691213814118842, "grad_norm": 2.25083327293396, "learning_rate": 1.8688181727031302e-05, "loss": 1.5524, "step": 333 }, { "epoch": 0.16962925342813612, "grad_norm": 2.1324195861816406, "learning_rate": 1.8680230278800803e-05, "loss": 1.5654, "step": 334 }, { "epoch": 0.17013712544438803, "grad_norm": 1.9769184589385986, "learning_rate": 1.8672256507083643e-05, "loss": 1.5077, "step": 335 }, { "epoch": 0.1706449974606399, "grad_norm": 2.0667829513549805, "learning_rate": 1.8664260432386468e-05, "loss": 1.5829, "step": 336 }, { "epoch": 0.17115286947689182, "grad_norm": 2.8132925033569336, "learning_rate": 1.8656242075273285e-05, "loss": 1.1102, "step": 337 }, { "epoch": 0.17166074149314373, "grad_norm": 2.055262565612793, "learning_rate": 1.864820145636539e-05, "loss": 1.3644, "step": 338 }, { "epoch": 0.17216861350939563, "grad_norm": 2.576754570007324, "learning_rate": 1.8640138596341344e-05, "loss": 1.1964, "step": 339 }, { "epoch": 0.17267648552564754, "grad_norm": 2.026365280151367, "learning_rate": 1.8632053515936908e-05, "loss": 1.4661, "step": 340 }, { "epoch": 0.17318435754189945, "grad_norm": 2.220334768295288, "learning_rate": 1.8623946235944984e-05, "loss": 1.5105, "step": 341 }, { "epoch": 0.17369222955815133, "grad_norm": 1.902352213859558, "learning_rate": 1.8615816777215568e-05, "loss": 1.3642, "step": 342 }, { "epoch": 0.17420010157440324, "grad_norm": 2.278432846069336, "learning_rate": 1.860766516065569e-05, "loss": 1.2551, "step": 343 }, { "epoch": 0.17470797359065515, "grad_norm": 2.4950084686279297, "learning_rate": 1.859949140722937e-05, "loss": 1.49, "step": 344 }, { "epoch": 0.17521584560690706, "grad_norm": 2.029480218887329, "learning_rate": 1.859129553795756e-05, "loss": 1.3534, "step": 345 }, { "epoch": 0.17572371762315897, "grad_norm": 2.1549057960510254, "learning_rate": 1.8583077573918075e-05, "loss": 1.3466, "step": 346 }, { "epoch": 0.17623158963941088, "grad_norm": 2.1388282775878906, "learning_rate": 1.8574837536245573e-05, "loss": 1.6887, "step": 347 }, { "epoch": 0.17673946165566276, "grad_norm": 2.1772992610931396, "learning_rate": 1.8566575446131465e-05, "loss": 1.3554, "step": 348 }, { "epoch": 0.17724733367191467, "grad_norm": 2.1744778156280518, "learning_rate": 1.855829132482388e-05, "loss": 1.4455, "step": 349 }, { "epoch": 0.17775520568816658, "grad_norm": 2.47438383102417, "learning_rate": 1.8549985193627614e-05, "loss": 1.0442, "step": 350 }, { "epoch": 0.1782630777044185, "grad_norm": 2.6652379035949707, "learning_rate": 1.8541657073904056e-05, "loss": 1.5324, "step": 351 }, { "epoch": 0.1787709497206704, "grad_norm": 1.9979355335235596, "learning_rate": 1.8533306987071143e-05, "loss": 1.3954, "step": 352 }, { "epoch": 0.1792788217369223, "grad_norm": 2.422628164291382, "learning_rate": 1.8524934954603315e-05, "loss": 1.1543, "step": 353 }, { "epoch": 0.1797866937531742, "grad_norm": 2.561239242553711, "learning_rate": 1.8516540998031448e-05, "loss": 1.4323, "step": 354 }, { "epoch": 0.1802945657694261, "grad_norm": 2.8377954959869385, "learning_rate": 1.85081251389428e-05, "loss": 1.2745, "step": 355 }, { "epoch": 0.180802437785678, "grad_norm": 2.4836831092834473, "learning_rate": 1.8499687398980958e-05, "loss": 1.6612, "step": 356 }, { "epoch": 0.18131030980192991, "grad_norm": 2.547248601913452, "learning_rate": 1.849122779984578e-05, "loss": 1.1906, "step": 357 }, { "epoch": 0.18181818181818182, "grad_norm": 2.7639479637145996, "learning_rate": 1.8482746363293343e-05, "loss": 1.4388, "step": 358 }, { "epoch": 0.18232605383443373, "grad_norm": 2.050381898880005, "learning_rate": 1.8474243111135887e-05, "loss": 1.3234, "step": 359 }, { "epoch": 0.1828339258506856, "grad_norm": 2.310777187347412, "learning_rate": 1.8465718065241748e-05, "loss": 1.4396, "step": 360 }, { "epoch": 0.18334179786693752, "grad_norm": 2.1302127838134766, "learning_rate": 1.8457171247535317e-05, "loss": 1.3624, "step": 361 }, { "epoch": 0.18384966988318943, "grad_norm": 2.225740671157837, "learning_rate": 1.8448602679996973e-05, "loss": 1.4544, "step": 362 }, { "epoch": 0.18435754189944134, "grad_norm": 2.29659104347229, "learning_rate": 1.8440012384663038e-05, "loss": 1.4831, "step": 363 }, { "epoch": 0.18486541391569325, "grad_norm": 2.4151954650878906, "learning_rate": 1.8431400383625705e-05, "loss": 1.6177, "step": 364 }, { "epoch": 0.18537328593194516, "grad_norm": 2.5019686222076416, "learning_rate": 1.8422766699032987e-05, "loss": 1.6529, "step": 365 }, { "epoch": 0.18588115794819707, "grad_norm": 2.6340701580047607, "learning_rate": 1.8414111353088673e-05, "loss": 1.4929, "step": 366 }, { "epoch": 0.18638902996444895, "grad_norm": 2.0728466510772705, "learning_rate": 1.840543436805225e-05, "loss": 1.2841, "step": 367 }, { "epoch": 0.18689690198070086, "grad_norm": 2.3754184246063232, "learning_rate": 1.8396735766238857e-05, "loss": 1.5152, "step": 368 }, { "epoch": 0.18740477399695277, "grad_norm": 2.1963157653808594, "learning_rate": 1.838801557001923e-05, "loss": 1.5506, "step": 369 }, { "epoch": 0.18791264601320468, "grad_norm": 2.961599588394165, "learning_rate": 1.8379273801819638e-05, "loss": 1.4706, "step": 370 }, { "epoch": 0.18842051802945659, "grad_norm": 2.4010231494903564, "learning_rate": 1.8370510484121827e-05, "loss": 1.5459, "step": 371 }, { "epoch": 0.1889283900457085, "grad_norm": 2.4878487586975098, "learning_rate": 1.836172563946297e-05, "loss": 1.3344, "step": 372 }, { "epoch": 0.18943626206196038, "grad_norm": 2.9341530799865723, "learning_rate": 1.835291929043559e-05, "loss": 1.1486, "step": 373 }, { "epoch": 0.18994413407821228, "grad_norm": 2.265747308731079, "learning_rate": 1.8344091459687523e-05, "loss": 1.4896, "step": 374 }, { "epoch": 0.1904520060944642, "grad_norm": 2.425539016723633, "learning_rate": 1.8335242169921855e-05, "loss": 1.34, "step": 375 }, { "epoch": 0.1909598781107161, "grad_norm": 2.2222049236297607, "learning_rate": 1.8326371443896844e-05, "loss": 1.3314, "step": 376 }, { "epoch": 0.191467750126968, "grad_norm": 2.7927303314208984, "learning_rate": 1.8317479304425892e-05, "loss": 1.5547, "step": 377 }, { "epoch": 0.19197562214321992, "grad_norm": 2.29571795463562, "learning_rate": 1.8308565774377467e-05, "loss": 1.6539, "step": 378 }, { "epoch": 0.1924834941594718, "grad_norm": 2.2622365951538086, "learning_rate": 1.8299630876675044e-05, "loss": 1.0428, "step": 379 }, { "epoch": 0.1929913661757237, "grad_norm": 2.2128021717071533, "learning_rate": 1.8290674634297058e-05, "loss": 1.7003, "step": 380 }, { "epoch": 0.19349923819197562, "grad_norm": 2.8428139686584473, "learning_rate": 1.8281697070276823e-05, "loss": 1.3185, "step": 381 }, { "epoch": 0.19400711020822753, "grad_norm": 2.158491849899292, "learning_rate": 1.8272698207702507e-05, "loss": 1.5285, "step": 382 }, { "epoch": 0.19451498222447944, "grad_norm": 2.125875949859619, "learning_rate": 1.8263678069717038e-05, "loss": 1.3319, "step": 383 }, { "epoch": 0.19502285424073135, "grad_norm": 2.0204598903656006, "learning_rate": 1.825463667951807e-05, "loss": 1.5469, "step": 384 }, { "epoch": 0.19553072625698323, "grad_norm": 2.247217893600464, "learning_rate": 1.82455740603579e-05, "loss": 1.7199, "step": 385 }, { "epoch": 0.19603859827323514, "grad_norm": 2.4135122299194336, "learning_rate": 1.8236490235543434e-05, "loss": 1.2538, "step": 386 }, { "epoch": 0.19654647028948705, "grad_norm": 2.0119616985321045, "learning_rate": 1.8227385228436105e-05, "loss": 1.6421, "step": 387 }, { "epoch": 0.19705434230573896, "grad_norm": 2.754758596420288, "learning_rate": 1.8218259062451823e-05, "loss": 1.4222, "step": 388 }, { "epoch": 0.19756221432199086, "grad_norm": 3.040740728378296, "learning_rate": 1.8209111761060917e-05, "loss": 0.8824, "step": 389 }, { "epoch": 0.19807008633824277, "grad_norm": 2.2685205936431885, "learning_rate": 1.8199943347788068e-05, "loss": 1.4771, "step": 390 }, { "epoch": 0.19857795835449465, "grad_norm": 2.4334073066711426, "learning_rate": 1.819075384621226e-05, "loss": 1.3698, "step": 391 }, { "epoch": 0.19908583037074656, "grad_norm": 2.461564302444458, "learning_rate": 1.8181543279966693e-05, "loss": 1.3328, "step": 392 }, { "epoch": 0.19959370238699847, "grad_norm": 2.8636112213134766, "learning_rate": 1.817231167273876e-05, "loss": 1.3873, "step": 393 }, { "epoch": 0.20010157440325038, "grad_norm": 2.322828769683838, "learning_rate": 1.8163059048269955e-05, "loss": 1.5203, "step": 394 }, { "epoch": 0.2006094464195023, "grad_norm": 2.6736671924591064, "learning_rate": 1.8153785430355822e-05, "loss": 1.3255, "step": 395 }, { "epoch": 0.2011173184357542, "grad_norm": 2.41998553276062, "learning_rate": 1.814449084284591e-05, "loss": 1.5725, "step": 396 }, { "epoch": 0.20162519045200608, "grad_norm": 2.028114080429077, "learning_rate": 1.813517530964368e-05, "loss": 1.5688, "step": 397 }, { "epoch": 0.202133062468258, "grad_norm": 3.2604825496673584, "learning_rate": 1.8125838854706462e-05, "loss": 1.3424, "step": 398 }, { "epoch": 0.2026409344845099, "grad_norm": 2.9950945377349854, "learning_rate": 1.8116481502045403e-05, "loss": 1.1741, "step": 399 }, { "epoch": 0.2031488065007618, "grad_norm": 3.022077798843384, "learning_rate": 1.8107103275725385e-05, "loss": 1.4031, "step": 400 }, { "epoch": 0.20365667851701372, "grad_norm": 2.1941399574279785, "learning_rate": 1.8097704199864975e-05, "loss": 1.5143, "step": 401 }, { "epoch": 0.20416455053326563, "grad_norm": 2.6506152153015137, "learning_rate": 1.808828429863636e-05, "loss": 1.253, "step": 402 }, { "epoch": 0.20467242254951754, "grad_norm": 2.522321939468384, "learning_rate": 1.8078843596265284e-05, "loss": 1.3241, "step": 403 }, { "epoch": 0.20518029456576942, "grad_norm": 1.9919332265853882, "learning_rate": 1.8069382117030986e-05, "loss": 1.4723, "step": 404 }, { "epoch": 0.20568816658202133, "grad_norm": 2.273698329925537, "learning_rate": 1.8059899885266138e-05, "loss": 1.5205, "step": 405 }, { "epoch": 0.20619603859827323, "grad_norm": 2.337557792663574, "learning_rate": 1.8050396925356784e-05, "loss": 1.5634, "step": 406 }, { "epoch": 0.20670391061452514, "grad_norm": 2.289808750152588, "learning_rate": 1.8040873261742283e-05, "loss": 1.4679, "step": 407 }, { "epoch": 0.20721178263077705, "grad_norm": 2.309823989868164, "learning_rate": 1.8031328918915217e-05, "loss": 1.4815, "step": 408 }, { "epoch": 0.20771965464702896, "grad_norm": 2.8153891563415527, "learning_rate": 1.802176392142138e-05, "loss": 1.015, "step": 409 }, { "epoch": 0.20822752666328084, "grad_norm": 2.379129409790039, "learning_rate": 1.8012178293859656e-05, "loss": 1.5316, "step": 410 }, { "epoch": 0.20873539867953275, "grad_norm": 2.3453006744384766, "learning_rate": 1.8002572060882007e-05, "loss": 1.3609, "step": 411 }, { "epoch": 0.20924327069578466, "grad_norm": 2.95465350151062, "learning_rate": 1.7992945247193377e-05, "loss": 1.5909, "step": 412 }, { "epoch": 0.20975114271203657, "grad_norm": 2.9034945964813232, "learning_rate": 1.7983297877551632e-05, "loss": 1.3234, "step": 413 }, { "epoch": 0.21025901472828848, "grad_norm": 1.9625252485275269, "learning_rate": 1.797362997676752e-05, "loss": 1.6168, "step": 414 }, { "epoch": 0.2107668867445404, "grad_norm": 2.2009334564208984, "learning_rate": 1.796394156970458e-05, "loss": 1.5079, "step": 415 }, { "epoch": 0.21127475876079227, "grad_norm": 3.1377625465393066, "learning_rate": 1.7954232681279088e-05, "loss": 1.0786, "step": 416 }, { "epoch": 0.21178263077704418, "grad_norm": 2.8287923336029053, "learning_rate": 1.794450333645999e-05, "loss": 1.4505, "step": 417 }, { "epoch": 0.2122905027932961, "grad_norm": 2.2335076332092285, "learning_rate": 1.7934753560268857e-05, "loss": 1.6506, "step": 418 }, { "epoch": 0.212798374809548, "grad_norm": 2.296365737915039, "learning_rate": 1.792498337777978e-05, "loss": 1.3601, "step": 419 }, { "epoch": 0.2133062468257999, "grad_norm": 2.6674273014068604, "learning_rate": 1.7915192814119353e-05, "loss": 1.4654, "step": 420 }, { "epoch": 0.21381411884205181, "grad_norm": 2.8880250453948975, "learning_rate": 1.790538189446657e-05, "loss": 1.3702, "step": 421 }, { "epoch": 0.2143219908583037, "grad_norm": 2.449025869369507, "learning_rate": 1.7895550644052786e-05, "loss": 1.4645, "step": 422 }, { "epoch": 0.2148298628745556, "grad_norm": 1.928270697593689, "learning_rate": 1.7885699088161633e-05, "loss": 1.4804, "step": 423 }, { "epoch": 0.21533773489080751, "grad_norm": 2.3959450721740723, "learning_rate": 1.7875827252128975e-05, "loss": 1.2716, "step": 424 }, { "epoch": 0.21584560690705942, "grad_norm": 2.897495985031128, "learning_rate": 1.786593516134282e-05, "loss": 1.6, "step": 425 }, { "epoch": 0.21635347892331133, "grad_norm": 2.6640288829803467, "learning_rate": 1.7856022841243267e-05, "loss": 1.4547, "step": 426 }, { "epoch": 0.21686135093956324, "grad_norm": 2.1698009967803955, "learning_rate": 1.7846090317322457e-05, "loss": 1.3702, "step": 427 }, { "epoch": 0.21736922295581512, "grad_norm": 2.8702659606933594, "learning_rate": 1.7836137615124466e-05, "loss": 1.2881, "step": 428 }, { "epoch": 0.21787709497206703, "grad_norm": 2.8904287815093994, "learning_rate": 1.7826164760245284e-05, "loss": 1.5613, "step": 429 }, { "epoch": 0.21838496698831894, "grad_norm": 2.9009711742401123, "learning_rate": 1.7816171778332715e-05, "loss": 1.3943, "step": 430 }, { "epoch": 0.21889283900457085, "grad_norm": 2.75502872467041, "learning_rate": 1.7806158695086333e-05, "loss": 1.4311, "step": 431 }, { "epoch": 0.21940071102082276, "grad_norm": 2.89510178565979, "learning_rate": 1.7796125536257408e-05, "loss": 1.3384, "step": 432 }, { "epoch": 0.21990858303707467, "grad_norm": 2.7613089084625244, "learning_rate": 1.778607232764883e-05, "loss": 1.734, "step": 433 }, { "epoch": 0.22041645505332655, "grad_norm": 1.9892759323120117, "learning_rate": 1.7775999095115066e-05, "loss": 1.4728, "step": 434 }, { "epoch": 0.22092432706957846, "grad_norm": 2.4320971965789795, "learning_rate": 1.7765905864562066e-05, "loss": 1.4656, "step": 435 }, { "epoch": 0.22143219908583037, "grad_norm": 2.669489622116089, "learning_rate": 1.7755792661947224e-05, "loss": 1.2149, "step": 436 }, { "epoch": 0.22194007110208228, "grad_norm": 2.6985087394714355, "learning_rate": 1.774565951327929e-05, "loss": 1.3109, "step": 437 }, { "epoch": 0.22244794311833418, "grad_norm": 2.665987014770508, "learning_rate": 1.7735506444618303e-05, "loss": 1.4917, "step": 438 }, { "epoch": 0.2229558151345861, "grad_norm": 2.214789628982544, "learning_rate": 1.7725333482075546e-05, "loss": 1.6784, "step": 439 }, { "epoch": 0.22346368715083798, "grad_norm": 2.9302868843078613, "learning_rate": 1.771514065181346e-05, "loss": 1.2476, "step": 440 }, { "epoch": 0.22397155916708988, "grad_norm": 2.3863744735717773, "learning_rate": 1.7704927980045578e-05, "loss": 1.3971, "step": 441 }, { "epoch": 0.2244794311833418, "grad_norm": 2.391859769821167, "learning_rate": 1.7694695493036454e-05, "loss": 1.3805, "step": 442 }, { "epoch": 0.2249873031995937, "grad_norm": 2.776182174682617, "learning_rate": 1.7684443217101616e-05, "loss": 1.3091, "step": 443 }, { "epoch": 0.2254951752158456, "grad_norm": 2.4615769386291504, "learning_rate": 1.7674171178607476e-05, "loss": 1.457, "step": 444 }, { "epoch": 0.22600304723209752, "grad_norm": 2.2038397789001465, "learning_rate": 1.7663879403971274e-05, "loss": 1.4441, "step": 445 }, { "epoch": 0.22651091924834943, "grad_norm": 2.2213821411132812, "learning_rate": 1.7653567919661004e-05, "loss": 1.3748, "step": 446 }, { "epoch": 0.2270187912646013, "grad_norm": 2.3839704990386963, "learning_rate": 1.7643236752195357e-05, "loss": 1.6147, "step": 447 }, { "epoch": 0.22752666328085322, "grad_norm": 2.652604341506958, "learning_rate": 1.7632885928143624e-05, "loss": 1.5667, "step": 448 }, { "epoch": 0.22803453529710513, "grad_norm": 2.5281620025634766, "learning_rate": 1.762251547412567e-05, "loss": 1.2654, "step": 449 }, { "epoch": 0.22854240731335704, "grad_norm": 2.3414971828460693, "learning_rate": 1.761212541681183e-05, "loss": 1.4149, "step": 450 }, { "epoch": 0.22905027932960895, "grad_norm": 3.3338890075683594, "learning_rate": 1.760171578292287e-05, "loss": 1.5189, "step": 451 }, { "epoch": 0.22955815134586086, "grad_norm": 2.4341635704040527, "learning_rate": 1.7591286599229874e-05, "loss": 1.5695, "step": 452 }, { "epoch": 0.23006602336211274, "grad_norm": 2.1505444049835205, "learning_rate": 1.7580837892554233e-05, "loss": 1.3496, "step": 453 }, { "epoch": 0.23057389537836465, "grad_norm": 2.65849232673645, "learning_rate": 1.7570369689767534e-05, "loss": 1.4188, "step": 454 }, { "epoch": 0.23108176739461656, "grad_norm": 2.5335099697113037, "learning_rate": 1.75598820177915e-05, "loss": 1.5386, "step": 455 }, { "epoch": 0.23158963941086846, "grad_norm": 2.5862152576446533, "learning_rate": 1.7549374903597923e-05, "loss": 1.5935, "step": 456 }, { "epoch": 0.23209751142712037, "grad_norm": 2.326003074645996, "learning_rate": 1.7538848374208606e-05, "loss": 1.3387, "step": 457 }, { "epoch": 0.23260538344337228, "grad_norm": 2.2396774291992188, "learning_rate": 1.7528302456695278e-05, "loss": 1.565, "step": 458 }, { "epoch": 0.23311325545962416, "grad_norm": 3.605147361755371, "learning_rate": 1.751773717817953e-05, "loss": 0.9666, "step": 459 }, { "epoch": 0.23362112747587607, "grad_norm": 2.740504503250122, "learning_rate": 1.750715256583274e-05, "loss": 1.4752, "step": 460 }, { "epoch": 0.23412899949212798, "grad_norm": 3.1214797496795654, "learning_rate": 1.7496548646876018e-05, "loss": 1.1117, "step": 461 }, { "epoch": 0.2346368715083799, "grad_norm": 2.327601432800293, "learning_rate": 1.748592544858011e-05, "loss": 1.2953, "step": 462 }, { "epoch": 0.2351447435246318, "grad_norm": 2.6740219593048096, "learning_rate": 1.7475282998265367e-05, "loss": 1.462, "step": 463 }, { "epoch": 0.2356526155408837, "grad_norm": 2.490809202194214, "learning_rate": 1.746462132330163e-05, "loss": 1.234, "step": 464 }, { "epoch": 0.2361604875571356, "grad_norm": 3.0028162002563477, "learning_rate": 1.745394045110819e-05, "loss": 1.0599, "step": 465 }, { "epoch": 0.2366683595733875, "grad_norm": 2.8784072399139404, "learning_rate": 1.7443240409153714e-05, "loss": 1.188, "step": 466 }, { "epoch": 0.2371762315896394, "grad_norm": 2.4966988563537598, "learning_rate": 1.743252122495616e-05, "loss": 1.5535, "step": 467 }, { "epoch": 0.23768410360589132, "grad_norm": 2.3101346492767334, "learning_rate": 1.7421782926082714e-05, "loss": 1.4142, "step": 468 }, { "epoch": 0.23819197562214323, "grad_norm": 2.4662463665008545, "learning_rate": 1.7411025540149737e-05, "loss": 1.4697, "step": 469 }, { "epoch": 0.23869984763839514, "grad_norm": 2.2677195072174072, "learning_rate": 1.7400249094822656e-05, "loss": 1.3498, "step": 470 }, { "epoch": 0.23920771965464702, "grad_norm": 2.796475410461426, "learning_rate": 1.7389453617815926e-05, "loss": 1.6972, "step": 471 }, { "epoch": 0.23971559167089893, "grad_norm": 2.44164776802063, "learning_rate": 1.737863913689295e-05, "loss": 1.5431, "step": 472 }, { "epoch": 0.24022346368715083, "grad_norm": 3.1222140789031982, "learning_rate": 1.736780567986599e-05, "loss": 1.1978, "step": 473 }, { "epoch": 0.24073133570340274, "grad_norm": 3.480477809906006, "learning_rate": 1.735695327459613e-05, "loss": 1.0457, "step": 474 }, { "epoch": 0.24123920771965465, "grad_norm": 2.646369695663452, "learning_rate": 1.734608194899317e-05, "loss": 1.2979, "step": 475 }, { "epoch": 0.24174707973590656, "grad_norm": 2.220792531967163, "learning_rate": 1.7335191731015564e-05, "loss": 1.4697, "step": 476 }, { "epoch": 0.24225495175215844, "grad_norm": 2.121004819869995, "learning_rate": 1.7324282648670374e-05, "loss": 1.5301, "step": 477 }, { "epoch": 0.24276282376841035, "grad_norm": 2.2976834774017334, "learning_rate": 1.731335473001316e-05, "loss": 1.4953, "step": 478 }, { "epoch": 0.24327069578466226, "grad_norm": 2.562124252319336, "learning_rate": 1.730240800314792e-05, "loss": 1.2582, "step": 479 }, { "epoch": 0.24377856780091417, "grad_norm": 2.4853553771972656, "learning_rate": 1.7291442496227042e-05, "loss": 1.3838, "step": 480 }, { "epoch": 0.24428643981716608, "grad_norm": 2.6778106689453125, "learning_rate": 1.7280458237451192e-05, "loss": 1.5141, "step": 481 }, { "epoch": 0.244794311833418, "grad_norm": 2.7512123584747314, "learning_rate": 1.7269455255069276e-05, "loss": 1.342, "step": 482 }, { "epoch": 0.2453021838496699, "grad_norm": 2.8781282901763916, "learning_rate": 1.7258433577378342e-05, "loss": 1.3965, "step": 483 }, { "epoch": 0.24581005586592178, "grad_norm": 2.9807615280151367, "learning_rate": 1.7247393232723523e-05, "loss": 1.5302, "step": 484 }, { "epoch": 0.2463179278821737, "grad_norm": 3.537425994873047, "learning_rate": 1.723633424949796e-05, "loss": 1.3961, "step": 485 }, { "epoch": 0.2468257998984256, "grad_norm": 3.2156057357788086, "learning_rate": 1.722525665614272e-05, "loss": 1.4332, "step": 486 }, { "epoch": 0.2473336719146775, "grad_norm": 2.931434392929077, "learning_rate": 1.7214160481146748e-05, "loss": 1.5141, "step": 487 }, { "epoch": 0.24784154393092941, "grad_norm": 2.7063584327697754, "learning_rate": 1.7203045753046755e-05, "loss": 1.4324, "step": 488 }, { "epoch": 0.24834941594718132, "grad_norm": 2.5695347785949707, "learning_rate": 1.7191912500427184e-05, "loss": 1.4655, "step": 489 }, { "epoch": 0.2488572879634332, "grad_norm": 2.729949951171875, "learning_rate": 1.7180760751920106e-05, "loss": 1.2106, "step": 490 }, { "epoch": 0.2493651599796851, "grad_norm": 2.229206085205078, "learning_rate": 1.7169590536205166e-05, "loss": 1.698, "step": 491 }, { "epoch": 0.24987303199593702, "grad_norm": 2.490645408630371, "learning_rate": 1.7158401882009507e-05, "loss": 1.5399, "step": 492 }, { "epoch": 0.25038090401218893, "grad_norm": 2.817081928253174, "learning_rate": 1.7147194818107674e-05, "loss": 1.2956, "step": 493 }, { "epoch": 0.25038090401218893, "eval_loss": 1.272118330001831, "eval_runtime": 169.0072, "eval_samples_per_second": 5.822, "eval_steps_per_second": 1.456, "step": 493 }, { "epoch": 0.25088877602844084, "grad_norm": 2.6739213466644287, "learning_rate": 1.713596937332158e-05, "loss": 1.2104, "step": 494 }, { "epoch": 0.25139664804469275, "grad_norm": 2.5604422092437744, "learning_rate": 1.7124725576520386e-05, "loss": 1.4294, "step": 495 }, { "epoch": 0.25190452006094466, "grad_norm": 3.073133707046509, "learning_rate": 1.711346345662047e-05, "loss": 1.3513, "step": 496 }, { "epoch": 0.25241239207719657, "grad_norm": 2.7818005084991455, "learning_rate": 1.710218304258532e-05, "loss": 1.3464, "step": 497 }, { "epoch": 0.2529202640934485, "grad_norm": 2.26126766204834, "learning_rate": 1.7090884363425482e-05, "loss": 1.4703, "step": 498 }, { "epoch": 0.25342813610970033, "grad_norm": 2.490053415298462, "learning_rate": 1.7079567448198463e-05, "loss": 1.7689, "step": 499 }, { "epoch": 0.25393600812595224, "grad_norm": 2.396192789077759, "learning_rate": 1.706823232600868e-05, "loss": 1.6666, "step": 500 }, { "epoch": 0.25444388014220415, "grad_norm": 2.811347484588623, "learning_rate": 1.7056879026007373e-05, "loss": 1.3238, "step": 501 }, { "epoch": 0.25495175215845606, "grad_norm": 2.5664217472076416, "learning_rate": 1.7045507577392518e-05, "loss": 1.39, "step": 502 }, { "epoch": 0.25545962417470797, "grad_norm": 2.662076473236084, "learning_rate": 1.7034118009408784e-05, "loss": 1.3517, "step": 503 }, { "epoch": 0.2559674961909599, "grad_norm": 2.6712756156921387, "learning_rate": 1.7022710351347423e-05, "loss": 1.5732, "step": 504 }, { "epoch": 0.2564753682072118, "grad_norm": 2.152195930480957, "learning_rate": 1.7011284632546225e-05, "loss": 1.3897, "step": 505 }, { "epoch": 0.2569832402234637, "grad_norm": 2.2648797035217285, "learning_rate": 1.6999840882389413e-05, "loss": 1.4887, "step": 506 }, { "epoch": 0.2574911122397156, "grad_norm": 3.0260188579559326, "learning_rate": 1.698837913030759e-05, "loss": 1.3154, "step": 507 }, { "epoch": 0.2579989842559675, "grad_norm": 2.791656494140625, "learning_rate": 1.6976899405777656e-05, "loss": 1.3579, "step": 508 }, { "epoch": 0.2585068562722194, "grad_norm": 2.6979916095733643, "learning_rate": 1.6965401738322734e-05, "loss": 1.3458, "step": 509 }, { "epoch": 0.25901472828847133, "grad_norm": 2.2963948249816895, "learning_rate": 1.6953886157512084e-05, "loss": 1.4368, "step": 510 }, { "epoch": 0.2595226003047232, "grad_norm": 2.48846697807312, "learning_rate": 1.6942352692961036e-05, "loss": 1.3519, "step": 511 }, { "epoch": 0.2600304723209751, "grad_norm": 2.709425449371338, "learning_rate": 1.6930801374330924e-05, "loss": 1.4138, "step": 512 }, { "epoch": 0.260538344337227, "grad_norm": 2.7645037174224854, "learning_rate": 1.691923223132899e-05, "loss": 1.1771, "step": 513 }, { "epoch": 0.2610462163534789, "grad_norm": 3.7189676761627197, "learning_rate": 1.690764529370831e-05, "loss": 1.0428, "step": 514 }, { "epoch": 0.2615540883697308, "grad_norm": 3.617924928665161, "learning_rate": 1.689604059126774e-05, "loss": 0.8653, "step": 515 }, { "epoch": 0.26206196038598273, "grad_norm": 2.869488000869751, "learning_rate": 1.6884418153851805e-05, "loss": 1.5021, "step": 516 }, { "epoch": 0.26256983240223464, "grad_norm": 2.827406883239746, "learning_rate": 1.687277801135065e-05, "loss": 1.7214, "step": 517 }, { "epoch": 0.26307770441848655, "grad_norm": 2.7554526329040527, "learning_rate": 1.6861120193699953e-05, "loss": 1.5105, "step": 518 }, { "epoch": 0.26358557643473846, "grad_norm": 3.9668350219726562, "learning_rate": 1.6849444730880846e-05, "loss": 1.1446, "step": 519 }, { "epoch": 0.26409344845099036, "grad_norm": 3.158369779586792, "learning_rate": 1.683775165291984e-05, "loss": 1.0159, "step": 520 }, { "epoch": 0.2646013204672423, "grad_norm": 2.6982619762420654, "learning_rate": 1.6826040989888754e-05, "loss": 1.611, "step": 521 }, { "epoch": 0.2651091924834942, "grad_norm": 4.317055702209473, "learning_rate": 1.681431277190462e-05, "loss": 1.3542, "step": 522 }, { "epoch": 0.2656170644997461, "grad_norm": 3.3620643615722656, "learning_rate": 1.680256702912963e-05, "loss": 1.3332, "step": 523 }, { "epoch": 0.26612493651599795, "grad_norm": 4.196297645568848, "learning_rate": 1.6790803791771034e-05, "loss": 0.9123, "step": 524 }, { "epoch": 0.26663280853224985, "grad_norm": 2.9033923149108887, "learning_rate": 1.677902309008108e-05, "loss": 1.4143, "step": 525 }, { "epoch": 0.26714068054850176, "grad_norm": 2.843406915664673, "learning_rate": 1.6767224954356928e-05, "loss": 1.1294, "step": 526 }, { "epoch": 0.2676485525647537, "grad_norm": 2.732457160949707, "learning_rate": 1.6755409414940584e-05, "loss": 1.7313, "step": 527 }, { "epoch": 0.2681564245810056, "grad_norm": 2.676978826522827, "learning_rate": 1.6743576502218792e-05, "loss": 1.464, "step": 528 }, { "epoch": 0.2686642965972575, "grad_norm": 2.5006861686706543, "learning_rate": 1.6731726246622996e-05, "loss": 1.5256, "step": 529 }, { "epoch": 0.2691721686135094, "grad_norm": 2.2941715717315674, "learning_rate": 1.6719858678629232e-05, "loss": 1.3606, "step": 530 }, { "epoch": 0.2696800406297613, "grad_norm": 2.733076572418213, "learning_rate": 1.6707973828758055e-05, "loss": 1.1881, "step": 531 }, { "epoch": 0.2701879126460132, "grad_norm": 3.1281354427337646, "learning_rate": 1.6696071727574476e-05, "loss": 1.2662, "step": 532 }, { "epoch": 0.2706957846622651, "grad_norm": 2.3248023986816406, "learning_rate": 1.6684152405687867e-05, "loss": 1.6241, "step": 533 }, { "epoch": 0.27120365667851704, "grad_norm": 2.4165563583374023, "learning_rate": 1.667221589375189e-05, "loss": 1.3266, "step": 534 }, { "epoch": 0.27171152869476894, "grad_norm": 2.5224907398223877, "learning_rate": 1.6660262222464404e-05, "loss": 1.3281, "step": 535 }, { "epoch": 0.2722194007110208, "grad_norm": 2.712475061416626, "learning_rate": 1.6648291422567414e-05, "loss": 1.3519, "step": 536 }, { "epoch": 0.2727272727272727, "grad_norm": 2.8065361976623535, "learning_rate": 1.6636303524846968e-05, "loss": 1.5687, "step": 537 }, { "epoch": 0.2732351447435246, "grad_norm": 2.669524669647217, "learning_rate": 1.6624298560133087e-05, "loss": 1.6927, "step": 538 }, { "epoch": 0.2737430167597765, "grad_norm": 2.4854934215545654, "learning_rate": 1.661227655929968e-05, "loss": 1.4282, "step": 539 }, { "epoch": 0.27425088877602843, "grad_norm": 2.914283037185669, "learning_rate": 1.6600237553264474e-05, "loss": 1.4625, "step": 540 }, { "epoch": 0.27475876079228034, "grad_norm": 3.190369129180908, "learning_rate": 1.6588181572988928e-05, "loss": 1.373, "step": 541 }, { "epoch": 0.27526663280853225, "grad_norm": 3.2698330879211426, "learning_rate": 1.6576108649478153e-05, "loss": 1.3348, "step": 542 }, { "epoch": 0.27577450482478416, "grad_norm": 2.661933422088623, "learning_rate": 1.6564018813780838e-05, "loss": 1.443, "step": 543 }, { "epoch": 0.27628237684103607, "grad_norm": 2.999220848083496, "learning_rate": 1.655191209698916e-05, "loss": 1.416, "step": 544 }, { "epoch": 0.276790248857288, "grad_norm": 2.747131824493408, "learning_rate": 1.6539788530238716e-05, "loss": 1.4417, "step": 545 }, { "epoch": 0.2772981208735399, "grad_norm": 3.005300283432007, "learning_rate": 1.6527648144708436e-05, "loss": 1.1945, "step": 546 }, { "epoch": 0.2778059928897918, "grad_norm": 2.605773687362671, "learning_rate": 1.65154909716205e-05, "loss": 1.4557, "step": 547 }, { "epoch": 0.27831386490604365, "grad_norm": 2.42523193359375, "learning_rate": 1.6503317042240264e-05, "loss": 1.3029, "step": 548 }, { "epoch": 0.27882173692229556, "grad_norm": 2.553007125854492, "learning_rate": 1.6491126387876176e-05, "loss": 1.5922, "step": 549 }, { "epoch": 0.27932960893854747, "grad_norm": 3.1910340785980225, "learning_rate": 1.6478919039879705e-05, "loss": 1.169, "step": 550 }, { "epoch": 0.2798374809547994, "grad_norm": 3.14784574508667, "learning_rate": 1.6466695029645237e-05, "loss": 1.4954, "step": 551 }, { "epoch": 0.2803453529710513, "grad_norm": 2.7142114639282227, "learning_rate": 1.645445438861002e-05, "loss": 1.2445, "step": 552 }, { "epoch": 0.2808532249873032, "grad_norm": 3.649052619934082, "learning_rate": 1.644219714825407e-05, "loss": 1.1579, "step": 553 }, { "epoch": 0.2813610970035551, "grad_norm": 2.5138401985168457, "learning_rate": 1.6429923340100096e-05, "loss": 1.4395, "step": 554 }, { "epoch": 0.281868969019807, "grad_norm": 2.99912428855896, "learning_rate": 1.6417632995713407e-05, "loss": 1.2584, "step": 555 }, { "epoch": 0.2823768410360589, "grad_norm": 3.645402193069458, "learning_rate": 1.6405326146701846e-05, "loss": 1.2956, "step": 556 }, { "epoch": 0.28288471305231083, "grad_norm": 2.8912413120269775, "learning_rate": 1.63930028247157e-05, "loss": 1.2831, "step": 557 }, { "epoch": 0.28339258506856274, "grad_norm": 2.8437440395355225, "learning_rate": 1.6380663061447618e-05, "loss": 1.3912, "step": 558 }, { "epoch": 0.28390045708481465, "grad_norm": 3.3983263969421387, "learning_rate": 1.6368306888632538e-05, "loss": 1.3817, "step": 559 }, { "epoch": 0.2844083291010665, "grad_norm": 2.768794298171997, "learning_rate": 1.63559343380476e-05, "loss": 1.517, "step": 560 }, { "epoch": 0.2849162011173184, "grad_norm": 3.3941545486450195, "learning_rate": 1.634354544151205e-05, "loss": 0.9587, "step": 561 }, { "epoch": 0.2854240731335703, "grad_norm": 3.603623390197754, "learning_rate": 1.6331140230887185e-05, "loss": 1.0481, "step": 562 }, { "epoch": 0.28593194514982223, "grad_norm": 2.6169986724853516, "learning_rate": 1.6318718738076263e-05, "loss": 1.2115, "step": 563 }, { "epoch": 0.28643981716607414, "grad_norm": 2.8581392765045166, "learning_rate": 1.6306280995024394e-05, "loss": 1.4153, "step": 564 }, { "epoch": 0.28694768918232605, "grad_norm": 3.1200432777404785, "learning_rate": 1.6293827033718505e-05, "loss": 1.5795, "step": 565 }, { "epoch": 0.28745556119857796, "grad_norm": 2.9050920009613037, "learning_rate": 1.628135688618721e-05, "loss": 1.4716, "step": 566 }, { "epoch": 0.28796343321482987, "grad_norm": 3.6316444873809814, "learning_rate": 1.6268870584500775e-05, "loss": 1.4701, "step": 567 }, { "epoch": 0.2884713052310818, "grad_norm": 3.67140531539917, "learning_rate": 1.6256368160770982e-05, "loss": 1.2652, "step": 568 }, { "epoch": 0.2889791772473337, "grad_norm": 2.786526679992676, "learning_rate": 1.6243849647151098e-05, "loss": 1.4331, "step": 569 }, { "epoch": 0.2894870492635856, "grad_norm": 2.9930903911590576, "learning_rate": 1.623131507583576e-05, "loss": 1.392, "step": 570 }, { "epoch": 0.2899949212798375, "grad_norm": 2.4112536907196045, "learning_rate": 1.6218764479060906e-05, "loss": 1.6523, "step": 571 }, { "epoch": 0.2905027932960894, "grad_norm": 2.9992754459381104, "learning_rate": 1.6206197889103674e-05, "loss": 1.3157, "step": 572 }, { "epoch": 0.29101066531234127, "grad_norm": 2.8552370071411133, "learning_rate": 1.619361533828235e-05, "loss": 1.1532, "step": 573 }, { "epoch": 0.2915185373285932, "grad_norm": 3.1149823665618896, "learning_rate": 1.618101685895626e-05, "loss": 1.3478, "step": 574 }, { "epoch": 0.2920264093448451, "grad_norm": 3.108031988143921, "learning_rate": 1.6168402483525696e-05, "loss": 1.1179, "step": 575 }, { "epoch": 0.292534281361097, "grad_norm": 2.5380311012268066, "learning_rate": 1.615577224443183e-05, "loss": 1.5684, "step": 576 }, { "epoch": 0.2930421533773489, "grad_norm": 2.935041904449463, "learning_rate": 1.6143126174156624e-05, "loss": 1.5189, "step": 577 }, { "epoch": 0.2935500253936008, "grad_norm": 2.550523519515991, "learning_rate": 1.613046430522277e-05, "loss": 1.4392, "step": 578 }, { "epoch": 0.2940578974098527, "grad_norm": 2.2801826000213623, "learning_rate": 1.6117786670193578e-05, "loss": 1.5579, "step": 579 }, { "epoch": 0.29456576942610463, "grad_norm": 2.9374067783355713, "learning_rate": 1.610509330167291e-05, "loss": 1.41, "step": 580 }, { "epoch": 0.29507364144235654, "grad_norm": 3.2625834941864014, "learning_rate": 1.6092384232305086e-05, "loss": 1.4237, "step": 581 }, { "epoch": 0.29558151345860845, "grad_norm": 2.602550745010376, "learning_rate": 1.6079659494774813e-05, "loss": 1.4022, "step": 582 }, { "epoch": 0.29608938547486036, "grad_norm": 2.5818839073181152, "learning_rate": 1.606691912180708e-05, "loss": 1.5549, "step": 583 }, { "epoch": 0.29659725749111227, "grad_norm": 2.6626718044281006, "learning_rate": 1.6054163146167104e-05, "loss": 1.1246, "step": 584 }, { "epoch": 0.2971051295073641, "grad_norm": 3.313922166824341, "learning_rate": 1.6041391600660215e-05, "loss": 1.5451, "step": 585 }, { "epoch": 0.297613001523616, "grad_norm": 2.426194190979004, "learning_rate": 1.6028604518131787e-05, "loss": 1.4977, "step": 586 }, { "epoch": 0.29812087353986794, "grad_norm": 2.4327762126922607, "learning_rate": 1.6015801931467153e-05, "loss": 1.5168, "step": 587 }, { "epoch": 0.29862874555611985, "grad_norm": 3.15324068069458, "learning_rate": 1.6002983873591526e-05, "loss": 1.4526, "step": 588 }, { "epoch": 0.29913661757237175, "grad_norm": 2.9152519702911377, "learning_rate": 1.5990150377469894e-05, "loss": 1.4495, "step": 589 }, { "epoch": 0.29964448958862366, "grad_norm": 2.939667224884033, "learning_rate": 1.5977301476106953e-05, "loss": 1.4074, "step": 590 }, { "epoch": 0.3001523616048756, "grad_norm": 3.2030749320983887, "learning_rate": 1.596443720254702e-05, "loss": 1.3787, "step": 591 }, { "epoch": 0.3006602336211275, "grad_norm": 3.0066051483154297, "learning_rate": 1.595155758987396e-05, "loss": 1.2458, "step": 592 }, { "epoch": 0.3011681056373794, "grad_norm": 3.0819432735443115, "learning_rate": 1.5938662671211052e-05, "loss": 1.3818, "step": 593 }, { "epoch": 0.3016759776536313, "grad_norm": 2.645960807800293, "learning_rate": 1.592575247972097e-05, "loss": 1.5243, "step": 594 }, { "epoch": 0.3021838496698832, "grad_norm": 4.147794723510742, "learning_rate": 1.591282704860565e-05, "loss": 1.3021, "step": 595 }, { "epoch": 0.3026917216861351, "grad_norm": 3.1667566299438477, "learning_rate": 1.5899886411106223e-05, "loss": 1.502, "step": 596 }, { "epoch": 0.30319959370238697, "grad_norm": 3.224921941757202, "learning_rate": 1.5886930600502935e-05, "loss": 1.2507, "step": 597 }, { "epoch": 0.3037074657186389, "grad_norm": 2.8849408626556396, "learning_rate": 1.587395965011504e-05, "loss": 1.5756, "step": 598 }, { "epoch": 0.3042153377348908, "grad_norm": 2.8263800144195557, "learning_rate": 1.5860973593300747e-05, "loss": 1.6592, "step": 599 }, { "epoch": 0.3047232097511427, "grad_norm": 2.590561866760254, "learning_rate": 1.5847972463457097e-05, "loss": 1.3933, "step": 600 }, { "epoch": 0.3052310817673946, "grad_norm": 3.864027738571167, "learning_rate": 1.58349562940199e-05, "loss": 1.4139, "step": 601 }, { "epoch": 0.3057389537836465, "grad_norm": 3.146385669708252, "learning_rate": 1.582192511846365e-05, "loss": 1.4181, "step": 602 }, { "epoch": 0.3062468257998984, "grad_norm": 3.3183696269989014, "learning_rate": 1.580887897030143e-05, "loss": 1.1924, "step": 603 }, { "epoch": 0.30675469781615033, "grad_norm": 3.0171616077423096, "learning_rate": 1.5795817883084823e-05, "loss": 1.2576, "step": 604 }, { "epoch": 0.30726256983240224, "grad_norm": 2.751277208328247, "learning_rate": 1.5782741890403842e-05, "loss": 1.5396, "step": 605 }, { "epoch": 0.30777044184865415, "grad_norm": 2.6898446083068848, "learning_rate": 1.576965102588683e-05, "loss": 1.4069, "step": 606 }, { "epoch": 0.30827831386490606, "grad_norm": 2.6060962677001953, "learning_rate": 1.5756545323200373e-05, "loss": 1.4862, "step": 607 }, { "epoch": 0.30878618588115797, "grad_norm": 2.941141128540039, "learning_rate": 1.574342481604922e-05, "loss": 1.4382, "step": 608 }, { "epoch": 0.3092940578974099, "grad_norm": 3.190333127975464, "learning_rate": 1.573028953817619e-05, "loss": 1.5053, "step": 609 }, { "epoch": 0.30980192991366173, "grad_norm": 2.9648282527923584, "learning_rate": 1.5717139523362094e-05, "loss": 1.2999, "step": 610 }, { "epoch": 0.31030980192991364, "grad_norm": 3.3651657104492188, "learning_rate": 1.5703974805425645e-05, "loss": 1.3128, "step": 611 }, { "epoch": 0.31081767394616555, "grad_norm": 4.03769063949585, "learning_rate": 1.569079541822336e-05, "loss": 1.1174, "step": 612 }, { "epoch": 0.31132554596241746, "grad_norm": 3.595374584197998, "learning_rate": 1.5677601395649486e-05, "loss": 1.1697, "step": 613 }, { "epoch": 0.31183341797866937, "grad_norm": 3.3900370597839355, "learning_rate": 1.5664392771635912e-05, "loss": 1.6163, "step": 614 }, { "epoch": 0.3123412899949213, "grad_norm": 3.554189443588257, "learning_rate": 1.5651169580152075e-05, "loss": 1.1483, "step": 615 }, { "epoch": 0.3128491620111732, "grad_norm": 3.3712856769561768, "learning_rate": 1.5637931855204877e-05, "loss": 1.4755, "step": 616 }, { "epoch": 0.3133570340274251, "grad_norm": 2.823914051055908, "learning_rate": 1.5624679630838594e-05, "loss": 1.5944, "step": 617 }, { "epoch": 0.313864906043677, "grad_norm": 3.103543519973755, "learning_rate": 1.56114129411348e-05, "loss": 1.1896, "step": 618 }, { "epoch": 0.3143727780599289, "grad_norm": 3.144697666168213, "learning_rate": 1.5598131820212256e-05, "loss": 1.3939, "step": 619 }, { "epoch": 0.3148806500761808, "grad_norm": 2.5628550052642822, "learning_rate": 1.5584836302226846e-05, "loss": 1.4688, "step": 620 }, { "epoch": 0.31538852209243273, "grad_norm": 2.8005852699279785, "learning_rate": 1.5571526421371484e-05, "loss": 1.3955, "step": 621 }, { "epoch": 0.3158963941086846, "grad_norm": 3.2499237060546875, "learning_rate": 1.555820221187601e-05, "loss": 1.4999, "step": 622 }, { "epoch": 0.3164042661249365, "grad_norm": 3.178745985031128, "learning_rate": 1.5544863708007122e-05, "loss": 1.2467, "step": 623 }, { "epoch": 0.3169121381411884, "grad_norm": 2.5890581607818604, "learning_rate": 1.553151094406828e-05, "loss": 1.3692, "step": 624 }, { "epoch": 0.3174200101574403, "grad_norm": 2.8289737701416016, "learning_rate": 1.5518143954399612e-05, "loss": 1.3892, "step": 625 }, { "epoch": 0.3179278821736922, "grad_norm": 2.4055609703063965, "learning_rate": 1.550476277337784e-05, "loss": 1.4142, "step": 626 }, { "epoch": 0.31843575418994413, "grad_norm": 2.517207384109497, "learning_rate": 1.549136743541617e-05, "loss": 1.3777, "step": 627 }, { "epoch": 0.31894362620619604, "grad_norm": 3.7871336936950684, "learning_rate": 1.5477957974964225e-05, "loss": 1.4155, "step": 628 }, { "epoch": 0.31945149822244795, "grad_norm": 3.072606325149536, "learning_rate": 1.5464534426507954e-05, "loss": 1.3369, "step": 629 }, { "epoch": 0.31995937023869986, "grad_norm": 3.0552282333374023, "learning_rate": 1.545109682456952e-05, "loss": 1.5726, "step": 630 }, { "epoch": 0.32046724225495177, "grad_norm": 4.044310569763184, "learning_rate": 1.5437645203707242e-05, "loss": 1.3222, "step": 631 }, { "epoch": 0.3209751142712037, "grad_norm": 2.970475912094116, "learning_rate": 1.5424179598515486e-05, "loss": 1.3176, "step": 632 }, { "epoch": 0.3214829862874556, "grad_norm": 2.8568050861358643, "learning_rate": 1.5410700043624587e-05, "loss": 1.724, "step": 633 }, { "epoch": 0.32199085830370744, "grad_norm": 3.2946465015411377, "learning_rate": 1.539720657370075e-05, "loss": 1.3501, "step": 634 }, { "epoch": 0.32249873031995935, "grad_norm": 3.082104206085205, "learning_rate": 1.5383699223445967e-05, "loss": 1.3687, "step": 635 }, { "epoch": 0.32300660233621126, "grad_norm": 2.9089434146881104, "learning_rate": 1.537017802759793e-05, "loss": 1.2199, "step": 636 }, { "epoch": 0.32351447435246317, "grad_norm": 2.8768131732940674, "learning_rate": 1.5356643020929942e-05, "loss": 1.5327, "step": 637 }, { "epoch": 0.3240223463687151, "grad_norm": 2.816420793533325, "learning_rate": 1.5343094238250814e-05, "loss": 1.3783, "step": 638 }, { "epoch": 0.324530218384967, "grad_norm": 2.742537260055542, "learning_rate": 1.5329531714404787e-05, "loss": 1.3853, "step": 639 }, { "epoch": 0.3250380904012189, "grad_norm": 2.976881742477417, "learning_rate": 1.531595548427145e-05, "loss": 1.5038, "step": 640 }, { "epoch": 0.3255459624174708, "grad_norm": 2.651549816131592, "learning_rate": 1.5302365582765643e-05, "loss": 1.6156, "step": 641 }, { "epoch": 0.3260538344337227, "grad_norm": 2.5804250240325928, "learning_rate": 1.5288762044837348e-05, "loss": 1.4925, "step": 642 }, { "epoch": 0.3265617064499746, "grad_norm": 2.674237012863159, "learning_rate": 1.527514490547163e-05, "loss": 1.3758, "step": 643 }, { "epoch": 0.32706957846622653, "grad_norm": 3.6240010261535645, "learning_rate": 1.526151419968853e-05, "loss": 1.4575, "step": 644 }, { "epoch": 0.32757745048247844, "grad_norm": 3.1033742427825928, "learning_rate": 1.5247869962542985e-05, "loss": 1.5951, "step": 645 }, { "epoch": 0.32808532249873035, "grad_norm": 3.0908331871032715, "learning_rate": 1.5234212229124718e-05, "loss": 1.3567, "step": 646 }, { "epoch": 0.3285931945149822, "grad_norm": 3.233978509902954, "learning_rate": 1.522054103455818e-05, "loss": 1.3405, "step": 647 }, { "epoch": 0.3291010665312341, "grad_norm": 3.537057399749756, "learning_rate": 1.520685641400242e-05, "loss": 1.4314, "step": 648 }, { "epoch": 0.329608938547486, "grad_norm": 3.0778417587280273, "learning_rate": 1.5193158402651032e-05, "loss": 1.2213, "step": 649 }, { "epoch": 0.33011681056373793, "grad_norm": 4.0384440422058105, "learning_rate": 1.5179447035732038e-05, "loss": 1.3354, "step": 650 }, { "epoch": 0.33062468257998984, "grad_norm": 2.6769659519195557, "learning_rate": 1.516572234850781e-05, "loss": 1.4348, "step": 651 }, { "epoch": 0.33113255459624175, "grad_norm": 2.5838680267333984, "learning_rate": 1.5151984376274982e-05, "loss": 1.3927, "step": 652 }, { "epoch": 0.33164042661249366, "grad_norm": 2.274228572845459, "learning_rate": 1.5138233154364344e-05, "loss": 1.5037, "step": 653 }, { "epoch": 0.33214829862874556, "grad_norm": 3.4756743907928467, "learning_rate": 1.512446871814077e-05, "loss": 1.2171, "step": 654 }, { "epoch": 0.3326561706449975, "grad_norm": 3.236459732055664, "learning_rate": 1.5110691103003118e-05, "loss": 1.177, "step": 655 }, { "epoch": 0.3331640426612494, "grad_norm": 3.806804895401001, "learning_rate": 1.5096900344384126e-05, "loss": 1.3514, "step": 656 }, { "epoch": 0.3336719146775013, "grad_norm": 3.0247037410736084, "learning_rate": 1.5083096477750347e-05, "loss": 1.3816, "step": 657 }, { "epoch": 0.3341797866937532, "grad_norm": 2.83921480178833, "learning_rate": 1.5069279538602048e-05, "loss": 1.624, "step": 658 }, { "epoch": 0.33468765871000505, "grad_norm": 2.76662540435791, "learning_rate": 1.5055449562473102e-05, "loss": 1.4202, "step": 659 }, { "epoch": 0.33519553072625696, "grad_norm": 2.6441972255706787, "learning_rate": 1.5041606584930914e-05, "loss": 1.5947, "step": 660 }, { "epoch": 0.33570340274250887, "grad_norm": 2.4214117527008057, "learning_rate": 1.5027750641576333e-05, "loss": 1.4026, "step": 661 }, { "epoch": 0.3362112747587608, "grad_norm": 3.095984935760498, "learning_rate": 1.5013881768043547e-05, "loss": 1.5424, "step": 662 }, { "epoch": 0.3367191467750127, "grad_norm": 3.10390305519104, "learning_rate": 1.5000000000000002e-05, "loss": 1.2982, "step": 663 }, { "epoch": 0.3372270187912646, "grad_norm": 3.0772242546081543, "learning_rate": 1.4986105373146296e-05, "loss": 1.3818, "step": 664 }, { "epoch": 0.3377348908075165, "grad_norm": 2.9678761959075928, "learning_rate": 1.49721979232161e-05, "loss": 1.229, "step": 665 }, { "epoch": 0.3382427628237684, "grad_norm": 3.0405545234680176, "learning_rate": 1.4958277685976077e-05, "loss": 1.5045, "step": 666 }, { "epoch": 0.3387506348400203, "grad_norm": 4.125433444976807, "learning_rate": 1.4944344697225754e-05, "loss": 1.0499, "step": 667 }, { "epoch": 0.33925850685627224, "grad_norm": 2.586026906967163, "learning_rate": 1.493039899279747e-05, "loss": 1.549, "step": 668 }, { "epoch": 0.33976637887252414, "grad_norm": 3.4039981365203857, "learning_rate": 1.4916440608556253e-05, "loss": 1.6548, "step": 669 }, { "epoch": 0.34027425088877605, "grad_norm": 3.3261430263519287, "learning_rate": 1.4902469580399746e-05, "loss": 1.2327, "step": 670 }, { "epoch": 0.3407821229050279, "grad_norm": 3.39119815826416, "learning_rate": 1.4888485944258113e-05, "loss": 1.0486, "step": 671 }, { "epoch": 0.3412899949212798, "grad_norm": 2.656126022338867, "learning_rate": 1.4874489736093933e-05, "loss": 1.5682, "step": 672 }, { "epoch": 0.3417978669375317, "grad_norm": 2.970787286758423, "learning_rate": 1.4860480991902127e-05, "loss": 1.2382, "step": 673 }, { "epoch": 0.34230573895378363, "grad_norm": 2.8525571823120117, "learning_rate": 1.4846459747709849e-05, "loss": 1.4311, "step": 674 }, { "epoch": 0.34281361097003554, "grad_norm": 3.1475439071655273, "learning_rate": 1.4832426039576401e-05, "loss": 1.2665, "step": 675 }, { "epoch": 0.34332148298628745, "grad_norm": 2.9784014225006104, "learning_rate": 1.481837990359315e-05, "loss": 1.2435, "step": 676 }, { "epoch": 0.34382935500253936, "grad_norm": 3.6366403102874756, "learning_rate": 1.4804321375883404e-05, "loss": 1.4673, "step": 677 }, { "epoch": 0.34433722701879127, "grad_norm": 3.0844566822052, "learning_rate": 1.4790250492602352e-05, "loss": 1.3489, "step": 678 }, { "epoch": 0.3448450990350432, "grad_norm": 2.902252674102783, "learning_rate": 1.4776167289936964e-05, "loss": 1.4117, "step": 679 }, { "epoch": 0.3453529710512951, "grad_norm": 3.076643943786621, "learning_rate": 1.4762071804105881e-05, "loss": 1.2952, "step": 680 }, { "epoch": 0.345860843067547, "grad_norm": 2.8510043621063232, "learning_rate": 1.4747964071359343e-05, "loss": 1.4523, "step": 681 }, { "epoch": 0.3463687150837989, "grad_norm": 2.854254722595215, "learning_rate": 1.4733844127979073e-05, "loss": 1.6375, "step": 682 }, { "epoch": 0.34687658710005076, "grad_norm": 3.7380759716033936, "learning_rate": 1.4719712010278211e-05, "loss": 1.4145, "step": 683 }, { "epoch": 0.34738445911630267, "grad_norm": 2.862888813018799, "learning_rate": 1.4705567754601204e-05, "loss": 1.4546, "step": 684 }, { "epoch": 0.3478923311325546, "grad_norm": 3.2201626300811768, "learning_rate": 1.4691411397323706e-05, "loss": 1.5113, "step": 685 }, { "epoch": 0.3484002031488065, "grad_norm": 3.1785452365875244, "learning_rate": 1.46772429748525e-05, "loss": 1.3296, "step": 686 }, { "epoch": 0.3489080751650584, "grad_norm": 2.7606334686279297, "learning_rate": 1.4663062523625398e-05, "loss": 1.433, "step": 687 }, { "epoch": 0.3494159471813103, "grad_norm": 3.220571756362915, "learning_rate": 1.4648870080111144e-05, "loss": 1.2379, "step": 688 }, { "epoch": 0.3499238191975622, "grad_norm": 2.676091194152832, "learning_rate": 1.463466568080933e-05, "loss": 1.3755, "step": 689 }, { "epoch": 0.3504316912138141, "grad_norm": 3.6125404834747314, "learning_rate": 1.4620449362250286e-05, "loss": 1.2007, "step": 690 }, { "epoch": 0.35093956323006603, "grad_norm": 2.850972890853882, "learning_rate": 1.4606221160995002e-05, "loss": 1.5413, "step": 691 }, { "epoch": 0.35144743524631794, "grad_norm": 2.8337199687957764, "learning_rate": 1.459198111363503e-05, "loss": 1.628, "step": 692 }, { "epoch": 0.35195530726256985, "grad_norm": 2.9772043228149414, "learning_rate": 1.4577729256792377e-05, "loss": 1.4941, "step": 693 }, { "epoch": 0.35246317927882176, "grad_norm": 2.7553727626800537, "learning_rate": 1.4563465627119428e-05, "loss": 1.2939, "step": 694 }, { "epoch": 0.35297105129507367, "grad_norm": 4.402785301208496, "learning_rate": 1.4549190261298848e-05, "loss": 1.1561, "step": 695 }, { "epoch": 0.3534789233113255, "grad_norm": 3.322864055633545, "learning_rate": 1.4534903196043477e-05, "loss": 1.4288, "step": 696 }, { "epoch": 0.35398679532757743, "grad_norm": 3.631467342376709, "learning_rate": 1.4520604468096252e-05, "loss": 1.2479, "step": 697 }, { "epoch": 0.35449466734382934, "grad_norm": 3.06563138961792, "learning_rate": 1.4506294114230093e-05, "loss": 1.3205, "step": 698 }, { "epoch": 0.35500253936008125, "grad_norm": 2.893291711807251, "learning_rate": 1.4491972171247825e-05, "loss": 1.4051, "step": 699 }, { "epoch": 0.35551041137633316, "grad_norm": 3.3483219146728516, "learning_rate": 1.4477638675982081e-05, "loss": 1.5237, "step": 700 }, { "epoch": 0.35601828339258507, "grad_norm": 3.449445962905884, "learning_rate": 1.44632936652952e-05, "loss": 1.2882, "step": 701 }, { "epoch": 0.356526155408837, "grad_norm": 3.7622361183166504, "learning_rate": 1.444893717607913e-05, "loss": 1.4397, "step": 702 }, { "epoch": 0.3570340274250889, "grad_norm": 3.3970916271209717, "learning_rate": 1.443456924525535e-05, "loss": 1.4158, "step": 703 }, { "epoch": 0.3575418994413408, "grad_norm": 3.098458766937256, "learning_rate": 1.4420189909774757e-05, "loss": 1.371, "step": 704 }, { "epoch": 0.3580497714575927, "grad_norm": 2.868194818496704, "learning_rate": 1.4405799206617586e-05, "loss": 1.0695, "step": 705 }, { "epoch": 0.3585576434738446, "grad_norm": 3.3876736164093018, "learning_rate": 1.439139717279329e-05, "loss": 1.2454, "step": 706 }, { "epoch": 0.3590655154900965, "grad_norm": 3.4045627117156982, "learning_rate": 1.437698384534048e-05, "loss": 1.2066, "step": 707 }, { "epoch": 0.3595733875063484, "grad_norm": 2.855515480041504, "learning_rate": 1.4362559261326805e-05, "loss": 1.6359, "step": 708 }, { "epoch": 0.3600812595226003, "grad_norm": 2.887098550796509, "learning_rate": 1.4348123457848857e-05, "loss": 1.1533, "step": 709 }, { "epoch": 0.3605891315388522, "grad_norm": 3.2437286376953125, "learning_rate": 1.433367647203209e-05, "loss": 1.4143, "step": 710 }, { "epoch": 0.3610970035551041, "grad_norm": 3.3504650592803955, "learning_rate": 1.4319218341030719e-05, "loss": 1.4759, "step": 711 }, { "epoch": 0.361604875571356, "grad_norm": 3.7517337799072266, "learning_rate": 1.4304749102027608e-05, "loss": 1.3028, "step": 712 }, { "epoch": 0.3621127475876079, "grad_norm": 2.6952829360961914, "learning_rate": 1.4290268792234203e-05, "loss": 1.3487, "step": 713 }, { "epoch": 0.36262061960385983, "grad_norm": 3.4969964027404785, "learning_rate": 1.427577744889041e-05, "loss": 1.5259, "step": 714 }, { "epoch": 0.36312849162011174, "grad_norm": 2.790510892868042, "learning_rate": 1.4261275109264516e-05, "loss": 1.3438, "step": 715 }, { "epoch": 0.36363636363636365, "grad_norm": 3.0047624111175537, "learning_rate": 1.424676181065309e-05, "loss": 1.3558, "step": 716 }, { "epoch": 0.36414423565261556, "grad_norm": 3.4911231994628906, "learning_rate": 1.4232237590380882e-05, "loss": 1.4434, "step": 717 }, { "epoch": 0.36465210766886746, "grad_norm": 3.183309555053711, "learning_rate": 1.421770248580073e-05, "loss": 1.1461, "step": 718 }, { "epoch": 0.3651599796851194, "grad_norm": 3.691046714782715, "learning_rate": 1.4203156534293463e-05, "loss": 1.2286, "step": 719 }, { "epoch": 0.3656678517013712, "grad_norm": 3.438445568084717, "learning_rate": 1.4188599773267808e-05, "loss": 1.0223, "step": 720 }, { "epoch": 0.36617572371762314, "grad_norm": 2.969409704208374, "learning_rate": 1.4174032240160291e-05, "loss": 1.4687, "step": 721 }, { "epoch": 0.36668359573387505, "grad_norm": 3.7304344177246094, "learning_rate": 1.4159453972435135e-05, "loss": 1.0545, "step": 722 }, { "epoch": 0.36719146775012695, "grad_norm": 3.4797556400299072, "learning_rate": 1.4144865007584184e-05, "loss": 1.4136, "step": 723 }, { "epoch": 0.36769933976637886, "grad_norm": 3.0512192249298096, "learning_rate": 1.4130265383126778e-05, "loss": 1.3685, "step": 724 }, { "epoch": 0.36820721178263077, "grad_norm": 3.239837646484375, "learning_rate": 1.4115655136609677e-05, "loss": 1.4191, "step": 725 }, { "epoch": 0.3687150837988827, "grad_norm": 3.2800753116607666, "learning_rate": 1.4101034305606962e-05, "loss": 1.2906, "step": 726 }, { "epoch": 0.3692229558151346, "grad_norm": 3.322382926940918, "learning_rate": 1.4086402927719929e-05, "loss": 1.3609, "step": 727 }, { "epoch": 0.3697308278313865, "grad_norm": 3.090886354446411, "learning_rate": 1.4071761040576996e-05, "loss": 1.3124, "step": 728 }, { "epoch": 0.3702386998476384, "grad_norm": 3.5921313762664795, "learning_rate": 1.4057108681833623e-05, "loss": 1.4419, "step": 729 }, { "epoch": 0.3707465718638903, "grad_norm": 3.024637460708618, "learning_rate": 1.4042445889172178e-05, "loss": 1.5245, "step": 730 }, { "epoch": 0.3712544438801422, "grad_norm": 2.6643972396850586, "learning_rate": 1.402777270030188e-05, "loss": 1.3496, "step": 731 }, { "epoch": 0.37176231589639414, "grad_norm": 3.0148916244506836, "learning_rate": 1.4013089152958681e-05, "loss": 1.5488, "step": 732 }, { "epoch": 0.372270187912646, "grad_norm": 3.736215591430664, "learning_rate": 1.3998395284905166e-05, "loss": 1.4689, "step": 733 }, { "epoch": 0.3727780599288979, "grad_norm": 3.5619914531707764, "learning_rate": 1.3983691133930473e-05, "loss": 1.2974, "step": 734 }, { "epoch": 0.3732859319451498, "grad_norm": 2.7390873432159424, "learning_rate": 1.3968976737850172e-05, "loss": 0.876, "step": 735 }, { "epoch": 0.3737938039614017, "grad_norm": 3.622314691543579, "learning_rate": 1.3954252134506193e-05, "loss": 1.1496, "step": 736 }, { "epoch": 0.3743016759776536, "grad_norm": 2.7309963703155518, "learning_rate": 1.393951736176671e-05, "loss": 1.2773, "step": 737 }, { "epoch": 0.37480954799390553, "grad_norm": 3.10971736907959, "learning_rate": 1.392477245752605e-05, "loss": 1.3917, "step": 738 }, { "epoch": 0.37531742001015744, "grad_norm": 2.9807276725769043, "learning_rate": 1.3910017459704602e-05, "loss": 1.5002, "step": 739 }, { "epoch": 0.37582529202640935, "grad_norm": 3.2353007793426514, "learning_rate": 1.3895252406248708e-05, "loss": 1.2697, "step": 740 }, { "epoch": 0.37633316404266126, "grad_norm": 3.410163402557373, "learning_rate": 1.3880477335130573e-05, "loss": 1.3525, "step": 741 }, { "epoch": 0.37684103605891317, "grad_norm": 2.844461679458618, "learning_rate": 1.3865692284348162e-05, "loss": 1.4138, "step": 742 }, { "epoch": 0.3773489080751651, "grad_norm": 2.8310110569000244, "learning_rate": 1.3850897291925104e-05, "loss": 1.3118, "step": 743 }, { "epoch": 0.377856780091417, "grad_norm": 3.3003883361816406, "learning_rate": 1.3836092395910605e-05, "loss": 1.1658, "step": 744 }, { "epoch": 0.37836465210766884, "grad_norm": 3.317352533340454, "learning_rate": 1.382127763437933e-05, "loss": 1.3725, "step": 745 }, { "epoch": 0.37887252412392075, "grad_norm": 2.9160075187683105, "learning_rate": 1.3806453045431326e-05, "loss": 1.4349, "step": 746 }, { "epoch": 0.37938039614017266, "grad_norm": 3.4108498096466064, "learning_rate": 1.3791618667191902e-05, "loss": 1.3338, "step": 747 }, { "epoch": 0.37988826815642457, "grad_norm": 4.158885478973389, "learning_rate": 1.3776774537811548e-05, "loss": 1.081, "step": 748 }, { "epoch": 0.3803961401726765, "grad_norm": 3.3611350059509277, "learning_rate": 1.3761920695465838e-05, "loss": 1.4261, "step": 749 }, { "epoch": 0.3809040121889284, "grad_norm": 3.7590487003326416, "learning_rate": 1.3747057178355316e-05, "loss": 1.2509, "step": 750 }, { "epoch": 0.3814118842051803, "grad_norm": 3.6111934185028076, "learning_rate": 1.3732184024705413e-05, "loss": 1.3899, "step": 751 }, { "epoch": 0.3819197562214322, "grad_norm": 4.133301734924316, "learning_rate": 1.3717301272766336e-05, "loss": 1.5087, "step": 752 }, { "epoch": 0.3824276282376841, "grad_norm": 3.427138090133667, "learning_rate": 1.3702408960812989e-05, "loss": 1.3835, "step": 753 }, { "epoch": 0.382935500253936, "grad_norm": 3.1767027378082275, "learning_rate": 1.3687507127144848e-05, "loss": 1.3515, "step": 754 }, { "epoch": 0.38344337227018793, "grad_norm": 3.710176706314087, "learning_rate": 1.3672595810085887e-05, "loss": 1.2997, "step": 755 }, { "epoch": 0.38395124428643984, "grad_norm": 2.8906760215759277, "learning_rate": 1.3657675047984462e-05, "loss": 1.3962, "step": 756 }, { "epoch": 0.3844591163026917, "grad_norm": 2.8986518383026123, "learning_rate": 1.3642744879213226e-05, "loss": 1.5398, "step": 757 }, { "epoch": 0.3849669883189436, "grad_norm": 3.346881628036499, "learning_rate": 1.3627805342169019e-05, "loss": 1.2606, "step": 758 }, { "epoch": 0.3854748603351955, "grad_norm": 3.2820770740509033, "learning_rate": 1.361285647527277e-05, "loss": 1.3161, "step": 759 }, { "epoch": 0.3859827323514474, "grad_norm": 2.738898992538452, "learning_rate": 1.359789831696941e-05, "loss": 1.3221, "step": 760 }, { "epoch": 0.38649060436769933, "grad_norm": 3.425286293029785, "learning_rate": 1.3582930905727772e-05, "loss": 1.3743, "step": 761 }, { "epoch": 0.38699847638395124, "grad_norm": 2.762843370437622, "learning_rate": 1.3567954280040463e-05, "loss": 1.5665, "step": 762 }, { "epoch": 0.38750634840020315, "grad_norm": 3.123188018798828, "learning_rate": 1.3552968478423809e-05, "loss": 1.5878, "step": 763 }, { "epoch": 0.38801422041645506, "grad_norm": 3.069780111312866, "learning_rate": 1.3537973539417719e-05, "loss": 1.4137, "step": 764 }, { "epoch": 0.38852209243270697, "grad_norm": 3.6509578227996826, "learning_rate": 1.3522969501585612e-05, "loss": 1.5572, "step": 765 }, { "epoch": 0.3890299644489589, "grad_norm": 3.1971583366394043, "learning_rate": 1.35079564035143e-05, "loss": 1.3368, "step": 766 }, { "epoch": 0.3895378364652108, "grad_norm": 3.548170566558838, "learning_rate": 1.34929342838139e-05, "loss": 1.2282, "step": 767 }, { "epoch": 0.3900457084814627, "grad_norm": 2.829338788986206, "learning_rate": 1.3477903181117731e-05, "loss": 1.3614, "step": 768 }, { "epoch": 0.3905535804977146, "grad_norm": 3.30485463142395, "learning_rate": 1.3462863134082214e-05, "loss": 1.4793, "step": 769 }, { "epoch": 0.39106145251396646, "grad_norm": 3.015761613845825, "learning_rate": 1.3447814181386763e-05, "loss": 1.5157, "step": 770 }, { "epoch": 0.39156932453021837, "grad_norm": 3.4275357723236084, "learning_rate": 1.343275636173371e-05, "loss": 1.2972, "step": 771 }, { "epoch": 0.3920771965464703, "grad_norm": 3.141019344329834, "learning_rate": 1.3417689713848179e-05, "loss": 1.2101, "step": 772 }, { "epoch": 0.3925850685627222, "grad_norm": 3.313812017440796, "learning_rate": 1.3402614276478008e-05, "loss": 1.4199, "step": 773 }, { "epoch": 0.3930929405789741, "grad_norm": 3.148071050643921, "learning_rate": 1.3387530088393628e-05, "loss": 1.5838, "step": 774 }, { "epoch": 0.393600812595226, "grad_norm": 3.4933178424835205, "learning_rate": 1.3372437188387988e-05, "loss": 1.4677, "step": 775 }, { "epoch": 0.3941086846114779, "grad_norm": 2.875379800796509, "learning_rate": 1.3357335615276434e-05, "loss": 1.3789, "step": 776 }, { "epoch": 0.3946165566277298, "grad_norm": 2.804950714111328, "learning_rate": 1.3342225407896614e-05, "loss": 1.3081, "step": 777 }, { "epoch": 0.39512442864398173, "grad_norm": 3.7760562896728516, "learning_rate": 1.3327106605108389e-05, "loss": 1.5066, "step": 778 }, { "epoch": 0.39563230066023364, "grad_norm": 3.2691030502319336, "learning_rate": 1.3311979245793723e-05, "loss": 1.3408, "step": 779 }, { "epoch": 0.39614017267648555, "grad_norm": 3.711343765258789, "learning_rate": 1.329684336885658e-05, "loss": 1.2556, "step": 780 }, { "epoch": 0.39664804469273746, "grad_norm": 3.3347604274749756, "learning_rate": 1.3281699013222836e-05, "loss": 1.4213, "step": 781 }, { "epoch": 0.3971559167089893, "grad_norm": 3.439558744430542, "learning_rate": 1.3266546217840172e-05, "loss": 0.9587, "step": 782 }, { "epoch": 0.3976637887252412, "grad_norm": 3.292663097381592, "learning_rate": 1.3251385021677967e-05, "loss": 1.1098, "step": 783 }, { "epoch": 0.3981716607414931, "grad_norm": 3.108430862426758, "learning_rate": 1.3236215463727218e-05, "loss": 1.308, "step": 784 }, { "epoch": 0.39867953275774504, "grad_norm": 2.932769298553467, "learning_rate": 1.322103758300041e-05, "loss": 1.4641, "step": 785 }, { "epoch": 0.39918740477399695, "grad_norm": 3.71151065826416, "learning_rate": 1.3205851418531442e-05, "loss": 1.4567, "step": 786 }, { "epoch": 0.39969527679024885, "grad_norm": 2.6987061500549316, "learning_rate": 1.3190657009375516e-05, "loss": 1.3758, "step": 787 }, { "epoch": 0.40020314880650076, "grad_norm": 2.942145586013794, "learning_rate": 1.3175454394609037e-05, "loss": 1.3166, "step": 788 }, { "epoch": 0.4007110208227527, "grad_norm": 4.66526985168457, "learning_rate": 1.3160243613329512e-05, "loss": 1.2639, "step": 789 }, { "epoch": 0.4012188928390046, "grad_norm": 2.9264190196990967, "learning_rate": 1.3145024704655453e-05, "loss": 1.5059, "step": 790 }, { "epoch": 0.4017267648552565, "grad_norm": 3.232764482498169, "learning_rate": 1.3129797707726268e-05, "loss": 1.3631, "step": 791 }, { "epoch": 0.4022346368715084, "grad_norm": 3.2342617511749268, "learning_rate": 1.3114562661702173e-05, "loss": 1.2905, "step": 792 }, { "epoch": 0.4027425088877603, "grad_norm": 5.082442283630371, "learning_rate": 1.3099319605764078e-05, "loss": 0.6581, "step": 793 }, { "epoch": 0.40325038090401216, "grad_norm": 3.325721025466919, "learning_rate": 1.3084068579113495e-05, "loss": 1.2834, "step": 794 }, { "epoch": 0.40375825292026407, "grad_norm": 3.943657636642456, "learning_rate": 1.3068809620972438e-05, "loss": 1.5556, "step": 795 }, { "epoch": 0.404266124936516, "grad_norm": 2.8499321937561035, "learning_rate": 1.3053542770583314e-05, "loss": 1.5058, "step": 796 }, { "epoch": 0.4047739969527679, "grad_norm": 4.564476013183594, "learning_rate": 1.303826806720883e-05, "loss": 1.3175, "step": 797 }, { "epoch": 0.4052818689690198, "grad_norm": 3.9246888160705566, "learning_rate": 1.3022985550131893e-05, "loss": 1.1799, "step": 798 }, { "epoch": 0.4057897409852717, "grad_norm": 3.540846347808838, "learning_rate": 1.300769525865549e-05, "loss": 1.6187, "step": 799 }, { "epoch": 0.4062976130015236, "grad_norm": 2.856062412261963, "learning_rate": 1.2992397232102623e-05, "loss": 1.3625, "step": 800 }, { "epoch": 0.4068054850177755, "grad_norm": 3.0858078002929688, "learning_rate": 1.2977091509816166e-05, "loss": 1.5088, "step": 801 }, { "epoch": 0.40731335703402743, "grad_norm": 3.1453182697296143, "learning_rate": 1.2961778131158798e-05, "loss": 1.2945, "step": 802 }, { "epoch": 0.40782122905027934, "grad_norm": 3.0279111862182617, "learning_rate": 1.2946457135512889e-05, "loss": 1.5215, "step": 803 }, { "epoch": 0.40832910106653125, "grad_norm": 3.2221486568450928, "learning_rate": 1.2931128562280388e-05, "loss": 1.1617, "step": 804 }, { "epoch": 0.40883697308278316, "grad_norm": 3.101377010345459, "learning_rate": 1.2915792450882745e-05, "loss": 1.2618, "step": 805 }, { "epoch": 0.40934484509903507, "grad_norm": 4.297948360443115, "learning_rate": 1.2900448840760779e-05, "loss": 0.8525, "step": 806 }, { "epoch": 0.4098527171152869, "grad_norm": 3.5713438987731934, "learning_rate": 1.288509777137461e-05, "loss": 1.1102, "step": 807 }, { "epoch": 0.41036058913153883, "grad_norm": 3.4977385997772217, "learning_rate": 1.2869739282203538e-05, "loss": 1.5499, "step": 808 }, { "epoch": 0.41086846114779074, "grad_norm": 3.0273525714874268, "learning_rate": 1.2854373412745936e-05, "loss": 1.3806, "step": 809 }, { "epoch": 0.41137633316404265, "grad_norm": 3.4045588970184326, "learning_rate": 1.2839000202519166e-05, "loss": 1.2375, "step": 810 }, { "epoch": 0.41188420518029456, "grad_norm": 3.0983994007110596, "learning_rate": 1.2823619691059471e-05, "loss": 1.3675, "step": 811 }, { "epoch": 0.41239207719654647, "grad_norm": 3.5127463340759277, "learning_rate": 1.2808231917921857e-05, "loss": 1.4256, "step": 812 }, { "epoch": 0.4128999492127984, "grad_norm": 3.695651054382324, "learning_rate": 1.2792836922680026e-05, "loss": 1.4067, "step": 813 }, { "epoch": 0.4134078212290503, "grad_norm": 3.353546380996704, "learning_rate": 1.2777434744926231e-05, "loss": 1.3569, "step": 814 }, { "epoch": 0.4139156932453022, "grad_norm": 4.223036289215088, "learning_rate": 1.2762025424271213e-05, "loss": 1.0206, "step": 815 }, { "epoch": 0.4144235652615541, "grad_norm": 3.227527379989624, "learning_rate": 1.274660900034408e-05, "loss": 1.3569, "step": 816 }, { "epoch": 0.414931437277806, "grad_norm": 3.7230629920959473, "learning_rate": 1.2731185512792203e-05, "loss": 1.0783, "step": 817 }, { "epoch": 0.4154393092940579, "grad_norm": 3.222513198852539, "learning_rate": 1.2715755001281125e-05, "loss": 1.4662, "step": 818 }, { "epoch": 0.4159471813103098, "grad_norm": 3.7021639347076416, "learning_rate": 1.2700317505494446e-05, "loss": 1.4293, "step": 819 }, { "epoch": 0.4164550533265617, "grad_norm": 3.0131332874298096, "learning_rate": 1.268487306513373e-05, "loss": 1.237, "step": 820 }, { "epoch": 0.4169629253428136, "grad_norm": 3.4458506107330322, "learning_rate": 1.2669421719918411e-05, "loss": 1.1909, "step": 821 }, { "epoch": 0.4174707973590655, "grad_norm": 3.5575385093688965, "learning_rate": 1.265396350958566e-05, "loss": 1.4145, "step": 822 }, { "epoch": 0.4179786693753174, "grad_norm": 3.4635229110717773, "learning_rate": 1.2638498473890323e-05, "loss": 1.3249, "step": 823 }, { "epoch": 0.4184865413915693, "grad_norm": 3.8467001914978027, "learning_rate": 1.2623026652604788e-05, "loss": 1.2442, "step": 824 }, { "epoch": 0.41899441340782123, "grad_norm": 2.934985399246216, "learning_rate": 1.2607548085518903e-05, "loss": 1.3453, "step": 825 }, { "epoch": 0.41950228542407314, "grad_norm": 3.92087721824646, "learning_rate": 1.2592062812439854e-05, "loss": 1.2389, "step": 826 }, { "epoch": 0.42001015744032505, "grad_norm": 3.172184705734253, "learning_rate": 1.257657087319208e-05, "loss": 1.4821, "step": 827 }, { "epoch": 0.42051802945657696, "grad_norm": 3.199178457260132, "learning_rate": 1.2561072307617161e-05, "loss": 1.244, "step": 828 }, { "epoch": 0.42102590147282887, "grad_norm": 3.8658840656280518, "learning_rate": 1.2545567155573725e-05, "loss": 1.2649, "step": 829 }, { "epoch": 0.4215337734890808, "grad_norm": 3.9423792362213135, "learning_rate": 1.2530055456937321e-05, "loss": 1.4782, "step": 830 }, { "epoch": 0.42204164550533263, "grad_norm": 3.3361122608184814, "learning_rate": 1.251453725160036e-05, "loss": 1.5312, "step": 831 }, { "epoch": 0.42254951752158454, "grad_norm": 4.270658016204834, "learning_rate": 1.249901257947197e-05, "loss": 1.1508, "step": 832 }, { "epoch": 0.42305738953783645, "grad_norm": 3.095872163772583, "learning_rate": 1.2483481480477908e-05, "loss": 1.3785, "step": 833 }, { "epoch": 0.42356526155408836, "grad_norm": 3.7208592891693115, "learning_rate": 1.246794399456047e-05, "loss": 1.4228, "step": 834 }, { "epoch": 0.42407313357034027, "grad_norm": 3.2617084980010986, "learning_rate": 1.2452400161678367e-05, "loss": 1.3602, "step": 835 }, { "epoch": 0.4245810055865922, "grad_norm": 4.436614990234375, "learning_rate": 1.2436850021806644e-05, "loss": 1.0053, "step": 836 }, { "epoch": 0.4250888776028441, "grad_norm": 3.6590428352355957, "learning_rate": 1.2421293614936553e-05, "loss": 1.3901, "step": 837 }, { "epoch": 0.425596749619096, "grad_norm": 4.084576606750488, "learning_rate": 1.2405730981075478e-05, "loss": 1.4956, "step": 838 }, { "epoch": 0.4261046216353479, "grad_norm": 3.201557159423828, "learning_rate": 1.239016216024681e-05, "loss": 1.3609, "step": 839 }, { "epoch": 0.4266124936515998, "grad_norm": 3.2476463317871094, "learning_rate": 1.2374587192489846e-05, "loss": 1.5228, "step": 840 }, { "epoch": 0.4271203656678517, "grad_norm": 4.044534683227539, "learning_rate": 1.23590061178597e-05, "loss": 1.0868, "step": 841 }, { "epoch": 0.42762823768410363, "grad_norm": 2.8839735984802246, "learning_rate": 1.2343418976427191e-05, "loss": 1.4421, "step": 842 }, { "epoch": 0.4281361097003555, "grad_norm": 3.148660182952881, "learning_rate": 1.2327825808278728e-05, "loss": 1.4414, "step": 843 }, { "epoch": 0.4286439817166074, "grad_norm": 3.5366671085357666, "learning_rate": 1.2312226653516237e-05, "loss": 1.2989, "step": 844 }, { "epoch": 0.4291518537328593, "grad_norm": 3.481353998184204, "learning_rate": 1.229662155225703e-05, "loss": 1.441, "step": 845 }, { "epoch": 0.4296597257491112, "grad_norm": 4.349026679992676, "learning_rate": 1.2281010544633708e-05, "loss": 1.2352, "step": 846 }, { "epoch": 0.4301675977653631, "grad_norm": 3.208542823791504, "learning_rate": 1.2265393670794079e-05, "loss": 1.2023, "step": 847 }, { "epoch": 0.43067546978161503, "grad_norm": 3.4003970623016357, "learning_rate": 1.2249770970901023e-05, "loss": 1.338, "step": 848 }, { "epoch": 0.43118334179786694, "grad_norm": 3.414480447769165, "learning_rate": 1.2234142485132398e-05, "loss": 1.1968, "step": 849 }, { "epoch": 0.43169121381411885, "grad_norm": 3.3189659118652344, "learning_rate": 1.221850825368096e-05, "loss": 1.4158, "step": 850 }, { "epoch": 0.43219908583037076, "grad_norm": 4.061149597167969, "learning_rate": 1.2202868316754226e-05, "loss": 1.1293, "step": 851 }, { "epoch": 0.43270695784662266, "grad_norm": 3.178410768508911, "learning_rate": 1.2187222714574392e-05, "loss": 1.4672, "step": 852 }, { "epoch": 0.4332148298628746, "grad_norm": 4.4542927742004395, "learning_rate": 1.2171571487378229e-05, "loss": 1.2674, "step": 853 }, { "epoch": 0.4337227018791265, "grad_norm": 3.5307891368865967, "learning_rate": 1.2155914675416964e-05, "loss": 1.4065, "step": 854 }, { "epoch": 0.4342305738953784, "grad_norm": 3.9772210121154785, "learning_rate": 1.2140252318956192e-05, "loss": 1.091, "step": 855 }, { "epoch": 0.43473844591163024, "grad_norm": 3.50232195854187, "learning_rate": 1.2124584458275772e-05, "loss": 1.4893, "step": 856 }, { "epoch": 0.43524631792788215, "grad_norm": 3.74629807472229, "learning_rate": 1.2108911133669706e-05, "loss": 1.0373, "step": 857 }, { "epoch": 0.43575418994413406, "grad_norm": 3.458662986755371, "learning_rate": 1.2093232385446057e-05, "loss": 1.3145, "step": 858 }, { "epoch": 0.43626206196038597, "grad_norm": 2.9476242065429688, "learning_rate": 1.2077548253926834e-05, "loss": 1.3316, "step": 859 }, { "epoch": 0.4367699339766379, "grad_norm": 3.3274056911468506, "learning_rate": 1.2061858779447894e-05, "loss": 1.6013, "step": 860 }, { "epoch": 0.4372778059928898, "grad_norm": 4.1184797286987305, "learning_rate": 1.2046164002358834e-05, "loss": 1.4292, "step": 861 }, { "epoch": 0.4377856780091417, "grad_norm": 3.278787851333618, "learning_rate": 1.2030463963022878e-05, "loss": 1.2706, "step": 862 }, { "epoch": 0.4382935500253936, "grad_norm": 3.9214258193969727, "learning_rate": 1.2014758701816798e-05, "loss": 1.3934, "step": 863 }, { "epoch": 0.4388014220416455, "grad_norm": 3.6918678283691406, "learning_rate": 1.1999048259130782e-05, "loss": 1.4788, "step": 864 }, { "epoch": 0.4393092940578974, "grad_norm": 3.303007125854492, "learning_rate": 1.1983332675368353e-05, "loss": 1.5519, "step": 865 }, { "epoch": 0.43981716607414933, "grad_norm": 3.0729122161865234, "learning_rate": 1.1967611990946254e-05, "loss": 1.4969, "step": 866 }, { "epoch": 0.44032503809040124, "grad_norm": 3.0283827781677246, "learning_rate": 1.1951886246294345e-05, "loss": 1.3992, "step": 867 }, { "epoch": 0.4408329101066531, "grad_norm": 3.1034414768218994, "learning_rate": 1.1936155481855496e-05, "loss": 1.3656, "step": 868 }, { "epoch": 0.441340782122905, "grad_norm": 5.4229960441589355, "learning_rate": 1.1920419738085493e-05, "loss": 0.8356, "step": 869 }, { "epoch": 0.4418486541391569, "grad_norm": 3.6526145935058594, "learning_rate": 1.1904679055452922e-05, "loss": 1.3332, "step": 870 }, { "epoch": 0.4423565261554088, "grad_norm": 3.88909912109375, "learning_rate": 1.1888933474439072e-05, "loss": 1.3471, "step": 871 }, { "epoch": 0.44286439817166073, "grad_norm": 3.9307477474212646, "learning_rate": 1.1873183035537833e-05, "loss": 1.2132, "step": 872 }, { "epoch": 0.44337227018791264, "grad_norm": 3.914536952972412, "learning_rate": 1.1857427779255581e-05, "loss": 1.3909, "step": 873 }, { "epoch": 0.44388014220416455, "grad_norm": 3.4930009841918945, "learning_rate": 1.1841667746111094e-05, "loss": 1.1192, "step": 874 }, { "epoch": 0.44438801422041646, "grad_norm": 3.1796231269836426, "learning_rate": 1.1825902976635422e-05, "loss": 1.2825, "step": 875 }, { "epoch": 0.44489588623666837, "grad_norm": 4.252844333648682, "learning_rate": 1.1810133511371799e-05, "loss": 1.2945, "step": 876 }, { "epoch": 0.4454037582529203, "grad_norm": 3.7375707626342773, "learning_rate": 1.179435939087554e-05, "loss": 1.3394, "step": 877 }, { "epoch": 0.4459116302691722, "grad_norm": 3.735342264175415, "learning_rate": 1.1778580655713928e-05, "loss": 1.129, "step": 878 }, { "epoch": 0.4464195022854241, "grad_norm": 3.234929084777832, "learning_rate": 1.1762797346466115e-05, "loss": 1.4438, "step": 879 }, { "epoch": 0.44692737430167595, "grad_norm": 0.7645437121391296, "learning_rate": 1.1747009503723017e-05, "loss": 0.2741, "step": 880 }, { "epoch": 0.44743524631792786, "grad_norm": 4.15008020401001, "learning_rate": 1.1731217168087207e-05, "loss": 1.3858, "step": 881 }, { "epoch": 0.44794311833417977, "grad_norm": 3.355151414871216, "learning_rate": 1.171542038017282e-05, "loss": 1.2419, "step": 882 }, { "epoch": 0.4484509903504317, "grad_norm": 3.976581573486328, "learning_rate": 1.1699619180605426e-05, "loss": 1.2607, "step": 883 }, { "epoch": 0.4489588623666836, "grad_norm": 4.275966167449951, "learning_rate": 1.1683813610021955e-05, "loss": 1.1482, "step": 884 }, { "epoch": 0.4494667343829355, "grad_norm": 3.5389294624328613, "learning_rate": 1.1668003709070576e-05, "loss": 1.3533, "step": 885 }, { "epoch": 0.4499746063991874, "grad_norm": 3.413022994995117, "learning_rate": 1.1652189518410586e-05, "loss": 1.2802, "step": 886 }, { "epoch": 0.4504824784154393, "grad_norm": 3.4573328495025635, "learning_rate": 1.1636371078712325e-05, "loss": 1.3234, "step": 887 }, { "epoch": 0.4509903504316912, "grad_norm": 3.2229926586151123, "learning_rate": 1.1620548430657052e-05, "loss": 1.3755, "step": 888 }, { "epoch": 0.45149822244794313, "grad_norm": 3.1548473834991455, "learning_rate": 1.1604721614936856e-05, "loss": 1.364, "step": 889 }, { "epoch": 0.45200609446419504, "grad_norm": 5.584766864776611, "learning_rate": 1.158889067225454e-05, "loss": 0.7164, "step": 890 }, { "epoch": 0.45251396648044695, "grad_norm": 4.067551612854004, "learning_rate": 1.1573055643323519e-05, "loss": 1.3354, "step": 891 }, { "epoch": 0.45302183849669886, "grad_norm": 3.604311943054199, "learning_rate": 1.1557216568867724e-05, "loss": 1.3106, "step": 892 }, { "epoch": 0.4535297105129507, "grad_norm": 3.1507656574249268, "learning_rate": 1.1541373489621477e-05, "loss": 1.5909, "step": 893 }, { "epoch": 0.4540375825292026, "grad_norm": 3.9540703296661377, "learning_rate": 1.1525526446329413e-05, "loss": 1.1574, "step": 894 }, { "epoch": 0.45454545454545453, "grad_norm": 3.316423177719116, "learning_rate": 1.1509675479746359e-05, "loss": 1.5137, "step": 895 }, { "epoch": 0.45505332656170644, "grad_norm": 3.792618751525879, "learning_rate": 1.1493820630637222e-05, "loss": 1.2956, "step": 896 }, { "epoch": 0.45556119857795835, "grad_norm": 3.3896944522857666, "learning_rate": 1.1477961939776906e-05, "loss": 1.2825, "step": 897 }, { "epoch": 0.45606907059421026, "grad_norm": 5.148727893829346, "learning_rate": 1.1462099447950192e-05, "loss": 0.8357, "step": 898 }, { "epoch": 0.45657694261046217, "grad_norm": 3.793823003768921, "learning_rate": 1.1446233195951629e-05, "loss": 1.0614, "step": 899 }, { "epoch": 0.4570848146267141, "grad_norm": 3.341181755065918, "learning_rate": 1.1430363224585445e-05, "loss": 1.3663, "step": 900 }, { "epoch": 0.457592686642966, "grad_norm": 3.8023338317871094, "learning_rate": 1.1414489574665424e-05, "loss": 1.2573, "step": 901 }, { "epoch": 0.4581005586592179, "grad_norm": 3.6153724193573, "learning_rate": 1.1398612287014823e-05, "loss": 1.3102, "step": 902 }, { "epoch": 0.4586084306754698, "grad_norm": 4.726499557495117, "learning_rate": 1.1382731402466246e-05, "loss": 1.1665, "step": 903 }, { "epoch": 0.4591163026917217, "grad_norm": 3.5814390182495117, "learning_rate": 1.1366846961861548e-05, "loss": 1.4517, "step": 904 }, { "epoch": 0.45962417470797357, "grad_norm": 4.046858787536621, "learning_rate": 1.135095900605173e-05, "loss": 1.2333, "step": 905 }, { "epoch": 0.4601320467242255, "grad_norm": 3.346405506134033, "learning_rate": 1.1335067575896834e-05, "loss": 1.3656, "step": 906 }, { "epoch": 0.4606399187404774, "grad_norm": 3.4278366565704346, "learning_rate": 1.1319172712265836e-05, "loss": 1.3663, "step": 907 }, { "epoch": 0.4611477907567293, "grad_norm": 3.110161066055298, "learning_rate": 1.1303274456036541e-05, "loss": 1.4959, "step": 908 }, { "epoch": 0.4616556627729812, "grad_norm": 4.720462799072266, "learning_rate": 1.1287372848095485e-05, "loss": 1.2964, "step": 909 }, { "epoch": 0.4621635347892331, "grad_norm": 3.7709381580352783, "learning_rate": 1.1271467929337818e-05, "loss": 1.5176, "step": 910 }, { "epoch": 0.462671406805485, "grad_norm": 3.399737596511841, "learning_rate": 1.1255559740667206e-05, "loss": 1.202, "step": 911 }, { "epoch": 0.46317927882173693, "grad_norm": 3.852874279022217, "learning_rate": 1.1239648322995724e-05, "loss": 1.4293, "step": 912 }, { "epoch": 0.46368715083798884, "grad_norm": 3.403273344039917, "learning_rate": 1.122373371724375e-05, "loss": 1.4322, "step": 913 }, { "epoch": 0.46419502285424075, "grad_norm": 4.433647155761719, "learning_rate": 1.120781596433987e-05, "loss": 1.0907, "step": 914 }, { "epoch": 0.46470289487049266, "grad_norm": 4.138065338134766, "learning_rate": 1.1191895105220747e-05, "loss": 1.5683, "step": 915 }, { "epoch": 0.46521076688674456, "grad_norm": 3.5094752311706543, "learning_rate": 1.1175971180831046e-05, "loss": 1.3521, "step": 916 }, { "epoch": 0.4657186389029964, "grad_norm": 3.576686143875122, "learning_rate": 1.1160044232123313e-05, "loss": 1.2947, "step": 917 }, { "epoch": 0.4662265109192483, "grad_norm": 3.317537546157837, "learning_rate": 1.114411430005787e-05, "loss": 1.2871, "step": 918 }, { "epoch": 0.46673438293550024, "grad_norm": 3.438308000564575, "learning_rate": 1.1128181425602712e-05, "loss": 1.3563, "step": 919 }, { "epoch": 0.46724225495175215, "grad_norm": 3.2957468032836914, "learning_rate": 1.1112245649733393e-05, "loss": 1.5019, "step": 920 }, { "epoch": 0.46775012696800405, "grad_norm": 3.8559672832489014, "learning_rate": 1.1096307013432948e-05, "loss": 1.2077, "step": 921 }, { "epoch": 0.46825799898425596, "grad_norm": 3.325054407119751, "learning_rate": 1.1080365557691743e-05, "loss": 1.2988, "step": 922 }, { "epoch": 0.46876587100050787, "grad_norm": 3.2300596237182617, "learning_rate": 1.1064421323507422e-05, "loss": 1.2613, "step": 923 }, { "epoch": 0.4692737430167598, "grad_norm": 5.3694963455200195, "learning_rate": 1.1048474351884756e-05, "loss": 0.9946, "step": 924 }, { "epoch": 0.4697816150330117, "grad_norm": 3.384295701980591, "learning_rate": 1.1032524683835557e-05, "loss": 1.5957, "step": 925 }, { "epoch": 0.4702894870492636, "grad_norm": 5.694066047668457, "learning_rate": 1.1016572360378579e-05, "loss": 0.9679, "step": 926 }, { "epoch": 0.4707973590655155, "grad_norm": 3.405916929244995, "learning_rate": 1.1000617422539405e-05, "loss": 1.4736, "step": 927 }, { "epoch": 0.4713052310817674, "grad_norm": 3.8782858848571777, "learning_rate": 1.0984659911350335e-05, "loss": 1.1301, "step": 928 }, { "epoch": 0.4718131030980193, "grad_norm": 3.81451678276062, "learning_rate": 1.0968699867850289e-05, "loss": 1.3413, "step": 929 }, { "epoch": 0.4723209751142712, "grad_norm": 3.7040867805480957, "learning_rate": 1.0952737333084704e-05, "loss": 1.226, "step": 930 }, { "epoch": 0.4728288471305231, "grad_norm": 3.4536123275756836, "learning_rate": 1.0936772348105418e-05, "loss": 1.4836, "step": 931 }, { "epoch": 0.473336719146775, "grad_norm": 4.051872253417969, "learning_rate": 1.0920804953970579e-05, "loss": 1.2695, "step": 932 }, { "epoch": 0.4738445911630269, "grad_norm": 3.3694775104522705, "learning_rate": 1.0904835191744515e-05, "loss": 1.1816, "step": 933 }, { "epoch": 0.4743524631792788, "grad_norm": 3.6979739665985107, "learning_rate": 1.0888863102497665e-05, "loss": 1.4489, "step": 934 }, { "epoch": 0.4748603351955307, "grad_norm": 4.128031253814697, "learning_rate": 1.0872888727306434e-05, "loss": 1.2585, "step": 935 }, { "epoch": 0.47536820721178263, "grad_norm": 3.5181219577789307, "learning_rate": 1.0856912107253115e-05, "loss": 1.3912, "step": 936 }, { "epoch": 0.47587607922803454, "grad_norm": 4.10791015625, "learning_rate": 1.0840933283425774e-05, "loss": 1.2379, "step": 937 }, { "epoch": 0.47638395124428645, "grad_norm": 5.002466201782227, "learning_rate": 1.0824952296918146e-05, "loss": 1.3092, "step": 938 }, { "epoch": 0.47689182326053836, "grad_norm": 3.118469476699829, "learning_rate": 1.080896918882952e-05, "loss": 1.3928, "step": 939 }, { "epoch": 0.47739969527679027, "grad_norm": 4.414337635040283, "learning_rate": 1.0792984000264653e-05, "loss": 1.0707, "step": 940 }, { "epoch": 0.4779075672930422, "grad_norm": 5.438792705535889, "learning_rate": 1.077699677233364e-05, "loss": 1.0545, "step": 941 }, { "epoch": 0.47841543930929403, "grad_norm": 4.211660861968994, "learning_rate": 1.0761007546151826e-05, "loss": 1.0931, "step": 942 }, { "epoch": 0.47892331132554594, "grad_norm": 3.749115467071533, "learning_rate": 1.0745016362839704e-05, "loss": 1.4633, "step": 943 }, { "epoch": 0.47943118334179785, "grad_norm": 4.394846439361572, "learning_rate": 1.0729023263522781e-05, "loss": 1.2719, "step": 944 }, { "epoch": 0.47993905535804976, "grad_norm": 3.9928226470947266, "learning_rate": 1.0713028289331515e-05, "loss": 1.4197, "step": 945 }, { "epoch": 0.48044692737430167, "grad_norm": 3.897555112838745, "learning_rate": 1.0697031481401166e-05, "loss": 1.1506, "step": 946 }, { "epoch": 0.4809547993905536, "grad_norm": 3.1774990558624268, "learning_rate": 1.0681032880871716e-05, "loss": 1.3888, "step": 947 }, { "epoch": 0.4814626714068055, "grad_norm": 3.9464094638824463, "learning_rate": 1.0665032528887761e-05, "loss": 1.0521, "step": 948 }, { "epoch": 0.4819705434230574, "grad_norm": 3.534813642501831, "learning_rate": 1.0649030466598398e-05, "loss": 1.3523, "step": 949 }, { "epoch": 0.4824784154393093, "grad_norm": 3.586951971054077, "learning_rate": 1.0633026735157124e-05, "loss": 1.1999, "step": 950 }, { "epoch": 0.4829862874555612, "grad_norm": 3.499028444290161, "learning_rate": 1.0617021375721727e-05, "loss": 1.5563, "step": 951 }, { "epoch": 0.4834941594718131, "grad_norm": 4.564541339874268, "learning_rate": 1.0601014429454184e-05, "loss": 1.2579, "step": 952 }, { "epoch": 0.48400203148806503, "grad_norm": 3.5675089359283447, "learning_rate": 1.0585005937520552e-05, "loss": 1.1718, "step": 953 }, { "epoch": 0.4845099035043169, "grad_norm": 3.861952781677246, "learning_rate": 1.0568995941090858e-05, "loss": 1.3146, "step": 954 }, { "epoch": 0.4850177755205688, "grad_norm": 3.5122156143188477, "learning_rate": 1.0552984481339008e-05, "loss": 1.3775, "step": 955 }, { "epoch": 0.4855256475368207, "grad_norm": 3.456569194793701, "learning_rate": 1.0536971599442667e-05, "loss": 1.4182, "step": 956 }, { "epoch": 0.4860335195530726, "grad_norm": 3.4301440715789795, "learning_rate": 1.052095733658315e-05, "loss": 1.3669, "step": 957 }, { "epoch": 0.4865413915693245, "grad_norm": 3.3819780349731445, "learning_rate": 1.0504941733945337e-05, "loss": 1.5379, "step": 958 }, { "epoch": 0.48704926358557643, "grad_norm": 3.7181880474090576, "learning_rate": 1.0488924832717544e-05, "loss": 1.3226, "step": 959 }, { "epoch": 0.48755713560182834, "grad_norm": 4.475208759307861, "learning_rate": 1.047290667409143e-05, "loss": 1.2804, "step": 960 }, { "epoch": 0.48806500761808025, "grad_norm": 4.995233535766602, "learning_rate": 1.0456887299261891e-05, "loss": 1.1545, "step": 961 }, { "epoch": 0.48857287963433216, "grad_norm": 4.604562759399414, "learning_rate": 1.0440866749426937e-05, "loss": 1.2169, "step": 962 }, { "epoch": 0.48908075165058407, "grad_norm": 3.4509825706481934, "learning_rate": 1.042484506578762e-05, "loss": 1.5351, "step": 963 }, { "epoch": 0.489588623666836, "grad_norm": 4.223725318908691, "learning_rate": 1.040882228954789e-05, "loss": 1.2196, "step": 964 }, { "epoch": 0.4900964956830879, "grad_norm": 5.32916259765625, "learning_rate": 1.0392798461914518e-05, "loss": 1.1461, "step": 965 }, { "epoch": 0.4906043676993398, "grad_norm": 3.5469343662261963, "learning_rate": 1.0376773624096979e-05, "loss": 1.2844, "step": 966 }, { "epoch": 0.49111223971559165, "grad_norm": 3.920461893081665, "learning_rate": 1.0360747817307335e-05, "loss": 1.0736, "step": 967 }, { "epoch": 0.49162011173184356, "grad_norm": 3.2652604579925537, "learning_rate": 1.0344721082760152e-05, "loss": 1.2714, "step": 968 }, { "epoch": 0.49212798374809547, "grad_norm": 3.248802661895752, "learning_rate": 1.0328693461672381e-05, "loss": 1.4837, "step": 969 }, { "epoch": 0.4926358557643474, "grad_norm": 3.43121600151062, "learning_rate": 1.0312664995263242e-05, "loss": 1.4444, "step": 970 }, { "epoch": 0.4931437277805993, "grad_norm": 3.92547869682312, "learning_rate": 1.0296635724754137e-05, "loss": 1.3583, "step": 971 }, { "epoch": 0.4936515997968512, "grad_norm": 3.1732635498046875, "learning_rate": 1.0280605691368542e-05, "loss": 1.4998, "step": 972 }, { "epoch": 0.4941594718131031, "grad_norm": 3.503230571746826, "learning_rate": 1.026457493633188e-05, "loss": 1.3833, "step": 973 }, { "epoch": 0.494667343829355, "grad_norm": 3.726642608642578, "learning_rate": 1.0248543500871449e-05, "loss": 1.1362, "step": 974 }, { "epoch": 0.4951752158456069, "grad_norm": 3.980401039123535, "learning_rate": 1.0232511426216273e-05, "loss": 1.4434, "step": 975 }, { "epoch": 0.49568308786185883, "grad_norm": 4.425255298614502, "learning_rate": 1.021647875359704e-05, "loss": 1.2277, "step": 976 }, { "epoch": 0.49619095987811074, "grad_norm": 4.443340301513672, "learning_rate": 1.020044552424597e-05, "loss": 1.2725, "step": 977 }, { "epoch": 0.49669883189436265, "grad_norm": 3.401463508605957, "learning_rate": 1.018441177939671e-05, "loss": 1.4152, "step": 978 }, { "epoch": 0.4972067039106145, "grad_norm": 4.191753387451172, "learning_rate": 1.0168377560284237e-05, "loss": 1.5727, "step": 979 }, { "epoch": 0.4977145759268664, "grad_norm": 3.79152512550354, "learning_rate": 1.0152342908144747e-05, "loss": 1.51, "step": 980 }, { "epoch": 0.4982224479431183, "grad_norm": 3.977187395095825, "learning_rate": 1.0136307864215552e-05, "loss": 1.1533, "step": 981 }, { "epoch": 0.4987303199593702, "grad_norm": 3.822143077850342, "learning_rate": 1.0120272469734969e-05, "loss": 1.3153, "step": 982 }, { "epoch": 0.49923819197562214, "grad_norm": 4.221085071563721, "learning_rate": 1.010423676594221e-05, "loss": 1.0479, "step": 983 }, { "epoch": 0.49974606399187405, "grad_norm": 3.6054933071136475, "learning_rate": 1.0088200794077296e-05, "loss": 1.3151, "step": 984 }, { "epoch": 0.500253936008126, "grad_norm": 4.690833568572998, "learning_rate": 1.007216459538093e-05, "loss": 1.0533, "step": 985 }, { "epoch": 0.5007618080243779, "grad_norm": 4.331236839294434, "learning_rate": 1.0056128211094395e-05, "loss": 1.4805, "step": 986 }, { "epoch": 0.5007618080243779, "eval_loss": 1.2702714204788208, "eval_runtime": 167.3873, "eval_samples_per_second": 5.879, "eval_steps_per_second": 1.47, "step": 986 }, { "epoch": 0.5012696800406298, "grad_norm": 3.8839874267578125, "learning_rate": 1.0040091682459461e-05, "loss": 1.4227, "step": 987 }, { "epoch": 0.5017775520568817, "grad_norm": 4.005825042724609, "learning_rate": 1.0024055050718257e-05, "loss": 1.2848, "step": 988 }, { "epoch": 0.5022854240731336, "grad_norm": 4.486266613006592, "learning_rate": 1.0008018357113187e-05, "loss": 1.321, "step": 989 }, { "epoch": 0.5027932960893855, "grad_norm": 4.866846084594727, "learning_rate": 9.991981642886814e-06, "loss": 1.3309, "step": 990 }, { "epoch": 0.5033011681056374, "grad_norm": 3.5652294158935547, "learning_rate": 9.975944949281748e-06, "loss": 1.3894, "step": 991 }, { "epoch": 0.5038090401218893, "grad_norm": 4.046600818634033, "learning_rate": 9.959908317540542e-06, "loss": 1.3046, "step": 992 }, { "epoch": 0.5043169121381412, "grad_norm": 3.955585479736328, "learning_rate": 9.943871788905606e-06, "loss": 1.1924, "step": 993 }, { "epoch": 0.5048247841543931, "grad_norm": 3.789905309677124, "learning_rate": 9.927835404619075e-06, "loss": 1.3519, "step": 994 }, { "epoch": 0.505332656170645, "grad_norm": 5.119289875030518, "learning_rate": 9.911799205922704e-06, "loss": 1.2782, "step": 995 }, { "epoch": 0.505840528186897, "grad_norm": 3.122493028640747, "learning_rate": 9.895763234057792e-06, "loss": 1.5082, "step": 996 }, { "epoch": 0.5063484002031488, "grad_norm": 3.873720407485962, "learning_rate": 9.879727530265038e-06, "loss": 1.2583, "step": 997 }, { "epoch": 0.5068562722194007, "grad_norm": 3.268399953842163, "learning_rate": 9.863692135784448e-06, "loss": 1.4216, "step": 998 }, { "epoch": 0.5073641442356526, "grad_norm": 3.9059388637542725, "learning_rate": 9.847657091855254e-06, "loss": 1.1708, "step": 999 }, { "epoch": 0.5078720162519045, "grad_norm": 3.699793577194214, "learning_rate": 9.831622439715766e-06, "loss": 1.3077, "step": 1000 }, { "epoch": 0.5083798882681564, "grad_norm": 3.7766926288604736, "learning_rate": 9.815588220603292e-06, "loss": 1.5583, "step": 1001 }, { "epoch": 0.5088877602844083, "grad_norm": 4.702587127685547, "learning_rate": 9.799554475754032e-06, "loss": 1.0589, "step": 1002 }, { "epoch": 0.5093956323006602, "grad_norm": 3.7550957202911377, "learning_rate": 9.783521246402963e-06, "loss": 1.0813, "step": 1003 }, { "epoch": 0.5099035043169121, "grad_norm": 4.080326557159424, "learning_rate": 9.767488573783728e-06, "loss": 1.3106, "step": 1004 }, { "epoch": 0.510411376333164, "grad_norm": 4.644419193267822, "learning_rate": 9.751456499128555e-06, "loss": 1.4889, "step": 1005 }, { "epoch": 0.5109192483494159, "grad_norm": 3.7005774974823, "learning_rate": 9.735425063668121e-06, "loss": 1.5305, "step": 1006 }, { "epoch": 0.5114271203656678, "grad_norm": 3.4154560565948486, "learning_rate": 9.719394308631461e-06, "loss": 1.4153, "step": 1007 }, { "epoch": 0.5119349923819198, "grad_norm": 4.038942813873291, "learning_rate": 9.703364275245864e-06, "loss": 1.4484, "step": 1008 }, { "epoch": 0.5124428643981717, "grad_norm": 4.789218425750732, "learning_rate": 9.687335004736763e-06, "loss": 1.3403, "step": 1009 }, { "epoch": 0.5129507364144236, "grad_norm": 4.188717842102051, "learning_rate": 9.671306538327625e-06, "loss": 1.1975, "step": 1010 }, { "epoch": 0.5134586084306755, "grad_norm": 3.6356616020202637, "learning_rate": 9.655278917239848e-06, "loss": 1.3069, "step": 1011 }, { "epoch": 0.5139664804469274, "grad_norm": 3.921940565109253, "learning_rate": 9.639252182692669e-06, "loss": 1.3313, "step": 1012 }, { "epoch": 0.5144743524631793, "grad_norm": 3.735460042953491, "learning_rate": 9.623226375903028e-06, "loss": 1.4116, "step": 1013 }, { "epoch": 0.5149822244794312, "grad_norm": 3.5081825256347656, "learning_rate": 9.607201538085484e-06, "loss": 1.263, "step": 1014 }, { "epoch": 0.5154900964956831, "grad_norm": 4.349667549133301, "learning_rate": 9.591177710452113e-06, "loss": 1.2996, "step": 1015 }, { "epoch": 0.515997968511935, "grad_norm": 3.45693039894104, "learning_rate": 9.575154934212385e-06, "loss": 1.3949, "step": 1016 }, { "epoch": 0.5165058405281869, "grad_norm": 4.386706829071045, "learning_rate": 9.559133250573062e-06, "loss": 1.2486, "step": 1017 }, { "epoch": 0.5170137125444388, "grad_norm": 3.824852705001831, "learning_rate": 9.543112700738112e-06, "loss": 1.6588, "step": 1018 }, { "epoch": 0.5175215845606908, "grad_norm": 4.599579811096191, "learning_rate": 9.527093325908572e-06, "loss": 1.0744, "step": 1019 }, { "epoch": 0.5180294565769427, "grad_norm": 4.259639263153076, "learning_rate": 9.511075167282459e-06, "loss": 1.1343, "step": 1020 }, { "epoch": 0.5185373285931946, "grad_norm": 4.185952186584473, "learning_rate": 9.495058266054665e-06, "loss": 1.1831, "step": 1021 }, { "epoch": 0.5190452006094464, "grad_norm": 3.6289265155792236, "learning_rate": 9.479042663416852e-06, "loss": 1.4118, "step": 1022 }, { "epoch": 0.5195530726256983, "grad_norm": 3.4601523876190186, "learning_rate": 9.463028400557338e-06, "loss": 1.3336, "step": 1023 }, { "epoch": 0.5200609446419502, "grad_norm": 4.5509257316589355, "learning_rate": 9.447015518660992e-06, "loss": 1.0343, "step": 1024 }, { "epoch": 0.5205688166582021, "grad_norm": 4.35545015335083, "learning_rate": 9.431004058909145e-06, "loss": 1.0353, "step": 1025 }, { "epoch": 0.521076688674454, "grad_norm": 3.8852431774139404, "learning_rate": 9.414994062479455e-06, "loss": 1.3905, "step": 1026 }, { "epoch": 0.5215845606907059, "grad_norm": 3.812549591064453, "learning_rate": 9.398985570545818e-06, "loss": 1.4305, "step": 1027 }, { "epoch": 0.5220924327069578, "grad_norm": 3.5209968090057373, "learning_rate": 9.382978624278276e-06, "loss": 1.3956, "step": 1028 }, { "epoch": 0.5226003047232097, "grad_norm": 4.683962821960449, "learning_rate": 9.366973264842877e-06, "loss": 1.3153, "step": 1029 }, { "epoch": 0.5231081767394616, "grad_norm": 4.334263324737549, "learning_rate": 9.350969533401601e-06, "loss": 1.1408, "step": 1030 }, { "epoch": 0.5236160487557135, "grad_norm": 3.7997000217437744, "learning_rate": 9.33496747111224e-06, "loss": 1.5354, "step": 1031 }, { "epoch": 0.5241239207719655, "grad_norm": 3.7128489017486572, "learning_rate": 9.318967119128289e-06, "loss": 1.4863, "step": 1032 }, { "epoch": 0.5246317927882174, "grad_norm": 3.7790944576263428, "learning_rate": 9.302968518598835e-06, "loss": 1.3404, "step": 1033 }, { "epoch": 0.5251396648044693, "grad_norm": 4.1233367919921875, "learning_rate": 9.286971710668486e-06, "loss": 1.1933, "step": 1034 }, { "epoch": 0.5256475368207212, "grad_norm": 4.708859443664551, "learning_rate": 9.27097673647722e-06, "loss": 1.1056, "step": 1035 }, { "epoch": 0.5261554088369731, "grad_norm": 3.791539430618286, "learning_rate": 9.2549836371603e-06, "loss": 1.4778, "step": 1036 }, { "epoch": 0.526663280853225, "grad_norm": 4.034219741821289, "learning_rate": 9.238992453848173e-06, "loss": 1.4119, "step": 1037 }, { "epoch": 0.5271711528694769, "grad_norm": 3.3890597820281982, "learning_rate": 9.223003227666364e-06, "loss": 1.3695, "step": 1038 }, { "epoch": 0.5276790248857288, "grad_norm": 3.4671132564544678, "learning_rate": 9.207015999735352e-06, "loss": 1.5091, "step": 1039 }, { "epoch": 0.5281868969019807, "grad_norm": 4.020588397979736, "learning_rate": 9.191030811170481e-06, "loss": 1.0718, "step": 1040 }, { "epoch": 0.5286947689182326, "grad_norm": 3.856794595718384, "learning_rate": 9.175047703081857e-06, "loss": 1.28, "step": 1041 }, { "epoch": 0.5292026409344845, "grad_norm": 3.7805190086364746, "learning_rate": 9.159066716574227e-06, "loss": 1.3552, "step": 1042 }, { "epoch": 0.5297105129507365, "grad_norm": 4.169523239135742, "learning_rate": 9.143087892746887e-06, "loss": 1.3731, "step": 1043 }, { "epoch": 0.5302183849669884, "grad_norm": 4.341207504272461, "learning_rate": 9.12711127269357e-06, "loss": 1.1904, "step": 1044 }, { "epoch": 0.5307262569832403, "grad_norm": 4.3100409507751465, "learning_rate": 9.11113689750234e-06, "loss": 1.311, "step": 1045 }, { "epoch": 0.5312341289994922, "grad_norm": 4.511486053466797, "learning_rate": 9.095164808255485e-06, "loss": 1.3542, "step": 1046 }, { "epoch": 0.531742001015744, "grad_norm": 4.631117343902588, "learning_rate": 9.079195046029425e-06, "loss": 0.9907, "step": 1047 }, { "epoch": 0.5322498730319959, "grad_norm": 3.4736275672912598, "learning_rate": 9.063227651894583e-06, "loss": 1.3579, "step": 1048 }, { "epoch": 0.5327577450482478, "grad_norm": 3.3849706649780273, "learning_rate": 9.0472626669153e-06, "loss": 1.3443, "step": 1049 }, { "epoch": 0.5332656170644997, "grad_norm": 3.844325542449951, "learning_rate": 9.031300132149713e-06, "loss": 1.3026, "step": 1050 }, { "epoch": 0.5337734890807516, "grad_norm": 3.62764835357666, "learning_rate": 9.015340088649668e-06, "loss": 1.0405, "step": 1051 }, { "epoch": 0.5342813610970035, "grad_norm": 3.797156810760498, "learning_rate": 8.999382577460598e-06, "loss": 1.3491, "step": 1052 }, { "epoch": 0.5347892331132554, "grad_norm": 5.074930667877197, "learning_rate": 8.983427639621421e-06, "loss": 1.1417, "step": 1053 }, { "epoch": 0.5352971051295073, "grad_norm": 4.06516170501709, "learning_rate": 8.967475316164446e-06, "loss": 1.2173, "step": 1054 }, { "epoch": 0.5358049771457593, "grad_norm": 3.873654365539551, "learning_rate": 8.951525648115251e-06, "loss": 1.4384, "step": 1055 }, { "epoch": 0.5363128491620112, "grad_norm": 3.887115716934204, "learning_rate": 8.93557867649258e-06, "loss": 1.3913, "step": 1056 }, { "epoch": 0.5368207211782631, "grad_norm": 4.213863372802734, "learning_rate": 8.919634442308258e-06, "loss": 1.2949, "step": 1057 }, { "epoch": 0.537328593194515, "grad_norm": 4.809872627258301, "learning_rate": 8.903692986567059e-06, "loss": 1.3473, "step": 1058 }, { "epoch": 0.5378364652107669, "grad_norm": 4.870054244995117, "learning_rate": 8.887754350266608e-06, "loss": 1.1729, "step": 1059 }, { "epoch": 0.5383443372270188, "grad_norm": 3.5638108253479004, "learning_rate": 8.871818574397293e-06, "loss": 1.4476, "step": 1060 }, { "epoch": 0.5388522092432707, "grad_norm": 4.44373893737793, "learning_rate": 8.855885699942134e-06, "loss": 1.4085, "step": 1061 }, { "epoch": 0.5393600812595226, "grad_norm": 4.298492908477783, "learning_rate": 8.839955767876687e-06, "loss": 0.9469, "step": 1062 }, { "epoch": 0.5398679532757745, "grad_norm": 4.122191905975342, "learning_rate": 8.824028819168955e-06, "loss": 1.1266, "step": 1063 }, { "epoch": 0.5403758252920264, "grad_norm": 3.8323357105255127, "learning_rate": 8.808104894779257e-06, "loss": 1.4753, "step": 1064 }, { "epoch": 0.5408836973082783, "grad_norm": 3.6288328170776367, "learning_rate": 8.792184035660136e-06, "loss": 1.3259, "step": 1065 }, { "epoch": 0.5413915693245303, "grad_norm": 3.873750925064087, "learning_rate": 8.77626628275625e-06, "loss": 1.4324, "step": 1066 }, { "epoch": 0.5418994413407822, "grad_norm": 5.460947036743164, "learning_rate": 8.76035167700428e-06, "loss": 1.3701, "step": 1067 }, { "epoch": 0.5424073133570341, "grad_norm": 4.058062553405762, "learning_rate": 8.744440259332798e-06, "loss": 1.1698, "step": 1068 }, { "epoch": 0.542915185373286, "grad_norm": 3.992889642715454, "learning_rate": 8.728532070662184e-06, "loss": 1.3678, "step": 1069 }, { "epoch": 0.5434230573895379, "grad_norm": 3.4362125396728516, "learning_rate": 8.712627151904518e-06, "loss": 1.363, "step": 1070 }, { "epoch": 0.5439309294057897, "grad_norm": 4.388937473297119, "learning_rate": 8.696725543963462e-06, "loss": 1.3305, "step": 1071 }, { "epoch": 0.5444388014220416, "grad_norm": 3.9520606994628906, "learning_rate": 8.680827287734167e-06, "loss": 1.2707, "step": 1072 }, { "epoch": 0.5449466734382935, "grad_norm": 5.032470226287842, "learning_rate": 8.66493242410317e-06, "loss": 1.1903, "step": 1073 }, { "epoch": 0.5454545454545454, "grad_norm": 4.099115371704102, "learning_rate": 8.649040993948273e-06, "loss": 1.4908, "step": 1074 }, { "epoch": 0.5459624174707973, "grad_norm": 4.2181549072265625, "learning_rate": 8.633153038138453e-06, "loss": 1.3798, "step": 1075 }, { "epoch": 0.5464702894870492, "grad_norm": 3.630211114883423, "learning_rate": 8.617268597533755e-06, "loss": 1.3977, "step": 1076 }, { "epoch": 0.5469781615033011, "grad_norm": 3.997370481491089, "learning_rate": 8.60138771298518e-06, "loss": 1.3037, "step": 1077 }, { "epoch": 0.547486033519553, "grad_norm": 4.776542663574219, "learning_rate": 8.585510425334579e-06, "loss": 1.4208, "step": 1078 }, { "epoch": 0.547993905535805, "grad_norm": 4.408464431762695, "learning_rate": 8.569636775414558e-06, "loss": 1.5326, "step": 1079 }, { "epoch": 0.5485017775520569, "grad_norm": 3.441190719604492, "learning_rate": 8.553766804048373e-06, "loss": 1.6373, "step": 1080 }, { "epoch": 0.5490096495683088, "grad_norm": 4.0465593338012695, "learning_rate": 8.537900552049812e-06, "loss": 1.2693, "step": 1081 }, { "epoch": 0.5495175215845607, "grad_norm": 4.380228042602539, "learning_rate": 8.522038060223094e-06, "loss": 1.4583, "step": 1082 }, { "epoch": 0.5500253936008126, "grad_norm": 3.919779062271118, "learning_rate": 8.50617936936278e-06, "loss": 1.413, "step": 1083 }, { "epoch": 0.5505332656170645, "grad_norm": 4.072299480438232, "learning_rate": 8.490324520253648e-06, "loss": 1.3373, "step": 1084 }, { "epoch": 0.5510411376333164, "grad_norm": 3.893017530441284, "learning_rate": 8.474473553670588e-06, "loss": 1.3729, "step": 1085 }, { "epoch": 0.5515490096495683, "grad_norm": 3.743358850479126, "learning_rate": 8.458626510378525e-06, "loss": 1.1618, "step": 1086 }, { "epoch": 0.5520568816658202, "grad_norm": 3.8513174057006836, "learning_rate": 8.442783431132282e-06, "loss": 1.2422, "step": 1087 }, { "epoch": 0.5525647536820721, "grad_norm": 3.660341501235962, "learning_rate": 8.426944356676481e-06, "loss": 1.4136, "step": 1088 }, { "epoch": 0.553072625698324, "grad_norm": 4.450745105743408, "learning_rate": 8.411109327745463e-06, "loss": 1.3378, "step": 1089 }, { "epoch": 0.553580497714576, "grad_norm": 3.3938448429107666, "learning_rate": 8.395278385063147e-06, "loss": 1.2288, "step": 1090 }, { "epoch": 0.5540883697308279, "grad_norm": 3.7265074253082275, "learning_rate": 8.379451569342948e-06, "loss": 1.2431, "step": 1091 }, { "epoch": 0.5545962417470798, "grad_norm": 3.6786811351776123, "learning_rate": 8.363628921287678e-06, "loss": 1.3273, "step": 1092 }, { "epoch": 0.5551041137633317, "grad_norm": 3.631423234939575, "learning_rate": 8.347810481589416e-06, "loss": 1.4747, "step": 1093 }, { "epoch": 0.5556119857795836, "grad_norm": 3.7244248390197754, "learning_rate": 8.331996290929429e-06, "loss": 1.5013, "step": 1094 }, { "epoch": 0.5561198577958355, "grad_norm": 3.9098987579345703, "learning_rate": 8.316186389978045e-06, "loss": 1.4644, "step": 1095 }, { "epoch": 0.5566277298120873, "grad_norm": 3.656759738922119, "learning_rate": 8.300380819394579e-06, "loss": 1.5651, "step": 1096 }, { "epoch": 0.5571356018283392, "grad_norm": 4.050074577331543, "learning_rate": 8.284579619827187e-06, "loss": 1.3435, "step": 1097 }, { "epoch": 0.5576434738445911, "grad_norm": 4.784929275512695, "learning_rate": 8.268782831912793e-06, "loss": 1.2313, "step": 1098 }, { "epoch": 0.558151345860843, "grad_norm": 4.976466178894043, "learning_rate": 8.252990496276986e-06, "loss": 1.264, "step": 1099 }, { "epoch": 0.5586592178770949, "grad_norm": 4.2961320877075195, "learning_rate": 8.237202653533886e-06, "loss": 1.1136, "step": 1100 }, { "epoch": 0.5591670898933468, "grad_norm": 3.6020753383636475, "learning_rate": 8.221419344286073e-06, "loss": 1.2281, "step": 1101 }, { "epoch": 0.5596749619095988, "grad_norm": 3.858781337738037, "learning_rate": 8.205640609124461e-06, "loss": 1.2915, "step": 1102 }, { "epoch": 0.5601828339258507, "grad_norm": 3.9297633171081543, "learning_rate": 8.189866488628204e-06, "loss": 1.3519, "step": 1103 }, { "epoch": 0.5606907059421026, "grad_norm": 3.6003174781799316, "learning_rate": 8.17409702336458e-06, "loss": 1.514, "step": 1104 }, { "epoch": 0.5611985779583545, "grad_norm": 4.255648612976074, "learning_rate": 8.158332253888907e-06, "loss": 1.3738, "step": 1105 }, { "epoch": 0.5617064499746064, "grad_norm": 4.09144401550293, "learning_rate": 8.142572220744422e-06, "loss": 1.1667, "step": 1106 }, { "epoch": 0.5622143219908583, "grad_norm": 3.893601417541504, "learning_rate": 8.126816964462172e-06, "loss": 1.3337, "step": 1107 }, { "epoch": 0.5627221940071102, "grad_norm": 4.821288585662842, "learning_rate": 8.11106652556093e-06, "loss": 1.2159, "step": 1108 }, { "epoch": 0.5632300660233621, "grad_norm": 3.5883355140686035, "learning_rate": 8.095320944547083e-06, "loss": 1.2955, "step": 1109 }, { "epoch": 0.563737938039614, "grad_norm": 4.82534646987915, "learning_rate": 8.079580261914512e-06, "loss": 1.507, "step": 1110 }, { "epoch": 0.5642458100558659, "grad_norm": 3.767213821411133, "learning_rate": 8.063844518144505e-06, "loss": 1.3607, "step": 1111 }, { "epoch": 0.5647536820721178, "grad_norm": 3.960946559906006, "learning_rate": 8.048113753705657e-06, "loss": 1.0787, "step": 1112 }, { "epoch": 0.5652615540883698, "grad_norm": 3.8261797428131104, "learning_rate": 8.032388009053747e-06, "loss": 1.3855, "step": 1113 }, { "epoch": 0.5657694261046217, "grad_norm": 3.765634298324585, "learning_rate": 8.016667324631649e-06, "loss": 1.1932, "step": 1114 }, { "epoch": 0.5662772981208736, "grad_norm": 4.079648017883301, "learning_rate": 8.00095174086922e-06, "loss": 1.3525, "step": 1115 }, { "epoch": 0.5667851701371255, "grad_norm": 4.009759426116943, "learning_rate": 7.985241298183209e-06, "loss": 1.3184, "step": 1116 }, { "epoch": 0.5672930421533774, "grad_norm": 4.056725978851318, "learning_rate": 7.969536036977124e-06, "loss": 1.3591, "step": 1117 }, { "epoch": 0.5678009141696293, "grad_norm": 5.053130149841309, "learning_rate": 7.95383599764117e-06, "loss": 1.3316, "step": 1118 }, { "epoch": 0.5683087861858812, "grad_norm": 3.995842933654785, "learning_rate": 7.938141220552107e-06, "loss": 1.4236, "step": 1119 }, { "epoch": 0.568816658202133, "grad_norm": 3.60331654548645, "learning_rate": 7.922451746073166e-06, "loss": 1.3767, "step": 1120 }, { "epoch": 0.5693245302183849, "grad_norm": 4.3413238525390625, "learning_rate": 7.906767614553947e-06, "loss": 1.1674, "step": 1121 }, { "epoch": 0.5698324022346368, "grad_norm": 4.532721042633057, "learning_rate": 7.8910888663303e-06, "loss": 1.4166, "step": 1122 }, { "epoch": 0.5703402742508887, "grad_norm": 4.4201765060424805, "learning_rate": 7.875415541724235e-06, "loss": 1.1205, "step": 1123 }, { "epoch": 0.5708481462671406, "grad_norm": 4.706449031829834, "learning_rate": 7.859747681043808e-06, "loss": 1.3193, "step": 1124 }, { "epoch": 0.5713560182833926, "grad_norm": 5.1450982093811035, "learning_rate": 7.84408532458304e-06, "loss": 1.3136, "step": 1125 }, { "epoch": 0.5718638902996445, "grad_norm": 3.987536668777466, "learning_rate": 7.828428512621776e-06, "loss": 1.2301, "step": 1126 }, { "epoch": 0.5723717623158964, "grad_norm": 4.997498035430908, "learning_rate": 7.812777285425608e-06, "loss": 1.124, "step": 1127 }, { "epoch": 0.5728796343321483, "grad_norm": 4.351808547973633, "learning_rate": 7.797131683245776e-06, "loss": 1.2459, "step": 1128 }, { "epoch": 0.5733875063484002, "grad_norm": 4.245429992675781, "learning_rate": 7.781491746319045e-06, "loss": 1.1971, "step": 1129 }, { "epoch": 0.5738953783646521, "grad_norm": 4.84749698638916, "learning_rate": 7.765857514867603e-06, "loss": 1.2181, "step": 1130 }, { "epoch": 0.574403250380904, "grad_norm": 3.8819117546081543, "learning_rate": 7.750229029098982e-06, "loss": 1.3041, "step": 1131 }, { "epoch": 0.5749111223971559, "grad_norm": 4.279530048370361, "learning_rate": 7.734606329205923e-06, "loss": 1.2113, "step": 1132 }, { "epoch": 0.5754189944134078, "grad_norm": 5.590778827667236, "learning_rate": 7.71898945536629e-06, "loss": 1.246, "step": 1133 }, { "epoch": 0.5759268664296597, "grad_norm": 3.8742496967315674, "learning_rate": 7.703378447742976e-06, "loss": 1.3175, "step": 1134 }, { "epoch": 0.5764347384459116, "grad_norm": 4.623325347900391, "learning_rate": 7.687773346483766e-06, "loss": 1.3656, "step": 1135 }, { "epoch": 0.5769426104621636, "grad_norm": 7.073203086853027, "learning_rate": 7.672174191721277e-06, "loss": 1.1737, "step": 1136 }, { "epoch": 0.5774504824784155, "grad_norm": 4.289577007293701, "learning_rate": 7.656581023572814e-06, "loss": 1.3797, "step": 1137 }, { "epoch": 0.5779583544946674, "grad_norm": 3.823345422744751, "learning_rate": 7.640993882140302e-06, "loss": 1.1435, "step": 1138 }, { "epoch": 0.5784662265109193, "grad_norm": 3.539424180984497, "learning_rate": 7.625412807510157e-06, "loss": 1.3426, "step": 1139 }, { "epoch": 0.5789740985271712, "grad_norm": 4.083397388458252, "learning_rate": 7.609837839753191e-06, "loss": 1.5188, "step": 1140 }, { "epoch": 0.5794819705434231, "grad_norm": 3.440096855163574, "learning_rate": 7.594269018924522e-06, "loss": 1.4336, "step": 1141 }, { "epoch": 0.579989842559675, "grad_norm": 4.4859490394592285, "learning_rate": 7.5787063850634476e-06, "loss": 1.226, "step": 1142 }, { "epoch": 0.5804977145759269, "grad_norm": 4.9387617111206055, "learning_rate": 7.56314997819336e-06, "loss": 0.9405, "step": 1143 }, { "epoch": 0.5810055865921788, "grad_norm": 3.6573925018310547, "learning_rate": 7.547599838321635e-06, "loss": 1.3936, "step": 1144 }, { "epoch": 0.5815134586084306, "grad_norm": 5.386067867279053, "learning_rate": 7.532056005439536e-06, "loss": 0.9549, "step": 1145 }, { "epoch": 0.5820213306246825, "grad_norm": 4.03516149520874, "learning_rate": 7.516518519522093e-06, "loss": 1.3277, "step": 1146 }, { "epoch": 0.5825292026409344, "grad_norm": 5.966615200042725, "learning_rate": 7.500987420528033e-06, "loss": 1.1483, "step": 1147 }, { "epoch": 0.5830370746571863, "grad_norm": 3.9208691120147705, "learning_rate": 7.485462748399641e-06, "loss": 1.1712, "step": 1148 }, { "epoch": 0.5835449466734383, "grad_norm": 3.6072921752929688, "learning_rate": 7.469944543062677e-06, "loss": 1.5522, "step": 1149 }, { "epoch": 0.5840528186896902, "grad_norm": 4.357779502868652, "learning_rate": 7.454432844426278e-06, "loss": 1.2472, "step": 1150 }, { "epoch": 0.5845606907059421, "grad_norm": 4.2969465255737305, "learning_rate": 7.43892769238284e-06, "loss": 1.6309, "step": 1151 }, { "epoch": 0.585068562722194, "grad_norm": 4.970481872558594, "learning_rate": 7.423429126807924e-06, "loss": 1.0273, "step": 1152 }, { "epoch": 0.5855764347384459, "grad_norm": 4.304748058319092, "learning_rate": 7.407937187560148e-06, "loss": 1.2925, "step": 1153 }, { "epoch": 0.5860843067546978, "grad_norm": 3.9706506729125977, "learning_rate": 7.3924519144811e-06, "loss": 1.3122, "step": 1154 }, { "epoch": 0.5865921787709497, "grad_norm": 3.9953420162200928, "learning_rate": 7.376973347395215e-06, "loss": 1.1798, "step": 1155 }, { "epoch": 0.5871000507872016, "grad_norm": 4.054521083831787, "learning_rate": 7.36150152610968e-06, "loss": 1.5718, "step": 1156 }, { "epoch": 0.5876079228034535, "grad_norm": 3.4766275882720947, "learning_rate": 7.346036490414342e-06, "loss": 1.3719, "step": 1157 }, { "epoch": 0.5881157948197054, "grad_norm": 5.536925315856934, "learning_rate": 7.330578280081595e-06, "loss": 1.3112, "step": 1158 }, { "epoch": 0.5886236668359573, "grad_norm": 3.708730459213257, "learning_rate": 7.315126934866269e-06, "loss": 1.2901, "step": 1159 }, { "epoch": 0.5891315388522093, "grad_norm": 4.949906349182129, "learning_rate": 7.299682494505557e-06, "loss": 1.1044, "step": 1160 }, { "epoch": 0.5896394108684612, "grad_norm": 4.265493392944336, "learning_rate": 7.284244998718879e-06, "loss": 1.2927, "step": 1161 }, { "epoch": 0.5901472828847131, "grad_norm": 4.239290714263916, "learning_rate": 7.268814487207796e-06, "loss": 1.083, "step": 1162 }, { "epoch": 0.590655154900965, "grad_norm": 4.395908355712891, "learning_rate": 7.253390999655923e-06, "loss": 1.2914, "step": 1163 }, { "epoch": 0.5911630269172169, "grad_norm": 4.987044334411621, "learning_rate": 7.237974575728788e-06, "loss": 1.4456, "step": 1164 }, { "epoch": 0.5916708989334688, "grad_norm": 4.285270690917969, "learning_rate": 7.222565255073775e-06, "loss": 1.3697, "step": 1165 }, { "epoch": 0.5921787709497207, "grad_norm": 4.593809127807617, "learning_rate": 7.2071630773199786e-06, "loss": 1.3608, "step": 1166 }, { "epoch": 0.5926866429659726, "grad_norm": 4.617862224578857, "learning_rate": 7.191768082078145e-06, "loss": 1.3649, "step": 1167 }, { "epoch": 0.5931945149822245, "grad_norm": 3.5487444400787354, "learning_rate": 7.176380308940536e-06, "loss": 1.3859, "step": 1168 }, { "epoch": 0.5937023869984764, "grad_norm": 3.997659921646118, "learning_rate": 7.160999797480834e-06, "loss": 1.4197, "step": 1169 }, { "epoch": 0.5942102590147282, "grad_norm": 4.572630405426025, "learning_rate": 7.145626587254069e-06, "loss": 1.1834, "step": 1170 }, { "epoch": 0.5947181310309801, "grad_norm": 4.793776988983154, "learning_rate": 7.130260717796467e-06, "loss": 1.2255, "step": 1171 }, { "epoch": 0.595226003047232, "grad_norm": 4.291995525360107, "learning_rate": 7.1149022286253884e-06, "loss": 1.181, "step": 1172 }, { "epoch": 0.595733875063484, "grad_norm": 3.9128329753875732, "learning_rate": 7.0995511592392234e-06, "loss": 1.4642, "step": 1173 }, { "epoch": 0.5962417470797359, "grad_norm": 3.90106463432312, "learning_rate": 7.084207549117262e-06, "loss": 1.4408, "step": 1174 }, { "epoch": 0.5967496190959878, "grad_norm": 4.521573066711426, "learning_rate": 7.068871437719611e-06, "loss": 1.2979, "step": 1175 }, { "epoch": 0.5972574911122397, "grad_norm": 4.508446216583252, "learning_rate": 7.0535428644871125e-06, "loss": 1.485, "step": 1176 }, { "epoch": 0.5977653631284916, "grad_norm": 3.9106104373931885, "learning_rate": 7.038221868841205e-06, "loss": 1.2453, "step": 1177 }, { "epoch": 0.5982732351447435, "grad_norm": 4.619490146636963, "learning_rate": 7.022908490183836e-06, "loss": 1.051, "step": 1178 }, { "epoch": 0.5987811071609954, "grad_norm": 5.208146095275879, "learning_rate": 7.00760276789738e-06, "loss": 1.1115, "step": 1179 }, { "epoch": 0.5992889791772473, "grad_norm": 4.537458419799805, "learning_rate": 6.992304741344511e-06, "loss": 1.5648, "step": 1180 }, { "epoch": 0.5997968511934992, "grad_norm": 4.677596569061279, "learning_rate": 6.977014449868113e-06, "loss": 1.5071, "step": 1181 }, { "epoch": 0.6003047232097511, "grad_norm": 4.3316850662231445, "learning_rate": 6.961731932791169e-06, "loss": 1.2314, "step": 1182 }, { "epoch": 0.600812595226003, "grad_norm": 4.620197772979736, "learning_rate": 6.946457229416688e-06, "loss": 1.1893, "step": 1183 }, { "epoch": 0.601320467242255, "grad_norm": 4.2192277908325195, "learning_rate": 6.931190379027565e-06, "loss": 1.407, "step": 1184 }, { "epoch": 0.6018283392585069, "grad_norm": 4.571565628051758, "learning_rate": 6.9159314208865066e-06, "loss": 1.4941, "step": 1185 }, { "epoch": 0.6023362112747588, "grad_norm": 4.276565074920654, "learning_rate": 6.900680394235925e-06, "loss": 1.4298, "step": 1186 }, { "epoch": 0.6028440832910107, "grad_norm": 4.913404941558838, "learning_rate": 6.885437338297831e-06, "loss": 1.5027, "step": 1187 }, { "epoch": 0.6033519553072626, "grad_norm": 4.725199222564697, "learning_rate": 6.870202292273732e-06, "loss": 1.1943, "step": 1188 }, { "epoch": 0.6038598273235145, "grad_norm": 4.427515983581543, "learning_rate": 6.8549752953445496e-06, "loss": 1.2846, "step": 1189 }, { "epoch": 0.6043676993397664, "grad_norm": 4.1241254806518555, "learning_rate": 6.839756386670491e-06, "loss": 1.382, "step": 1190 }, { "epoch": 0.6048755713560183, "grad_norm": 4.1516571044921875, "learning_rate": 6.824545605390964e-06, "loss": 1.2045, "step": 1191 }, { "epoch": 0.6053834433722702, "grad_norm": 4.107656002044678, "learning_rate": 6.809342990624488e-06, "loss": 1.3974, "step": 1192 }, { "epoch": 0.6058913153885221, "grad_norm": 4.163454055786133, "learning_rate": 6.794148581468562e-06, "loss": 1.1562, "step": 1193 }, { "epoch": 0.6063991874047739, "grad_norm": 3.9578378200531006, "learning_rate": 6.778962416999596e-06, "loss": 1.1857, "step": 1194 }, { "epoch": 0.6069070594210259, "grad_norm": 3.630510091781616, "learning_rate": 6.763784536272785e-06, "loss": 1.1776, "step": 1195 }, { "epoch": 0.6074149314372778, "grad_norm": 4.183380126953125, "learning_rate": 6.748614978322034e-06, "loss": 1.3678, "step": 1196 }, { "epoch": 0.6079228034535297, "grad_norm": 4.871096134185791, "learning_rate": 6.733453782159834e-06, "loss": 0.7445, "step": 1197 }, { "epoch": 0.6084306754697816, "grad_norm": 4.2791876792907715, "learning_rate": 6.718300986777167e-06, "loss": 1.2649, "step": 1198 }, { "epoch": 0.6089385474860335, "grad_norm": 4.809388637542725, "learning_rate": 6.7031566311434225e-06, "loss": 1.2088, "step": 1199 }, { "epoch": 0.6094464195022854, "grad_norm": 4.801436901092529, "learning_rate": 6.688020754206283e-06, "loss": 1.0736, "step": 1200 }, { "epoch": 0.6099542915185373, "grad_norm": 4.411201000213623, "learning_rate": 6.672893394891612e-06, "loss": 1.5282, "step": 1201 }, { "epoch": 0.6104621635347892, "grad_norm": 5.016659259796143, "learning_rate": 6.657774592103389e-06, "loss": 1.1121, "step": 1202 }, { "epoch": 0.6109700355510411, "grad_norm": 4.217753887176514, "learning_rate": 6.6426643847235715e-06, "loss": 1.4342, "step": 1203 }, { "epoch": 0.611477907567293, "grad_norm": 3.952265501022339, "learning_rate": 6.627562811612012e-06, "loss": 1.3977, "step": 1204 }, { "epoch": 0.6119857795835449, "grad_norm": 3.797877550125122, "learning_rate": 6.612469911606374e-06, "loss": 1.4424, "step": 1205 }, { "epoch": 0.6124936515997969, "grad_norm": 3.7128665447235107, "learning_rate": 6.5973857235219965e-06, "loss": 1.1818, "step": 1206 }, { "epoch": 0.6130015236160488, "grad_norm": 4.827739715576172, "learning_rate": 6.582310286151824e-06, "loss": 1.4196, "step": 1207 }, { "epoch": 0.6135093956323007, "grad_norm": 3.766799211502075, "learning_rate": 6.567243638266294e-06, "loss": 1.5127, "step": 1208 }, { "epoch": 0.6140172676485526, "grad_norm": 4.022468566894531, "learning_rate": 6.552185818613241e-06, "loss": 1.2584, "step": 1209 }, { "epoch": 0.6145251396648045, "grad_norm": 3.6558563709259033, "learning_rate": 6.537136865917792e-06, "loss": 1.4567, "step": 1210 }, { "epoch": 0.6150330116810564, "grad_norm": 4.016808986663818, "learning_rate": 6.5220968188822686e-06, "loss": 1.5444, "step": 1211 }, { "epoch": 0.6155408836973083, "grad_norm": 4.2901153564453125, "learning_rate": 6.507065716186102e-06, "loss": 1.1884, "step": 1212 }, { "epoch": 0.6160487557135602, "grad_norm": 4.470537185668945, "learning_rate": 6.492043596485703e-06, "loss": 1.1808, "step": 1213 }, { "epoch": 0.6165566277298121, "grad_norm": 4.94361686706543, "learning_rate": 6.477030498414392e-06, "loss": 1.451, "step": 1214 }, { "epoch": 0.617064499746064, "grad_norm": 4.530747890472412, "learning_rate": 6.462026460582284e-06, "loss": 1.333, "step": 1215 }, { "epoch": 0.6175723717623159, "grad_norm": 4.878780841827393, "learning_rate": 6.447031521576197e-06, "loss": 1.3113, "step": 1216 }, { "epoch": 0.6180802437785679, "grad_norm": 4.026434898376465, "learning_rate": 6.4320457199595375e-06, "loss": 1.197, "step": 1217 }, { "epoch": 0.6185881157948198, "grad_norm": 4.1217570304870605, "learning_rate": 6.417069094272231e-06, "loss": 1.3979, "step": 1218 }, { "epoch": 0.6190959878110716, "grad_norm": 4.3459930419921875, "learning_rate": 6.40210168303059e-06, "loss": 1.4266, "step": 1219 }, { "epoch": 0.6196038598273235, "grad_norm": 4.097601413726807, "learning_rate": 6.387143524727233e-06, "loss": 1.4268, "step": 1220 }, { "epoch": 0.6201117318435754, "grad_norm": 3.8987221717834473, "learning_rate": 6.372194657830986e-06, "loss": 1.2839, "step": 1221 }, { "epoch": 0.6206196038598273, "grad_norm": 4.2752299308776855, "learning_rate": 6.357255120786778e-06, "loss": 1.254, "step": 1222 }, { "epoch": 0.6211274758760792, "grad_norm": 4.806422233581543, "learning_rate": 6.3423249520155415e-06, "loss": 1.3237, "step": 1223 }, { "epoch": 0.6216353478923311, "grad_norm": 3.7665746212005615, "learning_rate": 6.327404189914115e-06, "loss": 1.5077, "step": 1224 }, { "epoch": 0.622143219908583, "grad_norm": 4.326169490814209, "learning_rate": 6.312492872855156e-06, "loss": 1.1688, "step": 1225 }, { "epoch": 0.6226510919248349, "grad_norm": 4.449665069580078, "learning_rate": 6.297591039187016e-06, "loss": 1.2381, "step": 1226 }, { "epoch": 0.6231589639410868, "grad_norm": 4.6468658447265625, "learning_rate": 6.282698727233665e-06, "loss": 1.3197, "step": 1227 }, { "epoch": 0.6236668359573387, "grad_norm": 4.137972354888916, "learning_rate": 6.267815975294591e-06, "loss": 1.1778, "step": 1228 }, { "epoch": 0.6241747079735906, "grad_norm": 4.701441287994385, "learning_rate": 6.252942821644688e-06, "loss": 1.2597, "step": 1229 }, { "epoch": 0.6246825799898426, "grad_norm": 3.7897799015045166, "learning_rate": 6.2380793045341635e-06, "loss": 1.3935, "step": 1230 }, { "epoch": 0.6251904520060945, "grad_norm": 4.415806293487549, "learning_rate": 6.223225462188453e-06, "loss": 1.1646, "step": 1231 }, { "epoch": 0.6256983240223464, "grad_norm": 5.204166889190674, "learning_rate": 6.2083813328081035e-06, "loss": 1.3506, "step": 1232 }, { "epoch": 0.6262061960385983, "grad_norm": 4.363425254821777, "learning_rate": 6.193546954568677e-06, "loss": 1.2515, "step": 1233 }, { "epoch": 0.6267140680548502, "grad_norm": 4.286894798278809, "learning_rate": 6.1787223656206705e-06, "loss": 1.2642, "step": 1234 }, { "epoch": 0.6272219400711021, "grad_norm": 3.8692104816436768, "learning_rate": 6.163907604089397e-06, "loss": 1.3592, "step": 1235 }, { "epoch": 0.627729812087354, "grad_norm": 5.8255791664123535, "learning_rate": 6.149102708074896e-06, "loss": 1.0055, "step": 1236 }, { "epoch": 0.6282376841036059, "grad_norm": 4.127172946929932, "learning_rate": 6.134307715651842e-06, "loss": 1.4512, "step": 1237 }, { "epoch": 0.6287455561198578, "grad_norm": 4.500064373016357, "learning_rate": 6.119522664869431e-06, "loss": 1.079, "step": 1238 }, { "epoch": 0.6292534281361097, "grad_norm": 5.0319976806640625, "learning_rate": 6.104747593751296e-06, "loss": 1.4411, "step": 1239 }, { "epoch": 0.6297613001523616, "grad_norm": 5.003214359283447, "learning_rate": 6.089982540295398e-06, "loss": 1.2222, "step": 1240 }, { "epoch": 0.6302691721686136, "grad_norm": 4.036032676696777, "learning_rate": 6.075227542473954e-06, "loss": 1.3097, "step": 1241 }, { "epoch": 0.6307770441848655, "grad_norm": 4.442774772644043, "learning_rate": 6.060482638233295e-06, "loss": 1.1824, "step": 1242 }, { "epoch": 0.6312849162011173, "grad_norm": 5.789218902587891, "learning_rate": 6.04574786549381e-06, "loss": 0.9217, "step": 1243 }, { "epoch": 0.6317927882173692, "grad_norm": 3.8436834812164307, "learning_rate": 6.031023262149831e-06, "loss": 1.3202, "step": 1244 }, { "epoch": 0.6323006602336211, "grad_norm": 4.961956977844238, "learning_rate": 6.016308866069532e-06, "loss": 1.5101, "step": 1245 }, { "epoch": 0.632808532249873, "grad_norm": 3.7859721183776855, "learning_rate": 6.001604715094834e-06, "loss": 1.3134, "step": 1246 }, { "epoch": 0.6333164042661249, "grad_norm": 5.033949375152588, "learning_rate": 5.986910847041322e-06, "loss": 1.2267, "step": 1247 }, { "epoch": 0.6338242762823768, "grad_norm": 3.844045400619507, "learning_rate": 5.972227299698121e-06, "loss": 1.5616, "step": 1248 }, { "epoch": 0.6343321482986287, "grad_norm": 5.14638090133667, "learning_rate": 5.957554110827823e-06, "loss": 1.4153, "step": 1249 }, { "epoch": 0.6348400203148806, "grad_norm": 4.653091907501221, "learning_rate": 5.94289131816638e-06, "loss": 1.0728, "step": 1250 }, { "epoch": 0.6353478923311325, "grad_norm": 4.308130264282227, "learning_rate": 5.928238959423005e-06, "loss": 1.2786, "step": 1251 }, { "epoch": 0.6358557643473844, "grad_norm": 4.122445106506348, "learning_rate": 5.9135970722800765e-06, "loss": 1.3041, "step": 1252 }, { "epoch": 0.6363636363636364, "grad_norm": 4.471611976623535, "learning_rate": 5.8989656943930394e-06, "loss": 1.3083, "step": 1253 }, { "epoch": 0.6368715083798883, "grad_norm": 4.624831199645996, "learning_rate": 5.884344863390326e-06, "loss": 1.4186, "step": 1254 }, { "epoch": 0.6373793803961402, "grad_norm": 4.060239315032959, "learning_rate": 5.869734616873226e-06, "loss": 1.2426, "step": 1255 }, { "epoch": 0.6378872524123921, "grad_norm": 3.788975477218628, "learning_rate": 5.855134992415819e-06, "loss": 1.3551, "step": 1256 }, { "epoch": 0.638395124428644, "grad_norm": 4.169189929962158, "learning_rate": 5.840546027564866e-06, "loss": 1.418, "step": 1257 }, { "epoch": 0.6389029964448959, "grad_norm": 4.810233116149902, "learning_rate": 5.825967759839715e-06, "loss": 1.3121, "step": 1258 }, { "epoch": 0.6394108684611478, "grad_norm": 4.231508255004883, "learning_rate": 5.811400226732194e-06, "loss": 1.4864, "step": 1259 }, { "epoch": 0.6399187404773997, "grad_norm": 5.441370487213135, "learning_rate": 5.796843465706539e-06, "loss": 1.0832, "step": 1260 }, { "epoch": 0.6404266124936516, "grad_norm": 5.238120079040527, "learning_rate": 5.782297514199274e-06, "loss": 1.1778, "step": 1261 }, { "epoch": 0.6409344845099035, "grad_norm": 5.060118675231934, "learning_rate": 5.767762409619119e-06, "loss": 0.9094, "step": 1262 }, { "epoch": 0.6414423565261554, "grad_norm": 5.235774040222168, "learning_rate": 5.753238189346912e-06, "loss": 1.1636, "step": 1263 }, { "epoch": 0.6419502285424074, "grad_norm": 4.136366844177246, "learning_rate": 5.738724890735487e-06, "loss": 1.2629, "step": 1264 }, { "epoch": 0.6424581005586593, "grad_norm": 4.454113483428955, "learning_rate": 5.724222551109593e-06, "loss": 1.3022, "step": 1265 }, { "epoch": 0.6429659725749112, "grad_norm": 5.6957879066467285, "learning_rate": 5.709731207765802e-06, "loss": 1.205, "step": 1266 }, { "epoch": 0.6434738445911631, "grad_norm": 4.1256279945373535, "learning_rate": 5.695250897972393e-06, "loss": 1.1335, "step": 1267 }, { "epoch": 0.6439817166074149, "grad_norm": 4.44126558303833, "learning_rate": 5.6807816589692836e-06, "loss": 1.2609, "step": 1268 }, { "epoch": 0.6444895886236668, "grad_norm": 4.811086654663086, "learning_rate": 5.666323527967908e-06, "loss": 1.1655, "step": 1269 }, { "epoch": 0.6449974606399187, "grad_norm": 4.9102396965026855, "learning_rate": 5.6518765421511444e-06, "loss": 1.2877, "step": 1270 }, { "epoch": 0.6455053326561706, "grad_norm": 4.933690547943115, "learning_rate": 5.637440738673199e-06, "loss": 1.3126, "step": 1271 }, { "epoch": 0.6460132046724225, "grad_norm": 4.120951175689697, "learning_rate": 5.6230161546595195e-06, "loss": 1.2685, "step": 1272 }, { "epoch": 0.6465210766886744, "grad_norm": 3.8976340293884277, "learning_rate": 5.608602827206713e-06, "loss": 1.2991, "step": 1273 }, { "epoch": 0.6470289487049263, "grad_norm": 4.237303256988525, "learning_rate": 5.59420079338242e-06, "loss": 1.1338, "step": 1274 }, { "epoch": 0.6475368207211782, "grad_norm": 4.574217319488525, "learning_rate": 5.579810090225242e-06, "loss": 1.3666, "step": 1275 }, { "epoch": 0.6480446927374302, "grad_norm": 3.9830265045166016, "learning_rate": 5.565430754744652e-06, "loss": 1.3125, "step": 1276 }, { "epoch": 0.6485525647536821, "grad_norm": 4.050839424133301, "learning_rate": 5.551062823920874e-06, "loss": 1.3415, "step": 1277 }, { "epoch": 0.649060436769934, "grad_norm": 5.43248987197876, "learning_rate": 5.5367063347048044e-06, "loss": 1.2401, "step": 1278 }, { "epoch": 0.6495683087861859, "grad_norm": 4.734487056732178, "learning_rate": 5.522361324017922e-06, "loss": 1.1555, "step": 1279 }, { "epoch": 0.6500761808024378, "grad_norm": 4.565279006958008, "learning_rate": 5.508027828752181e-06, "loss": 1.3278, "step": 1280 }, { "epoch": 0.6505840528186897, "grad_norm": 5.360850811004639, "learning_rate": 5.493705885769911e-06, "loss": 1.375, "step": 1281 }, { "epoch": 0.6510919248349416, "grad_norm": 5.566100120544434, "learning_rate": 5.47939553190375e-06, "loss": 1.2314, "step": 1282 }, { "epoch": 0.6515997968511935, "grad_norm": 4.226099967956543, "learning_rate": 5.465096803956525e-06, "loss": 1.4235, "step": 1283 }, { "epoch": 0.6521076688674454, "grad_norm": 4.158005237579346, "learning_rate": 5.450809738701155e-06, "loss": 1.2179, "step": 1284 }, { "epoch": 0.6526155408836973, "grad_norm": 3.9321670532226562, "learning_rate": 5.436534372880572e-06, "loss": 1.1007, "step": 1285 }, { "epoch": 0.6531234128999492, "grad_norm": 5.232217311859131, "learning_rate": 5.4222707432076264e-06, "loss": 1.3191, "step": 1286 }, { "epoch": 0.6536312849162011, "grad_norm": 4.562361240386963, "learning_rate": 5.4080188863649745e-06, "loss": 1.1464, "step": 1287 }, { "epoch": 0.6541391569324531, "grad_norm": 4.1017069816589355, "learning_rate": 5.393778839004997e-06, "loss": 1.4151, "step": 1288 }, { "epoch": 0.654647028948705, "grad_norm": 4.379333019256592, "learning_rate": 5.379550637749718e-06, "loss": 1.3093, "step": 1289 }, { "epoch": 0.6551549009649569, "grad_norm": 5.013577938079834, "learning_rate": 5.365334319190676e-06, "loss": 1.271, "step": 1290 }, { "epoch": 0.6556627729812088, "grad_norm": 4.2003679275512695, "learning_rate": 5.351129919888858e-06, "loss": 1.4057, "step": 1291 }, { "epoch": 0.6561706449974607, "grad_norm": 4.2302727699279785, "learning_rate": 5.336937476374608e-06, "loss": 1.1751, "step": 1292 }, { "epoch": 0.6566785170137125, "grad_norm": 4.532559394836426, "learning_rate": 5.322757025147507e-06, "loss": 1.1194, "step": 1293 }, { "epoch": 0.6571863890299644, "grad_norm": 5.613903522491455, "learning_rate": 5.308588602676299e-06, "loss": 0.8151, "step": 1294 }, { "epoch": 0.6576942610462163, "grad_norm": 3.8187813758850098, "learning_rate": 5.294432245398801e-06, "loss": 1.3571, "step": 1295 }, { "epoch": 0.6582021330624682, "grad_norm": 4.5164031982421875, "learning_rate": 5.2802879897217886e-06, "loss": 1.1092, "step": 1296 }, { "epoch": 0.6587100050787201, "grad_norm": 4.190432071685791, "learning_rate": 5.266155872020931e-06, "loss": 1.2389, "step": 1297 }, { "epoch": 0.659217877094972, "grad_norm": 4.01939582824707, "learning_rate": 5.25203592864066e-06, "loss": 1.1223, "step": 1298 }, { "epoch": 0.659725749111224, "grad_norm": 3.84106183052063, "learning_rate": 5.23792819589412e-06, "loss": 1.329, "step": 1299 }, { "epoch": 0.6602336211274759, "grad_norm": 5.062501430511475, "learning_rate": 5.223832710063039e-06, "loss": 1.3425, "step": 1300 }, { "epoch": 0.6607414931437278, "grad_norm": 4.87921142578125, "learning_rate": 5.209749507397648e-06, "loss": 1.1282, "step": 1301 }, { "epoch": 0.6612493651599797, "grad_norm": 4.17822265625, "learning_rate": 5.1956786241166005e-06, "loss": 1.2733, "step": 1302 }, { "epoch": 0.6617572371762316, "grad_norm": 4.390413284301758, "learning_rate": 5.181620096406857e-06, "loss": 1.1431, "step": 1303 }, { "epoch": 0.6622651091924835, "grad_norm": 5.3159589767456055, "learning_rate": 5.167573960423598e-06, "loss": 0.8811, "step": 1304 }, { "epoch": 0.6627729812087354, "grad_norm": 4.213495254516602, "learning_rate": 5.153540252290154e-06, "loss": 1.1306, "step": 1305 }, { "epoch": 0.6632808532249873, "grad_norm": 4.159801483154297, "learning_rate": 5.139519008097877e-06, "loss": 1.0229, "step": 1306 }, { "epoch": 0.6637887252412392, "grad_norm": 3.946521282196045, "learning_rate": 5.1255102639060686e-06, "loss": 1.4018, "step": 1307 }, { "epoch": 0.6642965972574911, "grad_norm": 6.075663089752197, "learning_rate": 5.111514055741891e-06, "loss": 1.3182, "step": 1308 }, { "epoch": 0.664804469273743, "grad_norm": 3.9616241455078125, "learning_rate": 5.097530419600254e-06, "loss": 1.544, "step": 1309 }, { "epoch": 0.665312341289995, "grad_norm": 4.886152744293213, "learning_rate": 5.083559391443751e-06, "loss": 1.2317, "step": 1310 }, { "epoch": 0.6658202133062469, "grad_norm": 4.9962897300720215, "learning_rate": 5.069601007202531e-06, "loss": 1.4778, "step": 1311 }, { "epoch": 0.6663280853224988, "grad_norm": 4.1560139656066895, "learning_rate": 5.055655302774247e-06, "loss": 1.4843, "step": 1312 }, { "epoch": 0.6668359573387507, "grad_norm": 6.246951580047607, "learning_rate": 5.041722314023927e-06, "loss": 1.2443, "step": 1313 }, { "epoch": 0.6673438293550026, "grad_norm": 4.550904750823975, "learning_rate": 5.027802076783898e-06, "loss": 1.207, "step": 1314 }, { "epoch": 0.6678517013712545, "grad_norm": 5.027195930480957, "learning_rate": 5.013894626853708e-06, "loss": 1.1246, "step": 1315 }, { "epoch": 0.6683595733875064, "grad_norm": 4.895467758178711, "learning_rate": 5.000000000000003e-06, "loss": 0.9455, "step": 1316 }, { "epoch": 0.6688674454037582, "grad_norm": 4.147242546081543, "learning_rate": 4.986118231956452e-06, "loss": 1.2378, "step": 1317 }, { "epoch": 0.6693753174200101, "grad_norm": 4.623603343963623, "learning_rate": 4.972249358423668e-06, "loss": 1.3515, "step": 1318 }, { "epoch": 0.669883189436262, "grad_norm": 4.853110313415527, "learning_rate": 4.958393415069089e-06, "loss": 1.3405, "step": 1319 }, { "epoch": 0.6703910614525139, "grad_norm": 4.2498908042907715, "learning_rate": 4.944550437526902e-06, "loss": 1.3609, "step": 1320 }, { "epoch": 0.6708989334687658, "grad_norm": 4.73717737197876, "learning_rate": 4.930720461397956e-06, "loss": 1.0625, "step": 1321 }, { "epoch": 0.6714068054850177, "grad_norm": 4.079125881195068, "learning_rate": 4.916903522249656e-06, "loss": 1.3944, "step": 1322 }, { "epoch": 0.6719146775012697, "grad_norm": 4.965070724487305, "learning_rate": 4.9030996556158775e-06, "loss": 1.5302, "step": 1323 }, { "epoch": 0.6724225495175216, "grad_norm": 4.086692810058594, "learning_rate": 4.889308896996885e-06, "loss": 1.0986, "step": 1324 }, { "epoch": 0.6729304215337735, "grad_norm": 4.803402423858643, "learning_rate": 4.875531281859231e-06, "loss": 1.2473, "step": 1325 }, { "epoch": 0.6734382935500254, "grad_norm": 4.7739057540893555, "learning_rate": 4.861766845635659e-06, "loss": 1.4185, "step": 1326 }, { "epoch": 0.6739461655662773, "grad_norm": 4.143311977386475, "learning_rate": 4.84801562372502e-06, "loss": 1.3922, "step": 1327 }, { "epoch": 0.6744540375825292, "grad_norm": 4.873647212982178, "learning_rate": 4.834277651492192e-06, "loss": 1.3831, "step": 1328 }, { "epoch": 0.6749619095987811, "grad_norm": 4.5239763259887695, "learning_rate": 4.8205529642679675e-06, "loss": 1.4986, "step": 1329 }, { "epoch": 0.675469781615033, "grad_norm": 4.792907238006592, "learning_rate": 4.80684159734897e-06, "loss": 1.1446, "step": 1330 }, { "epoch": 0.6759776536312849, "grad_norm": 5.070479869842529, "learning_rate": 4.793143585997583e-06, "loss": 1.1759, "step": 1331 }, { "epoch": 0.6764855256475368, "grad_norm": 4.775655746459961, "learning_rate": 4.779458965441826e-06, "loss": 1.2957, "step": 1332 }, { "epoch": 0.6769933976637887, "grad_norm": 5.204723834991455, "learning_rate": 4.765787770875282e-06, "loss": 1.3555, "step": 1333 }, { "epoch": 0.6775012696800407, "grad_norm": 4.274050235748291, "learning_rate": 4.75213003745702e-06, "loss": 1.2904, "step": 1334 }, { "epoch": 0.6780091416962926, "grad_norm": 6.0831170082092285, "learning_rate": 4.738485800311475e-06, "loss": 1.1394, "step": 1335 }, { "epoch": 0.6785170137125445, "grad_norm": 4.034609317779541, "learning_rate": 4.724855094528374e-06, "loss": 1.367, "step": 1336 }, { "epoch": 0.6790248857287964, "grad_norm": 4.201053619384766, "learning_rate": 4.711237955162659e-06, "loss": 1.3293, "step": 1337 }, { "epoch": 0.6795327577450483, "grad_norm": 4.243471622467041, "learning_rate": 4.69763441723436e-06, "loss": 1.2624, "step": 1338 }, { "epoch": 0.6800406297613002, "grad_norm": 4.744422912597656, "learning_rate": 4.684044515728549e-06, "loss": 1.1923, "step": 1339 }, { "epoch": 0.6805485017775521, "grad_norm": 5.497531890869141, "learning_rate": 4.670468285595213e-06, "loss": 1.0948, "step": 1340 }, { "epoch": 0.681056373793804, "grad_norm": 4.373364448547363, "learning_rate": 4.656905761749191e-06, "loss": 1.3205, "step": 1341 }, { "epoch": 0.6815642458100558, "grad_norm": 4.560938358306885, "learning_rate": 4.6433569790700625e-06, "loss": 1.5145, "step": 1342 }, { "epoch": 0.6820721178263077, "grad_norm": 6.305968761444092, "learning_rate": 4.62982197240207e-06, "loss": 0.8212, "step": 1343 }, { "epoch": 0.6825799898425596, "grad_norm": 4.598505973815918, "learning_rate": 4.616300776554035e-06, "loss": 1.3917, "step": 1344 }, { "epoch": 0.6830878618588115, "grad_norm": 6.314585208892822, "learning_rate": 4.602793426299255e-06, "loss": 1.108, "step": 1345 }, { "epoch": 0.6835957338750634, "grad_norm": 5.1614813804626465, "learning_rate": 4.589299956375415e-06, "loss": 1.3614, "step": 1346 }, { "epoch": 0.6841036058913154, "grad_norm": 4.432321071624756, "learning_rate": 4.575820401484516e-06, "loss": 1.2833, "step": 1347 }, { "epoch": 0.6846114779075673, "grad_norm": 5.343625068664551, "learning_rate": 4.562354796292761e-06, "loss": 1.0535, "step": 1348 }, { "epoch": 0.6851193499238192, "grad_norm": 5.0347161293029785, "learning_rate": 4.548903175430482e-06, "loss": 0.8767, "step": 1349 }, { "epoch": 0.6856272219400711, "grad_norm": 5.4508585929870605, "learning_rate": 4.5354655734920495e-06, "loss": 1.0727, "step": 1350 }, { "epoch": 0.686135093956323, "grad_norm": 4.261341094970703, "learning_rate": 4.522042025035773e-06, "loss": 1.1111, "step": 1351 }, { "epoch": 0.6866429659725749, "grad_norm": 4.242602825164795, "learning_rate": 4.508632564583832e-06, "loss": 1.2206, "step": 1352 }, { "epoch": 0.6871508379888268, "grad_norm": 4.068726062774658, "learning_rate": 4.495237226622161e-06, "loss": 1.2067, "step": 1353 }, { "epoch": 0.6876587100050787, "grad_norm": 4.812721252441406, "learning_rate": 4.481856045600388e-06, "loss": 1.2238, "step": 1354 }, { "epoch": 0.6881665820213306, "grad_norm": 4.183948516845703, "learning_rate": 4.468489055931723e-06, "loss": 1.1906, "step": 1355 }, { "epoch": 0.6886744540375825, "grad_norm": 4.936746597290039, "learning_rate": 4.455136291992877e-06, "loss": 0.9373, "step": 1356 }, { "epoch": 0.6891823260538344, "grad_norm": 4.554867267608643, "learning_rate": 4.4417977881239925e-06, "loss": 1.2939, "step": 1357 }, { "epoch": 0.6896901980700864, "grad_norm": 6.861904144287109, "learning_rate": 4.428473578628522e-06, "loss": 0.9068, "step": 1358 }, { "epoch": 0.6901980700863383, "grad_norm": 5.396211624145508, "learning_rate": 4.415163697773155e-06, "loss": 1.2767, "step": 1359 }, { "epoch": 0.6907059421025902, "grad_norm": 4.2031168937683105, "learning_rate": 4.401868179787748e-06, "loss": 1.3721, "step": 1360 }, { "epoch": 0.6912138141188421, "grad_norm": 4.868252277374268, "learning_rate": 4.388587058865207e-06, "loss": 1.1377, "step": 1361 }, { "epoch": 0.691721686135094, "grad_norm": 5.334263801574707, "learning_rate": 4.375320369161408e-06, "loss": 1.2758, "step": 1362 }, { "epoch": 0.6922295581513459, "grad_norm": 4.449823379516602, "learning_rate": 4.362068144795127e-06, "loss": 1.3147, "step": 1363 }, { "epoch": 0.6927374301675978, "grad_norm": 4.9963250160217285, "learning_rate": 4.34883041984793e-06, "loss": 1.4626, "step": 1364 }, { "epoch": 0.6932453021838497, "grad_norm": 4.718398094177246, "learning_rate": 4.33560722836409e-06, "loss": 1.5417, "step": 1365 }, { "epoch": 0.6937531742001015, "grad_norm": 4.16678524017334, "learning_rate": 4.322398604350518e-06, "loss": 1.3918, "step": 1366 }, { "epoch": 0.6942610462163534, "grad_norm": 5.61831521987915, "learning_rate": 4.3092045817766426e-06, "loss": 1.0232, "step": 1367 }, { "epoch": 0.6947689182326053, "grad_norm": 4.456976890563965, "learning_rate": 4.296025194574359e-06, "loss": 1.3631, "step": 1368 }, { "epoch": 0.6952767902488572, "grad_norm": 4.0251359939575195, "learning_rate": 4.282860476637906e-06, "loss": 1.3043, "step": 1369 }, { "epoch": 0.6957846622651092, "grad_norm": 5.202916145324707, "learning_rate": 4.269710461823813e-06, "loss": 1.3112, "step": 1370 }, { "epoch": 0.6962925342813611, "grad_norm": 4.528244495391846, "learning_rate": 4.2565751839507855e-06, "loss": 1.2985, "step": 1371 }, { "epoch": 0.696800406297613, "grad_norm": 6.558594703674316, "learning_rate": 4.243454676799628e-06, "loss": 0.8046, "step": 1372 }, { "epoch": 0.6973082783138649, "grad_norm": 4.963061809539795, "learning_rate": 4.230348974113172e-06, "loss": 1.2971, "step": 1373 }, { "epoch": 0.6978161503301168, "grad_norm": 4.163506031036377, "learning_rate": 4.217258109596161e-06, "loss": 1.2361, "step": 1374 }, { "epoch": 0.6983240223463687, "grad_norm": 5.891390323638916, "learning_rate": 4.204182116915179e-06, "loss": 1.0048, "step": 1375 }, { "epoch": 0.6988318943626206, "grad_norm": 4.587854385375977, "learning_rate": 4.191121029698575e-06, "loss": 1.3254, "step": 1376 }, { "epoch": 0.6993397663788725, "grad_norm": 4.640517711639404, "learning_rate": 4.1780748815363545e-06, "loss": 1.2664, "step": 1377 }, { "epoch": 0.6998476383951244, "grad_norm": 5.369966983795166, "learning_rate": 4.165043705980102e-06, "loss": 1.1808, "step": 1378 }, { "epoch": 0.7003555104113763, "grad_norm": 4.568151950836182, "learning_rate": 4.152027536542908e-06, "loss": 0.9268, "step": 1379 }, { "epoch": 0.7008633824276282, "grad_norm": 4.4422125816345215, "learning_rate": 4.139026406699254e-06, "loss": 1.3176, "step": 1380 }, { "epoch": 0.7013712544438802, "grad_norm": 5.446008682250977, "learning_rate": 4.12604034988496e-06, "loss": 1.3481, "step": 1381 }, { "epoch": 0.7018791264601321, "grad_norm": 4.825053691864014, "learning_rate": 4.113069399497067e-06, "loss": 1.3604, "step": 1382 }, { "epoch": 0.702386998476384, "grad_norm": 4.3071136474609375, "learning_rate": 4.1001135888937795e-06, "loss": 1.3605, "step": 1383 }, { "epoch": 0.7028948704926359, "grad_norm": 4.9159040451049805, "learning_rate": 4.087172951394356e-06, "loss": 1.2433, "step": 1384 }, { "epoch": 0.7034027425088878, "grad_norm": 4.7212347984313965, "learning_rate": 4.074247520279033e-06, "loss": 1.2597, "step": 1385 }, { "epoch": 0.7039106145251397, "grad_norm": 4.443634986877441, "learning_rate": 4.061337328788951e-06, "loss": 1.2344, "step": 1386 }, { "epoch": 0.7044184865413916, "grad_norm": 4.415527820587158, "learning_rate": 4.0484424101260465e-06, "loss": 1.3632, "step": 1387 }, { "epoch": 0.7049263585576435, "grad_norm": 5.799596309661865, "learning_rate": 4.0355627974529775e-06, "loss": 1.4293, "step": 1388 }, { "epoch": 0.7054342305738954, "grad_norm": 4.581190586090088, "learning_rate": 4.022698523893052e-06, "loss": 1.2923, "step": 1389 }, { "epoch": 0.7059421025901473, "grad_norm": 5.361939907073975, "learning_rate": 4.009849622530113e-06, "loss": 1.2167, "step": 1390 }, { "epoch": 0.7064499746063991, "grad_norm": 4.397521495819092, "learning_rate": 3.997016126408477e-06, "loss": 1.2955, "step": 1391 }, { "epoch": 0.706957846622651, "grad_norm": 4.393995761871338, "learning_rate": 3.984198068532848e-06, "loss": 1.3823, "step": 1392 }, { "epoch": 0.707465718638903, "grad_norm": 2.7322566509246826, "learning_rate": 3.971395481868218e-06, "loss": 0.3306, "step": 1393 }, { "epoch": 0.7079735906551549, "grad_norm": 5.019045352935791, "learning_rate": 3.9586083993397874e-06, "loss": 1.344, "step": 1394 }, { "epoch": 0.7084814626714068, "grad_norm": 4.308985710144043, "learning_rate": 3.945836853832895e-06, "loss": 1.353, "step": 1395 }, { "epoch": 0.7089893346876587, "grad_norm": 4.66213846206665, "learning_rate": 3.933080878192919e-06, "loss": 1.2769, "step": 1396 }, { "epoch": 0.7094972067039106, "grad_norm": 5.00924825668335, "learning_rate": 3.920340505225192e-06, "loss": 1.1399, "step": 1397 }, { "epoch": 0.7100050787201625, "grad_norm": 5.90763521194458, "learning_rate": 3.907615767694916e-06, "loss": 1.2984, "step": 1398 }, { "epoch": 0.7105129507364144, "grad_norm": 7.6582465171813965, "learning_rate": 3.894906698327095e-06, "loss": 0.9014, "step": 1399 }, { "epoch": 0.7110208227526663, "grad_norm": 4.450231552124023, "learning_rate": 3.8822133298064264e-06, "loss": 1.484, "step": 1400 }, { "epoch": 0.7115286947689182, "grad_norm": 5.579190254211426, "learning_rate": 3.869535694777232e-06, "loss": 1.3797, "step": 1401 }, { "epoch": 0.7120365667851701, "grad_norm": 4.927333831787109, "learning_rate": 3.856873825843379e-06, "loss": 1.3417, "step": 1402 }, { "epoch": 0.712544438801422, "grad_norm": 4.601704120635986, "learning_rate": 3.844227755568177e-06, "loss": 1.2373, "step": 1403 }, { "epoch": 0.713052310817674, "grad_norm": 4.791215419769287, "learning_rate": 3.831597516474306e-06, "loss": 1.3297, "step": 1404 }, { "epoch": 0.7135601828339259, "grad_norm": 4.16111421585083, "learning_rate": 3.818983141043742e-06, "loss": 1.4404, "step": 1405 }, { "epoch": 0.7140680548501778, "grad_norm": 4.602897644042969, "learning_rate": 3.806384661717655e-06, "loss": 1.3687, "step": 1406 }, { "epoch": 0.7145759268664297, "grad_norm": 5.141113758087158, "learning_rate": 3.7938021108963297e-06, "loss": 1.0342, "step": 1407 }, { "epoch": 0.7150837988826816, "grad_norm": 4.493198871612549, "learning_rate": 3.7812355209391003e-06, "loss": 1.2501, "step": 1408 }, { "epoch": 0.7155916708989335, "grad_norm": 4.949621677398682, "learning_rate": 3.768684924164241e-06, "loss": 1.3255, "step": 1409 }, { "epoch": 0.7160995429151854, "grad_norm": 3.930460214614868, "learning_rate": 3.7561503528489007e-06, "loss": 1.3839, "step": 1410 }, { "epoch": 0.7166074149314373, "grad_norm": 5.201033115386963, "learning_rate": 3.7436318392290182e-06, "loss": 1.144, "step": 1411 }, { "epoch": 0.7171152869476892, "grad_norm": 4.482243537902832, "learning_rate": 3.7311294154992296e-06, "loss": 1.2822, "step": 1412 }, { "epoch": 0.7176231589639411, "grad_norm": 4.71453332901001, "learning_rate": 3.718643113812792e-06, "loss": 1.3802, "step": 1413 }, { "epoch": 0.718131030980193, "grad_norm": 5.0039472579956055, "learning_rate": 3.7061729662814984e-06, "loss": 1.1548, "step": 1414 }, { "epoch": 0.718638902996445, "grad_norm": 4.845503330230713, "learning_rate": 3.693719004975609e-06, "loss": 1.2801, "step": 1415 }, { "epoch": 0.7191467750126967, "grad_norm": 5.027263164520264, "learning_rate": 3.6812812619237435e-06, "loss": 1.288, "step": 1416 }, { "epoch": 0.7196546470289487, "grad_norm": 4.287696361541748, "learning_rate": 3.668859769112815e-06, "loss": 1.2276, "step": 1417 }, { "epoch": 0.7201625190452006, "grad_norm": 4.9991936683654785, "learning_rate": 3.6564545584879543e-06, "loss": 1.2035, "step": 1418 }, { "epoch": 0.7206703910614525, "grad_norm": 4.15277624130249, "learning_rate": 3.644065661952406e-06, "loss": 1.316, "step": 1419 }, { "epoch": 0.7211782630777044, "grad_norm": 4.639977931976318, "learning_rate": 3.6316931113674615e-06, "loss": 1.2438, "step": 1420 }, { "epoch": 0.7216861350939563, "grad_norm": 5.692741870880127, "learning_rate": 3.6193369385523834e-06, "loss": 1.2181, "step": 1421 }, { "epoch": 0.7221940071102082, "grad_norm": 5.025320053100586, "learning_rate": 3.6069971752843015e-06, "loss": 1.2688, "step": 1422 }, { "epoch": 0.7227018791264601, "grad_norm": 6.0205888748168945, "learning_rate": 3.594673853298156e-06, "loss": 1.202, "step": 1423 }, { "epoch": 0.723209751142712, "grad_norm": 4.887278079986572, "learning_rate": 3.582367004286593e-06, "loss": 1.2262, "step": 1424 }, { "epoch": 0.7237176231589639, "grad_norm": 5.552386283874512, "learning_rate": 3.570076659899906e-06, "loss": 1.0381, "step": 1425 }, { "epoch": 0.7242254951752158, "grad_norm": 4.675529956817627, "learning_rate": 3.557802851745932e-06, "loss": 1.3177, "step": 1426 }, { "epoch": 0.7247333671914677, "grad_norm": 6.0183186531066895, "learning_rate": 3.545545611389981e-06, "loss": 1.0576, "step": 1427 }, { "epoch": 0.7252412392077197, "grad_norm": 3.846447229385376, "learning_rate": 3.5333049703547663e-06, "loss": 1.4093, "step": 1428 }, { "epoch": 0.7257491112239716, "grad_norm": 4.579927921295166, "learning_rate": 3.5210809601203e-06, "loss": 1.176, "step": 1429 }, { "epoch": 0.7262569832402235, "grad_norm": 5.084172248840332, "learning_rate": 3.5088736121238243e-06, "loss": 1.2232, "step": 1430 }, { "epoch": 0.7267648552564754, "grad_norm": 4.2836594581604, "learning_rate": 3.4966829577597392e-06, "loss": 1.1602, "step": 1431 }, { "epoch": 0.7272727272727273, "grad_norm": 4.318619728088379, "learning_rate": 3.4845090283795046e-06, "loss": 1.2524, "step": 1432 }, { "epoch": 0.7277805992889792, "grad_norm": 5.638737201690674, "learning_rate": 3.4723518552915656e-06, "loss": 0.764, "step": 1433 }, { "epoch": 0.7282884713052311, "grad_norm": 5.3186774253845215, "learning_rate": 3.4602114697612864e-06, "loss": 1.1524, "step": 1434 }, { "epoch": 0.728796343321483, "grad_norm": 4.445633888244629, "learning_rate": 3.4480879030108436e-06, "loss": 1.3725, "step": 1435 }, { "epoch": 0.7293042153377349, "grad_norm": 4.598139762878418, "learning_rate": 3.435981186219165e-06, "loss": 1.3252, "step": 1436 }, { "epoch": 0.7298120873539868, "grad_norm": 4.246560573577881, "learning_rate": 3.4238913505218508e-06, "loss": 1.0359, "step": 1437 }, { "epoch": 0.7303199593702387, "grad_norm": 5.461703300476074, "learning_rate": 3.4118184270110742e-06, "loss": 1.0321, "step": 1438 }, { "epoch": 0.7308278313864907, "grad_norm": 4.48328161239624, "learning_rate": 3.3997624467355274e-06, "loss": 1.3616, "step": 1439 }, { "epoch": 0.7313357034027425, "grad_norm": 5.669865131378174, "learning_rate": 3.3877234407003223e-06, "loss": 1.1658, "step": 1440 }, { "epoch": 0.7318435754189944, "grad_norm": 4.731591701507568, "learning_rate": 3.375701439866916e-06, "loss": 1.2121, "step": 1441 }, { "epoch": 0.7323514474352463, "grad_norm": 5.656239986419678, "learning_rate": 3.3636964751530352e-06, "loss": 1.0572, "step": 1442 }, { "epoch": 0.7328593194514982, "grad_norm": 6.066091537475586, "learning_rate": 3.351708577432586e-06, "loss": 1.0253, "step": 1443 }, { "epoch": 0.7333671914677501, "grad_norm": 4.80064058303833, "learning_rate": 3.339737777535599e-06, "loss": 1.2829, "step": 1444 }, { "epoch": 0.733875063484002, "grad_norm": 4.782726287841797, "learning_rate": 3.3277841062481166e-06, "loss": 1.1591, "step": 1445 }, { "epoch": 0.7343829355002539, "grad_norm": 5.189933776855469, "learning_rate": 3.315847594312134e-06, "loss": 0.8328, "step": 1446 }, { "epoch": 0.7348908075165058, "grad_norm": 5.7964911460876465, "learning_rate": 3.303928272425526e-06, "loss": 1.1676, "step": 1447 }, { "epoch": 0.7353986795327577, "grad_norm": 4.1361212730407715, "learning_rate": 3.29202617124195e-06, "loss": 1.2228, "step": 1448 }, { "epoch": 0.7359065515490096, "grad_norm": 4.595512390136719, "learning_rate": 3.280141321370772e-06, "loss": 0.9714, "step": 1449 }, { "epoch": 0.7364144235652615, "grad_norm": 4.6223931312561035, "learning_rate": 3.268273753377008e-06, "loss": 1.3393, "step": 1450 }, { "epoch": 0.7369222955815135, "grad_norm": 5.296187400817871, "learning_rate": 3.2564234977812095e-06, "loss": 1.3692, "step": 1451 }, { "epoch": 0.7374301675977654, "grad_norm": 4.795011043548584, "learning_rate": 3.2445905850594205e-06, "loss": 1.3872, "step": 1452 }, { "epoch": 0.7379380396140173, "grad_norm": 6.799955368041992, "learning_rate": 3.2327750456430706e-06, "loss": 1.3781, "step": 1453 }, { "epoch": 0.7384459116302692, "grad_norm": 4.597665309906006, "learning_rate": 3.2209769099189225e-06, "loss": 1.1434, "step": 1454 }, { "epoch": 0.7389537836465211, "grad_norm": 4.368586540222168, "learning_rate": 3.2091962082289707e-06, "loss": 1.1225, "step": 1455 }, { "epoch": 0.739461655662773, "grad_norm": 4.291135787963867, "learning_rate": 3.197432970870372e-06, "loss": 1.3276, "step": 1456 }, { "epoch": 0.7399695276790249, "grad_norm": 4.883306503295898, "learning_rate": 3.185687228095381e-06, "loss": 1.4548, "step": 1457 }, { "epoch": 0.7404773996952768, "grad_norm": 5.671407222747803, "learning_rate": 3.1739590101112495e-06, "loss": 1.1628, "step": 1458 }, { "epoch": 0.7409852717115287, "grad_norm": 4.768751621246338, "learning_rate": 3.162248347080159e-06, "loss": 1.2853, "step": 1459 }, { "epoch": 0.7414931437277806, "grad_norm": 5.760345935821533, "learning_rate": 3.1505552691191566e-06, "loss": 1.2817, "step": 1460 }, { "epoch": 0.7420010157440325, "grad_norm": 5.223692893981934, "learning_rate": 3.138879806300051e-06, "loss": 1.2861, "step": 1461 }, { "epoch": 0.7425088877602845, "grad_norm": 4.0341033935546875, "learning_rate": 3.127221988649353e-06, "loss": 1.3493, "step": 1462 }, { "epoch": 0.7430167597765364, "grad_norm": 5.6256632804870605, "learning_rate": 3.1155818461482e-06, "loss": 1.0489, "step": 1463 }, { "epoch": 0.7435246317927883, "grad_norm": 5.683304786682129, "learning_rate": 3.1039594087322667e-06, "loss": 0.7919, "step": 1464 }, { "epoch": 0.7440325038090401, "grad_norm": 4.113526821136475, "learning_rate": 3.0923547062916915e-06, "loss": 1.3307, "step": 1465 }, { "epoch": 0.744540375825292, "grad_norm": 5.324737071990967, "learning_rate": 3.0807677686710112e-06, "loss": 1.3653, "step": 1466 }, { "epoch": 0.7450482478415439, "grad_norm": 5.714325904846191, "learning_rate": 3.0691986256690774e-06, "loss": 1.0556, "step": 1467 }, { "epoch": 0.7455561198577958, "grad_norm": 4.550981044769287, "learning_rate": 3.0576473070389634e-06, "loss": 1.2504, "step": 1468 }, { "epoch": 0.7460639918740477, "grad_norm": 5.716010570526123, "learning_rate": 3.04611384248792e-06, "loss": 0.9282, "step": 1469 }, { "epoch": 0.7465718638902996, "grad_norm": 5.312298774719238, "learning_rate": 3.0345982616772707e-06, "loss": 1.349, "step": 1470 }, { "epoch": 0.7470797359065515, "grad_norm": 4.471862316131592, "learning_rate": 3.0231005942223467e-06, "loss": 1.2788, "step": 1471 }, { "epoch": 0.7475876079228034, "grad_norm": 4.880403518676758, "learning_rate": 3.011620869692412e-06, "loss": 1.2968, "step": 1472 }, { "epoch": 0.7480954799390553, "grad_norm": 5.2570648193359375, "learning_rate": 3.0001591176105917e-06, "loss": 1.1027, "step": 1473 }, { "epoch": 0.7486033519553073, "grad_norm": 5.979409217834473, "learning_rate": 2.9887153674537795e-06, "loss": 1.1802, "step": 1474 }, { "epoch": 0.7491112239715592, "grad_norm": 4.316808700561523, "learning_rate": 2.9772896486525783e-06, "loss": 1.1965, "step": 1475 }, { "epoch": 0.7496190959878111, "grad_norm": 5.031011581420898, "learning_rate": 2.9658819905912205e-06, "loss": 1.3382, "step": 1476 }, { "epoch": 0.750126968004063, "grad_norm": 4.343713760375977, "learning_rate": 2.9544924226074866e-06, "loss": 1.205, "step": 1477 }, { "epoch": 0.7506348400203149, "grad_norm": 4.553668022155762, "learning_rate": 2.9431209739926324e-06, "loss": 1.6361, "step": 1478 }, { "epoch": 0.7511427120365668, "grad_norm": 5.322806358337402, "learning_rate": 2.9317676739913224e-06, "loss": 1.1267, "step": 1479 }, { "epoch": 0.7511427120365668, "eval_loss": 1.273730754852295, "eval_runtime": 168.1921, "eval_samples_per_second": 5.85, "eval_steps_per_second": 1.463, "step": 1479 }, { "epoch": 0.7516505840528187, "grad_norm": 4.639729022979736, "learning_rate": 2.9204325518015384e-06, "loss": 1.238, "step": 1480 }, { "epoch": 0.7521584560690706, "grad_norm": 5.131244659423828, "learning_rate": 2.909115636574519e-06, "loss": 1.1402, "step": 1481 }, { "epoch": 0.7526663280853225, "grad_norm": 4.832932949066162, "learning_rate": 2.89781695741468e-06, "loss": 1.3281, "step": 1482 }, { "epoch": 0.7531742001015744, "grad_norm": 5.1387104988098145, "learning_rate": 2.886536543379532e-06, "loss": 1.0876, "step": 1483 }, { "epoch": 0.7536820721178263, "grad_norm": 4.41108512878418, "learning_rate": 2.875274423479617e-06, "loss": 1.2291, "step": 1484 }, { "epoch": 0.7541899441340782, "grad_norm": 5.090763568878174, "learning_rate": 2.8640306266784225e-06, "loss": 1.1907, "step": 1485 }, { "epoch": 0.7546978161503302, "grad_norm": 5.384039402008057, "learning_rate": 2.8528051818923264e-06, "loss": 1.1933, "step": 1486 }, { "epoch": 0.7552056881665821, "grad_norm": 5.645175933837891, "learning_rate": 2.8415981179904973e-06, "loss": 1.1926, "step": 1487 }, { "epoch": 0.755713560182834, "grad_norm": 4.961467742919922, "learning_rate": 2.830409463794833e-06, "loss": 1.055, "step": 1488 }, { "epoch": 0.7562214321990859, "grad_norm": 5.414806365966797, "learning_rate": 2.8192392480798957e-06, "loss": 1.2816, "step": 1489 }, { "epoch": 0.7567293042153377, "grad_norm": 4.7053351402282715, "learning_rate": 2.8080874995728204e-06, "loss": 1.225, "step": 1490 }, { "epoch": 0.7572371762315896, "grad_norm": 4.617583751678467, "learning_rate": 2.796954246953246e-06, "loss": 1.1683, "step": 1491 }, { "epoch": 0.7577450482478415, "grad_norm": 4.665943622589111, "learning_rate": 2.785839518853256e-06, "loss": 1.2315, "step": 1492 }, { "epoch": 0.7582529202640934, "grad_norm": 5.0149760246276855, "learning_rate": 2.77474334385728e-06, "loss": 1.2449, "step": 1493 }, { "epoch": 0.7587607922803453, "grad_norm": 4.400003433227539, "learning_rate": 2.763665750502045e-06, "loss": 1.0575, "step": 1494 }, { "epoch": 0.7592686642965972, "grad_norm": 5.59690523147583, "learning_rate": 2.752606767276479e-06, "loss": 1.1692, "step": 1495 }, { "epoch": 0.7597765363128491, "grad_norm": 4.624378204345703, "learning_rate": 2.741566422621661e-06, "loss": 1.3387, "step": 1496 }, { "epoch": 0.760284408329101, "grad_norm": 4.425624370574951, "learning_rate": 2.7305447449307245e-06, "loss": 1.6304, "step": 1497 }, { "epoch": 0.760792280345353, "grad_norm": 4.48796272277832, "learning_rate": 2.7195417625488075e-06, "loss": 1.3156, "step": 1498 }, { "epoch": 0.7613001523616049, "grad_norm": 4.57307243347168, "learning_rate": 2.7085575037729607e-06, "loss": 1.1777, "step": 1499 }, { "epoch": 0.7618080243778568, "grad_norm": 4.739150524139404, "learning_rate": 2.6975919968520813e-06, "loss": 1.1394, "step": 1500 }, { "epoch": 0.7623158963941087, "grad_norm": 5.048216819763184, "learning_rate": 2.686645269986843e-06, "loss": 1.1932, "step": 1501 }, { "epoch": 0.7628237684103606, "grad_norm": 4.460635185241699, "learning_rate": 2.6757173513296275e-06, "loss": 1.4603, "step": 1502 }, { "epoch": 0.7633316404266125, "grad_norm": 3.7929909229278564, "learning_rate": 2.6648082689844367e-06, "loss": 1.0992, "step": 1503 }, { "epoch": 0.7638395124428644, "grad_norm": 4.791143894195557, "learning_rate": 2.653918051006833e-06, "loss": 1.1181, "step": 1504 }, { "epoch": 0.7643473844591163, "grad_norm": 4.762235641479492, "learning_rate": 2.6430467254038716e-06, "loss": 1.2734, "step": 1505 }, { "epoch": 0.7648552564753682, "grad_norm": 4.421921253204346, "learning_rate": 2.6321943201340117e-06, "loss": 1.2535, "step": 1506 }, { "epoch": 0.7653631284916201, "grad_norm": 5.441786766052246, "learning_rate": 2.621360863107053e-06, "loss": 1.1429, "step": 1507 }, { "epoch": 0.765871000507872, "grad_norm": 4.744370460510254, "learning_rate": 2.6105463821840727e-06, "loss": 1.1283, "step": 1508 }, { "epoch": 0.766378872524124, "grad_norm": 1.064754843711853, "learning_rate": 2.599750905177346e-06, "loss": 0.2659, "step": 1509 }, { "epoch": 0.7668867445403759, "grad_norm": 4.2103800773620605, "learning_rate": 2.5889744598502643e-06, "loss": 1.3094, "step": 1510 }, { "epoch": 0.7673946165566278, "grad_norm": 4.509934902191162, "learning_rate": 2.578217073917285e-06, "loss": 1.2055, "step": 1511 }, { "epoch": 0.7679024885728797, "grad_norm": 4.744308948516846, "learning_rate": 2.5674787750438447e-06, "loss": 1.1876, "step": 1512 }, { "epoch": 0.7684103605891316, "grad_norm": 5.503872871398926, "learning_rate": 2.5567595908462905e-06, "loss": 1.3198, "step": 1513 }, { "epoch": 0.7689182326053834, "grad_norm": 4.523368835449219, "learning_rate": 2.5460595488918104e-06, "loss": 1.2507, "step": 1514 }, { "epoch": 0.7694261046216353, "grad_norm": 5.009969234466553, "learning_rate": 2.5353786766983736e-06, "loss": 0.9893, "step": 1515 }, { "epoch": 0.7699339766378872, "grad_norm": 5.002704620361328, "learning_rate": 2.5247170017346366e-06, "loss": 1.2085, "step": 1516 }, { "epoch": 0.7704418486541391, "grad_norm": 5.431525230407715, "learning_rate": 2.5140745514198894e-06, "loss": 1.2576, "step": 1517 }, { "epoch": 0.770949720670391, "grad_norm": 4.792313098907471, "learning_rate": 2.503451353123987e-06, "loss": 1.2376, "step": 1518 }, { "epoch": 0.7714575926866429, "grad_norm": 4.87385892868042, "learning_rate": 2.4928474341672627e-06, "loss": 1.3085, "step": 1519 }, { "epoch": 0.7719654647028948, "grad_norm": 5.436407089233398, "learning_rate": 2.4822628218204726e-06, "loss": 1.3114, "step": 1520 }, { "epoch": 0.7724733367191468, "grad_norm": 4.732920169830322, "learning_rate": 2.471697543304724e-06, "loss": 1.3637, "step": 1521 }, { "epoch": 0.7729812087353987, "grad_norm": 4.354372501373291, "learning_rate": 2.4611516257913947e-06, "loss": 1.2134, "step": 1522 }, { "epoch": 0.7734890807516506, "grad_norm": 5.263430118560791, "learning_rate": 2.4506250964020805e-06, "loss": 1.1194, "step": 1523 }, { "epoch": 0.7739969527679025, "grad_norm": 4.511394023895264, "learning_rate": 2.4401179822085043e-06, "loss": 1.2076, "step": 1524 }, { "epoch": 0.7745048247841544, "grad_norm": 4.439097881317139, "learning_rate": 2.429630310232469e-06, "loss": 1.2132, "step": 1525 }, { "epoch": 0.7750126968004063, "grad_norm": 4.4272894859313965, "learning_rate": 2.4191621074457663e-06, "loss": 1.0654, "step": 1526 }, { "epoch": 0.7755205688166582, "grad_norm": 4.477302074432373, "learning_rate": 2.4087134007701264e-06, "loss": 1.1918, "step": 1527 }, { "epoch": 0.7760284408329101, "grad_norm": 4.18808126449585, "learning_rate": 2.398284217077135e-06, "loss": 1.1018, "step": 1528 }, { "epoch": 0.776536312849162, "grad_norm": 4.593721866607666, "learning_rate": 2.387874583188171e-06, "loss": 1.2392, "step": 1529 }, { "epoch": 0.7770441848654139, "grad_norm": 6.527272701263428, "learning_rate": 2.3774845258743317e-06, "loss": 1.083, "step": 1530 }, { "epoch": 0.7775520568816658, "grad_norm": 4.023992538452148, "learning_rate": 2.367114071856379e-06, "loss": 1.3593, "step": 1531 }, { "epoch": 0.7780599288979178, "grad_norm": 4.405642986297607, "learning_rate": 2.356763247804649e-06, "loss": 1.3145, "step": 1532 }, { "epoch": 0.7785678009141697, "grad_norm": 4.339470386505127, "learning_rate": 2.346432080338995e-06, "loss": 1.0931, "step": 1533 }, { "epoch": 0.7790756729304216, "grad_norm": 5.390122890472412, "learning_rate": 2.336120596028728e-06, "loss": 1.193, "step": 1534 }, { "epoch": 0.7795835449466735, "grad_norm": 5.478177070617676, "learning_rate": 2.3258288213925283e-06, "loss": 1.4469, "step": 1535 }, { "epoch": 0.7800914169629254, "grad_norm": 4.124152183532715, "learning_rate": 2.315556782898388e-06, "loss": 1.3121, "step": 1536 }, { "epoch": 0.7805992889791773, "grad_norm": 5.337630748748779, "learning_rate": 2.305304506963548e-06, "loss": 1.3499, "step": 1537 }, { "epoch": 0.7811071609954292, "grad_norm": 4.505415439605713, "learning_rate": 2.2950720199544275e-06, "loss": 1.1125, "step": 1538 }, { "epoch": 0.781615033011681, "grad_norm": 5.707019805908203, "learning_rate": 2.28485934818654e-06, "loss": 1.1868, "step": 1539 }, { "epoch": 0.7821229050279329, "grad_norm": 5.610849380493164, "learning_rate": 2.2746665179244543e-06, "loss": 1.1175, "step": 1540 }, { "epoch": 0.7826307770441848, "grad_norm": 6.6965413093566895, "learning_rate": 2.2644935553816994e-06, "loss": 1.1509, "step": 1541 }, { "epoch": 0.7831386490604367, "grad_norm": 5.4784135818481445, "learning_rate": 2.2543404867207165e-06, "loss": 1.2848, "step": 1542 }, { "epoch": 0.7836465210766886, "grad_norm": 4.223880767822266, "learning_rate": 2.2442073380527775e-06, "loss": 1.1961, "step": 1543 }, { "epoch": 0.7841543930929405, "grad_norm": 4.37587308883667, "learning_rate": 2.2340941354379363e-06, "loss": 1.2925, "step": 1544 }, { "epoch": 0.7846622651091925, "grad_norm": 4.4966511726379395, "learning_rate": 2.22400090488494e-06, "loss": 1.0749, "step": 1545 }, { "epoch": 0.7851701371254444, "grad_norm": 5.9518046379089355, "learning_rate": 2.2139276723511715e-06, "loss": 1.2663, "step": 1546 }, { "epoch": 0.7856780091416963, "grad_norm": 5.12912130355835, "learning_rate": 2.2038744637425967e-06, "loss": 1.3729, "step": 1547 }, { "epoch": 0.7861858811579482, "grad_norm": 4.480926513671875, "learning_rate": 2.19384130491367e-06, "loss": 1.3531, "step": 1548 }, { "epoch": 0.7866937531742001, "grad_norm": 4.151101589202881, "learning_rate": 2.183828221667287e-06, "loss": 1.3121, "step": 1549 }, { "epoch": 0.787201625190452, "grad_norm": 5.753885269165039, "learning_rate": 2.173835239754719e-06, "loss": 1.2623, "step": 1550 }, { "epoch": 0.7877094972067039, "grad_norm": 4.274275779724121, "learning_rate": 2.163862384875535e-06, "loss": 1.4674, "step": 1551 }, { "epoch": 0.7882173692229558, "grad_norm": 6.411306858062744, "learning_rate": 2.153909682677544e-06, "loss": 1.1709, "step": 1552 }, { "epoch": 0.7887252412392077, "grad_norm": 5.627938270568848, "learning_rate": 2.1439771587567325e-06, "loss": 1.1385, "step": 1553 }, { "epoch": 0.7892331132554596, "grad_norm": 5.276611804962158, "learning_rate": 2.134064838657185e-06, "loss": 1.1439, "step": 1554 }, { "epoch": 0.7897409852717115, "grad_norm": 5.1386213302612305, "learning_rate": 2.124172747871027e-06, "loss": 1.2375, "step": 1555 }, { "epoch": 0.7902488572879635, "grad_norm": 4.349878311157227, "learning_rate": 2.1143009118383672e-06, "loss": 1.4328, "step": 1556 }, { "epoch": 0.7907567293042154, "grad_norm": 5.146805286407471, "learning_rate": 2.1044493559472167e-06, "loss": 1.3282, "step": 1557 }, { "epoch": 0.7912646013204673, "grad_norm": 4.454208850860596, "learning_rate": 2.0946181055334336e-06, "loss": 1.1992, "step": 1558 }, { "epoch": 0.7917724733367192, "grad_norm": 6.001837730407715, "learning_rate": 2.0848071858806496e-06, "loss": 0.9643, "step": 1559 }, { "epoch": 0.7922803453529711, "grad_norm": 4.391412734985352, "learning_rate": 2.0750166222202218e-06, "loss": 1.2218, "step": 1560 }, { "epoch": 0.792788217369223, "grad_norm": 5.964014053344727, "learning_rate": 2.065246439731148e-06, "loss": 1.24, "step": 1561 }, { "epoch": 0.7932960893854749, "grad_norm": 4.850198268890381, "learning_rate": 2.055496663540009e-06, "loss": 1.5233, "step": 1562 }, { "epoch": 0.7938039614017267, "grad_norm": 4.668525695800781, "learning_rate": 2.045767318720915e-06, "loss": 1.0346, "step": 1563 }, { "epoch": 0.7943118334179786, "grad_norm": 5.233807563781738, "learning_rate": 2.03605843029542e-06, "loss": 1.2894, "step": 1564 }, { "epoch": 0.7948197054342305, "grad_norm": 4.899875640869141, "learning_rate": 2.02637002323248e-06, "loss": 1.2911, "step": 1565 }, { "epoch": 0.7953275774504824, "grad_norm": 5.5429606437683105, "learning_rate": 2.0167021224483675e-06, "loss": 1.2636, "step": 1566 }, { "epoch": 0.7958354494667343, "grad_norm": 4.837125301361084, "learning_rate": 2.0070547528066265e-06, "loss": 1.4016, "step": 1567 }, { "epoch": 0.7963433214829863, "grad_norm": 5.405340671539307, "learning_rate": 1.997427939117993e-06, "loss": 1.0387, "step": 1568 }, { "epoch": 0.7968511934992382, "grad_norm": 4.169713020324707, "learning_rate": 1.9878217061403436e-06, "loss": 1.4287, "step": 1569 }, { "epoch": 0.7973590655154901, "grad_norm": 4.973928451538086, "learning_rate": 1.9782360785786236e-06, "loss": 1.1843, "step": 1570 }, { "epoch": 0.797866937531742, "grad_norm": 5.256599426269531, "learning_rate": 1.9686710810847832e-06, "loss": 1.3887, "step": 1571 }, { "epoch": 0.7983748095479939, "grad_norm": 6.420433044433594, "learning_rate": 1.9591267382577193e-06, "loss": 1.0757, "step": 1572 }, { "epoch": 0.7988826815642458, "grad_norm": 5.725488662719727, "learning_rate": 1.949603074643216e-06, "loss": 1.0758, "step": 1573 }, { "epoch": 0.7993905535804977, "grad_norm": 5.522940158843994, "learning_rate": 1.9401001147338647e-06, "loss": 1.0674, "step": 1574 }, { "epoch": 0.7998984255967496, "grad_norm": 6.481788158416748, "learning_rate": 1.930617882969016e-06, "loss": 1.117, "step": 1575 }, { "epoch": 0.8004062976130015, "grad_norm": 4.595775127410889, "learning_rate": 1.9211564037347188e-06, "loss": 1.2688, "step": 1576 }, { "epoch": 0.8009141696292534, "grad_norm": 5.821315765380859, "learning_rate": 1.9117157013636435e-06, "loss": 1.2299, "step": 1577 }, { "epoch": 0.8014220416455053, "grad_norm": 4.451550483703613, "learning_rate": 1.9022958001350255e-06, "loss": 1.2717, "step": 1578 }, { "epoch": 0.8019299136617573, "grad_norm": 5.8209357261657715, "learning_rate": 1.8928967242746154e-06, "loss": 1.0486, "step": 1579 }, { "epoch": 0.8024377856780092, "grad_norm": 5.02223539352417, "learning_rate": 1.8835184979545984e-06, "loss": 1.2014, "step": 1580 }, { "epoch": 0.8029456576942611, "grad_norm": 4.6913228034973145, "learning_rate": 1.8741611452935382e-06, "loss": 1.2134, "step": 1581 }, { "epoch": 0.803453529710513, "grad_norm": 4.663290023803711, "learning_rate": 1.8648246903563238e-06, "loss": 1.2812, "step": 1582 }, { "epoch": 0.8039614017267649, "grad_norm": 4.683165550231934, "learning_rate": 1.8555091571540928e-06, "loss": 1.3909, "step": 1583 }, { "epoch": 0.8044692737430168, "grad_norm": 5.630870342254639, "learning_rate": 1.8462145696441768e-06, "loss": 1.5263, "step": 1584 }, { "epoch": 0.8049771457592687, "grad_norm": 4.374763488769531, "learning_rate": 1.8369409517300485e-06, "loss": 1.3124, "step": 1585 }, { "epoch": 0.8054850177755206, "grad_norm": 6.090631008148193, "learning_rate": 1.8276883272612434e-06, "loss": 1.1145, "step": 1586 }, { "epoch": 0.8059928897917725, "grad_norm": 4.912288665771484, "learning_rate": 1.8184567200333104e-06, "loss": 1.1028, "step": 1587 }, { "epoch": 0.8065007618080243, "grad_norm": 4.933427333831787, "learning_rate": 1.8092461537877436e-06, "loss": 1.1397, "step": 1588 }, { "epoch": 0.8070086338242762, "grad_norm": 5.186468601226807, "learning_rate": 1.8000566522119323e-06, "loss": 1.3007, "step": 1589 }, { "epoch": 0.8075165058405281, "grad_norm": 4.519622325897217, "learning_rate": 1.7908882389390857e-06, "loss": 1.3, "step": 1590 }, { "epoch": 0.80802437785678, "grad_norm": 4.897008895874023, "learning_rate": 1.7817409375481798e-06, "loss": 1.3872, "step": 1591 }, { "epoch": 0.808532249873032, "grad_norm": 5.396913051605225, "learning_rate": 1.7726147715638985e-06, "loss": 1.1085, "step": 1592 }, { "epoch": 0.8090401218892839, "grad_norm": 5.167842388153076, "learning_rate": 1.763509764456568e-06, "loss": 1.1642, "step": 1593 }, { "epoch": 0.8095479939055358, "grad_norm": 4.8681559562683105, "learning_rate": 1.7544259396420993e-06, "loss": 1.3848, "step": 1594 }, { "epoch": 0.8100558659217877, "grad_norm": 4.275599956512451, "learning_rate": 1.745363320481932e-06, "loss": 1.2506, "step": 1595 }, { "epoch": 0.8105637379380396, "grad_norm": 5.7443742752075195, "learning_rate": 1.7363219302829625e-06, "loss": 1.1711, "step": 1596 }, { "epoch": 0.8110716099542915, "grad_norm": 5.762200832366943, "learning_rate": 1.7273017922974944e-06, "loss": 1.1643, "step": 1597 }, { "epoch": 0.8115794819705434, "grad_norm": 5.273171901702881, "learning_rate": 1.718302929723179e-06, "loss": 1.3189, "step": 1598 }, { "epoch": 0.8120873539867953, "grad_norm": 4.075494289398193, "learning_rate": 1.7093253657029485e-06, "loss": 1.349, "step": 1599 }, { "epoch": 0.8125952260030472, "grad_norm": 4.767585754394531, "learning_rate": 1.7003691233249597e-06, "loss": 1.4168, "step": 1600 }, { "epoch": 0.8131030980192991, "grad_norm": 5.139211654663086, "learning_rate": 1.6914342256225347e-06, "loss": 1.3283, "step": 1601 }, { "epoch": 0.813610970035551, "grad_norm": 4.2492241859436035, "learning_rate": 1.6825206955741092e-06, "loss": 1.4171, "step": 1602 }, { "epoch": 0.814118842051803, "grad_norm": 5.196702003479004, "learning_rate": 1.6736285561031595e-06, "loss": 0.9668, "step": 1603 }, { "epoch": 0.8146267140680549, "grad_norm": 6.883477210998535, "learning_rate": 1.6647578300781486e-06, "loss": 1.0839, "step": 1604 }, { "epoch": 0.8151345860843068, "grad_norm": 4.714962005615234, "learning_rate": 1.6559085403124797e-06, "loss": 1.1373, "step": 1605 }, { "epoch": 0.8156424581005587, "grad_norm": 4.462029933929443, "learning_rate": 1.6470807095644127e-06, "loss": 1.3212, "step": 1606 }, { "epoch": 0.8161503301168106, "grad_norm": 5.117851734161377, "learning_rate": 1.6382743605370344e-06, "loss": 1.2225, "step": 1607 }, { "epoch": 0.8166582021330625, "grad_norm": 4.902977466583252, "learning_rate": 1.6294895158781743e-06, "loss": 1.3309, "step": 1608 }, { "epoch": 0.8171660741493144, "grad_norm": 4.548398017883301, "learning_rate": 1.6207261981803657e-06, "loss": 1.4154, "step": 1609 }, { "epoch": 0.8176739461655663, "grad_norm": 5.7301506996154785, "learning_rate": 1.611984429980772e-06, "loss": 1.2463, "step": 1610 }, { "epoch": 0.8181818181818182, "grad_norm": 4.687128067016602, "learning_rate": 1.6032642337611458e-06, "loss": 1.1388, "step": 1611 }, { "epoch": 0.8186896901980701, "grad_norm": 5.663663864135742, "learning_rate": 1.594565631947753e-06, "loss": 1.1684, "step": 1612 }, { "epoch": 0.8191975622143219, "grad_norm": 4.085853099822998, "learning_rate": 1.5858886469113277e-06, "loss": 1.3194, "step": 1613 }, { "epoch": 0.8197054342305738, "grad_norm": 4.943175792694092, "learning_rate": 1.5772333009670137e-06, "loss": 1.2014, "step": 1614 }, { "epoch": 0.8202133062468258, "grad_norm": 5.281403541564941, "learning_rate": 1.568599616374299e-06, "loss": 0.8753, "step": 1615 }, { "epoch": 0.8207211782630777, "grad_norm": 5.390008926391602, "learning_rate": 1.5599876153369653e-06, "loss": 1.3108, "step": 1616 }, { "epoch": 0.8212290502793296, "grad_norm": 4.513533592224121, "learning_rate": 1.5513973200030274e-06, "loss": 1.2118, "step": 1617 }, { "epoch": 0.8217369222955815, "grad_norm": 7.519176483154297, "learning_rate": 1.5428287524646867e-06, "loss": 0.88, "step": 1618 }, { "epoch": 0.8222447943118334, "grad_norm": 4.701818466186523, "learning_rate": 1.534281934758256e-06, "loss": 1.3704, "step": 1619 }, { "epoch": 0.8227526663280853, "grad_norm": 6.736048221588135, "learning_rate": 1.525756888864115e-06, "loss": 1.211, "step": 1620 }, { "epoch": 0.8232605383443372, "grad_norm": 5.540770053863525, "learning_rate": 1.5172536367066582e-06, "loss": 1.2807, "step": 1621 }, { "epoch": 0.8237684103605891, "grad_norm": 4.7243781089782715, "learning_rate": 1.5087722001542216e-06, "loss": 1.1751, "step": 1622 }, { "epoch": 0.824276282376841, "grad_norm": 4.625030517578125, "learning_rate": 1.5003126010190427e-06, "loss": 1.0707, "step": 1623 }, { "epoch": 0.8247841543930929, "grad_norm": 5.2486796379089355, "learning_rate": 1.491874861057202e-06, "loss": 1.1341, "step": 1624 }, { "epoch": 0.8252920264093448, "grad_norm": 5.326663017272949, "learning_rate": 1.4834590019685545e-06, "loss": 1.2813, "step": 1625 }, { "epoch": 0.8257998984255968, "grad_norm": 4.928109169006348, "learning_rate": 1.4750650453966863e-06, "loss": 1.3705, "step": 1626 }, { "epoch": 0.8263077704418487, "grad_norm": 4.358314514160156, "learning_rate": 1.4666930129288603e-06, "loss": 1.3322, "step": 1627 }, { "epoch": 0.8268156424581006, "grad_norm": 4.756168365478516, "learning_rate": 1.458342926095948e-06, "loss": 1.247, "step": 1628 }, { "epoch": 0.8273235144743525, "grad_norm": 5.003471851348877, "learning_rate": 1.4500148063723885e-06, "loss": 1.4165, "step": 1629 }, { "epoch": 0.8278313864906044, "grad_norm": 5.815068244934082, "learning_rate": 1.4417086751761188e-06, "loss": 1.2575, "step": 1630 }, { "epoch": 0.8283392585068563, "grad_norm": 5.7446746826171875, "learning_rate": 1.4334245538685376e-06, "loss": 1.3561, "step": 1631 }, { "epoch": 0.8288471305231082, "grad_norm": 5.561966896057129, "learning_rate": 1.4251624637544315e-06, "loss": 1.2635, "step": 1632 }, { "epoch": 0.8293550025393601, "grad_norm": 3.895627737045288, "learning_rate": 1.416922426081927e-06, "loss": 1.3344, "step": 1633 }, { "epoch": 0.829862874555612, "grad_norm": 4.815524101257324, "learning_rate": 1.4087044620424461e-06, "loss": 1.2994, "step": 1634 }, { "epoch": 0.8303707465718639, "grad_norm": 4.807487964630127, "learning_rate": 1.4005085927706308e-06, "loss": 1.385, "step": 1635 }, { "epoch": 0.8308786185881158, "grad_norm": 5.106080055236816, "learning_rate": 1.3923348393443115e-06, "loss": 1.2108, "step": 1636 }, { "epoch": 0.8313864906043676, "grad_norm": 4.974977016448975, "learning_rate": 1.3841832227844331e-06, "loss": 0.9978, "step": 1637 }, { "epoch": 0.8318943626206196, "grad_norm": 4.848039627075195, "learning_rate": 1.3760537640550165e-06, "loss": 1.1942, "step": 1638 }, { "epoch": 0.8324022346368715, "grad_norm": 5.152993679046631, "learning_rate": 1.3679464840630918e-06, "loss": 1.0863, "step": 1639 }, { "epoch": 0.8329101066531234, "grad_norm": 4.386135578155518, "learning_rate": 1.3598614036586576e-06, "loss": 1.1307, "step": 1640 }, { "epoch": 0.8334179786693753, "grad_norm": 5.556447982788086, "learning_rate": 1.351798543634615e-06, "loss": 1.0966, "step": 1641 }, { "epoch": 0.8339258506856272, "grad_norm": 4.602492332458496, "learning_rate": 1.3437579247267195e-06, "loss": 1.2305, "step": 1642 }, { "epoch": 0.8344337227018791, "grad_norm": 5.0979390144348145, "learning_rate": 1.335739567613532e-06, "loss": 1.2751, "step": 1643 }, { "epoch": 0.834941594718131, "grad_norm": 4.603364944458008, "learning_rate": 1.327743492916359e-06, "loss": 1.1891, "step": 1644 }, { "epoch": 0.8354494667343829, "grad_norm": 4.802546977996826, "learning_rate": 1.3197697211992012e-06, "loss": 1.2852, "step": 1645 }, { "epoch": 0.8359573387506348, "grad_norm": 4.88373327255249, "learning_rate": 1.311818272968699e-06, "loss": 1.2016, "step": 1646 }, { "epoch": 0.8364652107668867, "grad_norm": 5.029901504516602, "learning_rate": 1.3038891686740896e-06, "loss": 1.3481, "step": 1647 }, { "epoch": 0.8369730827831386, "grad_norm": 4.834127902984619, "learning_rate": 1.29598242870714e-06, "loss": 1.3121, "step": 1648 }, { "epoch": 0.8374809547993906, "grad_norm": 6.290788650512695, "learning_rate": 1.288098073402102e-06, "loss": 1.2261, "step": 1649 }, { "epoch": 0.8379888268156425, "grad_norm": 4.68792200088501, "learning_rate": 1.2802361230356642e-06, "loss": 1.5772, "step": 1650 }, { "epoch": 0.8384966988318944, "grad_norm": 5.027810573577881, "learning_rate": 1.2723965978268938e-06, "loss": 1.1969, "step": 1651 }, { "epoch": 0.8390045708481463, "grad_norm": 5.2663798332214355, "learning_rate": 1.264579517937179e-06, "loss": 1.0489, "step": 1652 }, { "epoch": 0.8395124428643982, "grad_norm": 4.718344211578369, "learning_rate": 1.2567849034701928e-06, "loss": 1.2672, "step": 1653 }, { "epoch": 0.8400203148806501, "grad_norm": 5.486889839172363, "learning_rate": 1.2490127744718283e-06, "loss": 1.1353, "step": 1654 }, { "epoch": 0.840528186896902, "grad_norm": 6.703730583190918, "learning_rate": 1.2412631509301487e-06, "loss": 1.1024, "step": 1655 }, { "epoch": 0.8410360589131539, "grad_norm": 4.949845314025879, "learning_rate": 1.2335360527753471e-06, "loss": 1.0375, "step": 1656 }, { "epoch": 0.8415439309294058, "grad_norm": 4.4931254386901855, "learning_rate": 1.2258314998796772e-06, "loss": 1.2449, "step": 1657 }, { "epoch": 0.8420518029456577, "grad_norm": 5.583803653717041, "learning_rate": 1.218149512057416e-06, "loss": 1.1586, "step": 1658 }, { "epoch": 0.8425596749619096, "grad_norm": 5.380489349365234, "learning_rate": 1.2104901090648068e-06, "loss": 1.3036, "step": 1659 }, { "epoch": 0.8430675469781616, "grad_norm": 6.198934555053711, "learning_rate": 1.202853310600015e-06, "loss": 0.9664, "step": 1660 }, { "epoch": 0.8435754189944135, "grad_norm": 4.49476432800293, "learning_rate": 1.195239136303068e-06, "loss": 1.2589, "step": 1661 }, { "epoch": 0.8440832910106653, "grad_norm": 4.489984512329102, "learning_rate": 1.187647605755806e-06, "loss": 1.4972, "step": 1662 }, { "epoch": 0.8445911630269172, "grad_norm": 5.477954864501953, "learning_rate": 1.1800787384818445e-06, "loss": 1.0707, "step": 1663 }, { "epoch": 0.8450990350431691, "grad_norm": 4.977619647979736, "learning_rate": 1.172532553946505e-06, "loss": 1.1271, "step": 1664 }, { "epoch": 0.845606907059421, "grad_norm": 5.6147332191467285, "learning_rate": 1.1650090715567796e-06, "loss": 1.1355, "step": 1665 }, { "epoch": 0.8461147790756729, "grad_norm": 4.383717060089111, "learning_rate": 1.1575083106612771e-06, "loss": 1.127, "step": 1666 }, { "epoch": 0.8466226510919248, "grad_norm": 5.167008399963379, "learning_rate": 1.1500302905501682e-06, "loss": 1.0018, "step": 1667 }, { "epoch": 0.8471305231081767, "grad_norm": 4.774293899536133, "learning_rate": 1.1425750304551387e-06, "loss": 1.3005, "step": 1668 }, { "epoch": 0.8476383951244286, "grad_norm": 6.064826488494873, "learning_rate": 1.1351425495493462e-06, "loss": 1.3613, "step": 1669 }, { "epoch": 0.8481462671406805, "grad_norm": 4.394528388977051, "learning_rate": 1.127732866947362e-06, "loss": 1.2643, "step": 1670 }, { "epoch": 0.8486541391569324, "grad_norm": 4.836899757385254, "learning_rate": 1.1203460017051249e-06, "loss": 1.4117, "step": 1671 }, { "epoch": 0.8491620111731844, "grad_norm": 4.757767200469971, "learning_rate": 1.1129819728198987e-06, "loss": 1.4844, "step": 1672 }, { "epoch": 0.8496698831894363, "grad_norm": 4.990601062774658, "learning_rate": 1.10564079923021e-06, "loss": 1.1944, "step": 1673 }, { "epoch": 0.8501777552056882, "grad_norm": 5.517089366912842, "learning_rate": 1.0983224998158104e-06, "loss": 0.8689, "step": 1674 }, { "epoch": 0.8506856272219401, "grad_norm": 5.221260070800781, "learning_rate": 1.091027093397623e-06, "loss": 1.2486, "step": 1675 }, { "epoch": 0.851193499238192, "grad_norm": 4.8471245765686035, "learning_rate": 1.083754598737702e-06, "loss": 1.1717, "step": 1676 }, { "epoch": 0.8517013712544439, "grad_norm": 6.431856155395508, "learning_rate": 1.076505034539167e-06, "loss": 0.9817, "step": 1677 }, { "epoch": 0.8522092432706958, "grad_norm": 5.298643589019775, "learning_rate": 1.069278419446178e-06, "loss": 1.3423, "step": 1678 }, { "epoch": 0.8527171152869477, "grad_norm": 4.67987060546875, "learning_rate": 1.0620747720438629e-06, "loss": 1.2656, "step": 1679 }, { "epoch": 0.8532249873031996, "grad_norm": 5.228744029998779, "learning_rate": 1.0548941108582956e-06, "loss": 1.364, "step": 1680 }, { "epoch": 0.8537328593194515, "grad_norm": 4.668889045715332, "learning_rate": 1.0477364543564205e-06, "loss": 1.3017, "step": 1681 }, { "epoch": 0.8542407313357034, "grad_norm": 5.059421539306641, "learning_rate": 1.0406018209460344e-06, "loss": 0.942, "step": 1682 }, { "epoch": 0.8547486033519553, "grad_norm": 4.647306442260742, "learning_rate": 1.0334902289757121e-06, "loss": 1.1538, "step": 1683 }, { "epoch": 0.8552564753682073, "grad_norm": 5.314273834228516, "learning_rate": 1.0264016967347744e-06, "loss": 1.2407, "step": 1684 }, { "epoch": 0.8557643473844592, "grad_norm": 4.523673057556152, "learning_rate": 1.0193362424532428e-06, "loss": 1.1304, "step": 1685 }, { "epoch": 0.856272219400711, "grad_norm": 4.930280685424805, "learning_rate": 1.0122938843017837e-06, "loss": 1.3051, "step": 1686 }, { "epoch": 0.8567800914169629, "grad_norm": 4.712639331817627, "learning_rate": 1.0052746403916668e-06, "loss": 1.4619, "step": 1687 }, { "epoch": 0.8572879634332148, "grad_norm": 4.836647033691406, "learning_rate": 9.982785287747131e-07, "loss": 1.4634, "step": 1688 }, { "epoch": 0.8577958354494667, "grad_norm": 5.0175700187683105, "learning_rate": 9.913055674432614e-07, "loss": 1.1327, "step": 1689 }, { "epoch": 0.8583037074657186, "grad_norm": 4.9350175857543945, "learning_rate": 9.84355774330108e-07, "loss": 1.053, "step": 1690 }, { "epoch": 0.8588115794819705, "grad_norm": 7.005141258239746, "learning_rate": 9.774291673084646e-07, "loss": 1.2266, "step": 1691 }, { "epoch": 0.8593194514982224, "grad_norm": 4.391099452972412, "learning_rate": 9.705257641919185e-07, "loss": 1.1779, "step": 1692 }, { "epoch": 0.8598273235144743, "grad_norm": 4.599768161773682, "learning_rate": 9.636455827343784e-07, "loss": 1.2445, "step": 1693 }, { "epoch": 0.8603351955307262, "grad_norm": 4.420525550842285, "learning_rate": 9.56788640630033e-07, "loss": 1.3395, "step": 1694 }, { "epoch": 0.8608430675469781, "grad_norm": 4.860838413238525, "learning_rate": 9.499549555133103e-07, "loss": 1.3235, "step": 1695 }, { "epoch": 0.8613509395632301, "grad_norm": 5.740962505340576, "learning_rate": 9.431445449588184e-07, "loss": 1.1896, "step": 1696 }, { "epoch": 0.861858811579482, "grad_norm": 5.737368583679199, "learning_rate": 9.363574264813113e-07, "loss": 1.1083, "step": 1697 }, { "epoch": 0.8623666835957339, "grad_norm": 5.145242691040039, "learning_rate": 9.295936175356457e-07, "loss": 1.0518, "step": 1698 }, { "epoch": 0.8628745556119858, "grad_norm": 6.466031551361084, "learning_rate": 9.22853135516728e-07, "loss": 0.9954, "step": 1699 }, { "epoch": 0.8633824276282377, "grad_norm": 4.695497035980225, "learning_rate": 9.161359977594719e-07, "loss": 1.3734, "step": 1700 }, { "epoch": 0.8638902996444896, "grad_norm": 4.825030326843262, "learning_rate": 9.094422215387599e-07, "loss": 1.1216, "step": 1701 }, { "epoch": 0.8643981716607415, "grad_norm": 4.423190116882324, "learning_rate": 9.027718240693895e-07, "loss": 1.3598, "step": 1702 }, { "epoch": 0.8649060436769934, "grad_norm": 5.499927997589111, "learning_rate": 8.961248225060382e-07, "loss": 1.0977, "step": 1703 }, { "epoch": 0.8654139156932453, "grad_norm": 4.7551703453063965, "learning_rate": 8.895012339432074e-07, "loss": 1.2789, "step": 1704 }, { "epoch": 0.8659217877094972, "grad_norm": 4.84812068939209, "learning_rate": 8.829010754151957e-07, "loss": 1.2925, "step": 1705 }, { "epoch": 0.8664296597257491, "grad_norm": 4.788599967956543, "learning_rate": 8.763243638960362e-07, "loss": 1.3916, "step": 1706 }, { "epoch": 0.866937531742001, "grad_norm": 4.135072231292725, "learning_rate": 8.697711162994705e-07, "loss": 1.3201, "step": 1707 }, { "epoch": 0.867445403758253, "grad_norm": 4.580753326416016, "learning_rate": 8.632413494788871e-07, "loss": 1.4539, "step": 1708 }, { "epoch": 0.8679532757745049, "grad_norm": 4.9030022621154785, "learning_rate": 8.567350802272967e-07, "loss": 1.3565, "step": 1709 }, { "epoch": 0.8684611477907568, "grad_norm": 4.669422149658203, "learning_rate": 8.502523252772721e-07, "loss": 1.1809, "step": 1710 }, { "epoch": 0.8689690198070086, "grad_norm": 4.870145797729492, "learning_rate": 8.437931013009193e-07, "loss": 1.3639, "step": 1711 }, { "epoch": 0.8694768918232605, "grad_norm": 4.493689060211182, "learning_rate": 8.373574249098238e-07, "loss": 1.3479, "step": 1712 }, { "epoch": 0.8699847638395124, "grad_norm": 4.5029497146606445, "learning_rate": 8.309453126550115e-07, "loss": 1.2695, "step": 1713 }, { "epoch": 0.8704926358557643, "grad_norm": 4.815980434417725, "learning_rate": 8.245567810269128e-07, "loss": 1.2276, "step": 1714 }, { "epoch": 0.8710005078720162, "grad_norm": 4.96169376373291, "learning_rate": 8.181918464553107e-07, "loss": 1.2781, "step": 1715 }, { "epoch": 0.8715083798882681, "grad_norm": 5.211386680603027, "learning_rate": 8.118505253093001e-07, "loss": 1.0092, "step": 1716 }, { "epoch": 0.87201625190452, "grad_norm": 4.784932613372803, "learning_rate": 8.055328338972501e-07, "loss": 1.3213, "step": 1717 }, { "epoch": 0.8725241239207719, "grad_norm": 5.034736156463623, "learning_rate": 7.992387884667607e-07, "loss": 1.38, "step": 1718 }, { "epoch": 0.8730319959370239, "grad_norm": 5.30827522277832, "learning_rate": 7.929684052046194e-07, "loss": 1.2445, "step": 1719 }, { "epoch": 0.8735398679532758, "grad_norm": 5.4127888679504395, "learning_rate": 7.867217002367577e-07, "loss": 1.1274, "step": 1720 }, { "epoch": 0.8740477399695277, "grad_norm": 5.677814483642578, "learning_rate": 7.804986896282152e-07, "loss": 1.1436, "step": 1721 }, { "epoch": 0.8745556119857796, "grad_norm": 4.909538745880127, "learning_rate": 7.742993893830963e-07, "loss": 1.3524, "step": 1722 }, { "epoch": 0.8750634840020315, "grad_norm": 5.164133548736572, "learning_rate": 7.681238154445226e-07, "loss": 1.2597, "step": 1723 }, { "epoch": 0.8755713560182834, "grad_norm": 5.629814624786377, "learning_rate": 7.619719836946049e-07, "loss": 1.3796, "step": 1724 }, { "epoch": 0.8760792280345353, "grad_norm": 6.742391586303711, "learning_rate": 7.55843909954388e-07, "loss": 0.7936, "step": 1725 }, { "epoch": 0.8765871000507872, "grad_norm": 6.896872043609619, "learning_rate": 7.497396099838162e-07, "loss": 1.1554, "step": 1726 }, { "epoch": 0.8770949720670391, "grad_norm": 6.02463436126709, "learning_rate": 7.436590994817028e-07, "loss": 1.1979, "step": 1727 }, { "epoch": 0.877602844083291, "grad_norm": 4.152164936065674, "learning_rate": 7.376023940856702e-07, "loss": 1.3851, "step": 1728 }, { "epoch": 0.8781107160995429, "grad_norm": 4.744118690490723, "learning_rate": 7.315695093721221e-07, "loss": 1.2747, "step": 1729 }, { "epoch": 0.8786185881157949, "grad_norm": 4.65793514251709, "learning_rate": 7.255604608562051e-07, "loss": 1.3909, "step": 1730 }, { "epoch": 0.8791264601320468, "grad_norm": 5.222963333129883, "learning_rate": 7.195752639917619e-07, "loss": 1.3076, "step": 1731 }, { "epoch": 0.8796343321482987, "grad_norm": 5.764087200164795, "learning_rate": 7.136139341712933e-07, "loss": 1.278, "step": 1732 }, { "epoch": 0.8801422041645506, "grad_norm": 4.756399154663086, "learning_rate": 7.076764867259189e-07, "loss": 1.3099, "step": 1733 }, { "epoch": 0.8806500761808025, "grad_norm": 4.776485919952393, "learning_rate": 7.017629369253454e-07, "loss": 1.5061, "step": 1734 }, { "epoch": 0.8811579481970544, "grad_norm": 4.794582366943359, "learning_rate": 6.958732999778118e-07, "loss": 1.0798, "step": 1735 }, { "epoch": 0.8816658202133062, "grad_norm": 4.570837020874023, "learning_rate": 6.90007591030063e-07, "loss": 1.2282, "step": 1736 }, { "epoch": 0.8821736922295581, "grad_norm": 5.90083122253418, "learning_rate": 6.841658251673111e-07, "loss": 1.181, "step": 1737 }, { "epoch": 0.88268156424581, "grad_norm": 5.166455268859863, "learning_rate": 6.783480174131829e-07, "loss": 1.4458, "step": 1738 }, { "epoch": 0.8831894362620619, "grad_norm": 5.3771257400512695, "learning_rate": 6.725541827296966e-07, "loss": 1.2551, "step": 1739 }, { "epoch": 0.8836973082783138, "grad_norm": 4.8618483543396, "learning_rate": 6.667843360172177e-07, "loss": 1.1116, "step": 1740 }, { "epoch": 0.8842051802945657, "grad_norm": 4.8128838539123535, "learning_rate": 6.610384921144175e-07, "loss": 1.2609, "step": 1741 }, { "epoch": 0.8847130523108176, "grad_norm": 5.699490547180176, "learning_rate": 6.553166657982391e-07, "loss": 1.1375, "step": 1742 }, { "epoch": 0.8852209243270696, "grad_norm": 5.422540664672852, "learning_rate": 6.496188717838592e-07, "loss": 1.2577, "step": 1743 }, { "epoch": 0.8857287963433215, "grad_norm": 5.66526460647583, "learning_rate": 6.439451247246486e-07, "loss": 1.26, "step": 1744 }, { "epoch": 0.8862366683595734, "grad_norm": 5.978145122528076, "learning_rate": 6.382954392121321e-07, "loss": 1.0481, "step": 1745 }, { "epoch": 0.8867445403758253, "grad_norm": 5.229310989379883, "learning_rate": 6.326698297759571e-07, "loss": 1.2828, "step": 1746 }, { "epoch": 0.8872524123920772, "grad_norm": 4.935593605041504, "learning_rate": 6.270683108838549e-07, "loss": 1.3433, "step": 1747 }, { "epoch": 0.8877602844083291, "grad_norm": 6.317234039306641, "learning_rate": 6.214908969415956e-07, "loss": 1.4458, "step": 1748 }, { "epoch": 0.888268156424581, "grad_norm": 6.326529026031494, "learning_rate": 6.159376022929631e-07, "loss": 1.0901, "step": 1749 }, { "epoch": 0.8887760284408329, "grad_norm": 4.908727169036865, "learning_rate": 6.104084412197076e-07, "loss": 1.2956, "step": 1750 }, { "epoch": 0.8892839004570848, "grad_norm": 8.57208251953125, "learning_rate": 6.049034279415189e-07, "loss": 0.6276, "step": 1751 }, { "epoch": 0.8897917724733367, "grad_norm": 4.645028114318848, "learning_rate": 5.994225766159778e-07, "loss": 1.3687, "step": 1752 }, { "epoch": 0.8902996444895886, "grad_norm": 4.887918949127197, "learning_rate": 5.939659013385324e-07, "loss": 1.3089, "step": 1753 }, { "epoch": 0.8908075165058406, "grad_norm": 4.582350730895996, "learning_rate": 5.885334161424516e-07, "loss": 1.3622, "step": 1754 }, { "epoch": 0.8913153885220925, "grad_norm": 4.830961227416992, "learning_rate": 5.831251349987921e-07, "loss": 1.261, "step": 1755 }, { "epoch": 0.8918232605383444, "grad_norm": 4.80549955368042, "learning_rate": 5.777410718163712e-07, "loss": 1.2435, "step": 1756 }, { "epoch": 0.8923311325545963, "grad_norm": 4.790905952453613, "learning_rate": 5.72381240441715e-07, "loss": 1.3054, "step": 1757 }, { "epoch": 0.8928390045708482, "grad_norm": 4.888479709625244, "learning_rate": 5.670456546590331e-07, "loss": 1.035, "step": 1758 }, { "epoch": 0.8933468765871001, "grad_norm": 5.3526787757873535, "learning_rate": 5.617343281901854e-07, "loss": 1.3651, "step": 1759 }, { "epoch": 0.8938547486033519, "grad_norm": 4.392794609069824, "learning_rate": 5.564472746946392e-07, "loss": 1.2984, "step": 1760 }, { "epoch": 0.8943626206196038, "grad_norm": 4.773922920227051, "learning_rate": 5.511845077694366e-07, "loss": 1.2163, "step": 1761 }, { "epoch": 0.8948704926358557, "grad_norm": 5.451786994934082, "learning_rate": 5.459460409491613e-07, "loss": 1.2309, "step": 1762 }, { "epoch": 0.8953783646521076, "grad_norm": 6.1389689445495605, "learning_rate": 5.407318877059031e-07, "loss": 1.3086, "step": 1763 }, { "epoch": 0.8958862366683595, "grad_norm": 5.596880912780762, "learning_rate": 5.355420614492268e-07, "loss": 1.2208, "step": 1764 }, { "epoch": 0.8963941086846114, "grad_norm": 4.918369293212891, "learning_rate": 5.30376575526127e-07, "loss": 1.2087, "step": 1765 }, { "epoch": 0.8969019807008634, "grad_norm": 5.364236831665039, "learning_rate": 5.252354432210072e-07, "loss": 1.2595, "step": 1766 }, { "epoch": 0.8974098527171153, "grad_norm": 5.538005828857422, "learning_rate": 5.201186777556333e-07, "loss": 1.429, "step": 1767 }, { "epoch": 0.8979177247333672, "grad_norm": 4.762514114379883, "learning_rate": 5.150262922891092e-07, "loss": 1.1968, "step": 1768 }, { "epoch": 0.8984255967496191, "grad_norm": 5.0200958251953125, "learning_rate": 5.09958299917841e-07, "loss": 1.0847, "step": 1769 }, { "epoch": 0.898933468765871, "grad_norm": 4.925847053527832, "learning_rate": 5.049147136754962e-07, "loss": 1.3489, "step": 1770 }, { "epoch": 0.8994413407821229, "grad_norm": 6.314319133758545, "learning_rate": 4.998955465329791e-07, "loss": 1.1123, "step": 1771 }, { "epoch": 0.8999492127983748, "grad_norm": 5.687747955322266, "learning_rate": 4.949008113983933e-07, "loss": 1.2315, "step": 1772 }, { "epoch": 0.9004570848146267, "grad_norm": 6.498187065124512, "learning_rate": 4.899305211170102e-07, "loss": 1.1394, "step": 1773 }, { "epoch": 0.9009649568308786, "grad_norm": 5.5723772048950195, "learning_rate": 4.849846884712317e-07, "loss": 1.1502, "step": 1774 }, { "epoch": 0.9014728288471305, "grad_norm": 4.490263938903809, "learning_rate": 4.800633261805632e-07, "loss": 1.3187, "step": 1775 }, { "epoch": 0.9019807008633824, "grad_norm": 4.024411201477051, "learning_rate": 4.751664469015771e-07, "loss": 1.2368, "step": 1776 }, { "epoch": 0.9024885728796344, "grad_norm": 5.656371593475342, "learning_rate": 4.7029406322788186e-07, "loss": 1.2759, "step": 1777 }, { "epoch": 0.9029964448958863, "grad_norm": 4.788447380065918, "learning_rate": 4.654461876900895e-07, "loss": 1.2423, "step": 1778 }, { "epoch": 0.9035043169121382, "grad_norm": 4.719852447509766, "learning_rate": 4.6062283275578045e-07, "loss": 1.2496, "step": 1779 }, { "epoch": 0.9040121889283901, "grad_norm": 5.475706577301025, "learning_rate": 4.5582401082947867e-07, "loss": 1.3311, "step": 1780 }, { "epoch": 0.904520060944642, "grad_norm": 5.003331661224365, "learning_rate": 4.510497342526088e-07, "loss": 1.291, "step": 1781 }, { "epoch": 0.9050279329608939, "grad_norm": 4.8336381912231445, "learning_rate": 4.463000153034769e-07, "loss": 1.3825, "step": 1782 }, { "epoch": 0.9055358049771458, "grad_norm": 4.646341800689697, "learning_rate": 4.415748661972297e-07, "loss": 1.3562, "step": 1783 }, { "epoch": 0.9060436769933977, "grad_norm": 5.7394914627075195, "learning_rate": 4.3687429908582323e-07, "loss": 1.3305, "step": 1784 }, { "epoch": 0.9065515490096495, "grad_norm": 4.842685699462891, "learning_rate": 4.3219832605800093e-07, "loss": 1.4796, "step": 1785 }, { "epoch": 0.9070594210259014, "grad_norm": 5.051614284515381, "learning_rate": 4.275469591392511e-07, "loss": 1.2763, "step": 1786 }, { "epoch": 0.9075672930421533, "grad_norm": 4.732386589050293, "learning_rate": 4.229202102917818e-07, "loss": 1.3074, "step": 1787 }, { "epoch": 0.9080751650584052, "grad_norm": 4.5038251876831055, "learning_rate": 4.183180914144924e-07, "loss": 1.2547, "step": 1788 }, { "epoch": 0.9085830370746572, "grad_norm": 6.258203506469727, "learning_rate": 4.137406143429357e-07, "loss": 1.232, "step": 1789 }, { "epoch": 0.9090909090909091, "grad_norm": 4.272420883178711, "learning_rate": 4.0918779084929363e-07, "loss": 1.1927, "step": 1790 }, { "epoch": 0.909598781107161, "grad_norm": 4.761685371398926, "learning_rate": 4.0465963264234464e-07, "loss": 1.144, "step": 1791 }, { "epoch": 0.9101066531234129, "grad_norm": 6.755512237548828, "learning_rate": 4.001561513674346e-07, "loss": 1.0257, "step": 1792 }, { "epoch": 0.9106145251396648, "grad_norm": 4.225400924682617, "learning_rate": 3.956773586064455e-07, "loss": 1.2539, "step": 1793 }, { "epoch": 0.9111223971559167, "grad_norm": 6.154450416564941, "learning_rate": 3.912232658777659e-07, "loss": 1.2094, "step": 1794 }, { "epoch": 0.9116302691721686, "grad_norm": 5.3774333000183105, "learning_rate": 3.867938846362651e-07, "loss": 1.0695, "step": 1795 }, { "epoch": 0.9121381411884205, "grad_norm": 4.787917613983154, "learning_rate": 3.823892262732543e-07, "loss": 1.282, "step": 1796 }, { "epoch": 0.9126460132046724, "grad_norm": 5.000186920166016, "learning_rate": 3.780093021164677e-07, "loss": 1.3229, "step": 1797 }, { "epoch": 0.9131538852209243, "grad_norm": 5.02613639831543, "learning_rate": 3.736541234300284e-07, "loss": 1.3228, "step": 1798 }, { "epoch": 0.9136617572371762, "grad_norm": 5.472193717956543, "learning_rate": 3.693237014144191e-07, "loss": 0.9133, "step": 1799 }, { "epoch": 0.9141696292534282, "grad_norm": 4.462389945983887, "learning_rate": 3.650180472064546e-07, "loss": 1.2428, "step": 1800 }, { "epoch": 0.9146775012696801, "grad_norm": 5.741103649139404, "learning_rate": 3.607371718792541e-07, "loss": 1.1856, "step": 1801 }, { "epoch": 0.915185373285932, "grad_norm": 4.52320671081543, "learning_rate": 3.564810864422097e-07, "loss": 1.2286, "step": 1802 }, { "epoch": 0.9156932453021839, "grad_norm": 5.195258617401123, "learning_rate": 3.5224980184095926e-07, "loss": 1.2727, "step": 1803 }, { "epoch": 0.9162011173184358, "grad_norm": 5.747450351715088, "learning_rate": 3.4804332895735924e-07, "loss": 1.0976, "step": 1804 }, { "epoch": 0.9167089893346877, "grad_norm": 4.624321460723877, "learning_rate": 3.4386167860946065e-07, "loss": 1.0627, "step": 1805 }, { "epoch": 0.9172168613509396, "grad_norm": 4.598133087158203, "learning_rate": 3.3970486155146864e-07, "loss": 1.1602, "step": 1806 }, { "epoch": 0.9177247333671915, "grad_norm": 4.223515033721924, "learning_rate": 3.355728884737297e-07, "loss": 1.4767, "step": 1807 }, { "epoch": 0.9182326053834434, "grad_norm": 4.547367572784424, "learning_rate": 3.3146577000269573e-07, "loss": 0.9494, "step": 1808 }, { "epoch": 0.9187404773996953, "grad_norm": 4.53663969039917, "learning_rate": 3.2738351670089764e-07, "loss": 1.1722, "step": 1809 }, { "epoch": 0.9192483494159471, "grad_norm": 5.274203777313232, "learning_rate": 3.2332613906691846e-07, "loss": 1.0009, "step": 1810 }, { "epoch": 0.919756221432199, "grad_norm": 5.340970039367676, "learning_rate": 3.192936475353703e-07, "loss": 1.221, "step": 1811 }, { "epoch": 0.920264093448451, "grad_norm": 4.922206401824951, "learning_rate": 3.1528605247685974e-07, "loss": 1.3697, "step": 1812 }, { "epoch": 0.9207719654647029, "grad_norm": 4.887393951416016, "learning_rate": 3.113033641979679e-07, "loss": 1.3847, "step": 1813 }, { "epoch": 0.9212798374809548, "grad_norm": 5.808061599731445, "learning_rate": 3.073455929412217e-07, "loss": 1.1314, "step": 1814 }, { "epoch": 0.9217877094972067, "grad_norm": 5.183882713317871, "learning_rate": 3.034127488850669e-07, "loss": 1.2961, "step": 1815 }, { "epoch": 0.9222955815134586, "grad_norm": 5.175242900848389, "learning_rate": 2.995048421438407e-07, "loss": 1.4633, "step": 1816 }, { "epoch": 0.9228034535297105, "grad_norm": 5.085374355316162, "learning_rate": 2.956218827677526e-07, "loss": 1.1137, "step": 1817 }, { "epoch": 0.9233113255459624, "grad_norm": 4.4305267333984375, "learning_rate": 2.917638807428491e-07, "loss": 1.3338, "step": 1818 }, { "epoch": 0.9238191975622143, "grad_norm": 4.831226825714111, "learning_rate": 2.879308459909913e-07, "loss": 1.2896, "step": 1819 }, { "epoch": 0.9243270695784662, "grad_norm": 4.863157749176025, "learning_rate": 2.84122788369835e-07, "loss": 1.1717, "step": 1820 }, { "epoch": 0.9248349415947181, "grad_norm": 4.739955425262451, "learning_rate": 2.803397176727951e-07, "loss": 1.2834, "step": 1821 }, { "epoch": 0.92534281361097, "grad_norm": 5.42057466506958, "learning_rate": 2.765816436290325e-07, "loss": 1.1681, "step": 1822 }, { "epoch": 0.925850685627222, "grad_norm": 4.562961578369141, "learning_rate": 2.7284857590341715e-07, "loss": 1.1497, "step": 1823 }, { "epoch": 0.9263585576434739, "grad_norm": 4.59310245513916, "learning_rate": 2.691405240965128e-07, "loss": 1.2458, "step": 1824 }, { "epoch": 0.9268664296597258, "grad_norm": 5.141499042510986, "learning_rate": 2.6545749774454564e-07, "loss": 1.2726, "step": 1825 }, { "epoch": 0.9273743016759777, "grad_norm": 4.8967766761779785, "learning_rate": 2.617995063193812e-07, "loss": 1.1715, "step": 1826 }, { "epoch": 0.9278821736922296, "grad_norm": 5.117003440856934, "learning_rate": 2.581665592285054e-07, "loss": 1.1912, "step": 1827 }, { "epoch": 0.9283900457084815, "grad_norm": 4.560568809509277, "learning_rate": 2.5455866581499234e-07, "loss": 1.2921, "step": 1828 }, { "epoch": 0.9288979177247334, "grad_norm": 5.413818359375, "learning_rate": 2.5097583535748424e-07, "loss": 1.374, "step": 1829 }, { "epoch": 0.9294057897409853, "grad_norm": 5.509709358215332, "learning_rate": 2.4741807707017174e-07, "loss": 1.1772, "step": 1830 }, { "epoch": 0.9299136617572372, "grad_norm": 5.931797027587891, "learning_rate": 2.4388540010275906e-07, "loss": 1.05, "step": 1831 }, { "epoch": 0.9304215337734891, "grad_norm": 5.164069175720215, "learning_rate": 2.403778135404533e-07, "loss": 1.3111, "step": 1832 }, { "epoch": 0.930929405789741, "grad_norm": 4.883148670196533, "learning_rate": 2.3689532640392976e-07, "loss": 1.2181, "step": 1833 }, { "epoch": 0.9314372778059928, "grad_norm": 4.419644832611084, "learning_rate": 2.3343794764931759e-07, "loss": 1.327, "step": 1834 }, { "epoch": 0.9319451498222447, "grad_norm": 4.260985851287842, "learning_rate": 2.3000568616817432e-07, "loss": 1.3791, "step": 1835 }, { "epoch": 0.9324530218384967, "grad_norm": 4.370043754577637, "learning_rate": 2.2659855078745796e-07, "loss": 1.1643, "step": 1836 }, { "epoch": 0.9329608938547486, "grad_norm": 5.112608432769775, "learning_rate": 2.2321655026950938e-07, "loss": 1.4418, "step": 1837 }, { "epoch": 0.9334687658710005, "grad_norm": 5.211599349975586, "learning_rate": 2.1985969331203117e-07, "loss": 1.3613, "step": 1838 }, { "epoch": 0.9339766378872524, "grad_norm": 4.372023582458496, "learning_rate": 2.1652798854805756e-07, "loss": 1.4388, "step": 1839 }, { "epoch": 0.9344845099035043, "grad_norm": 4.861570835113525, "learning_rate": 2.132214445459424e-07, "loss": 1.3352, "step": 1840 }, { "epoch": 0.9349923819197562, "grad_norm": 4.848053455352783, "learning_rate": 2.0994006980933124e-07, "loss": 1.0425, "step": 1841 }, { "epoch": 0.9355002539360081, "grad_norm": 5.172409534454346, "learning_rate": 2.0668387277713474e-07, "loss": 1.0898, "step": 1842 }, { "epoch": 0.93600812595226, "grad_norm": 5.695395469665527, "learning_rate": 2.0345286182352208e-07, "loss": 1.1312, "step": 1843 }, { "epoch": 0.9365159979685119, "grad_norm": 4.555364608764648, "learning_rate": 2.0024704525788086e-07, "loss": 1.3331, "step": 1844 }, { "epoch": 0.9370238699847638, "grad_norm": 4.197620391845703, "learning_rate": 1.9706643132480828e-07, "loss": 1.3017, "step": 1845 }, { "epoch": 0.9375317420010157, "grad_norm": 4.748517036437988, "learning_rate": 1.9391102820409014e-07, "loss": 1.032, "step": 1846 }, { "epoch": 0.9380396140172677, "grad_norm": 4.709668159484863, "learning_rate": 1.9078084401066844e-07, "loss": 1.4075, "step": 1847 }, { "epoch": 0.9385474860335196, "grad_norm": 5.260051727294922, "learning_rate": 1.8767588679463377e-07, "loss": 1.0771, "step": 1848 }, { "epoch": 0.9390553580497715, "grad_norm": 4.284242630004883, "learning_rate": 1.8459616454119755e-07, "loss": 1.1278, "step": 1849 }, { "epoch": 0.9395632300660234, "grad_norm": 4.960768699645996, "learning_rate": 1.8154168517067083e-07, "loss": 1.238, "step": 1850 }, { "epoch": 0.9400711020822753, "grad_norm": 4.591121673583984, "learning_rate": 1.7851245653844994e-07, "loss": 1.2979, "step": 1851 }, { "epoch": 0.9405789740985272, "grad_norm": 4.424968242645264, "learning_rate": 1.7550848643498763e-07, "loss": 1.2654, "step": 1852 }, { "epoch": 0.9410868461147791, "grad_norm": 5.394791126251221, "learning_rate": 1.72529782585783e-07, "loss": 1.3486, "step": 1853 }, { "epoch": 0.941594718131031, "grad_norm": 6.6211137771606445, "learning_rate": 1.6957635265135165e-07, "loss": 0.9716, "step": 1854 }, { "epoch": 0.9421025901472829, "grad_norm": 5.488575458526611, "learning_rate": 1.6664820422721216e-07, "loss": 1.2914, "step": 1855 }, { "epoch": 0.9426104621635348, "grad_norm": 4.948568344116211, "learning_rate": 1.6374534484386639e-07, "loss": 1.2524, "step": 1856 }, { "epoch": 0.9431183341797867, "grad_norm": 5.84658145904541, "learning_rate": 1.6086778196677922e-07, "loss": 1.374, "step": 1857 }, { "epoch": 0.9436262061960387, "grad_norm": 4.914611339569092, "learning_rate": 1.5801552299635426e-07, "loss": 1.4766, "step": 1858 }, { "epoch": 0.9441340782122905, "grad_norm": 5.559576988220215, "learning_rate": 1.5518857526792496e-07, "loss": 1.2248, "step": 1859 }, { "epoch": 0.9446419502285424, "grad_norm": 5.238275051116943, "learning_rate": 1.523869460517291e-07, "loss": 1.3467, "step": 1860 }, { "epoch": 0.9451498222447943, "grad_norm": 5.570143699645996, "learning_rate": 1.496106425528887e-07, "loss": 1.4734, "step": 1861 }, { "epoch": 0.9456576942610462, "grad_norm": 4.6389594078063965, "learning_rate": 1.4685967191139462e-07, "loss": 1.4379, "step": 1862 }, { "epoch": 0.9461655662772981, "grad_norm": 5.283039569854736, "learning_rate": 1.4413404120209084e-07, "loss": 1.3353, "step": 1863 }, { "epoch": 0.94667343829355, "grad_norm": 4.294154644012451, "learning_rate": 1.4143375743465026e-07, "loss": 1.2644, "step": 1864 }, { "epoch": 0.9471813103098019, "grad_norm": 4.243343353271484, "learning_rate": 1.3875882755356008e-07, "loss": 1.4211, "step": 1865 }, { "epoch": 0.9476891823260538, "grad_norm": 6.04444694519043, "learning_rate": 1.3610925843810408e-07, "loss": 1.1202, "step": 1866 }, { "epoch": 0.9481970543423057, "grad_norm": 5.779665946960449, "learning_rate": 1.334850569023438e-07, "loss": 1.0796, "step": 1867 }, { "epoch": 0.9487049263585576, "grad_norm": 5.141113758087158, "learning_rate": 1.3088622969509856e-07, "loss": 1.4375, "step": 1868 }, { "epoch": 0.9492127983748095, "grad_norm": 5.844571113586426, "learning_rate": 1.2831278349993536e-07, "loss": 1.1614, "step": 1869 }, { "epoch": 0.9497206703910615, "grad_norm": 4.52020263671875, "learning_rate": 1.2576472493514567e-07, "loss": 1.4625, "step": 1870 }, { "epoch": 0.9502285424073134, "grad_norm": 6.979033470153809, "learning_rate": 1.2324206055372878e-07, "loss": 1.1936, "step": 1871 }, { "epoch": 0.9507364144235653, "grad_norm": 5.365548133850098, "learning_rate": 1.2074479684337836e-07, "loss": 1.4185, "step": 1872 }, { "epoch": 0.9512442864398172, "grad_norm": 4.7730021476745605, "learning_rate": 1.1827294022646152e-07, "loss": 1.1636, "step": 1873 }, { "epoch": 0.9517521584560691, "grad_norm": 5.19934606552124, "learning_rate": 1.1582649706000537e-07, "loss": 1.1086, "step": 1874 }, { "epoch": 0.952260030472321, "grad_norm": 4.595972061157227, "learning_rate": 1.1340547363567932e-07, "loss": 1.1832, "step": 1875 }, { "epoch": 0.9527679024885729, "grad_norm": 5.226284980773926, "learning_rate": 1.1100987617978065e-07, "loss": 1.1639, "step": 1876 }, { "epoch": 0.9532757745048248, "grad_norm": 5.403947830200195, "learning_rate": 1.0863971085321334e-07, "loss": 1.3828, "step": 1877 }, { "epoch": 0.9537836465210767, "grad_norm": 4.61500358581543, "learning_rate": 1.0629498375148151e-07, "loss": 1.2726, "step": 1878 }, { "epoch": 0.9542915185373286, "grad_norm": 4.211429595947266, "learning_rate": 1.0397570090466491e-07, "loss": 1.3503, "step": 1879 }, { "epoch": 0.9547993905535805, "grad_norm": 5.15771484375, "learning_rate": 1.0168186827740567e-07, "loss": 1.2994, "step": 1880 }, { "epoch": 0.9553072625698324, "grad_norm": 4.26914644241333, "learning_rate": 9.941349176889492e-08, "loss": 1.3755, "step": 1881 }, { "epoch": 0.9558151345860844, "grad_norm": 4.972912311553955, "learning_rate": 9.71705772128606e-08, "loss": 1.2771, "step": 1882 }, { "epoch": 0.9563230066023362, "grad_norm": 7.042515754699707, "learning_rate": 9.495313037754195e-08, "loss": 1.2627, "step": 1883 }, { "epoch": 0.9568308786185881, "grad_norm": 6.592982769012451, "learning_rate": 9.2761156965685e-08, "loss": 1.3186, "step": 1884 }, { "epoch": 0.95733875063484, "grad_norm": 7.000082969665527, "learning_rate": 9.059466261452598e-08, "loss": 0.9216, "step": 1885 }, { "epoch": 0.9578466226510919, "grad_norm": 4.61784029006958, "learning_rate": 8.845365289577023e-08, "loss": 1.0449, "step": 1886 }, { "epoch": 0.9583544946673438, "grad_norm": 4.5391740798950195, "learning_rate": 8.633813331558549e-08, "loss": 1.383, "step": 1887 }, { "epoch": 0.9588623666835957, "grad_norm": 6.534933090209961, "learning_rate": 8.424810931458638e-08, "loss": 0.813, "step": 1888 }, { "epoch": 0.9593702386998476, "grad_norm": 5.744078636169434, "learning_rate": 8.218358626781663e-08, "loss": 1.2413, "step": 1889 }, { "epoch": 0.9598781107160995, "grad_norm": 5.97611141204834, "learning_rate": 8.014456948473692e-08, "loss": 1.0117, "step": 1890 }, { "epoch": 0.9603859827323514, "grad_norm": 6.729101657867432, "learning_rate": 7.81310642092159e-08, "loss": 1.0957, "step": 1891 }, { "epoch": 0.9608938547486033, "grad_norm": 4.487152576446533, "learning_rate": 7.614307561950585e-08, "loss": 1.4766, "step": 1892 }, { "epoch": 0.9614017267648552, "grad_norm": 5.25182580947876, "learning_rate": 7.418060882824485e-08, "loss": 1.3268, "step": 1893 }, { "epoch": 0.9619095987811072, "grad_norm": 5.395699501037598, "learning_rate": 7.224366888242685e-08, "loss": 1.2018, "step": 1894 }, { "epoch": 0.9624174707973591, "grad_norm": 6.704184055328369, "learning_rate": 7.033226076340382e-08, "loss": 1.1289, "step": 1895 }, { "epoch": 0.962925342813611, "grad_norm": 4.314540386199951, "learning_rate": 6.844638938685921e-08, "loss": 1.3382, "step": 1896 }, { "epoch": 0.9634332148298629, "grad_norm": 5.0664381980896, "learning_rate": 6.658605960280785e-08, "loss": 1.1835, "step": 1897 }, { "epoch": 0.9639410868461148, "grad_norm": 4.805373191833496, "learning_rate": 6.475127619557376e-08, "loss": 1.0936, "step": 1898 }, { "epoch": 0.9644489588623667, "grad_norm": 4.490488529205322, "learning_rate": 6.294204388378467e-08, "loss": 1.2735, "step": 1899 }, { "epoch": 0.9649568308786186, "grad_norm": 5.072920799255371, "learning_rate": 6.115836732035418e-08, "loss": 1.2059, "step": 1900 }, { "epoch": 0.9654647028948705, "grad_norm": 4.6457390785217285, "learning_rate": 5.940025109247516e-08, "loss": 1.1965, "step": 1901 }, { "epoch": 0.9659725749111224, "grad_norm": 5.324655055999756, "learning_rate": 5.766769972160302e-08, "loss": 1.3515, "step": 1902 }, { "epoch": 0.9664804469273743, "grad_norm": 4.400649547576904, "learning_rate": 5.596071766344802e-08, "loss": 1.2527, "step": 1903 }, { "epoch": 0.9669883189436262, "grad_norm": 4.642717361450195, "learning_rate": 5.427930930796188e-08, "loss": 1.2362, "step": 1904 }, { "epoch": 0.9674961909598782, "grad_norm": 5.073624134063721, "learning_rate": 5.2623478979326736e-08, "loss": 1.2601, "step": 1905 }, { "epoch": 0.9680040629761301, "grad_norm": 4.679571151733398, "learning_rate": 5.09932309359451e-08, "loss": 0.9734, "step": 1906 }, { "epoch": 0.968511934992382, "grad_norm": 4.934381484985352, "learning_rate": 4.9388569370424354e-08, "loss": 1.1064, "step": 1907 }, { "epoch": 0.9690198070086338, "grad_norm": 5.23956298828125, "learning_rate": 4.78094984095756e-08, "loss": 1.1123, "step": 1908 }, { "epoch": 0.9695276790248857, "grad_norm": 4.792849540710449, "learning_rate": 4.625602211439151e-08, "loss": 1.2325, "step": 1909 }, { "epoch": 0.9700355510411376, "grad_norm": 3.939530372619629, "learning_rate": 4.472814448004403e-08, "loss": 1.4759, "step": 1910 }, { "epoch": 0.9705434230573895, "grad_norm": 4.985993385314941, "learning_rate": 4.322586943587004e-08, "loss": 1.0928, "step": 1911 }, { "epoch": 0.9710512950736414, "grad_norm": 4.750078201293945, "learning_rate": 4.17492008453646e-08, "loss": 1.2729, "step": 1912 }, { "epoch": 0.9715591670898933, "grad_norm": 5.804688930511475, "learning_rate": 4.0298142506166594e-08, "loss": 1.2137, "step": 1913 }, { "epoch": 0.9720670391061452, "grad_norm": 4.8215131759643555, "learning_rate": 3.887269815005423e-08, "loss": 1.1932, "step": 1914 }, { "epoch": 0.9725749111223971, "grad_norm": 5.2581868171691895, "learning_rate": 3.747287144293066e-08, "loss": 1.3886, "step": 1915 }, { "epoch": 0.973082783138649, "grad_norm": 5.5822930335998535, "learning_rate": 3.609866598481726e-08, "loss": 1.0071, "step": 1916 }, { "epoch": 0.973590655154901, "grad_norm": 4.685520648956299, "learning_rate": 3.475008530984259e-08, "loss": 1.1638, "step": 1917 }, { "epoch": 0.9740985271711529, "grad_norm": 4.754659175872803, "learning_rate": 3.342713288623567e-08, "loss": 1.1589, "step": 1918 }, { "epoch": 0.9746063991874048, "grad_norm": 4.848739147186279, "learning_rate": 3.212981211631494e-08, "loss": 1.2732, "step": 1919 }, { "epoch": 0.9751142712036567, "grad_norm": 5.595327854156494, "learning_rate": 3.085812633648155e-08, "loss": 1.1356, "step": 1920 }, { "epoch": 0.9756221432199086, "grad_norm": 4.891111373901367, "learning_rate": 2.96120788172094e-08, "loss": 1.3382, "step": 1921 }, { "epoch": 0.9761300152361605, "grad_norm": 5.571377754211426, "learning_rate": 2.8391672763034005e-08, "loss": 1.2959, "step": 1922 }, { "epoch": 0.9766378872524124, "grad_norm": 4.298384189605713, "learning_rate": 2.719691131255031e-08, "loss": 1.3944, "step": 1923 }, { "epoch": 0.9771457592686643, "grad_norm": 4.778082847595215, "learning_rate": 2.6027797538401565e-08, "loss": 1.1293, "step": 1924 }, { "epoch": 0.9776536312849162, "grad_norm": 6.3270111083984375, "learning_rate": 2.4884334447268234e-08, "loss": 1.1693, "step": 1925 }, { "epoch": 0.9781615033011681, "grad_norm": 6.495800018310547, "learning_rate": 2.3766524979865757e-08, "loss": 1.218, "step": 1926 }, { "epoch": 0.97866937531742, "grad_norm": 4.57327127456665, "learning_rate": 2.2674372010935696e-08, "loss": 1.399, "step": 1927 }, { "epoch": 0.979177247333672, "grad_norm": 4.237867832183838, "learning_rate": 2.1607878349234614e-08, "loss": 1.151, "step": 1928 }, { "epoch": 0.9796851193499239, "grad_norm": 5.3051838874816895, "learning_rate": 2.056704673752963e-08, "loss": 0.9802, "step": 1929 }, { "epoch": 0.9801929913661758, "grad_norm": 6.228868007659912, "learning_rate": 1.955187985259288e-08, "loss": 1.3583, "step": 1930 }, { "epoch": 0.9807008633824277, "grad_norm": 5.250242233276367, "learning_rate": 1.856238030519153e-08, "loss": 1.231, "step": 1931 }, { "epoch": 0.9812087353986796, "grad_norm": 5.9555158615112305, "learning_rate": 1.75985506400822e-08, "loss": 0.95, "step": 1932 }, { "epoch": 0.9817166074149314, "grad_norm": 5.453784942626953, "learning_rate": 1.6660393336007665e-08, "loss": 1.1742, "step": 1933 }, { "epoch": 0.9822244794311833, "grad_norm": 4.261223793029785, "learning_rate": 1.5747910805683496e-08, "loss": 1.2285, "step": 1934 }, { "epoch": 0.9827323514474352, "grad_norm": 5.803184986114502, "learning_rate": 1.4861105395798103e-08, "loss": 1.2187, "step": 1935 }, { "epoch": 0.9832402234636871, "grad_norm": 4.059924125671387, "learning_rate": 1.3999979387003814e-08, "loss": 1.4354, "step": 1936 }, { "epoch": 0.983748095479939, "grad_norm": 4.6581010818481445, "learning_rate": 1.3164534993912458e-08, "loss": 1.3204, "step": 1937 }, { "epoch": 0.9842559674961909, "grad_norm": 4.7350172996521, "learning_rate": 1.2354774365086475e-08, "loss": 1.2787, "step": 1938 }, { "epoch": 0.9847638395124428, "grad_norm": 5.925784587860107, "learning_rate": 1.1570699583040023e-08, "loss": 1.3501, "step": 1939 }, { "epoch": 0.9852717115286947, "grad_norm": 4.791852951049805, "learning_rate": 1.0812312664224556e-08, "loss": 1.4943, "step": 1940 }, { "epoch": 0.9857795835449467, "grad_norm": 4.5251970291137695, "learning_rate": 1.0079615559033251e-08, "loss": 1.513, "step": 1941 }, { "epoch": 0.9862874555611986, "grad_norm": 5.248088359832764, "learning_rate": 9.372610151785477e-09, "loss": 1.335, "step": 1942 }, { "epoch": 0.9867953275774505, "grad_norm": 4.421478748321533, "learning_rate": 8.691298260733449e-09, "loss": 1.3018, "step": 1943 }, { "epoch": 0.9873031995937024, "grad_norm": 6.706859588623047, "learning_rate": 8.03568163804891e-09, "loss": 1.2363, "step": 1944 }, { "epoch": 0.9878110716099543, "grad_norm": 4.88031005859375, "learning_rate": 7.4057619698197914e-09, "loss": 1.103, "step": 1945 }, { "epoch": 0.9883189436262062, "grad_norm": 4.547206401824951, "learning_rate": 6.801540876052448e-09, "loss": 1.6001, "step": 1946 }, { "epoch": 0.9888268156424581, "grad_norm": 5.272502422332764, "learning_rate": 6.223019910657213e-09, "loss": 1.3305, "step": 1947 }, { "epoch": 0.98933468765871, "grad_norm": 4.947421073913574, "learning_rate": 5.670200561453953e-09, "loss": 0.9495, "step": 1948 }, { "epoch": 0.9898425596749619, "grad_norm": 4.808927536010742, "learning_rate": 5.143084250162078e-09, "loss": 1.215, "step": 1949 }, { "epoch": 0.9903504316912138, "grad_norm": 5.339200973510742, "learning_rate": 4.641672332399427e-09, "loss": 0.9683, "step": 1950 }, { "epoch": 0.9908583037074657, "grad_norm": 4.209288597106934, "learning_rate": 4.165966097677832e-09, "loss": 1.3954, "step": 1951 }, { "epoch": 0.9913661757237177, "grad_norm": 6.088907718658447, "learning_rate": 3.7159667694008915e-09, "loss": 1.0968, "step": 1952 }, { "epoch": 0.9918740477399696, "grad_norm": 4.938228130340576, "learning_rate": 3.2916755048595374e-09, "loss": 1.1232, "step": 1953 }, { "epoch": 0.9923819197562215, "grad_norm": 4.1907525062561035, "learning_rate": 2.893093395229807e-09, "loss": 1.3027, "step": 1954 }, { "epoch": 0.9928897917724734, "grad_norm": 4.614527225494385, "learning_rate": 2.5202214655695168e-09, "loss": 1.1737, "step": 1955 }, { "epoch": 0.9933976637887253, "grad_norm": 5.144586086273193, "learning_rate": 2.1730606748182615e-09, "loss": 1.1848, "step": 1956 }, { "epoch": 0.9939055358049771, "grad_norm": 4.373411655426025, "learning_rate": 1.8516119157896416e-09, "loss": 1.313, "step": 1957 }, { "epoch": 0.994413407821229, "grad_norm": 5.278648376464844, "learning_rate": 1.5558760151734854e-09, "loss": 1.3557, "step": 1958 }, { "epoch": 0.9949212798374809, "grad_norm": 4.67554235458374, "learning_rate": 1.2858537335314058e-09, "loss": 1.2227, "step": 1959 }, { "epoch": 0.9954291518537328, "grad_norm": 5.464794635772705, "learning_rate": 1.0415457652990236e-09, "loss": 1.2804, "step": 1960 }, { "epoch": 0.9959370238699847, "grad_norm": 5.86653995513916, "learning_rate": 8.229527387748626e-10, "loss": 1.2311, "step": 1961 }, { "epoch": 0.9964448958862366, "grad_norm": 4.377933502197266, "learning_rate": 6.300752161314539e-10, "loss": 1.3215, "step": 1962 }, { "epoch": 0.9969527679024885, "grad_norm": 4.7609782218933105, "learning_rate": 4.6291369340201173e-10, "loss": 1.273, "step": 1963 }, { "epoch": 0.9974606399187405, "grad_norm": 4.997765064239502, "learning_rate": 3.214686004859857e-10, "loss": 1.3889, "step": 1964 }, { "epoch": 0.9979685119349924, "grad_norm": 4.37820291519165, "learning_rate": 2.0574030114794975e-10, "loss": 1.0622, "step": 1965 }, { "epoch": 0.9984763839512443, "grad_norm": 5.484806060791016, "learning_rate": 1.1572909301205137e-10, "loss": 1.4135, "step": 1966 }, { "epoch": 0.9989842559674962, "grad_norm": 4.66874361038208, "learning_rate": 5.143520756534237e-11, "loss": 1.3501, "step": 1967 }, { "epoch": 0.9994921279837481, "grad_norm": 4.561672687530518, "learning_rate": 1.2858810158888902e-11, "loss": 1.2979, "step": 1968 }, { "epoch": 1.0, "grad_norm": 5.26925802230835, "learning_rate": 0.0, "loss": 1.2135, "step": 1969 } ], "logging_steps": 1, "max_steps": 1969, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.689577521152e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }