{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0568493024794603, "eval_steps": 500, "global_step": 1322500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00039956495367843493, "grad_norm": 4.081777095794678, "learning_rate": 4.9996004350463214e-05, "loss": 0.6696, "step": 500 }, { "epoch": 0.0007991299073568699, "grad_norm": 4.491793632507324, "learning_rate": 4.999200870092643e-05, "loss": 0.5217, "step": 1000 }, { "epoch": 0.0011986948610353048, "grad_norm": 3.067439317703247, "learning_rate": 4.998801305138965e-05, "loss": 0.4821, "step": 1500 }, { "epoch": 0.0015982598147137397, "grad_norm": 4.151858806610107, "learning_rate": 4.998401740185287e-05, "loss": 0.4571, "step": 2000 }, { "epoch": 0.0019978247683921746, "grad_norm": 2.1439192295074463, "learning_rate": 4.998002175231608e-05, "loss": 0.4137, "step": 2500 }, { "epoch": 0.0023973897220706097, "grad_norm": 2.2253267765045166, "learning_rate": 4.997602610277929e-05, "loss": 0.4137, "step": 3000 }, { "epoch": 0.0027969546757490443, "grad_norm": 2.4990456104278564, "learning_rate": 4.9972030453242516e-05, "loss": 0.4423, "step": 3500 }, { "epoch": 0.0031965196294274794, "grad_norm": 2.4442901611328125, "learning_rate": 4.996803480370573e-05, "loss": 0.3973, "step": 4000 }, { "epoch": 0.003596084583105914, "grad_norm": 1.8715656995773315, "learning_rate": 4.9964039154168946e-05, "loss": 0.4236, "step": 4500 }, { "epoch": 0.003995649536784349, "grad_norm": 2.8356869220733643, "learning_rate": 4.996004350463216e-05, "loss": 0.3907, "step": 5000 }, { "epoch": 0.004395214490462784, "grad_norm": 2.254757881164551, "learning_rate": 4.9956047855095375e-05, "loss": 0.3749, "step": 5500 }, { "epoch": 0.004794779444141219, "grad_norm": 1.976780652999878, "learning_rate": 4.9952052205558594e-05, "loss": 0.3609, "step": 6000 }, { "epoch": 0.005194344397819654, "grad_norm": 1.6873953342437744, "learning_rate": 4.9948056556021805e-05, "loss": 0.3674, "step": 6500 }, { "epoch": 0.005593909351498089, "grad_norm": 2.2335588932037354, "learning_rate": 4.994406090648502e-05, "loss": 0.4164, "step": 7000 }, { "epoch": 0.005993474305176524, "grad_norm": 1.7810245752334595, "learning_rate": 4.9940065256948235e-05, "loss": 0.3641, "step": 7500 }, { "epoch": 0.006393039258854959, "grad_norm": 1.4673467874526978, "learning_rate": 4.993606960741145e-05, "loss": 0.3689, "step": 8000 }, { "epoch": 0.0067926042125333935, "grad_norm": 1.8147343397140503, "learning_rate": 4.993207395787467e-05, "loss": 0.3674, "step": 8500 }, { "epoch": 0.007192169166211828, "grad_norm": 1.5206788778305054, "learning_rate": 4.992807830833788e-05, "loss": 0.3502, "step": 9000 }, { "epoch": 0.007591734119890264, "grad_norm": 1.8874012231826782, "learning_rate": 4.99240826588011e-05, "loss": 0.3778, "step": 9500 }, { "epoch": 0.007991299073568698, "grad_norm": 1.3820565938949585, "learning_rate": 4.992008700926432e-05, "loss": 0.361, "step": 10000 }, { "epoch": 0.008390864027247133, "grad_norm": 1.7479816675186157, "learning_rate": 4.991609135972753e-05, "loss": 0.3509, "step": 10500 }, { "epoch": 0.008790428980925568, "grad_norm": 1.5165730714797974, "learning_rate": 4.991209571019075e-05, "loss": 0.3332, "step": 11000 }, { "epoch": 0.009189993934604002, "grad_norm": 1.8460886478424072, "learning_rate": 4.990810006065396e-05, "loss": 0.3473, "step": 11500 }, { "epoch": 0.009589558888282439, "grad_norm": 1.3894577026367188, "learning_rate": 4.990410441111718e-05, "loss": 0.333, "step": 12000 }, { "epoch": 0.009989123841960873, "grad_norm": 1.6774855852127075, "learning_rate": 4.9900108761580396e-05, "loss": 0.3538, "step": 12500 }, { "epoch": 0.010388688795639308, "grad_norm": 2.859013319015503, "learning_rate": 4.989611311204361e-05, "loss": 0.341, "step": 13000 }, { "epoch": 0.010788253749317743, "grad_norm": 1.1740492582321167, "learning_rate": 4.9892117462506826e-05, "loss": 0.3437, "step": 13500 }, { "epoch": 0.011187818702996177, "grad_norm": 2.2520759105682373, "learning_rate": 4.988812181297004e-05, "loss": 0.3506, "step": 14000 }, { "epoch": 0.011587383656674612, "grad_norm": 1.5025354623794556, "learning_rate": 4.9884126163433256e-05, "loss": 0.3397, "step": 14500 }, { "epoch": 0.011986948610353048, "grad_norm": 2.6174724102020264, "learning_rate": 4.9880130513896474e-05, "loss": 0.3464, "step": 15000 }, { "epoch": 0.012386513564031483, "grad_norm": 1.0281299352645874, "learning_rate": 4.9876134864359685e-05, "loss": 0.3256, "step": 15500 }, { "epoch": 0.012786078517709918, "grad_norm": 1.4588134288787842, "learning_rate": 4.9872139214822903e-05, "loss": 0.341, "step": 16000 }, { "epoch": 0.013185643471388352, "grad_norm": 1.1882590055465698, "learning_rate": 4.986814356528612e-05, "loss": 0.3254, "step": 16500 }, { "epoch": 0.013585208425066787, "grad_norm": 1.5440988540649414, "learning_rate": 4.986414791574933e-05, "loss": 0.3355, "step": 17000 }, { "epoch": 0.013984773378745222, "grad_norm": 1.068863868713379, "learning_rate": 4.986015226621255e-05, "loss": 0.3237, "step": 17500 }, { "epoch": 0.014384338332423656, "grad_norm": 1.562046766281128, "learning_rate": 4.985615661667576e-05, "loss": 0.32, "step": 18000 }, { "epoch": 0.014783903286102093, "grad_norm": 1.0591686964035034, "learning_rate": 4.985216096713898e-05, "loss": 0.297, "step": 18500 }, { "epoch": 0.015183468239780527, "grad_norm": 1.3485994338989258, "learning_rate": 4.98481653176022e-05, "loss": 0.3423, "step": 19000 }, { "epoch": 0.015583033193458962, "grad_norm": 1.1222705841064453, "learning_rate": 4.984416966806541e-05, "loss": 0.3387, "step": 19500 }, { "epoch": 0.015982598147137397, "grad_norm": 1.7326189279556274, "learning_rate": 4.984017401852863e-05, "loss": 0.3456, "step": 20000 }, { "epoch": 0.016382163100815833, "grad_norm": 1.6008397340774536, "learning_rate": 4.983617836899184e-05, "loss": 0.3174, "step": 20500 }, { "epoch": 0.016781728054494266, "grad_norm": 1.3519564867019653, "learning_rate": 4.983218271945506e-05, "loss": 0.3308, "step": 21000 }, { "epoch": 0.017181293008172702, "grad_norm": 1.6695245504379272, "learning_rate": 4.9828187069918277e-05, "loss": 0.3241, "step": 21500 }, { "epoch": 0.017580857961851135, "grad_norm": 1.9175039529800415, "learning_rate": 4.982419142038149e-05, "loss": 0.3076, "step": 22000 }, { "epoch": 0.01798042291552957, "grad_norm": 1.1784837245941162, "learning_rate": 4.9820195770844706e-05, "loss": 0.3421, "step": 22500 }, { "epoch": 0.018379987869208005, "grad_norm": 1.0450412034988403, "learning_rate": 4.9816200121307924e-05, "loss": 0.3397, "step": 23000 }, { "epoch": 0.01877955282288644, "grad_norm": 1.8573392629623413, "learning_rate": 4.981220447177114e-05, "loss": 0.3258, "step": 23500 }, { "epoch": 0.019179117776564877, "grad_norm": 1.0331610441207886, "learning_rate": 4.9808208822234354e-05, "loss": 0.3157, "step": 24000 }, { "epoch": 0.01957868273024331, "grad_norm": 0.9629985094070435, "learning_rate": 4.9804213172697565e-05, "loss": 0.3253, "step": 24500 }, { "epoch": 0.019978247683921747, "grad_norm": 1.3304194211959839, "learning_rate": 4.9800217523160784e-05, "loss": 0.2935, "step": 25000 }, { "epoch": 0.02037781263760018, "grad_norm": 2.783392906188965, "learning_rate": 4.9796221873624e-05, "loss": 0.3117, "step": 25500 }, { "epoch": 0.020777377591278616, "grad_norm": 2.342592239379883, "learning_rate": 4.979222622408722e-05, "loss": 0.3198, "step": 26000 }, { "epoch": 0.021176942544957052, "grad_norm": 2.9410994052886963, "learning_rate": 4.978823057455043e-05, "loss": 0.3172, "step": 26500 }, { "epoch": 0.021576507498635485, "grad_norm": 1.4278972148895264, "learning_rate": 4.978423492501364e-05, "loss": 0.3175, "step": 27000 }, { "epoch": 0.02197607245231392, "grad_norm": 0.8851290345191956, "learning_rate": 4.978023927547686e-05, "loss": 0.3112, "step": 27500 }, { "epoch": 0.022375637405992355, "grad_norm": 2.891403913497925, "learning_rate": 4.977624362594008e-05, "loss": 0.312, "step": 28000 }, { "epoch": 0.02277520235967079, "grad_norm": 1.715323805809021, "learning_rate": 4.97722479764033e-05, "loss": 0.3125, "step": 28500 }, { "epoch": 0.023174767313349224, "grad_norm": 2.172997236251831, "learning_rate": 4.976825232686651e-05, "loss": 0.3296, "step": 29000 }, { "epoch": 0.02357433226702766, "grad_norm": 0.8844888210296631, "learning_rate": 4.976425667732972e-05, "loss": 0.2967, "step": 29500 }, { "epoch": 0.023973897220706097, "grad_norm": 2.583977699279785, "learning_rate": 4.9760261027792945e-05, "loss": 0.3324, "step": 30000 }, { "epoch": 0.02437346217438453, "grad_norm": 3.477320909500122, "learning_rate": 4.975626537825616e-05, "loss": 0.3029, "step": 30500 }, { "epoch": 0.024773027128062966, "grad_norm": 1.162782073020935, "learning_rate": 4.9752269728719375e-05, "loss": 0.3043, "step": 31000 }, { "epoch": 0.0251725920817414, "grad_norm": 2.4572184085845947, "learning_rate": 4.9748274079182586e-05, "loss": 0.322, "step": 31500 }, { "epoch": 0.025572157035419835, "grad_norm": 2.8739402294158936, "learning_rate": 4.9744278429645805e-05, "loss": 0.3012, "step": 32000 }, { "epoch": 0.025971721989098268, "grad_norm": 1.3774921894073486, "learning_rate": 4.974028278010902e-05, "loss": 0.3006, "step": 32500 }, { "epoch": 0.026371286942776705, "grad_norm": 0.7296271324157715, "learning_rate": 4.9736287130572234e-05, "loss": 0.3284, "step": 33000 }, { "epoch": 0.02677085189645514, "grad_norm": 1.8096294403076172, "learning_rate": 4.973229148103545e-05, "loss": 0.3226, "step": 33500 }, { "epoch": 0.027170416850133574, "grad_norm": 1.959118127822876, "learning_rate": 4.9728295831498664e-05, "loss": 0.3387, "step": 34000 }, { "epoch": 0.02756998180381201, "grad_norm": 1.2132261991500854, "learning_rate": 4.972430018196188e-05, "loss": 0.3186, "step": 34500 }, { "epoch": 0.027969546757490443, "grad_norm": 0.7700232267379761, "learning_rate": 4.97203045324251e-05, "loss": 0.3088, "step": 35000 }, { "epoch": 0.02836911171116888, "grad_norm": 0.46602240204811096, "learning_rate": 4.971630888288831e-05, "loss": 0.3163, "step": 35500 }, { "epoch": 0.028768676664847313, "grad_norm": 0.7435968518257141, "learning_rate": 4.971231323335153e-05, "loss": 0.306, "step": 36000 }, { "epoch": 0.02916824161852575, "grad_norm": 0.33676111698150635, "learning_rate": 4.970831758381475e-05, "loss": 0.3048, "step": 36500 }, { "epoch": 0.029567806572204185, "grad_norm": 1.1742411851882935, "learning_rate": 4.970432193427796e-05, "loss": 0.2962, "step": 37000 }, { "epoch": 0.02996737152588262, "grad_norm": 1.1635487079620361, "learning_rate": 4.970032628474118e-05, "loss": 0.2903, "step": 37500 }, { "epoch": 0.030366936479561055, "grad_norm": 0.6865672469139099, "learning_rate": 4.969633063520439e-05, "loss": 0.2888, "step": 38000 }, { "epoch": 0.030766501433239488, "grad_norm": 1.2744709253311157, "learning_rate": 4.969233498566761e-05, "loss": 0.3144, "step": 38500 }, { "epoch": 0.031166066386917924, "grad_norm": 1.0660783052444458, "learning_rate": 4.9688339336130825e-05, "loss": 0.2935, "step": 39000 }, { "epoch": 0.03156563134059636, "grad_norm": 1.1280876398086548, "learning_rate": 4.968434368659404e-05, "loss": 0.3027, "step": 39500 }, { "epoch": 0.03196519629427479, "grad_norm": 0.6882218718528748, "learning_rate": 4.9680348037057255e-05, "loss": 0.2893, "step": 40000 }, { "epoch": 0.032364761247953226, "grad_norm": 1.6850461959838867, "learning_rate": 4.9676352387520466e-05, "loss": 0.3041, "step": 40500 }, { "epoch": 0.032764326201631666, "grad_norm": 2.4040000438690186, "learning_rate": 4.967235673798369e-05, "loss": 0.2896, "step": 41000 }, { "epoch": 0.0331638911553101, "grad_norm": 2.316746711730957, "learning_rate": 4.96683610884469e-05, "loss": 0.2761, "step": 41500 }, { "epoch": 0.03356345610898853, "grad_norm": 0.9685487747192383, "learning_rate": 4.9664365438910114e-05, "loss": 0.3011, "step": 42000 }, { "epoch": 0.033963021062666965, "grad_norm": 1.101892113685608, "learning_rate": 4.966036978937333e-05, "loss": 0.2995, "step": 42500 }, { "epoch": 0.034362586016345405, "grad_norm": 1.2868808507919312, "learning_rate": 4.965637413983655e-05, "loss": 0.3266, "step": 43000 }, { "epoch": 0.03476215097002384, "grad_norm": 1.8883661031723022, "learning_rate": 4.965237849029976e-05, "loss": 0.3113, "step": 43500 }, { "epoch": 0.03516171592370227, "grad_norm": 1.373059868812561, "learning_rate": 4.964838284076298e-05, "loss": 0.2803, "step": 44000 }, { "epoch": 0.03556128087738071, "grad_norm": 1.9497379064559937, "learning_rate": 4.964438719122619e-05, "loss": 0.3022, "step": 44500 }, { "epoch": 0.03596084583105914, "grad_norm": 2.0109946727752686, "learning_rate": 4.964039154168941e-05, "loss": 0.2848, "step": 45000 }, { "epoch": 0.036360410784737576, "grad_norm": 1.548153281211853, "learning_rate": 4.963639589215263e-05, "loss": 0.2881, "step": 45500 }, { "epoch": 0.03675997573841601, "grad_norm": 1.1586449146270752, "learning_rate": 4.963240024261584e-05, "loss": 0.2851, "step": 46000 }, { "epoch": 0.03715954069209445, "grad_norm": 1.688859462738037, "learning_rate": 4.962840459307906e-05, "loss": 0.3212, "step": 46500 }, { "epoch": 0.03755910564577288, "grad_norm": 2.4237589836120605, "learning_rate": 4.962440894354227e-05, "loss": 0.2786, "step": 47000 }, { "epoch": 0.037958670599451315, "grad_norm": 1.5502651929855347, "learning_rate": 4.9620413294005494e-05, "loss": 0.275, "step": 47500 }, { "epoch": 0.038358235553129755, "grad_norm": 1.7260761260986328, "learning_rate": 4.9616417644468706e-05, "loss": 0.3183, "step": 48000 }, { "epoch": 0.03875780050680819, "grad_norm": 1.006080150604248, "learning_rate": 4.961242199493192e-05, "loss": 0.2952, "step": 48500 }, { "epoch": 0.03915736546048662, "grad_norm": 1.6527464389801025, "learning_rate": 4.9608426345395135e-05, "loss": 0.3061, "step": 49000 }, { "epoch": 0.03955693041416506, "grad_norm": 1.9486868381500244, "learning_rate": 4.9604430695858353e-05, "loss": 0.2993, "step": 49500 }, { "epoch": 0.03995649536784349, "grad_norm": 0.870374321937561, "learning_rate": 4.960043504632157e-05, "loss": 0.2882, "step": 50000 }, { "epoch": 0.040356060321521926, "grad_norm": 1.175443410873413, "learning_rate": 4.959643939678478e-05, "loss": 0.3237, "step": 50500 }, { "epoch": 0.04075562527520036, "grad_norm": 1.4201246500015259, "learning_rate": 4.9592443747247994e-05, "loss": 0.2813, "step": 51000 }, { "epoch": 0.0411551902288788, "grad_norm": 2.9027390480041504, "learning_rate": 4.958844809771121e-05, "loss": 0.2978, "step": 51500 }, { "epoch": 0.04155475518255723, "grad_norm": 1.5867899656295776, "learning_rate": 4.958445244817443e-05, "loss": 0.2922, "step": 52000 }, { "epoch": 0.041954320136235665, "grad_norm": 0.8484770655632019, "learning_rate": 4.958045679863765e-05, "loss": 0.2835, "step": 52500 }, { "epoch": 0.042353885089914105, "grad_norm": 0.9785622954368591, "learning_rate": 4.957646114910086e-05, "loss": 0.3042, "step": 53000 }, { "epoch": 0.04275345004359254, "grad_norm": 0.6847981810569763, "learning_rate": 4.957246549956407e-05, "loss": 0.2867, "step": 53500 }, { "epoch": 0.04315301499727097, "grad_norm": 1.6795730590820312, "learning_rate": 4.95684698500273e-05, "loss": 0.3137, "step": 54000 }, { "epoch": 0.043552579950949404, "grad_norm": 1.331191897392273, "learning_rate": 4.956447420049051e-05, "loss": 0.2833, "step": 54500 }, { "epoch": 0.04395214490462784, "grad_norm": 2.3254575729370117, "learning_rate": 4.9560478550953726e-05, "loss": 0.2993, "step": 55000 }, { "epoch": 0.044351709858306276, "grad_norm": 1.2180935144424438, "learning_rate": 4.955648290141694e-05, "loss": 0.2718, "step": 55500 }, { "epoch": 0.04475127481198471, "grad_norm": 0.8246452808380127, "learning_rate": 4.9552487251880156e-05, "loss": 0.3209, "step": 56000 }, { "epoch": 0.04515083976566315, "grad_norm": 1.8490012884140015, "learning_rate": 4.9548491602343374e-05, "loss": 0.2959, "step": 56500 }, { "epoch": 0.04555040471934158, "grad_norm": 0.8055028319358826, "learning_rate": 4.9544495952806586e-05, "loss": 0.2909, "step": 57000 }, { "epoch": 0.045949969673020015, "grad_norm": 0.9544110894203186, "learning_rate": 4.9540500303269804e-05, "loss": 0.3048, "step": 57500 }, { "epoch": 0.04634953462669845, "grad_norm": 2.563243865966797, "learning_rate": 4.9536504653733015e-05, "loss": 0.3036, "step": 58000 }, { "epoch": 0.04674909958037689, "grad_norm": 0.7761433720588684, "learning_rate": 4.9532509004196234e-05, "loss": 0.2866, "step": 58500 }, { "epoch": 0.04714866453405532, "grad_norm": 0.9289612770080566, "learning_rate": 4.952851335465945e-05, "loss": 0.2895, "step": 59000 }, { "epoch": 0.047548229487733754, "grad_norm": 2.577202796936035, "learning_rate": 4.952451770512266e-05, "loss": 0.2895, "step": 59500 }, { "epoch": 0.04794779444141219, "grad_norm": 0.4422161281108856, "learning_rate": 4.952052205558588e-05, "loss": 0.3057, "step": 60000 }, { "epoch": 0.048347359395090626, "grad_norm": 2.4044764041900635, "learning_rate": 4.95165264060491e-05, "loss": 0.3187, "step": 60500 }, { "epoch": 0.04874692434876906, "grad_norm": 1.0988867282867432, "learning_rate": 4.951253075651231e-05, "loss": 0.2888, "step": 61000 }, { "epoch": 0.04914648930244749, "grad_norm": 1.0510305166244507, "learning_rate": 4.950853510697553e-05, "loss": 0.3001, "step": 61500 }, { "epoch": 0.04954605425612593, "grad_norm": 1.1874464750289917, "learning_rate": 4.950453945743874e-05, "loss": 0.2888, "step": 62000 }, { "epoch": 0.049945619209804365, "grad_norm": 2.0075743198394775, "learning_rate": 4.950054380790196e-05, "loss": 0.2937, "step": 62500 }, { "epoch": 0.0503451841634828, "grad_norm": 1.3025768995285034, "learning_rate": 4.949654815836518e-05, "loss": 0.2994, "step": 63000 }, { "epoch": 0.05074474911716124, "grad_norm": 1.0221538543701172, "learning_rate": 4.949255250882839e-05, "loss": 0.2928, "step": 63500 }, { "epoch": 0.05114431407083967, "grad_norm": 1.3218053579330444, "learning_rate": 4.948855685929161e-05, "loss": 0.2945, "step": 64000 }, { "epoch": 0.051543879024518104, "grad_norm": 2.152517080307007, "learning_rate": 4.948456120975482e-05, "loss": 0.2992, "step": 64500 }, { "epoch": 0.051943443978196537, "grad_norm": 1.928364634513855, "learning_rate": 4.9480565560218036e-05, "loss": 0.2925, "step": 65000 }, { "epoch": 0.052343008931874976, "grad_norm": 0.6612226366996765, "learning_rate": 4.9476569910681254e-05, "loss": 0.2855, "step": 65500 }, { "epoch": 0.05274257388555341, "grad_norm": 1.4182533025741577, "learning_rate": 4.9472574261144466e-05, "loss": 0.3048, "step": 66000 }, { "epoch": 0.05314213883923184, "grad_norm": 1.6194418668746948, "learning_rate": 4.9468578611607684e-05, "loss": 0.3041, "step": 66500 }, { "epoch": 0.05354170379291028, "grad_norm": 0.6575913429260254, "learning_rate": 4.94645829620709e-05, "loss": 0.2778, "step": 67000 }, { "epoch": 0.053941268746588715, "grad_norm": 2.433629274368286, "learning_rate": 4.946058731253412e-05, "loss": 0.2774, "step": 67500 }, { "epoch": 0.05434083370026715, "grad_norm": 0.8531148433685303, "learning_rate": 4.945659166299733e-05, "loss": 0.2724, "step": 68000 }, { "epoch": 0.05474039865394558, "grad_norm": 1.3834210634231567, "learning_rate": 4.945259601346054e-05, "loss": 0.3124, "step": 68500 }, { "epoch": 0.05513996360762402, "grad_norm": 0.9069628119468689, "learning_rate": 4.944860036392376e-05, "loss": 0.2955, "step": 69000 }, { "epoch": 0.055539528561302454, "grad_norm": 1.2405658960342407, "learning_rate": 4.944460471438698e-05, "loss": 0.282, "step": 69500 }, { "epoch": 0.05593909351498089, "grad_norm": 1.026618242263794, "learning_rate": 4.944060906485019e-05, "loss": 0.2907, "step": 70000 }, { "epoch": 0.056338658468659326, "grad_norm": 1.8501253128051758, "learning_rate": 4.943661341531341e-05, "loss": 0.3125, "step": 70500 }, { "epoch": 0.05673822342233776, "grad_norm": 1.9038240909576416, "learning_rate": 4.943261776577662e-05, "loss": 0.2955, "step": 71000 }, { "epoch": 0.05713778837601619, "grad_norm": 0.8381636738777161, "learning_rate": 4.942862211623984e-05, "loss": 0.2706, "step": 71500 }, { "epoch": 0.057537353329694625, "grad_norm": 0.8808618783950806, "learning_rate": 4.942462646670306e-05, "loss": 0.2869, "step": 72000 }, { "epoch": 0.057936918283373065, "grad_norm": 1.2757792472839355, "learning_rate": 4.942063081716627e-05, "loss": 0.2961, "step": 72500 }, { "epoch": 0.0583364832370515, "grad_norm": 1.117100477218628, "learning_rate": 4.941663516762949e-05, "loss": 0.2886, "step": 73000 }, { "epoch": 0.05873604819072993, "grad_norm": 0.7892334461212158, "learning_rate": 4.94126395180927e-05, "loss": 0.2938, "step": 73500 }, { "epoch": 0.05913561314440837, "grad_norm": 2.163651943206787, "learning_rate": 4.940864386855592e-05, "loss": 0.2722, "step": 74000 }, { "epoch": 0.059535178098086804, "grad_norm": 0.8178571462631226, "learning_rate": 4.9404648219019135e-05, "loss": 0.288, "step": 74500 }, { "epoch": 0.05993474305176524, "grad_norm": 1.1608814001083374, "learning_rate": 4.9400652569482346e-05, "loss": 0.285, "step": 75000 }, { "epoch": 0.06033430800544367, "grad_norm": 2.5505645275115967, "learning_rate": 4.9396656919945564e-05, "loss": 0.2991, "step": 75500 }, { "epoch": 0.06073387295912211, "grad_norm": 2.3172526359558105, "learning_rate": 4.939266127040878e-05, "loss": 0.3046, "step": 76000 }, { "epoch": 0.06113343791280054, "grad_norm": 0.5134784579277039, "learning_rate": 4.9388665620872e-05, "loss": 0.2826, "step": 76500 }, { "epoch": 0.061533002866478975, "grad_norm": 0.7215248346328735, "learning_rate": 4.938466997133521e-05, "loss": 0.2826, "step": 77000 }, { "epoch": 0.061932567820157415, "grad_norm": 0.6030542254447937, "learning_rate": 4.9380674321798423e-05, "loss": 0.2998, "step": 77500 }, { "epoch": 0.06233213277383585, "grad_norm": 0.6619080305099487, "learning_rate": 4.937667867226164e-05, "loss": 0.2782, "step": 78000 }, { "epoch": 0.06273169772751429, "grad_norm": 3.1303670406341553, "learning_rate": 4.937268302272486e-05, "loss": 0.2805, "step": 78500 }, { "epoch": 0.06313126268119272, "grad_norm": 1.6320053339004517, "learning_rate": 4.936868737318808e-05, "loss": 0.2732, "step": 79000 }, { "epoch": 0.06353082763487115, "grad_norm": 2.2910869121551514, "learning_rate": 4.936469172365129e-05, "loss": 0.2797, "step": 79500 }, { "epoch": 0.06393039258854959, "grad_norm": 0.8188690543174744, "learning_rate": 4.93606960741145e-05, "loss": 0.291, "step": 80000 }, { "epoch": 0.06432995754222802, "grad_norm": 2.9095993041992188, "learning_rate": 4.9356700424577726e-05, "loss": 0.2731, "step": 80500 }, { "epoch": 0.06472952249590645, "grad_norm": 0.6173179149627686, "learning_rate": 4.935270477504094e-05, "loss": 0.246, "step": 81000 }, { "epoch": 0.06512908744958489, "grad_norm": 0.9133193492889404, "learning_rate": 4.9348709125504156e-05, "loss": 0.2775, "step": 81500 }, { "epoch": 0.06552865240326333, "grad_norm": 2.138429641723633, "learning_rate": 4.934471347596737e-05, "loss": 0.2701, "step": 82000 }, { "epoch": 0.06592821735694177, "grad_norm": 0.9940409660339355, "learning_rate": 4.9340717826430585e-05, "loss": 0.2678, "step": 82500 }, { "epoch": 0.0663277823106202, "grad_norm": 3.61849308013916, "learning_rate": 4.93367221768938e-05, "loss": 0.2804, "step": 83000 }, { "epoch": 0.06672734726429863, "grad_norm": 1.651969075202942, "learning_rate": 4.9332726527357015e-05, "loss": 0.2874, "step": 83500 }, { "epoch": 0.06712691221797706, "grad_norm": 1.3269331455230713, "learning_rate": 4.932873087782023e-05, "loss": 0.2878, "step": 84000 }, { "epoch": 0.0675264771716555, "grad_norm": 0.605516254901886, "learning_rate": 4.9324735228283444e-05, "loss": 0.2813, "step": 84500 }, { "epoch": 0.06792604212533393, "grad_norm": 2.9456229209899902, "learning_rate": 4.932073957874666e-05, "loss": 0.2795, "step": 85000 }, { "epoch": 0.06832560707901238, "grad_norm": 0.4787779748439789, "learning_rate": 4.931674392920988e-05, "loss": 0.2827, "step": 85500 }, { "epoch": 0.06872517203269081, "grad_norm": 0.7174183130264282, "learning_rate": 4.931274827967309e-05, "loss": 0.2976, "step": 86000 }, { "epoch": 0.06912473698636924, "grad_norm": 0.8079970479011536, "learning_rate": 4.930875263013631e-05, "loss": 0.2896, "step": 86500 }, { "epoch": 0.06952430194004768, "grad_norm": 0.41608014702796936, "learning_rate": 4.930475698059953e-05, "loss": 0.2753, "step": 87000 }, { "epoch": 0.06992386689372611, "grad_norm": 1.2346224784851074, "learning_rate": 4.930076133106274e-05, "loss": 0.2765, "step": 87500 }, { "epoch": 0.07032343184740454, "grad_norm": 0.8600857853889465, "learning_rate": 4.929676568152596e-05, "loss": 0.2398, "step": 88000 }, { "epoch": 0.07072299680108297, "grad_norm": 1.197239637374878, "learning_rate": 4.929277003198917e-05, "loss": 0.2754, "step": 88500 }, { "epoch": 0.07112256175476142, "grad_norm": 1.4435303211212158, "learning_rate": 4.928877438245239e-05, "loss": 0.27, "step": 89000 }, { "epoch": 0.07152212670843985, "grad_norm": 0.650588870048523, "learning_rate": 4.9284778732915606e-05, "loss": 0.2779, "step": 89500 }, { "epoch": 0.07192169166211829, "grad_norm": 0.8800471425056458, "learning_rate": 4.928078308337882e-05, "loss": 0.2636, "step": 90000 }, { "epoch": 0.07232125661579672, "grad_norm": 0.733068585395813, "learning_rate": 4.9276787433842036e-05, "loss": 0.2885, "step": 90500 }, { "epoch": 0.07272082156947515, "grad_norm": 1.85322904586792, "learning_rate": 4.927279178430525e-05, "loss": 0.2651, "step": 91000 }, { "epoch": 0.07312038652315359, "grad_norm": 1.0985164642333984, "learning_rate": 4.926879613476847e-05, "loss": 0.2702, "step": 91500 }, { "epoch": 0.07351995147683202, "grad_norm": 0.6010386943817139, "learning_rate": 4.9264800485231683e-05, "loss": 0.2826, "step": 92000 }, { "epoch": 0.07391951643051047, "grad_norm": 0.8415541052818298, "learning_rate": 4.9260804835694895e-05, "loss": 0.2596, "step": 92500 }, { "epoch": 0.0743190813841889, "grad_norm": 1.9091089963912964, "learning_rate": 4.925680918615811e-05, "loss": 0.274, "step": 93000 }, { "epoch": 0.07471864633786733, "grad_norm": 0.9369784593582153, "learning_rate": 4.925281353662133e-05, "loss": 0.2836, "step": 93500 }, { "epoch": 0.07511821129154576, "grad_norm": 0.7627999782562256, "learning_rate": 4.924881788708455e-05, "loss": 0.2796, "step": 94000 }, { "epoch": 0.0755177762452242, "grad_norm": 0.5176789164543152, "learning_rate": 4.924482223754776e-05, "loss": 0.2761, "step": 94500 }, { "epoch": 0.07591734119890263, "grad_norm": 2.0211431980133057, "learning_rate": 4.924082658801097e-05, "loss": 0.294, "step": 95000 }, { "epoch": 0.07631690615258108, "grad_norm": 0.3374631106853485, "learning_rate": 4.923683093847419e-05, "loss": 0.2857, "step": 95500 }, { "epoch": 0.07671647110625951, "grad_norm": 0.46449679136276245, "learning_rate": 4.923283528893741e-05, "loss": 0.2539, "step": 96000 }, { "epoch": 0.07711603605993794, "grad_norm": 1.4599374532699585, "learning_rate": 4.922883963940063e-05, "loss": 0.2854, "step": 96500 }, { "epoch": 0.07751560101361638, "grad_norm": 1.3464281558990479, "learning_rate": 4.922484398986384e-05, "loss": 0.2695, "step": 97000 }, { "epoch": 0.07791516596729481, "grad_norm": 0.4476688802242279, "learning_rate": 4.922084834032705e-05, "loss": 0.27, "step": 97500 }, { "epoch": 0.07831473092097324, "grad_norm": 1.7551065683364868, "learning_rate": 4.9216852690790275e-05, "loss": 0.2715, "step": 98000 }, { "epoch": 0.07871429587465167, "grad_norm": 1.4756869077682495, "learning_rate": 4.9212857041253486e-05, "loss": 0.2763, "step": 98500 }, { "epoch": 0.07911386082833012, "grad_norm": 0.9867352247238159, "learning_rate": 4.92088613917167e-05, "loss": 0.2875, "step": 99000 }, { "epoch": 0.07951342578200855, "grad_norm": 2.5403356552124023, "learning_rate": 4.9204865742179916e-05, "loss": 0.2704, "step": 99500 }, { "epoch": 0.07991299073568699, "grad_norm": 1.3166043758392334, "learning_rate": 4.9200870092643134e-05, "loss": 0.2862, "step": 100000 }, { "epoch": 0.08031255568936542, "grad_norm": 0.9557125568389893, "learning_rate": 4.919687444310635e-05, "loss": 0.2827, "step": 100500 }, { "epoch": 0.08071212064304385, "grad_norm": 1.0056195259094238, "learning_rate": 4.9192878793569564e-05, "loss": 0.2685, "step": 101000 }, { "epoch": 0.08111168559672229, "grad_norm": 1.0436218976974487, "learning_rate": 4.9188883144032775e-05, "loss": 0.2737, "step": 101500 }, { "epoch": 0.08151125055040072, "grad_norm": 1.7625939846038818, "learning_rate": 4.918488749449599e-05, "loss": 0.2655, "step": 102000 }, { "epoch": 0.08191081550407917, "grad_norm": 0.4783661365509033, "learning_rate": 4.918089184495921e-05, "loss": 0.2719, "step": 102500 }, { "epoch": 0.0823103804577576, "grad_norm": 0.5397074818611145, "learning_rate": 4.917689619542243e-05, "loss": 0.2695, "step": 103000 }, { "epoch": 0.08270994541143603, "grad_norm": 0.5667609572410583, "learning_rate": 4.917290054588564e-05, "loss": 0.2731, "step": 103500 }, { "epoch": 0.08310951036511446, "grad_norm": 1.5264016389846802, "learning_rate": 4.916890489634885e-05, "loss": 0.2755, "step": 104000 }, { "epoch": 0.0835090753187929, "grad_norm": 1.0925426483154297, "learning_rate": 4.916490924681208e-05, "loss": 0.2701, "step": 104500 }, { "epoch": 0.08390864027247133, "grad_norm": 5.16488790512085, "learning_rate": 4.916091359727529e-05, "loss": 0.2611, "step": 105000 }, { "epoch": 0.08430820522614976, "grad_norm": 0.8992636799812317, "learning_rate": 4.915691794773851e-05, "loss": 0.2797, "step": 105500 }, { "epoch": 0.08470777017982821, "grad_norm": 0.7648441195487976, "learning_rate": 4.915292229820172e-05, "loss": 0.2667, "step": 106000 }, { "epoch": 0.08510733513350664, "grad_norm": 1.3748236894607544, "learning_rate": 4.914892664866494e-05, "loss": 0.2836, "step": 106500 }, { "epoch": 0.08550690008718508, "grad_norm": 1.8429991006851196, "learning_rate": 4.9144930999128155e-05, "loss": 0.268, "step": 107000 }, { "epoch": 0.08590646504086351, "grad_norm": 0.393741637468338, "learning_rate": 4.9140935349591366e-05, "loss": 0.2622, "step": 107500 }, { "epoch": 0.08630602999454194, "grad_norm": 0.8524430394172668, "learning_rate": 4.9136939700054585e-05, "loss": 0.2899, "step": 108000 }, { "epoch": 0.08670559494822037, "grad_norm": 0.6162190437316895, "learning_rate": 4.9132944050517796e-05, "loss": 0.2785, "step": 108500 }, { "epoch": 0.08710515990189881, "grad_norm": 0.6410815119743347, "learning_rate": 4.9128948400981014e-05, "loss": 0.2701, "step": 109000 }, { "epoch": 0.08750472485557725, "grad_norm": 0.64133220911026, "learning_rate": 4.912495275144423e-05, "loss": 0.2549, "step": 109500 }, { "epoch": 0.08790428980925569, "grad_norm": 0.4513498842716217, "learning_rate": 4.9120957101907444e-05, "loss": 0.2566, "step": 110000 }, { "epoch": 0.08830385476293412, "grad_norm": 0.6216118931770325, "learning_rate": 4.911696145237066e-05, "loss": 0.2724, "step": 110500 }, { "epoch": 0.08870341971661255, "grad_norm": 0.7544018030166626, "learning_rate": 4.911296580283388e-05, "loss": 0.2714, "step": 111000 }, { "epoch": 0.08910298467029099, "grad_norm": 1.9668000936508179, "learning_rate": 4.910897015329709e-05, "loss": 0.2765, "step": 111500 }, { "epoch": 0.08950254962396942, "grad_norm": 1.5050837993621826, "learning_rate": 4.910497450376031e-05, "loss": 0.2627, "step": 112000 }, { "epoch": 0.08990211457764785, "grad_norm": 3.802332639694214, "learning_rate": 4.910097885422352e-05, "loss": 0.3007, "step": 112500 }, { "epoch": 0.0903016795313263, "grad_norm": 0.8596916198730469, "learning_rate": 4.909698320468674e-05, "loss": 0.2742, "step": 113000 }, { "epoch": 0.09070124448500473, "grad_norm": 0.4422881305217743, "learning_rate": 4.909298755514996e-05, "loss": 0.2623, "step": 113500 }, { "epoch": 0.09110080943868316, "grad_norm": 1.9964492321014404, "learning_rate": 4.908899190561317e-05, "loss": 0.2569, "step": 114000 }, { "epoch": 0.0915003743923616, "grad_norm": 0.6257199048995972, "learning_rate": 4.908499625607639e-05, "loss": 0.281, "step": 114500 }, { "epoch": 0.09189993934604003, "grad_norm": 0.9926784634590149, "learning_rate": 4.90810006065396e-05, "loss": 0.2805, "step": 115000 }, { "epoch": 0.09229950429971846, "grad_norm": 5.182226181030273, "learning_rate": 4.907700495700282e-05, "loss": 0.2828, "step": 115500 }, { "epoch": 0.0926990692533969, "grad_norm": 0.8583033680915833, "learning_rate": 4.9073009307466035e-05, "loss": 0.265, "step": 116000 }, { "epoch": 0.09309863420707534, "grad_norm": 0.49972429871559143, "learning_rate": 4.9069013657929246e-05, "loss": 0.2822, "step": 116500 }, { "epoch": 0.09349819916075378, "grad_norm": 1.0498415231704712, "learning_rate": 4.9065018008392465e-05, "loss": 0.2594, "step": 117000 }, { "epoch": 0.09389776411443221, "grad_norm": 0.5320106148719788, "learning_rate": 4.9061022358855676e-05, "loss": 0.2807, "step": 117500 }, { "epoch": 0.09429732906811064, "grad_norm": 2.0761990547180176, "learning_rate": 4.90570267093189e-05, "loss": 0.2809, "step": 118000 }, { "epoch": 0.09469689402178907, "grad_norm": 0.738562285900116, "learning_rate": 4.905303105978211e-05, "loss": 0.2718, "step": 118500 }, { "epoch": 0.09509645897546751, "grad_norm": 0.8123143911361694, "learning_rate": 4.9049035410245324e-05, "loss": 0.2692, "step": 119000 }, { "epoch": 0.09549602392914594, "grad_norm": 1.9179563522338867, "learning_rate": 4.904503976070854e-05, "loss": 0.2736, "step": 119500 }, { "epoch": 0.09589558888282439, "grad_norm": 2.8562474250793457, "learning_rate": 4.904104411117176e-05, "loss": 0.2773, "step": 120000 }, { "epoch": 0.09629515383650282, "grad_norm": 1.2031207084655762, "learning_rate": 4.903704846163498e-05, "loss": 0.273, "step": 120500 }, { "epoch": 0.09669471879018125, "grad_norm": 1.009368658065796, "learning_rate": 4.903305281209819e-05, "loss": 0.2652, "step": 121000 }, { "epoch": 0.09709428374385969, "grad_norm": 0.9847525358200073, "learning_rate": 4.90290571625614e-05, "loss": 0.269, "step": 121500 }, { "epoch": 0.09749384869753812, "grad_norm": 1.3933730125427246, "learning_rate": 4.902506151302462e-05, "loss": 0.2643, "step": 122000 }, { "epoch": 0.09789341365121655, "grad_norm": 1.7241570949554443, "learning_rate": 4.902106586348784e-05, "loss": 0.2554, "step": 122500 }, { "epoch": 0.09829297860489498, "grad_norm": 0.8359851241111755, "learning_rate": 4.9017070213951056e-05, "loss": 0.2494, "step": 123000 }, { "epoch": 0.09869254355857343, "grad_norm": 2.1847476959228516, "learning_rate": 4.901307456441427e-05, "loss": 0.2843, "step": 123500 }, { "epoch": 0.09909210851225186, "grad_norm": 1.9815640449523926, "learning_rate": 4.900907891487748e-05, "loss": 0.2655, "step": 124000 }, { "epoch": 0.0994916734659303, "grad_norm": 0.5393816232681274, "learning_rate": 4.9005083265340704e-05, "loss": 0.2725, "step": 124500 }, { "epoch": 0.09989123841960873, "grad_norm": 1.4789758920669556, "learning_rate": 4.9001087615803915e-05, "loss": 0.2598, "step": 125000 }, { "epoch": 0.10029080337328716, "grad_norm": 2.4414987564086914, "learning_rate": 4.899709196626713e-05, "loss": 0.2684, "step": 125500 }, { "epoch": 0.1006903683269656, "grad_norm": 3.4420876502990723, "learning_rate": 4.8993096316730345e-05, "loss": 0.2708, "step": 126000 }, { "epoch": 0.10108993328064403, "grad_norm": 1.920907735824585, "learning_rate": 4.898910066719356e-05, "loss": 0.2536, "step": 126500 }, { "epoch": 0.10148949823432248, "grad_norm": 0.7689992785453796, "learning_rate": 4.898510501765678e-05, "loss": 0.2655, "step": 127000 }, { "epoch": 0.10188906318800091, "grad_norm": 0.9230772256851196, "learning_rate": 4.898110936811999e-05, "loss": 0.2653, "step": 127500 }, { "epoch": 0.10228862814167934, "grad_norm": 1.169212818145752, "learning_rate": 4.8977113718583204e-05, "loss": 0.2629, "step": 128000 }, { "epoch": 0.10268819309535777, "grad_norm": 0.4505156874656677, "learning_rate": 4.897311806904642e-05, "loss": 0.2738, "step": 128500 }, { "epoch": 0.10308775804903621, "grad_norm": 0.5294475555419922, "learning_rate": 4.896912241950964e-05, "loss": 0.2868, "step": 129000 }, { "epoch": 0.10348732300271464, "grad_norm": 0.9603499174118042, "learning_rate": 4.896512676997286e-05, "loss": 0.2781, "step": 129500 }, { "epoch": 0.10388688795639307, "grad_norm": 0.7294228672981262, "learning_rate": 4.896113112043607e-05, "loss": 0.2624, "step": 130000 }, { "epoch": 0.10428645291007152, "grad_norm": 0.9814987182617188, "learning_rate": 4.895713547089928e-05, "loss": 0.2725, "step": 130500 }, { "epoch": 0.10468601786374995, "grad_norm": 1.1141530275344849, "learning_rate": 4.8953139821362506e-05, "loss": 0.2727, "step": 131000 }, { "epoch": 0.10508558281742839, "grad_norm": 1.552577257156372, "learning_rate": 4.894914417182572e-05, "loss": 0.2788, "step": 131500 }, { "epoch": 0.10548514777110682, "grad_norm": 0.418449729681015, "learning_rate": 4.8945148522288936e-05, "loss": 0.271, "step": 132000 }, { "epoch": 0.10588471272478525, "grad_norm": 0.6355040073394775, "learning_rate": 4.894115287275215e-05, "loss": 0.2858, "step": 132500 }, { "epoch": 0.10628427767846368, "grad_norm": 1.603973150253296, "learning_rate": 4.8937157223215366e-05, "loss": 0.2639, "step": 133000 }, { "epoch": 0.10668384263214212, "grad_norm": 2.837867259979248, "learning_rate": 4.8933161573678584e-05, "loss": 0.2757, "step": 133500 }, { "epoch": 0.10708340758582056, "grad_norm": 2.0700223445892334, "learning_rate": 4.8929165924141795e-05, "loss": 0.2751, "step": 134000 }, { "epoch": 0.107482972539499, "grad_norm": 0.8931808471679688, "learning_rate": 4.8925170274605014e-05, "loss": 0.2673, "step": 134500 }, { "epoch": 0.10788253749317743, "grad_norm": 0.7924180030822754, "learning_rate": 4.8921174625068225e-05, "loss": 0.2704, "step": 135000 }, { "epoch": 0.10828210244685586, "grad_norm": 0.8448367118835449, "learning_rate": 4.891717897553144e-05, "loss": 0.2744, "step": 135500 }, { "epoch": 0.1086816674005343, "grad_norm": 1.242915153503418, "learning_rate": 4.891318332599466e-05, "loss": 0.2733, "step": 136000 }, { "epoch": 0.10908123235421273, "grad_norm": 0.8212218284606934, "learning_rate": 4.890918767645787e-05, "loss": 0.2681, "step": 136500 }, { "epoch": 0.10948079730789116, "grad_norm": 1.1364796161651611, "learning_rate": 4.890519202692109e-05, "loss": 0.2666, "step": 137000 }, { "epoch": 0.10988036226156961, "grad_norm": 0.5063474774360657, "learning_rate": 4.890119637738431e-05, "loss": 0.2582, "step": 137500 }, { "epoch": 0.11027992721524804, "grad_norm": 0.5450100302696228, "learning_rate": 4.889720072784752e-05, "loss": 0.2864, "step": 138000 }, { "epoch": 0.11067949216892647, "grad_norm": 1.4688063859939575, "learning_rate": 4.889320507831074e-05, "loss": 0.2749, "step": 138500 }, { "epoch": 0.11107905712260491, "grad_norm": 0.4707566499710083, "learning_rate": 4.888920942877395e-05, "loss": 0.2814, "step": 139000 }, { "epoch": 0.11147862207628334, "grad_norm": 0.5394414067268372, "learning_rate": 4.888521377923717e-05, "loss": 0.2514, "step": 139500 }, { "epoch": 0.11187818702996177, "grad_norm": 0.6359038352966309, "learning_rate": 4.888121812970039e-05, "loss": 0.2589, "step": 140000 }, { "epoch": 0.1122777519836402, "grad_norm": 1.3580596446990967, "learning_rate": 4.88772224801636e-05, "loss": 0.2752, "step": 140500 }, { "epoch": 0.11267731693731865, "grad_norm": 1.2738196849822998, "learning_rate": 4.8873226830626816e-05, "loss": 0.2735, "step": 141000 }, { "epoch": 0.11307688189099709, "grad_norm": 0.38184696435928345, "learning_rate": 4.886923118109003e-05, "loss": 0.2679, "step": 141500 }, { "epoch": 0.11347644684467552, "grad_norm": 0.6441113352775574, "learning_rate": 4.886523553155325e-05, "loss": 0.272, "step": 142000 }, { "epoch": 0.11387601179835395, "grad_norm": 0.4596845507621765, "learning_rate": 4.8861239882016464e-05, "loss": 0.2709, "step": 142500 }, { "epoch": 0.11427557675203238, "grad_norm": 2.450136423110962, "learning_rate": 4.8857244232479676e-05, "loss": 0.2609, "step": 143000 }, { "epoch": 0.11467514170571082, "grad_norm": 0.5924099683761597, "learning_rate": 4.8853248582942894e-05, "loss": 0.2897, "step": 143500 }, { "epoch": 0.11507470665938925, "grad_norm": 1.9458520412445068, "learning_rate": 4.884925293340611e-05, "loss": 0.2847, "step": 144000 }, { "epoch": 0.1154742716130677, "grad_norm": 0.39753633737564087, "learning_rate": 4.884525728386933e-05, "loss": 0.2823, "step": 144500 }, { "epoch": 0.11587383656674613, "grad_norm": 1.7055304050445557, "learning_rate": 4.884126163433254e-05, "loss": 0.2653, "step": 145000 }, { "epoch": 0.11627340152042456, "grad_norm": 0.846165120601654, "learning_rate": 4.883726598479575e-05, "loss": 0.2744, "step": 145500 }, { "epoch": 0.116672966474103, "grad_norm": 0.9437318444252014, "learning_rate": 4.883327033525897e-05, "loss": 0.2702, "step": 146000 }, { "epoch": 0.11707253142778143, "grad_norm": 0.9440346360206604, "learning_rate": 4.882927468572219e-05, "loss": 0.2642, "step": 146500 }, { "epoch": 0.11747209638145986, "grad_norm": 1.0122957229614258, "learning_rate": 4.882527903618541e-05, "loss": 0.287, "step": 147000 }, { "epoch": 0.1178716613351383, "grad_norm": 0.4052189290523529, "learning_rate": 4.882128338664862e-05, "loss": 0.2796, "step": 147500 }, { "epoch": 0.11827122628881674, "grad_norm": 2.649588108062744, "learning_rate": 4.881728773711183e-05, "loss": 0.2699, "step": 148000 }, { "epoch": 0.11867079124249517, "grad_norm": 0.8571920394897461, "learning_rate": 4.8813292087575055e-05, "loss": 0.2718, "step": 148500 }, { "epoch": 0.11907035619617361, "grad_norm": 2.511066198348999, "learning_rate": 4.880929643803827e-05, "loss": 0.2637, "step": 149000 }, { "epoch": 0.11946992114985204, "grad_norm": 0.5011186599731445, "learning_rate": 4.8805300788501485e-05, "loss": 0.2399, "step": 149500 }, { "epoch": 0.11986948610353047, "grad_norm": 1.2272385358810425, "learning_rate": 4.8801305138964696e-05, "loss": 0.2787, "step": 150000 }, { "epoch": 0.1202690510572089, "grad_norm": 0.6588722467422485, "learning_rate": 4.8797309489427915e-05, "loss": 0.252, "step": 150500 }, { "epoch": 0.12066861601088734, "grad_norm": 1.643241286277771, "learning_rate": 4.879331383989113e-05, "loss": 0.2457, "step": 151000 }, { "epoch": 0.12106818096456579, "grad_norm": 0.5973477363586426, "learning_rate": 4.8789318190354344e-05, "loss": 0.2679, "step": 151500 }, { "epoch": 0.12146774591824422, "grad_norm": 0.3648105263710022, "learning_rate": 4.878532254081756e-05, "loss": 0.269, "step": 152000 }, { "epoch": 0.12186731087192265, "grad_norm": 0.2330954521894455, "learning_rate": 4.8781326891280774e-05, "loss": 0.2613, "step": 152500 }, { "epoch": 0.12226687582560108, "grad_norm": 1.7007527351379395, "learning_rate": 4.877733124174399e-05, "loss": 0.2556, "step": 153000 }, { "epoch": 0.12266644077927952, "grad_norm": 1.3266957998275757, "learning_rate": 4.877333559220721e-05, "loss": 0.261, "step": 153500 }, { "epoch": 0.12306600573295795, "grad_norm": 1.2147257328033447, "learning_rate": 4.876933994267042e-05, "loss": 0.2581, "step": 154000 }, { "epoch": 0.1234655706866364, "grad_norm": 0.6007754802703857, "learning_rate": 4.876534429313363e-05, "loss": 0.246, "step": 154500 }, { "epoch": 0.12386513564031483, "grad_norm": 0.5737212896347046, "learning_rate": 4.876134864359686e-05, "loss": 0.2757, "step": 155000 }, { "epoch": 0.12426470059399326, "grad_norm": 0.8167622089385986, "learning_rate": 4.875735299406007e-05, "loss": 0.2823, "step": 155500 }, { "epoch": 0.1246642655476717, "grad_norm": 0.5231077671051025, "learning_rate": 4.875335734452329e-05, "loss": 0.2659, "step": 156000 }, { "epoch": 0.12506383050135014, "grad_norm": 0.2919742465019226, "learning_rate": 4.87493616949865e-05, "loss": 0.2378, "step": 156500 }, { "epoch": 0.12546339545502858, "grad_norm": 1.1036304235458374, "learning_rate": 4.874536604544972e-05, "loss": 0.2773, "step": 157000 }, { "epoch": 0.125862960408707, "grad_norm": 0.4759330749511719, "learning_rate": 4.8741370395912936e-05, "loss": 0.2748, "step": 157500 }, { "epoch": 0.12626252536238544, "grad_norm": 1.0458624362945557, "learning_rate": 4.873737474637615e-05, "loss": 0.2813, "step": 158000 }, { "epoch": 0.12666209031606387, "grad_norm": 0.3154975175857544, "learning_rate": 4.8733379096839365e-05, "loss": 0.2555, "step": 158500 }, { "epoch": 0.1270616552697423, "grad_norm": 0.5457379221916199, "learning_rate": 4.8729383447302577e-05, "loss": 0.2515, "step": 159000 }, { "epoch": 0.12746122022342074, "grad_norm": 0.6213950514793396, "learning_rate": 4.8725387797765795e-05, "loss": 0.257, "step": 159500 }, { "epoch": 0.12786078517709917, "grad_norm": 1.6012296676635742, "learning_rate": 4.872139214822901e-05, "loss": 0.2544, "step": 160000 }, { "epoch": 0.1282603501307776, "grad_norm": 0.6603101491928101, "learning_rate": 4.8717396498692224e-05, "loss": 0.2789, "step": 160500 }, { "epoch": 0.12865991508445604, "grad_norm": 0.5973492860794067, "learning_rate": 4.871340084915544e-05, "loss": 0.2643, "step": 161000 }, { "epoch": 0.12905948003813447, "grad_norm": 0.4881117343902588, "learning_rate": 4.8709405199618654e-05, "loss": 0.2859, "step": 161500 }, { "epoch": 0.1294590449918129, "grad_norm": 0.8536845445632935, "learning_rate": 4.870540955008187e-05, "loss": 0.282, "step": 162000 }, { "epoch": 0.12985860994549134, "grad_norm": 3.5950021743774414, "learning_rate": 4.870141390054509e-05, "loss": 0.247, "step": 162500 }, { "epoch": 0.13025817489916977, "grad_norm": 0.8162873983383179, "learning_rate": 4.86974182510083e-05, "loss": 0.2537, "step": 163000 }, { "epoch": 0.13065773985284823, "grad_norm": 2.889305591583252, "learning_rate": 4.869342260147152e-05, "loss": 0.269, "step": 163500 }, { "epoch": 0.13105730480652666, "grad_norm": 1.8237204551696777, "learning_rate": 4.868942695193474e-05, "loss": 0.2553, "step": 164000 }, { "epoch": 0.1314568697602051, "grad_norm": 1.2110859155654907, "learning_rate": 4.868543130239795e-05, "loss": 0.2588, "step": 164500 }, { "epoch": 0.13185643471388353, "grad_norm": 0.8366063833236694, "learning_rate": 4.868143565286117e-05, "loss": 0.268, "step": 165000 }, { "epoch": 0.13225599966756196, "grad_norm": 0.5632724165916443, "learning_rate": 4.867744000332438e-05, "loss": 0.2716, "step": 165500 }, { "epoch": 0.1326555646212404, "grad_norm": 1.4671285152435303, "learning_rate": 4.86734443537876e-05, "loss": 0.2547, "step": 166000 }, { "epoch": 0.13305512957491883, "grad_norm": 1.3317333459854126, "learning_rate": 4.8669448704250816e-05, "loss": 0.2719, "step": 166500 }, { "epoch": 0.13345469452859726, "grad_norm": 0.8617038130760193, "learning_rate": 4.866545305471403e-05, "loss": 0.2607, "step": 167000 }, { "epoch": 0.1338542594822757, "grad_norm": 1.198243498802185, "learning_rate": 4.8661457405177245e-05, "loss": 0.2676, "step": 167500 }, { "epoch": 0.13425382443595413, "grad_norm": 0.9213862419128418, "learning_rate": 4.865746175564046e-05, "loss": 0.2587, "step": 168000 }, { "epoch": 0.13465338938963256, "grad_norm": 0.5993715524673462, "learning_rate": 4.865346610610368e-05, "loss": 0.2659, "step": 168500 }, { "epoch": 0.135052954343311, "grad_norm": 0.6893083453178406, "learning_rate": 4.864947045656689e-05, "loss": 0.2582, "step": 169000 }, { "epoch": 0.13545251929698943, "grad_norm": 2.8119704723358154, "learning_rate": 4.8645474807030105e-05, "loss": 0.2706, "step": 169500 }, { "epoch": 0.13585208425066786, "grad_norm": 2.2236814498901367, "learning_rate": 4.864147915749332e-05, "loss": 0.249, "step": 170000 }, { "epoch": 0.13625164920434632, "grad_norm": 0.8960835337638855, "learning_rate": 4.863748350795654e-05, "loss": 0.2689, "step": 170500 }, { "epoch": 0.13665121415802475, "grad_norm": 0.676629364490509, "learning_rate": 4.863348785841976e-05, "loss": 0.2519, "step": 171000 }, { "epoch": 0.13705077911170319, "grad_norm": 3.36061954498291, "learning_rate": 4.862949220888297e-05, "loss": 0.2424, "step": 171500 }, { "epoch": 0.13745034406538162, "grad_norm": 3.4959826469421387, "learning_rate": 4.862549655934618e-05, "loss": 0.2646, "step": 172000 }, { "epoch": 0.13784990901906005, "grad_norm": 1.179906964302063, "learning_rate": 4.86215009098094e-05, "loss": 0.2396, "step": 172500 }, { "epoch": 0.13824947397273848, "grad_norm": 1.3416152000427246, "learning_rate": 4.861750526027262e-05, "loss": 0.2691, "step": 173000 }, { "epoch": 0.13864903892641692, "grad_norm": 1.724177360534668, "learning_rate": 4.8613509610735837e-05, "loss": 0.2544, "step": 173500 }, { "epoch": 0.13904860388009535, "grad_norm": 1.2037800550460815, "learning_rate": 4.860951396119905e-05, "loss": 0.2688, "step": 174000 }, { "epoch": 0.13944816883377378, "grad_norm": 0.8717305660247803, "learning_rate": 4.860551831166226e-05, "loss": 0.2718, "step": 174500 }, { "epoch": 0.13984773378745222, "grad_norm": 0.9631955027580261, "learning_rate": 4.8601522662125484e-05, "loss": 0.2612, "step": 175000 }, { "epoch": 0.14024729874113065, "grad_norm": 1.2285515069961548, "learning_rate": 4.8597527012588696e-05, "loss": 0.2672, "step": 175500 }, { "epoch": 0.14064686369480908, "grad_norm": 1.1481609344482422, "learning_rate": 4.8593531363051914e-05, "loss": 0.2512, "step": 176000 }, { "epoch": 0.14104642864848752, "grad_norm": 0.5289448499679565, "learning_rate": 4.8589535713515125e-05, "loss": 0.2677, "step": 176500 }, { "epoch": 0.14144599360216595, "grad_norm": 1.3932371139526367, "learning_rate": 4.8585540063978344e-05, "loss": 0.2492, "step": 177000 }, { "epoch": 0.1418455585558444, "grad_norm": 0.5333113074302673, "learning_rate": 4.858154441444156e-05, "loss": 0.2711, "step": 177500 }, { "epoch": 0.14224512350952284, "grad_norm": 0.84699946641922, "learning_rate": 4.857754876490477e-05, "loss": 0.2487, "step": 178000 }, { "epoch": 0.14264468846320127, "grad_norm": 0.5486475825309753, "learning_rate": 4.857355311536799e-05, "loss": 0.2573, "step": 178500 }, { "epoch": 0.1430442534168797, "grad_norm": 0.6944171190261841, "learning_rate": 4.85695574658312e-05, "loss": 0.2538, "step": 179000 }, { "epoch": 0.14344381837055814, "grad_norm": 0.6517600417137146, "learning_rate": 4.856556181629442e-05, "loss": 0.2732, "step": 179500 }, { "epoch": 0.14384338332423657, "grad_norm": 2.9520788192749023, "learning_rate": 4.856156616675764e-05, "loss": 0.245, "step": 180000 }, { "epoch": 0.144242948277915, "grad_norm": 0.4002814292907715, "learning_rate": 4.855757051722085e-05, "loss": 0.2668, "step": 180500 }, { "epoch": 0.14464251323159344, "grad_norm": 0.34401270747184753, "learning_rate": 4.855357486768406e-05, "loss": 0.2493, "step": 181000 }, { "epoch": 0.14504207818527187, "grad_norm": 1.2350959777832031, "learning_rate": 4.854957921814729e-05, "loss": 0.2757, "step": 181500 }, { "epoch": 0.1454416431389503, "grad_norm": 0.306682825088501, "learning_rate": 4.85455835686105e-05, "loss": 0.2612, "step": 182000 }, { "epoch": 0.14584120809262874, "grad_norm": 1.5906249284744263, "learning_rate": 4.854158791907372e-05, "loss": 0.2489, "step": 182500 }, { "epoch": 0.14624077304630717, "grad_norm": 0.5003988742828369, "learning_rate": 4.853759226953693e-05, "loss": 0.2822, "step": 183000 }, { "epoch": 0.1466403379999856, "grad_norm": 0.45837774872779846, "learning_rate": 4.8533596620000146e-05, "loss": 0.2735, "step": 183500 }, { "epoch": 0.14703990295366404, "grad_norm": 1.2391563653945923, "learning_rate": 4.8529600970463365e-05, "loss": 0.2641, "step": 184000 }, { "epoch": 0.1474394679073425, "grad_norm": 2.0245842933654785, "learning_rate": 4.8525605320926576e-05, "loss": 0.2707, "step": 184500 }, { "epoch": 0.14783903286102093, "grad_norm": 0.8621892333030701, "learning_rate": 4.8521609671389794e-05, "loss": 0.2397, "step": 185000 }, { "epoch": 0.14823859781469936, "grad_norm": 1.8652069568634033, "learning_rate": 4.8517614021853006e-05, "loss": 0.2538, "step": 185500 }, { "epoch": 0.1486381627683778, "grad_norm": 0.8554100394248962, "learning_rate": 4.8513618372316224e-05, "loss": 0.2596, "step": 186000 }, { "epoch": 0.14903772772205623, "grad_norm": 0.16713525354862213, "learning_rate": 4.850962272277944e-05, "loss": 0.2543, "step": 186500 }, { "epoch": 0.14943729267573466, "grad_norm": 1.7714667320251465, "learning_rate": 4.8505627073242653e-05, "loss": 0.2509, "step": 187000 }, { "epoch": 0.1498368576294131, "grad_norm": 0.4326224625110626, "learning_rate": 4.850163142370587e-05, "loss": 0.2632, "step": 187500 }, { "epoch": 0.15023642258309153, "grad_norm": 0.8681989312171936, "learning_rate": 4.849763577416909e-05, "loss": 0.2566, "step": 188000 }, { "epoch": 0.15063598753676996, "grad_norm": 0.4513641893863678, "learning_rate": 4.84936401246323e-05, "loss": 0.2594, "step": 188500 }, { "epoch": 0.1510355524904484, "grad_norm": 0.3568493723869324, "learning_rate": 4.848964447509552e-05, "loss": 0.2665, "step": 189000 }, { "epoch": 0.15143511744412683, "grad_norm": 0.9530199766159058, "learning_rate": 4.848564882555873e-05, "loss": 0.244, "step": 189500 }, { "epoch": 0.15183468239780526, "grad_norm": 0.3312021791934967, "learning_rate": 4.848165317602195e-05, "loss": 0.2747, "step": 190000 }, { "epoch": 0.1522342473514837, "grad_norm": 0.5856309533119202, "learning_rate": 4.847765752648517e-05, "loss": 0.2601, "step": 190500 }, { "epoch": 0.15263381230516215, "grad_norm": 0.838448166847229, "learning_rate": 4.847366187694838e-05, "loss": 0.2621, "step": 191000 }, { "epoch": 0.15303337725884059, "grad_norm": 1.1698331832885742, "learning_rate": 4.84696662274116e-05, "loss": 0.2638, "step": 191500 }, { "epoch": 0.15343294221251902, "grad_norm": 0.306469202041626, "learning_rate": 4.846567057787481e-05, "loss": 0.2634, "step": 192000 }, { "epoch": 0.15383250716619745, "grad_norm": 1.0206493139266968, "learning_rate": 4.846167492833803e-05, "loss": 0.2684, "step": 192500 }, { "epoch": 0.15423207211987588, "grad_norm": 1.9097557067871094, "learning_rate": 4.8457679278801245e-05, "loss": 0.2588, "step": 193000 }, { "epoch": 0.15463163707355432, "grad_norm": 1.5619105100631714, "learning_rate": 4.8453683629264456e-05, "loss": 0.2498, "step": 193500 }, { "epoch": 0.15503120202723275, "grad_norm": 2.021411657333374, "learning_rate": 4.8449687979727674e-05, "loss": 0.2488, "step": 194000 }, { "epoch": 0.15543076698091118, "grad_norm": 1.1985628604888916, "learning_rate": 4.844569233019089e-05, "loss": 0.2589, "step": 194500 }, { "epoch": 0.15583033193458962, "grad_norm": 0.7792631387710571, "learning_rate": 4.844169668065411e-05, "loss": 0.2624, "step": 195000 }, { "epoch": 0.15622989688826805, "grad_norm": 0.5965586304664612, "learning_rate": 4.843770103111732e-05, "loss": 0.2535, "step": 195500 }, { "epoch": 0.15662946184194648, "grad_norm": 0.6102330684661865, "learning_rate": 4.8433705381580534e-05, "loss": 0.2421, "step": 196000 }, { "epoch": 0.15702902679562492, "grad_norm": 0.3438211679458618, "learning_rate": 4.842970973204375e-05, "loss": 0.2511, "step": 196500 }, { "epoch": 0.15742859174930335, "grad_norm": 1.6085654497146606, "learning_rate": 4.842571408250697e-05, "loss": 0.2704, "step": 197000 }, { "epoch": 0.15782815670298178, "grad_norm": 1.3386743068695068, "learning_rate": 4.842171843297019e-05, "loss": 0.2576, "step": 197500 }, { "epoch": 0.15822772165666024, "grad_norm": 0.30512097477912903, "learning_rate": 4.84177227834334e-05, "loss": 0.2644, "step": 198000 }, { "epoch": 0.15862728661033867, "grad_norm": 0.7896501421928406, "learning_rate": 4.841372713389661e-05, "loss": 0.2461, "step": 198500 }, { "epoch": 0.1590268515640171, "grad_norm": 0.5531265735626221, "learning_rate": 4.8409731484359836e-05, "loss": 0.2449, "step": 199000 }, { "epoch": 0.15942641651769554, "grad_norm": 1.3540623188018799, "learning_rate": 4.840573583482305e-05, "loss": 0.268, "step": 199500 }, { "epoch": 0.15982598147137397, "grad_norm": 0.6070414185523987, "learning_rate": 4.8401740185286266e-05, "loss": 0.2692, "step": 200000 }, { "epoch": 0.1602255464250524, "grad_norm": 1.8883955478668213, "learning_rate": 4.839774453574948e-05, "loss": 0.2602, "step": 200500 }, { "epoch": 0.16062511137873084, "grad_norm": 1.316870093345642, "learning_rate": 4.8393748886212695e-05, "loss": 0.2502, "step": 201000 }, { "epoch": 0.16102467633240927, "grad_norm": 2.0436816215515137, "learning_rate": 4.8389753236675913e-05, "loss": 0.2564, "step": 201500 }, { "epoch": 0.1614242412860877, "grad_norm": 0.7767001986503601, "learning_rate": 4.8385757587139125e-05, "loss": 0.2609, "step": 202000 }, { "epoch": 0.16182380623976614, "grad_norm": 0.7199544310569763, "learning_rate": 4.838176193760234e-05, "loss": 0.2577, "step": 202500 }, { "epoch": 0.16222337119344457, "grad_norm": 0.32926321029663086, "learning_rate": 4.8377766288065554e-05, "loss": 0.2516, "step": 203000 }, { "epoch": 0.162622936147123, "grad_norm": 0.5816808342933655, "learning_rate": 4.837377063852877e-05, "loss": 0.2593, "step": 203500 }, { "epoch": 0.16302250110080144, "grad_norm": 0.887645959854126, "learning_rate": 4.836977498899199e-05, "loss": 0.2543, "step": 204000 }, { "epoch": 0.16342206605447987, "grad_norm": 0.8861106634140015, "learning_rate": 4.83657793394552e-05, "loss": 0.2433, "step": 204500 }, { "epoch": 0.16382163100815833, "grad_norm": 0.40117567777633667, "learning_rate": 4.836178368991842e-05, "loss": 0.2691, "step": 205000 }, { "epoch": 0.16422119596183676, "grad_norm": 0.8757227063179016, "learning_rate": 4.835778804038163e-05, "loss": 0.264, "step": 205500 }, { "epoch": 0.1646207609155152, "grad_norm": 0.6845428943634033, "learning_rate": 4.835379239084485e-05, "loss": 0.2622, "step": 206000 }, { "epoch": 0.16502032586919363, "grad_norm": 0.9201008677482605, "learning_rate": 4.834979674130807e-05, "loss": 0.2422, "step": 206500 }, { "epoch": 0.16541989082287206, "grad_norm": 0.40277498960494995, "learning_rate": 4.834580109177128e-05, "loss": 0.2537, "step": 207000 }, { "epoch": 0.1658194557765505, "grad_norm": 1.747119426727295, "learning_rate": 4.83418054422345e-05, "loss": 0.249, "step": 207500 }, { "epoch": 0.16621902073022893, "grad_norm": 0.6764073967933655, "learning_rate": 4.8337809792697716e-05, "loss": 0.2739, "step": 208000 }, { "epoch": 0.16661858568390736, "grad_norm": 0.42607229948043823, "learning_rate": 4.833381414316093e-05, "loss": 0.2547, "step": 208500 }, { "epoch": 0.1670181506375858, "grad_norm": 0.5080292224884033, "learning_rate": 4.8329818493624146e-05, "loss": 0.2753, "step": 209000 }, { "epoch": 0.16741771559126423, "grad_norm": 0.6160194873809814, "learning_rate": 4.832582284408736e-05, "loss": 0.2626, "step": 209500 }, { "epoch": 0.16781728054494266, "grad_norm": 0.4278533160686493, "learning_rate": 4.8321827194550575e-05, "loss": 0.2584, "step": 210000 }, { "epoch": 0.1682168454986211, "grad_norm": 0.6575314402580261, "learning_rate": 4.8317831545013794e-05, "loss": 0.2449, "step": 210500 }, { "epoch": 0.16861641045229953, "grad_norm": 0.5455963015556335, "learning_rate": 4.8313835895477005e-05, "loss": 0.265, "step": 211000 }, { "epoch": 0.16901597540597796, "grad_norm": 0.5795176029205322, "learning_rate": 4.830984024594022e-05, "loss": 0.2611, "step": 211500 }, { "epoch": 0.16941554035965642, "grad_norm": 0.6192690134048462, "learning_rate": 4.8305844596403435e-05, "loss": 0.2645, "step": 212000 }, { "epoch": 0.16981510531333485, "grad_norm": 0.576086163520813, "learning_rate": 4.830184894686665e-05, "loss": 0.2675, "step": 212500 }, { "epoch": 0.17021467026701328, "grad_norm": 0.645433247089386, "learning_rate": 4.829785329732987e-05, "loss": 0.2731, "step": 213000 }, { "epoch": 0.17061423522069172, "grad_norm": 0.4997786581516266, "learning_rate": 4.829385764779308e-05, "loss": 0.2616, "step": 213500 }, { "epoch": 0.17101380017437015, "grad_norm": 0.926995575428009, "learning_rate": 4.82898619982563e-05, "loss": 0.2518, "step": 214000 }, { "epoch": 0.17141336512804858, "grad_norm": 0.5664276480674744, "learning_rate": 4.828586634871952e-05, "loss": 0.2322, "step": 214500 }, { "epoch": 0.17181293008172702, "grad_norm": 0.2790573239326477, "learning_rate": 4.828187069918273e-05, "loss": 0.2519, "step": 215000 }, { "epoch": 0.17221249503540545, "grad_norm": 0.5118285417556763, "learning_rate": 4.827787504964595e-05, "loss": 0.2532, "step": 215500 }, { "epoch": 0.17261205998908388, "grad_norm": 0.7516388297080994, "learning_rate": 4.827387940010916e-05, "loss": 0.2446, "step": 216000 }, { "epoch": 0.17301162494276232, "grad_norm": 1.0253053903579712, "learning_rate": 4.826988375057238e-05, "loss": 0.2637, "step": 216500 }, { "epoch": 0.17341118989644075, "grad_norm": 0.5023830533027649, "learning_rate": 4.8265888101035596e-05, "loss": 0.2537, "step": 217000 }, { "epoch": 0.17381075485011918, "grad_norm": 0.3593728840351105, "learning_rate": 4.826189245149881e-05, "loss": 0.2438, "step": 217500 }, { "epoch": 0.17421031980379761, "grad_norm": 0.41701996326446533, "learning_rate": 4.8257896801962026e-05, "loss": 0.2666, "step": 218000 }, { "epoch": 0.17460988475747605, "grad_norm": 0.7942751049995422, "learning_rate": 4.825390115242524e-05, "loss": 0.2463, "step": 218500 }, { "epoch": 0.1750094497111545, "grad_norm": 2.0328922271728516, "learning_rate": 4.824990550288846e-05, "loss": 0.2549, "step": 219000 }, { "epoch": 0.17540901466483294, "grad_norm": 2.2021477222442627, "learning_rate": 4.8245909853351674e-05, "loss": 0.2637, "step": 219500 }, { "epoch": 0.17580857961851137, "grad_norm": 0.7293294072151184, "learning_rate": 4.8241914203814885e-05, "loss": 0.2399, "step": 220000 }, { "epoch": 0.1762081445721898, "grad_norm": 0.5244370102882385, "learning_rate": 4.82379185542781e-05, "loss": 0.2517, "step": 220500 }, { "epoch": 0.17660770952586824, "grad_norm": 0.5825855135917664, "learning_rate": 4.823392290474132e-05, "loss": 0.2461, "step": 221000 }, { "epoch": 0.17700727447954667, "grad_norm": 1.7435094118118286, "learning_rate": 4.822992725520454e-05, "loss": 0.2469, "step": 221500 }, { "epoch": 0.1774068394332251, "grad_norm": 1.6178573369979858, "learning_rate": 4.822593160566775e-05, "loss": 0.2483, "step": 222000 }, { "epoch": 0.17780640438690354, "grad_norm": 0.5234832167625427, "learning_rate": 4.822193595613096e-05, "loss": 0.2484, "step": 222500 }, { "epoch": 0.17820596934058197, "grad_norm": 0.7131869792938232, "learning_rate": 4.821794030659418e-05, "loss": 0.2381, "step": 223000 }, { "epoch": 0.1786055342942604, "grad_norm": 0.36050480604171753, "learning_rate": 4.82139446570574e-05, "loss": 0.2598, "step": 223500 }, { "epoch": 0.17900509924793884, "grad_norm": 0.44272276759147644, "learning_rate": 4.820994900752062e-05, "loss": 0.2523, "step": 224000 }, { "epoch": 0.17940466420161727, "grad_norm": 3.3559300899505615, "learning_rate": 4.820595335798383e-05, "loss": 0.2561, "step": 224500 }, { "epoch": 0.1798042291552957, "grad_norm": 0.5538239479064941, "learning_rate": 4.820195770844704e-05, "loss": 0.2571, "step": 225000 }, { "epoch": 0.18020379410897414, "grad_norm": 0.6692741513252258, "learning_rate": 4.8197962058910265e-05, "loss": 0.2653, "step": 225500 }, { "epoch": 0.1806033590626526, "grad_norm": 0.4119844436645508, "learning_rate": 4.8193966409373476e-05, "loss": 0.2538, "step": 226000 }, { "epoch": 0.18100292401633103, "grad_norm": 0.8069096207618713, "learning_rate": 4.8189970759836695e-05, "loss": 0.2558, "step": 226500 }, { "epoch": 0.18140248897000946, "grad_norm": 0.520562469959259, "learning_rate": 4.8185975110299906e-05, "loss": 0.2448, "step": 227000 }, { "epoch": 0.1818020539236879, "grad_norm": 1.4226702451705933, "learning_rate": 4.8181979460763124e-05, "loss": 0.2542, "step": 227500 }, { "epoch": 0.18220161887736633, "grad_norm": 0.519947350025177, "learning_rate": 4.817798381122634e-05, "loss": 0.2622, "step": 228000 }, { "epoch": 0.18260118383104476, "grad_norm": 0.370055228471756, "learning_rate": 4.8173988161689554e-05, "loss": 0.2455, "step": 228500 }, { "epoch": 0.1830007487847232, "grad_norm": 0.42935118079185486, "learning_rate": 4.816999251215277e-05, "loss": 0.2679, "step": 229000 }, { "epoch": 0.18340031373840163, "grad_norm": 0.7421344518661499, "learning_rate": 4.8165996862615984e-05, "loss": 0.2517, "step": 229500 }, { "epoch": 0.18379987869208006, "grad_norm": 0.6645961999893188, "learning_rate": 4.81620012130792e-05, "loss": 0.2566, "step": 230000 }, { "epoch": 0.1841994436457585, "grad_norm": 1.21634840965271, "learning_rate": 4.815800556354242e-05, "loss": 0.2645, "step": 230500 }, { "epoch": 0.18459900859943693, "grad_norm": 2.502849578857422, "learning_rate": 4.815400991400563e-05, "loss": 0.2648, "step": 231000 }, { "epoch": 0.18499857355311536, "grad_norm": 0.826808512210846, "learning_rate": 4.815001426446885e-05, "loss": 0.2569, "step": 231500 }, { "epoch": 0.1853981385067938, "grad_norm": 0.7136436104774475, "learning_rate": 4.814601861493207e-05, "loss": 0.2434, "step": 232000 }, { "epoch": 0.18579770346047222, "grad_norm": 0.49958473443984985, "learning_rate": 4.814202296539528e-05, "loss": 0.2402, "step": 232500 }, { "epoch": 0.18619726841415069, "grad_norm": 0.6985440254211426, "learning_rate": 4.81380273158585e-05, "loss": 0.2637, "step": 233000 }, { "epoch": 0.18659683336782912, "grad_norm": 0.4400031566619873, "learning_rate": 4.813403166632171e-05, "loss": 0.2548, "step": 233500 }, { "epoch": 0.18699639832150755, "grad_norm": 0.8190596103668213, "learning_rate": 4.813003601678493e-05, "loss": 0.2384, "step": 234000 }, { "epoch": 0.18739596327518598, "grad_norm": 0.8776618838310242, "learning_rate": 4.8126040367248145e-05, "loss": 0.2317, "step": 234500 }, { "epoch": 0.18779552822886442, "grad_norm": 0.5676888227462769, "learning_rate": 4.8122044717711357e-05, "loss": 0.2493, "step": 235000 }, { "epoch": 0.18819509318254285, "grad_norm": 0.9650105237960815, "learning_rate": 4.8118049068174575e-05, "loss": 0.2725, "step": 235500 }, { "epoch": 0.18859465813622128, "grad_norm": 0.5925421714782715, "learning_rate": 4.8114053418637786e-05, "loss": 0.2574, "step": 236000 }, { "epoch": 0.18899422308989972, "grad_norm": 0.6817251443862915, "learning_rate": 4.8110057769101004e-05, "loss": 0.2599, "step": 236500 }, { "epoch": 0.18939378804357815, "grad_norm": 1.1638318300247192, "learning_rate": 4.810606211956422e-05, "loss": 0.2441, "step": 237000 }, { "epoch": 0.18979335299725658, "grad_norm": 0.46350589394569397, "learning_rate": 4.8102066470027434e-05, "loss": 0.2551, "step": 237500 }, { "epoch": 0.19019291795093501, "grad_norm": 1.5453405380249023, "learning_rate": 4.809807082049065e-05, "loss": 0.2601, "step": 238000 }, { "epoch": 0.19059248290461345, "grad_norm": 0.8405442833900452, "learning_rate": 4.809407517095387e-05, "loss": 0.2397, "step": 238500 }, { "epoch": 0.19099204785829188, "grad_norm": 0.5699251294136047, "learning_rate": 4.809007952141708e-05, "loss": 0.2675, "step": 239000 }, { "epoch": 0.1913916128119703, "grad_norm": 0.21468545496463776, "learning_rate": 4.80860838718803e-05, "loss": 0.2488, "step": 239500 }, { "epoch": 0.19179117776564877, "grad_norm": 0.41771772503852844, "learning_rate": 4.808208822234351e-05, "loss": 0.2572, "step": 240000 }, { "epoch": 0.1921907427193272, "grad_norm": 0.3151680529117584, "learning_rate": 4.807809257280673e-05, "loss": 0.2519, "step": 240500 }, { "epoch": 0.19259030767300564, "grad_norm": 0.4753545820713043, "learning_rate": 4.807409692326995e-05, "loss": 0.2367, "step": 241000 }, { "epoch": 0.19298987262668407, "grad_norm": 0.52289217710495, "learning_rate": 4.807010127373316e-05, "loss": 0.2432, "step": 241500 }, { "epoch": 0.1933894375803625, "grad_norm": 0.42120739817619324, "learning_rate": 4.806610562419638e-05, "loss": 0.2375, "step": 242000 }, { "epoch": 0.19378900253404094, "grad_norm": 0.2503403425216675, "learning_rate": 4.806210997465959e-05, "loss": 0.2366, "step": 242500 }, { "epoch": 0.19418856748771937, "grad_norm": 0.43677234649658203, "learning_rate": 4.8058114325122814e-05, "loss": 0.2546, "step": 243000 }, { "epoch": 0.1945881324413978, "grad_norm": 0.5441431999206543, "learning_rate": 4.8054118675586025e-05, "loss": 0.2366, "step": 243500 }, { "epoch": 0.19498769739507624, "grad_norm": 0.34650084376335144, "learning_rate": 4.805012302604924e-05, "loss": 0.2524, "step": 244000 }, { "epoch": 0.19538726234875467, "grad_norm": 0.3216243088245392, "learning_rate": 4.8046127376512455e-05, "loss": 0.2424, "step": 244500 }, { "epoch": 0.1957868273024331, "grad_norm": 0.24469640851020813, "learning_rate": 4.804213172697567e-05, "loss": 0.2402, "step": 245000 }, { "epoch": 0.19618639225611154, "grad_norm": 0.9021053910255432, "learning_rate": 4.803813607743889e-05, "loss": 0.2478, "step": 245500 }, { "epoch": 0.19658595720978997, "grad_norm": 0.6636314988136292, "learning_rate": 4.80341404279021e-05, "loss": 0.2466, "step": 246000 }, { "epoch": 0.1969855221634684, "grad_norm": 0.5872536301612854, "learning_rate": 4.8030144778365314e-05, "loss": 0.2508, "step": 246500 }, { "epoch": 0.19738508711714686, "grad_norm": 0.7438320517539978, "learning_rate": 4.802614912882853e-05, "loss": 0.2482, "step": 247000 }, { "epoch": 0.1977846520708253, "grad_norm": 0.8127149939537048, "learning_rate": 4.802215347929175e-05, "loss": 0.2286, "step": 247500 }, { "epoch": 0.19818421702450373, "grad_norm": 1.0113919973373413, "learning_rate": 4.801815782975497e-05, "loss": 0.2545, "step": 248000 }, { "epoch": 0.19858378197818216, "grad_norm": 0.702934980392456, "learning_rate": 4.801416218021818e-05, "loss": 0.2393, "step": 248500 }, { "epoch": 0.1989833469318606, "grad_norm": 0.7065428495407104, "learning_rate": 4.801016653068139e-05, "loss": 0.2641, "step": 249000 }, { "epoch": 0.19938291188553903, "grad_norm": 0.7433169484138489, "learning_rate": 4.800617088114461e-05, "loss": 0.2519, "step": 249500 }, { "epoch": 0.19978247683921746, "grad_norm": 0.25298386812210083, "learning_rate": 4.800217523160783e-05, "loss": 0.2348, "step": 250000 }, { "epoch": 0.2001820417928959, "grad_norm": 0.4638431668281555, "learning_rate": 4.7998179582071046e-05, "loss": 0.2531, "step": 250500 }, { "epoch": 0.20058160674657433, "grad_norm": 1.7489734888076782, "learning_rate": 4.799418393253426e-05, "loss": 0.2459, "step": 251000 }, { "epoch": 0.20098117170025276, "grad_norm": 0.3193584382534027, "learning_rate": 4.7990188282997476e-05, "loss": 0.2442, "step": 251500 }, { "epoch": 0.2013807366539312, "grad_norm": 0.8558552265167236, "learning_rate": 4.7986192633460694e-05, "loss": 0.2429, "step": 252000 }, { "epoch": 0.20178030160760962, "grad_norm": 0.41875821352005005, "learning_rate": 4.7982196983923905e-05, "loss": 0.2397, "step": 252500 }, { "epoch": 0.20217986656128806, "grad_norm": 0.4392428696155548, "learning_rate": 4.7978201334387124e-05, "loss": 0.2589, "step": 253000 }, { "epoch": 0.20257943151496652, "grad_norm": 0.8783096671104431, "learning_rate": 4.7974205684850335e-05, "loss": 0.2586, "step": 253500 }, { "epoch": 0.20297899646864495, "grad_norm": 0.669469952583313, "learning_rate": 4.797021003531355e-05, "loss": 0.2597, "step": 254000 }, { "epoch": 0.20337856142232338, "grad_norm": 1.6815932989120483, "learning_rate": 4.796621438577677e-05, "loss": 0.2592, "step": 254500 }, { "epoch": 0.20377812637600182, "grad_norm": 1.0190411806106567, "learning_rate": 4.796221873623998e-05, "loss": 0.2506, "step": 255000 }, { "epoch": 0.20417769132968025, "grad_norm": 0.9998374581336975, "learning_rate": 4.79582230867032e-05, "loss": 0.2625, "step": 255500 }, { "epoch": 0.20457725628335868, "grad_norm": 1.0184133052825928, "learning_rate": 4.795422743716641e-05, "loss": 0.2508, "step": 256000 }, { "epoch": 0.20497682123703712, "grad_norm": 0.6427154541015625, "learning_rate": 4.795023178762963e-05, "loss": 0.238, "step": 256500 }, { "epoch": 0.20537638619071555, "grad_norm": 0.4931342303752899, "learning_rate": 4.794623613809285e-05, "loss": 0.2552, "step": 257000 }, { "epoch": 0.20577595114439398, "grad_norm": 0.5323029160499573, "learning_rate": 4.794224048855606e-05, "loss": 0.2579, "step": 257500 }, { "epoch": 0.20617551609807241, "grad_norm": 0.5676879286766052, "learning_rate": 4.793824483901928e-05, "loss": 0.2575, "step": 258000 }, { "epoch": 0.20657508105175085, "grad_norm": 1.4854947328567505, "learning_rate": 4.79342491894825e-05, "loss": 0.2579, "step": 258500 }, { "epoch": 0.20697464600542928, "grad_norm": 0.46165043115615845, "learning_rate": 4.793025353994571e-05, "loss": 0.2267, "step": 259000 }, { "epoch": 0.2073742109591077, "grad_norm": 0.6268613338470459, "learning_rate": 4.7926257890408926e-05, "loss": 0.2526, "step": 259500 }, { "epoch": 0.20777377591278615, "grad_norm": 0.9342248439788818, "learning_rate": 4.792226224087214e-05, "loss": 0.2551, "step": 260000 }, { "epoch": 0.2081733408664646, "grad_norm": 1.5980958938598633, "learning_rate": 4.7918266591335356e-05, "loss": 0.2353, "step": 260500 }, { "epoch": 0.20857290582014304, "grad_norm": 0.36618489027023315, "learning_rate": 4.7914270941798574e-05, "loss": 0.2498, "step": 261000 }, { "epoch": 0.20897247077382147, "grad_norm": 1.4446417093276978, "learning_rate": 4.7910275292261786e-05, "loss": 0.2831, "step": 261500 }, { "epoch": 0.2093720357274999, "grad_norm": 0.9858096837997437, "learning_rate": 4.7906279642725004e-05, "loss": 0.2638, "step": 262000 }, { "epoch": 0.20977160068117834, "grad_norm": 0.3587642014026642, "learning_rate": 4.7902283993188215e-05, "loss": 0.2376, "step": 262500 }, { "epoch": 0.21017116563485677, "grad_norm": 0.5613101720809937, "learning_rate": 4.789828834365144e-05, "loss": 0.2407, "step": 263000 }, { "epoch": 0.2105707305885352, "grad_norm": 0.5659090876579285, "learning_rate": 4.789429269411465e-05, "loss": 0.2633, "step": 263500 }, { "epoch": 0.21097029554221364, "grad_norm": 0.6286455392837524, "learning_rate": 4.789029704457786e-05, "loss": 0.2564, "step": 264000 }, { "epoch": 0.21136986049589207, "grad_norm": 3.0288000106811523, "learning_rate": 4.788630139504108e-05, "loss": 0.2266, "step": 264500 }, { "epoch": 0.2117694254495705, "grad_norm": 0.6014071106910706, "learning_rate": 4.78823057455043e-05, "loss": 0.2397, "step": 265000 }, { "epoch": 0.21216899040324894, "grad_norm": 1.0509755611419678, "learning_rate": 4.787831009596751e-05, "loss": 0.2475, "step": 265500 }, { "epoch": 0.21256855535692737, "grad_norm": 0.5163958072662354, "learning_rate": 4.787431444643073e-05, "loss": 0.2615, "step": 266000 }, { "epoch": 0.2129681203106058, "grad_norm": 1.0487529039382935, "learning_rate": 4.787031879689394e-05, "loss": 0.2569, "step": 266500 }, { "epoch": 0.21336768526428423, "grad_norm": 0.921088695526123, "learning_rate": 4.786632314735716e-05, "loss": 0.2445, "step": 267000 }, { "epoch": 0.2137672502179627, "grad_norm": 0.6839931011199951, "learning_rate": 4.786232749782038e-05, "loss": 0.2403, "step": 267500 }, { "epoch": 0.21416681517164113, "grad_norm": 0.5722342133522034, "learning_rate": 4.785833184828359e-05, "loss": 0.2601, "step": 268000 }, { "epoch": 0.21456638012531956, "grad_norm": 0.2592403292655945, "learning_rate": 4.7854336198746807e-05, "loss": 0.2413, "step": 268500 }, { "epoch": 0.214965945078998, "grad_norm": 0.79491126537323, "learning_rate": 4.785034054921002e-05, "loss": 0.2404, "step": 269000 }, { "epoch": 0.21536551003267643, "grad_norm": 0.7661212682723999, "learning_rate": 4.784634489967324e-05, "loss": 0.2723, "step": 269500 }, { "epoch": 0.21576507498635486, "grad_norm": 0.42561012506484985, "learning_rate": 4.7842349250136454e-05, "loss": 0.2297, "step": 270000 }, { "epoch": 0.2161646399400333, "grad_norm": 0.46819761395454407, "learning_rate": 4.7838353600599666e-05, "loss": 0.2468, "step": 270500 }, { "epoch": 0.21656420489371173, "grad_norm": 0.5279552340507507, "learning_rate": 4.7834357951062884e-05, "loss": 0.2507, "step": 271000 }, { "epoch": 0.21696376984739016, "grad_norm": 0.5666534304618835, "learning_rate": 4.78303623015261e-05, "loss": 0.2691, "step": 271500 }, { "epoch": 0.2173633348010686, "grad_norm": 0.6076895594596863, "learning_rate": 4.782636665198932e-05, "loss": 0.2632, "step": 272000 }, { "epoch": 0.21776289975474702, "grad_norm": 0.3407851457595825, "learning_rate": 4.782237100245253e-05, "loss": 0.2497, "step": 272500 }, { "epoch": 0.21816246470842546, "grad_norm": 0.36303192377090454, "learning_rate": 4.781837535291574e-05, "loss": 0.2642, "step": 273000 }, { "epoch": 0.2185620296621039, "grad_norm": 0.4088434875011444, "learning_rate": 4.781437970337896e-05, "loss": 0.2496, "step": 273500 }, { "epoch": 0.21896159461578232, "grad_norm": 1.1399136781692505, "learning_rate": 4.781038405384218e-05, "loss": 0.2417, "step": 274000 }, { "epoch": 0.21936115956946078, "grad_norm": 0.842380166053772, "learning_rate": 4.78063884043054e-05, "loss": 0.2317, "step": 274500 }, { "epoch": 0.21976072452313922, "grad_norm": 1.7542141675949097, "learning_rate": 4.780239275476861e-05, "loss": 0.2339, "step": 275000 }, { "epoch": 0.22016028947681765, "grad_norm": 0.5945953726768494, "learning_rate": 4.779839710523182e-05, "loss": 0.261, "step": 275500 }, { "epoch": 0.22055985443049608, "grad_norm": 0.5067148208618164, "learning_rate": 4.7794401455695046e-05, "loss": 0.2488, "step": 276000 }, { "epoch": 0.22095941938417452, "grad_norm": 1.366065263748169, "learning_rate": 4.779040580615826e-05, "loss": 0.2509, "step": 276500 }, { "epoch": 0.22135898433785295, "grad_norm": 1.0294042825698853, "learning_rate": 4.7786410156621475e-05, "loss": 0.22, "step": 277000 }, { "epoch": 0.22175854929153138, "grad_norm": 0.5922881960868835, "learning_rate": 4.778241450708469e-05, "loss": 0.2707, "step": 277500 }, { "epoch": 0.22215811424520981, "grad_norm": 2.0569639205932617, "learning_rate": 4.7778418857547905e-05, "loss": 0.2439, "step": 278000 }, { "epoch": 0.22255767919888825, "grad_norm": 0.7749038338661194, "learning_rate": 4.777442320801112e-05, "loss": 0.2399, "step": 278500 }, { "epoch": 0.22295724415256668, "grad_norm": 0.715955913066864, "learning_rate": 4.7770427558474335e-05, "loss": 0.2648, "step": 279000 }, { "epoch": 0.2233568091062451, "grad_norm": 0.5989009737968445, "learning_rate": 4.776643190893755e-05, "loss": 0.248, "step": 279500 }, { "epoch": 0.22375637405992355, "grad_norm": 0.7050957083702087, "learning_rate": 4.7762436259400764e-05, "loss": 0.2327, "step": 280000 }, { "epoch": 0.22415593901360198, "grad_norm": 0.322586327791214, "learning_rate": 4.775844060986398e-05, "loss": 0.2726, "step": 280500 }, { "epoch": 0.2245555039672804, "grad_norm": 0.6509853005409241, "learning_rate": 4.77544449603272e-05, "loss": 0.2543, "step": 281000 }, { "epoch": 0.22495506892095887, "grad_norm": 0.32858315110206604, "learning_rate": 4.775044931079041e-05, "loss": 0.2459, "step": 281500 }, { "epoch": 0.2253546338746373, "grad_norm": 0.40384232997894287, "learning_rate": 4.774645366125363e-05, "loss": 0.2517, "step": 282000 }, { "epoch": 0.22575419882831574, "grad_norm": 2.088918685913086, "learning_rate": 4.774245801171685e-05, "loss": 0.2463, "step": 282500 }, { "epoch": 0.22615376378199417, "grad_norm": 1.7866237163543701, "learning_rate": 4.773846236218006e-05, "loss": 0.2574, "step": 283000 }, { "epoch": 0.2265533287356726, "grad_norm": 1.6304171085357666, "learning_rate": 4.773446671264328e-05, "loss": 0.2426, "step": 283500 }, { "epoch": 0.22695289368935104, "grad_norm": 0.32767727971076965, "learning_rate": 4.773047106310649e-05, "loss": 0.26, "step": 284000 }, { "epoch": 0.22735245864302947, "grad_norm": 0.8986214399337769, "learning_rate": 4.772647541356971e-05, "loss": 0.2342, "step": 284500 }, { "epoch": 0.2277520235967079, "grad_norm": 1.1823816299438477, "learning_rate": 4.7722479764032926e-05, "loss": 0.2429, "step": 285000 }, { "epoch": 0.22815158855038634, "grad_norm": 0.4668426811695099, "learning_rate": 4.771848411449614e-05, "loss": 0.2474, "step": 285500 }, { "epoch": 0.22855115350406477, "grad_norm": 0.5970417857170105, "learning_rate": 4.7714488464959355e-05, "loss": 0.2339, "step": 286000 }, { "epoch": 0.2289507184577432, "grad_norm": 0.4400780498981476, "learning_rate": 4.771049281542257e-05, "loss": 0.2569, "step": 286500 }, { "epoch": 0.22935028341142163, "grad_norm": 0.5052193999290466, "learning_rate": 4.770649716588579e-05, "loss": 0.2658, "step": 287000 }, { "epoch": 0.22974984836510007, "grad_norm": 1.9844930171966553, "learning_rate": 4.7702501516349e-05, "loss": 0.2563, "step": 287500 }, { "epoch": 0.2301494133187785, "grad_norm": 0.5684137344360352, "learning_rate": 4.7698505866812215e-05, "loss": 0.2433, "step": 288000 }, { "epoch": 0.23054897827245696, "grad_norm": 0.3478074371814728, "learning_rate": 4.769451021727543e-05, "loss": 0.2554, "step": 288500 }, { "epoch": 0.2309485432261354, "grad_norm": 1.2719383239746094, "learning_rate": 4.769051456773865e-05, "loss": 0.2516, "step": 289000 }, { "epoch": 0.23134810817981383, "grad_norm": 0.416692852973938, "learning_rate": 4.768651891820187e-05, "loss": 0.248, "step": 289500 }, { "epoch": 0.23174767313349226, "grad_norm": 0.8336701989173889, "learning_rate": 4.768252326866508e-05, "loss": 0.2482, "step": 290000 }, { "epoch": 0.2321472380871707, "grad_norm": 0.5683593153953552, "learning_rate": 4.767852761912829e-05, "loss": 0.2462, "step": 290500 }, { "epoch": 0.23254680304084913, "grad_norm": 2.902923345565796, "learning_rate": 4.767453196959151e-05, "loss": 0.2407, "step": 291000 }, { "epoch": 0.23294636799452756, "grad_norm": 0.6277381777763367, "learning_rate": 4.767053632005473e-05, "loss": 0.2583, "step": 291500 }, { "epoch": 0.233345932948206, "grad_norm": 0.5777833461761475, "learning_rate": 4.766654067051794e-05, "loss": 0.2465, "step": 292000 }, { "epoch": 0.23374549790188442, "grad_norm": 1.024719476699829, "learning_rate": 4.766254502098116e-05, "loss": 0.2464, "step": 292500 }, { "epoch": 0.23414506285556286, "grad_norm": 0.6775665879249573, "learning_rate": 4.765854937144437e-05, "loss": 0.2319, "step": 293000 }, { "epoch": 0.2345446278092413, "grad_norm": 0.9936792850494385, "learning_rate": 4.765455372190759e-05, "loss": 0.2718, "step": 293500 }, { "epoch": 0.23494419276291972, "grad_norm": 0.693435788154602, "learning_rate": 4.7650558072370806e-05, "loss": 0.257, "step": 294000 }, { "epoch": 0.23534375771659816, "grad_norm": 0.42049896717071533, "learning_rate": 4.764656242283402e-05, "loss": 0.2303, "step": 294500 }, { "epoch": 0.2357433226702766, "grad_norm": 0.7199346423149109, "learning_rate": 4.7642566773297236e-05, "loss": 0.2428, "step": 295000 }, { "epoch": 0.23614288762395505, "grad_norm": 1.0985738039016724, "learning_rate": 4.7638571123760454e-05, "loss": 0.2608, "step": 295500 }, { "epoch": 0.23654245257763348, "grad_norm": 0.9050987958908081, "learning_rate": 4.763457547422367e-05, "loss": 0.2551, "step": 296000 }, { "epoch": 0.23694201753131192, "grad_norm": 0.8546951413154602, "learning_rate": 4.763057982468688e-05, "loss": 0.2459, "step": 296500 }, { "epoch": 0.23734158248499035, "grad_norm": 0.4461103677749634, "learning_rate": 4.7626584175150095e-05, "loss": 0.2274, "step": 297000 }, { "epoch": 0.23774114743866878, "grad_norm": 0.5028519034385681, "learning_rate": 4.762258852561331e-05, "loss": 0.244, "step": 297500 }, { "epoch": 0.23814071239234721, "grad_norm": 1.1516709327697754, "learning_rate": 4.761859287607653e-05, "loss": 0.2531, "step": 298000 }, { "epoch": 0.23854027734602565, "grad_norm": 1.1941641569137573, "learning_rate": 4.761459722653975e-05, "loss": 0.2392, "step": 298500 }, { "epoch": 0.23893984229970408, "grad_norm": 1.9855360984802246, "learning_rate": 4.761060157700296e-05, "loss": 0.2551, "step": 299000 }, { "epoch": 0.2393394072533825, "grad_norm": 1.3943493366241455, "learning_rate": 4.760660592746617e-05, "loss": 0.2209, "step": 299500 }, { "epoch": 0.23973897220706095, "grad_norm": 0.43473464250564575, "learning_rate": 4.760261027792939e-05, "loss": 0.2376, "step": 300000 }, { "epoch": 0.24013853716073938, "grad_norm": 1.0346168279647827, "learning_rate": 4.759861462839261e-05, "loss": 0.2593, "step": 300500 }, { "epoch": 0.2405381021144178, "grad_norm": 0.4280047118663788, "learning_rate": 4.759461897885583e-05, "loss": 0.2477, "step": 301000 }, { "epoch": 0.24093766706809625, "grad_norm": 0.5446356534957886, "learning_rate": 4.759062332931904e-05, "loss": 0.2533, "step": 301500 }, { "epoch": 0.24133723202177468, "grad_norm": 0.45690420269966125, "learning_rate": 4.758662767978225e-05, "loss": 0.2328, "step": 302000 }, { "epoch": 0.24173679697545314, "grad_norm": 0.8791415691375732, "learning_rate": 4.7582632030245475e-05, "loss": 0.23, "step": 302500 }, { "epoch": 0.24213636192913157, "grad_norm": 0.33641064167022705, "learning_rate": 4.7578636380708686e-05, "loss": 0.2565, "step": 303000 }, { "epoch": 0.24253592688281, "grad_norm": 0.6535727381706238, "learning_rate": 4.7574640731171904e-05, "loss": 0.2494, "step": 303500 }, { "epoch": 0.24293549183648844, "grad_norm": 2.148819923400879, "learning_rate": 4.7570645081635116e-05, "loss": 0.2495, "step": 304000 }, { "epoch": 0.24333505679016687, "grad_norm": 0.545325756072998, "learning_rate": 4.7566649432098334e-05, "loss": 0.2568, "step": 304500 }, { "epoch": 0.2437346217438453, "grad_norm": 0.558224618434906, "learning_rate": 4.756265378256155e-05, "loss": 0.258, "step": 305000 }, { "epoch": 0.24413418669752374, "grad_norm": 0.5564638376235962, "learning_rate": 4.7558658133024764e-05, "loss": 0.2451, "step": 305500 }, { "epoch": 0.24453375165120217, "grad_norm": 0.40051624178886414, "learning_rate": 4.755466248348798e-05, "loss": 0.2403, "step": 306000 }, { "epoch": 0.2449333166048806, "grad_norm": 0.739371120929718, "learning_rate": 4.755066683395119e-05, "loss": 0.253, "step": 306500 }, { "epoch": 0.24533288155855904, "grad_norm": 0.756428599357605, "learning_rate": 4.754667118441441e-05, "loss": 0.254, "step": 307000 }, { "epoch": 0.24573244651223747, "grad_norm": 0.5307125449180603, "learning_rate": 4.754267553487763e-05, "loss": 0.2611, "step": 307500 }, { "epoch": 0.2461320114659159, "grad_norm": 0.5267552733421326, "learning_rate": 4.753867988534084e-05, "loss": 0.2522, "step": 308000 }, { "epoch": 0.24653157641959433, "grad_norm": 0.5149267911911011, "learning_rate": 4.753468423580406e-05, "loss": 0.2468, "step": 308500 }, { "epoch": 0.2469311413732728, "grad_norm": 0.5041744112968445, "learning_rate": 4.753068858626728e-05, "loss": 0.2609, "step": 309000 }, { "epoch": 0.24733070632695123, "grad_norm": 0.378195196390152, "learning_rate": 4.752669293673049e-05, "loss": 0.2545, "step": 309500 }, { "epoch": 0.24773027128062966, "grad_norm": 0.3118264079093933, "learning_rate": 4.752269728719371e-05, "loss": 0.2443, "step": 310000 }, { "epoch": 0.2481298362343081, "grad_norm": 1.1351290941238403, "learning_rate": 4.751870163765692e-05, "loss": 0.2595, "step": 310500 }, { "epoch": 0.24852940118798653, "grad_norm": 0.4929637014865875, "learning_rate": 4.751470598812014e-05, "loss": 0.245, "step": 311000 }, { "epoch": 0.24892896614166496, "grad_norm": 0.6864035129547119, "learning_rate": 4.7510710338583355e-05, "loss": 0.2454, "step": 311500 }, { "epoch": 0.2493285310953434, "grad_norm": 1.7872095108032227, "learning_rate": 4.7506714689046566e-05, "loss": 0.2564, "step": 312000 }, { "epoch": 0.24972809604902182, "grad_norm": 0.44014057517051697, "learning_rate": 4.7502719039509784e-05, "loss": 0.2424, "step": 312500 }, { "epoch": 0.2501276610027003, "grad_norm": 0.46350666880607605, "learning_rate": 4.7498723389972996e-05, "loss": 0.2744, "step": 313000 }, { "epoch": 0.2505272259563787, "grad_norm": 0.8295425772666931, "learning_rate": 4.749472774043622e-05, "loss": 0.2388, "step": 313500 }, { "epoch": 0.25092679091005715, "grad_norm": 0.9226922988891602, "learning_rate": 4.749073209089943e-05, "loss": 0.2374, "step": 314000 }, { "epoch": 0.2513263558637356, "grad_norm": 0.515232264995575, "learning_rate": 4.7486736441362644e-05, "loss": 0.2451, "step": 314500 }, { "epoch": 0.251725920817414, "grad_norm": 0.4849660396575928, "learning_rate": 4.748274079182586e-05, "loss": 0.2597, "step": 315000 }, { "epoch": 0.25212548577109245, "grad_norm": 0.19826175272464752, "learning_rate": 4.747874514228908e-05, "loss": 0.2397, "step": 315500 }, { "epoch": 0.2525250507247709, "grad_norm": 1.1057624816894531, "learning_rate": 4.74747494927523e-05, "loss": 0.2448, "step": 316000 }, { "epoch": 0.2529246156784493, "grad_norm": 0.4430883526802063, "learning_rate": 4.747075384321551e-05, "loss": 0.2349, "step": 316500 }, { "epoch": 0.25332418063212775, "grad_norm": 0.3076478838920593, "learning_rate": 4.746675819367872e-05, "loss": 0.2428, "step": 317000 }, { "epoch": 0.2537237455858062, "grad_norm": 1.176853060722351, "learning_rate": 4.746276254414194e-05, "loss": 0.2502, "step": 317500 }, { "epoch": 0.2541233105394846, "grad_norm": 0.5293309092521667, "learning_rate": 4.745876689460516e-05, "loss": 0.2441, "step": 318000 }, { "epoch": 0.25452287549316305, "grad_norm": 0.5036641955375671, "learning_rate": 4.7454771245068376e-05, "loss": 0.2356, "step": 318500 }, { "epoch": 0.2549224404468415, "grad_norm": 0.4494696259498596, "learning_rate": 4.745077559553159e-05, "loss": 0.2347, "step": 319000 }, { "epoch": 0.2553220054005199, "grad_norm": 0.5436893701553345, "learning_rate": 4.74467799459948e-05, "loss": 0.25, "step": 319500 }, { "epoch": 0.25572157035419835, "grad_norm": 0.9529702663421631, "learning_rate": 4.7442784296458024e-05, "loss": 0.2482, "step": 320000 }, { "epoch": 0.2561211353078768, "grad_norm": 1.0758261680603027, "learning_rate": 4.7438788646921235e-05, "loss": 0.2498, "step": 320500 }, { "epoch": 0.2565207002615552, "grad_norm": 1.656219482421875, "learning_rate": 4.7434792997384446e-05, "loss": 0.2633, "step": 321000 }, { "epoch": 0.25692026521523365, "grad_norm": 0.9573178887367249, "learning_rate": 4.7430797347847665e-05, "loss": 0.2428, "step": 321500 }, { "epoch": 0.2573198301689121, "grad_norm": 0.46842190623283386, "learning_rate": 4.742680169831088e-05, "loss": 0.2363, "step": 322000 }, { "epoch": 0.2577193951225905, "grad_norm": 0.6962073445320129, "learning_rate": 4.74228060487741e-05, "loss": 0.2526, "step": 322500 }, { "epoch": 0.25811896007626894, "grad_norm": 0.4811634421348572, "learning_rate": 4.741881039923731e-05, "loss": 0.2202, "step": 323000 }, { "epoch": 0.2585185250299474, "grad_norm": 0.8419716954231262, "learning_rate": 4.7414814749700524e-05, "loss": 0.2462, "step": 323500 }, { "epoch": 0.2589180899836258, "grad_norm": 0.8906819224357605, "learning_rate": 4.741081910016374e-05, "loss": 0.2292, "step": 324000 }, { "epoch": 0.25931765493730424, "grad_norm": 0.3087131679058075, "learning_rate": 4.740682345062696e-05, "loss": 0.2454, "step": 324500 }, { "epoch": 0.2597172198909827, "grad_norm": 0.7023658752441406, "learning_rate": 4.740282780109018e-05, "loss": 0.2403, "step": 325000 }, { "epoch": 0.2601167848446611, "grad_norm": 3.251875638961792, "learning_rate": 4.739883215155339e-05, "loss": 0.2273, "step": 325500 }, { "epoch": 0.26051634979833954, "grad_norm": 0.4835781753063202, "learning_rate": 4.73948365020166e-05, "loss": 0.2438, "step": 326000 }, { "epoch": 0.26091591475201803, "grad_norm": 0.5597618818283081, "learning_rate": 4.7390840852479826e-05, "loss": 0.2397, "step": 326500 }, { "epoch": 0.26131547970569646, "grad_norm": 2.517216920852661, "learning_rate": 4.738684520294304e-05, "loss": 0.2465, "step": 327000 }, { "epoch": 0.2617150446593749, "grad_norm": 0.7894541025161743, "learning_rate": 4.7382849553406256e-05, "loss": 0.2464, "step": 327500 }, { "epoch": 0.26211460961305333, "grad_norm": 0.42294660210609436, "learning_rate": 4.737885390386947e-05, "loss": 0.2369, "step": 328000 }, { "epoch": 0.26251417456673176, "grad_norm": 0.961821436882019, "learning_rate": 4.7374858254332685e-05, "loss": 0.238, "step": 328500 }, { "epoch": 0.2629137395204102, "grad_norm": 0.5681935548782349, "learning_rate": 4.7370862604795904e-05, "loss": 0.2163, "step": 329000 }, { "epoch": 0.2633133044740886, "grad_norm": 0.5340489149093628, "learning_rate": 4.7366866955259115e-05, "loss": 0.2497, "step": 329500 }, { "epoch": 0.26371286942776706, "grad_norm": 0.9049243927001953, "learning_rate": 4.736287130572233e-05, "loss": 0.2447, "step": 330000 }, { "epoch": 0.2641124343814455, "grad_norm": 0.4207979440689087, "learning_rate": 4.7358875656185545e-05, "loss": 0.2585, "step": 330500 }, { "epoch": 0.2645119993351239, "grad_norm": 0.3933977782726288, "learning_rate": 4.735488000664876e-05, "loss": 0.2389, "step": 331000 }, { "epoch": 0.26491156428880236, "grad_norm": 1.6595038175582886, "learning_rate": 4.735088435711198e-05, "loss": 0.246, "step": 331500 }, { "epoch": 0.2653111292424808, "grad_norm": 0.2703982889652252, "learning_rate": 4.734688870757519e-05, "loss": 0.2545, "step": 332000 }, { "epoch": 0.2657106941961592, "grad_norm": 0.45263999700546265, "learning_rate": 4.734289305803841e-05, "loss": 0.2723, "step": 332500 }, { "epoch": 0.26611025914983766, "grad_norm": 0.7564799189567566, "learning_rate": 4.733889740850163e-05, "loss": 0.2671, "step": 333000 }, { "epoch": 0.2665098241035161, "grad_norm": 0.6471330523490906, "learning_rate": 4.733490175896484e-05, "loss": 0.2345, "step": 333500 }, { "epoch": 0.2669093890571945, "grad_norm": 0.3553636074066162, "learning_rate": 4.733090610942806e-05, "loss": 0.2388, "step": 334000 }, { "epoch": 0.26730895401087296, "grad_norm": 0.46375519037246704, "learning_rate": 4.732691045989127e-05, "loss": 0.2607, "step": 334500 }, { "epoch": 0.2677085189645514, "grad_norm": 0.43651944398880005, "learning_rate": 4.732291481035449e-05, "loss": 0.2276, "step": 335000 }, { "epoch": 0.2681080839182298, "grad_norm": 0.18579329550266266, "learning_rate": 4.7318919160817706e-05, "loss": 0.2657, "step": 335500 }, { "epoch": 0.26850764887190826, "grad_norm": 0.6353344321250916, "learning_rate": 4.731492351128092e-05, "loss": 0.2413, "step": 336000 }, { "epoch": 0.2689072138255867, "grad_norm": 0.4044117033481598, "learning_rate": 4.7310927861744136e-05, "loss": 0.2246, "step": 336500 }, { "epoch": 0.2693067787792651, "grad_norm": 1.227715015411377, "learning_rate": 4.730693221220735e-05, "loss": 0.2452, "step": 337000 }, { "epoch": 0.26970634373294355, "grad_norm": 0.5585553646087646, "learning_rate": 4.7302936562670566e-05, "loss": 0.2536, "step": 337500 }, { "epoch": 0.270105908686622, "grad_norm": 1.2555420398712158, "learning_rate": 4.7298940913133784e-05, "loss": 0.2316, "step": 338000 }, { "epoch": 0.2705054736403004, "grad_norm": 0.5331929326057434, "learning_rate": 4.7294945263596995e-05, "loss": 0.2432, "step": 338500 }, { "epoch": 0.27090503859397885, "grad_norm": 0.962904691696167, "learning_rate": 4.7290949614060213e-05, "loss": 0.2541, "step": 339000 }, { "epoch": 0.2713046035476573, "grad_norm": 0.7864426374435425, "learning_rate": 4.728695396452343e-05, "loss": 0.254, "step": 339500 }, { "epoch": 0.2717041685013357, "grad_norm": 1.6974186897277832, "learning_rate": 4.728295831498665e-05, "loss": 0.2441, "step": 340000 }, { "epoch": 0.2721037334550142, "grad_norm": 1.483149766921997, "learning_rate": 4.727896266544986e-05, "loss": 0.2389, "step": 340500 }, { "epoch": 0.27250329840869264, "grad_norm": 0.4335728585720062, "learning_rate": 4.727496701591307e-05, "loss": 0.2402, "step": 341000 }, { "epoch": 0.2729028633623711, "grad_norm": 0.944489598274231, "learning_rate": 4.727097136637629e-05, "loss": 0.2284, "step": 341500 }, { "epoch": 0.2733024283160495, "grad_norm": 0.4954468905925751, "learning_rate": 4.726697571683951e-05, "loss": 0.2518, "step": 342000 }, { "epoch": 0.27370199326972794, "grad_norm": 1.0819753408432007, "learning_rate": 4.726298006730273e-05, "loss": 0.2244, "step": 342500 }, { "epoch": 0.27410155822340637, "grad_norm": 0.3472881615161896, "learning_rate": 4.725898441776594e-05, "loss": 0.2619, "step": 343000 }, { "epoch": 0.2745011231770848, "grad_norm": 0.9260784983634949, "learning_rate": 4.725498876822915e-05, "loss": 0.2393, "step": 343500 }, { "epoch": 0.27490068813076324, "grad_norm": 0.8323214650154114, "learning_rate": 4.725099311869237e-05, "loss": 0.2359, "step": 344000 }, { "epoch": 0.27530025308444167, "grad_norm": 0.49934402108192444, "learning_rate": 4.7246997469155587e-05, "loss": 0.2356, "step": 344500 }, { "epoch": 0.2756998180381201, "grad_norm": 0.704128086566925, "learning_rate": 4.7243001819618805e-05, "loss": 0.2447, "step": 345000 }, { "epoch": 0.27609938299179854, "grad_norm": 0.5133384466171265, "learning_rate": 4.7239006170082016e-05, "loss": 0.2456, "step": 345500 }, { "epoch": 0.27649894794547697, "grad_norm": 0.7595999836921692, "learning_rate": 4.723501052054523e-05, "loss": 0.2508, "step": 346000 }, { "epoch": 0.2768985128991554, "grad_norm": 0.4369831681251526, "learning_rate": 4.723101487100845e-05, "loss": 0.2283, "step": 346500 }, { "epoch": 0.27729807785283384, "grad_norm": 0.4272473454475403, "learning_rate": 4.7227019221471664e-05, "loss": 0.2476, "step": 347000 }, { "epoch": 0.27769764280651227, "grad_norm": 2.5545008182525635, "learning_rate": 4.7223023571934875e-05, "loss": 0.2464, "step": 347500 }, { "epoch": 0.2780972077601907, "grad_norm": 0.7786296606063843, "learning_rate": 4.7219027922398094e-05, "loss": 0.2198, "step": 348000 }, { "epoch": 0.27849677271386913, "grad_norm": 0.44699811935424805, "learning_rate": 4.721503227286131e-05, "loss": 0.2322, "step": 348500 }, { "epoch": 0.27889633766754757, "grad_norm": 0.5935172438621521, "learning_rate": 4.721103662332453e-05, "loss": 0.247, "step": 349000 }, { "epoch": 0.279295902621226, "grad_norm": 0.3587712049484253, "learning_rate": 4.720704097378774e-05, "loss": 0.2352, "step": 349500 }, { "epoch": 0.27969546757490443, "grad_norm": 1.1256067752838135, "learning_rate": 4.720304532425095e-05, "loss": 0.2634, "step": 350000 }, { "epoch": 0.28009503252858287, "grad_norm": 0.49200043082237244, "learning_rate": 4.719904967471417e-05, "loss": 0.2421, "step": 350500 }, { "epoch": 0.2804945974822613, "grad_norm": 0.39014652371406555, "learning_rate": 4.719505402517739e-05, "loss": 0.2379, "step": 351000 }, { "epoch": 0.28089416243593973, "grad_norm": 0.5332036018371582, "learning_rate": 4.719105837564061e-05, "loss": 0.2368, "step": 351500 }, { "epoch": 0.28129372738961816, "grad_norm": 0.5696955919265747, "learning_rate": 4.718706272610382e-05, "loss": 0.2561, "step": 352000 }, { "epoch": 0.2816932923432966, "grad_norm": 1.048926830291748, "learning_rate": 4.718306707656703e-05, "loss": 0.2293, "step": 352500 }, { "epoch": 0.28209285729697503, "grad_norm": 0.6337674260139465, "learning_rate": 4.7179071427030255e-05, "loss": 0.2572, "step": 353000 }, { "epoch": 0.28249242225065346, "grad_norm": 0.6514806151390076, "learning_rate": 4.717507577749347e-05, "loss": 0.2337, "step": 353500 }, { "epoch": 0.2828919872043319, "grad_norm": 0.4797157645225525, "learning_rate": 4.7171080127956685e-05, "loss": 0.2242, "step": 354000 }, { "epoch": 0.2832915521580104, "grad_norm": 0.4569477140903473, "learning_rate": 4.7167084478419896e-05, "loss": 0.2426, "step": 354500 }, { "epoch": 0.2836911171116888, "grad_norm": 0.43970486521720886, "learning_rate": 4.7163088828883115e-05, "loss": 0.2549, "step": 355000 }, { "epoch": 0.28409068206536725, "grad_norm": 0.9948223829269409, "learning_rate": 4.715909317934633e-05, "loss": 0.2297, "step": 355500 }, { "epoch": 0.2844902470190457, "grad_norm": 2.657339334487915, "learning_rate": 4.7155097529809544e-05, "loss": 0.2294, "step": 356000 }, { "epoch": 0.2848898119727241, "grad_norm": 0.5157157778739929, "learning_rate": 4.715110188027276e-05, "loss": 0.2584, "step": 356500 }, { "epoch": 0.28528937692640255, "grad_norm": 0.4705747067928314, "learning_rate": 4.7147106230735974e-05, "loss": 0.254, "step": 357000 }, { "epoch": 0.285688941880081, "grad_norm": 1.1031297445297241, "learning_rate": 4.714311058119919e-05, "loss": 0.2449, "step": 357500 }, { "epoch": 0.2860885068337594, "grad_norm": 0.7282834053039551, "learning_rate": 4.713911493166241e-05, "loss": 0.2391, "step": 358000 }, { "epoch": 0.28648807178743785, "grad_norm": 0.6010321378707886, "learning_rate": 4.713511928212562e-05, "loss": 0.232, "step": 358500 }, { "epoch": 0.2868876367411163, "grad_norm": 1.3027188777923584, "learning_rate": 4.713112363258884e-05, "loss": 0.2439, "step": 359000 }, { "epoch": 0.2872872016947947, "grad_norm": 1.124481201171875, "learning_rate": 4.712712798305206e-05, "loss": 0.2634, "step": 359500 }, { "epoch": 0.28768676664847315, "grad_norm": 0.5106198191642761, "learning_rate": 4.712313233351527e-05, "loss": 0.2415, "step": 360000 }, { "epoch": 0.2880863316021516, "grad_norm": 0.3621806502342224, "learning_rate": 4.711913668397849e-05, "loss": 0.2317, "step": 360500 }, { "epoch": 0.28848589655583, "grad_norm": 2.1377620697021484, "learning_rate": 4.71151410344417e-05, "loss": 0.2316, "step": 361000 }, { "epoch": 0.28888546150950845, "grad_norm": 0.5825107097625732, "learning_rate": 4.711114538490492e-05, "loss": 0.2367, "step": 361500 }, { "epoch": 0.2892850264631869, "grad_norm": 0.7580356597900391, "learning_rate": 4.7107149735368135e-05, "loss": 0.2402, "step": 362000 }, { "epoch": 0.2896845914168653, "grad_norm": 1.4910916090011597, "learning_rate": 4.710315408583135e-05, "loss": 0.2405, "step": 362500 }, { "epoch": 0.29008415637054374, "grad_norm": 0.36631855368614197, "learning_rate": 4.7099158436294565e-05, "loss": 0.2191, "step": 363000 }, { "epoch": 0.2904837213242222, "grad_norm": 3.7142093181610107, "learning_rate": 4.7095162786757776e-05, "loss": 0.2329, "step": 363500 }, { "epoch": 0.2908832862779006, "grad_norm": 0.6818217635154724, "learning_rate": 4.7091167137221e-05, "loss": 0.2463, "step": 364000 }, { "epoch": 0.29128285123157904, "grad_norm": 2.020338535308838, "learning_rate": 4.708717148768421e-05, "loss": 0.2428, "step": 364500 }, { "epoch": 0.2916824161852575, "grad_norm": 0.1955258548259735, "learning_rate": 4.7083175838147424e-05, "loss": 0.2402, "step": 365000 }, { "epoch": 0.2920819811389359, "grad_norm": 1.105519413948059, "learning_rate": 4.707918018861064e-05, "loss": 0.2325, "step": 365500 }, { "epoch": 0.29248154609261434, "grad_norm": 3.932417631149292, "learning_rate": 4.707518453907386e-05, "loss": 0.2359, "step": 366000 }, { "epoch": 0.2928811110462928, "grad_norm": 2.2736473083496094, "learning_rate": 4.707118888953708e-05, "loss": 0.2393, "step": 366500 }, { "epoch": 0.2932806759999712, "grad_norm": 0.5137719511985779, "learning_rate": 4.706719324000029e-05, "loss": 0.2543, "step": 367000 }, { "epoch": 0.29368024095364964, "grad_norm": 1.2123284339904785, "learning_rate": 4.70631975904635e-05, "loss": 0.238, "step": 367500 }, { "epoch": 0.2940798059073281, "grad_norm": 0.9010424613952637, "learning_rate": 4.705920194092672e-05, "loss": 0.2619, "step": 368000 }, { "epoch": 0.29447937086100656, "grad_norm": 1.4214415550231934, "learning_rate": 4.705520629138994e-05, "loss": 0.2392, "step": 368500 }, { "epoch": 0.294878935814685, "grad_norm": 1.1055524349212646, "learning_rate": 4.7051210641853156e-05, "loss": 0.2218, "step": 369000 }, { "epoch": 0.2952785007683634, "grad_norm": 0.6146992444992065, "learning_rate": 4.704721499231637e-05, "loss": 0.2486, "step": 369500 }, { "epoch": 0.29567806572204186, "grad_norm": 0.6233959197998047, "learning_rate": 4.704321934277958e-05, "loss": 0.2503, "step": 370000 }, { "epoch": 0.2960776306757203, "grad_norm": 0.5921819806098938, "learning_rate": 4.7039223693242804e-05, "loss": 0.2571, "step": 370500 }, { "epoch": 0.2964771956293987, "grad_norm": 0.7795774340629578, "learning_rate": 4.7035228043706016e-05, "loss": 0.2293, "step": 371000 }, { "epoch": 0.29687676058307716, "grad_norm": 1.003207802772522, "learning_rate": 4.7031232394169234e-05, "loss": 0.2223, "step": 371500 }, { "epoch": 0.2972763255367556, "grad_norm": 1.0617390871047974, "learning_rate": 4.7027236744632445e-05, "loss": 0.241, "step": 372000 }, { "epoch": 0.297675890490434, "grad_norm": 1.0345491170883179, "learning_rate": 4.7023241095095663e-05, "loss": 0.2293, "step": 372500 }, { "epoch": 0.29807545544411246, "grad_norm": 2.078335762023926, "learning_rate": 4.701924544555888e-05, "loss": 0.2163, "step": 373000 }, { "epoch": 0.2984750203977909, "grad_norm": 0.33909115195274353, "learning_rate": 4.701524979602209e-05, "loss": 0.2331, "step": 373500 }, { "epoch": 0.2988745853514693, "grad_norm": 0.5281133055686951, "learning_rate": 4.701125414648531e-05, "loss": 0.231, "step": 374000 }, { "epoch": 0.29927415030514776, "grad_norm": 0.4113112688064575, "learning_rate": 4.700725849694852e-05, "loss": 0.2395, "step": 374500 }, { "epoch": 0.2996737152588262, "grad_norm": 0.7578318119049072, "learning_rate": 4.700326284741174e-05, "loss": 0.2665, "step": 375000 }, { "epoch": 0.3000732802125046, "grad_norm": 0.8992453217506409, "learning_rate": 4.699926719787496e-05, "loss": 0.2586, "step": 375500 }, { "epoch": 0.30047284516618306, "grad_norm": 0.9213141202926636, "learning_rate": 4.699527154833817e-05, "loss": 0.247, "step": 376000 }, { "epoch": 0.3008724101198615, "grad_norm": 0.8966816663742065, "learning_rate": 4.699127589880138e-05, "loss": 0.2486, "step": 376500 }, { "epoch": 0.3012719750735399, "grad_norm": 0.6569235920906067, "learning_rate": 4.698728024926461e-05, "loss": 0.2495, "step": 377000 }, { "epoch": 0.30167154002721835, "grad_norm": 1.4492273330688477, "learning_rate": 4.698328459972782e-05, "loss": 0.2388, "step": 377500 }, { "epoch": 0.3020711049808968, "grad_norm": 0.5447165369987488, "learning_rate": 4.6979288950191036e-05, "loss": 0.261, "step": 378000 }, { "epoch": 0.3024706699345752, "grad_norm": 0.4405788779258728, "learning_rate": 4.697529330065425e-05, "loss": 0.2457, "step": 378500 }, { "epoch": 0.30287023488825365, "grad_norm": 0.22495315968990326, "learning_rate": 4.6971297651117466e-05, "loss": 0.2519, "step": 379000 }, { "epoch": 0.3032697998419321, "grad_norm": 0.5395470857620239, "learning_rate": 4.6967302001580684e-05, "loss": 0.2529, "step": 379500 }, { "epoch": 0.3036693647956105, "grad_norm": 1.78800368309021, "learning_rate": 4.6963306352043896e-05, "loss": 0.2588, "step": 380000 }, { "epoch": 0.30406892974928895, "grad_norm": 0.8115984201431274, "learning_rate": 4.6959310702507114e-05, "loss": 0.2432, "step": 380500 }, { "epoch": 0.3044684947029674, "grad_norm": 0.400433212518692, "learning_rate": 4.6955315052970325e-05, "loss": 0.2336, "step": 381000 }, { "epoch": 0.3048680596566458, "grad_norm": 0.5473524332046509, "learning_rate": 4.6951319403433544e-05, "loss": 0.2562, "step": 381500 }, { "epoch": 0.3052676246103243, "grad_norm": 0.3608904182910919, "learning_rate": 4.694732375389676e-05, "loss": 0.2338, "step": 382000 }, { "epoch": 0.30566718956400274, "grad_norm": 0.8079068064689636, "learning_rate": 4.694332810435997e-05, "loss": 0.2526, "step": 382500 }, { "epoch": 0.30606675451768117, "grad_norm": 0.6253378987312317, "learning_rate": 4.693933245482319e-05, "loss": 0.2332, "step": 383000 }, { "epoch": 0.3064663194713596, "grad_norm": 1.5998022556304932, "learning_rate": 4.693533680528641e-05, "loss": 0.2435, "step": 383500 }, { "epoch": 0.30686588442503804, "grad_norm": 1.1558122634887695, "learning_rate": 4.693134115574962e-05, "loss": 0.2299, "step": 384000 }, { "epoch": 0.30726544937871647, "grad_norm": 1.21847403049469, "learning_rate": 4.692734550621284e-05, "loss": 0.2309, "step": 384500 }, { "epoch": 0.3076650143323949, "grad_norm": 0.6057390570640564, "learning_rate": 4.692334985667605e-05, "loss": 0.2339, "step": 385000 }, { "epoch": 0.30806457928607334, "grad_norm": 0.8020809292793274, "learning_rate": 4.691935420713927e-05, "loss": 0.2493, "step": 385500 }, { "epoch": 0.30846414423975177, "grad_norm": 1.6492342948913574, "learning_rate": 4.691535855760249e-05, "loss": 0.2569, "step": 386000 }, { "epoch": 0.3088637091934302, "grad_norm": 0.5882508754730225, "learning_rate": 4.69113629080657e-05, "loss": 0.2379, "step": 386500 }, { "epoch": 0.30926327414710864, "grad_norm": 0.7844045758247375, "learning_rate": 4.690736725852892e-05, "loss": 0.229, "step": 387000 }, { "epoch": 0.30966283910078707, "grad_norm": 0.5903177261352539, "learning_rate": 4.690337160899213e-05, "loss": 0.2272, "step": 387500 }, { "epoch": 0.3100624040544655, "grad_norm": 0.42931726574897766, "learning_rate": 4.6899375959455346e-05, "loss": 0.2645, "step": 388000 }, { "epoch": 0.31046196900814393, "grad_norm": 1.506012201309204, "learning_rate": 4.6895380309918564e-05, "loss": 0.2699, "step": 388500 }, { "epoch": 0.31086153396182237, "grad_norm": 0.25253015756607056, "learning_rate": 4.6891384660381776e-05, "loss": 0.2356, "step": 389000 }, { "epoch": 0.3112610989155008, "grad_norm": 0.7282262444496155, "learning_rate": 4.6887389010844994e-05, "loss": 0.2333, "step": 389500 }, { "epoch": 0.31166066386917923, "grad_norm": 0.8557161688804626, "learning_rate": 4.6883393361308206e-05, "loss": 0.2445, "step": 390000 }, { "epoch": 0.31206022882285767, "grad_norm": 0.6370416879653931, "learning_rate": 4.687939771177143e-05, "loss": 0.2417, "step": 390500 }, { "epoch": 0.3124597937765361, "grad_norm": 0.30028048157691956, "learning_rate": 4.687540206223464e-05, "loss": 0.2289, "step": 391000 }, { "epoch": 0.31285935873021453, "grad_norm": 0.40975576639175415, "learning_rate": 4.687140641269785e-05, "loss": 0.2506, "step": 391500 }, { "epoch": 0.31325892368389296, "grad_norm": 0.5871539115905762, "learning_rate": 4.686741076316107e-05, "loss": 0.227, "step": 392000 }, { "epoch": 0.3136584886375714, "grad_norm": 1.3906728029251099, "learning_rate": 4.686341511362429e-05, "loss": 0.2581, "step": 392500 }, { "epoch": 0.31405805359124983, "grad_norm": 0.4902611970901489, "learning_rate": 4.685941946408751e-05, "loss": 0.246, "step": 393000 }, { "epoch": 0.31445761854492826, "grad_norm": 0.7250665426254272, "learning_rate": 4.685542381455072e-05, "loss": 0.225, "step": 393500 }, { "epoch": 0.3148571834986067, "grad_norm": 0.5084488987922668, "learning_rate": 4.685142816501393e-05, "loss": 0.2534, "step": 394000 }, { "epoch": 0.31525674845228513, "grad_norm": 1.5144284963607788, "learning_rate": 4.684743251547715e-05, "loss": 0.2356, "step": 394500 }, { "epoch": 0.31565631340596356, "grad_norm": 1.350603699684143, "learning_rate": 4.684343686594037e-05, "loss": 0.2537, "step": 395000 }, { "epoch": 0.316055878359642, "grad_norm": 0.321712464094162, "learning_rate": 4.6839441216403585e-05, "loss": 0.2539, "step": 395500 }, { "epoch": 0.3164554433133205, "grad_norm": 0.4363856017589569, "learning_rate": 4.68354455668668e-05, "loss": 0.2302, "step": 396000 }, { "epoch": 0.3168550082669989, "grad_norm": 1.4025335311889648, "learning_rate": 4.683144991733001e-05, "loss": 0.2547, "step": 396500 }, { "epoch": 0.31725457322067735, "grad_norm": 0.5930554270744324, "learning_rate": 4.682745426779323e-05, "loss": 0.253, "step": 397000 }, { "epoch": 0.3176541381743558, "grad_norm": 0.4926980435848236, "learning_rate": 4.6823458618256445e-05, "loss": 0.2474, "step": 397500 }, { "epoch": 0.3180537031280342, "grad_norm": 1.324951410293579, "learning_rate": 4.681946296871966e-05, "loss": 0.2503, "step": 398000 }, { "epoch": 0.31845326808171265, "grad_norm": 0.5602216720581055, "learning_rate": 4.6815467319182874e-05, "loss": 0.2604, "step": 398500 }, { "epoch": 0.3188528330353911, "grad_norm": 0.9177944660186768, "learning_rate": 4.681147166964609e-05, "loss": 0.2381, "step": 399000 }, { "epoch": 0.3192523979890695, "grad_norm": 0.7114989161491394, "learning_rate": 4.680747602010931e-05, "loss": 0.2558, "step": 399500 }, { "epoch": 0.31965196294274795, "grad_norm": 0.5589873790740967, "learning_rate": 4.680348037057252e-05, "loss": 0.2407, "step": 400000 }, { "epoch": 0.3200515278964264, "grad_norm": 1.7849122285842896, "learning_rate": 4.679948472103574e-05, "loss": 0.2652, "step": 400500 }, { "epoch": 0.3204510928501048, "grad_norm": 0.569049060344696, "learning_rate": 4.679548907149895e-05, "loss": 0.2518, "step": 401000 }, { "epoch": 0.32085065780378325, "grad_norm": 1.380000352859497, "learning_rate": 4.679149342196217e-05, "loss": 0.2571, "step": 401500 }, { "epoch": 0.3212502227574617, "grad_norm": 0.5535905361175537, "learning_rate": 4.678749777242539e-05, "loss": 0.2473, "step": 402000 }, { "epoch": 0.3216497877111401, "grad_norm": 0.182023286819458, "learning_rate": 4.67835021228886e-05, "loss": 0.2396, "step": 402500 }, { "epoch": 0.32204935266481854, "grad_norm": 0.5003525614738464, "learning_rate": 4.677950647335181e-05, "loss": 0.2452, "step": 403000 }, { "epoch": 0.322448917618497, "grad_norm": 2.4984819889068604, "learning_rate": 4.6775510823815036e-05, "loss": 0.2431, "step": 403500 }, { "epoch": 0.3228484825721754, "grad_norm": 3.8632712364196777, "learning_rate": 4.677151517427825e-05, "loss": 0.2474, "step": 404000 }, { "epoch": 0.32324804752585384, "grad_norm": 0.5361611247062683, "learning_rate": 4.6767519524741466e-05, "loss": 0.2372, "step": 404500 }, { "epoch": 0.3236476124795323, "grad_norm": 0.42485693097114563, "learning_rate": 4.676352387520468e-05, "loss": 0.2495, "step": 405000 }, { "epoch": 0.3240471774332107, "grad_norm": 0.462603360414505, "learning_rate": 4.6759528225667895e-05, "loss": 0.2617, "step": 405500 }, { "epoch": 0.32444674238688914, "grad_norm": 0.38206884264945984, "learning_rate": 4.675553257613111e-05, "loss": 0.2467, "step": 406000 }, { "epoch": 0.3248463073405676, "grad_norm": 0.23172567784786224, "learning_rate": 4.6751536926594325e-05, "loss": 0.2324, "step": 406500 }, { "epoch": 0.325245872294246, "grad_norm": 0.525528073310852, "learning_rate": 4.674754127705754e-05, "loss": 0.2425, "step": 407000 }, { "epoch": 0.32564543724792444, "grad_norm": 0.3746863603591919, "learning_rate": 4.6743545627520754e-05, "loss": 0.2324, "step": 407500 }, { "epoch": 0.3260450022016029, "grad_norm": 0.47505825757980347, "learning_rate": 4.673954997798397e-05, "loss": 0.2439, "step": 408000 }, { "epoch": 0.3264445671552813, "grad_norm": 0.5624730587005615, "learning_rate": 4.673555432844719e-05, "loss": 0.2515, "step": 408500 }, { "epoch": 0.32684413210895974, "grad_norm": 0.9638099670410156, "learning_rate": 4.67315586789104e-05, "loss": 0.2252, "step": 409000 }, { "epoch": 0.3272436970626382, "grad_norm": 0.7209935188293457, "learning_rate": 4.672756302937362e-05, "loss": 0.2245, "step": 409500 }, { "epoch": 0.32764326201631666, "grad_norm": 0.6693527102470398, "learning_rate": 4.672356737983684e-05, "loss": 0.2569, "step": 410000 }, { "epoch": 0.3280428269699951, "grad_norm": 0.4414277672767639, "learning_rate": 4.671957173030005e-05, "loss": 0.2274, "step": 410500 }, { "epoch": 0.3284423919236735, "grad_norm": 2.2455790042877197, "learning_rate": 4.671557608076327e-05, "loss": 0.2458, "step": 411000 }, { "epoch": 0.32884195687735196, "grad_norm": 0.6276607513427734, "learning_rate": 4.671158043122648e-05, "loss": 0.251, "step": 411500 }, { "epoch": 0.3292415218310304, "grad_norm": 0.5009578466415405, "learning_rate": 4.67075847816897e-05, "loss": 0.2477, "step": 412000 }, { "epoch": 0.3296410867847088, "grad_norm": 0.545983076095581, "learning_rate": 4.6703589132152916e-05, "loss": 0.2378, "step": 412500 }, { "epoch": 0.33004065173838726, "grad_norm": 0.4713369309902191, "learning_rate": 4.669959348261613e-05, "loss": 0.2619, "step": 413000 }, { "epoch": 0.3304402166920657, "grad_norm": 0.5319584012031555, "learning_rate": 4.6695597833079346e-05, "loss": 0.2379, "step": 413500 }, { "epoch": 0.3308397816457441, "grad_norm": 0.4638768434524536, "learning_rate": 4.669160218354256e-05, "loss": 0.2306, "step": 414000 }, { "epoch": 0.33123934659942256, "grad_norm": 0.20029762387275696, "learning_rate": 4.668760653400578e-05, "loss": 0.2542, "step": 414500 }, { "epoch": 0.331638911553101, "grad_norm": 0.3679060637950897, "learning_rate": 4.6683610884468993e-05, "loss": 0.2334, "step": 415000 }, { "epoch": 0.3320384765067794, "grad_norm": 1.1810832023620605, "learning_rate": 4.6679615234932205e-05, "loss": 0.2012, "step": 415500 }, { "epoch": 0.33243804146045786, "grad_norm": 0.509239912033081, "learning_rate": 4.667561958539542e-05, "loss": 0.2488, "step": 416000 }, { "epoch": 0.3328376064141363, "grad_norm": 1.8765151500701904, "learning_rate": 4.667162393585864e-05, "loss": 0.2478, "step": 416500 }, { "epoch": 0.3332371713678147, "grad_norm": 0.5297932624816895, "learning_rate": 4.666762828632186e-05, "loss": 0.2406, "step": 417000 }, { "epoch": 0.33363673632149315, "grad_norm": 0.4993404150009155, "learning_rate": 4.666363263678507e-05, "loss": 0.224, "step": 417500 }, { "epoch": 0.3340363012751716, "grad_norm": 0.6090480089187622, "learning_rate": 4.665963698724828e-05, "loss": 0.2282, "step": 418000 }, { "epoch": 0.33443586622885, "grad_norm": 0.5013939142227173, "learning_rate": 4.66556413377115e-05, "loss": 0.2443, "step": 418500 }, { "epoch": 0.33483543118252845, "grad_norm": 0.3929177224636078, "learning_rate": 4.665164568817472e-05, "loss": 0.225, "step": 419000 }, { "epoch": 0.3352349961362069, "grad_norm": 0.1924070417881012, "learning_rate": 4.664765003863794e-05, "loss": 0.2165, "step": 419500 }, { "epoch": 0.3356345610898853, "grad_norm": 0.5382612347602844, "learning_rate": 4.664365438910115e-05, "loss": 0.2225, "step": 420000 }, { "epoch": 0.33603412604356375, "grad_norm": 0.8821693062782288, "learning_rate": 4.663965873956436e-05, "loss": 0.2416, "step": 420500 }, { "epoch": 0.3364336909972422, "grad_norm": 0.6296918988227844, "learning_rate": 4.6635663090027585e-05, "loss": 0.2327, "step": 421000 }, { "epoch": 0.3368332559509206, "grad_norm": 0.36132949590682983, "learning_rate": 4.6631667440490796e-05, "loss": 0.2571, "step": 421500 }, { "epoch": 0.33723282090459905, "grad_norm": 0.47887128591537476, "learning_rate": 4.6627671790954014e-05, "loss": 0.2332, "step": 422000 }, { "epoch": 0.3376323858582775, "grad_norm": 0.5773317217826843, "learning_rate": 4.6623676141417226e-05, "loss": 0.232, "step": 422500 }, { "epoch": 0.3380319508119559, "grad_norm": 0.43099361658096313, "learning_rate": 4.6619680491880444e-05, "loss": 0.2298, "step": 423000 }, { "epoch": 0.33843151576563435, "grad_norm": 0.9233924746513367, "learning_rate": 4.661568484234366e-05, "loss": 0.2386, "step": 423500 }, { "epoch": 0.33883108071931284, "grad_norm": 0.2863590717315674, "learning_rate": 4.6611689192806874e-05, "loss": 0.2198, "step": 424000 }, { "epoch": 0.33923064567299127, "grad_norm": 0.6491907238960266, "learning_rate": 4.660769354327009e-05, "loss": 0.2456, "step": 424500 }, { "epoch": 0.3396302106266697, "grad_norm": 0.6470714807510376, "learning_rate": 4.66036978937333e-05, "loss": 0.2439, "step": 425000 }, { "epoch": 0.34002977558034814, "grad_norm": 0.4654944837093353, "learning_rate": 4.659970224419652e-05, "loss": 0.225, "step": 425500 }, { "epoch": 0.34042934053402657, "grad_norm": 0.8514377474784851, "learning_rate": 4.659570659465974e-05, "loss": 0.2442, "step": 426000 }, { "epoch": 0.340828905487705, "grad_norm": 0.8393601775169373, "learning_rate": 4.659171094512295e-05, "loss": 0.238, "step": 426500 }, { "epoch": 0.34122847044138344, "grad_norm": 0.21255138516426086, "learning_rate": 4.658771529558617e-05, "loss": 0.2519, "step": 427000 }, { "epoch": 0.34162803539506187, "grad_norm": 0.3488863408565521, "learning_rate": 4.658371964604939e-05, "loss": 0.2475, "step": 427500 }, { "epoch": 0.3420276003487403, "grad_norm": 0.588313102722168, "learning_rate": 4.65797239965126e-05, "loss": 0.2427, "step": 428000 }, { "epoch": 0.34242716530241873, "grad_norm": 2.364053964614868, "learning_rate": 4.657572834697582e-05, "loss": 0.2183, "step": 428500 }, { "epoch": 0.34282673025609717, "grad_norm": 1.1189961433410645, "learning_rate": 4.657173269743903e-05, "loss": 0.2386, "step": 429000 }, { "epoch": 0.3432262952097756, "grad_norm": 2.429332971572876, "learning_rate": 4.656773704790225e-05, "loss": 0.2421, "step": 429500 }, { "epoch": 0.34362586016345403, "grad_norm": 0.6900713443756104, "learning_rate": 4.6563741398365465e-05, "loss": 0.2292, "step": 430000 }, { "epoch": 0.34402542511713247, "grad_norm": 0.7103545069694519, "learning_rate": 4.6559745748828676e-05, "loss": 0.2224, "step": 430500 }, { "epoch": 0.3444249900708109, "grad_norm": 0.33714160323143005, "learning_rate": 4.6555750099291895e-05, "loss": 0.2183, "step": 431000 }, { "epoch": 0.34482455502448933, "grad_norm": 0.2303358018398285, "learning_rate": 4.6551754449755106e-05, "loss": 0.2523, "step": 431500 }, { "epoch": 0.34522411997816776, "grad_norm": 1.6134319305419922, "learning_rate": 4.6547758800218324e-05, "loss": 0.2272, "step": 432000 }, { "epoch": 0.3456236849318462, "grad_norm": 1.1368356943130493, "learning_rate": 4.654376315068154e-05, "loss": 0.2353, "step": 432500 }, { "epoch": 0.34602324988552463, "grad_norm": 0.5264033079147339, "learning_rate": 4.6539767501144754e-05, "loss": 0.2262, "step": 433000 }, { "epoch": 0.34642281483920306, "grad_norm": 0.9753745794296265, "learning_rate": 4.653577185160797e-05, "loss": 0.219, "step": 433500 }, { "epoch": 0.3468223797928815, "grad_norm": 0.547334611415863, "learning_rate": 4.6531776202071183e-05, "loss": 0.2504, "step": 434000 }, { "epoch": 0.34722194474655993, "grad_norm": 0.17420987784862518, "learning_rate": 4.65277805525344e-05, "loss": 0.2313, "step": 434500 }, { "epoch": 0.34762150970023836, "grad_norm": 0.387300580739975, "learning_rate": 4.652378490299762e-05, "loss": 0.2291, "step": 435000 }, { "epoch": 0.3480210746539168, "grad_norm": 0.7707698345184326, "learning_rate": 4.651978925346083e-05, "loss": 0.2409, "step": 435500 }, { "epoch": 0.34842063960759523, "grad_norm": 0.8136277198791504, "learning_rate": 4.651579360392405e-05, "loss": 0.2341, "step": 436000 }, { "epoch": 0.34882020456127366, "grad_norm": 0.520192563533783, "learning_rate": 4.651179795438727e-05, "loss": 0.2358, "step": 436500 }, { "epoch": 0.3492197695149521, "grad_norm": 0.9424835443496704, "learning_rate": 4.650780230485048e-05, "loss": 0.2551, "step": 437000 }, { "epoch": 0.3496193344686306, "grad_norm": 0.61256343126297, "learning_rate": 4.65038066553137e-05, "loss": 0.2266, "step": 437500 }, { "epoch": 0.350018899422309, "grad_norm": 0.7525993585586548, "learning_rate": 4.649981100577691e-05, "loss": 0.2313, "step": 438000 }, { "epoch": 0.35041846437598745, "grad_norm": 0.8302854299545288, "learning_rate": 4.649581535624013e-05, "loss": 0.2287, "step": 438500 }, { "epoch": 0.3508180293296659, "grad_norm": 0.9021904468536377, "learning_rate": 4.6491819706703345e-05, "loss": 0.2195, "step": 439000 }, { "epoch": 0.3512175942833443, "grad_norm": 0.4967847764492035, "learning_rate": 4.6487824057166557e-05, "loss": 0.2346, "step": 439500 }, { "epoch": 0.35161715923702275, "grad_norm": 0.6737426519393921, "learning_rate": 4.6483828407629775e-05, "loss": 0.2409, "step": 440000 }, { "epoch": 0.3520167241907012, "grad_norm": 0.22830188274383545, "learning_rate": 4.6479832758092986e-05, "loss": 0.2405, "step": 440500 }, { "epoch": 0.3524162891443796, "grad_norm": 0.4019054174423218, "learning_rate": 4.647583710855621e-05, "loss": 0.2607, "step": 441000 }, { "epoch": 0.35281585409805805, "grad_norm": 0.762373149394989, "learning_rate": 4.647184145901942e-05, "loss": 0.2384, "step": 441500 }, { "epoch": 0.3532154190517365, "grad_norm": 0.6883245706558228, "learning_rate": 4.6467845809482634e-05, "loss": 0.2384, "step": 442000 }, { "epoch": 0.3536149840054149, "grad_norm": 1.127780795097351, "learning_rate": 4.646385015994585e-05, "loss": 0.2419, "step": 442500 }, { "epoch": 0.35401454895909334, "grad_norm": 0.7368519902229309, "learning_rate": 4.645985451040907e-05, "loss": 0.239, "step": 443000 }, { "epoch": 0.3544141139127718, "grad_norm": 1.843346357345581, "learning_rate": 4.645585886087229e-05, "loss": 0.2389, "step": 443500 }, { "epoch": 0.3548136788664502, "grad_norm": 0.7055549025535583, "learning_rate": 4.64518632113355e-05, "loss": 0.233, "step": 444000 }, { "epoch": 0.35521324382012864, "grad_norm": 1.9055941104888916, "learning_rate": 4.644786756179871e-05, "loss": 0.234, "step": 444500 }, { "epoch": 0.3556128087738071, "grad_norm": 0.5253866910934448, "learning_rate": 4.644387191226193e-05, "loss": 0.2257, "step": 445000 }, { "epoch": 0.3560123737274855, "grad_norm": 0.8299115896224976, "learning_rate": 4.643987626272515e-05, "loss": 0.24, "step": 445500 }, { "epoch": 0.35641193868116394, "grad_norm": 0.5566179752349854, "learning_rate": 4.6435880613188366e-05, "loss": 0.2344, "step": 446000 }, { "epoch": 0.3568115036348424, "grad_norm": 0.2716953754425049, "learning_rate": 4.643188496365158e-05, "loss": 0.2428, "step": 446500 }, { "epoch": 0.3572110685885208, "grad_norm": 0.5719165205955505, "learning_rate": 4.642788931411479e-05, "loss": 0.2428, "step": 447000 }, { "epoch": 0.35761063354219924, "grad_norm": 0.5542140603065491, "learning_rate": 4.6423893664578014e-05, "loss": 0.2612, "step": 447500 }, { "epoch": 0.3580101984958777, "grad_norm": 0.5974667072296143, "learning_rate": 4.6419898015041225e-05, "loss": 0.238, "step": 448000 }, { "epoch": 0.3584097634495561, "grad_norm": 1.1069387197494507, "learning_rate": 4.6415902365504443e-05, "loss": 0.2515, "step": 448500 }, { "epoch": 0.35880932840323454, "grad_norm": 0.5018848180770874, "learning_rate": 4.6411906715967655e-05, "loss": 0.2365, "step": 449000 }, { "epoch": 0.359208893356913, "grad_norm": 0.9789451956748962, "learning_rate": 4.640791106643087e-05, "loss": 0.2134, "step": 449500 }, { "epoch": 0.3596084583105914, "grad_norm": 0.7332815527915955, "learning_rate": 4.640391541689409e-05, "loss": 0.2453, "step": 450000 }, { "epoch": 0.36000802326426984, "grad_norm": 0.7665380239486694, "learning_rate": 4.63999197673573e-05, "loss": 0.2367, "step": 450500 }, { "epoch": 0.36040758821794827, "grad_norm": 0.9489160776138306, "learning_rate": 4.639592411782052e-05, "loss": 0.2433, "step": 451000 }, { "epoch": 0.36080715317162676, "grad_norm": 0.36715760827064514, "learning_rate": 4.639192846828373e-05, "loss": 0.2495, "step": 451500 }, { "epoch": 0.3612067181253052, "grad_norm": 0.6208722591400146, "learning_rate": 4.638793281874695e-05, "loss": 0.235, "step": 452000 }, { "epoch": 0.3616062830789836, "grad_norm": 0.6682359576225281, "learning_rate": 4.638393716921017e-05, "loss": 0.2503, "step": 452500 }, { "epoch": 0.36200584803266206, "grad_norm": 0.5204598307609558, "learning_rate": 4.637994151967338e-05, "loss": 0.2566, "step": 453000 }, { "epoch": 0.3624054129863405, "grad_norm": 0.26243385672569275, "learning_rate": 4.63759458701366e-05, "loss": 0.2353, "step": 453500 }, { "epoch": 0.3628049779400189, "grad_norm": 0.6248922944068909, "learning_rate": 4.6371950220599817e-05, "loss": 0.2227, "step": 454000 }, { "epoch": 0.36320454289369736, "grad_norm": 0.45966702699661255, "learning_rate": 4.636795457106303e-05, "loss": 0.2418, "step": 454500 }, { "epoch": 0.3636041078473758, "grad_norm": 1.7038697004318237, "learning_rate": 4.6363958921526246e-05, "loss": 0.219, "step": 455000 }, { "epoch": 0.3640036728010542, "grad_norm": 1.1341651678085327, "learning_rate": 4.635996327198946e-05, "loss": 0.2298, "step": 455500 }, { "epoch": 0.36440323775473266, "grad_norm": 0.8294051289558411, "learning_rate": 4.6355967622452676e-05, "loss": 0.2497, "step": 456000 }, { "epoch": 0.3648028027084111, "grad_norm": 1.5242713689804077, "learning_rate": 4.6351971972915894e-05, "loss": 0.2363, "step": 456500 }, { "epoch": 0.3652023676620895, "grad_norm": 1.0086596012115479, "learning_rate": 4.6347976323379105e-05, "loss": 0.2287, "step": 457000 }, { "epoch": 0.36560193261576795, "grad_norm": 0.8560370802879333, "learning_rate": 4.6343980673842324e-05, "loss": 0.2289, "step": 457500 }, { "epoch": 0.3660014975694464, "grad_norm": 0.7617954611778259, "learning_rate": 4.6339985024305535e-05, "loss": 0.2197, "step": 458000 }, { "epoch": 0.3664010625231248, "grad_norm": 0.4032207727432251, "learning_rate": 4.633598937476875e-05, "loss": 0.2203, "step": 458500 }, { "epoch": 0.36680062747680325, "grad_norm": 0.5553929209709167, "learning_rate": 4.633199372523197e-05, "loss": 0.2469, "step": 459000 }, { "epoch": 0.3672001924304817, "grad_norm": 1.0711969137191772, "learning_rate": 4.632799807569518e-05, "loss": 0.2445, "step": 459500 }, { "epoch": 0.3675997573841601, "grad_norm": 0.437844842672348, "learning_rate": 4.63240024261584e-05, "loss": 0.2363, "step": 460000 }, { "epoch": 0.36799932233783855, "grad_norm": 0.22941192984580994, "learning_rate": 4.632000677662162e-05, "loss": 0.2251, "step": 460500 }, { "epoch": 0.368398887291517, "grad_norm": 0.3255271017551422, "learning_rate": 4.631601112708483e-05, "loss": 0.2261, "step": 461000 }, { "epoch": 0.3687984522451954, "grad_norm": 0.6247419714927673, "learning_rate": 4.631201547754805e-05, "loss": 0.2405, "step": 461500 }, { "epoch": 0.36919801719887385, "grad_norm": 1.0791103839874268, "learning_rate": 4.630801982801126e-05, "loss": 0.2463, "step": 462000 }, { "epoch": 0.3695975821525523, "grad_norm": 2.063413143157959, "learning_rate": 4.630402417847448e-05, "loss": 0.2523, "step": 462500 }, { "epoch": 0.3699971471062307, "grad_norm": 0.24300184845924377, "learning_rate": 4.63000285289377e-05, "loss": 0.2378, "step": 463000 }, { "epoch": 0.37039671205990915, "grad_norm": 0.5117874145507812, "learning_rate": 4.629603287940091e-05, "loss": 0.2466, "step": 463500 }, { "epoch": 0.3707962770135876, "grad_norm": 0.7557910084724426, "learning_rate": 4.6292037229864126e-05, "loss": 0.2271, "step": 464000 }, { "epoch": 0.371195841967266, "grad_norm": 1.0723472833633423, "learning_rate": 4.628804158032734e-05, "loss": 0.2504, "step": 464500 }, { "epoch": 0.37159540692094445, "grad_norm": 0.4483608901500702, "learning_rate": 4.628404593079056e-05, "loss": 0.2444, "step": 465000 }, { "epoch": 0.37199497187462294, "grad_norm": 0.5686841011047363, "learning_rate": 4.6280050281253774e-05, "loss": 0.2327, "step": 465500 }, { "epoch": 0.37239453682830137, "grad_norm": 0.39996233582496643, "learning_rate": 4.6276054631716986e-05, "loss": 0.2568, "step": 466000 }, { "epoch": 0.3727941017819798, "grad_norm": 0.834459662437439, "learning_rate": 4.6272058982180204e-05, "loss": 0.2616, "step": 466500 }, { "epoch": 0.37319366673565824, "grad_norm": 0.7064298987388611, "learning_rate": 4.626806333264342e-05, "loss": 0.2364, "step": 467000 }, { "epoch": 0.37359323168933667, "grad_norm": 1.713271141052246, "learning_rate": 4.626406768310664e-05, "loss": 0.2478, "step": 467500 }, { "epoch": 0.3739927966430151, "grad_norm": 2.086369037628174, "learning_rate": 4.626007203356985e-05, "loss": 0.2274, "step": 468000 }, { "epoch": 0.37439236159669353, "grad_norm": 0.7331411242485046, "learning_rate": 4.625607638403306e-05, "loss": 0.2445, "step": 468500 }, { "epoch": 0.37479192655037197, "grad_norm": 0.7236125469207764, "learning_rate": 4.625208073449628e-05, "loss": 0.2336, "step": 469000 }, { "epoch": 0.3751914915040504, "grad_norm": 1.515647053718567, "learning_rate": 4.62480850849595e-05, "loss": 0.2413, "step": 469500 }, { "epoch": 0.37559105645772883, "grad_norm": 0.3869030773639679, "learning_rate": 4.624408943542272e-05, "loss": 0.2197, "step": 470000 }, { "epoch": 0.37599062141140727, "grad_norm": 0.7694616317749023, "learning_rate": 4.624009378588593e-05, "loss": 0.2382, "step": 470500 }, { "epoch": 0.3763901863650857, "grad_norm": 0.5959903001785278, "learning_rate": 4.623609813634914e-05, "loss": 0.2398, "step": 471000 }, { "epoch": 0.37678975131876413, "grad_norm": 0.5988247990608215, "learning_rate": 4.623210248681236e-05, "loss": 0.2019, "step": 471500 }, { "epoch": 0.37718931627244257, "grad_norm": 0.6157426238059998, "learning_rate": 4.622810683727558e-05, "loss": 0.2278, "step": 472000 }, { "epoch": 0.377588881226121, "grad_norm": 1.5004254579544067, "learning_rate": 4.6224111187738795e-05, "loss": 0.2493, "step": 472500 }, { "epoch": 0.37798844617979943, "grad_norm": 0.4908967912197113, "learning_rate": 4.6220115538202006e-05, "loss": 0.2575, "step": 473000 }, { "epoch": 0.37838801113347786, "grad_norm": 0.4625032842159271, "learning_rate": 4.6216119888665225e-05, "loss": 0.2292, "step": 473500 }, { "epoch": 0.3787875760871563, "grad_norm": 2.7756295204162598, "learning_rate": 4.621212423912844e-05, "loss": 0.227, "step": 474000 }, { "epoch": 0.37918714104083473, "grad_norm": 0.40568920969963074, "learning_rate": 4.6208128589591654e-05, "loss": 0.2282, "step": 474500 }, { "epoch": 0.37958670599451316, "grad_norm": 0.5263286232948303, "learning_rate": 4.620413294005487e-05, "loss": 0.2472, "step": 475000 }, { "epoch": 0.3799862709481916, "grad_norm": 1.8834538459777832, "learning_rate": 4.6200137290518084e-05, "loss": 0.2304, "step": 475500 }, { "epoch": 0.38038583590187003, "grad_norm": 0.3029090166091919, "learning_rate": 4.61961416409813e-05, "loss": 0.2353, "step": 476000 }, { "epoch": 0.38078540085554846, "grad_norm": 1.0838899612426758, "learning_rate": 4.619214599144452e-05, "loss": 0.2267, "step": 476500 }, { "epoch": 0.3811849658092269, "grad_norm": 0.4482724666595459, "learning_rate": 4.618815034190773e-05, "loss": 0.2433, "step": 477000 }, { "epoch": 0.3815845307629053, "grad_norm": 0.5084109306335449, "learning_rate": 4.618415469237095e-05, "loss": 0.2332, "step": 477500 }, { "epoch": 0.38198409571658376, "grad_norm": 1.6278411149978638, "learning_rate": 4.618015904283416e-05, "loss": 0.2169, "step": 478000 }, { "epoch": 0.3823836606702622, "grad_norm": 1.0752583742141724, "learning_rate": 4.617616339329738e-05, "loss": 0.2505, "step": 478500 }, { "epoch": 0.3827832256239406, "grad_norm": 0.3886798322200775, "learning_rate": 4.61721677437606e-05, "loss": 0.2216, "step": 479000 }, { "epoch": 0.3831827905776191, "grad_norm": 0.406037837266922, "learning_rate": 4.616817209422381e-05, "loss": 0.2325, "step": 479500 }, { "epoch": 0.38358235553129755, "grad_norm": 0.7889995574951172, "learning_rate": 4.616417644468703e-05, "loss": 0.233, "step": 480000 }, { "epoch": 0.383981920484976, "grad_norm": 0.7170509696006775, "learning_rate": 4.6160180795150246e-05, "loss": 0.2179, "step": 480500 }, { "epoch": 0.3843814854386544, "grad_norm": 0.5898926854133606, "learning_rate": 4.615618514561346e-05, "loss": 0.253, "step": 481000 }, { "epoch": 0.38478105039233285, "grad_norm": 0.19055742025375366, "learning_rate": 4.6152189496076675e-05, "loss": 0.2254, "step": 481500 }, { "epoch": 0.3851806153460113, "grad_norm": 0.5514305830001831, "learning_rate": 4.6148193846539887e-05, "loss": 0.2398, "step": 482000 }, { "epoch": 0.3855801802996897, "grad_norm": 0.9585630893707275, "learning_rate": 4.6144198197003105e-05, "loss": 0.2401, "step": 482500 }, { "epoch": 0.38597974525336814, "grad_norm": 0.5806618332862854, "learning_rate": 4.614020254746632e-05, "loss": 0.2412, "step": 483000 }, { "epoch": 0.3863793102070466, "grad_norm": 0.9147925972938538, "learning_rate": 4.6136206897929534e-05, "loss": 0.2252, "step": 483500 }, { "epoch": 0.386778875160725, "grad_norm": 1.1093685626983643, "learning_rate": 4.613221124839275e-05, "loss": 0.1946, "step": 484000 }, { "epoch": 0.38717844011440344, "grad_norm": 0.8708345293998718, "learning_rate": 4.6128215598855964e-05, "loss": 0.2264, "step": 484500 }, { "epoch": 0.3875780050680819, "grad_norm": 1.664433240890503, "learning_rate": 4.612421994931919e-05, "loss": 0.2303, "step": 485000 }, { "epoch": 0.3879775700217603, "grad_norm": 0.38671064376831055, "learning_rate": 4.61202242997824e-05, "loss": 0.2273, "step": 485500 }, { "epoch": 0.38837713497543874, "grad_norm": 0.8904987573623657, "learning_rate": 4.611622865024561e-05, "loss": 0.2283, "step": 486000 }, { "epoch": 0.3887766999291172, "grad_norm": 0.6856479048728943, "learning_rate": 4.611223300070883e-05, "loss": 0.2631, "step": 486500 }, { "epoch": 0.3891762648827956, "grad_norm": 0.4397277235984802, "learning_rate": 4.610823735117205e-05, "loss": 0.2249, "step": 487000 }, { "epoch": 0.38957582983647404, "grad_norm": 1.263809323310852, "learning_rate": 4.610424170163526e-05, "loss": 0.2375, "step": 487500 }, { "epoch": 0.3899753947901525, "grad_norm": 0.6051146984100342, "learning_rate": 4.610024605209848e-05, "loss": 0.2343, "step": 488000 }, { "epoch": 0.3903749597438309, "grad_norm": 0.5714989900588989, "learning_rate": 4.609625040256169e-05, "loss": 0.2308, "step": 488500 }, { "epoch": 0.39077452469750934, "grad_norm": 0.6635706424713135, "learning_rate": 4.609225475302491e-05, "loss": 0.2347, "step": 489000 }, { "epoch": 0.3911740896511878, "grad_norm": 1.5059332847595215, "learning_rate": 4.6088259103488126e-05, "loss": 0.2464, "step": 489500 }, { "epoch": 0.3915736546048662, "grad_norm": 1.5010408163070679, "learning_rate": 4.608426345395134e-05, "loss": 0.2517, "step": 490000 }, { "epoch": 0.39197321955854464, "grad_norm": 0.5432317852973938, "learning_rate": 4.6080267804414555e-05, "loss": 0.2312, "step": 490500 }, { "epoch": 0.39237278451222307, "grad_norm": 0.5084550976753235, "learning_rate": 4.607627215487777e-05, "loss": 0.2448, "step": 491000 }, { "epoch": 0.3927723494659015, "grad_norm": 0.1605142503976822, "learning_rate": 4.607227650534099e-05, "loss": 0.2408, "step": 491500 }, { "epoch": 0.39317191441957994, "grad_norm": 0.2998347580432892, "learning_rate": 4.60682808558042e-05, "loss": 0.2353, "step": 492000 }, { "epoch": 0.39357147937325837, "grad_norm": 1.1330851316452026, "learning_rate": 4.6064285206267415e-05, "loss": 0.2367, "step": 492500 }, { "epoch": 0.3939710443269368, "grad_norm": 0.749457597732544, "learning_rate": 4.606028955673063e-05, "loss": 0.2306, "step": 493000 }, { "epoch": 0.3943706092806153, "grad_norm": 0.5827109217643738, "learning_rate": 4.605629390719385e-05, "loss": 0.2167, "step": 493500 }, { "epoch": 0.3947701742342937, "grad_norm": 0.551965594291687, "learning_rate": 4.605229825765707e-05, "loss": 0.2277, "step": 494000 }, { "epoch": 0.39516973918797216, "grad_norm": 0.5924856066703796, "learning_rate": 4.604830260812028e-05, "loss": 0.2443, "step": 494500 }, { "epoch": 0.3955693041416506, "grad_norm": 1.12799072265625, "learning_rate": 4.604430695858349e-05, "loss": 0.2354, "step": 495000 }, { "epoch": 0.395968869095329, "grad_norm": 0.36942610144615173, "learning_rate": 4.604031130904671e-05, "loss": 0.2425, "step": 495500 }, { "epoch": 0.39636843404900746, "grad_norm": 0.6150264143943787, "learning_rate": 4.603631565950993e-05, "loss": 0.211, "step": 496000 }, { "epoch": 0.3967679990026859, "grad_norm": 0.8114819526672363, "learning_rate": 4.6032320009973147e-05, "loss": 0.2215, "step": 496500 }, { "epoch": 0.3971675639563643, "grad_norm": 1.7528314590454102, "learning_rate": 4.602832436043636e-05, "loss": 0.2307, "step": 497000 }, { "epoch": 0.39756712891004276, "grad_norm": 0.7864994406700134, "learning_rate": 4.602432871089957e-05, "loss": 0.2487, "step": 497500 }, { "epoch": 0.3979666938637212, "grad_norm": 0.6288546919822693, "learning_rate": 4.6020333061362794e-05, "loss": 0.2431, "step": 498000 }, { "epoch": 0.3983662588173996, "grad_norm": 1.5764071941375732, "learning_rate": 4.6016337411826006e-05, "loss": 0.2159, "step": 498500 }, { "epoch": 0.39876582377107805, "grad_norm": 0.33475151658058167, "learning_rate": 4.6012341762289224e-05, "loss": 0.2439, "step": 499000 }, { "epoch": 0.3991653887247565, "grad_norm": 0.3440074026584625, "learning_rate": 4.6008346112752435e-05, "loss": 0.2535, "step": 499500 }, { "epoch": 0.3995649536784349, "grad_norm": 0.40122151374816895, "learning_rate": 4.6004350463215654e-05, "loss": 0.2432, "step": 500000 }, { "epoch": 0.39996451863211335, "grad_norm": 0.5369971990585327, "learning_rate": 4.600035481367887e-05, "loss": 0.2158, "step": 500500 }, { "epoch": 0.4003640835857918, "grad_norm": 1.0561907291412354, "learning_rate": 4.599635916414208e-05, "loss": 0.2293, "step": 501000 }, { "epoch": 0.4007636485394702, "grad_norm": 0.3957941234111786, "learning_rate": 4.59923635146053e-05, "loss": 0.2442, "step": 501500 }, { "epoch": 0.40116321349314865, "grad_norm": 1.9321448802947998, "learning_rate": 4.598836786506851e-05, "loss": 0.246, "step": 502000 }, { "epoch": 0.4015627784468271, "grad_norm": 0.5797551870346069, "learning_rate": 4.598437221553173e-05, "loss": 0.2502, "step": 502500 }, { "epoch": 0.4019623434005055, "grad_norm": 0.5636096596717834, "learning_rate": 4.598037656599495e-05, "loss": 0.2434, "step": 503000 }, { "epoch": 0.40236190835418395, "grad_norm": 0.5556966662406921, "learning_rate": 4.597638091645816e-05, "loss": 0.2402, "step": 503500 }, { "epoch": 0.4027614733078624, "grad_norm": 0.8923726081848145, "learning_rate": 4.597238526692138e-05, "loss": 0.2625, "step": 504000 }, { "epoch": 0.4031610382615408, "grad_norm": 1.8841279745101929, "learning_rate": 4.59683896173846e-05, "loss": 0.2495, "step": 504500 }, { "epoch": 0.40356060321521925, "grad_norm": 4.650374889373779, "learning_rate": 4.596439396784781e-05, "loss": 0.2225, "step": 505000 }, { "epoch": 0.4039601681688977, "grad_norm": 3.6097893714904785, "learning_rate": 4.596039831831103e-05, "loss": 0.2494, "step": 505500 }, { "epoch": 0.4043597331225761, "grad_norm": 0.5219786167144775, "learning_rate": 4.595640266877424e-05, "loss": 0.2327, "step": 506000 }, { "epoch": 0.40475929807625455, "grad_norm": 0.44395872950553894, "learning_rate": 4.5952407019237456e-05, "loss": 0.2363, "step": 506500 }, { "epoch": 0.40515886302993304, "grad_norm": 1.3974944353103638, "learning_rate": 4.5948411369700675e-05, "loss": 0.2226, "step": 507000 }, { "epoch": 0.40555842798361147, "grad_norm": 0.577722430229187, "learning_rate": 4.5944415720163886e-05, "loss": 0.2362, "step": 507500 }, { "epoch": 0.4059579929372899, "grad_norm": 2.060001850128174, "learning_rate": 4.5940420070627104e-05, "loss": 0.2407, "step": 508000 }, { "epoch": 0.40635755789096834, "grad_norm": 0.31474626064300537, "learning_rate": 4.5936424421090316e-05, "loss": 0.2184, "step": 508500 }, { "epoch": 0.40675712284464677, "grad_norm": 0.6936082243919373, "learning_rate": 4.593242877155354e-05, "loss": 0.2302, "step": 509000 }, { "epoch": 0.4071566877983252, "grad_norm": 0.5341528058052063, "learning_rate": 4.592843312201675e-05, "loss": 0.2195, "step": 509500 }, { "epoch": 0.40755625275200363, "grad_norm": 0.4948134124279022, "learning_rate": 4.5924437472479963e-05, "loss": 0.2138, "step": 510000 }, { "epoch": 0.40795581770568207, "grad_norm": 1.1566543579101562, "learning_rate": 4.592044182294318e-05, "loss": 0.2403, "step": 510500 }, { "epoch": 0.4083553826593605, "grad_norm": 0.7856607437133789, "learning_rate": 4.59164461734064e-05, "loss": 0.2412, "step": 511000 }, { "epoch": 0.40875494761303893, "grad_norm": 0.5754504799842834, "learning_rate": 4.591245052386962e-05, "loss": 0.2281, "step": 511500 }, { "epoch": 0.40915451256671737, "grad_norm": 1.9410213232040405, "learning_rate": 4.590845487433283e-05, "loss": 0.2375, "step": 512000 }, { "epoch": 0.4095540775203958, "grad_norm": 0.25898194313049316, "learning_rate": 4.590445922479604e-05, "loss": 0.2106, "step": 512500 }, { "epoch": 0.40995364247407423, "grad_norm": 0.8684314489364624, "learning_rate": 4.590046357525926e-05, "loss": 0.2334, "step": 513000 }, { "epoch": 0.41035320742775266, "grad_norm": 1.6322358846664429, "learning_rate": 4.589646792572248e-05, "loss": 0.2574, "step": 513500 }, { "epoch": 0.4107527723814311, "grad_norm": 0.7256922125816345, "learning_rate": 4.589247227618569e-05, "loss": 0.235, "step": 514000 }, { "epoch": 0.41115233733510953, "grad_norm": 1.1736308336257935, "learning_rate": 4.588847662664891e-05, "loss": 0.2325, "step": 514500 }, { "epoch": 0.41155190228878796, "grad_norm": 0.9569302201271057, "learning_rate": 4.588448097711212e-05, "loss": 0.2203, "step": 515000 }, { "epoch": 0.4119514672424664, "grad_norm": 0.6947535276412964, "learning_rate": 4.5880485327575337e-05, "loss": 0.2277, "step": 515500 }, { "epoch": 0.41235103219614483, "grad_norm": 0.8312774896621704, "learning_rate": 4.5876489678038555e-05, "loss": 0.2175, "step": 516000 }, { "epoch": 0.41275059714982326, "grad_norm": 0.39495155215263367, "learning_rate": 4.5872494028501766e-05, "loss": 0.2243, "step": 516500 }, { "epoch": 0.4131501621035017, "grad_norm": 0.7402554154396057, "learning_rate": 4.5868498378964984e-05, "loss": 0.2209, "step": 517000 }, { "epoch": 0.4135497270571801, "grad_norm": 0.4574830234050751, "learning_rate": 4.58645027294282e-05, "loss": 0.2344, "step": 517500 }, { "epoch": 0.41394929201085856, "grad_norm": 1.150445580482483, "learning_rate": 4.586050707989142e-05, "loss": 0.2287, "step": 518000 }, { "epoch": 0.414348856964537, "grad_norm": 0.8523080348968506, "learning_rate": 4.585651143035463e-05, "loss": 0.2281, "step": 518500 }, { "epoch": 0.4147484219182154, "grad_norm": 0.3674297332763672, "learning_rate": 4.5852515780817844e-05, "loss": 0.217, "step": 519000 }, { "epoch": 0.41514798687189386, "grad_norm": 0.5517123341560364, "learning_rate": 4.584852013128106e-05, "loss": 0.2476, "step": 519500 }, { "epoch": 0.4155475518255723, "grad_norm": 1.5202524662017822, "learning_rate": 4.584452448174428e-05, "loss": 0.2457, "step": 520000 }, { "epoch": 0.4159471167792507, "grad_norm": 1.6675400733947754, "learning_rate": 4.58405288322075e-05, "loss": 0.2323, "step": 520500 }, { "epoch": 0.4163466817329292, "grad_norm": 0.6552516222000122, "learning_rate": 4.583653318267071e-05, "loss": 0.2405, "step": 521000 }, { "epoch": 0.41674624668660765, "grad_norm": 0.46967923641204834, "learning_rate": 4.583253753313392e-05, "loss": 0.2509, "step": 521500 }, { "epoch": 0.4171458116402861, "grad_norm": 0.6922463178634644, "learning_rate": 4.582854188359714e-05, "loss": 0.246, "step": 522000 }, { "epoch": 0.4175453765939645, "grad_norm": 1.1616865396499634, "learning_rate": 4.582454623406036e-05, "loss": 0.2479, "step": 522500 }, { "epoch": 0.41794494154764295, "grad_norm": 0.5773580074310303, "learning_rate": 4.5820550584523576e-05, "loss": 0.23, "step": 523000 }, { "epoch": 0.4183445065013214, "grad_norm": 1.139544129371643, "learning_rate": 4.581655493498679e-05, "loss": 0.2343, "step": 523500 }, { "epoch": 0.4187440714549998, "grad_norm": 1.3153293132781982, "learning_rate": 4.581255928545e-05, "loss": 0.2436, "step": 524000 }, { "epoch": 0.41914363640867824, "grad_norm": 0.5622233152389526, "learning_rate": 4.5808563635913223e-05, "loss": 0.2382, "step": 524500 }, { "epoch": 0.4195432013623567, "grad_norm": 0.6327025890350342, "learning_rate": 4.5804567986376435e-05, "loss": 0.2475, "step": 525000 }, { "epoch": 0.4199427663160351, "grad_norm": 1.06494140625, "learning_rate": 4.580057233683965e-05, "loss": 0.2274, "step": 525500 }, { "epoch": 0.42034233126971354, "grad_norm": 0.8124126195907593, "learning_rate": 4.5796576687302864e-05, "loss": 0.2408, "step": 526000 }, { "epoch": 0.420741896223392, "grad_norm": 0.1933540254831314, "learning_rate": 4.579258103776608e-05, "loss": 0.2201, "step": 526500 }, { "epoch": 0.4211414611770704, "grad_norm": 0.6401020288467407, "learning_rate": 4.57885853882293e-05, "loss": 0.2507, "step": 527000 }, { "epoch": 0.42154102613074884, "grad_norm": 2.129037380218506, "learning_rate": 4.578458973869251e-05, "loss": 0.2298, "step": 527500 }, { "epoch": 0.4219405910844273, "grad_norm": 0.4831426441669464, "learning_rate": 4.578059408915573e-05, "loss": 0.2375, "step": 528000 }, { "epoch": 0.4223401560381057, "grad_norm": 3.080521583557129, "learning_rate": 4.577659843961894e-05, "loss": 0.2458, "step": 528500 }, { "epoch": 0.42273972099178414, "grad_norm": 0.21551933884620667, "learning_rate": 4.577260279008216e-05, "loss": 0.243, "step": 529000 }, { "epoch": 0.4231392859454626, "grad_norm": 0.3236337900161743, "learning_rate": 4.576860714054538e-05, "loss": 0.247, "step": 529500 }, { "epoch": 0.423538850899141, "grad_norm": 0.5686526894569397, "learning_rate": 4.576461149100859e-05, "loss": 0.2439, "step": 530000 }, { "epoch": 0.42393841585281944, "grad_norm": 0.7693064212799072, "learning_rate": 4.576061584147181e-05, "loss": 0.2297, "step": 530500 }, { "epoch": 0.42433798080649787, "grad_norm": 0.6892525553703308, "learning_rate": 4.5756620191935026e-05, "loss": 0.2271, "step": 531000 }, { "epoch": 0.4247375457601763, "grad_norm": 0.6730759143829346, "learning_rate": 4.575262454239824e-05, "loss": 0.2411, "step": 531500 }, { "epoch": 0.42513711071385474, "grad_norm": 0.23501816391944885, "learning_rate": 4.5748628892861456e-05, "loss": 0.2143, "step": 532000 }, { "epoch": 0.42553667566753317, "grad_norm": 0.3668568432331085, "learning_rate": 4.574463324332467e-05, "loss": 0.2429, "step": 532500 }, { "epoch": 0.4259362406212116, "grad_norm": 2.3846936225891113, "learning_rate": 4.5740637593787885e-05, "loss": 0.2291, "step": 533000 }, { "epoch": 0.42633580557489004, "grad_norm": 0.7873939871788025, "learning_rate": 4.5736641944251104e-05, "loss": 0.2376, "step": 533500 }, { "epoch": 0.42673537052856847, "grad_norm": 0.9348785281181335, "learning_rate": 4.5732646294714315e-05, "loss": 0.2285, "step": 534000 }, { "epoch": 0.4271349354822469, "grad_norm": 0.6925580501556396, "learning_rate": 4.572865064517753e-05, "loss": 0.2516, "step": 534500 }, { "epoch": 0.4275345004359254, "grad_norm": 0.5748716592788696, "learning_rate": 4.5724654995640745e-05, "loss": 0.236, "step": 535000 }, { "epoch": 0.4279340653896038, "grad_norm": 0.43371474742889404, "learning_rate": 4.572065934610397e-05, "loss": 0.2279, "step": 535500 }, { "epoch": 0.42833363034328226, "grad_norm": 0.4929753541946411, "learning_rate": 4.571666369656718e-05, "loss": 0.2182, "step": 536000 }, { "epoch": 0.4287331952969607, "grad_norm": 0.5409592986106873, "learning_rate": 4.571266804703039e-05, "loss": 0.2395, "step": 536500 }, { "epoch": 0.4291327602506391, "grad_norm": 0.7002780437469482, "learning_rate": 4.570867239749361e-05, "loss": 0.2262, "step": 537000 }, { "epoch": 0.42953232520431756, "grad_norm": 0.6032687425613403, "learning_rate": 4.570467674795683e-05, "loss": 0.2309, "step": 537500 }, { "epoch": 0.429931890157996, "grad_norm": 0.20743180811405182, "learning_rate": 4.570068109842005e-05, "loss": 0.2377, "step": 538000 }, { "epoch": 0.4303314551116744, "grad_norm": 1.3616626262664795, "learning_rate": 4.569668544888326e-05, "loss": 0.2395, "step": 538500 }, { "epoch": 0.43073102006535285, "grad_norm": 0.6648136377334595, "learning_rate": 4.569268979934647e-05, "loss": 0.2407, "step": 539000 }, { "epoch": 0.4311305850190313, "grad_norm": 0.342860609292984, "learning_rate": 4.568869414980969e-05, "loss": 0.2306, "step": 539500 }, { "epoch": 0.4315301499727097, "grad_norm": 0.5305058360099792, "learning_rate": 4.5684698500272906e-05, "loss": 0.2247, "step": 540000 }, { "epoch": 0.43192971492638815, "grad_norm": 0.681922972202301, "learning_rate": 4.5680702850736125e-05, "loss": 0.2359, "step": 540500 }, { "epoch": 0.4323292798800666, "grad_norm": 0.3696162700653076, "learning_rate": 4.5676707201199336e-05, "loss": 0.2321, "step": 541000 }, { "epoch": 0.432728844833745, "grad_norm": 0.624381959438324, "learning_rate": 4.567271155166255e-05, "loss": 0.2361, "step": 541500 }, { "epoch": 0.43312840978742345, "grad_norm": 0.5698217153549194, "learning_rate": 4.566871590212577e-05, "loss": 0.2446, "step": 542000 }, { "epoch": 0.4335279747411019, "grad_norm": 0.6892802119255066, "learning_rate": 4.5664720252588984e-05, "loss": 0.2366, "step": 542500 }, { "epoch": 0.4339275396947803, "grad_norm": 0.9637826085090637, "learning_rate": 4.5660724603052195e-05, "loss": 0.2363, "step": 543000 }, { "epoch": 0.43432710464845875, "grad_norm": 0.7391327619552612, "learning_rate": 4.565672895351541e-05, "loss": 0.2221, "step": 543500 }, { "epoch": 0.4347266696021372, "grad_norm": 0.6377978920936584, "learning_rate": 4.565273330397863e-05, "loss": 0.2286, "step": 544000 }, { "epoch": 0.4351262345558156, "grad_norm": 0.5728492736816406, "learning_rate": 4.564873765444185e-05, "loss": 0.2137, "step": 544500 }, { "epoch": 0.43552579950949405, "grad_norm": 0.1452438086271286, "learning_rate": 4.564474200490506e-05, "loss": 0.2344, "step": 545000 }, { "epoch": 0.4359253644631725, "grad_norm": 0.441122442483902, "learning_rate": 4.564074635536827e-05, "loss": 0.248, "step": 545500 }, { "epoch": 0.4363249294168509, "grad_norm": 1.7536396980285645, "learning_rate": 4.563675070583149e-05, "loss": 0.2249, "step": 546000 }, { "epoch": 0.43672449437052935, "grad_norm": 1.238662600517273, "learning_rate": 4.563275505629471e-05, "loss": 0.2336, "step": 546500 }, { "epoch": 0.4371240593242078, "grad_norm": 2.903782844543457, "learning_rate": 4.562875940675793e-05, "loss": 0.2315, "step": 547000 }, { "epoch": 0.4375236242778862, "grad_norm": 1.0890220403671265, "learning_rate": 4.562476375722114e-05, "loss": 0.2312, "step": 547500 }, { "epoch": 0.43792318923156465, "grad_norm": 0.6443300247192383, "learning_rate": 4.562076810768435e-05, "loss": 0.2616, "step": 548000 }, { "epoch": 0.4383227541852431, "grad_norm": 1.065466284751892, "learning_rate": 4.5616772458147575e-05, "loss": 0.2203, "step": 548500 }, { "epoch": 0.43872231913892157, "grad_norm": 1.277522325515747, "learning_rate": 4.5612776808610786e-05, "loss": 0.2334, "step": 549000 }, { "epoch": 0.4391218840926, "grad_norm": 0.5841593146324158, "learning_rate": 4.5608781159074005e-05, "loss": 0.2418, "step": 549500 }, { "epoch": 0.43952144904627843, "grad_norm": 0.6352227926254272, "learning_rate": 4.5604785509537216e-05, "loss": 0.222, "step": 550000 }, { "epoch": 0.43992101399995687, "grad_norm": 0.522990882396698, "learning_rate": 4.5600789860000434e-05, "loss": 0.2494, "step": 550500 }, { "epoch": 0.4403205789536353, "grad_norm": 1.047712802886963, "learning_rate": 4.559679421046365e-05, "loss": 0.236, "step": 551000 }, { "epoch": 0.44072014390731373, "grad_norm": 0.986603856086731, "learning_rate": 4.5592798560926864e-05, "loss": 0.224, "step": 551500 }, { "epoch": 0.44111970886099217, "grad_norm": 0.6100319623947144, "learning_rate": 4.558880291139008e-05, "loss": 0.216, "step": 552000 }, { "epoch": 0.4415192738146706, "grad_norm": 1.0333170890808105, "learning_rate": 4.5584807261853294e-05, "loss": 0.2255, "step": 552500 }, { "epoch": 0.44191883876834903, "grad_norm": 0.8120818138122559, "learning_rate": 4.558081161231651e-05, "loss": 0.2426, "step": 553000 }, { "epoch": 0.44231840372202746, "grad_norm": 0.514979362487793, "learning_rate": 4.557681596277973e-05, "loss": 0.221, "step": 553500 }, { "epoch": 0.4427179686757059, "grad_norm": 2.9118540287017822, "learning_rate": 4.557282031324294e-05, "loss": 0.2339, "step": 554000 }, { "epoch": 0.44311753362938433, "grad_norm": 0.705633819103241, "learning_rate": 4.556882466370616e-05, "loss": 0.2276, "step": 554500 }, { "epoch": 0.44351709858306276, "grad_norm": 0.2328898161649704, "learning_rate": 4.556482901416938e-05, "loss": 0.2185, "step": 555000 }, { "epoch": 0.4439166635367412, "grad_norm": 0.8786484003067017, "learning_rate": 4.556083336463259e-05, "loss": 0.2404, "step": 555500 }, { "epoch": 0.44431622849041963, "grad_norm": 0.1976647675037384, "learning_rate": 4.555683771509581e-05, "loss": 0.2156, "step": 556000 }, { "epoch": 0.44471579344409806, "grad_norm": 0.29592365026474, "learning_rate": 4.555284206555902e-05, "loss": 0.2272, "step": 556500 }, { "epoch": 0.4451153583977765, "grad_norm": 0.5882948040962219, "learning_rate": 4.554884641602224e-05, "loss": 0.2288, "step": 557000 }, { "epoch": 0.44551492335145493, "grad_norm": 0.536082923412323, "learning_rate": 4.5544850766485455e-05, "loss": 0.2327, "step": 557500 }, { "epoch": 0.44591448830513336, "grad_norm": 0.8439269661903381, "learning_rate": 4.5540855116948667e-05, "loss": 0.2296, "step": 558000 }, { "epoch": 0.4463140532588118, "grad_norm": 0.42889177799224854, "learning_rate": 4.5536859467411885e-05, "loss": 0.2368, "step": 558500 }, { "epoch": 0.4467136182124902, "grad_norm": 0.565834105014801, "learning_rate": 4.5532863817875096e-05, "loss": 0.2421, "step": 559000 }, { "epoch": 0.44711318316616866, "grad_norm": 2.138887643814087, "learning_rate": 4.5528868168338314e-05, "loss": 0.2229, "step": 559500 }, { "epoch": 0.4475127481198471, "grad_norm": 1.8534457683563232, "learning_rate": 4.552487251880153e-05, "loss": 0.2104, "step": 560000 }, { "epoch": 0.4479123130735255, "grad_norm": 0.41285672783851624, "learning_rate": 4.5520876869264744e-05, "loss": 0.2194, "step": 560500 }, { "epoch": 0.44831187802720396, "grad_norm": 0.7710129022598267, "learning_rate": 4.551688121972796e-05, "loss": 0.217, "step": 561000 }, { "epoch": 0.4487114429808824, "grad_norm": 1.0814639329910278, "learning_rate": 4.551288557019118e-05, "loss": 0.2387, "step": 561500 }, { "epoch": 0.4491110079345608, "grad_norm": 0.5867155194282532, "learning_rate": 4.55088899206544e-05, "loss": 0.2216, "step": 562000 }, { "epoch": 0.4495105728882393, "grad_norm": 0.41568446159362793, "learning_rate": 4.550489427111761e-05, "loss": 0.2256, "step": 562500 }, { "epoch": 0.44991013784191775, "grad_norm": 0.963660478591919, "learning_rate": 4.550089862158082e-05, "loss": 0.2431, "step": 563000 }, { "epoch": 0.4503097027955962, "grad_norm": 0.7384582161903381, "learning_rate": 4.549690297204404e-05, "loss": 0.2355, "step": 563500 }, { "epoch": 0.4507092677492746, "grad_norm": 1.1604576110839844, "learning_rate": 4.549290732250726e-05, "loss": 0.2366, "step": 564000 }, { "epoch": 0.45110883270295304, "grad_norm": 0.4975533187389374, "learning_rate": 4.5488911672970476e-05, "loss": 0.2459, "step": 564500 }, { "epoch": 0.4515083976566315, "grad_norm": 0.2863835096359253, "learning_rate": 4.548491602343369e-05, "loss": 0.2358, "step": 565000 }, { "epoch": 0.4519079626103099, "grad_norm": 0.548775851726532, "learning_rate": 4.54809203738969e-05, "loss": 0.2458, "step": 565500 }, { "epoch": 0.45230752756398834, "grad_norm": 0.7608944773674011, "learning_rate": 4.547692472436012e-05, "loss": 0.2452, "step": 566000 }, { "epoch": 0.4527070925176668, "grad_norm": 0.6441822648048401, "learning_rate": 4.5472929074823335e-05, "loss": 0.2165, "step": 566500 }, { "epoch": 0.4531066574713452, "grad_norm": 0.7390038967132568, "learning_rate": 4.5468933425286554e-05, "loss": 0.2381, "step": 567000 }, { "epoch": 0.45350622242502364, "grad_norm": 0.5029245018959045, "learning_rate": 4.5464937775749765e-05, "loss": 0.225, "step": 567500 }, { "epoch": 0.4539057873787021, "grad_norm": 0.48633983731269836, "learning_rate": 4.5460942126212976e-05, "loss": 0.2177, "step": 568000 }, { "epoch": 0.4543053523323805, "grad_norm": 0.6163313984870911, "learning_rate": 4.54569464766762e-05, "loss": 0.2254, "step": 568500 }, { "epoch": 0.45470491728605894, "grad_norm": 0.3476109802722931, "learning_rate": 4.545295082713941e-05, "loss": 0.2358, "step": 569000 }, { "epoch": 0.4551044822397374, "grad_norm": 1.3655463457107544, "learning_rate": 4.5448955177602624e-05, "loss": 0.2482, "step": 569500 }, { "epoch": 0.4555040471934158, "grad_norm": 0.387949138879776, "learning_rate": 4.544495952806584e-05, "loss": 0.2328, "step": 570000 }, { "epoch": 0.45590361214709424, "grad_norm": 2.471078395843506, "learning_rate": 4.544096387852906e-05, "loss": 0.2206, "step": 570500 }, { "epoch": 0.45630317710077267, "grad_norm": 0.7426581382751465, "learning_rate": 4.543696822899228e-05, "loss": 0.2322, "step": 571000 }, { "epoch": 0.4567027420544511, "grad_norm": 0.6450051665306091, "learning_rate": 4.543297257945549e-05, "loss": 0.2349, "step": 571500 }, { "epoch": 0.45710230700812954, "grad_norm": 0.6948003768920898, "learning_rate": 4.54289769299187e-05, "loss": 0.2454, "step": 572000 }, { "epoch": 0.45750187196180797, "grad_norm": 0.552707314491272, "learning_rate": 4.542498128038192e-05, "loss": 0.2203, "step": 572500 }, { "epoch": 0.4579014369154864, "grad_norm": 0.8018070459365845, "learning_rate": 4.542098563084514e-05, "loss": 0.219, "step": 573000 }, { "epoch": 0.45830100186916484, "grad_norm": 0.7001769542694092, "learning_rate": 4.5416989981308356e-05, "loss": 0.2277, "step": 573500 }, { "epoch": 0.45870056682284327, "grad_norm": 1.0817681550979614, "learning_rate": 4.541299433177157e-05, "loss": 0.2284, "step": 574000 }, { "epoch": 0.4591001317765217, "grad_norm": 0.45155444741249084, "learning_rate": 4.540899868223478e-05, "loss": 0.2476, "step": 574500 }, { "epoch": 0.45949969673020014, "grad_norm": 0.5151932239532471, "learning_rate": 4.5405003032698004e-05, "loss": 0.2323, "step": 575000 }, { "epoch": 0.45989926168387857, "grad_norm": 0.5068463087081909, "learning_rate": 4.5401007383161215e-05, "loss": 0.2231, "step": 575500 }, { "epoch": 0.460298826637557, "grad_norm": 0.6291922926902771, "learning_rate": 4.5397011733624434e-05, "loss": 0.2428, "step": 576000 }, { "epoch": 0.4606983915912355, "grad_norm": 0.5829899907112122, "learning_rate": 4.5393016084087645e-05, "loss": 0.2519, "step": 576500 }, { "epoch": 0.4610979565449139, "grad_norm": 0.7378328442573547, "learning_rate": 4.538902043455086e-05, "loss": 0.2245, "step": 577000 }, { "epoch": 0.46149752149859236, "grad_norm": 0.3284049332141876, "learning_rate": 4.538502478501408e-05, "loss": 0.2361, "step": 577500 }, { "epoch": 0.4618970864522708, "grad_norm": 1.4062414169311523, "learning_rate": 4.538102913547729e-05, "loss": 0.2547, "step": 578000 }, { "epoch": 0.4622966514059492, "grad_norm": 0.677952229976654, "learning_rate": 4.537703348594051e-05, "loss": 0.2195, "step": 578500 }, { "epoch": 0.46269621635962765, "grad_norm": 0.8808555006980896, "learning_rate": 4.537303783640372e-05, "loss": 0.2346, "step": 579000 }, { "epoch": 0.4630957813133061, "grad_norm": 0.21391360461711884, "learning_rate": 4.536904218686694e-05, "loss": 0.2281, "step": 579500 }, { "epoch": 0.4634953462669845, "grad_norm": 0.5138593912124634, "learning_rate": 4.536504653733016e-05, "loss": 0.2202, "step": 580000 }, { "epoch": 0.46389491122066295, "grad_norm": 0.366020143032074, "learning_rate": 4.536105088779337e-05, "loss": 0.2381, "step": 580500 }, { "epoch": 0.4642944761743414, "grad_norm": 0.552202582359314, "learning_rate": 4.535705523825659e-05, "loss": 0.2296, "step": 581000 }, { "epoch": 0.4646940411280198, "grad_norm": 0.38907238841056824, "learning_rate": 4.535305958871981e-05, "loss": 0.2274, "step": 581500 }, { "epoch": 0.46509360608169825, "grad_norm": 0.7479487657546997, "learning_rate": 4.534906393918302e-05, "loss": 0.2268, "step": 582000 }, { "epoch": 0.4654931710353767, "grad_norm": 1.0776790380477905, "learning_rate": 4.5345068289646236e-05, "loss": 0.2205, "step": 582500 }, { "epoch": 0.4658927359890551, "grad_norm": 0.19274137914180756, "learning_rate": 4.534107264010945e-05, "loss": 0.2393, "step": 583000 }, { "epoch": 0.46629230094273355, "grad_norm": 0.8563949465751648, "learning_rate": 4.5337076990572666e-05, "loss": 0.2442, "step": 583500 }, { "epoch": 0.466691865896412, "grad_norm": 0.3755948841571808, "learning_rate": 4.5333081341035884e-05, "loss": 0.2282, "step": 584000 }, { "epoch": 0.4670914308500904, "grad_norm": 0.23134393990039825, "learning_rate": 4.5329085691499096e-05, "loss": 0.2553, "step": 584500 }, { "epoch": 0.46749099580376885, "grad_norm": 0.4554998576641083, "learning_rate": 4.5325090041962314e-05, "loss": 0.2213, "step": 585000 }, { "epoch": 0.4678905607574473, "grad_norm": 0.6306852698326111, "learning_rate": 4.5321094392425525e-05, "loss": 0.2234, "step": 585500 }, { "epoch": 0.4682901257111257, "grad_norm": 0.3716350495815277, "learning_rate": 4.531709874288875e-05, "loss": 0.2178, "step": 586000 }, { "epoch": 0.46868969066480415, "grad_norm": 0.5681732296943665, "learning_rate": 4.531310309335196e-05, "loss": 0.2216, "step": 586500 }, { "epoch": 0.4690892556184826, "grad_norm": 0.7895967960357666, "learning_rate": 4.530910744381517e-05, "loss": 0.2347, "step": 587000 }, { "epoch": 0.469488820572161, "grad_norm": 0.8755703568458557, "learning_rate": 4.530511179427839e-05, "loss": 0.2297, "step": 587500 }, { "epoch": 0.46988838552583945, "grad_norm": 1.1850684881210327, "learning_rate": 4.530111614474161e-05, "loss": 0.2308, "step": 588000 }, { "epoch": 0.4702879504795179, "grad_norm": 0.5291880965232849, "learning_rate": 4.529712049520483e-05, "loss": 0.2274, "step": 588500 }, { "epoch": 0.4706875154331963, "grad_norm": 0.2284049242734909, "learning_rate": 4.529312484566804e-05, "loss": 0.2261, "step": 589000 }, { "epoch": 0.47108708038687475, "grad_norm": 1.0020167827606201, "learning_rate": 4.528912919613125e-05, "loss": 0.2421, "step": 589500 }, { "epoch": 0.4714866453405532, "grad_norm": 1.2764357328414917, "learning_rate": 4.528513354659447e-05, "loss": 0.2289, "step": 590000 }, { "epoch": 0.47188621029423167, "grad_norm": 0.5374190807342529, "learning_rate": 4.528113789705769e-05, "loss": 0.2268, "step": 590500 }, { "epoch": 0.4722857752479101, "grad_norm": 0.6661812663078308, "learning_rate": 4.5277142247520905e-05, "loss": 0.2264, "step": 591000 }, { "epoch": 0.47268534020158853, "grad_norm": 0.8065260052680969, "learning_rate": 4.5273146597984117e-05, "loss": 0.222, "step": 591500 }, { "epoch": 0.47308490515526697, "grad_norm": 0.5396072864532471, "learning_rate": 4.526915094844733e-05, "loss": 0.2187, "step": 592000 }, { "epoch": 0.4734844701089454, "grad_norm": 0.46927207708358765, "learning_rate": 4.526515529891055e-05, "loss": 0.2196, "step": 592500 }, { "epoch": 0.47388403506262383, "grad_norm": 2.0054142475128174, "learning_rate": 4.5261159649373764e-05, "loss": 0.2238, "step": 593000 }, { "epoch": 0.47428360001630226, "grad_norm": 0.24352455139160156, "learning_rate": 4.525716399983698e-05, "loss": 0.2387, "step": 593500 }, { "epoch": 0.4746831649699807, "grad_norm": 1.8027924299240112, "learning_rate": 4.5253168350300194e-05, "loss": 0.2368, "step": 594000 }, { "epoch": 0.47508272992365913, "grad_norm": 1.3861920833587646, "learning_rate": 4.524917270076341e-05, "loss": 0.2299, "step": 594500 }, { "epoch": 0.47548229487733756, "grad_norm": 0.17129769921302795, "learning_rate": 4.524517705122663e-05, "loss": 0.2568, "step": 595000 }, { "epoch": 0.475881859831016, "grad_norm": 0.2521411180496216, "learning_rate": 4.524118140168984e-05, "loss": 0.2429, "step": 595500 }, { "epoch": 0.47628142478469443, "grad_norm": 2.431309461593628, "learning_rate": 4.523718575215306e-05, "loss": 0.2237, "step": 596000 }, { "epoch": 0.47668098973837286, "grad_norm": 0.8973667025566101, "learning_rate": 4.523319010261627e-05, "loss": 0.2218, "step": 596500 }, { "epoch": 0.4770805546920513, "grad_norm": 0.3433820605278015, "learning_rate": 4.522919445307949e-05, "loss": 0.2218, "step": 597000 }, { "epoch": 0.47748011964572973, "grad_norm": 0.5755271911621094, "learning_rate": 4.522519880354271e-05, "loss": 0.2336, "step": 597500 }, { "epoch": 0.47787968459940816, "grad_norm": 0.6235589385032654, "learning_rate": 4.522120315400592e-05, "loss": 0.23, "step": 598000 }, { "epoch": 0.4782792495530866, "grad_norm": 0.7247177958488464, "learning_rate": 4.521720750446913e-05, "loss": 0.2497, "step": 598500 }, { "epoch": 0.478678814506765, "grad_norm": 0.4979024827480316, "learning_rate": 4.5213211854932356e-05, "loss": 0.2451, "step": 599000 }, { "epoch": 0.47907837946044346, "grad_norm": 0.5767797827720642, "learning_rate": 4.520921620539557e-05, "loss": 0.229, "step": 599500 }, { "epoch": 0.4794779444141219, "grad_norm": 0.44774091243743896, "learning_rate": 4.5205220555858785e-05, "loss": 0.2422, "step": 600000 }, { "epoch": 0.4798775093678003, "grad_norm": 0.7127969861030579, "learning_rate": 4.5201224906322e-05, "loss": 0.2367, "step": 600500 }, { "epoch": 0.48027707432147876, "grad_norm": 0.21247299015522003, "learning_rate": 4.5197229256785215e-05, "loss": 0.2369, "step": 601000 }, { "epoch": 0.4806766392751572, "grad_norm": 0.8410676717758179, "learning_rate": 4.519323360724843e-05, "loss": 0.2389, "step": 601500 }, { "epoch": 0.4810762042288356, "grad_norm": 0.5847194194793701, "learning_rate": 4.5189237957711645e-05, "loss": 0.2383, "step": 602000 }, { "epoch": 0.48147576918251406, "grad_norm": 0.4220845401287079, "learning_rate": 4.518524230817486e-05, "loss": 0.213, "step": 602500 }, { "epoch": 0.4818753341361925, "grad_norm": 0.6332404613494873, "learning_rate": 4.5181246658638074e-05, "loss": 0.2233, "step": 603000 }, { "epoch": 0.4822748990898709, "grad_norm": 0.4952828288078308, "learning_rate": 4.517725100910129e-05, "loss": 0.2191, "step": 603500 }, { "epoch": 0.48267446404354936, "grad_norm": 0.9337939023971558, "learning_rate": 4.517325535956451e-05, "loss": 0.2396, "step": 604000 }, { "epoch": 0.48307402899722784, "grad_norm": 1.360479474067688, "learning_rate": 4.516925971002772e-05, "loss": 0.2383, "step": 604500 }, { "epoch": 0.4834735939509063, "grad_norm": 2.719214916229248, "learning_rate": 4.516526406049094e-05, "loss": 0.2467, "step": 605000 }, { "epoch": 0.4838731589045847, "grad_norm": 0.40565505623817444, "learning_rate": 4.516126841095416e-05, "loss": 0.2333, "step": 605500 }, { "epoch": 0.48427272385826314, "grad_norm": 1.4250481128692627, "learning_rate": 4.515727276141737e-05, "loss": 0.2331, "step": 606000 }, { "epoch": 0.4846722888119416, "grad_norm": 0.5482524037361145, "learning_rate": 4.515327711188059e-05, "loss": 0.2198, "step": 606500 }, { "epoch": 0.48507185376562, "grad_norm": 0.8695431351661682, "learning_rate": 4.51492814623438e-05, "loss": 0.2269, "step": 607000 }, { "epoch": 0.48547141871929844, "grad_norm": 0.3687494397163391, "learning_rate": 4.514528581280702e-05, "loss": 0.2185, "step": 607500 }, { "epoch": 0.4858709836729769, "grad_norm": 0.4522852897644043, "learning_rate": 4.5141290163270236e-05, "loss": 0.2194, "step": 608000 }, { "epoch": 0.4862705486266553, "grad_norm": 0.3042037785053253, "learning_rate": 4.513729451373345e-05, "loss": 0.2179, "step": 608500 }, { "epoch": 0.48667011358033374, "grad_norm": 0.6969441771507263, "learning_rate": 4.5133298864196665e-05, "loss": 0.229, "step": 609000 }, { "epoch": 0.4870696785340122, "grad_norm": 1.8359920978546143, "learning_rate": 4.512930321465988e-05, "loss": 0.2198, "step": 609500 }, { "epoch": 0.4874692434876906, "grad_norm": 0.6206526756286621, "learning_rate": 4.5125307565123095e-05, "loss": 0.2363, "step": 610000 }, { "epoch": 0.48786880844136904, "grad_norm": 0.6119083762168884, "learning_rate": 4.512131191558631e-05, "loss": 0.2161, "step": 610500 }, { "epoch": 0.4882683733950475, "grad_norm": 0.3411417007446289, "learning_rate": 4.5117316266049525e-05, "loss": 0.2341, "step": 611000 }, { "epoch": 0.4886679383487259, "grad_norm": 0.8488616347312927, "learning_rate": 4.511332061651274e-05, "loss": 0.2307, "step": 611500 }, { "epoch": 0.48906750330240434, "grad_norm": 0.8783763647079468, "learning_rate": 4.5109324966975954e-05, "loss": 0.2316, "step": 612000 }, { "epoch": 0.48946706825608277, "grad_norm": 1.0390297174453735, "learning_rate": 4.510532931743918e-05, "loss": 0.226, "step": 612500 }, { "epoch": 0.4898666332097612, "grad_norm": 0.47489842772483826, "learning_rate": 4.510133366790239e-05, "loss": 0.2366, "step": 613000 }, { "epoch": 0.49026619816343964, "grad_norm": 0.6540419459342957, "learning_rate": 4.50973380183656e-05, "loss": 0.2481, "step": 613500 }, { "epoch": 0.49066576311711807, "grad_norm": 0.7964276671409607, "learning_rate": 4.509334236882882e-05, "loss": 0.2273, "step": 614000 }, { "epoch": 0.4910653280707965, "grad_norm": 0.4954680800437927, "learning_rate": 4.508934671929204e-05, "loss": 0.2305, "step": 614500 }, { "epoch": 0.49146489302447494, "grad_norm": 0.6087421178817749, "learning_rate": 4.508535106975526e-05, "loss": 0.2219, "step": 615000 }, { "epoch": 0.49186445797815337, "grad_norm": 0.21649335324764252, "learning_rate": 4.508135542021847e-05, "loss": 0.2394, "step": 615500 }, { "epoch": 0.4922640229318318, "grad_norm": 0.5333234071731567, "learning_rate": 4.507735977068168e-05, "loss": 0.2242, "step": 616000 }, { "epoch": 0.49266358788551023, "grad_norm": 0.6442393660545349, "learning_rate": 4.50733641211449e-05, "loss": 0.2365, "step": 616500 }, { "epoch": 0.49306315283918867, "grad_norm": 1.9071587324142456, "learning_rate": 4.5069368471608116e-05, "loss": 0.206, "step": 617000 }, { "epoch": 0.4934627177928671, "grad_norm": 0.6894512176513672, "learning_rate": 4.5065372822071334e-05, "loss": 0.2297, "step": 617500 }, { "epoch": 0.4938622827465456, "grad_norm": 1.5849096775054932, "learning_rate": 4.5061377172534546e-05, "loss": 0.2345, "step": 618000 }, { "epoch": 0.494261847700224, "grad_norm": 0.2647152841091156, "learning_rate": 4.505738152299776e-05, "loss": 0.2113, "step": 618500 }, { "epoch": 0.49466141265390245, "grad_norm": 0.7640985250473022, "learning_rate": 4.505338587346098e-05, "loss": 0.2389, "step": 619000 }, { "epoch": 0.4950609776075809, "grad_norm": 0.5049830675125122, "learning_rate": 4.5049390223924193e-05, "loss": 0.2278, "step": 619500 }, { "epoch": 0.4954605425612593, "grad_norm": 0.5864213109016418, "learning_rate": 4.504539457438741e-05, "loss": 0.217, "step": 620000 }, { "epoch": 0.49586010751493775, "grad_norm": 0.4714498817920685, "learning_rate": 4.504139892485062e-05, "loss": 0.2315, "step": 620500 }, { "epoch": 0.4962596724686162, "grad_norm": 0.5265613794326782, "learning_rate": 4.503740327531384e-05, "loss": 0.2392, "step": 621000 }, { "epoch": 0.4966592374222946, "grad_norm": 0.7601264715194702, "learning_rate": 4.503340762577706e-05, "loss": 0.2189, "step": 621500 }, { "epoch": 0.49705880237597305, "grad_norm": 0.5209540128707886, "learning_rate": 4.502941197624027e-05, "loss": 0.2347, "step": 622000 }, { "epoch": 0.4974583673296515, "grad_norm": 1.397670030593872, "learning_rate": 4.502541632670349e-05, "loss": 0.2315, "step": 622500 }, { "epoch": 0.4978579322833299, "grad_norm": 0.3418515622615814, "learning_rate": 4.50214206771667e-05, "loss": 0.2378, "step": 623000 }, { "epoch": 0.49825749723700835, "grad_norm": 1.401111125946045, "learning_rate": 4.501742502762992e-05, "loss": 0.2169, "step": 623500 }, { "epoch": 0.4986570621906868, "grad_norm": 0.3298566937446594, "learning_rate": 4.501342937809314e-05, "loss": 0.2423, "step": 624000 }, { "epoch": 0.4990566271443652, "grad_norm": 0.496146023273468, "learning_rate": 4.500943372855635e-05, "loss": 0.2334, "step": 624500 }, { "epoch": 0.49945619209804365, "grad_norm": 1.955603003501892, "learning_rate": 4.500543807901956e-05, "loss": 0.2427, "step": 625000 }, { "epoch": 0.4998557570517221, "grad_norm": 0.3098066747188568, "learning_rate": 4.5001442429482785e-05, "loss": 0.245, "step": 625500 }, { "epoch": 0.5002553220054006, "grad_norm": 1.2386924028396606, "learning_rate": 4.4997446779945996e-05, "loss": 0.2168, "step": 626000 }, { "epoch": 0.500654886959079, "grad_norm": 1.014979362487793, "learning_rate": 4.4993451130409214e-05, "loss": 0.223, "step": 626500 }, { "epoch": 0.5010544519127574, "grad_norm": 0.6000972986221313, "learning_rate": 4.4989455480872426e-05, "loss": 0.236, "step": 627000 }, { "epoch": 0.5014540168664359, "grad_norm": 0.618749737739563, "learning_rate": 4.4985459831335644e-05, "loss": 0.2257, "step": 627500 }, { "epoch": 0.5018535818201143, "grad_norm": 0.4117027521133423, "learning_rate": 4.498146418179886e-05, "loss": 0.2307, "step": 628000 }, { "epoch": 0.5022531467737927, "grad_norm": 0.25027844309806824, "learning_rate": 4.4977468532262074e-05, "loss": 0.2346, "step": 628500 }, { "epoch": 0.5026527117274712, "grad_norm": 1.286726951599121, "learning_rate": 4.497347288272529e-05, "loss": 0.2528, "step": 629000 }, { "epoch": 0.5030522766811496, "grad_norm": 0.6455493569374084, "learning_rate": 4.49694772331885e-05, "loss": 0.2143, "step": 629500 }, { "epoch": 0.503451841634828, "grad_norm": 1.6823726892471313, "learning_rate": 4.496548158365172e-05, "loss": 0.2435, "step": 630000 }, { "epoch": 0.5038514065885065, "grad_norm": 0.9200531840324402, "learning_rate": 4.496148593411494e-05, "loss": 0.2156, "step": 630500 }, { "epoch": 0.5042509715421849, "grad_norm": 0.9004033803939819, "learning_rate": 4.495749028457815e-05, "loss": 0.2333, "step": 631000 }, { "epoch": 0.5046505364958633, "grad_norm": 0.5265014171600342, "learning_rate": 4.495349463504137e-05, "loss": 0.2389, "step": 631500 }, { "epoch": 0.5050501014495418, "grad_norm": 0.6388614773750305, "learning_rate": 4.494949898550459e-05, "loss": 0.2217, "step": 632000 }, { "epoch": 0.5054496664032202, "grad_norm": 0.5511764287948608, "learning_rate": 4.49455033359678e-05, "loss": 0.2368, "step": 632500 }, { "epoch": 0.5058492313568986, "grad_norm": 1.6534128189086914, "learning_rate": 4.494150768643102e-05, "loss": 0.2323, "step": 633000 }, { "epoch": 0.5062487963105771, "grad_norm": 0.5463139414787292, "learning_rate": 4.493751203689423e-05, "loss": 0.2099, "step": 633500 }, { "epoch": 0.5066483612642555, "grad_norm": 0.722185492515564, "learning_rate": 4.493351638735745e-05, "loss": 0.237, "step": 634000 }, { "epoch": 0.5070479262179339, "grad_norm": 0.6738904118537903, "learning_rate": 4.4929520737820665e-05, "loss": 0.2117, "step": 634500 }, { "epoch": 0.5074474911716124, "grad_norm": 0.6271284222602844, "learning_rate": 4.4925525088283876e-05, "loss": 0.2289, "step": 635000 }, { "epoch": 0.5078470561252908, "grad_norm": 0.6887884140014648, "learning_rate": 4.4921529438747094e-05, "loss": 0.2462, "step": 635500 }, { "epoch": 0.5082466210789692, "grad_norm": 0.7918697595596313, "learning_rate": 4.4917533789210306e-05, "loss": 0.2359, "step": 636000 }, { "epoch": 0.5086461860326477, "grad_norm": 0.565849244594574, "learning_rate": 4.491353813967353e-05, "loss": 0.2061, "step": 636500 }, { "epoch": 0.5090457509863261, "grad_norm": 0.5805916786193848, "learning_rate": 4.490954249013674e-05, "loss": 0.225, "step": 637000 }, { "epoch": 0.5094453159400045, "grad_norm": 0.500357449054718, "learning_rate": 4.4905546840599954e-05, "loss": 0.2427, "step": 637500 }, { "epoch": 0.509844880893683, "grad_norm": 0.4585607349872589, "learning_rate": 4.490155119106317e-05, "loss": 0.228, "step": 638000 }, { "epoch": 0.5102444458473614, "grad_norm": 0.6605742573738098, "learning_rate": 4.489755554152639e-05, "loss": 0.2182, "step": 638500 }, { "epoch": 0.5106440108010398, "grad_norm": 0.4423084557056427, "learning_rate": 4.489355989198961e-05, "loss": 0.2361, "step": 639000 }, { "epoch": 0.5110435757547183, "grad_norm": 0.5109937787055969, "learning_rate": 4.488956424245282e-05, "loss": 0.2395, "step": 639500 }, { "epoch": 0.5114431407083967, "grad_norm": 0.5425543189048767, "learning_rate": 4.488556859291603e-05, "loss": 0.2329, "step": 640000 }, { "epoch": 0.5118427056620751, "grad_norm": 1.2779380083084106, "learning_rate": 4.488157294337925e-05, "loss": 0.2287, "step": 640500 }, { "epoch": 0.5122422706157536, "grad_norm": 0.5376672744750977, "learning_rate": 4.487757729384247e-05, "loss": 0.2361, "step": 641000 }, { "epoch": 0.512641835569432, "grad_norm": 1.9182038307189941, "learning_rate": 4.4873581644305686e-05, "loss": 0.2243, "step": 641500 }, { "epoch": 0.5130414005231104, "grad_norm": 2.0834755897521973, "learning_rate": 4.48695859947689e-05, "loss": 0.2212, "step": 642000 }, { "epoch": 0.5134409654767889, "grad_norm": 0.693503201007843, "learning_rate": 4.486559034523211e-05, "loss": 0.2278, "step": 642500 }, { "epoch": 0.5138405304304673, "grad_norm": 0.6835401654243469, "learning_rate": 4.4861594695695334e-05, "loss": 0.2388, "step": 643000 }, { "epoch": 0.5142400953841457, "grad_norm": 0.6664875745773315, "learning_rate": 4.4857599046158545e-05, "loss": 0.2296, "step": 643500 }, { "epoch": 0.5146396603378242, "grad_norm": 1.137164831161499, "learning_rate": 4.485360339662176e-05, "loss": 0.2291, "step": 644000 }, { "epoch": 0.5150392252915026, "grad_norm": 0.41587525606155396, "learning_rate": 4.4849607747084975e-05, "loss": 0.2164, "step": 644500 }, { "epoch": 0.515438790245181, "grad_norm": 0.5819118618965149, "learning_rate": 4.484561209754819e-05, "loss": 0.2219, "step": 645000 }, { "epoch": 0.5158383551988595, "grad_norm": 0.5089476704597473, "learning_rate": 4.484161644801141e-05, "loss": 0.2198, "step": 645500 }, { "epoch": 0.5162379201525379, "grad_norm": 0.49881502985954285, "learning_rate": 4.483762079847462e-05, "loss": 0.2303, "step": 646000 }, { "epoch": 0.5166374851062163, "grad_norm": 0.6722689270973206, "learning_rate": 4.483362514893784e-05, "loss": 0.2378, "step": 646500 }, { "epoch": 0.5170370500598948, "grad_norm": 2.703826427459717, "learning_rate": 4.482962949940105e-05, "loss": 0.2373, "step": 647000 }, { "epoch": 0.5174366150135732, "grad_norm": 0.7266898155212402, "learning_rate": 4.482563384986427e-05, "loss": 0.226, "step": 647500 }, { "epoch": 0.5178361799672516, "grad_norm": 1.4500081539154053, "learning_rate": 4.482163820032749e-05, "loss": 0.237, "step": 648000 }, { "epoch": 0.51823574492093, "grad_norm": 0.29719462990760803, "learning_rate": 4.48176425507907e-05, "loss": 0.24, "step": 648500 }, { "epoch": 0.5186353098746085, "grad_norm": 0.7109675407409668, "learning_rate": 4.481364690125392e-05, "loss": 0.2227, "step": 649000 }, { "epoch": 0.5190348748282869, "grad_norm": 1.0697821378707886, "learning_rate": 4.4809651251717136e-05, "loss": 0.2467, "step": 649500 }, { "epoch": 0.5194344397819654, "grad_norm": 0.7030501365661621, "learning_rate": 4.480565560218035e-05, "loss": 0.2244, "step": 650000 }, { "epoch": 0.5198340047356438, "grad_norm": 0.44079655408859253, "learning_rate": 4.4801659952643566e-05, "loss": 0.212, "step": 650500 }, { "epoch": 0.5202335696893222, "grad_norm": 0.5208896398544312, "learning_rate": 4.479766430310678e-05, "loss": 0.2269, "step": 651000 }, { "epoch": 0.5206331346430007, "grad_norm": 1.6550167798995972, "learning_rate": 4.4793668653569996e-05, "loss": 0.2208, "step": 651500 }, { "epoch": 0.5210326995966791, "grad_norm": 0.7969914674758911, "learning_rate": 4.4789673004033214e-05, "loss": 0.2369, "step": 652000 }, { "epoch": 0.5214322645503575, "grad_norm": 0.9560368657112122, "learning_rate": 4.4785677354496425e-05, "loss": 0.2271, "step": 652500 }, { "epoch": 0.5218318295040361, "grad_norm": 0.7637680768966675, "learning_rate": 4.478168170495964e-05, "loss": 0.2142, "step": 653000 }, { "epoch": 0.5222313944577145, "grad_norm": 2.48057222366333, "learning_rate": 4.4777686055422855e-05, "loss": 0.2307, "step": 653500 }, { "epoch": 0.5226309594113929, "grad_norm": 0.7517579793930054, "learning_rate": 4.477369040588607e-05, "loss": 0.2404, "step": 654000 }, { "epoch": 0.5230305243650714, "grad_norm": 0.40113598108291626, "learning_rate": 4.476969475634929e-05, "loss": 0.2193, "step": 654500 }, { "epoch": 0.5234300893187498, "grad_norm": 0.6084226369857788, "learning_rate": 4.47656991068125e-05, "loss": 0.2287, "step": 655000 }, { "epoch": 0.5238296542724282, "grad_norm": 0.9362441301345825, "learning_rate": 4.476170345727572e-05, "loss": 0.2341, "step": 655500 }, { "epoch": 0.5242292192261067, "grad_norm": 0.7664728164672852, "learning_rate": 4.475770780773893e-05, "loss": 0.2498, "step": 656000 }, { "epoch": 0.5246287841797851, "grad_norm": 0.6563196778297424, "learning_rate": 4.475371215820215e-05, "loss": 0.2334, "step": 656500 }, { "epoch": 0.5250283491334635, "grad_norm": 0.4630770683288574, "learning_rate": 4.474971650866537e-05, "loss": 0.222, "step": 657000 }, { "epoch": 0.525427914087142, "grad_norm": 0.36585158109664917, "learning_rate": 4.474572085912858e-05, "loss": 0.2416, "step": 657500 }, { "epoch": 0.5258274790408204, "grad_norm": 0.23606760799884796, "learning_rate": 4.47417252095918e-05, "loss": 0.221, "step": 658000 }, { "epoch": 0.5262270439944988, "grad_norm": 0.5696447491645813, "learning_rate": 4.4737729560055016e-05, "loss": 0.2376, "step": 658500 }, { "epoch": 0.5266266089481773, "grad_norm": 0.412386953830719, "learning_rate": 4.473373391051823e-05, "loss": 0.215, "step": 659000 }, { "epoch": 0.5270261739018557, "grad_norm": 0.9317113757133484, "learning_rate": 4.4729738260981446e-05, "loss": 0.2434, "step": 659500 }, { "epoch": 0.5274257388555341, "grad_norm": 0.7555381059646606, "learning_rate": 4.472574261144466e-05, "loss": 0.2576, "step": 660000 }, { "epoch": 0.5278253038092126, "grad_norm": 0.8149796724319458, "learning_rate": 4.4721746961907876e-05, "loss": 0.2296, "step": 660500 }, { "epoch": 0.528224868762891, "grad_norm": 1.2581815719604492, "learning_rate": 4.4717751312371094e-05, "loss": 0.2371, "step": 661000 }, { "epoch": 0.5286244337165694, "grad_norm": 0.6866134405136108, "learning_rate": 4.4713755662834305e-05, "loss": 0.2505, "step": 661500 }, { "epoch": 0.5290239986702479, "grad_norm": 0.2007489800453186, "learning_rate": 4.4709760013297523e-05, "loss": 0.2221, "step": 662000 }, { "epoch": 0.5294235636239263, "grad_norm": 0.673066258430481, "learning_rate": 4.4705764363760735e-05, "loss": 0.2233, "step": 662500 }, { "epoch": 0.5298231285776047, "grad_norm": 0.9314345717430115, "learning_rate": 4.470176871422396e-05, "loss": 0.2235, "step": 663000 }, { "epoch": 0.5302226935312832, "grad_norm": 1.5068438053131104, "learning_rate": 4.469777306468717e-05, "loss": 0.2288, "step": 663500 }, { "epoch": 0.5306222584849616, "grad_norm": 0.2509062886238098, "learning_rate": 4.469377741515038e-05, "loss": 0.23, "step": 664000 }, { "epoch": 0.53102182343864, "grad_norm": 0.49496597051620483, "learning_rate": 4.46897817656136e-05, "loss": 0.2266, "step": 664500 }, { "epoch": 0.5314213883923185, "grad_norm": 0.33485737442970276, "learning_rate": 4.468578611607682e-05, "loss": 0.2232, "step": 665000 }, { "epoch": 0.5318209533459969, "grad_norm": 0.8830226063728333, "learning_rate": 4.468179046654004e-05, "loss": 0.2114, "step": 665500 }, { "epoch": 0.5322205182996753, "grad_norm": 0.45924341678619385, "learning_rate": 4.467779481700325e-05, "loss": 0.227, "step": 666000 }, { "epoch": 0.5326200832533537, "grad_norm": 0.41279464960098267, "learning_rate": 4.467379916746646e-05, "loss": 0.2388, "step": 666500 }, { "epoch": 0.5330196482070322, "grad_norm": 0.17437797784805298, "learning_rate": 4.466980351792968e-05, "loss": 0.2487, "step": 667000 }, { "epoch": 0.5334192131607106, "grad_norm": 0.8569865822792053, "learning_rate": 4.4665807868392897e-05, "loss": 0.2362, "step": 667500 }, { "epoch": 0.533818778114389, "grad_norm": 0.5389264822006226, "learning_rate": 4.4661812218856115e-05, "loss": 0.2642, "step": 668000 }, { "epoch": 0.5342183430680675, "grad_norm": 0.5005224347114563, "learning_rate": 4.4657816569319326e-05, "loss": 0.2177, "step": 668500 }, { "epoch": 0.5346179080217459, "grad_norm": 0.641656219959259, "learning_rate": 4.465382091978254e-05, "loss": 0.215, "step": 669000 }, { "epoch": 0.5350174729754243, "grad_norm": 0.4085920453071594, "learning_rate": 4.464982527024576e-05, "loss": 0.2322, "step": 669500 }, { "epoch": 0.5354170379291028, "grad_norm": 0.5788689255714417, "learning_rate": 4.4645829620708974e-05, "loss": 0.2326, "step": 670000 }, { "epoch": 0.5358166028827812, "grad_norm": 2.2682278156280518, "learning_rate": 4.464183397117219e-05, "loss": 0.2225, "step": 670500 }, { "epoch": 0.5362161678364596, "grad_norm": 0.5759963989257812, "learning_rate": 4.4637838321635404e-05, "loss": 0.2136, "step": 671000 }, { "epoch": 0.5366157327901381, "grad_norm": 0.7343336939811707, "learning_rate": 4.463384267209862e-05, "loss": 0.2324, "step": 671500 }, { "epoch": 0.5370152977438165, "grad_norm": 0.6623399257659912, "learning_rate": 4.462984702256184e-05, "loss": 0.2395, "step": 672000 }, { "epoch": 0.5374148626974949, "grad_norm": 0.3833010494709015, "learning_rate": 4.462585137302505e-05, "loss": 0.2183, "step": 672500 }, { "epoch": 0.5378144276511734, "grad_norm": 0.7391517758369446, "learning_rate": 4.462185572348827e-05, "loss": 0.2209, "step": 673000 }, { "epoch": 0.5382139926048518, "grad_norm": 0.28795596957206726, "learning_rate": 4.461786007395148e-05, "loss": 0.2176, "step": 673500 }, { "epoch": 0.5386135575585302, "grad_norm": 0.6622920036315918, "learning_rate": 4.46138644244147e-05, "loss": 0.2523, "step": 674000 }, { "epoch": 0.5390131225122087, "grad_norm": 0.4364839792251587, "learning_rate": 4.460986877487792e-05, "loss": 0.2184, "step": 674500 }, { "epoch": 0.5394126874658871, "grad_norm": 0.8518491387367249, "learning_rate": 4.460587312534113e-05, "loss": 0.2175, "step": 675000 }, { "epoch": 0.5398122524195655, "grad_norm": 0.49417388439178467, "learning_rate": 4.460187747580435e-05, "loss": 0.2527, "step": 675500 }, { "epoch": 0.540211817373244, "grad_norm": 0.6242709159851074, "learning_rate": 4.4597881826267565e-05, "loss": 0.2085, "step": 676000 }, { "epoch": 0.5406113823269224, "grad_norm": 0.5006850957870483, "learning_rate": 4.459388617673078e-05, "loss": 0.2288, "step": 676500 }, { "epoch": 0.5410109472806008, "grad_norm": 0.6567051410675049, "learning_rate": 4.4589890527193995e-05, "loss": 0.2298, "step": 677000 }, { "epoch": 0.5414105122342793, "grad_norm": 0.6471547484397888, "learning_rate": 4.4585894877657206e-05, "loss": 0.236, "step": 677500 }, { "epoch": 0.5418100771879577, "grad_norm": 0.8473989963531494, "learning_rate": 4.4581899228120425e-05, "loss": 0.2251, "step": 678000 }, { "epoch": 0.5422096421416361, "grad_norm": 0.882305383682251, "learning_rate": 4.457790357858364e-05, "loss": 0.2352, "step": 678500 }, { "epoch": 0.5426092070953146, "grad_norm": 1.0633485317230225, "learning_rate": 4.4573907929046854e-05, "loss": 0.2297, "step": 679000 }, { "epoch": 0.543008772048993, "grad_norm": 0.6615346074104309, "learning_rate": 4.456991227951007e-05, "loss": 0.2307, "step": 679500 }, { "epoch": 0.5434083370026714, "grad_norm": 0.4700793921947479, "learning_rate": 4.4565916629973284e-05, "loss": 0.2397, "step": 680000 }, { "epoch": 0.54380790195635, "grad_norm": 0.7819907069206238, "learning_rate": 4.45619209804365e-05, "loss": 0.2333, "step": 680500 }, { "epoch": 0.5442074669100284, "grad_norm": 1.7696670293807983, "learning_rate": 4.455792533089972e-05, "loss": 0.2453, "step": 681000 }, { "epoch": 0.5446070318637068, "grad_norm": 0.6636971831321716, "learning_rate": 4.455392968136293e-05, "loss": 0.2301, "step": 681500 }, { "epoch": 0.5450065968173853, "grad_norm": 1.0261421203613281, "learning_rate": 4.454993403182615e-05, "loss": 0.2242, "step": 682000 }, { "epoch": 0.5454061617710637, "grad_norm": 1.4068679809570312, "learning_rate": 4.454593838228937e-05, "loss": 0.2316, "step": 682500 }, { "epoch": 0.5458057267247421, "grad_norm": 0.6631271243095398, "learning_rate": 4.454194273275258e-05, "loss": 0.2291, "step": 683000 }, { "epoch": 0.5462052916784206, "grad_norm": 0.6115766763687134, "learning_rate": 4.45379470832158e-05, "loss": 0.2158, "step": 683500 }, { "epoch": 0.546604856632099, "grad_norm": 0.7036367654800415, "learning_rate": 4.453395143367901e-05, "loss": 0.2056, "step": 684000 }, { "epoch": 0.5470044215857774, "grad_norm": 0.7525461912155151, "learning_rate": 4.452995578414223e-05, "loss": 0.2388, "step": 684500 }, { "epoch": 0.5474039865394559, "grad_norm": 0.47955697774887085, "learning_rate": 4.4525960134605445e-05, "loss": 0.2417, "step": 685000 }, { "epoch": 0.5478035514931343, "grad_norm": 0.4651810824871063, "learning_rate": 4.452196448506866e-05, "loss": 0.2434, "step": 685500 }, { "epoch": 0.5482031164468127, "grad_norm": 0.16132938861846924, "learning_rate": 4.4517968835531875e-05, "loss": 0.2399, "step": 686000 }, { "epoch": 0.5486026814004912, "grad_norm": 0.43083593249320984, "learning_rate": 4.4513973185995086e-05, "loss": 0.2291, "step": 686500 }, { "epoch": 0.5490022463541696, "grad_norm": 0.8647883534431458, "learning_rate": 4.450997753645831e-05, "loss": 0.2161, "step": 687000 }, { "epoch": 0.549401811307848, "grad_norm": 0.4693757891654968, "learning_rate": 4.450598188692152e-05, "loss": 0.2331, "step": 687500 }, { "epoch": 0.5498013762615265, "grad_norm": 0.5710835456848145, "learning_rate": 4.4501986237384734e-05, "loss": 0.2534, "step": 688000 }, { "epoch": 0.5502009412152049, "grad_norm": 0.9328039884567261, "learning_rate": 4.449799058784795e-05, "loss": 0.2268, "step": 688500 }, { "epoch": 0.5506005061688833, "grad_norm": 0.3605673611164093, "learning_rate": 4.449399493831117e-05, "loss": 0.2338, "step": 689000 }, { "epoch": 0.5510000711225618, "grad_norm": 0.5662606358528137, "learning_rate": 4.448999928877439e-05, "loss": 0.2262, "step": 689500 }, { "epoch": 0.5513996360762402, "grad_norm": 0.30997318029403687, "learning_rate": 4.44860036392376e-05, "loss": 0.2248, "step": 690000 }, { "epoch": 0.5517992010299186, "grad_norm": 0.47042447328567505, "learning_rate": 4.448200798970081e-05, "loss": 0.2374, "step": 690500 }, { "epoch": 0.5521987659835971, "grad_norm": 0.7350055575370789, "learning_rate": 4.447801234016403e-05, "loss": 0.2344, "step": 691000 }, { "epoch": 0.5525983309372755, "grad_norm": 0.6842978596687317, "learning_rate": 4.447401669062725e-05, "loss": 0.2381, "step": 691500 }, { "epoch": 0.5529978958909539, "grad_norm": 0.5660678148269653, "learning_rate": 4.4470021041090466e-05, "loss": 0.2241, "step": 692000 }, { "epoch": 0.5533974608446324, "grad_norm": 0.635234534740448, "learning_rate": 4.446602539155368e-05, "loss": 0.2386, "step": 692500 }, { "epoch": 0.5537970257983108, "grad_norm": 0.44102439284324646, "learning_rate": 4.446202974201689e-05, "loss": 0.2306, "step": 693000 }, { "epoch": 0.5541965907519892, "grad_norm": 0.5635030269622803, "learning_rate": 4.4458034092480114e-05, "loss": 0.2256, "step": 693500 }, { "epoch": 0.5545961557056677, "grad_norm": 0.5648439526557922, "learning_rate": 4.4454038442943326e-05, "loss": 0.2131, "step": 694000 }, { "epoch": 0.5549957206593461, "grad_norm": 0.40703532099723816, "learning_rate": 4.4450042793406544e-05, "loss": 0.2463, "step": 694500 }, { "epoch": 0.5553952856130245, "grad_norm": 0.5127084851264954, "learning_rate": 4.4446047143869755e-05, "loss": 0.2086, "step": 695000 }, { "epoch": 0.555794850566703, "grad_norm": 0.49104031920433044, "learning_rate": 4.4442051494332973e-05, "loss": 0.2206, "step": 695500 }, { "epoch": 0.5561944155203814, "grad_norm": 0.7501258850097656, "learning_rate": 4.443805584479619e-05, "loss": 0.2391, "step": 696000 }, { "epoch": 0.5565939804740598, "grad_norm": 0.5664258599281311, "learning_rate": 4.44340601952594e-05, "loss": 0.231, "step": 696500 }, { "epoch": 0.5569935454277383, "grad_norm": 0.4036872982978821, "learning_rate": 4.443006454572262e-05, "loss": 0.2444, "step": 697000 }, { "epoch": 0.5573931103814167, "grad_norm": 0.945702850818634, "learning_rate": 4.442606889618583e-05, "loss": 0.2212, "step": 697500 }, { "epoch": 0.5577926753350951, "grad_norm": 0.76984041929245, "learning_rate": 4.442207324664905e-05, "loss": 0.2172, "step": 698000 }, { "epoch": 0.5581922402887736, "grad_norm": 0.5255947113037109, "learning_rate": 4.441807759711227e-05, "loss": 0.2342, "step": 698500 }, { "epoch": 0.558591805242452, "grad_norm": 0.39010703563690186, "learning_rate": 4.441408194757548e-05, "loss": 0.2298, "step": 699000 }, { "epoch": 0.5589913701961304, "grad_norm": 0.49285557866096497, "learning_rate": 4.44100862980387e-05, "loss": 0.2395, "step": 699500 }, { "epoch": 0.5593909351498089, "grad_norm": 1.0349464416503906, "learning_rate": 4.440609064850191e-05, "loss": 0.2328, "step": 700000 }, { "epoch": 0.5597905001034873, "grad_norm": 0.119346983730793, "learning_rate": 4.440209499896513e-05, "loss": 0.2158, "step": 700500 }, { "epoch": 0.5601900650571657, "grad_norm": 0.9177461266517639, "learning_rate": 4.4398099349428347e-05, "loss": 0.2296, "step": 701000 }, { "epoch": 0.5605896300108442, "grad_norm": 1.2100828886032104, "learning_rate": 4.439410369989156e-05, "loss": 0.2163, "step": 701500 }, { "epoch": 0.5609891949645226, "grad_norm": 0.7061049938201904, "learning_rate": 4.4390108050354776e-05, "loss": 0.2149, "step": 702000 }, { "epoch": 0.561388759918201, "grad_norm": 0.534760057926178, "learning_rate": 4.4386112400817994e-05, "loss": 0.2344, "step": 702500 }, { "epoch": 0.5617883248718795, "grad_norm": 0.5343360304832458, "learning_rate": 4.4382116751281206e-05, "loss": 0.2267, "step": 703000 }, { "epoch": 0.5621878898255579, "grad_norm": 0.6322435736656189, "learning_rate": 4.4378121101744424e-05, "loss": 0.2089, "step": 703500 }, { "epoch": 0.5625874547792363, "grad_norm": 0.5242030620574951, "learning_rate": 4.4374125452207635e-05, "loss": 0.2168, "step": 704000 }, { "epoch": 0.5629870197329148, "grad_norm": 0.4251486659049988, "learning_rate": 4.4370129802670854e-05, "loss": 0.2313, "step": 704500 }, { "epoch": 0.5633865846865932, "grad_norm": 0.5990568995475769, "learning_rate": 4.436613415313407e-05, "loss": 0.2259, "step": 705000 }, { "epoch": 0.5637861496402716, "grad_norm": 0.7743027806282043, "learning_rate": 4.436213850359728e-05, "loss": 0.223, "step": 705500 }, { "epoch": 0.5641857145939501, "grad_norm": 0.5335805416107178, "learning_rate": 4.43581428540605e-05, "loss": 0.211, "step": 706000 }, { "epoch": 0.5645852795476285, "grad_norm": 0.17665530741214752, "learning_rate": 4.435414720452371e-05, "loss": 0.2378, "step": 706500 }, { "epoch": 0.5649848445013069, "grad_norm": 0.602147102355957, "learning_rate": 4.435015155498694e-05, "loss": 0.2345, "step": 707000 }, { "epoch": 0.5653844094549854, "grad_norm": 0.3999220132827759, "learning_rate": 4.434615590545015e-05, "loss": 0.2273, "step": 707500 }, { "epoch": 0.5657839744086638, "grad_norm": 0.19079603254795074, "learning_rate": 4.434216025591336e-05, "loss": 0.2203, "step": 708000 }, { "epoch": 0.5661835393623423, "grad_norm": 0.38971665501594543, "learning_rate": 4.433816460637658e-05, "loss": 0.2261, "step": 708500 }, { "epoch": 0.5665831043160208, "grad_norm": 0.7050237059593201, "learning_rate": 4.43341689568398e-05, "loss": 0.2129, "step": 709000 }, { "epoch": 0.5669826692696992, "grad_norm": 1.0120710134506226, "learning_rate": 4.433017330730301e-05, "loss": 0.2251, "step": 709500 }, { "epoch": 0.5673822342233776, "grad_norm": 0.319767028093338, "learning_rate": 4.432617765776623e-05, "loss": 0.2258, "step": 710000 }, { "epoch": 0.5677817991770561, "grad_norm": 2.2331020832061768, "learning_rate": 4.432218200822944e-05, "loss": 0.2155, "step": 710500 }, { "epoch": 0.5681813641307345, "grad_norm": 0.9865570664405823, "learning_rate": 4.4318186358692656e-05, "loss": 0.232, "step": 711000 }, { "epoch": 0.5685809290844129, "grad_norm": 0.48831188678741455, "learning_rate": 4.4314190709155874e-05, "loss": 0.2257, "step": 711500 }, { "epoch": 0.5689804940380914, "grad_norm": 0.16692820191383362, "learning_rate": 4.4310195059619086e-05, "loss": 0.2259, "step": 712000 }, { "epoch": 0.5693800589917698, "grad_norm": 0.6641237735748291, "learning_rate": 4.4306199410082304e-05, "loss": 0.2197, "step": 712500 }, { "epoch": 0.5697796239454482, "grad_norm": 0.5823206901550293, "learning_rate": 4.4302203760545516e-05, "loss": 0.2389, "step": 713000 }, { "epoch": 0.5701791888991267, "grad_norm": 1.0191984176635742, "learning_rate": 4.429820811100874e-05, "loss": 0.2447, "step": 713500 }, { "epoch": 0.5705787538528051, "grad_norm": 0.6417285203933716, "learning_rate": 4.429421246147195e-05, "loss": 0.2222, "step": 714000 }, { "epoch": 0.5709783188064835, "grad_norm": 0.4262913465499878, "learning_rate": 4.429021681193516e-05, "loss": 0.2074, "step": 714500 }, { "epoch": 0.571377883760162, "grad_norm": 1.1043701171875, "learning_rate": 4.428622116239838e-05, "loss": 0.2268, "step": 715000 }, { "epoch": 0.5717774487138404, "grad_norm": 0.4814198613166809, "learning_rate": 4.42822255128616e-05, "loss": 0.2274, "step": 715500 }, { "epoch": 0.5721770136675188, "grad_norm": 0.6846593618392944, "learning_rate": 4.427822986332482e-05, "loss": 0.2207, "step": 716000 }, { "epoch": 0.5725765786211973, "grad_norm": 1.2415130138397217, "learning_rate": 4.427423421378803e-05, "loss": 0.2323, "step": 716500 }, { "epoch": 0.5729761435748757, "grad_norm": 0.9340895414352417, "learning_rate": 4.427023856425124e-05, "loss": 0.218, "step": 717000 }, { "epoch": 0.5733757085285541, "grad_norm": 0.671658456325531, "learning_rate": 4.426624291471446e-05, "loss": 0.2506, "step": 717500 }, { "epoch": 0.5737752734822326, "grad_norm": 2.0295939445495605, "learning_rate": 4.426224726517768e-05, "loss": 0.2227, "step": 718000 }, { "epoch": 0.574174838435911, "grad_norm": 0.5505226254463196, "learning_rate": 4.4258251615640895e-05, "loss": 0.2315, "step": 718500 }, { "epoch": 0.5745744033895894, "grad_norm": 0.7509803771972656, "learning_rate": 4.425425596610411e-05, "loss": 0.2328, "step": 719000 }, { "epoch": 0.5749739683432679, "grad_norm": 0.5147823095321655, "learning_rate": 4.425026031656732e-05, "loss": 0.2147, "step": 719500 }, { "epoch": 0.5753735332969463, "grad_norm": 0.941768229007721, "learning_rate": 4.424626466703054e-05, "loss": 0.2334, "step": 720000 }, { "epoch": 0.5757730982506247, "grad_norm": 0.6991438865661621, "learning_rate": 4.4242269017493755e-05, "loss": 0.2322, "step": 720500 }, { "epoch": 0.5761726632043032, "grad_norm": 1.7873008251190186, "learning_rate": 4.423827336795697e-05, "loss": 0.2218, "step": 721000 }, { "epoch": 0.5765722281579816, "grad_norm": 0.4962877631187439, "learning_rate": 4.4234277718420184e-05, "loss": 0.2183, "step": 721500 }, { "epoch": 0.57697179311166, "grad_norm": 1.7198585271835327, "learning_rate": 4.42302820688834e-05, "loss": 0.2461, "step": 722000 }, { "epoch": 0.5773713580653385, "grad_norm": 0.7577345967292786, "learning_rate": 4.422628641934662e-05, "loss": 0.2362, "step": 722500 }, { "epoch": 0.5777709230190169, "grad_norm": 0.34612515568733215, "learning_rate": 4.422229076980983e-05, "loss": 0.2235, "step": 723000 }, { "epoch": 0.5781704879726953, "grad_norm": 0.4140200614929199, "learning_rate": 4.421829512027305e-05, "loss": 0.2197, "step": 723500 }, { "epoch": 0.5785700529263738, "grad_norm": 0.6163581013679504, "learning_rate": 4.421429947073626e-05, "loss": 0.2134, "step": 724000 }, { "epoch": 0.5789696178800522, "grad_norm": 4.5335822105407715, "learning_rate": 4.421030382119948e-05, "loss": 0.2278, "step": 724500 }, { "epoch": 0.5793691828337306, "grad_norm": 0.6601273417472839, "learning_rate": 4.42063081716627e-05, "loss": 0.211, "step": 725000 }, { "epoch": 0.5797687477874091, "grad_norm": 0.5114721655845642, "learning_rate": 4.420231252212591e-05, "loss": 0.2254, "step": 725500 }, { "epoch": 0.5801683127410875, "grad_norm": 1.5241601467132568, "learning_rate": 4.419831687258913e-05, "loss": 0.2212, "step": 726000 }, { "epoch": 0.5805678776947659, "grad_norm": 0.5896867513656616, "learning_rate": 4.4194321223052346e-05, "loss": 0.2228, "step": 726500 }, { "epoch": 0.5809674426484444, "grad_norm": 0.7203184962272644, "learning_rate": 4.419032557351556e-05, "loss": 0.2344, "step": 727000 }, { "epoch": 0.5813670076021228, "grad_norm": 0.16943730413913727, "learning_rate": 4.4186329923978776e-05, "loss": 0.2277, "step": 727500 }, { "epoch": 0.5817665725558012, "grad_norm": 0.7672152519226074, "learning_rate": 4.418233427444199e-05, "loss": 0.2346, "step": 728000 }, { "epoch": 0.5821661375094797, "grad_norm": 1.5389976501464844, "learning_rate": 4.4178338624905205e-05, "loss": 0.2267, "step": 728500 }, { "epoch": 0.5825657024631581, "grad_norm": 0.4093345105648041, "learning_rate": 4.417434297536842e-05, "loss": 0.2257, "step": 729000 }, { "epoch": 0.5829652674168365, "grad_norm": 0.6321568489074707, "learning_rate": 4.4170347325831635e-05, "loss": 0.2345, "step": 729500 }, { "epoch": 0.583364832370515, "grad_norm": 0.37599632143974304, "learning_rate": 4.416635167629485e-05, "loss": 0.2267, "step": 730000 }, { "epoch": 0.5837643973241934, "grad_norm": 0.9669687747955322, "learning_rate": 4.4162356026758064e-05, "loss": 0.239, "step": 730500 }, { "epoch": 0.5841639622778718, "grad_norm": 0.575912594795227, "learning_rate": 4.415836037722129e-05, "loss": 0.2371, "step": 731000 }, { "epoch": 0.5845635272315503, "grad_norm": 0.9344297647476196, "learning_rate": 4.41543647276845e-05, "loss": 0.2354, "step": 731500 }, { "epoch": 0.5849630921852287, "grad_norm": 0.6351780295372009, "learning_rate": 4.415036907814771e-05, "loss": 0.2207, "step": 732000 }, { "epoch": 0.5853626571389071, "grad_norm": 0.7975338697433472, "learning_rate": 4.414637342861093e-05, "loss": 0.227, "step": 732500 }, { "epoch": 0.5857622220925855, "grad_norm": 0.4877091944217682, "learning_rate": 4.414237777907415e-05, "loss": 0.2035, "step": 733000 }, { "epoch": 0.586161787046264, "grad_norm": 0.5289185643196106, "learning_rate": 4.413838212953737e-05, "loss": 0.2161, "step": 733500 }, { "epoch": 0.5865613519999424, "grad_norm": 0.34557682275772095, "learning_rate": 4.413438648000058e-05, "loss": 0.2297, "step": 734000 }, { "epoch": 0.5869609169536208, "grad_norm": 0.51026451587677, "learning_rate": 4.413039083046379e-05, "loss": 0.2243, "step": 734500 }, { "epoch": 0.5873604819072993, "grad_norm": 0.7222484350204468, "learning_rate": 4.412639518092701e-05, "loss": 0.2099, "step": 735000 }, { "epoch": 0.5877600468609777, "grad_norm": 0.6441338658332825, "learning_rate": 4.4122399531390226e-05, "loss": 0.2266, "step": 735500 }, { "epoch": 0.5881596118146561, "grad_norm": 1.8135178089141846, "learning_rate": 4.4118403881853444e-05, "loss": 0.2197, "step": 736000 }, { "epoch": 0.5885591767683347, "grad_norm": 0.4727433919906616, "learning_rate": 4.4114408232316656e-05, "loss": 0.2289, "step": 736500 }, { "epoch": 0.5889587417220131, "grad_norm": 0.5936695337295532, "learning_rate": 4.411041258277987e-05, "loss": 0.2377, "step": 737000 }, { "epoch": 0.5893583066756916, "grad_norm": 0.5027349591255188, "learning_rate": 4.410641693324309e-05, "loss": 0.2316, "step": 737500 }, { "epoch": 0.58975787162937, "grad_norm": 0.9736546277999878, "learning_rate": 4.4102421283706304e-05, "loss": 0.2332, "step": 738000 }, { "epoch": 0.5901574365830484, "grad_norm": 1.3293499946594238, "learning_rate": 4.4098425634169515e-05, "loss": 0.2484, "step": 738500 }, { "epoch": 0.5905570015367269, "grad_norm": 0.5546562671661377, "learning_rate": 4.409442998463273e-05, "loss": 0.2109, "step": 739000 }, { "epoch": 0.5909565664904053, "grad_norm": 0.4511144459247589, "learning_rate": 4.409043433509595e-05, "loss": 0.2106, "step": 739500 }, { "epoch": 0.5913561314440837, "grad_norm": 1.3317924737930298, "learning_rate": 4.408643868555917e-05, "loss": 0.2314, "step": 740000 }, { "epoch": 0.5917556963977622, "grad_norm": 0.6647515892982483, "learning_rate": 4.408244303602238e-05, "loss": 0.2187, "step": 740500 }, { "epoch": 0.5921552613514406, "grad_norm": 0.31392815709114075, "learning_rate": 4.407844738648559e-05, "loss": 0.2318, "step": 741000 }, { "epoch": 0.592554826305119, "grad_norm": 0.4414021968841553, "learning_rate": 4.407445173694881e-05, "loss": 0.232, "step": 741500 }, { "epoch": 0.5929543912587975, "grad_norm": 1.1339325904846191, "learning_rate": 4.407045608741203e-05, "loss": 0.2253, "step": 742000 }, { "epoch": 0.5933539562124759, "grad_norm": 1.076136589050293, "learning_rate": 4.406646043787525e-05, "loss": 0.228, "step": 742500 }, { "epoch": 0.5937535211661543, "grad_norm": 0.5559945106506348, "learning_rate": 4.406246478833846e-05, "loss": 0.235, "step": 743000 }, { "epoch": 0.5941530861198328, "grad_norm": 0.863405704498291, "learning_rate": 4.405846913880167e-05, "loss": 0.218, "step": 743500 }, { "epoch": 0.5945526510735112, "grad_norm": 0.5520251393318176, "learning_rate": 4.405447348926489e-05, "loss": 0.2233, "step": 744000 }, { "epoch": 0.5949522160271896, "grad_norm": 0.42375823855400085, "learning_rate": 4.4050477839728106e-05, "loss": 0.2336, "step": 744500 }, { "epoch": 0.595351780980868, "grad_norm": 0.7282446622848511, "learning_rate": 4.4046482190191324e-05, "loss": 0.2174, "step": 745000 }, { "epoch": 0.5957513459345465, "grad_norm": 0.8236212134361267, "learning_rate": 4.4042486540654536e-05, "loss": 0.2203, "step": 745500 }, { "epoch": 0.5961509108882249, "grad_norm": 0.7376317977905273, "learning_rate": 4.4038490891117754e-05, "loss": 0.239, "step": 746000 }, { "epoch": 0.5965504758419033, "grad_norm": 0.8282159566879272, "learning_rate": 4.403449524158097e-05, "loss": 0.1987, "step": 746500 }, { "epoch": 0.5969500407955818, "grad_norm": 0.8802658915519714, "learning_rate": 4.4030499592044184e-05, "loss": 0.2492, "step": 747000 }, { "epoch": 0.5973496057492602, "grad_norm": 0.624218225479126, "learning_rate": 4.40265039425074e-05, "loss": 0.2228, "step": 747500 }, { "epoch": 0.5977491707029386, "grad_norm": 0.9406605958938599, "learning_rate": 4.402250829297061e-05, "loss": 0.2381, "step": 748000 }, { "epoch": 0.5981487356566171, "grad_norm": 0.7924309968948364, "learning_rate": 4.401851264343383e-05, "loss": 0.2147, "step": 748500 }, { "epoch": 0.5985483006102955, "grad_norm": 0.6263455152511597, "learning_rate": 4.401451699389705e-05, "loss": 0.2405, "step": 749000 }, { "epoch": 0.598947865563974, "grad_norm": 0.18648332357406616, "learning_rate": 4.401052134436026e-05, "loss": 0.2192, "step": 749500 }, { "epoch": 0.5993474305176524, "grad_norm": 0.7648832201957703, "learning_rate": 4.400652569482348e-05, "loss": 0.2421, "step": 750000 }, { "epoch": 0.5997469954713308, "grad_norm": 0.5041489601135254, "learning_rate": 4.400253004528669e-05, "loss": 0.2329, "step": 750500 }, { "epoch": 0.6001465604250092, "grad_norm": 1.1644434928894043, "learning_rate": 4.399853439574991e-05, "loss": 0.2163, "step": 751000 }, { "epoch": 0.6005461253786877, "grad_norm": 1.9772003889083862, "learning_rate": 4.399453874621313e-05, "loss": 0.2227, "step": 751500 }, { "epoch": 0.6009456903323661, "grad_norm": 1.0899012088775635, "learning_rate": 4.399054309667634e-05, "loss": 0.2403, "step": 752000 }, { "epoch": 0.6013452552860445, "grad_norm": 0.6890034675598145, "learning_rate": 4.398654744713956e-05, "loss": 0.219, "step": 752500 }, { "epoch": 0.601744820239723, "grad_norm": 0.45218318700790405, "learning_rate": 4.3982551797602775e-05, "loss": 0.23, "step": 753000 }, { "epoch": 0.6021443851934014, "grad_norm": 0.5982565879821777, "learning_rate": 4.3978556148065986e-05, "loss": 0.2356, "step": 753500 }, { "epoch": 0.6025439501470798, "grad_norm": 0.9245608448982239, "learning_rate": 4.3974560498529205e-05, "loss": 0.2264, "step": 754000 }, { "epoch": 0.6029435151007583, "grad_norm": 0.4665878117084503, "learning_rate": 4.3970564848992416e-05, "loss": 0.211, "step": 754500 }, { "epoch": 0.6033430800544367, "grad_norm": 0.36746668815612793, "learning_rate": 4.3966569199455634e-05, "loss": 0.2096, "step": 755000 }, { "epoch": 0.6037426450081151, "grad_norm": 0.7364276647567749, "learning_rate": 4.396257354991885e-05, "loss": 0.2094, "step": 755500 }, { "epoch": 0.6041422099617936, "grad_norm": 0.6837543249130249, "learning_rate": 4.3958577900382064e-05, "loss": 0.2301, "step": 756000 }, { "epoch": 0.604541774915472, "grad_norm": 0.34209325909614563, "learning_rate": 4.395458225084528e-05, "loss": 0.2409, "step": 756500 }, { "epoch": 0.6049413398691504, "grad_norm": 1.0906201601028442, "learning_rate": 4.3950586601308493e-05, "loss": 0.2171, "step": 757000 }, { "epoch": 0.6053409048228289, "grad_norm": 1.7237566709518433, "learning_rate": 4.394659095177172e-05, "loss": 0.2322, "step": 757500 }, { "epoch": 0.6057404697765073, "grad_norm": 0.6450017690658569, "learning_rate": 4.394259530223493e-05, "loss": 0.2188, "step": 758000 }, { "epoch": 0.6061400347301857, "grad_norm": 0.4253807067871094, "learning_rate": 4.393859965269814e-05, "loss": 0.2484, "step": 758500 }, { "epoch": 0.6065395996838642, "grad_norm": 0.5544289946556091, "learning_rate": 4.393460400316136e-05, "loss": 0.2274, "step": 759000 }, { "epoch": 0.6069391646375426, "grad_norm": 0.5791451334953308, "learning_rate": 4.393060835362458e-05, "loss": 0.2223, "step": 759500 }, { "epoch": 0.607338729591221, "grad_norm": 0.1416904479265213, "learning_rate": 4.3926612704087796e-05, "loss": 0.2283, "step": 760000 }, { "epoch": 0.6077382945448995, "grad_norm": 0.7038934230804443, "learning_rate": 4.392261705455101e-05, "loss": 0.2182, "step": 760500 }, { "epoch": 0.6081378594985779, "grad_norm": 1.4120445251464844, "learning_rate": 4.391862140501422e-05, "loss": 0.2337, "step": 761000 }, { "epoch": 0.6085374244522563, "grad_norm": 0.17623810470104218, "learning_rate": 4.391462575547744e-05, "loss": 0.2141, "step": 761500 }, { "epoch": 0.6089369894059348, "grad_norm": 1.68950355052948, "learning_rate": 4.3910630105940655e-05, "loss": 0.2342, "step": 762000 }, { "epoch": 0.6093365543596132, "grad_norm": 0.6569217443466187, "learning_rate": 4.390663445640387e-05, "loss": 0.2248, "step": 762500 }, { "epoch": 0.6097361193132916, "grad_norm": 0.5435954332351685, "learning_rate": 4.3902638806867085e-05, "loss": 0.2411, "step": 763000 }, { "epoch": 0.6101356842669701, "grad_norm": 0.596948504447937, "learning_rate": 4.3898643157330296e-05, "loss": 0.2305, "step": 763500 }, { "epoch": 0.6105352492206486, "grad_norm": 1.6915849447250366, "learning_rate": 4.389464750779352e-05, "loss": 0.2374, "step": 764000 }, { "epoch": 0.610934814174327, "grad_norm": 0.9402076601982117, "learning_rate": 4.389065185825673e-05, "loss": 0.2318, "step": 764500 }, { "epoch": 0.6113343791280055, "grad_norm": 1.1245558261871338, "learning_rate": 4.3886656208719944e-05, "loss": 0.2182, "step": 765000 }, { "epoch": 0.6117339440816839, "grad_norm": 0.4361468553543091, "learning_rate": 4.388266055918316e-05, "loss": 0.246, "step": 765500 }, { "epoch": 0.6121335090353623, "grad_norm": 0.4640135169029236, "learning_rate": 4.387866490964638e-05, "loss": 0.22, "step": 766000 }, { "epoch": 0.6125330739890408, "grad_norm": 0.42536622285842896, "learning_rate": 4.38746692601096e-05, "loss": 0.2132, "step": 766500 }, { "epoch": 0.6129326389427192, "grad_norm": 0.9835397005081177, "learning_rate": 4.387067361057281e-05, "loss": 0.2249, "step": 767000 }, { "epoch": 0.6133322038963976, "grad_norm": 0.5579205751419067, "learning_rate": 4.386667796103602e-05, "loss": 0.2237, "step": 767500 }, { "epoch": 0.6137317688500761, "grad_norm": 0.5491019487380981, "learning_rate": 4.386268231149924e-05, "loss": 0.2014, "step": 768000 }, { "epoch": 0.6141313338037545, "grad_norm": 3.978527307510376, "learning_rate": 4.385868666196246e-05, "loss": 0.2278, "step": 768500 }, { "epoch": 0.6145308987574329, "grad_norm": 0.5805398225784302, "learning_rate": 4.3854691012425676e-05, "loss": 0.2199, "step": 769000 }, { "epoch": 0.6149304637111114, "grad_norm": 0.2604517638683319, "learning_rate": 4.385069536288889e-05, "loss": 0.2183, "step": 769500 }, { "epoch": 0.6153300286647898, "grad_norm": 0.8612673878669739, "learning_rate": 4.38466997133521e-05, "loss": 0.2399, "step": 770000 }, { "epoch": 0.6157295936184682, "grad_norm": 1.6486566066741943, "learning_rate": 4.3842704063815324e-05, "loss": 0.1977, "step": 770500 }, { "epoch": 0.6161291585721467, "grad_norm": 0.9465038776397705, "learning_rate": 4.3838708414278535e-05, "loss": 0.2515, "step": 771000 }, { "epoch": 0.6165287235258251, "grad_norm": 0.3152056038379669, "learning_rate": 4.3834712764741753e-05, "loss": 0.2205, "step": 771500 }, { "epoch": 0.6169282884795035, "grad_norm": 0.4775288999080658, "learning_rate": 4.3830717115204965e-05, "loss": 0.2122, "step": 772000 }, { "epoch": 0.617327853433182, "grad_norm": 1.0798420906066895, "learning_rate": 4.382672146566818e-05, "loss": 0.2514, "step": 772500 }, { "epoch": 0.6177274183868604, "grad_norm": 0.5105395317077637, "learning_rate": 4.38227258161314e-05, "loss": 0.2304, "step": 773000 }, { "epoch": 0.6181269833405388, "grad_norm": 0.5020871162414551, "learning_rate": 4.381873016659461e-05, "loss": 0.2328, "step": 773500 }, { "epoch": 0.6185265482942173, "grad_norm": 0.6652044653892517, "learning_rate": 4.381473451705783e-05, "loss": 0.2266, "step": 774000 }, { "epoch": 0.6189261132478957, "grad_norm": 0.5767709016799927, "learning_rate": 4.381073886752104e-05, "loss": 0.2347, "step": 774500 }, { "epoch": 0.6193256782015741, "grad_norm": 0.5908842086791992, "learning_rate": 4.380674321798426e-05, "loss": 0.2403, "step": 775000 }, { "epoch": 0.6197252431552526, "grad_norm": 1.8429127931594849, "learning_rate": 4.380274756844748e-05, "loss": 0.2291, "step": 775500 }, { "epoch": 0.620124808108931, "grad_norm": 0.9386611580848694, "learning_rate": 4.379875191891069e-05, "loss": 0.2553, "step": 776000 }, { "epoch": 0.6205243730626094, "grad_norm": 0.7297807335853577, "learning_rate": 4.379475626937391e-05, "loss": 0.2269, "step": 776500 }, { "epoch": 0.6209239380162879, "grad_norm": 1.7423710823059082, "learning_rate": 4.3790760619837127e-05, "loss": 0.2283, "step": 777000 }, { "epoch": 0.6213235029699663, "grad_norm": 0.8201688528060913, "learning_rate": 4.378676497030034e-05, "loss": 0.2342, "step": 777500 }, { "epoch": 0.6217230679236447, "grad_norm": 0.5559523701667786, "learning_rate": 4.3782769320763556e-05, "loss": 0.2368, "step": 778000 }, { "epoch": 0.6221226328773232, "grad_norm": 0.5768117308616638, "learning_rate": 4.377877367122677e-05, "loss": 0.228, "step": 778500 }, { "epoch": 0.6225221978310016, "grad_norm": 0.6892934441566467, "learning_rate": 4.3774778021689986e-05, "loss": 0.2335, "step": 779000 }, { "epoch": 0.62292176278468, "grad_norm": 0.48002585768699646, "learning_rate": 4.3770782372153204e-05, "loss": 0.2117, "step": 779500 }, { "epoch": 0.6233213277383585, "grad_norm": 0.6138873100280762, "learning_rate": 4.3766786722616415e-05, "loss": 0.2056, "step": 780000 }, { "epoch": 0.6237208926920369, "grad_norm": 0.4460686147212982, "learning_rate": 4.3762791073079634e-05, "loss": 0.2226, "step": 780500 }, { "epoch": 0.6241204576457153, "grad_norm": 0.7545770406723022, "learning_rate": 4.3758795423542845e-05, "loss": 0.2217, "step": 781000 }, { "epoch": 0.6245200225993938, "grad_norm": 1.06951904296875, "learning_rate": 4.375479977400607e-05, "loss": 0.2226, "step": 781500 }, { "epoch": 0.6249195875530722, "grad_norm": 1.3200165033340454, "learning_rate": 4.375080412446928e-05, "loss": 0.2281, "step": 782000 }, { "epoch": 0.6253191525067506, "grad_norm": 0.5047633647918701, "learning_rate": 4.374680847493249e-05, "loss": 0.2291, "step": 782500 }, { "epoch": 0.6257187174604291, "grad_norm": 0.6745198965072632, "learning_rate": 4.374281282539571e-05, "loss": 0.2227, "step": 783000 }, { "epoch": 0.6261182824141075, "grad_norm": 0.32213854789733887, "learning_rate": 4.373881717585893e-05, "loss": 0.223, "step": 783500 }, { "epoch": 0.6265178473677859, "grad_norm": 0.39328324794769287, "learning_rate": 4.373482152632215e-05, "loss": 0.1939, "step": 784000 }, { "epoch": 0.6269174123214644, "grad_norm": 0.3568546175956726, "learning_rate": 4.373082587678536e-05, "loss": 0.2282, "step": 784500 }, { "epoch": 0.6273169772751428, "grad_norm": 2.139554262161255, "learning_rate": 4.372683022724857e-05, "loss": 0.2295, "step": 785000 }, { "epoch": 0.6277165422288212, "grad_norm": 0.33760812878608704, "learning_rate": 4.372283457771179e-05, "loss": 0.2261, "step": 785500 }, { "epoch": 0.6281161071824997, "grad_norm": 1.1484150886535645, "learning_rate": 4.371883892817501e-05, "loss": 0.2342, "step": 786000 }, { "epoch": 0.6285156721361781, "grad_norm": 1.1624525785446167, "learning_rate": 4.3714843278638225e-05, "loss": 0.2356, "step": 786500 }, { "epoch": 0.6289152370898565, "grad_norm": 0.21700118482112885, "learning_rate": 4.3710847629101436e-05, "loss": 0.2136, "step": 787000 }, { "epoch": 0.629314802043535, "grad_norm": 0.5261659026145935, "learning_rate": 4.370685197956465e-05, "loss": 0.2261, "step": 787500 }, { "epoch": 0.6297143669972134, "grad_norm": 0.5640347599983215, "learning_rate": 4.3702856330027866e-05, "loss": 0.2114, "step": 788000 }, { "epoch": 0.6301139319508918, "grad_norm": 0.4187915325164795, "learning_rate": 4.3698860680491084e-05, "loss": 0.2067, "step": 788500 }, { "epoch": 0.6305134969045703, "grad_norm": 2.3570196628570557, "learning_rate": 4.36948650309543e-05, "loss": 0.2355, "step": 789000 }, { "epoch": 0.6309130618582487, "grad_norm": 0.23243282735347748, "learning_rate": 4.3690869381417514e-05, "loss": 0.2332, "step": 789500 }, { "epoch": 0.6313126268119271, "grad_norm": 0.5741555690765381, "learning_rate": 4.368687373188073e-05, "loss": 0.2218, "step": 790000 }, { "epoch": 0.6317121917656056, "grad_norm": 0.3831005096435547, "learning_rate": 4.368287808234395e-05, "loss": 0.223, "step": 790500 }, { "epoch": 0.632111756719284, "grad_norm": 0.34744009375572205, "learning_rate": 4.367888243280716e-05, "loss": 0.2339, "step": 791000 }, { "epoch": 0.6325113216729624, "grad_norm": 1.1368319988250732, "learning_rate": 4.367488678327038e-05, "loss": 0.2283, "step": 791500 }, { "epoch": 0.632910886626641, "grad_norm": 1.0490485429763794, "learning_rate": 4.367089113373359e-05, "loss": 0.2189, "step": 792000 }, { "epoch": 0.6333104515803194, "grad_norm": 0.7500748038291931, "learning_rate": 4.366689548419681e-05, "loss": 0.2292, "step": 792500 }, { "epoch": 0.6337100165339978, "grad_norm": 0.6467641592025757, "learning_rate": 4.366289983466003e-05, "loss": 0.2364, "step": 793000 }, { "epoch": 0.6341095814876763, "grad_norm": 1.7789288759231567, "learning_rate": 4.365890418512324e-05, "loss": 0.2149, "step": 793500 }, { "epoch": 0.6345091464413547, "grad_norm": 1.4563851356506348, "learning_rate": 4.365490853558645e-05, "loss": 0.221, "step": 794000 }, { "epoch": 0.6349087113950331, "grad_norm": 0.3016975522041321, "learning_rate": 4.365091288604967e-05, "loss": 0.2164, "step": 794500 }, { "epoch": 0.6353082763487116, "grad_norm": 0.4633747935295105, "learning_rate": 4.364691723651289e-05, "loss": 0.2259, "step": 795000 }, { "epoch": 0.63570784130239, "grad_norm": 0.691145658493042, "learning_rate": 4.3642921586976105e-05, "loss": 0.1988, "step": 795500 }, { "epoch": 0.6361074062560684, "grad_norm": 1.9188761711120605, "learning_rate": 4.3638925937439316e-05, "loss": 0.2275, "step": 796000 }, { "epoch": 0.6365069712097469, "grad_norm": 0.5483871698379517, "learning_rate": 4.363493028790253e-05, "loss": 0.1949, "step": 796500 }, { "epoch": 0.6369065361634253, "grad_norm": 0.5885448455810547, "learning_rate": 4.363093463836575e-05, "loss": 0.2185, "step": 797000 }, { "epoch": 0.6373061011171037, "grad_norm": 1.4083855152130127, "learning_rate": 4.3626938988828964e-05, "loss": 0.2301, "step": 797500 }, { "epoch": 0.6377056660707822, "grad_norm": 0.9391152262687683, "learning_rate": 4.362294333929218e-05, "loss": 0.2215, "step": 798000 }, { "epoch": 0.6381052310244606, "grad_norm": 1.2463512420654297, "learning_rate": 4.3618947689755394e-05, "loss": 0.2199, "step": 798500 }, { "epoch": 0.638504795978139, "grad_norm": 0.3792072832584381, "learning_rate": 4.361495204021861e-05, "loss": 0.2435, "step": 799000 }, { "epoch": 0.6389043609318175, "grad_norm": 0.4842444062232971, "learning_rate": 4.361095639068183e-05, "loss": 0.227, "step": 799500 }, { "epoch": 0.6393039258854959, "grad_norm": 0.537302553653717, "learning_rate": 4.360696074114504e-05, "loss": 0.2272, "step": 800000 }, { "epoch": 0.6397034908391743, "grad_norm": 0.8203772306442261, "learning_rate": 4.360296509160826e-05, "loss": 0.2274, "step": 800500 }, { "epoch": 0.6401030557928528, "grad_norm": 0.49707454442977905, "learning_rate": 4.359896944207147e-05, "loss": 0.212, "step": 801000 }, { "epoch": 0.6405026207465312, "grad_norm": 0.5812466740608215, "learning_rate": 4.359497379253469e-05, "loss": 0.2341, "step": 801500 }, { "epoch": 0.6409021857002096, "grad_norm": 1.8318969011306763, "learning_rate": 4.359097814299791e-05, "loss": 0.2237, "step": 802000 }, { "epoch": 0.6413017506538881, "grad_norm": 0.7821720242500305, "learning_rate": 4.358698249346112e-05, "loss": 0.2103, "step": 802500 }, { "epoch": 0.6417013156075665, "grad_norm": 1.695074200630188, "learning_rate": 4.358298684392434e-05, "loss": 0.2344, "step": 803000 }, { "epoch": 0.6421008805612449, "grad_norm": 2.0922155380249023, "learning_rate": 4.3578991194387556e-05, "loss": 0.2253, "step": 803500 }, { "epoch": 0.6425004455149234, "grad_norm": 0.32512956857681274, "learning_rate": 4.357499554485077e-05, "loss": 0.2368, "step": 804000 }, { "epoch": 0.6429000104686018, "grad_norm": 0.90260910987854, "learning_rate": 4.3570999895313985e-05, "loss": 0.2111, "step": 804500 }, { "epoch": 0.6432995754222802, "grad_norm": 0.524996817111969, "learning_rate": 4.3567004245777197e-05, "loss": 0.2188, "step": 805000 }, { "epoch": 0.6436991403759587, "grad_norm": 0.49474629759788513, "learning_rate": 4.3563008596240415e-05, "loss": 0.204, "step": 805500 }, { "epoch": 0.6440987053296371, "grad_norm": 0.4649052023887634, "learning_rate": 4.355901294670363e-05, "loss": 0.2199, "step": 806000 }, { "epoch": 0.6444982702833155, "grad_norm": 0.6617453098297119, "learning_rate": 4.3555017297166844e-05, "loss": 0.2219, "step": 806500 }, { "epoch": 0.644897835236994, "grad_norm": 0.5112816691398621, "learning_rate": 4.355102164763006e-05, "loss": 0.2519, "step": 807000 }, { "epoch": 0.6452974001906724, "grad_norm": 0.7199938893318176, "learning_rate": 4.3547025998093274e-05, "loss": 0.2199, "step": 807500 }, { "epoch": 0.6456969651443508, "grad_norm": 0.31842240691185, "learning_rate": 4.35430303485565e-05, "loss": 0.2193, "step": 808000 }, { "epoch": 0.6460965300980293, "grad_norm": 3.614222526550293, "learning_rate": 4.353903469901971e-05, "loss": 0.2415, "step": 808500 }, { "epoch": 0.6464960950517077, "grad_norm": 0.5894742608070374, "learning_rate": 4.353503904948292e-05, "loss": 0.2292, "step": 809000 }, { "epoch": 0.6468956600053861, "grad_norm": 0.6458737850189209, "learning_rate": 4.353104339994614e-05, "loss": 0.2245, "step": 809500 }, { "epoch": 0.6472952249590646, "grad_norm": 0.4193842113018036, "learning_rate": 4.352704775040936e-05, "loss": 0.2284, "step": 810000 }, { "epoch": 0.647694789912743, "grad_norm": 1.9019514322280884, "learning_rate": 4.3523052100872576e-05, "loss": 0.2177, "step": 810500 }, { "epoch": 0.6480943548664214, "grad_norm": 1.499301552772522, "learning_rate": 4.351905645133579e-05, "loss": 0.2211, "step": 811000 }, { "epoch": 0.6484939198200999, "grad_norm": 0.3986019790172577, "learning_rate": 4.3515060801799e-05, "loss": 0.2257, "step": 811500 }, { "epoch": 0.6488934847737783, "grad_norm": 0.8653873205184937, "learning_rate": 4.351106515226222e-05, "loss": 0.2277, "step": 812000 }, { "epoch": 0.6492930497274567, "grad_norm": 1.3047394752502441, "learning_rate": 4.3507069502725436e-05, "loss": 0.2135, "step": 812500 }, { "epoch": 0.6496926146811351, "grad_norm": 0.6085476279258728, "learning_rate": 4.3503073853188654e-05, "loss": 0.2212, "step": 813000 }, { "epoch": 0.6500921796348136, "grad_norm": 1.4729853868484497, "learning_rate": 4.3499078203651865e-05, "loss": 0.2261, "step": 813500 }, { "epoch": 0.650491744588492, "grad_norm": 1.8302658796310425, "learning_rate": 4.349508255411508e-05, "loss": 0.2199, "step": 814000 }, { "epoch": 0.6508913095421704, "grad_norm": 0.6863667368888855, "learning_rate": 4.34910869045783e-05, "loss": 0.2208, "step": 814500 }, { "epoch": 0.6512908744958489, "grad_norm": 0.5953105688095093, "learning_rate": 4.348709125504151e-05, "loss": 0.2195, "step": 815000 }, { "epoch": 0.6516904394495273, "grad_norm": 1.0713015794754028, "learning_rate": 4.348309560550473e-05, "loss": 0.2295, "step": 815500 }, { "epoch": 0.6520900044032057, "grad_norm": 0.532387912273407, "learning_rate": 4.347909995596794e-05, "loss": 0.2161, "step": 816000 }, { "epoch": 0.6524895693568842, "grad_norm": 0.5101562142372131, "learning_rate": 4.347510430643116e-05, "loss": 0.2286, "step": 816500 }, { "epoch": 0.6528891343105626, "grad_norm": 0.42262691259384155, "learning_rate": 4.347110865689438e-05, "loss": 0.2253, "step": 817000 }, { "epoch": 0.653288699264241, "grad_norm": 1.8892552852630615, "learning_rate": 4.346711300735759e-05, "loss": 0.2416, "step": 817500 }, { "epoch": 0.6536882642179195, "grad_norm": 0.8660463690757751, "learning_rate": 4.346311735782081e-05, "loss": 0.229, "step": 818000 }, { "epoch": 0.6540878291715979, "grad_norm": 1.9002809524536133, "learning_rate": 4.345912170828402e-05, "loss": 0.2113, "step": 818500 }, { "epoch": 0.6544873941252763, "grad_norm": 0.5560393333435059, "learning_rate": 4.345512605874724e-05, "loss": 0.2172, "step": 819000 }, { "epoch": 0.6548869590789549, "grad_norm": 0.5570691227912903, "learning_rate": 4.345113040921046e-05, "loss": 0.232, "step": 819500 }, { "epoch": 0.6552865240326333, "grad_norm": 0.9083230495452881, "learning_rate": 4.344713475967367e-05, "loss": 0.2268, "step": 820000 }, { "epoch": 0.6556860889863118, "grad_norm": 0.6030833721160889, "learning_rate": 4.344313911013688e-05, "loss": 0.2176, "step": 820500 }, { "epoch": 0.6560856539399902, "grad_norm": 0.5288799405097961, "learning_rate": 4.3439143460600104e-05, "loss": 0.2282, "step": 821000 }, { "epoch": 0.6564852188936686, "grad_norm": 0.5955397486686707, "learning_rate": 4.3435147811063316e-05, "loss": 0.1924, "step": 821500 }, { "epoch": 0.656884783847347, "grad_norm": 0.5692636966705322, "learning_rate": 4.3431152161526534e-05, "loss": 0.2173, "step": 822000 }, { "epoch": 0.6572843488010255, "grad_norm": 0.45736151933670044, "learning_rate": 4.3427156511989745e-05, "loss": 0.2291, "step": 822500 }, { "epoch": 0.6576839137547039, "grad_norm": 0.6397861242294312, "learning_rate": 4.3423160862452964e-05, "loss": 0.2333, "step": 823000 }, { "epoch": 0.6580834787083824, "grad_norm": 2.6534969806671143, "learning_rate": 4.341916521291618e-05, "loss": 0.2433, "step": 823500 }, { "epoch": 0.6584830436620608, "grad_norm": 0.16590847074985504, "learning_rate": 4.341516956337939e-05, "loss": 0.2277, "step": 824000 }, { "epoch": 0.6588826086157392, "grad_norm": 0.8407473564147949, "learning_rate": 4.341117391384261e-05, "loss": 0.207, "step": 824500 }, { "epoch": 0.6592821735694177, "grad_norm": 0.6105552911758423, "learning_rate": 4.340717826430582e-05, "loss": 0.2094, "step": 825000 }, { "epoch": 0.6596817385230961, "grad_norm": 3.0662994384765625, "learning_rate": 4.340318261476904e-05, "loss": 0.2547, "step": 825500 }, { "epoch": 0.6600813034767745, "grad_norm": 0.7158534526824951, "learning_rate": 4.339918696523226e-05, "loss": 0.2425, "step": 826000 }, { "epoch": 0.660480868430453, "grad_norm": 2.131422996520996, "learning_rate": 4.339519131569547e-05, "loss": 0.2278, "step": 826500 }, { "epoch": 0.6608804333841314, "grad_norm": 0.8151887059211731, "learning_rate": 4.339119566615869e-05, "loss": 0.2419, "step": 827000 }, { "epoch": 0.6612799983378098, "grad_norm": 0.16684281826019287, "learning_rate": 4.338720001662191e-05, "loss": 0.2215, "step": 827500 }, { "epoch": 0.6616795632914882, "grad_norm": 0.6663644909858704, "learning_rate": 4.338320436708512e-05, "loss": 0.2146, "step": 828000 }, { "epoch": 0.6620791282451667, "grad_norm": 0.17191651463508606, "learning_rate": 4.337920871754834e-05, "loss": 0.2165, "step": 828500 }, { "epoch": 0.6624786931988451, "grad_norm": 0.6319560408592224, "learning_rate": 4.337521306801155e-05, "loss": 0.2335, "step": 829000 }, { "epoch": 0.6628782581525235, "grad_norm": 0.7238369584083557, "learning_rate": 4.3371217418474766e-05, "loss": 0.2417, "step": 829500 }, { "epoch": 0.663277823106202, "grad_norm": 0.7892149686813354, "learning_rate": 4.3367221768937985e-05, "loss": 0.2175, "step": 830000 }, { "epoch": 0.6636773880598804, "grad_norm": 0.37296032905578613, "learning_rate": 4.3363226119401196e-05, "loss": 0.2383, "step": 830500 }, { "epoch": 0.6640769530135588, "grad_norm": 0.79742032289505, "learning_rate": 4.3359230469864414e-05, "loss": 0.2196, "step": 831000 }, { "epoch": 0.6644765179672373, "grad_norm": 2.1396279335021973, "learning_rate": 4.3355234820327626e-05, "loss": 0.2251, "step": 831500 }, { "epoch": 0.6648760829209157, "grad_norm": 0.6940603256225586, "learning_rate": 4.3351239170790844e-05, "loss": 0.2414, "step": 832000 }, { "epoch": 0.6652756478745941, "grad_norm": 1.0047725439071655, "learning_rate": 4.334724352125406e-05, "loss": 0.2235, "step": 832500 }, { "epoch": 0.6656752128282726, "grad_norm": 0.8357674479484558, "learning_rate": 4.3343247871717273e-05, "loss": 0.2164, "step": 833000 }, { "epoch": 0.666074777781951, "grad_norm": 0.9012974500656128, "learning_rate": 4.333925222218049e-05, "loss": 0.2024, "step": 833500 }, { "epoch": 0.6664743427356294, "grad_norm": 0.3705512583255768, "learning_rate": 4.333525657264371e-05, "loss": 0.2216, "step": 834000 }, { "epoch": 0.6668739076893079, "grad_norm": 2.539421319961548, "learning_rate": 4.333126092310693e-05, "loss": 0.2172, "step": 834500 }, { "epoch": 0.6672734726429863, "grad_norm": 0.4560263156890869, "learning_rate": 4.332726527357014e-05, "loss": 0.1978, "step": 835000 }, { "epoch": 0.6676730375966647, "grad_norm": 0.7270705103874207, "learning_rate": 4.332326962403335e-05, "loss": 0.2179, "step": 835500 }, { "epoch": 0.6680726025503432, "grad_norm": 0.9113181829452515, "learning_rate": 4.331927397449657e-05, "loss": 0.2213, "step": 836000 }, { "epoch": 0.6684721675040216, "grad_norm": 2.407869338989258, "learning_rate": 4.331527832495979e-05, "loss": 0.2359, "step": 836500 }, { "epoch": 0.6688717324577, "grad_norm": 0.6065270900726318, "learning_rate": 4.3311282675423005e-05, "loss": 0.2309, "step": 837000 }, { "epoch": 0.6692712974113785, "grad_norm": 0.3663254380226135, "learning_rate": 4.330728702588622e-05, "loss": 0.2255, "step": 837500 }, { "epoch": 0.6696708623650569, "grad_norm": 0.7190170884132385, "learning_rate": 4.330329137634943e-05, "loss": 0.2304, "step": 838000 }, { "epoch": 0.6700704273187353, "grad_norm": 0.5847136378288269, "learning_rate": 4.3299295726812647e-05, "loss": 0.2215, "step": 838500 }, { "epoch": 0.6704699922724138, "grad_norm": 0.6912901401519775, "learning_rate": 4.3295300077275865e-05, "loss": 0.2138, "step": 839000 }, { "epoch": 0.6708695572260922, "grad_norm": 0.7372891306877136, "learning_rate": 4.329130442773908e-05, "loss": 0.2108, "step": 839500 }, { "epoch": 0.6712691221797706, "grad_norm": 0.9296292662620544, "learning_rate": 4.3287308778202294e-05, "loss": 0.2144, "step": 840000 }, { "epoch": 0.6716686871334491, "grad_norm": 0.46937838196754456, "learning_rate": 4.3283313128665506e-05, "loss": 0.2499, "step": 840500 }, { "epoch": 0.6720682520871275, "grad_norm": 0.5894976258277893, "learning_rate": 4.327931747912873e-05, "loss": 0.2319, "step": 841000 }, { "epoch": 0.6724678170408059, "grad_norm": 0.8853931427001953, "learning_rate": 4.327532182959194e-05, "loss": 0.2126, "step": 841500 }, { "epoch": 0.6728673819944844, "grad_norm": 0.9823892712593079, "learning_rate": 4.327132618005516e-05, "loss": 0.2274, "step": 842000 }, { "epoch": 0.6732669469481628, "grad_norm": 0.7585694193840027, "learning_rate": 4.326733053051837e-05, "loss": 0.2169, "step": 842500 }, { "epoch": 0.6736665119018412, "grad_norm": 0.5536367297172546, "learning_rate": 4.326333488098159e-05, "loss": 0.2352, "step": 843000 }, { "epoch": 0.6740660768555197, "grad_norm": 0.5156483054161072, "learning_rate": 4.325933923144481e-05, "loss": 0.2193, "step": 843500 }, { "epoch": 0.6744656418091981, "grad_norm": 0.3669329583644867, "learning_rate": 4.325534358190802e-05, "loss": 0.203, "step": 844000 }, { "epoch": 0.6748652067628765, "grad_norm": 0.5692172050476074, "learning_rate": 4.325134793237124e-05, "loss": 0.2286, "step": 844500 }, { "epoch": 0.675264771716555, "grad_norm": 0.466249018907547, "learning_rate": 4.324735228283445e-05, "loss": 0.2286, "step": 845000 }, { "epoch": 0.6756643366702334, "grad_norm": 0.5651166439056396, "learning_rate": 4.324335663329767e-05, "loss": 0.2213, "step": 845500 }, { "epoch": 0.6760639016239118, "grad_norm": 0.7812852263450623, "learning_rate": 4.3239360983760886e-05, "loss": 0.2282, "step": 846000 }, { "epoch": 0.6764634665775903, "grad_norm": 0.7548393607139587, "learning_rate": 4.32353653342241e-05, "loss": 0.2025, "step": 846500 }, { "epoch": 0.6768630315312687, "grad_norm": 0.56592857837677, "learning_rate": 4.3231369684687315e-05, "loss": 0.2294, "step": 847000 }, { "epoch": 0.6772625964849472, "grad_norm": 1.177977442741394, "learning_rate": 4.3227374035150533e-05, "loss": 0.223, "step": 847500 }, { "epoch": 0.6776621614386257, "grad_norm": 0.5445961952209473, "learning_rate": 4.3223378385613745e-05, "loss": 0.211, "step": 848000 }, { "epoch": 0.6780617263923041, "grad_norm": 0.643682599067688, "learning_rate": 4.321938273607696e-05, "loss": 0.2243, "step": 848500 }, { "epoch": 0.6784612913459825, "grad_norm": 0.6928083896636963, "learning_rate": 4.3215387086540175e-05, "loss": 0.2252, "step": 849000 }, { "epoch": 0.678860856299661, "grad_norm": 0.8515148758888245, "learning_rate": 4.321139143700339e-05, "loss": 0.2302, "step": 849500 }, { "epoch": 0.6792604212533394, "grad_norm": 0.6664645075798035, "learning_rate": 4.320739578746661e-05, "loss": 0.2289, "step": 850000 }, { "epoch": 0.6796599862070178, "grad_norm": 0.941190779209137, "learning_rate": 4.320340013792982e-05, "loss": 0.2298, "step": 850500 }, { "epoch": 0.6800595511606963, "grad_norm": 0.45740261673927307, "learning_rate": 4.319940448839304e-05, "loss": 0.2121, "step": 851000 }, { "epoch": 0.6804591161143747, "grad_norm": 2.5258309841156006, "learning_rate": 4.319540883885625e-05, "loss": 0.2193, "step": 851500 }, { "epoch": 0.6808586810680531, "grad_norm": 2.394235849380493, "learning_rate": 4.319141318931947e-05, "loss": 0.2343, "step": 852000 }, { "epoch": 0.6812582460217316, "grad_norm": 0.30624687671661377, "learning_rate": 4.318741753978269e-05, "loss": 0.2091, "step": 852500 }, { "epoch": 0.68165781097541, "grad_norm": 2.422657012939453, "learning_rate": 4.31834218902459e-05, "loss": 0.2419, "step": 853000 }, { "epoch": 0.6820573759290884, "grad_norm": 1.8357890844345093, "learning_rate": 4.317942624070912e-05, "loss": 0.2294, "step": 853500 }, { "epoch": 0.6824569408827669, "grad_norm": 2.0546369552612305, "learning_rate": 4.3175430591172336e-05, "loss": 0.2376, "step": 854000 }, { "epoch": 0.6828565058364453, "grad_norm": 0.5816104412078857, "learning_rate": 4.317143494163555e-05, "loss": 0.2336, "step": 854500 }, { "epoch": 0.6832560707901237, "grad_norm": 1.2600511312484741, "learning_rate": 4.3167439292098766e-05, "loss": 0.2183, "step": 855000 }, { "epoch": 0.6836556357438022, "grad_norm": 0.8801799416542053, "learning_rate": 4.316344364256198e-05, "loss": 0.2462, "step": 855500 }, { "epoch": 0.6840552006974806, "grad_norm": 0.7720034718513489, "learning_rate": 4.3159447993025195e-05, "loss": 0.2076, "step": 856000 }, { "epoch": 0.684454765651159, "grad_norm": 2.741896152496338, "learning_rate": 4.3155452343488414e-05, "loss": 0.2157, "step": 856500 }, { "epoch": 0.6848543306048375, "grad_norm": 0.3899250030517578, "learning_rate": 4.3151456693951625e-05, "loss": 0.2316, "step": 857000 }, { "epoch": 0.6852538955585159, "grad_norm": 0.46445584297180176, "learning_rate": 4.314746104441484e-05, "loss": 0.2164, "step": 857500 }, { "epoch": 0.6856534605121943, "grad_norm": 0.957823634147644, "learning_rate": 4.3143465394878055e-05, "loss": 0.2276, "step": 858000 }, { "epoch": 0.6860530254658728, "grad_norm": 0.6560938954353333, "learning_rate": 4.313946974534128e-05, "loss": 0.2076, "step": 858500 }, { "epoch": 0.6864525904195512, "grad_norm": 0.5927508473396301, "learning_rate": 4.313547409580449e-05, "loss": 0.2384, "step": 859000 }, { "epoch": 0.6868521553732296, "grad_norm": 1.0259188413619995, "learning_rate": 4.31314784462677e-05, "loss": 0.2282, "step": 859500 }, { "epoch": 0.6872517203269081, "grad_norm": 1.7594847679138184, "learning_rate": 4.312748279673092e-05, "loss": 0.21, "step": 860000 }, { "epoch": 0.6876512852805865, "grad_norm": 0.6503469944000244, "learning_rate": 4.312348714719414e-05, "loss": 0.2148, "step": 860500 }, { "epoch": 0.6880508502342649, "grad_norm": 0.7658929824829102, "learning_rate": 4.311949149765736e-05, "loss": 0.2086, "step": 861000 }, { "epoch": 0.6884504151879434, "grad_norm": 0.40299785137176514, "learning_rate": 4.311549584812057e-05, "loss": 0.2261, "step": 861500 }, { "epoch": 0.6888499801416218, "grad_norm": 0.5794596672058105, "learning_rate": 4.311150019858378e-05, "loss": 0.226, "step": 862000 }, { "epoch": 0.6892495450953002, "grad_norm": 0.8412120342254639, "learning_rate": 4.3107504549047e-05, "loss": 0.2347, "step": 862500 }, { "epoch": 0.6896491100489787, "grad_norm": 0.5106658935546875, "learning_rate": 4.3103508899510216e-05, "loss": 0.2146, "step": 863000 }, { "epoch": 0.6900486750026571, "grad_norm": 0.8093492388725281, "learning_rate": 4.3099513249973435e-05, "loss": 0.2303, "step": 863500 }, { "epoch": 0.6904482399563355, "grad_norm": 0.520343542098999, "learning_rate": 4.3095517600436646e-05, "loss": 0.2328, "step": 864000 }, { "epoch": 0.690847804910014, "grad_norm": 0.6291338801383972, "learning_rate": 4.309152195089986e-05, "loss": 0.2172, "step": 864500 }, { "epoch": 0.6912473698636924, "grad_norm": 0.8832846879959106, "learning_rate": 4.308752630136308e-05, "loss": 0.2068, "step": 865000 }, { "epoch": 0.6916469348173708, "grad_norm": 0.49760714173316956, "learning_rate": 4.3083530651826294e-05, "loss": 0.2125, "step": 865500 }, { "epoch": 0.6920464997710493, "grad_norm": 2.26821231842041, "learning_rate": 4.307953500228951e-05, "loss": 0.2263, "step": 866000 }, { "epoch": 0.6924460647247277, "grad_norm": 3.242340564727783, "learning_rate": 4.3075539352752723e-05, "loss": 0.2156, "step": 866500 }, { "epoch": 0.6928456296784061, "grad_norm": 0.7108834385871887, "learning_rate": 4.307154370321594e-05, "loss": 0.2476, "step": 867000 }, { "epoch": 0.6932451946320846, "grad_norm": 0.5574171543121338, "learning_rate": 4.306754805367916e-05, "loss": 0.2122, "step": 867500 }, { "epoch": 0.693644759585763, "grad_norm": 1.1301844120025635, "learning_rate": 4.306355240414237e-05, "loss": 0.2139, "step": 868000 }, { "epoch": 0.6940443245394414, "grad_norm": 0.17186538875102997, "learning_rate": 4.305955675460559e-05, "loss": 0.2121, "step": 868500 }, { "epoch": 0.6944438894931199, "grad_norm": 0.1628149449825287, "learning_rate": 4.30555611050688e-05, "loss": 0.2145, "step": 869000 }, { "epoch": 0.6948434544467983, "grad_norm": 0.5606082081794739, "learning_rate": 4.305156545553202e-05, "loss": 0.2254, "step": 869500 }, { "epoch": 0.6952430194004767, "grad_norm": 1.3024812936782837, "learning_rate": 4.304756980599524e-05, "loss": 0.228, "step": 870000 }, { "epoch": 0.6956425843541552, "grad_norm": 0.5932826995849609, "learning_rate": 4.304357415645845e-05, "loss": 0.2264, "step": 870500 }, { "epoch": 0.6960421493078336, "grad_norm": 1.2737762928009033, "learning_rate": 4.303957850692167e-05, "loss": 0.2463, "step": 871000 }, { "epoch": 0.696441714261512, "grad_norm": 0.7818472981452942, "learning_rate": 4.3035582857384885e-05, "loss": 0.2245, "step": 871500 }, { "epoch": 0.6968412792151905, "grad_norm": 0.2922888696193695, "learning_rate": 4.3031587207848096e-05, "loss": 0.2489, "step": 872000 }, { "epoch": 0.6972408441688689, "grad_norm": 0.4309908151626587, "learning_rate": 4.3027591558311315e-05, "loss": 0.2214, "step": 872500 }, { "epoch": 0.6976404091225473, "grad_norm": 0.6716644167900085, "learning_rate": 4.3023595908774526e-05, "loss": 0.2351, "step": 873000 }, { "epoch": 0.6980399740762258, "grad_norm": 0.673507034778595, "learning_rate": 4.3019600259237744e-05, "loss": 0.2371, "step": 873500 }, { "epoch": 0.6984395390299042, "grad_norm": 0.6111753582954407, "learning_rate": 4.301560460970096e-05, "loss": 0.2298, "step": 874000 }, { "epoch": 0.6988391039835826, "grad_norm": 0.7252809405326843, "learning_rate": 4.3011608960164174e-05, "loss": 0.228, "step": 874500 }, { "epoch": 0.6992386689372612, "grad_norm": 1.0174798965454102, "learning_rate": 4.300761331062739e-05, "loss": 0.201, "step": 875000 }, { "epoch": 0.6996382338909396, "grad_norm": 0.8214683532714844, "learning_rate": 4.3003617661090604e-05, "loss": 0.2201, "step": 875500 }, { "epoch": 0.700037798844618, "grad_norm": 0.5928546190261841, "learning_rate": 4.299962201155382e-05, "loss": 0.2205, "step": 876000 }, { "epoch": 0.7004373637982965, "grad_norm": 0.934950590133667, "learning_rate": 4.299562636201704e-05, "loss": 0.2245, "step": 876500 }, { "epoch": 0.7008369287519749, "grad_norm": 0.49619075655937195, "learning_rate": 4.299163071248025e-05, "loss": 0.2154, "step": 877000 }, { "epoch": 0.7012364937056533, "grad_norm": 0.33241450786590576, "learning_rate": 4.298763506294347e-05, "loss": 0.2199, "step": 877500 }, { "epoch": 0.7016360586593318, "grad_norm": 1.0678614377975464, "learning_rate": 4.298363941340669e-05, "loss": 0.2422, "step": 878000 }, { "epoch": 0.7020356236130102, "grad_norm": 1.421491026878357, "learning_rate": 4.29796437638699e-05, "loss": 0.2115, "step": 878500 }, { "epoch": 0.7024351885666886, "grad_norm": 0.37942376732826233, "learning_rate": 4.297564811433312e-05, "loss": 0.229, "step": 879000 }, { "epoch": 0.7028347535203671, "grad_norm": 0.9798554182052612, "learning_rate": 4.297165246479633e-05, "loss": 0.253, "step": 879500 }, { "epoch": 0.7032343184740455, "grad_norm": 0.3961541950702667, "learning_rate": 4.296765681525955e-05, "loss": 0.2011, "step": 880000 }, { "epoch": 0.7036338834277239, "grad_norm": 0.9280670881271362, "learning_rate": 4.2963661165722765e-05, "loss": 0.2281, "step": 880500 }, { "epoch": 0.7040334483814024, "grad_norm": 0.5494555234909058, "learning_rate": 4.295966551618598e-05, "loss": 0.2392, "step": 881000 }, { "epoch": 0.7044330133350808, "grad_norm": 0.6585249304771423, "learning_rate": 4.2955669866649195e-05, "loss": 0.2313, "step": 881500 }, { "epoch": 0.7048325782887592, "grad_norm": 0.17553694546222687, "learning_rate": 4.2951674217112406e-05, "loss": 0.2373, "step": 882000 }, { "epoch": 0.7052321432424377, "grad_norm": 1.076353669166565, "learning_rate": 4.2947678567575624e-05, "loss": 0.2276, "step": 882500 }, { "epoch": 0.7056317081961161, "grad_norm": 0.6392458081245422, "learning_rate": 4.294368291803884e-05, "loss": 0.2458, "step": 883000 }, { "epoch": 0.7060312731497945, "grad_norm": 0.5985832214355469, "learning_rate": 4.2939687268502054e-05, "loss": 0.2159, "step": 883500 }, { "epoch": 0.706430838103473, "grad_norm": 0.4608159363269806, "learning_rate": 4.293569161896527e-05, "loss": 0.2353, "step": 884000 }, { "epoch": 0.7068304030571514, "grad_norm": 0.7654038071632385, "learning_rate": 4.2931695969428484e-05, "loss": 0.2224, "step": 884500 }, { "epoch": 0.7072299680108298, "grad_norm": 1.6665242910385132, "learning_rate": 4.292770031989171e-05, "loss": 0.2284, "step": 885000 }, { "epoch": 0.7076295329645083, "grad_norm": 0.4719606637954712, "learning_rate": 4.292370467035492e-05, "loss": 0.2222, "step": 885500 }, { "epoch": 0.7080290979181867, "grad_norm": 1.1671209335327148, "learning_rate": 4.291970902081813e-05, "loss": 0.2201, "step": 886000 }, { "epoch": 0.7084286628718651, "grad_norm": 0.672122597694397, "learning_rate": 4.291571337128135e-05, "loss": 0.2214, "step": 886500 }, { "epoch": 0.7088282278255436, "grad_norm": 1.2708660364151, "learning_rate": 4.291171772174457e-05, "loss": 0.2275, "step": 887000 }, { "epoch": 0.709227792779222, "grad_norm": 2.193948984146118, "learning_rate": 4.2907722072207786e-05, "loss": 0.2225, "step": 887500 }, { "epoch": 0.7096273577329004, "grad_norm": 0.48852601647377014, "learning_rate": 4.2903726422671e-05, "loss": 0.2115, "step": 888000 }, { "epoch": 0.7100269226865789, "grad_norm": 0.4512315094470978, "learning_rate": 4.289973077313421e-05, "loss": 0.2291, "step": 888500 }, { "epoch": 0.7104264876402573, "grad_norm": 0.46072447299957275, "learning_rate": 4.289573512359743e-05, "loss": 0.2268, "step": 889000 }, { "epoch": 0.7108260525939357, "grad_norm": 1.5286777019500732, "learning_rate": 4.2891739474060645e-05, "loss": 0.2235, "step": 889500 }, { "epoch": 0.7112256175476142, "grad_norm": 0.2251434475183487, "learning_rate": 4.2887743824523864e-05, "loss": 0.2273, "step": 890000 }, { "epoch": 0.7116251825012926, "grad_norm": 0.9269788265228271, "learning_rate": 4.2883748174987075e-05, "loss": 0.2195, "step": 890500 }, { "epoch": 0.712024747454971, "grad_norm": 0.2979867160320282, "learning_rate": 4.2879752525450286e-05, "loss": 0.2289, "step": 891000 }, { "epoch": 0.7124243124086495, "grad_norm": 0.7140158414840698, "learning_rate": 4.287575687591351e-05, "loss": 0.2203, "step": 891500 }, { "epoch": 0.7128238773623279, "grad_norm": 0.4763960540294647, "learning_rate": 4.287176122637672e-05, "loss": 0.2333, "step": 892000 }, { "epoch": 0.7132234423160063, "grad_norm": 0.6860237717628479, "learning_rate": 4.286776557683994e-05, "loss": 0.2196, "step": 892500 }, { "epoch": 0.7136230072696848, "grad_norm": 0.8299091458320618, "learning_rate": 4.286376992730315e-05, "loss": 0.2343, "step": 893000 }, { "epoch": 0.7140225722233632, "grad_norm": 0.23770104348659515, "learning_rate": 4.285977427776637e-05, "loss": 0.2459, "step": 893500 }, { "epoch": 0.7144221371770416, "grad_norm": 0.1491880565881729, "learning_rate": 4.285577862822959e-05, "loss": 0.2161, "step": 894000 }, { "epoch": 0.71482170213072, "grad_norm": 0.49598556756973267, "learning_rate": 4.28517829786928e-05, "loss": 0.2529, "step": 894500 }, { "epoch": 0.7152212670843985, "grad_norm": 2.6255507469177246, "learning_rate": 4.284778732915602e-05, "loss": 0.2292, "step": 895000 }, { "epoch": 0.7156208320380769, "grad_norm": 2.3558549880981445, "learning_rate": 4.284379167961923e-05, "loss": 0.2243, "step": 895500 }, { "epoch": 0.7160203969917553, "grad_norm": 0.18220743536949158, "learning_rate": 4.283979603008245e-05, "loss": 0.2289, "step": 896000 }, { "epoch": 0.7164199619454338, "grad_norm": 0.725283145904541, "learning_rate": 4.2835800380545666e-05, "loss": 0.2206, "step": 896500 }, { "epoch": 0.7168195268991122, "grad_norm": 0.47266364097595215, "learning_rate": 4.283180473100888e-05, "loss": 0.2091, "step": 897000 }, { "epoch": 0.7172190918527906, "grad_norm": 0.657813310623169, "learning_rate": 4.2827809081472096e-05, "loss": 0.2127, "step": 897500 }, { "epoch": 0.7176186568064691, "grad_norm": 1.756593108177185, "learning_rate": 4.2823813431935314e-05, "loss": 0.2359, "step": 898000 }, { "epoch": 0.7180182217601475, "grad_norm": 0.6522263288497925, "learning_rate": 4.2819817782398526e-05, "loss": 0.2324, "step": 898500 }, { "epoch": 0.718417786713826, "grad_norm": 0.8649700284004211, "learning_rate": 4.2815822132861744e-05, "loss": 0.2293, "step": 899000 }, { "epoch": 0.7188173516675044, "grad_norm": 0.27819573879241943, "learning_rate": 4.2811826483324955e-05, "loss": 0.2058, "step": 899500 }, { "epoch": 0.7192169166211828, "grad_norm": 0.4244461953639984, "learning_rate": 4.280783083378817e-05, "loss": 0.2318, "step": 900000 }, { "epoch": 0.7196164815748612, "grad_norm": 0.283732533454895, "learning_rate": 4.280383518425139e-05, "loss": 0.2379, "step": 900500 }, { "epoch": 0.7200160465285397, "grad_norm": 0.537649393081665, "learning_rate": 4.27998395347146e-05, "loss": 0.2174, "step": 901000 }, { "epoch": 0.7204156114822181, "grad_norm": 0.5739501714706421, "learning_rate": 4.279584388517782e-05, "loss": 0.2221, "step": 901500 }, { "epoch": 0.7208151764358965, "grad_norm": 0.5691890716552734, "learning_rate": 4.279184823564103e-05, "loss": 0.2254, "step": 902000 }, { "epoch": 0.721214741389575, "grad_norm": 0.8976100087165833, "learning_rate": 4.278785258610426e-05, "loss": 0.2272, "step": 902500 }, { "epoch": 0.7216143063432535, "grad_norm": 0.9645788669586182, "learning_rate": 4.278385693656747e-05, "loss": 0.2164, "step": 903000 }, { "epoch": 0.722013871296932, "grad_norm": 0.8404621481895447, "learning_rate": 4.277986128703068e-05, "loss": 0.2282, "step": 903500 }, { "epoch": 0.7224134362506104, "grad_norm": 0.4399958550930023, "learning_rate": 4.27758656374939e-05, "loss": 0.218, "step": 904000 }, { "epoch": 0.7228130012042888, "grad_norm": 0.5347242951393127, "learning_rate": 4.277186998795712e-05, "loss": 0.2098, "step": 904500 }, { "epoch": 0.7232125661579673, "grad_norm": 0.5428066849708557, "learning_rate": 4.276787433842033e-05, "loss": 0.2337, "step": 905000 }, { "epoch": 0.7236121311116457, "grad_norm": 1.0286186933517456, "learning_rate": 4.2763878688883546e-05, "loss": 0.2325, "step": 905500 }, { "epoch": 0.7240116960653241, "grad_norm": 0.6550084948539734, "learning_rate": 4.275988303934676e-05, "loss": 0.2311, "step": 906000 }, { "epoch": 0.7244112610190026, "grad_norm": 0.45286983251571655, "learning_rate": 4.2755887389809976e-05, "loss": 0.2231, "step": 906500 }, { "epoch": 0.724810825972681, "grad_norm": 0.7123512029647827, "learning_rate": 4.2751891740273194e-05, "loss": 0.2128, "step": 907000 }, { "epoch": 0.7252103909263594, "grad_norm": 0.6739441156387329, "learning_rate": 4.2747896090736406e-05, "loss": 0.2234, "step": 907500 }, { "epoch": 0.7256099558800378, "grad_norm": 0.420788049697876, "learning_rate": 4.2743900441199624e-05, "loss": 0.2128, "step": 908000 }, { "epoch": 0.7260095208337163, "grad_norm": 0.41126811504364014, "learning_rate": 4.2739904791662835e-05, "loss": 0.2244, "step": 908500 }, { "epoch": 0.7264090857873947, "grad_norm": 2.476842164993286, "learning_rate": 4.273590914212606e-05, "loss": 0.2154, "step": 909000 }, { "epoch": 0.7268086507410731, "grad_norm": 0.39396631717681885, "learning_rate": 4.273191349258927e-05, "loss": 0.2155, "step": 909500 }, { "epoch": 0.7272082156947516, "grad_norm": 0.6978639960289001, "learning_rate": 4.272791784305248e-05, "loss": 0.2284, "step": 910000 }, { "epoch": 0.72760778064843, "grad_norm": 0.6673126816749573, "learning_rate": 4.27239221935157e-05, "loss": 0.2019, "step": 910500 }, { "epoch": 0.7280073456021084, "grad_norm": 0.9737561345100403, "learning_rate": 4.271992654397892e-05, "loss": 0.229, "step": 911000 }, { "epoch": 0.7284069105557869, "grad_norm": 0.7489374876022339, "learning_rate": 4.271593089444214e-05, "loss": 0.235, "step": 911500 }, { "epoch": 0.7288064755094653, "grad_norm": 0.7555142045021057, "learning_rate": 4.271193524490535e-05, "loss": 0.213, "step": 912000 }, { "epoch": 0.7292060404631437, "grad_norm": 0.6010429263114929, "learning_rate": 4.270793959536856e-05, "loss": 0.2207, "step": 912500 }, { "epoch": 0.7296056054168222, "grad_norm": 1.1857478618621826, "learning_rate": 4.270394394583178e-05, "loss": 0.2185, "step": 913000 }, { "epoch": 0.7300051703705006, "grad_norm": 1.2229430675506592, "learning_rate": 4.2699948296295e-05, "loss": 0.2363, "step": 913500 }, { "epoch": 0.730404735324179, "grad_norm": 2.251037836074829, "learning_rate": 4.2695952646758215e-05, "loss": 0.2071, "step": 914000 }, { "epoch": 0.7308043002778575, "grad_norm": 0.22095665335655212, "learning_rate": 4.2691956997221427e-05, "loss": 0.2192, "step": 914500 }, { "epoch": 0.7312038652315359, "grad_norm": 0.6140109896659851, "learning_rate": 4.268796134768464e-05, "loss": 0.2253, "step": 915000 }, { "epoch": 0.7316034301852143, "grad_norm": 0.3803349435329437, "learning_rate": 4.268396569814786e-05, "loss": 0.2112, "step": 915500 }, { "epoch": 0.7320029951388928, "grad_norm": 1.0809364318847656, "learning_rate": 4.2679970048611074e-05, "loss": 0.227, "step": 916000 }, { "epoch": 0.7324025600925712, "grad_norm": 1.1603540182113647, "learning_rate": 4.267597439907429e-05, "loss": 0.1993, "step": 916500 }, { "epoch": 0.7328021250462496, "grad_norm": 0.6223179697990417, "learning_rate": 4.2671978749537504e-05, "loss": 0.216, "step": 917000 }, { "epoch": 0.7332016899999281, "grad_norm": 0.5817562937736511, "learning_rate": 4.266798310000072e-05, "loss": 0.219, "step": 917500 }, { "epoch": 0.7336012549536065, "grad_norm": 0.4100595712661743, "learning_rate": 4.266398745046394e-05, "loss": 0.2494, "step": 918000 }, { "epoch": 0.7340008199072849, "grad_norm": 0.49716565012931824, "learning_rate": 4.265999180092715e-05, "loss": 0.2182, "step": 918500 }, { "epoch": 0.7344003848609634, "grad_norm": 0.6334615349769592, "learning_rate": 4.265599615139037e-05, "loss": 0.1982, "step": 919000 }, { "epoch": 0.7347999498146418, "grad_norm": 1.0713385343551636, "learning_rate": 4.265200050185358e-05, "loss": 0.2287, "step": 919500 }, { "epoch": 0.7351995147683202, "grad_norm": 0.6924279928207397, "learning_rate": 4.26480048523168e-05, "loss": 0.2089, "step": 920000 }, { "epoch": 0.7355990797219987, "grad_norm": 0.35441678762435913, "learning_rate": 4.264400920278002e-05, "loss": 0.2236, "step": 920500 }, { "epoch": 0.7359986446756771, "grad_norm": 3.596282958984375, "learning_rate": 4.264001355324323e-05, "loss": 0.2215, "step": 921000 }, { "epoch": 0.7363982096293555, "grad_norm": 1.2956453561782837, "learning_rate": 4.263601790370645e-05, "loss": 0.2017, "step": 921500 }, { "epoch": 0.736797774583034, "grad_norm": 0.4000963568687439, "learning_rate": 4.2632022254169666e-05, "loss": 0.2145, "step": 922000 }, { "epoch": 0.7371973395367124, "grad_norm": 0.6873119473457336, "learning_rate": 4.262802660463288e-05, "loss": 0.2364, "step": 922500 }, { "epoch": 0.7375969044903908, "grad_norm": 0.4624563753604889, "learning_rate": 4.2624030955096095e-05, "loss": 0.2046, "step": 923000 }, { "epoch": 0.7379964694440693, "grad_norm": 0.8786656260490417, "learning_rate": 4.262003530555931e-05, "loss": 0.2307, "step": 923500 }, { "epoch": 0.7383960343977477, "grad_norm": 0.42003414034843445, "learning_rate": 4.2616039656022525e-05, "loss": 0.2077, "step": 924000 }, { "epoch": 0.7387955993514261, "grad_norm": 0.5578221678733826, "learning_rate": 4.261204400648574e-05, "loss": 0.2443, "step": 924500 }, { "epoch": 0.7391951643051046, "grad_norm": 0.6436789035797119, "learning_rate": 4.2608048356948955e-05, "loss": 0.2345, "step": 925000 }, { "epoch": 0.739594729258783, "grad_norm": 0.4072577655315399, "learning_rate": 4.260405270741217e-05, "loss": 0.2155, "step": 925500 }, { "epoch": 0.7399942942124614, "grad_norm": 0.5962798595428467, "learning_rate": 4.2600057057875384e-05, "loss": 0.2189, "step": 926000 }, { "epoch": 0.7403938591661399, "grad_norm": 0.5428974032402039, "learning_rate": 4.25960614083386e-05, "loss": 0.2278, "step": 926500 }, { "epoch": 0.7407934241198183, "grad_norm": 0.37288761138916016, "learning_rate": 4.259206575880182e-05, "loss": 0.21, "step": 927000 }, { "epoch": 0.7411929890734967, "grad_norm": 0.4621787667274475, "learning_rate": 4.258807010926503e-05, "loss": 0.2338, "step": 927500 }, { "epoch": 0.7415925540271752, "grad_norm": 1.198089599609375, "learning_rate": 4.258407445972825e-05, "loss": 0.209, "step": 928000 }, { "epoch": 0.7419921189808536, "grad_norm": 0.9517782330513, "learning_rate": 4.258007881019146e-05, "loss": 0.2462, "step": 928500 }, { "epoch": 0.742391683934532, "grad_norm": 0.8121616244316101, "learning_rate": 4.2576083160654687e-05, "loss": 0.2336, "step": 929000 }, { "epoch": 0.7427912488882105, "grad_norm": 0.9212857484817505, "learning_rate": 4.25720875111179e-05, "loss": 0.2224, "step": 929500 }, { "epoch": 0.7431908138418889, "grad_norm": 2.6974456310272217, "learning_rate": 4.256809186158111e-05, "loss": 0.237, "step": 930000 }, { "epoch": 0.7435903787955674, "grad_norm": 2.699167251586914, "learning_rate": 4.256409621204433e-05, "loss": 0.2184, "step": 930500 }, { "epoch": 0.7439899437492459, "grad_norm": 0.8837106227874756, "learning_rate": 4.2560100562507546e-05, "loss": 0.2154, "step": 931000 }, { "epoch": 0.7443895087029243, "grad_norm": 0.5648900866508484, "learning_rate": 4.255610491297076e-05, "loss": 0.2031, "step": 931500 }, { "epoch": 0.7447890736566027, "grad_norm": 1.097140908241272, "learning_rate": 4.2552109263433975e-05, "loss": 0.2127, "step": 932000 }, { "epoch": 0.7451886386102812, "grad_norm": 1.0731847286224365, "learning_rate": 4.254811361389719e-05, "loss": 0.2259, "step": 932500 }, { "epoch": 0.7455882035639596, "grad_norm": 0.8864458203315735, "learning_rate": 4.2544117964360405e-05, "loss": 0.2234, "step": 933000 }, { "epoch": 0.745987768517638, "grad_norm": 0.6629884839057922, "learning_rate": 4.254012231482362e-05, "loss": 0.211, "step": 933500 }, { "epoch": 0.7463873334713165, "grad_norm": 0.4780425727367401, "learning_rate": 4.2536126665286835e-05, "loss": 0.207, "step": 934000 }, { "epoch": 0.7467868984249949, "grad_norm": 2.385679244995117, "learning_rate": 4.253213101575005e-05, "loss": 0.2217, "step": 934500 }, { "epoch": 0.7471864633786733, "grad_norm": 1.1771334409713745, "learning_rate": 4.2528135366213264e-05, "loss": 0.2164, "step": 935000 }, { "epoch": 0.7475860283323518, "grad_norm": 0.6293238997459412, "learning_rate": 4.252413971667649e-05, "loss": 0.209, "step": 935500 }, { "epoch": 0.7479855932860302, "grad_norm": 0.2851349711418152, "learning_rate": 4.25201440671397e-05, "loss": 0.2199, "step": 936000 }, { "epoch": 0.7483851582397086, "grad_norm": 0.8605936169624329, "learning_rate": 4.251614841760291e-05, "loss": 0.2222, "step": 936500 }, { "epoch": 0.7487847231933871, "grad_norm": 1.110164761543274, "learning_rate": 4.251215276806613e-05, "loss": 0.2126, "step": 937000 }, { "epoch": 0.7491842881470655, "grad_norm": 0.42164847254753113, "learning_rate": 4.250815711852935e-05, "loss": 0.2122, "step": 937500 }, { "epoch": 0.7495838531007439, "grad_norm": 2.511159658432007, "learning_rate": 4.250416146899257e-05, "loss": 0.206, "step": 938000 }, { "epoch": 0.7499834180544224, "grad_norm": 0.975297749042511, "learning_rate": 4.250016581945578e-05, "loss": 0.2141, "step": 938500 }, { "epoch": 0.7503829830081008, "grad_norm": 0.7965412735939026, "learning_rate": 4.249617016991899e-05, "loss": 0.2247, "step": 939000 }, { "epoch": 0.7507825479617792, "grad_norm": 1.3132208585739136, "learning_rate": 4.249217452038221e-05, "loss": 0.2326, "step": 939500 }, { "epoch": 0.7511821129154577, "grad_norm": 0.48485061526298523, "learning_rate": 4.2488178870845426e-05, "loss": 0.2157, "step": 940000 }, { "epoch": 0.7515816778691361, "grad_norm": 0.555424690246582, "learning_rate": 4.2484183221308644e-05, "loss": 0.2435, "step": 940500 }, { "epoch": 0.7519812428228145, "grad_norm": 0.39876627922058105, "learning_rate": 4.2480187571771856e-05, "loss": 0.2374, "step": 941000 }, { "epoch": 0.752380807776493, "grad_norm": 1.0893007516860962, "learning_rate": 4.247619192223507e-05, "loss": 0.211, "step": 941500 }, { "epoch": 0.7527803727301714, "grad_norm": 0.4302767515182495, "learning_rate": 4.247219627269829e-05, "loss": 0.2186, "step": 942000 }, { "epoch": 0.7531799376838498, "grad_norm": 0.8266696929931641, "learning_rate": 4.2468200623161503e-05, "loss": 0.2191, "step": 942500 }, { "epoch": 0.7535795026375283, "grad_norm": 0.7150265574455261, "learning_rate": 4.246420497362472e-05, "loss": 0.2109, "step": 943000 }, { "epoch": 0.7539790675912067, "grad_norm": 0.6417500972747803, "learning_rate": 4.246020932408793e-05, "loss": 0.216, "step": 943500 }, { "epoch": 0.7543786325448851, "grad_norm": 0.45177632570266724, "learning_rate": 4.245621367455115e-05, "loss": 0.218, "step": 944000 }, { "epoch": 0.7547781974985636, "grad_norm": 0.5453196167945862, "learning_rate": 4.245221802501437e-05, "loss": 0.2275, "step": 944500 }, { "epoch": 0.755177762452242, "grad_norm": 1.90815269947052, "learning_rate": 4.244822237547758e-05, "loss": 0.2336, "step": 945000 }, { "epoch": 0.7555773274059204, "grad_norm": 0.24553893506526947, "learning_rate": 4.24442267259408e-05, "loss": 0.219, "step": 945500 }, { "epoch": 0.7559768923595989, "grad_norm": 0.8005672693252563, "learning_rate": 4.244023107640401e-05, "loss": 0.224, "step": 946000 }, { "epoch": 0.7563764573132773, "grad_norm": 0.45580554008483887, "learning_rate": 4.243623542686723e-05, "loss": 0.2075, "step": 946500 }, { "epoch": 0.7567760222669557, "grad_norm": 0.7861705422401428, "learning_rate": 4.243223977733045e-05, "loss": 0.2031, "step": 947000 }, { "epoch": 0.7571755872206342, "grad_norm": 0.9599922299385071, "learning_rate": 4.242824412779366e-05, "loss": 0.2374, "step": 947500 }, { "epoch": 0.7575751521743126, "grad_norm": 0.22098274528980255, "learning_rate": 4.2424248478256876e-05, "loss": 0.2049, "step": 948000 }, { "epoch": 0.757974717127991, "grad_norm": 0.4945424199104309, "learning_rate": 4.2420252828720095e-05, "loss": 0.2297, "step": 948500 }, { "epoch": 0.7583742820816695, "grad_norm": 0.461661159992218, "learning_rate": 4.2416257179183306e-05, "loss": 0.2147, "step": 949000 }, { "epoch": 0.7587738470353479, "grad_norm": 0.3909895420074463, "learning_rate": 4.2412261529646524e-05, "loss": 0.2288, "step": 949500 }, { "epoch": 0.7591734119890263, "grad_norm": 0.6551194787025452, "learning_rate": 4.2408265880109736e-05, "loss": 0.2125, "step": 950000 }, { "epoch": 0.7595729769427048, "grad_norm": 0.49388301372528076, "learning_rate": 4.2404270230572954e-05, "loss": 0.2208, "step": 950500 }, { "epoch": 0.7599725418963832, "grad_norm": 0.771077036857605, "learning_rate": 4.240027458103617e-05, "loss": 0.2312, "step": 951000 }, { "epoch": 0.7603721068500616, "grad_norm": 2.550574779510498, "learning_rate": 4.2396278931499384e-05, "loss": 0.2327, "step": 951500 }, { "epoch": 0.7607716718037401, "grad_norm": 0.5083726644515991, "learning_rate": 4.23922832819626e-05, "loss": 0.235, "step": 952000 }, { "epoch": 0.7611712367574185, "grad_norm": 0.11698604375123978, "learning_rate": 4.238828763242581e-05, "loss": 0.2178, "step": 952500 }, { "epoch": 0.7615708017110969, "grad_norm": 0.43672239780426025, "learning_rate": 4.238429198288904e-05, "loss": 0.2181, "step": 953000 }, { "epoch": 0.7619703666647754, "grad_norm": 0.46738114953041077, "learning_rate": 4.238029633335225e-05, "loss": 0.2146, "step": 953500 }, { "epoch": 0.7623699316184538, "grad_norm": 0.49094927310943604, "learning_rate": 4.237630068381546e-05, "loss": 0.1939, "step": 954000 }, { "epoch": 0.7627694965721322, "grad_norm": 1.6711900234222412, "learning_rate": 4.237230503427868e-05, "loss": 0.2246, "step": 954500 }, { "epoch": 0.7631690615258107, "grad_norm": 0.4010534882545471, "learning_rate": 4.23683093847419e-05, "loss": 0.207, "step": 955000 }, { "epoch": 0.7635686264794891, "grad_norm": 0.6391517519950867, "learning_rate": 4.2364313735205116e-05, "loss": 0.2237, "step": 955500 }, { "epoch": 0.7639681914331675, "grad_norm": 0.7092425227165222, "learning_rate": 4.236031808566833e-05, "loss": 0.2022, "step": 956000 }, { "epoch": 0.764367756386846, "grad_norm": 0.36892715096473694, "learning_rate": 4.235632243613154e-05, "loss": 0.2309, "step": 956500 }, { "epoch": 0.7647673213405244, "grad_norm": 0.3609027862548828, "learning_rate": 4.235232678659476e-05, "loss": 0.2244, "step": 957000 }, { "epoch": 0.7651668862942028, "grad_norm": 0.7239806652069092, "learning_rate": 4.2348331137057975e-05, "loss": 0.2279, "step": 957500 }, { "epoch": 0.7655664512478813, "grad_norm": 0.7506929636001587, "learning_rate": 4.234433548752119e-05, "loss": 0.2219, "step": 958000 }, { "epoch": 0.7659660162015598, "grad_norm": 0.6354817152023315, "learning_rate": 4.2340339837984404e-05, "loss": 0.2078, "step": 958500 }, { "epoch": 0.7663655811552382, "grad_norm": 0.7015244960784912, "learning_rate": 4.2336344188447616e-05, "loss": 0.2411, "step": 959000 }, { "epoch": 0.7667651461089167, "grad_norm": 0.9650365710258484, "learning_rate": 4.233234853891084e-05, "loss": 0.2314, "step": 959500 }, { "epoch": 0.7671647110625951, "grad_norm": 0.8067612648010254, "learning_rate": 4.232835288937405e-05, "loss": 0.2267, "step": 960000 }, { "epoch": 0.7675642760162735, "grad_norm": 0.9077547192573547, "learning_rate": 4.2324357239837264e-05, "loss": 0.2092, "step": 960500 }, { "epoch": 0.767963840969952, "grad_norm": 0.2912672460079193, "learning_rate": 4.232036159030048e-05, "loss": 0.2133, "step": 961000 }, { "epoch": 0.7683634059236304, "grad_norm": 0.7577641010284424, "learning_rate": 4.23163659407637e-05, "loss": 0.2228, "step": 961500 }, { "epoch": 0.7687629708773088, "grad_norm": 0.5200302600860596, "learning_rate": 4.231237029122692e-05, "loss": 0.2132, "step": 962000 }, { "epoch": 0.7691625358309873, "grad_norm": 1.8108941316604614, "learning_rate": 4.230837464169013e-05, "loss": 0.2337, "step": 962500 }, { "epoch": 0.7695621007846657, "grad_norm": 1.4577267169952393, "learning_rate": 4.230437899215334e-05, "loss": 0.2149, "step": 963000 }, { "epoch": 0.7699616657383441, "grad_norm": 0.6963854432106018, "learning_rate": 4.230038334261656e-05, "loss": 0.2265, "step": 963500 }, { "epoch": 0.7703612306920226, "grad_norm": 0.550114631652832, "learning_rate": 4.229638769307978e-05, "loss": 0.204, "step": 964000 }, { "epoch": 0.770760795645701, "grad_norm": 0.48985281586647034, "learning_rate": 4.2292392043542996e-05, "loss": 0.2251, "step": 964500 }, { "epoch": 0.7711603605993794, "grad_norm": 0.7458457946777344, "learning_rate": 4.228839639400621e-05, "loss": 0.2353, "step": 965000 }, { "epoch": 0.7715599255530579, "grad_norm": 0.36470741033554077, "learning_rate": 4.228440074446942e-05, "loss": 0.2416, "step": 965500 }, { "epoch": 0.7719594905067363, "grad_norm": 0.8348783850669861, "learning_rate": 4.2280405094932644e-05, "loss": 0.2068, "step": 966000 }, { "epoch": 0.7723590554604147, "grad_norm": 0.7998340725898743, "learning_rate": 4.2276409445395855e-05, "loss": 0.2134, "step": 966500 }, { "epoch": 0.7727586204140932, "grad_norm": 0.6616082787513733, "learning_rate": 4.227241379585907e-05, "loss": 0.2213, "step": 967000 }, { "epoch": 0.7731581853677716, "grad_norm": 0.4827256500720978, "learning_rate": 4.2268418146322285e-05, "loss": 0.2329, "step": 967500 }, { "epoch": 0.77355775032145, "grad_norm": 0.4179074466228485, "learning_rate": 4.22644224967855e-05, "loss": 0.212, "step": 968000 }, { "epoch": 0.7739573152751285, "grad_norm": 1.1691224575042725, "learning_rate": 4.226042684724872e-05, "loss": 0.2146, "step": 968500 }, { "epoch": 0.7743568802288069, "grad_norm": 1.0362207889556885, "learning_rate": 4.225643119771193e-05, "loss": 0.2131, "step": 969000 }, { "epoch": 0.7747564451824853, "grad_norm": 0.4639914333820343, "learning_rate": 4.225243554817515e-05, "loss": 0.206, "step": 969500 }, { "epoch": 0.7751560101361638, "grad_norm": 3.3701682090759277, "learning_rate": 4.224843989863836e-05, "loss": 0.2144, "step": 970000 }, { "epoch": 0.7755555750898422, "grad_norm": 1.1893696784973145, "learning_rate": 4.224444424910158e-05, "loss": 0.2289, "step": 970500 }, { "epoch": 0.7759551400435206, "grad_norm": 0.42926034331321716, "learning_rate": 4.22404485995648e-05, "loss": 0.2045, "step": 971000 }, { "epoch": 0.776354704997199, "grad_norm": 0.7731590867042542, "learning_rate": 4.223645295002801e-05, "loss": 0.2255, "step": 971500 }, { "epoch": 0.7767542699508775, "grad_norm": 0.8967592716217041, "learning_rate": 4.223245730049123e-05, "loss": 0.2087, "step": 972000 }, { "epoch": 0.7771538349045559, "grad_norm": 0.814207911491394, "learning_rate": 4.222846165095444e-05, "loss": 0.2252, "step": 972500 }, { "epoch": 0.7775533998582344, "grad_norm": 0.9298072457313538, "learning_rate": 4.222446600141766e-05, "loss": 0.1995, "step": 973000 }, { "epoch": 0.7779529648119128, "grad_norm": 0.677320659160614, "learning_rate": 4.2220470351880876e-05, "loss": 0.2117, "step": 973500 }, { "epoch": 0.7783525297655912, "grad_norm": 0.8451535105705261, "learning_rate": 4.221647470234409e-05, "loss": 0.2107, "step": 974000 }, { "epoch": 0.7787520947192696, "grad_norm": 1.5113961696624756, "learning_rate": 4.2212479052807306e-05, "loss": 0.2148, "step": 974500 }, { "epoch": 0.7791516596729481, "grad_norm": 0.5006364583969116, "learning_rate": 4.2208483403270524e-05, "loss": 0.2293, "step": 975000 }, { "epoch": 0.7795512246266265, "grad_norm": 0.5295639038085938, "learning_rate": 4.2204487753733735e-05, "loss": 0.2143, "step": 975500 }, { "epoch": 0.779950789580305, "grad_norm": 1.019473671913147, "learning_rate": 4.220049210419695e-05, "loss": 0.2086, "step": 976000 }, { "epoch": 0.7803503545339834, "grad_norm": 0.14247408509254456, "learning_rate": 4.2196496454660165e-05, "loss": 0.225, "step": 976500 }, { "epoch": 0.7807499194876618, "grad_norm": 0.6036010980606079, "learning_rate": 4.219250080512338e-05, "loss": 0.2321, "step": 977000 }, { "epoch": 0.7811494844413402, "grad_norm": 0.5410523414611816, "learning_rate": 4.21885051555866e-05, "loss": 0.2119, "step": 977500 }, { "epoch": 0.7815490493950187, "grad_norm": 0.6694096326828003, "learning_rate": 4.218450950604981e-05, "loss": 0.2261, "step": 978000 }, { "epoch": 0.7819486143486971, "grad_norm": 1.1002596616744995, "learning_rate": 4.218051385651303e-05, "loss": 0.2257, "step": 978500 }, { "epoch": 0.7823481793023755, "grad_norm": 0.4377724826335907, "learning_rate": 4.217651820697624e-05, "loss": 0.2169, "step": 979000 }, { "epoch": 0.782747744256054, "grad_norm": 0.46207529306411743, "learning_rate": 4.217252255743947e-05, "loss": 0.2149, "step": 979500 }, { "epoch": 0.7831473092097324, "grad_norm": 0.49427592754364014, "learning_rate": 4.216852690790268e-05, "loss": 0.2293, "step": 980000 }, { "epoch": 0.7835468741634108, "grad_norm": 0.7052193880081177, "learning_rate": 4.216453125836589e-05, "loss": 0.2205, "step": 980500 }, { "epoch": 0.7839464391170893, "grad_norm": 0.3347926139831543, "learning_rate": 4.216053560882911e-05, "loss": 0.2115, "step": 981000 }, { "epoch": 0.7843460040707677, "grad_norm": 1.6774003505706787, "learning_rate": 4.2156539959292326e-05, "loss": 0.2055, "step": 981500 }, { "epoch": 0.7847455690244461, "grad_norm": 0.7970127463340759, "learning_rate": 4.2152544309755545e-05, "loss": 0.2203, "step": 982000 }, { "epoch": 0.7851451339781246, "grad_norm": 1.3113212585449219, "learning_rate": 4.2148548660218756e-05, "loss": 0.2147, "step": 982500 }, { "epoch": 0.785544698931803, "grad_norm": 0.9576812386512756, "learning_rate": 4.214455301068197e-05, "loss": 0.2211, "step": 983000 }, { "epoch": 0.7859442638854814, "grad_norm": 1.3865739107131958, "learning_rate": 4.2140557361145186e-05, "loss": 0.2243, "step": 983500 }, { "epoch": 0.7863438288391599, "grad_norm": 1.4575685262680054, "learning_rate": 4.2136561711608404e-05, "loss": 0.2305, "step": 984000 }, { "epoch": 0.7867433937928383, "grad_norm": 3.0039331912994385, "learning_rate": 4.213256606207162e-05, "loss": 0.2068, "step": 984500 }, { "epoch": 0.7871429587465167, "grad_norm": 0.30448639392852783, "learning_rate": 4.2128570412534834e-05, "loss": 0.2217, "step": 985000 }, { "epoch": 0.7875425237001952, "grad_norm": 0.7127716541290283, "learning_rate": 4.2124574762998045e-05, "loss": 0.2385, "step": 985500 }, { "epoch": 0.7879420886538736, "grad_norm": 0.6677834391593933, "learning_rate": 4.212057911346127e-05, "loss": 0.2322, "step": 986000 }, { "epoch": 0.7883416536075522, "grad_norm": 0.44040626287460327, "learning_rate": 4.211658346392448e-05, "loss": 0.2204, "step": 986500 }, { "epoch": 0.7887412185612306, "grad_norm": 0.6315684914588928, "learning_rate": 4.211258781438769e-05, "loss": 0.2306, "step": 987000 }, { "epoch": 0.789140783514909, "grad_norm": 0.5440241694450378, "learning_rate": 4.210859216485091e-05, "loss": 0.2337, "step": 987500 }, { "epoch": 0.7895403484685874, "grad_norm": 1.0579980611801147, "learning_rate": 4.210459651531413e-05, "loss": 0.2183, "step": 988000 }, { "epoch": 0.7899399134222659, "grad_norm": 1.4038656949996948, "learning_rate": 4.210060086577735e-05, "loss": 0.2156, "step": 988500 }, { "epoch": 0.7903394783759443, "grad_norm": 1.9078361988067627, "learning_rate": 4.209660521624056e-05, "loss": 0.2155, "step": 989000 }, { "epoch": 0.7907390433296227, "grad_norm": 0.37978050112724304, "learning_rate": 4.209260956670377e-05, "loss": 0.2303, "step": 989500 }, { "epoch": 0.7911386082833012, "grad_norm": 0.9470463991165161, "learning_rate": 4.208861391716699e-05, "loss": 0.2148, "step": 990000 }, { "epoch": 0.7915381732369796, "grad_norm": 1.003413200378418, "learning_rate": 4.2084618267630207e-05, "loss": 0.2186, "step": 990500 }, { "epoch": 0.791937738190658, "grad_norm": 0.9881795048713684, "learning_rate": 4.2080622618093425e-05, "loss": 0.2267, "step": 991000 }, { "epoch": 0.7923373031443365, "grad_norm": 1.1058759689331055, "learning_rate": 4.2076626968556636e-05, "loss": 0.2057, "step": 991500 }, { "epoch": 0.7927368680980149, "grad_norm": 0.6593647003173828, "learning_rate": 4.207263131901985e-05, "loss": 0.209, "step": 992000 }, { "epoch": 0.7931364330516933, "grad_norm": 0.8090263605117798, "learning_rate": 4.206863566948307e-05, "loss": 0.2364, "step": 992500 }, { "epoch": 0.7935359980053718, "grad_norm": 2.2263972759246826, "learning_rate": 4.2064640019946284e-05, "loss": 0.2137, "step": 993000 }, { "epoch": 0.7939355629590502, "grad_norm": 0.7709357738494873, "learning_rate": 4.20606443704095e-05, "loss": 0.2327, "step": 993500 }, { "epoch": 0.7943351279127286, "grad_norm": 0.7244545221328735, "learning_rate": 4.2056648720872714e-05, "loss": 0.2178, "step": 994000 }, { "epoch": 0.7947346928664071, "grad_norm": 0.616186797618866, "learning_rate": 4.205265307133593e-05, "loss": 0.2171, "step": 994500 }, { "epoch": 0.7951342578200855, "grad_norm": 0.48847001791000366, "learning_rate": 4.204865742179915e-05, "loss": 0.2073, "step": 995000 }, { "epoch": 0.7955338227737639, "grad_norm": 0.5407263040542603, "learning_rate": 4.204466177226236e-05, "loss": 0.2107, "step": 995500 }, { "epoch": 0.7959333877274424, "grad_norm": 0.49169206619262695, "learning_rate": 4.204066612272558e-05, "loss": 0.2119, "step": 996000 }, { "epoch": 0.7963329526811208, "grad_norm": 2.5604546070098877, "learning_rate": 4.203667047318879e-05, "loss": 0.2451, "step": 996500 }, { "epoch": 0.7967325176347992, "grad_norm": 0.8154091238975525, "learning_rate": 4.203267482365201e-05, "loss": 0.2173, "step": 997000 }, { "epoch": 0.7971320825884777, "grad_norm": 1.0719754695892334, "learning_rate": 4.202867917411523e-05, "loss": 0.2114, "step": 997500 }, { "epoch": 0.7975316475421561, "grad_norm": 0.1887267827987671, "learning_rate": 4.202468352457844e-05, "loss": 0.2267, "step": 998000 }, { "epoch": 0.7979312124958345, "grad_norm": 1.9242697954177856, "learning_rate": 4.202068787504166e-05, "loss": 0.2055, "step": 998500 }, { "epoch": 0.798330777449513, "grad_norm": 0.8361847996711731, "learning_rate": 4.2016692225504875e-05, "loss": 0.2101, "step": 999000 }, { "epoch": 0.7987303424031914, "grad_norm": 1.246604561805725, "learning_rate": 4.201269657596809e-05, "loss": 0.2099, "step": 999500 }, { "epoch": 0.7991299073568698, "grad_norm": 0.9595007300376892, "learning_rate": 4.2008700926431305e-05, "loss": 0.2336, "step": 1000000 }, { "epoch": 0.7995294723105483, "grad_norm": 1.2168611288070679, "learning_rate": 4.2004705276894516e-05, "loss": 0.2112, "step": 1000500 }, { "epoch": 0.7999290372642267, "grad_norm": 0.530231237411499, "learning_rate": 4.2000709627357735e-05, "loss": 0.2287, "step": 1001000 }, { "epoch": 0.8003286022179051, "grad_norm": 2.8834099769592285, "learning_rate": 4.199671397782095e-05, "loss": 0.2161, "step": 1001500 }, { "epoch": 0.8007281671715836, "grad_norm": 0.4586220681667328, "learning_rate": 4.1992718328284164e-05, "loss": 0.2338, "step": 1002000 }, { "epoch": 0.801127732125262, "grad_norm": 0.7452694773674011, "learning_rate": 4.198872267874738e-05, "loss": 0.228, "step": 1002500 }, { "epoch": 0.8015272970789404, "grad_norm": 0.29604899883270264, "learning_rate": 4.1984727029210594e-05, "loss": 0.2142, "step": 1003000 }, { "epoch": 0.8019268620326189, "grad_norm": 0.4715476334095001, "learning_rate": 4.198073137967382e-05, "loss": 0.2285, "step": 1003500 }, { "epoch": 0.8023264269862973, "grad_norm": 0.7610575556755066, "learning_rate": 4.197673573013703e-05, "loss": 0.2295, "step": 1004000 }, { "epoch": 0.8027259919399757, "grad_norm": 0.5657440423965454, "learning_rate": 4.197274008060024e-05, "loss": 0.2292, "step": 1004500 }, { "epoch": 0.8031255568936542, "grad_norm": 0.6705012321472168, "learning_rate": 4.196874443106346e-05, "loss": 0.2314, "step": 1005000 }, { "epoch": 0.8035251218473326, "grad_norm": 0.38387972116470337, "learning_rate": 4.196474878152668e-05, "loss": 0.2346, "step": 1005500 }, { "epoch": 0.803924686801011, "grad_norm": 0.6390883326530457, "learning_rate": 4.1960753131989896e-05, "loss": 0.1986, "step": 1006000 }, { "epoch": 0.8043242517546895, "grad_norm": 0.5131704807281494, "learning_rate": 4.195675748245311e-05, "loss": 0.2177, "step": 1006500 }, { "epoch": 0.8047238167083679, "grad_norm": 0.40895316004753113, "learning_rate": 4.195276183291632e-05, "loss": 0.2238, "step": 1007000 }, { "epoch": 0.8051233816620463, "grad_norm": 1.4386707544326782, "learning_rate": 4.194876618337954e-05, "loss": 0.2369, "step": 1007500 }, { "epoch": 0.8055229466157248, "grad_norm": 0.6420083045959473, "learning_rate": 4.1944770533842755e-05, "loss": 0.2292, "step": 1008000 }, { "epoch": 0.8059225115694032, "grad_norm": 0.472776859998703, "learning_rate": 4.1940774884305974e-05, "loss": 0.2036, "step": 1008500 }, { "epoch": 0.8063220765230816, "grad_norm": 0.4791879951953888, "learning_rate": 4.1936779234769185e-05, "loss": 0.219, "step": 1009000 }, { "epoch": 0.8067216414767601, "grad_norm": 0.980270504951477, "learning_rate": 4.1932783585232397e-05, "loss": 0.2264, "step": 1009500 }, { "epoch": 0.8071212064304385, "grad_norm": 1.1314066648483276, "learning_rate": 4.192878793569562e-05, "loss": 0.2357, "step": 1010000 }, { "epoch": 0.8075207713841169, "grad_norm": 0.4521634876728058, "learning_rate": 4.192479228615883e-05, "loss": 0.225, "step": 1010500 }, { "epoch": 0.8079203363377954, "grad_norm": 0.5968225002288818, "learning_rate": 4.192079663662205e-05, "loss": 0.2095, "step": 1011000 }, { "epoch": 0.8083199012914738, "grad_norm": 0.8520902395248413, "learning_rate": 4.191680098708526e-05, "loss": 0.2328, "step": 1011500 }, { "epoch": 0.8087194662451522, "grad_norm": 0.2570018172264099, "learning_rate": 4.191280533754848e-05, "loss": 0.2314, "step": 1012000 }, { "epoch": 0.8091190311988307, "grad_norm": 0.6238954067230225, "learning_rate": 4.19088096880117e-05, "loss": 0.215, "step": 1012500 }, { "epoch": 0.8095185961525091, "grad_norm": 0.4134414792060852, "learning_rate": 4.190481403847491e-05, "loss": 0.2274, "step": 1013000 }, { "epoch": 0.8099181611061875, "grad_norm": 0.9231228828430176, "learning_rate": 4.190081838893813e-05, "loss": 0.2227, "step": 1013500 }, { "epoch": 0.8103177260598661, "grad_norm": 3.264875650405884, "learning_rate": 4.189682273940134e-05, "loss": 0.2419, "step": 1014000 }, { "epoch": 0.8107172910135445, "grad_norm": 1.8476923704147339, "learning_rate": 4.189282708986456e-05, "loss": 0.2203, "step": 1014500 }, { "epoch": 0.8111168559672229, "grad_norm": 0.5081549286842346, "learning_rate": 4.1888831440327776e-05, "loss": 0.2169, "step": 1015000 }, { "epoch": 0.8115164209209014, "grad_norm": 0.3611885905265808, "learning_rate": 4.188483579079099e-05, "loss": 0.2165, "step": 1015500 }, { "epoch": 0.8119159858745798, "grad_norm": 2.7092463970184326, "learning_rate": 4.18808401412542e-05, "loss": 0.234, "step": 1016000 }, { "epoch": 0.8123155508282582, "grad_norm": 0.6893032193183899, "learning_rate": 4.187684449171742e-05, "loss": 0.2092, "step": 1016500 }, { "epoch": 0.8127151157819367, "grad_norm": 0.4735258221626282, "learning_rate": 4.1872848842180636e-05, "loss": 0.227, "step": 1017000 }, { "epoch": 0.8131146807356151, "grad_norm": 0.688281238079071, "learning_rate": 4.1868853192643854e-05, "loss": 0.2356, "step": 1017500 }, { "epoch": 0.8135142456892935, "grad_norm": 1.2174841165542603, "learning_rate": 4.1864857543107065e-05, "loss": 0.2167, "step": 1018000 }, { "epoch": 0.813913810642972, "grad_norm": 1.1810448169708252, "learning_rate": 4.1860861893570283e-05, "loss": 0.2096, "step": 1018500 }, { "epoch": 0.8143133755966504, "grad_norm": 1.5556732416152954, "learning_rate": 4.18568662440335e-05, "loss": 0.2223, "step": 1019000 }, { "epoch": 0.8147129405503288, "grad_norm": 0.4514450132846832, "learning_rate": 4.185287059449671e-05, "loss": 0.2057, "step": 1019500 }, { "epoch": 0.8151125055040073, "grad_norm": 0.798240602016449, "learning_rate": 4.184887494495993e-05, "loss": 0.2127, "step": 1020000 }, { "epoch": 0.8155120704576857, "grad_norm": 0.8280342221260071, "learning_rate": 4.184487929542314e-05, "loss": 0.2161, "step": 1020500 }, { "epoch": 0.8159116354113641, "grad_norm": 0.9127042293548584, "learning_rate": 4.184088364588636e-05, "loss": 0.2152, "step": 1021000 }, { "epoch": 0.8163112003650426, "grad_norm": 0.4912547469139099, "learning_rate": 4.183688799634958e-05, "loss": 0.2282, "step": 1021500 }, { "epoch": 0.816710765318721, "grad_norm": 1.0092676877975464, "learning_rate": 4.183289234681279e-05, "loss": 0.2429, "step": 1022000 }, { "epoch": 0.8171103302723994, "grad_norm": 0.18694579601287842, "learning_rate": 4.182889669727601e-05, "loss": 0.2096, "step": 1022500 }, { "epoch": 0.8175098952260779, "grad_norm": 0.6586813926696777, "learning_rate": 4.182490104773922e-05, "loss": 0.2102, "step": 1023000 }, { "epoch": 0.8179094601797563, "grad_norm": 0.3340361714363098, "learning_rate": 4.182090539820244e-05, "loss": 0.2243, "step": 1023500 }, { "epoch": 0.8183090251334347, "grad_norm": 0.46324390172958374, "learning_rate": 4.1816909748665657e-05, "loss": 0.2106, "step": 1024000 }, { "epoch": 0.8187085900871132, "grad_norm": 0.7076193690299988, "learning_rate": 4.181291409912887e-05, "loss": 0.2124, "step": 1024500 }, { "epoch": 0.8191081550407916, "grad_norm": 0.5344807505607605, "learning_rate": 4.1808918449592086e-05, "loss": 0.2332, "step": 1025000 }, { "epoch": 0.81950771999447, "grad_norm": 0.3743733763694763, "learning_rate": 4.1804922800055304e-05, "loss": 0.2076, "step": 1025500 }, { "epoch": 0.8199072849481485, "grad_norm": 0.5321909189224243, "learning_rate": 4.1800927150518516e-05, "loss": 0.2183, "step": 1026000 }, { "epoch": 0.8203068499018269, "grad_norm": 0.3114723861217499, "learning_rate": 4.1796931500981734e-05, "loss": 0.2428, "step": 1026500 }, { "epoch": 0.8207064148555053, "grad_norm": 0.978782057762146, "learning_rate": 4.1792935851444945e-05, "loss": 0.228, "step": 1027000 }, { "epoch": 0.8211059798091838, "grad_norm": 1.1408711671829224, "learning_rate": 4.1788940201908164e-05, "loss": 0.2297, "step": 1027500 }, { "epoch": 0.8215055447628622, "grad_norm": 2.4648046493530273, "learning_rate": 4.178494455237138e-05, "loss": 0.2253, "step": 1028000 }, { "epoch": 0.8219051097165406, "grad_norm": 2.0934410095214844, "learning_rate": 4.178094890283459e-05, "loss": 0.2245, "step": 1028500 }, { "epoch": 0.8223046746702191, "grad_norm": 0.6167880892753601, "learning_rate": 4.177695325329781e-05, "loss": 0.2263, "step": 1029000 }, { "epoch": 0.8227042396238975, "grad_norm": 0.6993730664253235, "learning_rate": 4.177295760376102e-05, "loss": 0.2247, "step": 1029500 }, { "epoch": 0.8231038045775759, "grad_norm": 0.5615753531455994, "learning_rate": 4.176896195422425e-05, "loss": 0.2299, "step": 1030000 }, { "epoch": 0.8235033695312544, "grad_norm": 2.047165870666504, "learning_rate": 4.176496630468746e-05, "loss": 0.229, "step": 1030500 }, { "epoch": 0.8239029344849328, "grad_norm": 0.6069421768188477, "learning_rate": 4.176097065515067e-05, "loss": 0.2385, "step": 1031000 }, { "epoch": 0.8243024994386112, "grad_norm": 1.9119889736175537, "learning_rate": 4.175697500561389e-05, "loss": 0.2219, "step": 1031500 }, { "epoch": 0.8247020643922897, "grad_norm": 0.3581688106060028, "learning_rate": 4.175297935607711e-05, "loss": 0.2209, "step": 1032000 }, { "epoch": 0.8251016293459681, "grad_norm": 0.35705703496932983, "learning_rate": 4.1748983706540325e-05, "loss": 0.2249, "step": 1032500 }, { "epoch": 0.8255011942996465, "grad_norm": 0.942995011806488, "learning_rate": 4.174498805700354e-05, "loss": 0.2206, "step": 1033000 }, { "epoch": 0.825900759253325, "grad_norm": 0.7796698212623596, "learning_rate": 4.174099240746675e-05, "loss": 0.2249, "step": 1033500 }, { "epoch": 0.8263003242070034, "grad_norm": 0.7402902841567993, "learning_rate": 4.1736996757929966e-05, "loss": 0.2116, "step": 1034000 }, { "epoch": 0.8266998891606818, "grad_norm": 0.727790892124176, "learning_rate": 4.1733001108393184e-05, "loss": 0.2103, "step": 1034500 }, { "epoch": 0.8270994541143603, "grad_norm": 0.5432760119438171, "learning_rate": 4.17290054588564e-05, "loss": 0.2191, "step": 1035000 }, { "epoch": 0.8274990190680387, "grad_norm": 0.6535998582839966, "learning_rate": 4.1725009809319614e-05, "loss": 0.227, "step": 1035500 }, { "epoch": 0.8278985840217171, "grad_norm": 0.6338879466056824, "learning_rate": 4.1721014159782826e-05, "loss": 0.1982, "step": 1036000 }, { "epoch": 0.8282981489753956, "grad_norm": 0.22412413358688354, "learning_rate": 4.171701851024605e-05, "loss": 0.2517, "step": 1036500 }, { "epoch": 0.828697713929074, "grad_norm": 0.7728255987167358, "learning_rate": 4.171302286070926e-05, "loss": 0.231, "step": 1037000 }, { "epoch": 0.8290972788827524, "grad_norm": 0.39380449056625366, "learning_rate": 4.170902721117248e-05, "loss": 0.2122, "step": 1037500 }, { "epoch": 0.8294968438364309, "grad_norm": 0.2797977030277252, "learning_rate": 4.170503156163569e-05, "loss": 0.2265, "step": 1038000 }, { "epoch": 0.8298964087901093, "grad_norm": 0.5967977046966553, "learning_rate": 4.170103591209891e-05, "loss": 0.2246, "step": 1038500 }, { "epoch": 0.8302959737437877, "grad_norm": 0.6614946722984314, "learning_rate": 4.169704026256213e-05, "loss": 0.2199, "step": 1039000 }, { "epoch": 0.8306955386974662, "grad_norm": 0.5846842527389526, "learning_rate": 4.169304461302534e-05, "loss": 0.1963, "step": 1039500 }, { "epoch": 0.8310951036511446, "grad_norm": 0.39686667919158936, "learning_rate": 4.168904896348856e-05, "loss": 0.2598, "step": 1040000 }, { "epoch": 0.831494668604823, "grad_norm": 0.3633386492729187, "learning_rate": 4.168505331395177e-05, "loss": 0.215, "step": 1040500 }, { "epoch": 0.8318942335585015, "grad_norm": 1.423526644706726, "learning_rate": 4.168105766441499e-05, "loss": 0.2071, "step": 1041000 }, { "epoch": 0.8322937985121799, "grad_norm": 0.23865696787834167, "learning_rate": 4.1677062014878205e-05, "loss": 0.2326, "step": 1041500 }, { "epoch": 0.8326933634658584, "grad_norm": 1.1438764333724976, "learning_rate": 4.167306636534142e-05, "loss": 0.2065, "step": 1042000 }, { "epoch": 0.8330929284195369, "grad_norm": 0.6241656541824341, "learning_rate": 4.166907071580463e-05, "loss": 0.2181, "step": 1042500 }, { "epoch": 0.8334924933732153, "grad_norm": 0.9165002107620239, "learning_rate": 4.166507506626785e-05, "loss": 0.219, "step": 1043000 }, { "epoch": 0.8338920583268937, "grad_norm": 0.48479828238487244, "learning_rate": 4.1661079416731065e-05, "loss": 0.202, "step": 1043500 }, { "epoch": 0.8342916232805722, "grad_norm": 0.9105204343795776, "learning_rate": 4.165708376719428e-05, "loss": 0.2102, "step": 1044000 }, { "epoch": 0.8346911882342506, "grad_norm": 0.15267132222652435, "learning_rate": 4.1653088117657494e-05, "loss": 0.207, "step": 1044500 }, { "epoch": 0.835090753187929, "grad_norm": 2.0029587745666504, "learning_rate": 4.164909246812071e-05, "loss": 0.2013, "step": 1045000 }, { "epoch": 0.8354903181416075, "grad_norm": 2.211552619934082, "learning_rate": 4.164509681858393e-05, "loss": 0.2264, "step": 1045500 }, { "epoch": 0.8358898830952859, "grad_norm": 1.83284592628479, "learning_rate": 4.164110116904714e-05, "loss": 0.231, "step": 1046000 }, { "epoch": 0.8362894480489643, "grad_norm": 0.6500670313835144, "learning_rate": 4.163710551951036e-05, "loss": 0.2095, "step": 1046500 }, { "epoch": 0.8366890130026428, "grad_norm": 0.3840869069099426, "learning_rate": 4.163310986997357e-05, "loss": 0.2134, "step": 1047000 }, { "epoch": 0.8370885779563212, "grad_norm": 1.9315160512924194, "learning_rate": 4.162911422043679e-05, "loss": 0.2179, "step": 1047500 }, { "epoch": 0.8374881429099996, "grad_norm": 0.5032544136047363, "learning_rate": 4.162511857090001e-05, "loss": 0.2123, "step": 1048000 }, { "epoch": 0.837887707863678, "grad_norm": 0.32760369777679443, "learning_rate": 4.162112292136322e-05, "loss": 0.2179, "step": 1048500 }, { "epoch": 0.8382872728173565, "grad_norm": 0.40468212962150574, "learning_rate": 4.161712727182644e-05, "loss": 0.2157, "step": 1049000 }, { "epoch": 0.8386868377710349, "grad_norm": 0.5073228478431702, "learning_rate": 4.1613131622289656e-05, "loss": 0.2328, "step": 1049500 }, { "epoch": 0.8390864027247134, "grad_norm": 0.6528324484825134, "learning_rate": 4.160913597275287e-05, "loss": 0.2179, "step": 1050000 }, { "epoch": 0.8394859676783918, "grad_norm": 0.8416422009468079, "learning_rate": 4.1605140323216086e-05, "loss": 0.2068, "step": 1050500 }, { "epoch": 0.8398855326320702, "grad_norm": 1.319531798362732, "learning_rate": 4.16011446736793e-05, "loss": 0.2251, "step": 1051000 }, { "epoch": 0.8402850975857487, "grad_norm": 3.2961106300354004, "learning_rate": 4.1597149024142515e-05, "loss": 0.2238, "step": 1051500 }, { "epoch": 0.8406846625394271, "grad_norm": 0.3476772904396057, "learning_rate": 4.159315337460573e-05, "loss": 0.2303, "step": 1052000 }, { "epoch": 0.8410842274931055, "grad_norm": 0.6628751754760742, "learning_rate": 4.1589157725068945e-05, "loss": 0.2244, "step": 1052500 }, { "epoch": 0.841483792446784, "grad_norm": 0.7278143763542175, "learning_rate": 4.158516207553216e-05, "loss": 0.2126, "step": 1053000 }, { "epoch": 0.8418833574004624, "grad_norm": 1.3170850276947021, "learning_rate": 4.1581166425995374e-05, "loss": 0.2272, "step": 1053500 }, { "epoch": 0.8422829223541408, "grad_norm": 0.341525673866272, "learning_rate": 4.15771707764586e-05, "loss": 0.2271, "step": 1054000 }, { "epoch": 0.8426824873078193, "grad_norm": 0.9201053977012634, "learning_rate": 4.157317512692181e-05, "loss": 0.2135, "step": 1054500 }, { "epoch": 0.8430820522614977, "grad_norm": 0.3525870144367218, "learning_rate": 4.156917947738502e-05, "loss": 0.224, "step": 1055000 }, { "epoch": 0.8434816172151761, "grad_norm": 1.0652344226837158, "learning_rate": 4.156518382784824e-05, "loss": 0.2254, "step": 1055500 }, { "epoch": 0.8438811821688545, "grad_norm": 0.4538939297199249, "learning_rate": 4.156118817831146e-05, "loss": 0.2236, "step": 1056000 }, { "epoch": 0.844280747122533, "grad_norm": 0.4208916127681732, "learning_rate": 4.155719252877468e-05, "loss": 0.2078, "step": 1056500 }, { "epoch": 0.8446803120762114, "grad_norm": 0.4294392764568329, "learning_rate": 4.155319687923789e-05, "loss": 0.225, "step": 1057000 }, { "epoch": 0.8450798770298898, "grad_norm": 1.5984013080596924, "learning_rate": 4.15492012297011e-05, "loss": 0.2014, "step": 1057500 }, { "epoch": 0.8454794419835683, "grad_norm": 0.9653556942939758, "learning_rate": 4.154520558016432e-05, "loss": 0.2154, "step": 1058000 }, { "epoch": 0.8458790069372467, "grad_norm": 0.5849599838256836, "learning_rate": 4.1541209930627536e-05, "loss": 0.2155, "step": 1058500 }, { "epoch": 0.8462785718909251, "grad_norm": 0.7856464385986328, "learning_rate": 4.1537214281090754e-05, "loss": 0.2307, "step": 1059000 }, { "epoch": 0.8466781368446036, "grad_norm": 0.3462643623352051, "learning_rate": 4.1533218631553966e-05, "loss": 0.2282, "step": 1059500 }, { "epoch": 0.847077701798282, "grad_norm": 0.796184778213501, "learning_rate": 4.152922298201718e-05, "loss": 0.2315, "step": 1060000 }, { "epoch": 0.8474772667519604, "grad_norm": 0.5886601209640503, "learning_rate": 4.1525227332480395e-05, "loss": 0.2174, "step": 1060500 }, { "epoch": 0.8478768317056389, "grad_norm": 0.49876904487609863, "learning_rate": 4.1521231682943614e-05, "loss": 0.2056, "step": 1061000 }, { "epoch": 0.8482763966593173, "grad_norm": 1.3741384744644165, "learning_rate": 4.151723603340683e-05, "loss": 0.225, "step": 1061500 }, { "epoch": 0.8486759616129957, "grad_norm": 0.37307634949684143, "learning_rate": 4.151324038387004e-05, "loss": 0.2193, "step": 1062000 }, { "epoch": 0.8490755265666742, "grad_norm": 0.9057477712631226, "learning_rate": 4.150924473433326e-05, "loss": 0.2248, "step": 1062500 }, { "epoch": 0.8494750915203526, "grad_norm": 0.8004012703895569, "learning_rate": 4.150524908479648e-05, "loss": 0.2247, "step": 1063000 }, { "epoch": 0.849874656474031, "grad_norm": 0.5544862151145935, "learning_rate": 4.150125343525969e-05, "loss": 0.2448, "step": 1063500 }, { "epoch": 0.8502742214277095, "grad_norm": 0.8534271717071533, "learning_rate": 4.149725778572291e-05, "loss": 0.2135, "step": 1064000 }, { "epoch": 0.8506737863813879, "grad_norm": 0.4176851511001587, "learning_rate": 4.149326213618612e-05, "loss": 0.2149, "step": 1064500 }, { "epoch": 0.8510733513350663, "grad_norm": 0.6667443513870239, "learning_rate": 4.148926648664934e-05, "loss": 0.218, "step": 1065000 }, { "epoch": 0.8514729162887448, "grad_norm": 2.0387253761291504, "learning_rate": 4.148527083711256e-05, "loss": 0.2128, "step": 1065500 }, { "epoch": 0.8518724812424232, "grad_norm": 0.9104942679405212, "learning_rate": 4.148127518757577e-05, "loss": 0.2132, "step": 1066000 }, { "epoch": 0.8522720461961016, "grad_norm": 0.8676040768623352, "learning_rate": 4.1477279538038987e-05, "loss": 0.239, "step": 1066500 }, { "epoch": 0.8526716111497801, "grad_norm": 0.2957562804222107, "learning_rate": 4.14732838885022e-05, "loss": 0.2275, "step": 1067000 }, { "epoch": 0.8530711761034585, "grad_norm": 2.9416425228118896, "learning_rate": 4.1469288238965416e-05, "loss": 0.2214, "step": 1067500 }, { "epoch": 0.8534707410571369, "grad_norm": 0.5546291470527649, "learning_rate": 4.1465292589428634e-05, "loss": 0.2365, "step": 1068000 }, { "epoch": 0.8538703060108154, "grad_norm": 0.8078615665435791, "learning_rate": 4.1461296939891846e-05, "loss": 0.2062, "step": 1068500 }, { "epoch": 0.8542698709644938, "grad_norm": 0.5278241038322449, "learning_rate": 4.1457301290355064e-05, "loss": 0.2195, "step": 1069000 }, { "epoch": 0.8546694359181723, "grad_norm": 0.5871700048446655, "learning_rate": 4.145330564081828e-05, "loss": 0.2103, "step": 1069500 }, { "epoch": 0.8550690008718508, "grad_norm": 0.43970412015914917, "learning_rate": 4.1449309991281494e-05, "loss": 0.2183, "step": 1070000 }, { "epoch": 0.8554685658255292, "grad_norm": 0.560461163520813, "learning_rate": 4.144531434174471e-05, "loss": 0.2296, "step": 1070500 }, { "epoch": 0.8558681307792076, "grad_norm": 0.5702808499336243, "learning_rate": 4.144131869220792e-05, "loss": 0.2338, "step": 1071000 }, { "epoch": 0.8562676957328861, "grad_norm": 1.6946300268173218, "learning_rate": 4.143732304267114e-05, "loss": 0.2122, "step": 1071500 }, { "epoch": 0.8566672606865645, "grad_norm": 0.7101428508758545, "learning_rate": 4.143332739313436e-05, "loss": 0.2299, "step": 1072000 }, { "epoch": 0.857066825640243, "grad_norm": 0.4775520861148834, "learning_rate": 4.142933174359757e-05, "loss": 0.2313, "step": 1072500 }, { "epoch": 0.8574663905939214, "grad_norm": 1.7694039344787598, "learning_rate": 4.142533609406079e-05, "loss": 0.2083, "step": 1073000 }, { "epoch": 0.8578659555475998, "grad_norm": 0.4884648025035858, "learning_rate": 4.1421340444524e-05, "loss": 0.2136, "step": 1073500 }, { "epoch": 0.8582655205012782, "grad_norm": 1.0780616998672485, "learning_rate": 4.141734479498722e-05, "loss": 0.2119, "step": 1074000 }, { "epoch": 0.8586650854549567, "grad_norm": 0.5273327827453613, "learning_rate": 4.141334914545044e-05, "loss": 0.2241, "step": 1074500 }, { "epoch": 0.8590646504086351, "grad_norm": 0.6411398649215698, "learning_rate": 4.140935349591365e-05, "loss": 0.2204, "step": 1075000 }, { "epoch": 0.8594642153623135, "grad_norm": 1.0135247707366943, "learning_rate": 4.140535784637687e-05, "loss": 0.1958, "step": 1075500 }, { "epoch": 0.859863780315992, "grad_norm": 0.25675591826438904, "learning_rate": 4.1401362196840085e-05, "loss": 0.2283, "step": 1076000 }, { "epoch": 0.8602633452696704, "grad_norm": 1.9429361820220947, "learning_rate": 4.1397366547303296e-05, "loss": 0.2133, "step": 1076500 }, { "epoch": 0.8606629102233488, "grad_norm": 1.5057289600372314, "learning_rate": 4.1393370897766515e-05, "loss": 0.2107, "step": 1077000 }, { "epoch": 0.8610624751770273, "grad_norm": 1.242264747619629, "learning_rate": 4.1389375248229726e-05, "loss": 0.2228, "step": 1077500 }, { "epoch": 0.8614620401307057, "grad_norm": 0.47199007868766785, "learning_rate": 4.1385379598692944e-05, "loss": 0.2207, "step": 1078000 }, { "epoch": 0.8618616050843841, "grad_norm": 0.43644607067108154, "learning_rate": 4.138138394915616e-05, "loss": 0.2193, "step": 1078500 }, { "epoch": 0.8622611700380626, "grad_norm": 0.3684595823287964, "learning_rate": 4.1377388299619374e-05, "loss": 0.2292, "step": 1079000 }, { "epoch": 0.862660734991741, "grad_norm": 0.6154922842979431, "learning_rate": 4.137339265008259e-05, "loss": 0.2283, "step": 1079500 }, { "epoch": 0.8630602999454194, "grad_norm": 0.44529369473457336, "learning_rate": 4.1369397000545803e-05, "loss": 0.2334, "step": 1080000 }, { "epoch": 0.8634598648990979, "grad_norm": 0.5958248972892761, "learning_rate": 4.136540135100903e-05, "loss": 0.2203, "step": 1080500 }, { "epoch": 0.8638594298527763, "grad_norm": 1.3985224962234497, "learning_rate": 4.136140570147224e-05, "loss": 0.2275, "step": 1081000 }, { "epoch": 0.8642589948064547, "grad_norm": 0.9570584893226624, "learning_rate": 4.135741005193545e-05, "loss": 0.2065, "step": 1081500 }, { "epoch": 0.8646585597601332, "grad_norm": 2.2103688716888428, "learning_rate": 4.135341440239867e-05, "loss": 0.2146, "step": 1082000 }, { "epoch": 0.8650581247138116, "grad_norm": 0.5346314311027527, "learning_rate": 4.134941875286189e-05, "loss": 0.2167, "step": 1082500 }, { "epoch": 0.86545768966749, "grad_norm": 0.5512805581092834, "learning_rate": 4.1345423103325106e-05, "loss": 0.238, "step": 1083000 }, { "epoch": 0.8658572546211685, "grad_norm": 0.8565719127655029, "learning_rate": 4.134142745378832e-05, "loss": 0.2101, "step": 1083500 }, { "epoch": 0.8662568195748469, "grad_norm": 1.0811212062835693, "learning_rate": 4.133743180425153e-05, "loss": 0.2313, "step": 1084000 }, { "epoch": 0.8666563845285253, "grad_norm": 1.9773720502853394, "learning_rate": 4.133343615471475e-05, "loss": 0.2095, "step": 1084500 }, { "epoch": 0.8670559494822038, "grad_norm": 0.7050249576568604, "learning_rate": 4.1329440505177965e-05, "loss": 0.2113, "step": 1085000 }, { "epoch": 0.8674555144358822, "grad_norm": 0.1546313762664795, "learning_rate": 4.132544485564118e-05, "loss": 0.2041, "step": 1085500 }, { "epoch": 0.8678550793895606, "grad_norm": 1.1212059259414673, "learning_rate": 4.1321449206104395e-05, "loss": 0.2134, "step": 1086000 }, { "epoch": 0.8682546443432391, "grad_norm": 0.4295004904270172, "learning_rate": 4.1317453556567606e-05, "loss": 0.2078, "step": 1086500 }, { "epoch": 0.8686542092969175, "grad_norm": 0.5659120678901672, "learning_rate": 4.131345790703083e-05, "loss": 0.2034, "step": 1087000 }, { "epoch": 0.8690537742505959, "grad_norm": 0.5736664533615112, "learning_rate": 4.130946225749404e-05, "loss": 0.2063, "step": 1087500 }, { "epoch": 0.8694533392042744, "grad_norm": 0.6515297293663025, "learning_rate": 4.130546660795726e-05, "loss": 0.2282, "step": 1088000 }, { "epoch": 0.8698529041579528, "grad_norm": 0.5296384692192078, "learning_rate": 4.130147095842047e-05, "loss": 0.2285, "step": 1088500 }, { "epoch": 0.8702524691116312, "grad_norm": 0.8607897758483887, "learning_rate": 4.129747530888369e-05, "loss": 0.212, "step": 1089000 }, { "epoch": 0.8706520340653097, "grad_norm": 0.4898971915245056, "learning_rate": 4.129347965934691e-05, "loss": 0.2229, "step": 1089500 }, { "epoch": 0.8710515990189881, "grad_norm": 0.5062271952629089, "learning_rate": 4.128948400981012e-05, "loss": 0.2479, "step": 1090000 }, { "epoch": 0.8714511639726665, "grad_norm": 0.469392329454422, "learning_rate": 4.128548836027334e-05, "loss": 0.2006, "step": 1090500 }, { "epoch": 0.871850728926345, "grad_norm": 0.4375726580619812, "learning_rate": 4.128149271073655e-05, "loss": 0.202, "step": 1091000 }, { "epoch": 0.8722502938800234, "grad_norm": 0.8343220353126526, "learning_rate": 4.127749706119977e-05, "loss": 0.2058, "step": 1091500 }, { "epoch": 0.8726498588337018, "grad_norm": 0.511073112487793, "learning_rate": 4.1273501411662986e-05, "loss": 0.2065, "step": 1092000 }, { "epoch": 0.8730494237873803, "grad_norm": 0.9260953068733215, "learning_rate": 4.12695057621262e-05, "loss": 0.226, "step": 1092500 }, { "epoch": 0.8734489887410587, "grad_norm": 0.599119246006012, "learning_rate": 4.1265510112589416e-05, "loss": 0.2171, "step": 1093000 }, { "epoch": 0.8738485536947371, "grad_norm": 0.5260900259017944, "learning_rate": 4.1261514463052634e-05, "loss": 0.2074, "step": 1093500 }, { "epoch": 0.8742481186484156, "grad_norm": 0.6979879140853882, "learning_rate": 4.1257518813515845e-05, "loss": 0.2133, "step": 1094000 }, { "epoch": 0.874647683602094, "grad_norm": 1.9545183181762695, "learning_rate": 4.1253523163979063e-05, "loss": 0.2356, "step": 1094500 }, { "epoch": 0.8750472485557724, "grad_norm": 0.8122169971466064, "learning_rate": 4.1249527514442275e-05, "loss": 0.2321, "step": 1095000 }, { "epoch": 0.8754468135094509, "grad_norm": 1.4786702394485474, "learning_rate": 4.124553186490549e-05, "loss": 0.2162, "step": 1095500 }, { "epoch": 0.8758463784631293, "grad_norm": 0.8864335417747498, "learning_rate": 4.124153621536871e-05, "loss": 0.2074, "step": 1096000 }, { "epoch": 0.8762459434168077, "grad_norm": 0.27742621302604675, "learning_rate": 4.123754056583192e-05, "loss": 0.2124, "step": 1096500 }, { "epoch": 0.8766455083704862, "grad_norm": 1.8444455862045288, "learning_rate": 4.123354491629514e-05, "loss": 0.1936, "step": 1097000 }, { "epoch": 0.8770450733241647, "grad_norm": 0.28228357434272766, "learning_rate": 4.122954926675835e-05, "loss": 0.2183, "step": 1097500 }, { "epoch": 0.8774446382778431, "grad_norm": 2.4478585720062256, "learning_rate": 4.122555361722157e-05, "loss": 0.2171, "step": 1098000 }, { "epoch": 0.8778442032315216, "grad_norm": 1.0726900100708008, "learning_rate": 4.122155796768479e-05, "loss": 0.1924, "step": 1098500 }, { "epoch": 0.8782437681852, "grad_norm": 0.4203329086303711, "learning_rate": 4.1217562318148e-05, "loss": 0.2269, "step": 1099000 }, { "epoch": 0.8786433331388784, "grad_norm": 1.322377324104309, "learning_rate": 4.121356666861122e-05, "loss": 0.2307, "step": 1099500 }, { "epoch": 0.8790428980925569, "grad_norm": 0.6485905051231384, "learning_rate": 4.1209571019074437e-05, "loss": 0.2076, "step": 1100000 }, { "epoch": 0.8794424630462353, "grad_norm": 1.1410236358642578, "learning_rate": 4.120557536953765e-05, "loss": 0.2156, "step": 1100500 }, { "epoch": 0.8798420279999137, "grad_norm": 0.5416204333305359, "learning_rate": 4.1201579720000866e-05, "loss": 0.2241, "step": 1101000 }, { "epoch": 0.8802415929535922, "grad_norm": 0.6234997510910034, "learning_rate": 4.119758407046408e-05, "loss": 0.2356, "step": 1101500 }, { "epoch": 0.8806411579072706, "grad_norm": 1.1678460836410522, "learning_rate": 4.1193588420927296e-05, "loss": 0.2229, "step": 1102000 }, { "epoch": 0.881040722860949, "grad_norm": 2.523134708404541, "learning_rate": 4.1189592771390514e-05, "loss": 0.2233, "step": 1102500 }, { "epoch": 0.8814402878146275, "grad_norm": 0.4060690701007843, "learning_rate": 4.1185597121853725e-05, "loss": 0.2197, "step": 1103000 }, { "epoch": 0.8818398527683059, "grad_norm": 0.6607658863067627, "learning_rate": 4.1181601472316944e-05, "loss": 0.2145, "step": 1103500 }, { "epoch": 0.8822394177219843, "grad_norm": 0.6194354891777039, "learning_rate": 4.1177605822780155e-05, "loss": 0.2263, "step": 1104000 }, { "epoch": 0.8826389826756628, "grad_norm": 0.6981911063194275, "learning_rate": 4.117361017324337e-05, "loss": 0.2386, "step": 1104500 }, { "epoch": 0.8830385476293412, "grad_norm": 0.632169783115387, "learning_rate": 4.116961452370659e-05, "loss": 0.2241, "step": 1105000 }, { "epoch": 0.8834381125830196, "grad_norm": 0.6707410216331482, "learning_rate": 4.11656188741698e-05, "loss": 0.2083, "step": 1105500 }, { "epoch": 0.8838376775366981, "grad_norm": 0.9383687376976013, "learning_rate": 4.116162322463302e-05, "loss": 0.2104, "step": 1106000 }, { "epoch": 0.8842372424903765, "grad_norm": 1.4300072193145752, "learning_rate": 4.115762757509624e-05, "loss": 0.2202, "step": 1106500 }, { "epoch": 0.8846368074440549, "grad_norm": 0.5293726921081543, "learning_rate": 4.115363192555946e-05, "loss": 0.2286, "step": 1107000 }, { "epoch": 0.8850363723977334, "grad_norm": 2.03117036819458, "learning_rate": 4.114963627602267e-05, "loss": 0.2346, "step": 1107500 }, { "epoch": 0.8854359373514118, "grad_norm": 1.9368810653686523, "learning_rate": 4.114564062648588e-05, "loss": 0.2281, "step": 1108000 }, { "epoch": 0.8858355023050902, "grad_norm": 0.6441903710365295, "learning_rate": 4.11416449769491e-05, "loss": 0.2247, "step": 1108500 }, { "epoch": 0.8862350672587687, "grad_norm": 0.6266528964042664, "learning_rate": 4.113764932741232e-05, "loss": 0.2239, "step": 1109000 }, { "epoch": 0.8866346322124471, "grad_norm": 1.5802003145217896, "learning_rate": 4.1133653677875535e-05, "loss": 0.2294, "step": 1109500 }, { "epoch": 0.8870341971661255, "grad_norm": 0.49167701601982117, "learning_rate": 4.1129658028338746e-05, "loss": 0.2115, "step": 1110000 }, { "epoch": 0.887433762119804, "grad_norm": 1.0287748575210571, "learning_rate": 4.112566237880196e-05, "loss": 0.212, "step": 1110500 }, { "epoch": 0.8878333270734824, "grad_norm": 0.5804826021194458, "learning_rate": 4.1121666729265176e-05, "loss": 0.2243, "step": 1111000 }, { "epoch": 0.8882328920271608, "grad_norm": 0.29663509130477905, "learning_rate": 4.1117671079728394e-05, "loss": 0.2137, "step": 1111500 }, { "epoch": 0.8886324569808393, "grad_norm": 0.3290671706199646, "learning_rate": 4.111367543019161e-05, "loss": 0.2166, "step": 1112000 }, { "epoch": 0.8890320219345177, "grad_norm": 0.43540915846824646, "learning_rate": 4.1109679780654824e-05, "loss": 0.2071, "step": 1112500 }, { "epoch": 0.8894315868881961, "grad_norm": 0.47070810198783875, "learning_rate": 4.1105684131118035e-05, "loss": 0.2227, "step": 1113000 }, { "epoch": 0.8898311518418746, "grad_norm": 0.31968167424201965, "learning_rate": 4.110168848158126e-05, "loss": 0.2247, "step": 1113500 }, { "epoch": 0.890230716795553, "grad_norm": 1.1432499885559082, "learning_rate": 4.109769283204447e-05, "loss": 0.2326, "step": 1114000 }, { "epoch": 0.8906302817492314, "grad_norm": 0.9192473292350769, "learning_rate": 4.109369718250769e-05, "loss": 0.2206, "step": 1114500 }, { "epoch": 0.8910298467029099, "grad_norm": 0.4564472436904907, "learning_rate": 4.10897015329709e-05, "loss": 0.2232, "step": 1115000 }, { "epoch": 0.8914294116565883, "grad_norm": 2.431563377380371, "learning_rate": 4.108570588343412e-05, "loss": 0.2234, "step": 1115500 }, { "epoch": 0.8918289766102667, "grad_norm": 0.9609575867652893, "learning_rate": 4.108171023389734e-05, "loss": 0.2112, "step": 1116000 }, { "epoch": 0.8922285415639452, "grad_norm": 1.0187263488769531, "learning_rate": 4.107771458436055e-05, "loss": 0.2055, "step": 1116500 }, { "epoch": 0.8926281065176236, "grad_norm": 0.5106242895126343, "learning_rate": 4.107371893482377e-05, "loss": 0.2255, "step": 1117000 }, { "epoch": 0.893027671471302, "grad_norm": 0.5251019597053528, "learning_rate": 4.106972328528698e-05, "loss": 0.2061, "step": 1117500 }, { "epoch": 0.8934272364249805, "grad_norm": 0.3256416916847229, "learning_rate": 4.10657276357502e-05, "loss": 0.2263, "step": 1118000 }, { "epoch": 0.8938268013786589, "grad_norm": 0.37108418345451355, "learning_rate": 4.1061731986213415e-05, "loss": 0.2274, "step": 1118500 }, { "epoch": 0.8942263663323373, "grad_norm": 0.4412253201007843, "learning_rate": 4.1057736336676626e-05, "loss": 0.2136, "step": 1119000 }, { "epoch": 0.8946259312860158, "grad_norm": 0.7170975208282471, "learning_rate": 4.1053740687139845e-05, "loss": 0.2144, "step": 1119500 }, { "epoch": 0.8950254962396942, "grad_norm": 0.4587725102901459, "learning_rate": 4.104974503760306e-05, "loss": 0.215, "step": 1120000 }, { "epoch": 0.8954250611933726, "grad_norm": 0.6820108294487, "learning_rate": 4.1045749388066274e-05, "loss": 0.2017, "step": 1120500 }, { "epoch": 0.895824626147051, "grad_norm": 1.725791096687317, "learning_rate": 4.104175373852949e-05, "loss": 0.231, "step": 1121000 }, { "epoch": 0.8962241911007295, "grad_norm": 0.6638616323471069, "learning_rate": 4.1037758088992704e-05, "loss": 0.2361, "step": 1121500 }, { "epoch": 0.8966237560544079, "grad_norm": 0.3298245072364807, "learning_rate": 4.103376243945592e-05, "loss": 0.2079, "step": 1122000 }, { "epoch": 0.8970233210080863, "grad_norm": 0.5937789082527161, "learning_rate": 4.102976678991914e-05, "loss": 0.2184, "step": 1122500 }, { "epoch": 0.8974228859617648, "grad_norm": 0.5058948397636414, "learning_rate": 4.102577114038235e-05, "loss": 0.2095, "step": 1123000 }, { "epoch": 0.8978224509154432, "grad_norm": 0.6067737340927124, "learning_rate": 4.102177549084557e-05, "loss": 0.2224, "step": 1123500 }, { "epoch": 0.8982220158691216, "grad_norm": 0.705824613571167, "learning_rate": 4.101777984130878e-05, "loss": 0.2158, "step": 1124000 }, { "epoch": 0.8986215808228001, "grad_norm": 1.5169615745544434, "learning_rate": 4.1013784191772006e-05, "loss": 0.2007, "step": 1124500 }, { "epoch": 0.8990211457764786, "grad_norm": 0.8274109959602356, "learning_rate": 4.100978854223522e-05, "loss": 0.2061, "step": 1125000 }, { "epoch": 0.8994207107301571, "grad_norm": 0.7510071992874146, "learning_rate": 4.100579289269843e-05, "loss": 0.2185, "step": 1125500 }, { "epoch": 0.8998202756838355, "grad_norm": 1.1802046298980713, "learning_rate": 4.100179724316165e-05, "loss": 0.1972, "step": 1126000 }, { "epoch": 0.9002198406375139, "grad_norm": 0.766839861869812, "learning_rate": 4.0997801593624866e-05, "loss": 0.2039, "step": 1126500 }, { "epoch": 0.9006194055911924, "grad_norm": 0.4426947236061096, "learning_rate": 4.099380594408808e-05, "loss": 0.2169, "step": 1127000 }, { "epoch": 0.9010189705448708, "grad_norm": 0.2864760160446167, "learning_rate": 4.0989810294551295e-05, "loss": 0.2402, "step": 1127500 }, { "epoch": 0.9014185354985492, "grad_norm": 0.5074548125267029, "learning_rate": 4.098581464501451e-05, "loss": 0.2127, "step": 1128000 }, { "epoch": 0.9018181004522277, "grad_norm": 2.7314958572387695, "learning_rate": 4.0981818995477725e-05, "loss": 0.229, "step": 1128500 }, { "epoch": 0.9022176654059061, "grad_norm": 0.7916517853736877, "learning_rate": 4.097782334594094e-05, "loss": 0.2107, "step": 1129000 }, { "epoch": 0.9026172303595845, "grad_norm": 0.5633601546287537, "learning_rate": 4.0973827696404154e-05, "loss": 0.2145, "step": 1129500 }, { "epoch": 0.903016795313263, "grad_norm": 0.6094526052474976, "learning_rate": 4.096983204686737e-05, "loss": 0.2318, "step": 1130000 }, { "epoch": 0.9034163602669414, "grad_norm": 2.1329376697540283, "learning_rate": 4.0965836397330584e-05, "loss": 0.2327, "step": 1130500 }, { "epoch": 0.9038159252206198, "grad_norm": 0.48923298716545105, "learning_rate": 4.096184074779381e-05, "loss": 0.2038, "step": 1131000 }, { "epoch": 0.9042154901742983, "grad_norm": 0.5350711941719055, "learning_rate": 4.095784509825702e-05, "loss": 0.2374, "step": 1131500 }, { "epoch": 0.9046150551279767, "grad_norm": 1.3004155158996582, "learning_rate": 4.095384944872023e-05, "loss": 0.2263, "step": 1132000 }, { "epoch": 0.9050146200816551, "grad_norm": 0.5554841756820679, "learning_rate": 4.094985379918345e-05, "loss": 0.2277, "step": 1132500 }, { "epoch": 0.9054141850353336, "grad_norm": 0.7279914021492004, "learning_rate": 4.094585814964667e-05, "loss": 0.2268, "step": 1133000 }, { "epoch": 0.905813749989012, "grad_norm": 2.2599079608917236, "learning_rate": 4.0941862500109886e-05, "loss": 0.2012, "step": 1133500 }, { "epoch": 0.9062133149426904, "grad_norm": 0.5105291604995728, "learning_rate": 4.09378668505731e-05, "loss": 0.1978, "step": 1134000 }, { "epoch": 0.9066128798963689, "grad_norm": 1.202580213546753, "learning_rate": 4.093387120103631e-05, "loss": 0.2073, "step": 1134500 }, { "epoch": 0.9070124448500473, "grad_norm": 1.1800898313522339, "learning_rate": 4.092987555149953e-05, "loss": 0.2091, "step": 1135000 }, { "epoch": 0.9074120098037257, "grad_norm": 0.4693198800086975, "learning_rate": 4.0925879901962746e-05, "loss": 0.2114, "step": 1135500 }, { "epoch": 0.9078115747574041, "grad_norm": 0.6703384518623352, "learning_rate": 4.0921884252425964e-05, "loss": 0.2171, "step": 1136000 }, { "epoch": 0.9082111397110826, "grad_norm": 0.7820002436637878, "learning_rate": 4.0917888602889175e-05, "loss": 0.2071, "step": 1136500 }, { "epoch": 0.908610704664761, "grad_norm": 2.7786319255828857, "learning_rate": 4.091389295335239e-05, "loss": 0.2227, "step": 1137000 }, { "epoch": 0.9090102696184394, "grad_norm": 0.2364293485879898, "learning_rate": 4.090989730381561e-05, "loss": 0.2274, "step": 1137500 }, { "epoch": 0.9094098345721179, "grad_norm": 0.24346444010734558, "learning_rate": 4.090590165427882e-05, "loss": 0.2253, "step": 1138000 }, { "epoch": 0.9098093995257963, "grad_norm": 0.6206730008125305, "learning_rate": 4.090190600474204e-05, "loss": 0.2277, "step": 1138500 }, { "epoch": 0.9102089644794747, "grad_norm": 1.0932694673538208, "learning_rate": 4.089791035520525e-05, "loss": 0.2139, "step": 1139000 }, { "epoch": 0.9106085294331532, "grad_norm": 1.9472814798355103, "learning_rate": 4.089391470566847e-05, "loss": 0.2117, "step": 1139500 }, { "epoch": 0.9110080943868316, "grad_norm": 1.3891441822052002, "learning_rate": 4.088991905613169e-05, "loss": 0.217, "step": 1140000 }, { "epoch": 0.91140765934051, "grad_norm": 0.9084506630897522, "learning_rate": 4.08859234065949e-05, "loss": 0.2267, "step": 1140500 }, { "epoch": 0.9118072242941885, "grad_norm": 0.7849106192588806, "learning_rate": 4.088192775705812e-05, "loss": 0.2113, "step": 1141000 }, { "epoch": 0.9122067892478669, "grad_norm": 1.2351521253585815, "learning_rate": 4.087793210752133e-05, "loss": 0.2208, "step": 1141500 }, { "epoch": 0.9126063542015453, "grad_norm": 0.7363126277923584, "learning_rate": 4.087393645798455e-05, "loss": 0.2187, "step": 1142000 }, { "epoch": 0.9130059191552238, "grad_norm": 0.3391164243221283, "learning_rate": 4.086994080844777e-05, "loss": 0.2032, "step": 1142500 }, { "epoch": 0.9134054841089022, "grad_norm": 0.7496381402015686, "learning_rate": 4.086594515891098e-05, "loss": 0.2248, "step": 1143000 }, { "epoch": 0.9138050490625806, "grad_norm": 0.9676142930984497, "learning_rate": 4.0861949509374196e-05, "loss": 0.2129, "step": 1143500 }, { "epoch": 0.9142046140162591, "grad_norm": 0.6113422513008118, "learning_rate": 4.0857953859837414e-05, "loss": 0.2193, "step": 1144000 }, { "epoch": 0.9146041789699375, "grad_norm": 1.3391896486282349, "learning_rate": 4.0853958210300626e-05, "loss": 0.1945, "step": 1144500 }, { "epoch": 0.9150037439236159, "grad_norm": 0.39646679162979126, "learning_rate": 4.0849962560763844e-05, "loss": 0.203, "step": 1145000 }, { "epoch": 0.9154033088772944, "grad_norm": 0.9129787087440491, "learning_rate": 4.0845966911227055e-05, "loss": 0.2372, "step": 1145500 }, { "epoch": 0.9158028738309728, "grad_norm": 0.6936927437782288, "learning_rate": 4.0841971261690274e-05, "loss": 0.2137, "step": 1146000 }, { "epoch": 0.9162024387846512, "grad_norm": 1.0102088451385498, "learning_rate": 4.083797561215349e-05, "loss": 0.2393, "step": 1146500 }, { "epoch": 0.9166020037383297, "grad_norm": 0.813483476638794, "learning_rate": 4.08339799626167e-05, "loss": 0.2223, "step": 1147000 }, { "epoch": 0.9170015686920081, "grad_norm": 0.870299220085144, "learning_rate": 4.082998431307992e-05, "loss": 0.2146, "step": 1147500 }, { "epoch": 0.9174011336456865, "grad_norm": 1.3661075830459595, "learning_rate": 4.082598866354313e-05, "loss": 0.2105, "step": 1148000 }, { "epoch": 0.917800698599365, "grad_norm": 0.48279234766960144, "learning_rate": 4.082199301400635e-05, "loss": 0.2128, "step": 1148500 }, { "epoch": 0.9182002635530434, "grad_norm": 0.6327897906303406, "learning_rate": 4.081799736446957e-05, "loss": 0.2055, "step": 1149000 }, { "epoch": 0.9185998285067218, "grad_norm": 0.7444247007369995, "learning_rate": 4.081400171493278e-05, "loss": 0.2128, "step": 1149500 }, { "epoch": 0.9189993934604003, "grad_norm": 0.882736086845398, "learning_rate": 4.0810006065396e-05, "loss": 0.2203, "step": 1150000 }, { "epoch": 0.9193989584140787, "grad_norm": 0.8802958130836487, "learning_rate": 4.080601041585922e-05, "loss": 0.2096, "step": 1150500 }, { "epoch": 0.9197985233677571, "grad_norm": 0.8671351671218872, "learning_rate": 4.0802014766322435e-05, "loss": 0.2149, "step": 1151000 }, { "epoch": 0.9201980883214356, "grad_norm": 0.7143961787223816, "learning_rate": 4.079801911678565e-05, "loss": 0.2128, "step": 1151500 }, { "epoch": 0.920597653275114, "grad_norm": 3.1327028274536133, "learning_rate": 4.079402346724886e-05, "loss": 0.2077, "step": 1152000 }, { "epoch": 0.9209972182287924, "grad_norm": 0.5005737543106079, "learning_rate": 4.0790027817712076e-05, "loss": 0.2271, "step": 1152500 }, { "epoch": 0.921396783182471, "grad_norm": 0.577066957950592, "learning_rate": 4.0786032168175295e-05, "loss": 0.2166, "step": 1153000 }, { "epoch": 0.9217963481361494, "grad_norm": 0.7742093801498413, "learning_rate": 4.0782036518638506e-05, "loss": 0.1973, "step": 1153500 }, { "epoch": 0.9221959130898278, "grad_norm": 0.4862828552722931, "learning_rate": 4.0778040869101724e-05, "loss": 0.205, "step": 1154000 }, { "epoch": 0.9225954780435063, "grad_norm": 0.825691282749176, "learning_rate": 4.0774045219564936e-05, "loss": 0.2113, "step": 1154500 }, { "epoch": 0.9229950429971847, "grad_norm": 0.6099650859832764, "learning_rate": 4.0770049570028154e-05, "loss": 0.215, "step": 1155000 }, { "epoch": 0.9233946079508631, "grad_norm": 2.051143169403076, "learning_rate": 4.076605392049137e-05, "loss": 0.2199, "step": 1155500 }, { "epoch": 0.9237941729045416, "grad_norm": 0.789681077003479, "learning_rate": 4.0762058270954583e-05, "loss": 0.2234, "step": 1156000 }, { "epoch": 0.92419373785822, "grad_norm": 0.6683809161186218, "learning_rate": 4.07580626214178e-05, "loss": 0.2142, "step": 1156500 }, { "epoch": 0.9245933028118984, "grad_norm": 0.5137190818786621, "learning_rate": 4.075406697188101e-05, "loss": 0.2103, "step": 1157000 }, { "epoch": 0.9249928677655769, "grad_norm": 0.28556203842163086, "learning_rate": 4.075007132234424e-05, "loss": 0.2016, "step": 1157500 }, { "epoch": 0.9253924327192553, "grad_norm": 0.6260321736335754, "learning_rate": 4.074607567280745e-05, "loss": 0.2287, "step": 1158000 }, { "epoch": 0.9257919976729337, "grad_norm": 1.301973581314087, "learning_rate": 4.074208002327066e-05, "loss": 0.229, "step": 1158500 }, { "epoch": 0.9261915626266122, "grad_norm": 0.5095052719116211, "learning_rate": 4.073808437373388e-05, "loss": 0.2192, "step": 1159000 }, { "epoch": 0.9265911275802906, "grad_norm": 0.6689547300338745, "learning_rate": 4.07340887241971e-05, "loss": 0.2298, "step": 1159500 }, { "epoch": 0.926990692533969, "grad_norm": 1.0450600385665894, "learning_rate": 4.0730093074660316e-05, "loss": 0.2278, "step": 1160000 }, { "epoch": 0.9273902574876475, "grad_norm": 0.29276949167251587, "learning_rate": 4.072609742512353e-05, "loss": 0.2294, "step": 1160500 }, { "epoch": 0.9277898224413259, "grad_norm": 1.302266240119934, "learning_rate": 4.072210177558674e-05, "loss": 0.2497, "step": 1161000 }, { "epoch": 0.9281893873950043, "grad_norm": 0.18396691977977753, "learning_rate": 4.0718106126049957e-05, "loss": 0.203, "step": 1161500 }, { "epoch": 0.9285889523486828, "grad_norm": 0.45783883333206177, "learning_rate": 4.0714110476513175e-05, "loss": 0.2123, "step": 1162000 }, { "epoch": 0.9289885173023612, "grad_norm": 0.9871062636375427, "learning_rate": 4.071011482697639e-05, "loss": 0.2219, "step": 1162500 }, { "epoch": 0.9293880822560396, "grad_norm": 0.7277144193649292, "learning_rate": 4.0706119177439604e-05, "loss": 0.196, "step": 1163000 }, { "epoch": 0.9297876472097181, "grad_norm": 1.1505624055862427, "learning_rate": 4.0702123527902816e-05, "loss": 0.2211, "step": 1163500 }, { "epoch": 0.9301872121633965, "grad_norm": 0.3523178696632385, "learning_rate": 4.069812787836604e-05, "loss": 0.2084, "step": 1164000 }, { "epoch": 0.9305867771170749, "grad_norm": 0.5982407331466675, "learning_rate": 4.069413222882925e-05, "loss": 0.2424, "step": 1164500 }, { "epoch": 0.9309863420707534, "grad_norm": 0.5261115431785583, "learning_rate": 4.069013657929247e-05, "loss": 0.2252, "step": 1165000 }, { "epoch": 0.9313859070244318, "grad_norm": 0.8470924496650696, "learning_rate": 4.068614092975568e-05, "loss": 0.2104, "step": 1165500 }, { "epoch": 0.9317854719781102, "grad_norm": 0.8190592527389526, "learning_rate": 4.06821452802189e-05, "loss": 0.2069, "step": 1166000 }, { "epoch": 0.9321850369317887, "grad_norm": 1.4740058183670044, "learning_rate": 4.067814963068212e-05, "loss": 0.2187, "step": 1166500 }, { "epoch": 0.9325846018854671, "grad_norm": 1.4152299165725708, "learning_rate": 4.067415398114533e-05, "loss": 0.2093, "step": 1167000 }, { "epoch": 0.9329841668391455, "grad_norm": 1.1463522911071777, "learning_rate": 4.067015833160855e-05, "loss": 0.2284, "step": 1167500 }, { "epoch": 0.933383731792824, "grad_norm": 0.6153119802474976, "learning_rate": 4.066616268207176e-05, "loss": 0.2054, "step": 1168000 }, { "epoch": 0.9337832967465024, "grad_norm": 0.5620208978652954, "learning_rate": 4.066216703253498e-05, "loss": 0.2289, "step": 1168500 }, { "epoch": 0.9341828617001808, "grad_norm": 0.7038373947143555, "learning_rate": 4.0658171382998196e-05, "loss": 0.2161, "step": 1169000 }, { "epoch": 0.9345824266538593, "grad_norm": 0.6521101593971252, "learning_rate": 4.065417573346141e-05, "loss": 0.2101, "step": 1169500 }, { "epoch": 0.9349819916075377, "grad_norm": 0.5208590030670166, "learning_rate": 4.0650180083924625e-05, "loss": 0.2227, "step": 1170000 }, { "epoch": 0.9353815565612161, "grad_norm": 0.9393600225448608, "learning_rate": 4.0646184434387843e-05, "loss": 0.1985, "step": 1170500 }, { "epoch": 0.9357811215148946, "grad_norm": 0.768405020236969, "learning_rate": 4.0642188784851055e-05, "loss": 0.2356, "step": 1171000 }, { "epoch": 0.936180686468573, "grad_norm": 0.26820722222328186, "learning_rate": 4.063819313531427e-05, "loss": 0.2322, "step": 1171500 }, { "epoch": 0.9365802514222514, "grad_norm": 0.6810299754142761, "learning_rate": 4.0634197485777485e-05, "loss": 0.208, "step": 1172000 }, { "epoch": 0.9369798163759299, "grad_norm": 0.7354421019554138, "learning_rate": 4.06302018362407e-05, "loss": 0.2167, "step": 1172500 }, { "epoch": 0.9373793813296083, "grad_norm": 1.8129303455352783, "learning_rate": 4.062620618670392e-05, "loss": 0.2224, "step": 1173000 }, { "epoch": 0.9377789462832867, "grad_norm": 0.6419135332107544, "learning_rate": 4.062221053716713e-05, "loss": 0.1982, "step": 1173500 }, { "epoch": 0.9381785112369652, "grad_norm": 0.5924184322357178, "learning_rate": 4.061821488763035e-05, "loss": 0.2127, "step": 1174000 }, { "epoch": 0.9385780761906436, "grad_norm": 0.6780502796173096, "learning_rate": 4.061421923809356e-05, "loss": 0.2325, "step": 1174500 }, { "epoch": 0.938977641144322, "grad_norm": 0.599991500377655, "learning_rate": 4.061022358855679e-05, "loss": 0.2169, "step": 1175000 }, { "epoch": 0.9393772060980005, "grad_norm": 0.7488709688186646, "learning_rate": 4.060622793902e-05, "loss": 0.2152, "step": 1175500 }, { "epoch": 0.9397767710516789, "grad_norm": 0.6690383553504944, "learning_rate": 4.060223228948321e-05, "loss": 0.2101, "step": 1176000 }, { "epoch": 0.9401763360053573, "grad_norm": 0.3953857719898224, "learning_rate": 4.059823663994643e-05, "loss": 0.2147, "step": 1176500 }, { "epoch": 0.9405759009590358, "grad_norm": 0.777978777885437, "learning_rate": 4.0594240990409646e-05, "loss": 0.2336, "step": 1177000 }, { "epoch": 0.9409754659127142, "grad_norm": 0.6765526533126831, "learning_rate": 4.0590245340872864e-05, "loss": 0.2134, "step": 1177500 }, { "epoch": 0.9413750308663926, "grad_norm": 0.408281534910202, "learning_rate": 4.0586249691336076e-05, "loss": 0.2045, "step": 1178000 }, { "epoch": 0.9417745958200711, "grad_norm": 0.35747435688972473, "learning_rate": 4.058225404179929e-05, "loss": 0.2222, "step": 1178500 }, { "epoch": 0.9421741607737495, "grad_norm": 0.3984758257865906, "learning_rate": 4.0578258392262505e-05, "loss": 0.2069, "step": 1179000 }, { "epoch": 0.9425737257274279, "grad_norm": 1.6681334972381592, "learning_rate": 4.0574262742725724e-05, "loss": 0.2175, "step": 1179500 }, { "epoch": 0.9429732906811064, "grad_norm": 3.0898330211639404, "learning_rate": 4.057026709318894e-05, "loss": 0.2265, "step": 1180000 }, { "epoch": 0.9433728556347849, "grad_norm": 0.9774110913276672, "learning_rate": 4.056627144365215e-05, "loss": 0.2337, "step": 1180500 }, { "epoch": 0.9437724205884633, "grad_norm": 1.1199792623519897, "learning_rate": 4.0562275794115365e-05, "loss": 0.2251, "step": 1181000 }, { "epoch": 0.9441719855421418, "grad_norm": 0.7901882529258728, "learning_rate": 4.055828014457859e-05, "loss": 0.2109, "step": 1181500 }, { "epoch": 0.9445715504958202, "grad_norm": 0.7050341367721558, "learning_rate": 4.05542844950418e-05, "loss": 0.2221, "step": 1182000 }, { "epoch": 0.9449711154494986, "grad_norm": 0.8015345335006714, "learning_rate": 4.055028884550501e-05, "loss": 0.2376, "step": 1182500 }, { "epoch": 0.9453706804031771, "grad_norm": 0.9773871898651123, "learning_rate": 4.054629319596823e-05, "loss": 0.2279, "step": 1183000 }, { "epoch": 0.9457702453568555, "grad_norm": 0.48336973786354065, "learning_rate": 4.054229754643145e-05, "loss": 0.2155, "step": 1183500 }, { "epoch": 0.9461698103105339, "grad_norm": 1.0582072734832764, "learning_rate": 4.053830189689467e-05, "loss": 0.2124, "step": 1184000 }, { "epoch": 0.9465693752642124, "grad_norm": 1.1119633913040161, "learning_rate": 4.053430624735788e-05, "loss": 0.2147, "step": 1184500 }, { "epoch": 0.9469689402178908, "grad_norm": 1.0588659048080444, "learning_rate": 4.053031059782109e-05, "loss": 0.2027, "step": 1185000 }, { "epoch": 0.9473685051715692, "grad_norm": 0.7052562832832336, "learning_rate": 4.052631494828431e-05, "loss": 0.2174, "step": 1185500 }, { "epoch": 0.9477680701252477, "grad_norm": 0.5192912817001343, "learning_rate": 4.0522319298747526e-05, "loss": 0.2224, "step": 1186000 }, { "epoch": 0.9481676350789261, "grad_norm": 1.0974630117416382, "learning_rate": 4.0518323649210745e-05, "loss": 0.2287, "step": 1186500 }, { "epoch": 0.9485672000326045, "grad_norm": 0.4779643416404724, "learning_rate": 4.0514327999673956e-05, "loss": 0.2316, "step": 1187000 }, { "epoch": 0.948966764986283, "grad_norm": 0.5485232472419739, "learning_rate": 4.051033235013717e-05, "loss": 0.2093, "step": 1187500 }, { "epoch": 0.9493663299399614, "grad_norm": 0.9834853410720825, "learning_rate": 4.050633670060039e-05, "loss": 0.2115, "step": 1188000 }, { "epoch": 0.9497658948936398, "grad_norm": 1.3675614595413208, "learning_rate": 4.0502341051063604e-05, "loss": 0.1978, "step": 1188500 }, { "epoch": 0.9501654598473183, "grad_norm": 0.5379268527030945, "learning_rate": 4.049834540152682e-05, "loss": 0.2111, "step": 1189000 }, { "epoch": 0.9505650248009967, "grad_norm": 0.6154172420501709, "learning_rate": 4.0494349751990033e-05, "loss": 0.2094, "step": 1189500 }, { "epoch": 0.9509645897546751, "grad_norm": 0.4788236916065216, "learning_rate": 4.049035410245325e-05, "loss": 0.2199, "step": 1190000 }, { "epoch": 0.9513641547083536, "grad_norm": 0.485445499420166, "learning_rate": 4.048635845291647e-05, "loss": 0.2079, "step": 1190500 }, { "epoch": 0.951763719662032, "grad_norm": 0.7564946413040161, "learning_rate": 4.048236280337968e-05, "loss": 0.2149, "step": 1191000 }, { "epoch": 0.9521632846157104, "grad_norm": 0.47638165950775146, "learning_rate": 4.04783671538429e-05, "loss": 0.2153, "step": 1191500 }, { "epoch": 0.9525628495693889, "grad_norm": 0.397220641374588, "learning_rate": 4.047437150430611e-05, "loss": 0.2118, "step": 1192000 }, { "epoch": 0.9529624145230673, "grad_norm": 1.6937304735183716, "learning_rate": 4.047037585476933e-05, "loss": 0.2331, "step": 1192500 }, { "epoch": 0.9533619794767457, "grad_norm": 0.5773670673370361, "learning_rate": 4.046638020523255e-05, "loss": 0.2038, "step": 1193000 }, { "epoch": 0.9537615444304242, "grad_norm": 0.4988578259944916, "learning_rate": 4.046238455569576e-05, "loss": 0.2241, "step": 1193500 }, { "epoch": 0.9541611093841026, "grad_norm": 0.8172646760940552, "learning_rate": 4.045838890615898e-05, "loss": 0.2171, "step": 1194000 }, { "epoch": 0.954560674337781, "grad_norm": 0.18186521530151367, "learning_rate": 4.0454393256622195e-05, "loss": 0.2021, "step": 1194500 }, { "epoch": 0.9549602392914595, "grad_norm": 2.580526828765869, "learning_rate": 4.0450397607085406e-05, "loss": 0.22, "step": 1195000 }, { "epoch": 0.9553598042451379, "grad_norm": 1.747023105621338, "learning_rate": 4.0446401957548625e-05, "loss": 0.2142, "step": 1195500 }, { "epoch": 0.9557593691988163, "grad_norm": 0.6407414674758911, "learning_rate": 4.0442406308011836e-05, "loss": 0.2067, "step": 1196000 }, { "epoch": 0.9561589341524948, "grad_norm": 0.4706435203552246, "learning_rate": 4.0438410658475054e-05, "loss": 0.22, "step": 1196500 }, { "epoch": 0.9565584991061732, "grad_norm": 1.0635950565338135, "learning_rate": 4.043441500893827e-05, "loss": 0.2196, "step": 1197000 }, { "epoch": 0.9569580640598516, "grad_norm": 0.2097167819738388, "learning_rate": 4.0430419359401484e-05, "loss": 0.2134, "step": 1197500 }, { "epoch": 0.95735762901353, "grad_norm": 2.1404168605804443, "learning_rate": 4.04264237098647e-05, "loss": 0.2168, "step": 1198000 }, { "epoch": 0.9577571939672085, "grad_norm": 0.6510955095291138, "learning_rate": 4.0422428060327914e-05, "loss": 0.2186, "step": 1198500 }, { "epoch": 0.9581567589208869, "grad_norm": 1.2076261043548584, "learning_rate": 4.041843241079113e-05, "loss": 0.2166, "step": 1199000 }, { "epoch": 0.9585563238745654, "grad_norm": 0.654853105545044, "learning_rate": 4.041443676125435e-05, "loss": 0.2367, "step": 1199500 }, { "epoch": 0.9589558888282438, "grad_norm": 1.7120811939239502, "learning_rate": 4.041044111171756e-05, "loss": 0.2375, "step": 1200000 }, { "epoch": 0.9593554537819222, "grad_norm": 0.24387498199939728, "learning_rate": 4.040644546218078e-05, "loss": 0.2119, "step": 1200500 }, { "epoch": 0.9597550187356007, "grad_norm": 0.5607019066810608, "learning_rate": 4.040244981264399e-05, "loss": 0.1967, "step": 1201000 }, { "epoch": 0.9601545836892791, "grad_norm": 0.6365593075752258, "learning_rate": 4.0398454163107216e-05, "loss": 0.2169, "step": 1201500 }, { "epoch": 0.9605541486429575, "grad_norm": 0.6944308280944824, "learning_rate": 4.039445851357043e-05, "loss": 0.2216, "step": 1202000 }, { "epoch": 0.960953713596636, "grad_norm": 0.5304255485534668, "learning_rate": 4.039046286403364e-05, "loss": 0.2134, "step": 1202500 }, { "epoch": 0.9613532785503144, "grad_norm": 0.47641199827194214, "learning_rate": 4.038646721449686e-05, "loss": 0.2125, "step": 1203000 }, { "epoch": 0.9617528435039928, "grad_norm": 0.15664519369602203, "learning_rate": 4.0382471564960075e-05, "loss": 0.2249, "step": 1203500 }, { "epoch": 0.9621524084576712, "grad_norm": 0.552148163318634, "learning_rate": 4.0378475915423293e-05, "loss": 0.2087, "step": 1204000 }, { "epoch": 0.9625519734113497, "grad_norm": 0.6236949563026428, "learning_rate": 4.0374480265886505e-05, "loss": 0.2038, "step": 1204500 }, { "epoch": 0.9629515383650281, "grad_norm": 1.2051286697387695, "learning_rate": 4.0370484616349716e-05, "loss": 0.2117, "step": 1205000 }, { "epoch": 0.9633511033187065, "grad_norm": 0.17686845362186432, "learning_rate": 4.0366488966812934e-05, "loss": 0.2116, "step": 1205500 }, { "epoch": 0.963750668272385, "grad_norm": 0.5742860436439514, "learning_rate": 4.036249331727615e-05, "loss": 0.2283, "step": 1206000 }, { "epoch": 0.9641502332260634, "grad_norm": 0.7008791565895081, "learning_rate": 4.035849766773937e-05, "loss": 0.203, "step": 1206500 }, { "epoch": 0.9645497981797418, "grad_norm": 2.954329013824463, "learning_rate": 4.035450201820258e-05, "loss": 0.2048, "step": 1207000 }, { "epoch": 0.9649493631334203, "grad_norm": 0.21739381551742554, "learning_rate": 4.0350506368665794e-05, "loss": 0.2188, "step": 1207500 }, { "epoch": 0.9653489280870987, "grad_norm": 0.4443833827972412, "learning_rate": 4.034651071912902e-05, "loss": 0.2241, "step": 1208000 }, { "epoch": 0.9657484930407773, "grad_norm": 0.22875292599201202, "learning_rate": 4.034251506959223e-05, "loss": 0.214, "step": 1208500 }, { "epoch": 0.9661480579944557, "grad_norm": 0.5095024108886719, "learning_rate": 4.033851942005544e-05, "loss": 0.2079, "step": 1209000 }, { "epoch": 0.9665476229481341, "grad_norm": 0.49345430731773376, "learning_rate": 4.033452377051866e-05, "loss": 0.2225, "step": 1209500 }, { "epoch": 0.9669471879018126, "grad_norm": 0.5692427158355713, "learning_rate": 4.033052812098188e-05, "loss": 0.2213, "step": 1210000 }, { "epoch": 0.967346752855491, "grad_norm": 0.37973418831825256, "learning_rate": 4.0326532471445096e-05, "loss": 0.2318, "step": 1210500 }, { "epoch": 0.9677463178091694, "grad_norm": 1.8951683044433594, "learning_rate": 4.032253682190831e-05, "loss": 0.22, "step": 1211000 }, { "epoch": 0.9681458827628479, "grad_norm": 0.5011489391326904, "learning_rate": 4.031854117237152e-05, "loss": 0.2161, "step": 1211500 }, { "epoch": 0.9685454477165263, "grad_norm": 0.8559617400169373, "learning_rate": 4.031454552283474e-05, "loss": 0.2323, "step": 1212000 }, { "epoch": 0.9689450126702047, "grad_norm": 0.520291268825531, "learning_rate": 4.0310549873297955e-05, "loss": 0.2147, "step": 1212500 }, { "epoch": 0.9693445776238832, "grad_norm": 0.5505502820014954, "learning_rate": 4.0306554223761174e-05, "loss": 0.2111, "step": 1213000 }, { "epoch": 0.9697441425775616, "grad_norm": 0.5639253258705139, "learning_rate": 4.0302558574224385e-05, "loss": 0.2159, "step": 1213500 }, { "epoch": 0.97014370753124, "grad_norm": 0.4007701575756073, "learning_rate": 4.0298562924687596e-05, "loss": 0.2187, "step": 1214000 }, { "epoch": 0.9705432724849185, "grad_norm": 0.2387533336877823, "learning_rate": 4.029456727515082e-05, "loss": 0.2159, "step": 1214500 }, { "epoch": 0.9709428374385969, "grad_norm": 0.5724208354949951, "learning_rate": 4.029057162561403e-05, "loss": 0.2197, "step": 1215000 }, { "epoch": 0.9713424023922753, "grad_norm": 0.5179265737533569, "learning_rate": 4.028657597607725e-05, "loss": 0.2197, "step": 1215500 }, { "epoch": 0.9717419673459537, "grad_norm": 1.0726046562194824, "learning_rate": 4.028258032654046e-05, "loss": 0.2193, "step": 1216000 }, { "epoch": 0.9721415322996322, "grad_norm": 0.21787644922733307, "learning_rate": 4.027858467700368e-05, "loss": 0.2148, "step": 1216500 }, { "epoch": 0.9725410972533106, "grad_norm": 0.7481268048286438, "learning_rate": 4.02745890274669e-05, "loss": 0.2255, "step": 1217000 }, { "epoch": 0.972940662206989, "grad_norm": 0.4038498103618622, "learning_rate": 4.027059337793011e-05, "loss": 0.2163, "step": 1217500 }, { "epoch": 0.9733402271606675, "grad_norm": 0.4040386378765106, "learning_rate": 4.026659772839333e-05, "loss": 0.2019, "step": 1218000 }, { "epoch": 0.9737397921143459, "grad_norm": 0.5379074811935425, "learning_rate": 4.026260207885654e-05, "loss": 0.2233, "step": 1218500 }, { "epoch": 0.9741393570680243, "grad_norm": 1.461305022239685, "learning_rate": 4.025860642931976e-05, "loss": 0.2325, "step": 1219000 }, { "epoch": 0.9745389220217028, "grad_norm": 0.8052461743354797, "learning_rate": 4.0254610779782976e-05, "loss": 0.21, "step": 1219500 }, { "epoch": 0.9749384869753812, "grad_norm": 1.3054050207138062, "learning_rate": 4.025061513024619e-05, "loss": 0.2055, "step": 1220000 }, { "epoch": 0.9753380519290596, "grad_norm": 0.6358039975166321, "learning_rate": 4.0246619480709406e-05, "loss": 0.2231, "step": 1220500 }, { "epoch": 0.9757376168827381, "grad_norm": 0.4487379193305969, "learning_rate": 4.0242623831172624e-05, "loss": 0.2104, "step": 1221000 }, { "epoch": 0.9761371818364165, "grad_norm": 0.7767283320426941, "learning_rate": 4.0238628181635836e-05, "loss": 0.211, "step": 1221500 }, { "epoch": 0.976536746790095, "grad_norm": 0.6871442794799805, "learning_rate": 4.0234632532099054e-05, "loss": 0.1975, "step": 1222000 }, { "epoch": 0.9769363117437734, "grad_norm": 0.2117345929145813, "learning_rate": 4.0230636882562265e-05, "loss": 0.2204, "step": 1222500 }, { "epoch": 0.9773358766974518, "grad_norm": 1.0175856351852417, "learning_rate": 4.022664123302548e-05, "loss": 0.2139, "step": 1223000 }, { "epoch": 0.9777354416511302, "grad_norm": 1.0092862844467163, "learning_rate": 4.02226455834887e-05, "loss": 0.2199, "step": 1223500 }, { "epoch": 0.9781350066048087, "grad_norm": 0.774587869644165, "learning_rate": 4.021864993395191e-05, "loss": 0.2137, "step": 1224000 }, { "epoch": 0.9785345715584871, "grad_norm": 0.3687361180782318, "learning_rate": 4.021465428441513e-05, "loss": 0.2092, "step": 1224500 }, { "epoch": 0.9789341365121655, "grad_norm": 0.5976564288139343, "learning_rate": 4.021065863487834e-05, "loss": 0.208, "step": 1225000 }, { "epoch": 0.979333701465844, "grad_norm": 0.6576083302497864, "learning_rate": 4.020666298534157e-05, "loss": 0.2164, "step": 1225500 }, { "epoch": 0.9797332664195224, "grad_norm": 0.3264186680316925, "learning_rate": 4.020266733580478e-05, "loss": 0.1993, "step": 1226000 }, { "epoch": 0.9801328313732008, "grad_norm": 0.6756389737129211, "learning_rate": 4.019867168626799e-05, "loss": 0.2249, "step": 1226500 }, { "epoch": 0.9805323963268793, "grad_norm": 0.6653280258178711, "learning_rate": 4.019467603673121e-05, "loss": 0.2124, "step": 1227000 }, { "epoch": 0.9809319612805577, "grad_norm": 0.37290143966674805, "learning_rate": 4.019068038719443e-05, "loss": 0.2351, "step": 1227500 }, { "epoch": 0.9813315262342361, "grad_norm": 2.289090394973755, "learning_rate": 4.0186684737657645e-05, "loss": 0.2208, "step": 1228000 }, { "epoch": 0.9817310911879146, "grad_norm": 0.483743280172348, "learning_rate": 4.0182689088120856e-05, "loss": 0.2279, "step": 1228500 }, { "epoch": 0.982130656141593, "grad_norm": 0.39069801568984985, "learning_rate": 4.017869343858407e-05, "loss": 0.2151, "step": 1229000 }, { "epoch": 0.9825302210952714, "grad_norm": 0.7066575884819031, "learning_rate": 4.0174697789047286e-05, "loss": 0.2346, "step": 1229500 }, { "epoch": 0.9829297860489499, "grad_norm": 0.5345682501792908, "learning_rate": 4.0170702139510504e-05, "loss": 0.2235, "step": 1230000 }, { "epoch": 0.9833293510026283, "grad_norm": 2.563924789428711, "learning_rate": 4.016670648997372e-05, "loss": 0.2379, "step": 1230500 }, { "epoch": 0.9837289159563067, "grad_norm": 0.8821489214897156, "learning_rate": 4.0162710840436934e-05, "loss": 0.2279, "step": 1231000 }, { "epoch": 0.9841284809099852, "grad_norm": 0.6358765959739685, "learning_rate": 4.0158715190900145e-05, "loss": 0.2207, "step": 1231500 }, { "epoch": 0.9845280458636636, "grad_norm": 0.6890300512313843, "learning_rate": 4.015471954136337e-05, "loss": 0.2093, "step": 1232000 }, { "epoch": 0.984927610817342, "grad_norm": 0.9456961154937744, "learning_rate": 4.015072389182658e-05, "loss": 0.2141, "step": 1232500 }, { "epoch": 0.9853271757710205, "grad_norm": 0.6889389753341675, "learning_rate": 4.01467282422898e-05, "loss": 0.2038, "step": 1233000 }, { "epoch": 0.9857267407246989, "grad_norm": 1.5209242105484009, "learning_rate": 4.014273259275301e-05, "loss": 0.2199, "step": 1233500 }, { "epoch": 0.9861263056783773, "grad_norm": 0.7349516153335571, "learning_rate": 4.013873694321623e-05, "loss": 0.2158, "step": 1234000 }, { "epoch": 0.9865258706320558, "grad_norm": 0.4522801637649536, "learning_rate": 4.013474129367945e-05, "loss": 0.2044, "step": 1234500 }, { "epoch": 0.9869254355857342, "grad_norm": 0.5382198691368103, "learning_rate": 4.013074564414266e-05, "loss": 0.2201, "step": 1235000 }, { "epoch": 0.9873250005394126, "grad_norm": 1.8913568258285522, "learning_rate": 4.012674999460588e-05, "loss": 0.2021, "step": 1235500 }, { "epoch": 0.9877245654930912, "grad_norm": 0.546360969543457, "learning_rate": 4.012275434506909e-05, "loss": 0.2273, "step": 1236000 }, { "epoch": 0.9881241304467696, "grad_norm": 0.40958303213119507, "learning_rate": 4.011875869553231e-05, "loss": 0.2123, "step": 1236500 }, { "epoch": 0.988523695400448, "grad_norm": 0.5750629305839539, "learning_rate": 4.0114763045995525e-05, "loss": 0.2061, "step": 1237000 }, { "epoch": 0.9889232603541265, "grad_norm": 0.5920727252960205, "learning_rate": 4.0110767396458737e-05, "loss": 0.2094, "step": 1237500 }, { "epoch": 0.9893228253078049, "grad_norm": 0.6362273693084717, "learning_rate": 4.010677174692195e-05, "loss": 0.209, "step": 1238000 }, { "epoch": 0.9897223902614833, "grad_norm": 0.4974212348461151, "learning_rate": 4.010277609738517e-05, "loss": 0.2129, "step": 1238500 }, { "epoch": 0.9901219552151618, "grad_norm": 0.5830400586128235, "learning_rate": 4.0098780447848384e-05, "loss": 0.2293, "step": 1239000 }, { "epoch": 0.9905215201688402, "grad_norm": 3.0543553829193115, "learning_rate": 4.00947847983116e-05, "loss": 0.2048, "step": 1239500 }, { "epoch": 0.9909210851225186, "grad_norm": 0.3602750897407532, "learning_rate": 4.0090789148774814e-05, "loss": 0.2091, "step": 1240000 }, { "epoch": 0.9913206500761971, "grad_norm": 0.5716806650161743, "learning_rate": 4.008679349923803e-05, "loss": 0.2116, "step": 1240500 }, { "epoch": 0.9917202150298755, "grad_norm": 1.925385594367981, "learning_rate": 4.008279784970125e-05, "loss": 0.2341, "step": 1241000 }, { "epoch": 0.9921197799835539, "grad_norm": 0.23924711346626282, "learning_rate": 4.007880220016446e-05, "loss": 0.2145, "step": 1241500 }, { "epoch": 0.9925193449372324, "grad_norm": 0.2742629945278168, "learning_rate": 4.007480655062768e-05, "loss": 0.2037, "step": 1242000 }, { "epoch": 0.9929189098909108, "grad_norm": 0.5694785118103027, "learning_rate": 4.007081090109089e-05, "loss": 0.2256, "step": 1242500 }, { "epoch": 0.9933184748445892, "grad_norm": 0.43767720460891724, "learning_rate": 4.006681525155411e-05, "loss": 0.2077, "step": 1243000 }, { "epoch": 0.9937180397982677, "grad_norm": 0.6076911687850952, "learning_rate": 4.006281960201733e-05, "loss": 0.216, "step": 1243500 }, { "epoch": 0.9941176047519461, "grad_norm": 1.1030248403549194, "learning_rate": 4.005882395248054e-05, "loss": 0.219, "step": 1244000 }, { "epoch": 0.9945171697056245, "grad_norm": 0.8624241352081299, "learning_rate": 4.005482830294376e-05, "loss": 0.2064, "step": 1244500 }, { "epoch": 0.994916734659303, "grad_norm": 0.501841127872467, "learning_rate": 4.005083265340697e-05, "loss": 0.219, "step": 1245000 }, { "epoch": 0.9953162996129814, "grad_norm": 0.39358747005462646, "learning_rate": 4.004683700387019e-05, "loss": 0.2268, "step": 1245500 }, { "epoch": 0.9957158645666598, "grad_norm": 0.6012535691261292, "learning_rate": 4.0042841354333405e-05, "loss": 0.2241, "step": 1246000 }, { "epoch": 0.9961154295203383, "grad_norm": 0.42552128434181213, "learning_rate": 4.003884570479662e-05, "loss": 0.2204, "step": 1246500 }, { "epoch": 0.9965149944740167, "grad_norm": 0.4850093126296997, "learning_rate": 4.0034850055259835e-05, "loss": 0.211, "step": 1247000 }, { "epoch": 0.9969145594276951, "grad_norm": 0.580950140953064, "learning_rate": 4.003085440572305e-05, "loss": 0.2126, "step": 1247500 }, { "epoch": 0.9973141243813736, "grad_norm": 0.658244252204895, "learning_rate": 4.0026858756186265e-05, "loss": 0.2144, "step": 1248000 }, { "epoch": 0.997713689335052, "grad_norm": 1.2926474809646606, "learning_rate": 4.002286310664948e-05, "loss": 0.2232, "step": 1248500 }, { "epoch": 0.9981132542887304, "grad_norm": 0.6937040686607361, "learning_rate": 4.0018867457112694e-05, "loss": 0.2154, "step": 1249000 }, { "epoch": 0.9985128192424089, "grad_norm": 0.988490879535675, "learning_rate": 4.001487180757591e-05, "loss": 0.2205, "step": 1249500 }, { "epoch": 0.9989123841960873, "grad_norm": 0.9401140213012695, "learning_rate": 4.001087615803913e-05, "loss": 0.212, "step": 1250000 }, { "epoch": 0.9993119491497657, "grad_norm": 2.6317858695983887, "learning_rate": 4.000688050850234e-05, "loss": 0.2032, "step": 1250500 }, { "epoch": 0.9997115141034442, "grad_norm": 0.504814624786377, "learning_rate": 4.000288485896556e-05, "loss": 0.2276, "step": 1251000 }, { "epoch": 1.0001110790571226, "grad_norm": 0.4598837196826935, "learning_rate": 3.999888920942877e-05, "loss": 0.2146, "step": 1251500 }, { "epoch": 1.0005106440108011, "grad_norm": 0.6243917942047119, "learning_rate": 3.9994893559891997e-05, "loss": 0.2206, "step": 1252000 }, { "epoch": 1.0009102089644795, "grad_norm": 0.29243791103363037, "learning_rate": 3.999089791035521e-05, "loss": 0.2088, "step": 1252500 }, { "epoch": 1.001309773918158, "grad_norm": 0.9229438304901123, "learning_rate": 3.998690226081842e-05, "loss": 0.2135, "step": 1253000 }, { "epoch": 1.0017093388718363, "grad_norm": 0.4150417149066925, "learning_rate": 3.998290661128164e-05, "loss": 0.2091, "step": 1253500 }, { "epoch": 1.0021089038255149, "grad_norm": 2.247568368911743, "learning_rate": 3.9978910961744856e-05, "loss": 0.2211, "step": 1254000 }, { "epoch": 1.0025084687791932, "grad_norm": 0.6649703979492188, "learning_rate": 3.9974915312208074e-05, "loss": 0.2099, "step": 1254500 }, { "epoch": 1.0029080337328717, "grad_norm": 0.22142952680587769, "learning_rate": 3.9970919662671285e-05, "loss": 0.2184, "step": 1255000 }, { "epoch": 1.00330759868655, "grad_norm": 0.6502035856246948, "learning_rate": 3.99669240131345e-05, "loss": 0.2202, "step": 1255500 }, { "epoch": 1.0037071636402286, "grad_norm": 0.4007321000099182, "learning_rate": 3.9962928363597715e-05, "loss": 0.2079, "step": 1256000 }, { "epoch": 1.004106728593907, "grad_norm": 0.8135641813278198, "learning_rate": 3.995893271406093e-05, "loss": 0.196, "step": 1256500 }, { "epoch": 1.0045062935475855, "grad_norm": 0.4629269540309906, "learning_rate": 3.995493706452415e-05, "loss": 0.208, "step": 1257000 }, { "epoch": 1.0049058585012638, "grad_norm": 0.5258995294570923, "learning_rate": 3.995094141498736e-05, "loss": 0.2064, "step": 1257500 }, { "epoch": 1.0053054234549423, "grad_norm": 0.6011514663696289, "learning_rate": 3.9946945765450574e-05, "loss": 0.2178, "step": 1258000 }, { "epoch": 1.0057049884086207, "grad_norm": 0.7489943504333496, "learning_rate": 3.99429501159138e-05, "loss": 0.217, "step": 1258500 }, { "epoch": 1.0061045533622992, "grad_norm": 0.5136427879333496, "learning_rate": 3.993895446637701e-05, "loss": 0.2102, "step": 1259000 }, { "epoch": 1.0065041183159775, "grad_norm": 1.186418890953064, "learning_rate": 3.993495881684023e-05, "loss": 0.2129, "step": 1259500 }, { "epoch": 1.006903683269656, "grad_norm": 0.4394020140171051, "learning_rate": 3.993096316730344e-05, "loss": 0.2009, "step": 1260000 }, { "epoch": 1.0073032482233344, "grad_norm": 0.4353010952472687, "learning_rate": 3.992696751776666e-05, "loss": 0.2117, "step": 1260500 }, { "epoch": 1.007702813177013, "grad_norm": 0.2168494611978531, "learning_rate": 3.992297186822988e-05, "loss": 0.2225, "step": 1261000 }, { "epoch": 1.0081023781306913, "grad_norm": 0.5182827115058899, "learning_rate": 3.991897621869309e-05, "loss": 0.217, "step": 1261500 }, { "epoch": 1.0085019430843698, "grad_norm": 2.017793893814087, "learning_rate": 3.9914980569156306e-05, "loss": 0.2084, "step": 1262000 }, { "epoch": 1.0089015080380481, "grad_norm": 0.4792706072330475, "learning_rate": 3.991098491961952e-05, "loss": 0.2135, "step": 1262500 }, { "epoch": 1.0093010729917267, "grad_norm": 0.7783152461051941, "learning_rate": 3.9906989270082736e-05, "loss": 0.2126, "step": 1263000 }, { "epoch": 1.009700637945405, "grad_norm": 1.2935973405838013, "learning_rate": 3.9902993620545954e-05, "loss": 0.2061, "step": 1263500 }, { "epoch": 1.0101002028990835, "grad_norm": 0.3910047113895416, "learning_rate": 3.9898997971009166e-05, "loss": 0.2056, "step": 1264000 }, { "epoch": 1.0104997678527619, "grad_norm": 0.5781631469726562, "learning_rate": 3.989500232147238e-05, "loss": 0.2025, "step": 1264500 }, { "epoch": 1.0108993328064404, "grad_norm": 1.2061123847961426, "learning_rate": 3.98910066719356e-05, "loss": 0.2331, "step": 1265000 }, { "epoch": 1.0112988977601187, "grad_norm": 0.4641139507293701, "learning_rate": 3.9887011022398813e-05, "loss": 0.2172, "step": 1265500 }, { "epoch": 1.0116984627137973, "grad_norm": 0.6800472140312195, "learning_rate": 3.988301537286203e-05, "loss": 0.1982, "step": 1266000 }, { "epoch": 1.0120980276674756, "grad_norm": 0.5219371318817139, "learning_rate": 3.987901972332524e-05, "loss": 0.2082, "step": 1266500 }, { "epoch": 1.0124975926211541, "grad_norm": 0.7243718504905701, "learning_rate": 3.987502407378846e-05, "loss": 0.2186, "step": 1267000 }, { "epoch": 1.0128971575748325, "grad_norm": 2.7509002685546875, "learning_rate": 3.987102842425168e-05, "loss": 0.2107, "step": 1267500 }, { "epoch": 1.013296722528511, "grad_norm": 0.5417560338973999, "learning_rate": 3.986703277471489e-05, "loss": 0.2399, "step": 1268000 }, { "epoch": 1.0136962874821893, "grad_norm": 0.7145971655845642, "learning_rate": 3.986303712517811e-05, "loss": 0.2208, "step": 1268500 }, { "epoch": 1.0140958524358679, "grad_norm": 0.38542014360427856, "learning_rate": 3.985904147564132e-05, "loss": 0.2107, "step": 1269000 }, { "epoch": 1.0144954173895462, "grad_norm": 0.34342578053474426, "learning_rate": 3.985504582610454e-05, "loss": 0.2198, "step": 1269500 }, { "epoch": 1.0148949823432247, "grad_norm": 0.6460293531417847, "learning_rate": 3.985105017656776e-05, "loss": 0.1963, "step": 1270000 }, { "epoch": 1.015294547296903, "grad_norm": 0.5998449325561523, "learning_rate": 3.984705452703097e-05, "loss": 0.2144, "step": 1270500 }, { "epoch": 1.0156941122505816, "grad_norm": 0.7863841652870178, "learning_rate": 3.9843058877494187e-05, "loss": 0.2243, "step": 1271000 }, { "epoch": 1.01609367720426, "grad_norm": 0.5182644128799438, "learning_rate": 3.9839063227957405e-05, "loss": 0.2013, "step": 1271500 }, { "epoch": 1.0164932421579385, "grad_norm": 0.6021581888198853, "learning_rate": 3.9835067578420616e-05, "loss": 0.2018, "step": 1272000 }, { "epoch": 1.0168928071116168, "grad_norm": 0.6301814913749695, "learning_rate": 3.9831071928883834e-05, "loss": 0.1963, "step": 1272500 }, { "epoch": 1.0172923720652953, "grad_norm": 1.0307202339172363, "learning_rate": 3.9827076279347046e-05, "loss": 0.1962, "step": 1273000 }, { "epoch": 1.0176919370189736, "grad_norm": 0.6130672693252563, "learning_rate": 3.9823080629810264e-05, "loss": 0.2087, "step": 1273500 }, { "epoch": 1.0180915019726522, "grad_norm": 0.4295696020126343, "learning_rate": 3.981908498027348e-05, "loss": 0.21, "step": 1274000 }, { "epoch": 1.0184910669263305, "grad_norm": 0.5995665192604065, "learning_rate": 3.9815089330736694e-05, "loss": 0.2262, "step": 1274500 }, { "epoch": 1.018890631880009, "grad_norm": 0.8657405972480774, "learning_rate": 3.981109368119991e-05, "loss": 0.1914, "step": 1275000 }, { "epoch": 1.0192901968336874, "grad_norm": 0.45475032925605774, "learning_rate": 3.980709803166312e-05, "loss": 0.2121, "step": 1275500 }, { "epoch": 1.019689761787366, "grad_norm": 0.4823838770389557, "learning_rate": 3.980310238212635e-05, "loss": 0.2143, "step": 1276000 }, { "epoch": 1.0200893267410442, "grad_norm": 1.9737447500228882, "learning_rate": 3.979910673258956e-05, "loss": 0.2005, "step": 1276500 }, { "epoch": 1.0204888916947228, "grad_norm": 1.0304450988769531, "learning_rate": 3.979511108305277e-05, "loss": 0.2305, "step": 1277000 }, { "epoch": 1.020888456648401, "grad_norm": 0.35390007495880127, "learning_rate": 3.979111543351599e-05, "loss": 0.2094, "step": 1277500 }, { "epoch": 1.0212880216020797, "grad_norm": 0.4846334159374237, "learning_rate": 3.978711978397921e-05, "loss": 0.2008, "step": 1278000 }, { "epoch": 1.0216875865557582, "grad_norm": 0.9454705715179443, "learning_rate": 3.9783124134442426e-05, "loss": 0.2006, "step": 1278500 }, { "epoch": 1.0220871515094365, "grad_norm": 0.6389827728271484, "learning_rate": 3.977912848490564e-05, "loss": 0.2226, "step": 1279000 }, { "epoch": 1.022486716463115, "grad_norm": 0.3015144467353821, "learning_rate": 3.977513283536885e-05, "loss": 0.2223, "step": 1279500 }, { "epoch": 1.0228862814167934, "grad_norm": 0.7865749001502991, "learning_rate": 3.977113718583207e-05, "loss": 0.2157, "step": 1280000 }, { "epoch": 1.023285846370472, "grad_norm": 0.6969341039657593, "learning_rate": 3.9767141536295285e-05, "loss": 0.2012, "step": 1280500 }, { "epoch": 1.0236854113241503, "grad_norm": 1.7030259370803833, "learning_rate": 3.97631458867585e-05, "loss": 0.1924, "step": 1281000 }, { "epoch": 1.0240849762778288, "grad_norm": 0.44586554169654846, "learning_rate": 3.9759150237221714e-05, "loss": 0.2207, "step": 1281500 }, { "epoch": 1.0244845412315071, "grad_norm": 0.5171001553535461, "learning_rate": 3.9755154587684926e-05, "loss": 0.2085, "step": 1282000 }, { "epoch": 1.0248841061851857, "grad_norm": 0.5863750576972961, "learning_rate": 3.975115893814815e-05, "loss": 0.2209, "step": 1282500 }, { "epoch": 1.025283671138864, "grad_norm": 0.6718472242355347, "learning_rate": 3.974716328861136e-05, "loss": 0.1978, "step": 1283000 }, { "epoch": 1.0256832360925425, "grad_norm": 0.7513668537139893, "learning_rate": 3.974316763907458e-05, "loss": 0.2054, "step": 1283500 }, { "epoch": 1.0260828010462208, "grad_norm": 3.2262754440307617, "learning_rate": 3.973917198953779e-05, "loss": 0.231, "step": 1284000 }, { "epoch": 1.0264823659998994, "grad_norm": 0.31876930594444275, "learning_rate": 3.973517634000101e-05, "loss": 0.2248, "step": 1284500 }, { "epoch": 1.0268819309535777, "grad_norm": 0.740149736404419, "learning_rate": 3.973118069046423e-05, "loss": 0.2051, "step": 1285000 }, { "epoch": 1.0272814959072563, "grad_norm": 0.8654366135597229, "learning_rate": 3.972718504092744e-05, "loss": 0.2123, "step": 1285500 }, { "epoch": 1.0276810608609346, "grad_norm": 0.408313125371933, "learning_rate": 3.972318939139066e-05, "loss": 0.2075, "step": 1286000 }, { "epoch": 1.0280806258146131, "grad_norm": 0.5923413634300232, "learning_rate": 3.971919374185387e-05, "loss": 0.2227, "step": 1286500 }, { "epoch": 1.0284801907682914, "grad_norm": 0.6319562792778015, "learning_rate": 3.971519809231709e-05, "loss": 0.217, "step": 1287000 }, { "epoch": 1.02887975572197, "grad_norm": 0.5902479290962219, "learning_rate": 3.9711202442780306e-05, "loss": 0.2062, "step": 1287500 }, { "epoch": 1.0292793206756483, "grad_norm": 0.6734107136726379, "learning_rate": 3.970720679324352e-05, "loss": 0.2128, "step": 1288000 }, { "epoch": 1.0296788856293269, "grad_norm": 0.7072934508323669, "learning_rate": 3.9703211143706735e-05, "loss": 0.2108, "step": 1288500 }, { "epoch": 1.0300784505830052, "grad_norm": 0.6936883330345154, "learning_rate": 3.969921549416995e-05, "loss": 0.2068, "step": 1289000 }, { "epoch": 1.0304780155366837, "grad_norm": 0.42267221212387085, "learning_rate": 3.9695219844633165e-05, "loss": 0.2071, "step": 1289500 }, { "epoch": 1.030877580490362, "grad_norm": 0.6086959838867188, "learning_rate": 3.969122419509638e-05, "loss": 0.2059, "step": 1290000 }, { "epoch": 1.0312771454440406, "grad_norm": 1.4351541996002197, "learning_rate": 3.9687228545559595e-05, "loss": 0.1957, "step": 1290500 }, { "epoch": 1.031676710397719, "grad_norm": 0.35263556241989136, "learning_rate": 3.968323289602281e-05, "loss": 0.2065, "step": 1291000 }, { "epoch": 1.0320762753513975, "grad_norm": 2.2935476303100586, "learning_rate": 3.967923724648603e-05, "loss": 0.2258, "step": 1291500 }, { "epoch": 1.0324758403050758, "grad_norm": 0.19392652809619904, "learning_rate": 3.967524159694924e-05, "loss": 0.1989, "step": 1292000 }, { "epoch": 1.0328754052587543, "grad_norm": 0.5994540452957153, "learning_rate": 3.967124594741246e-05, "loss": 0.2094, "step": 1292500 }, { "epoch": 1.0332749702124326, "grad_norm": 0.5755767226219177, "learning_rate": 3.966725029787567e-05, "loss": 0.2034, "step": 1293000 }, { "epoch": 1.0336745351661112, "grad_norm": 1.4711006879806519, "learning_rate": 3.966325464833889e-05, "loss": 0.2139, "step": 1293500 }, { "epoch": 1.0340741001197895, "grad_norm": 0.7334244251251221, "learning_rate": 3.965925899880211e-05, "loss": 0.1955, "step": 1294000 }, { "epoch": 1.034473665073468, "grad_norm": 0.6201550364494324, "learning_rate": 3.965526334926532e-05, "loss": 0.2138, "step": 1294500 }, { "epoch": 1.0348732300271464, "grad_norm": 1.2543913125991821, "learning_rate": 3.965126769972854e-05, "loss": 0.2002, "step": 1295000 }, { "epoch": 1.035272794980825, "grad_norm": 0.7348403930664062, "learning_rate": 3.964727205019175e-05, "loss": 0.2035, "step": 1295500 }, { "epoch": 1.0356723599345032, "grad_norm": 0.7324264049530029, "learning_rate": 3.964327640065497e-05, "loss": 0.2129, "step": 1296000 }, { "epoch": 1.0360719248881818, "grad_norm": 0.6986691355705261, "learning_rate": 3.9639280751118186e-05, "loss": 0.2174, "step": 1296500 }, { "epoch": 1.03647148984186, "grad_norm": 0.2208835929632187, "learning_rate": 3.96352851015814e-05, "loss": 0.2265, "step": 1297000 }, { "epoch": 1.0368710547955386, "grad_norm": 0.868411660194397, "learning_rate": 3.9631289452044616e-05, "loss": 0.2212, "step": 1297500 }, { "epoch": 1.037270619749217, "grad_norm": 1.091151237487793, "learning_rate": 3.9627293802507834e-05, "loss": 0.2101, "step": 1298000 }, { "epoch": 1.0376701847028955, "grad_norm": 0.6926202774047852, "learning_rate": 3.9623298152971045e-05, "loss": 0.2321, "step": 1298500 }, { "epoch": 1.0380697496565738, "grad_norm": 1.1525542736053467, "learning_rate": 3.961930250343426e-05, "loss": 0.2217, "step": 1299000 }, { "epoch": 1.0384693146102524, "grad_norm": 1.0695725679397583, "learning_rate": 3.9615306853897475e-05, "loss": 0.2171, "step": 1299500 }, { "epoch": 1.0388688795639307, "grad_norm": 0.876142144203186, "learning_rate": 3.961131120436069e-05, "loss": 0.2187, "step": 1300000 }, { "epoch": 1.0392684445176092, "grad_norm": 2.82513427734375, "learning_rate": 3.960731555482391e-05, "loss": 0.2046, "step": 1300500 }, { "epoch": 1.0396680094712876, "grad_norm": 0.7408906817436218, "learning_rate": 3.960331990528712e-05, "loss": 0.1949, "step": 1301000 }, { "epoch": 1.0400675744249661, "grad_norm": 0.6833987832069397, "learning_rate": 3.959932425575034e-05, "loss": 0.2057, "step": 1301500 }, { "epoch": 1.0404671393786444, "grad_norm": 0.9262207746505737, "learning_rate": 3.959532860621355e-05, "loss": 0.2318, "step": 1302000 }, { "epoch": 1.040866704332323, "grad_norm": 0.6291196942329407, "learning_rate": 3.959133295667678e-05, "loss": 0.2078, "step": 1302500 }, { "epoch": 1.0412662692860013, "grad_norm": 0.7917730212211609, "learning_rate": 3.958733730713999e-05, "loss": 0.2128, "step": 1303000 }, { "epoch": 1.0416658342396798, "grad_norm": 0.9499486088752747, "learning_rate": 3.95833416576032e-05, "loss": 0.2108, "step": 1303500 }, { "epoch": 1.0420653991933582, "grad_norm": 0.3721921741962433, "learning_rate": 3.957934600806642e-05, "loss": 0.2141, "step": 1304000 }, { "epoch": 1.0424649641470367, "grad_norm": 0.4388205111026764, "learning_rate": 3.9575350358529636e-05, "loss": 0.2034, "step": 1304500 }, { "epoch": 1.042864529100715, "grad_norm": 0.3595607876777649, "learning_rate": 3.9571354708992855e-05, "loss": 0.2108, "step": 1305000 }, { "epoch": 1.0432640940543936, "grad_norm": 0.26360756158828735, "learning_rate": 3.9567359059456066e-05, "loss": 0.2181, "step": 1305500 }, { "epoch": 1.0436636590080721, "grad_norm": 0.3864990472793579, "learning_rate": 3.956336340991928e-05, "loss": 0.2163, "step": 1306000 }, { "epoch": 1.0440632239617504, "grad_norm": 0.5064546465873718, "learning_rate": 3.9559367760382496e-05, "loss": 0.2288, "step": 1306500 }, { "epoch": 1.044462788915429, "grad_norm": 0.6377919316291809, "learning_rate": 3.9555372110845714e-05, "loss": 0.2223, "step": 1307000 }, { "epoch": 1.0448623538691073, "grad_norm": 0.8240339756011963, "learning_rate": 3.955137646130893e-05, "loss": 0.2197, "step": 1307500 }, { "epoch": 1.0452619188227859, "grad_norm": 0.5286998152732849, "learning_rate": 3.9547380811772144e-05, "loss": 0.1904, "step": 1308000 }, { "epoch": 1.0456614837764642, "grad_norm": 0.7446879148483276, "learning_rate": 3.9543385162235355e-05, "loss": 0.2267, "step": 1308500 }, { "epoch": 1.0460610487301427, "grad_norm": 0.5698186159133911, "learning_rate": 3.953938951269858e-05, "loss": 0.2079, "step": 1309000 }, { "epoch": 1.046460613683821, "grad_norm": 1.089905023574829, "learning_rate": 3.953539386316179e-05, "loss": 0.2239, "step": 1309500 }, { "epoch": 1.0468601786374996, "grad_norm": 0.9562650918960571, "learning_rate": 3.953139821362501e-05, "loss": 0.1998, "step": 1310000 }, { "epoch": 1.047259743591178, "grad_norm": 1.3682670593261719, "learning_rate": 3.952740256408822e-05, "loss": 0.2098, "step": 1310500 }, { "epoch": 1.0476593085448564, "grad_norm": 0.42940041422843933, "learning_rate": 3.952340691455144e-05, "loss": 0.2114, "step": 1311000 }, { "epoch": 1.0480588734985348, "grad_norm": 0.6715023517608643, "learning_rate": 3.951941126501466e-05, "loss": 0.2193, "step": 1311500 }, { "epoch": 1.0484584384522133, "grad_norm": 0.4771224856376648, "learning_rate": 3.951541561547787e-05, "loss": 0.2287, "step": 1312000 }, { "epoch": 1.0488580034058916, "grad_norm": 0.5305761098861694, "learning_rate": 3.951141996594109e-05, "loss": 0.239, "step": 1312500 }, { "epoch": 1.0492575683595702, "grad_norm": 2.166661262512207, "learning_rate": 3.95074243164043e-05, "loss": 0.2125, "step": 1313000 }, { "epoch": 1.0496571333132485, "grad_norm": 0.497926265001297, "learning_rate": 3.9503428666867517e-05, "loss": 0.2068, "step": 1313500 }, { "epoch": 1.050056698266927, "grad_norm": 0.34204548597335815, "learning_rate": 3.9499433017330735e-05, "loss": 0.2228, "step": 1314000 }, { "epoch": 1.0504562632206054, "grad_norm": 0.5302955508232117, "learning_rate": 3.9495437367793946e-05, "loss": 0.2172, "step": 1314500 }, { "epoch": 1.050855828174284, "grad_norm": 1.3920265436172485, "learning_rate": 3.9491441718257164e-05, "loss": 0.2126, "step": 1315000 }, { "epoch": 1.0512553931279622, "grad_norm": 1.8626676797866821, "learning_rate": 3.948744606872038e-05, "loss": 0.2252, "step": 1315500 }, { "epoch": 1.0516549580816408, "grad_norm": 0.4795547425746918, "learning_rate": 3.9483450419183594e-05, "loss": 0.2116, "step": 1316000 }, { "epoch": 1.052054523035319, "grad_norm": 0.7273180484771729, "learning_rate": 3.947945476964681e-05, "loss": 0.2047, "step": 1316500 }, { "epoch": 1.0524540879889976, "grad_norm": 0.6694060564041138, "learning_rate": 3.9475459120110024e-05, "loss": 0.1968, "step": 1317000 }, { "epoch": 1.052853652942676, "grad_norm": 1.001574158668518, "learning_rate": 3.947146347057324e-05, "loss": 0.2036, "step": 1317500 }, { "epoch": 1.0532532178963545, "grad_norm": 0.5347727537155151, "learning_rate": 3.946746782103646e-05, "loss": 0.2229, "step": 1318000 }, { "epoch": 1.0536527828500328, "grad_norm": 0.5991740226745605, "learning_rate": 3.946347217149967e-05, "loss": 0.2193, "step": 1318500 }, { "epoch": 1.0540523478037114, "grad_norm": 1.169661045074463, "learning_rate": 3.945947652196289e-05, "loss": 0.2123, "step": 1319000 }, { "epoch": 1.0544519127573897, "grad_norm": 0.6314510703086853, "learning_rate": 3.94554808724261e-05, "loss": 0.1964, "step": 1319500 }, { "epoch": 1.0548514777110682, "grad_norm": 0.20778264105319977, "learning_rate": 3.945148522288932e-05, "loss": 0.2383, "step": 1320000 }, { "epoch": 1.0552510426647466, "grad_norm": 3.327683448791504, "learning_rate": 3.944748957335254e-05, "loss": 0.2253, "step": 1320500 }, { "epoch": 1.055650607618425, "grad_norm": 0.664120078086853, "learning_rate": 3.944349392381575e-05, "loss": 0.21, "step": 1321000 }, { "epoch": 1.0560501725721034, "grad_norm": 0.5084836483001709, "learning_rate": 3.943949827427897e-05, "loss": 0.2267, "step": 1321500 }, { "epoch": 1.056449737525782, "grad_norm": 0.9351341724395752, "learning_rate": 3.9435502624742185e-05, "loss": 0.2153, "step": 1322000 }, { "epoch": 1.0568493024794603, "grad_norm": 0.5772596001625061, "learning_rate": 3.94315069752054e-05, "loss": 0.2282, "step": 1322500 } ], "logging_steps": 500, "max_steps": 6256805, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.72779290836992e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }